author    stevel@tonic-gate <none@none>  2005-06-14 00:00:00 -0700
committer stevel@tonic-gate <none@none>  2005-06-14 00:00:00 -0700
commit    7c478bd95313f5f23a4c958a745db2134aa03244 (patch)
tree      c871e58545497667cbb4b0a4f2daf204743e1fe7 /usr/src/uts/common/syscall
OpenSolaris Launch
Diffstat (limited to 'usr/src/uts/common/syscall')
-rw-r--r--  usr/src/uts/common/syscall/SYSCALL.README | 306
-rw-r--r--  usr/src/uts/common/syscall/access.c | 114
-rw-r--r--  usr/src/uts/common/syscall/acctctl.c | 620
-rw-r--r--  usr/src/uts/common/syscall/acl.c | 430
-rw-r--r--  usr/src/uts/common/syscall/adjtime.c | 108
-rw-r--r--  usr/src/uts/common/syscall/alarm.c | 87
-rw-r--r--  usr/src/uts/common/syscall/auditsys.c | 69
-rw-r--r--  usr/src/uts/common/syscall/chdir.c | 247
-rw-r--r--  usr/src/uts/common/syscall/chmod.c | 81
-rw-r--r--  usr/src/uts/common/syscall/chown.c | 181
-rw-r--r--  usr/src/uts/common/syscall/cladm.c | 100
-rw-r--r--  usr/src/uts/common/syscall/close.c | 58
-rw-r--r--  usr/src/uts/common/syscall/corectl.c | 558
-rw-r--r--  usr/src/uts/common/syscall/exacctsys.c | 406
-rw-r--r--  usr/src/uts/common/syscall/fcntl.c | 802
-rw-r--r--  usr/src/uts/common/syscall/fdsync.c | 74
-rw-r--r--  usr/src/uts/common/syscall/fsat.c | 162
-rw-r--r--  usr/src/uts/common/syscall/getcwd.c | 81
-rw-r--r--  usr/src/uts/common/syscall/getdents.c | 236
-rw-r--r--  usr/src/uts/common/syscall/getloadavg.c | 68
-rw-r--r--  usr/src/uts/common/syscall/getpagesizes.c | 122
-rw-r--r--  usr/src/uts/common/syscall/getpid.c | 56
-rw-r--r--  usr/src/uts/common/syscall/gid.c | 235
-rw-r--r--  usr/src/uts/common/syscall/groups.c | 128
-rw-r--r--  usr/src/uts/common/syscall/ioctl.c | 169
-rw-r--r--  usr/src/uts/common/syscall/issetugid.c | 40
-rw-r--r--  usr/src/uts/common/syscall/lgrpsys.c | 2105
-rw-r--r--  usr/src/uts/common/syscall/link.c | 58
-rw-r--r--  usr/src/uts/common/syscall/lseek.c | 380
-rw-r--r--  usr/src/uts/common/syscall/lwp_create.c | 212
-rw-r--r--  usr/src/uts/common/syscall/lwp_info.c | 80
-rw-r--r--  usr/src/uts/common/syscall/lwp_self.c | 39
-rw-r--r--  usr/src/uts/common/syscall/lwp_sobj.c | 3119
-rw-r--r--  usr/src/uts/common/syscall/lwp_timer.c | 216
-rw-r--r--  usr/src/uts/common/syscall/lwpsys.c | 563
-rw-r--r--  usr/src/uts/common/syscall/memcntl.c | 394
-rw-r--r--  usr/src/uts/common/syscall/mkdir.c | 67
-rw-r--r--  usr/src/uts/common/syscall/mknod.c | 108
-rw-r--r--  usr/src/uts/common/syscall/mount.c | 137
-rw-r--r--  usr/src/uts/common/syscall/nice.c | 66
-rw-r--r--  usr/src/uts/common/syscall/ntptime.c | 218
-rw-r--r--  usr/src/uts/common/syscall/open.c | 305
-rw-r--r--  usr/src/uts/common/syscall/p_online.c | 244
-rw-r--r--  usr/src/uts/common/syscall/pathconf.c | 127
-rw-r--r--  usr/src/uts/common/syscall/pause.c | 55
-rw-r--r--  usr/src/uts/common/syscall/pgrpsys.c | 163
-rw-r--r--  usr/src/uts/common/syscall/pipe.c | 178
-rw-r--r--  usr/src/uts/common/syscall/poll.c | 2776
-rw-r--r--  usr/src/uts/common/syscall/ppriv.c | 333
-rw-r--r--  usr/src/uts/common/syscall/processor_bind.c | 375
-rw-r--r--  usr/src/uts/common/syscall/processor_info.c | 71
-rw-r--r--  usr/src/uts/common/syscall/profil.c | 95
-rw-r--r--  usr/src/uts/common/syscall/pset.c | 797
-rw-r--r--  usr/src/uts/common/syscall/rctlsys.c | 871
-rw-r--r--  usr/src/uts/common/syscall/readlink.c | 119
-rw-r--r--  usr/src/uts/common/syscall/rename.c | 139
-rw-r--r--  usr/src/uts/common/syscall/resolvepath.c | 60
-rw-r--r--  usr/src/uts/common/syscall/rlimit.c | 487
-rw-r--r--  usr/src/uts/common/syscall/rmdir.c | 60
-rw-r--r--  usr/src/uts/common/syscall/rusagesys.c | 294
-rw-r--r--  usr/src/uts/common/syscall/rw.c | 1223
-rw-r--r--  usr/src/uts/common/syscall/sem.c | 1208
-rw-r--r--  usr/src/uts/common/syscall/sendfile.c | 1186
-rw-r--r--  usr/src/uts/common/syscall/sigaction.c | 231
-rw-r--r--  usr/src/uts/common/syscall/sigaltstack.c | 121
-rw-r--r--  usr/src/uts/common/syscall/signotify.c | 226
-rw-r--r--  usr/src/uts/common/syscall/sigpending.c | 72
-rw-r--r--  usr/src/uts/common/syscall/sigprocmask.c | 127
-rw-r--r--  usr/src/uts/common/syscall/sigqueue.c | 185
-rw-r--r--  usr/src/uts/common/syscall/sigsendset.c | 67
-rw-r--r--  usr/src/uts/common/syscall/sigsuspend.c | 66
-rw-r--r--  usr/src/uts/common/syscall/sigtimedwait.c | 207
-rw-r--r--  usr/src/uts/common/syscall/ssig.c | 169
-rw-r--r--  usr/src/uts/common/syscall/stat.c | 675
-rw-r--r--  usr/src/uts/common/syscall/statfs.c | 164
-rw-r--r--  usr/src/uts/common/syscall/statvfs.c | 366
-rw-r--r--  usr/src/uts/common/syscall/strcalls.c | 537
-rw-r--r--  usr/src/uts/common/syscall/symlink.c | 102
-rw-r--r--  usr/src/uts/common/syscall/sync.c | 41
-rw-r--r--  usr/src/uts/common/syscall/sysconfig.c | 171
-rw-r--r--  usr/src/uts/common/syscall/sysfs.c | 137
-rw-r--r--  usr/src/uts/common/syscall/systeminfo.c | 329
-rw-r--r--  usr/src/uts/common/syscall/tasksys.c | 266
-rw-r--r--  usr/src/uts/common/syscall/time.c | 80
-rw-r--r--  usr/src/uts/common/syscall/times.c | 103
-rw-r--r--  usr/src/uts/common/syscall/uadmin.c | 373
-rw-r--r--  usr/src/uts/common/syscall/ucredsys.c | 208
-rw-r--r--  usr/src/uts/common/syscall/uid.c | 323
-rw-r--r--  usr/src/uts/common/syscall/umask.c | 49
-rw-r--r--  usr/src/uts/common/syscall/umount.c | 188
-rw-r--r--  usr/src/uts/common/syscall/uname.c | 62
-rw-r--r--  usr/src/uts/common/syscall/unlink.c | 111
-rw-r--r--  usr/src/uts/common/syscall/utime.c | 230
-rw-r--r--  usr/src/uts/common/syscall/utssys.c | 954
-rw-r--r--  usr/src/uts/common/syscall/yield.c | 61
95 files changed, 31467 insertions, 0 deletions
diff --git a/usr/src/uts/common/syscall/SYSCALL.README b/usr/src/uts/common/syscall/SYSCALL.README
new file mode 100644
index 0000000000..2850b2f947
--- /dev/null
+++ b/usr/src/uts/common/syscall/SYSCALL.README
@@ -0,0 +1,306 @@
+
+CDDL HEADER START
+
+The contents of this file are subject to the terms of the
+Common Development and Distribution License, Version 1.0 only
+(the "License"). You may not use this file except in compliance
+with the License.
+
+You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+or http://www.opensolaris.org/os/licensing.
+See the License for the specific language governing permissions
+and limitations under the License.
+
+When distributing Covered Code, include this CDDL HEADER in each
+file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+If applicable, add the following below this CDDL HEADER, with the
+fields enclosed by brackets "[]" replaced with your own identifying
+information: Portions Copyright [yyyy] [name of copyright owner]
+
+CDDL HEADER END
+
+Copyright 2000 Sun Microsystems, Inc. All rights reserved.
+Use is subject to license terms.
+
+ident "%Z%%M% %I% %E% SMI"
+
+System Call Files
+------ ---- -----
+
+The universal dumping grounds for system calls in Solaris 2.x,
+common/os/scalls.c and common/fs/vncalls.c, have been sub-divided into
+smaller files.
+
+The old files had become quite large, and contained a great deal of
+completely unrelated code. From a software engineering standpoint, it
+didn't seem like a good idea to let one system call's code be cognizant
+of the internal interfaces and supporting routines of unrelated system
+calls.
+
+From a practical standpoint, recompiling all of scalls.c or vncalls.c
+after making only a small change in one system call seemed like
+cruel and unusual punishment. Also, running "bringover" after
+changing scalls.c or vncalls.c in one's own environment had a
+high probability of encountering a conflict.
+
+In an attempt to improve maintainability, we have split these files
+and created new directories to hold the results. One hopes that this
+new organization will prove easier to maintain and change.
+
+The principles listed below guided the split-up. Please try to adhere
+to them if you add new system calls.
+
+
+1) System calls now live in directories called "syscall". Architecture
+   independent system calls live in common/syscall and architecture
+   dependent system calls live in sparc/syscall or i86/syscall.
+
+2) Most system calls have their own separate file. We try to keep
+ these files as small as possible.
+
+3) Unrelated system calls should NEVER be put in the same file. Do
+ not consider any of these files "dumping grounds" for new system
+ call work.
+
+4) Some files DO contain more than one system call. This occurs
+ under the following restricted conditions:
+
+ o System calls that are internally related, either because
+ they alone call a set of static functions to do the dirty
+ work, or because they access locally-defined static data.
+ The system calls in sigqueue.c and lwpsys.c are examples
+ of the first case; lwp_sobj.c is an example of the second.
+
+ o Fairly trivial pairs of "get-" and "set-" operation system
+ calls. The file rlimit.c, containing getrlimit() and
+	  setrlimit(), is a case in point.
+
+ o System calls that are basically "variations on a theme,"
+	  such as the different forms of stat in stat.c.
+
+5) If a number of system calls make use of a local function, or,
+ if a function is used more widely than in a few system calls,
+ then perhaps this function needs to be moved to one of the
+ kernel-implementation files in common/os or common/fs. For
+ example, this was done with the functions namesetattr and
+ fdsetattr, which were used by several different system calls.
+ These functions were moved into common/os/fio.c, where they
+ seemed to fit better.
+
+-------------------------------------------------------------------
+System Call Reorganization
+------ ---- --------------
+
+The system calls in common/os/scalls.c and common/fs/vncalls.c
+have been broken up into smaller files. In addition, system
+calls that previously resided in <arch>/os/archdep.c have
+been removed from that file. The table below describes the
+manner in which the files have been split up.
+
+The original syscall files have not been deleted, but have been
+renamed to reflect their diminished contents. The file scalls.c
+has been renamed to ssig.c, and vncalls.c has been renamed to
+poll.c.
+
+
+Syscall Entry Point Old File New File
+------------------- --- ---- --- ----
+gtime scalls.c common/syscall/time.c
+stime scalls.c common/syscall/time.c
+
+adjtime scalls.c common/syscall/adjtime.c
+
+times scalls.c common/syscall/times.c
+
+sysconfig scalls.c common/syscall/sysconfig.c
+
+setuid scalls.c common/syscall/uid.c
+getuid scalls.c common/syscall/uid.c
+seteuid scalls.c common/syscall/uid.c
+
+setgid scalls.c common/syscall/gid.c
+getgid scalls.c common/syscall/gid.c
+setegid scalls.c common/syscall/gid.c
+
+getpid scalls.c common/syscall/getpid.c
+
+setgroups scalls.c common/syscall/groups.c
+getgroups scalls.c common/syscall/groups.c
+
+setpgrp scalls.c common/syscall/pgrpsys.c
+
+pause scalls.c common/syscall/pause.c
+
+ssig scalls.c common/syscall/ssig.c
+
+sigtimedwait scalls.c common/syscall/sigtimedwait.c
+
+sigsuspend scalls.c common/syscall/sigsuspend.c
+
+sigaltstack scalls.c common/syscall/sigaltstack.c
+
+sigpending scalls.c common/syscall/sigpending.c
+
+sigprocmask scalls.c common/syscall/sigprocmask.c
+
+sigaction scalls.c common/syscall/sigaction.c
+
+kill scalls.c common/syscall/sigqueue.c
+sigqueue scalls.c common/syscall/sigqueue.c
+
+sigsendsys scalls.c common/syscall/sigsendset.c
+
+profil scalls.c common/syscall/profil.c
+
+alarm scalls.c common/syscall/alarm.c
+
+umask scalls.c common/syscall/umask.c
+
+ulimit scalls.c common/syscall/rlimit.c
+getrlimit scalls.c common/syscall/rlimit.c
+setrlimit scalls.c common/syscall/rlimit.c
+
+utssys scalls.c common/syscall/utssys.c
+
+uname scalls.c common/syscall/uname.c
+
+uadmin scalls.c common/syscall/uadmin.c
+
+systeminfo scalls.c common/syscall/systeminfo.c
+
+syslwp_create scalls.c common/syscall/lwp_create.c
+syslwp_exit scalls.c common/syscall/lwp_create.c
+
+syslwp_suspend		scalls.c	common/syscall/lwpsys.c
+syslwp_continue scalls.c common/syscall/lwpsys.c
+lwp_kill scalls.c common/syscall/lwpsys.c
+lwp_wait scalls.c common/syscall/lwpsys.c
+
+yield scalls.c common/syscall/yield.c
+
+lwp_self scalls.c common/syscall/lwp_self.c
+
+lwp_info scalls.c common/syscall/lwp_info.c
+
+lwp_mutex_lock scalls.c common/syscall/lwp_sobj.c
+lwp_mutex_unlock scalls.c common/syscall/lwp_sobj.c
+lwp_cond_wait scalls.c common/syscall/lwp_sobj.c
+lwp_cond_signal scalls.c common/syscall/lwp_sobj.c
+lwp_cond_broadcast scalls.c common/syscall/lwp_sobj.c
+lwp_sema_p scalls.c common/syscall/lwp_sobj.c
+lwp_sema_v scalls.c common/syscall/lwp_sobj.c
+
+open vncalls.c common/syscall/open.c
+creat vncalls.c common/syscall/open.c
+
+close vncalls.c common/syscall/close.c
+
+read vncalls.c common/syscall/rw.c
+write vncalls.c common/syscall/rw.c
+pread vncalls.c common/syscall/rw.c
+pwrite vncalls.c common/syscall/rw.c
+readv vncalls.c common/syscall/rw.c
+writev vncalls.c common/syscall/rw.c
+
+chdir vncalls.c common/syscall/chdir.c
+fchdir vncalls.c common/syscall/chdir.c
+chroot vncalls.c common/syscall/chdir.c
+fchroot vncalls.c common/syscall/chdir.c
+
+mknod vncalls.c common/syscall/mknod.c
+xmknod vncalls.c common/syscall/mknod.c
+
+mkdir vncalls.c common/syscall/mkdir.c
+
+link vncalls.c common/syscall/link.c
+
+rename vncalls.c common/syscall/rename.c
+
+symlink vncalls.c common/syscall/symlink.c
+
+unlink vncalls.c common/syscall/unlink.c
+
+rmdir vncalls.c common/syscall/rmdir.c
+
+getdents vncalls.c common/syscall/getdents.c
+
+lseek vncalls.c common/syscall/lseek.c
+llseek vncalls.c common/syscall/lseek.c
+
+access vncalls.c common/syscall/access.c
+
+stat vncalls.c common/syscall/stat.c
+lstat vncalls.c common/syscall/stat.c
+fstat vncalls.c common/syscall/stat.c
+xstat vncalls.c common/syscall/stat.c
+lxstat vncalls.c common/syscall/stat.c
+fxstat vncalls.c common/syscall/stat.c
+
+fpathconf vncalls.c common/syscall/pathconf.c
+pathconf vncalls.c common/syscall/pathconf.c
+
+readlink vncalls.c common/syscall/readlink.c
+
+chmod vncalls.c common/syscall/chmod.c
+fchmod vncalls.c common/syscall/chmod.c
+
+chown vncalls.c common/syscall/chown.c
+lchown vncalls.c common/syscall/chown.c
+fchown vncalls.c common/syscall/chown.c
+
+utime vncalls.c common/syscall/utime.c
+utimes vncalls.c common/syscall/utime.c
+
+fdsync vncalls.c common/syscall/fdsync.c
+
+fcntl vncalls.c common/syscall/fcntl.c
+
+dup vncalls.c common/syscall/dup.c
+
+ioctl vncalls.c common/syscall/ioctl.c
+stty vncalls.c common/syscall/ioctl.c
+gtty vncalls.c common/syscall/ioctl.c
+
+poll vncalls.c common/syscall/poll.c
+
+acl vncalls.c common/syscall/acl.c
+facl vncalls.c common/syscall/acl.c
+
+mount vfs.c common/syscall/mount.c
+
+statfs vfs.c common/syscall/statfs.c
+fstatfs vfs.c common/syscall/statfs.c
+
+statvfs vfs.c common/syscall/statvfs.c
+fstatvfs vfs.c common/syscall/statvfs.c
+
+sync vfs.c common/syscall/sync.c
+
+sysfs vfs.c common/syscall/sysfs.c
+
+umount vfs.c common/syscall/umount.c
+
+nice priocntl.c common/syscall/nice.c
+
+pipe os/pipe.c common/syscall/pipe.c
+
+msgsys os/msg.c common/syscall/msg.c
+
+semsys os/sem.c common/syscall/sem.c
+
+shmsys os/shm.c common/syscall/shm.c
+
+getcontext sparc/archdep.c sparc/syscall/getcontext.c
+lwp_getprivate sparc/archdep.c sparc/syscall/lwp_private.c
+lwp_setprivate sparc/archdep.c sparc/syscall/lwp_private.c
+
+getcontext i86/archdep.c i86/syscall/getcontext.c
+lwp_getprivate i86/archdep.c i86/syscall/lwp_private.c
+lwp_setprivate i86/archdep.c i86/syscall/lwp_private.c
+
+-----------------------------------------------------------------
+
+Most of the system calls in this directory have been converted
+to use C-style argument passing, instead of the old uap-pointer
+method. This usually makes the system calls faster and more
+"natural" in implementation.
diff --git a/usr/src/uts/common/syscall/access.c b/usr/src/uts/common/syscall/access.c
new file mode 100644
index 0000000000..e13a754cc5
--- /dev/null
+++ b/usr/src/uts/common/syscall/access.c
@@ -0,0 +1,114 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred_impl.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+
+/*
+ * Determine accessibility of file.
+ */
+
+#define E_OK 010 /* use effective ids */
+#define R_OK 004
+#define W_OK 002
+#define X_OK 001
+
+int
+access(char *fname, int fmode)
+{
+ vnode_t *vp;
+ cred_t *tmpcr;
+ int error;
+ int mode;
+ int eok;
+ cred_t *cr;
+
+ if (fmode & ~(E_OK|R_OK|W_OK|X_OK))
+ return (set_errno(EINVAL));
+
+ mode = ((fmode & (R_OK|W_OK|X_OK)) << 6);
+
+ cr = CRED();
+
+ /* OK to use effective uid/gid, i.e., no need to crdup(CRED())? */
+ eok = (fmode & E_OK) ||
+ (cr->cr_uid == cr->cr_ruid && cr->cr_gid == cr->cr_rgid);
+
+ if (eok)
+ tmpcr = cr;
+ else {
+ tmpcr = crdup(cr);
+ tmpcr->cr_uid = cr->cr_ruid;
+ tmpcr->cr_gid = cr->cr_rgid;
+ tmpcr->cr_ruid = cr->cr_uid;
+ tmpcr->cr_rgid = cr->cr_gid;
+ }
+
+lookup:
+ if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ if (!eok)
+ crfree(tmpcr);
+ return (set_errno(error));
+ }
+
+ if (mode) {
+ error = VOP_ACCESS(vp, mode, 0, tmpcr);
+ if (error) {
+ if (error == ESTALE) {
+ VN_RELE(vp);
+ goto lookup;
+ }
+ (void) set_errno(error);
+ }
+ }
+
+ if (!eok)
+ crfree(tmpcr);
+ VN_RELE(vp);
+ return (error);
+}
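
A minimal user-level sketch of the semantics implemented above, using
the standard <unistd.h> interface: access(2) checks permissions against
the real uid/gid, which is exactly why the kernel code swaps the real
and effective ids into a duplicated credential before calling
VOP_ACCESS().

    #include <unistd.h>
    #include <stdio.h>

    int
    main(void)
    {
            /*
             * Even in a set-uid program, this asks whether the
             * *real* user could write the file.
             */
            if (access("/etc/passwd", W_OK) == 0)
                    (void) printf("real uid may write /etc/passwd\n");
            else
                    perror("access");
            return (0);
    }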
diff --git a/usr/src/uts/common/syscall/acctctl.c b/usr/src/uts/common/syscall/acctctl.c
new file mode 100644
index 0000000000..8c134b0a62
--- /dev/null
+++ b/usr/src/uts/common/syscall/acctctl.c
@@ -0,0 +1,620 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/user.h>
+#include <sys/cred.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/pathname.h>
+#include <sys/modctl.h>
+#include <sys/acctctl.h>
+#include <sys/bitmap.h>
+#include <sys/exacct.h>
+#include <sys/policy.h>
+
+/*
+ * acctctl(2)
+ *
+ * acctctl() provides the administrative interface to the extended accounting
+ * subsystem. The process and task accounting facilities are configurable:
+ * resources can be individually specified for recording in the appropriate
+ * accounting file.
+ *
+ * The current implementation of acctctl() requires that the process, task,
+ * and flow files be distinct across all zones.
+ *
+ * Locking
+ *	Each accounting species has an ac_info_t, which contains a mutex
+ *	used to protect the ac_info_t's contents and to serialize access
+ *	to the appropriate file.
+ */
+
+static list_t exacct_globals_list;
+static kmutex_t exacct_globals_list_lock;
+
+static int
+ac_state_set(ac_info_t *info, void *buf, size_t bufsz)
+{
+ int state;
+
+ if (buf == NULL || (bufsz != sizeof (int)))
+ return (EINVAL);
+
+ if (copyin(buf, &state, bufsz) != 0)
+ return (EFAULT);
+
+ if (state != AC_ON && state != AC_OFF)
+ return (EINVAL);
+
+ mutex_enter(&info->ac_lock);
+ info->ac_state = state;
+ mutex_exit(&info->ac_lock);
+ return (0);
+}
+
+static int
+ac_state_get(ac_info_t *info, void *buf, size_t bufsz)
+{
+ if (buf == NULL || (bufsz != sizeof (int)))
+ return (EINVAL);
+
+ mutex_enter(&info->ac_lock);
+ if (copyout(&info->ac_state, buf, bufsz) != 0) {
+ mutex_exit(&info->ac_lock);
+ return (EFAULT);
+ }
+ mutex_exit(&info->ac_lock);
+ return (0);
+}
+
+static boolean_t
+ac_file_in_use(vnode_t *vp)
+{
+ boolean_t in_use = B_FALSE;
+ struct exacct_globals *acg;
+
+ if (vp == NULL)
+ return (B_FALSE);
+ mutex_enter(&exacct_globals_list_lock);
+ /*
+ * Start off by grabbing all locks.
+ */
+ for (acg = list_head(&exacct_globals_list); acg != NULL;
+ acg = list_next(&exacct_globals_list, acg)) {
+ mutex_enter(&acg->ac_proc.ac_lock);
+ mutex_enter(&acg->ac_task.ac_lock);
+ mutex_enter(&acg->ac_flow.ac_lock);
+ }
+
+ for (acg = list_head(&exacct_globals_list); !in_use && acg != NULL;
+ acg = list_next(&exacct_globals_list, acg)) {
+ /*
+ * We need to verify that we aren't already using this file for
+ * accounting in any zone.
+ */
+ if (vn_compare(acg->ac_proc.ac_vnode, vp) ||
+ vn_compare(acg->ac_task.ac_vnode, vp) ||
+ vn_compare(acg->ac_flow.ac_vnode, vp))
+ in_use = B_TRUE;
+ }
+
+ /*
+ * Drop all locks.
+ */
+ for (acg = list_head(&exacct_globals_list); acg != NULL;
+ acg = list_next(&exacct_globals_list, acg)) {
+ mutex_exit(&acg->ac_proc.ac_lock);
+ mutex_exit(&acg->ac_task.ac_lock);
+ mutex_exit(&acg->ac_flow.ac_lock);
+ }
+ mutex_exit(&exacct_globals_list_lock);
+ return (in_use);
+}
+
+static int
+ac_file_set(ac_info_t *info, void *ubuf, size_t bufsz)
+{
+ int error = 0;
+ void *kbuf;
+ void *namebuf;
+ int namelen;
+ vnode_t *vp;
+ void *hdr;
+ size_t hdrsize;
+
+ if (ubuf == NULL) {
+ mutex_enter(&info->ac_lock);
+
+ /*
+ * Closing accounting file
+ */
+ if (info->ac_vnode != NULL) {
+ error = VOP_CLOSE(info->ac_vnode, FWRITE, 1, 0, CRED());
+ if (error) {
+ mutex_exit(&info->ac_lock);
+ return (error);
+ }
+ VN_RELE(info->ac_vnode);
+ info->ac_vnode = NULL;
+ }
+ if (info->ac_file != NULL) {
+ kmem_free(info->ac_file, strlen(info->ac_file) + 1);
+ info->ac_file = NULL;
+ }
+
+ mutex_exit(&info->ac_lock);
+ return (error);
+ }
+
+ if (bufsz < 2 || bufsz > MAXPATHLEN)
+ return (EINVAL);
+
+ /*
+ * We have to copy in the whole buffer since we can't tell the length
+ * of the string in user's address space.
+	 * of the string in the user's address space.
+ kbuf = kmem_zalloc(bufsz, KM_SLEEP);
+ if ((error = copyinstr((char *)ubuf, (char *)kbuf, bufsz, NULL)) != 0) {
+ kmem_free(kbuf, bufsz);
+ return (error);
+ }
+ if (*((char *)kbuf) != '/') {
+ kmem_free(kbuf, bufsz);
+ return (EINVAL);
+ }
+
+ /*
+ * Now, allocate the space where we are going to save the
+ * name of the accounting file and kmem_free kbuf. We have to do this
+ * now because it is not good to sleep in kmem_alloc() while
+ * holding ac_info's lock.
+ */
+ namelen = strlen(kbuf) + 1;
+ namebuf = kmem_alloc(namelen, KM_SLEEP);
+ (void) strcpy(namebuf, kbuf);
+ kmem_free(kbuf, bufsz);
+
+ /*
+ * Check if this file already exists.
+ */
+ error = lookupname(namebuf, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
+
+ /*
+ * Check if the file is already in use.
+ */
+ if (!error) {
+ if (ac_file_in_use(vp)) {
+ /*
+ * If we're already using it then return EBUSY
+ */
+ kmem_free(namebuf, namelen);
+ VN_RELE(vp);
+ return (EBUSY);
+ }
+ VN_RELE(vp);
+ }
+
+ /*
+ * Now, grab info's ac_lock and try to set up everything.
+ */
+ mutex_enter(&info->ac_lock);
+
+ if ((error = vn_open(namebuf, UIO_SYSSPACE,
+ FCREAT | FWRITE | FTRUNC, 0600, &vp, CRCREAT, 0)) != 0) {
+ mutex_exit(&info->ac_lock);
+ kmem_free(namebuf, namelen);
+ return (error);
+ }
+
+ if (vp->v_type != VREG) {
+ VN_RELE(vp);
+ mutex_exit(&info->ac_lock);
+ kmem_free(namebuf, namelen);
+ return (EACCES);
+ }
+
+ if (info->ac_vnode != NULL) {
+ /*
+ * Switch from an old file to a new file by swapping
+ * their vnode pointers.
+ */
+ vnode_t *oldvp;
+ oldvp = info->ac_vnode;
+ info->ac_vnode = vp;
+ vp = oldvp;
+ } else {
+ /*
+ * Start writing accounting records to a new file.
+ */
+ info->ac_vnode = vp;
+ vp = NULL;
+ }
+ if (vp) {
+ /*
+ * We still need to close the old file.
+ */
+ if ((error = VOP_CLOSE(vp, FWRITE, 1, 0, CRED())) != 0) {
+ VN_RELE(vp);
+ mutex_exit(&info->ac_lock);
+ kmem_free(namebuf, namelen);
+ return (error);
+ }
+ VN_RELE(vp);
+ if (info->ac_file != NULL) {
+ kmem_free(info->ac_file,
+ strlen(info->ac_file) + 1);
+ info->ac_file = NULL;
+ }
+ }
+ /*
+ * Finally, point ac_file to the filename string and release the lock.
+ */
+ info->ac_file = namebuf;
+ mutex_exit(&info->ac_lock);
+
+ /*
+ * Create and write an exacct header to the file.
+ */
+ hdr = exacct_create_header(&hdrsize);
+ error = exacct_write_header(info, hdr, hdrsize);
+
+ return (error);
+}
+
+static int
+ac_file_get(ac_info_t *info, void *buf, size_t bufsz)
+{
+ int error = 0;
+ vnode_t *vnode;
+ char *file;
+
+ mutex_enter(&info->ac_lock);
+ file = info->ac_file;
+ vnode = info->ac_vnode;
+
+ if (file == NULL || vnode == NULL) {
+ mutex_exit(&info->ac_lock);
+ return (ENOTACTIVE);
+ }
+
+ if (strlen(file) >= bufsz)
+ error = ENOMEM;
+ else
+ error = copyoutstr(file, buf, MAXPATHLEN, NULL);
+
+ mutex_exit(&info->ac_lock);
+ return (error);
+}
+
+static int
+ac_res_set(ac_info_t *info, void *buf, size_t bufsz, int maxres)
+{
+ ac_res_t *res;
+ ac_res_t *tmp;
+ ulong_t *maskp;
+ int id;
+ uint_t counter = 0;
+
+ /*
+ * Validate that a non-zero buffer, sized within limits and to an
+ * integral number of ac_res_t's has been specified.
+	 * integral number of ac_res_t's, has been specified.
+ if (bufsz == 0 ||
+ bufsz > sizeof (ac_res_t) * (AC_MAX_RES + 1) ||
+ (bufsz / sizeof (ac_res_t)) * sizeof (ac_res_t) != bufsz)
+ return (EINVAL);
+
+ tmp = res = kmem_alloc(bufsz, KM_SLEEP);
+ if (copyin(buf, res, bufsz) != 0) {
+ kmem_free(res, bufsz);
+ return (EFAULT);
+ }
+
+ maskp = (ulong_t *)&info->ac_mask;
+
+ mutex_enter(&info->ac_lock);
+ while ((id = tmp->ar_id) != AC_NONE && counter < maxres + 1) {
+ if (id > maxres || id < 0) {
+ mutex_exit(&info->ac_lock);
+ kmem_free(res, bufsz);
+ return (EINVAL);
+ }
+ if (tmp->ar_state == AC_ON) {
+ BT_SET(maskp, id);
+ } else if (tmp->ar_state == AC_OFF) {
+ BT_CLEAR(maskp, id);
+ } else {
+ mutex_exit(&info->ac_lock);
+ kmem_free(res, bufsz);
+ return (EINVAL);
+ }
+ tmp++;
+ counter++;
+ }
+ mutex_exit(&info->ac_lock);
+ kmem_free(res, bufsz);
+ return (0);
+}
+
+static int
+ac_res_get(ac_info_t *info, void *buf, size_t bufsz, int maxres)
+{
+ int error = 0;
+ ac_res_t *res;
+ ac_res_t *tmp;
+ size_t ressz = sizeof (ac_res_t) * (maxres + 1);
+ ulong_t *maskp;
+ int id;
+
+ if (bufsz < ressz)
+ return (EINVAL);
+ tmp = res = kmem_alloc(ressz, KM_SLEEP);
+
+ mutex_enter(&info->ac_lock);
+ maskp = (ulong_t *)&info->ac_mask;
+ for (id = 1; id <= maxres; id++) {
+ tmp->ar_id = id;
+ tmp->ar_state = BT_TEST(maskp, id);
+ tmp++;
+ }
+ tmp->ar_id = AC_NONE;
+ tmp->ar_state = AC_OFF;
+ mutex_exit(&info->ac_lock);
+ error = copyout(res, buf, ressz);
+ kmem_free(res, ressz);
+ return (error);
+}
+
+/*
+ * acctctl()
+ *
+ * Overview
+ * acctctl() is the entry point for the acctctl(2) system call.
+ *
+ * Return values
+ * On successful completion, return 0; otherwise -1 is returned and errno is
+ * set appropriately.
+ *
+ * Caller's context
+ * Called from the system call path.
+ */
+int
+acctctl(int cmd, void *buf, size_t bufsz)
+{
+ int error = 0;
+ int mode = AC_MODE(cmd);
+ int option = AC_OPTION(cmd);
+ int maxres;
+ ac_info_t *info;
+ zone_t *zone = curproc->p_zone;
+ struct exacct_globals *acg;
+
+ acg = zone_getspecific(exacct_zone_key, zone);
+ /*
+ * exacct_zone_key and associated per-zone state were initialized when
+ * the module was loaded.
+ */
+ ASSERT(exacct_zone_key != ZONE_KEY_UNINITIALIZED);
+ ASSERT(acg != NULL);
+
+ switch (mode) { /* sanity check */
+ case AC_TASK:
+ info = &acg->ac_task;
+ maxres = AC_TASK_MAX_RES;
+ break;
+ case AC_PROC:
+ info = &acg->ac_proc;
+ maxres = AC_PROC_MAX_RES;
+ break;
+ case AC_FLOW:
+ /*
+ * Flow accounting isn't currently configurable in non-global
+ * zones, but we have this field on a per-zone basis for future
+ * expansion as well as the ability to return default "unset"
+ * values for the various AC_*_GET queries. AC_*_SET commands
+ * fail with EPERM for AC_FLOW in non-global zones.
+ */
+ info = &acg->ac_flow;
+ maxres = AC_FLOW_MAX_RES;
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ switch (option) {
+ case AC_STATE_SET:
+ if ((error = secpolicy_acct(CRED())) != 0)
+ break;
+ if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) {
+ error = EPERM;
+ break;
+ }
+ error = ac_state_set(info, buf, bufsz);
+ break;
+ case AC_STATE_GET:
+ error = ac_state_get(info, buf, bufsz);
+ break;
+ case AC_FILE_SET:
+ if ((error = secpolicy_acct(CRED())) != 0)
+ break;
+ if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) {
+ error = EPERM;
+ break;
+ }
+ error = ac_file_set(info, buf, bufsz);
+ break;
+ case AC_FILE_GET:
+ error = ac_file_get(info, buf, bufsz);
+ break;
+ case AC_RES_SET:
+ if ((error = secpolicy_acct(CRED())) != 0)
+ break;
+ if (mode == AC_FLOW && getzoneid() != GLOBAL_ZONEID) {
+ error = EPERM;
+ break;
+ }
+ error = ac_res_set(info, buf, bufsz, maxres);
+ break;
+ case AC_RES_GET:
+ error = ac_res_get(info, buf, bufsz, maxres);
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+static struct sysent ac_sysent = {
+ 3,
+ SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
+ acctctl
+};
+
+static struct modlsys modlsys = {
+ &mod_syscallops,
+ "acctctl system call",
+ &ac_sysent
+};
+
+#ifdef _SYSCALL32_IMPL
+static struct modlsys modlsys32 = {
+ &mod_syscallops32,
+ "32-bit acctctl system call",
+ &ac_sysent
+};
+#endif
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modlsys,
+#ifdef _SYSCALL32_IMPL
+ &modlsys32,
+#endif
+ NULL
+};
+
+/* ARGSUSED */
+static void *
+exacct_zone_init(zoneid_t zoneid)
+{
+ struct exacct_globals *acg;
+
+ acg = kmem_zalloc(sizeof (*acg), KM_SLEEP);
+ mutex_enter(&exacct_globals_list_lock);
+ list_insert_tail(&exacct_globals_list, acg);
+ mutex_exit(&exacct_globals_list_lock);
+ return (acg);
+}
+
+static void
+exacct_free_info(ac_info_t *info)
+{
+ mutex_enter(&info->ac_lock);
+ if (info->ac_vnode) {
+ (void) VOP_CLOSE(info->ac_vnode, FWRITE, 1, 0, kcred);
+ VN_RELE(info->ac_vnode);
+ kmem_free(info->ac_file, strlen(info->ac_file) + 1);
+ }
+ info->ac_state = AC_OFF;
+ info->ac_vnode = NULL;
+ info->ac_file = NULL;
+ mutex_exit(&info->ac_lock);
+}
+
+/* ARGSUSED */
+static void
+exacct_zone_shutdown(zoneid_t zoneid, void *data)
+{
+ struct exacct_globals *acg = data;
+
+ /*
+ * The accounting files need to be closed during shutdown rather than
+ * destroy, since otherwise the filesystem they reside on may fail to
+ * unmount, thus causing the entire zone halt/reboot to fail.
+ */
+ exacct_free_info(&acg->ac_proc);
+ exacct_free_info(&acg->ac_task);
+ exacct_free_info(&acg->ac_flow);
+}
+
+/* ARGSUSED */
+static void
+exacct_zone_fini(zoneid_t zoneid, void *data)
+{
+ struct exacct_globals *acg = data;
+
+ mutex_enter(&exacct_globals_list_lock);
+ list_remove(&exacct_globals_list, acg);
+ mutex_exit(&exacct_globals_list_lock);
+
+ mutex_destroy(&acg->ac_proc.ac_lock);
+ mutex_destroy(&acg->ac_task.ac_lock);
+ mutex_destroy(&acg->ac_flow.ac_lock);
+ kmem_free(acg, sizeof (*acg));
+}
+
+int
+_init()
+{
+ int error;
+
+ mutex_init(&exacct_globals_list_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&exacct_globals_list, sizeof (struct exacct_globals),
+ offsetof(struct exacct_globals, ac_link));
+ zone_key_create(&exacct_zone_key, exacct_zone_init,
+ exacct_zone_shutdown, exacct_zone_fini);
+
+ if ((error = mod_install(&modlinkage)) != 0) {
+ (void) zone_key_delete(exacct_zone_key);
+ exacct_zone_key = ZONE_KEY_UNINITIALIZED;
+ mutex_destroy(&exacct_globals_list_lock);
+ list_destroy(&exacct_globals_list);
+ }
+ return (error);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini()
+{
+ return (EBUSY);
+}
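
For illustration, a hedged sketch of how a privileged caller might
drive ac_res_set() above through the userland wrapper documented in
acctctl(2). It assumes the cmd encoding from <sys/acctctl.h>, where a
mode such as AC_PROC is OR'ed with an option such as AC_RES_SET, and
uses AC_PROC_PID purely as an example resource id:

    #include <sys/acctctl.h>
    #include <string.h>

    int
    enable_pid_accounting(void)
    {
            ac_res_t res[2];

            /*
             * ac_res_set() walks the array until it finds an entry
             * whose ar_id is AC_NONE, so the list must be terminated.
             */
            (void) memset(res, 0, sizeof (res));
            res[0].ar_id = AC_PROC_PID;     /* assumed example id */
            res[0].ar_state = AC_ON;
            res[1].ar_id = AC_NONE;
            res[1].ar_state = AC_OFF;

            return (acctctl(AC_PROC | AC_RES_SET, res, sizeof (res)));
    }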
diff --git a/usr/src/uts/common/syscall/acl.c b/usr/src/uts/common/syscall/acl.c
new file mode 100644
index 0000000000..a52184ec2e
--- /dev/null
+++ b/usr/src/uts/common/syscall/acl.c
@@ -0,0 +1,430 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/uio.h>
+#include <sys/kmem.h>
+#include <sys/filio.h>
+#include <sys/acl.h>
+#include <sys/cmn_err.h>
+
+#include <sys/unistd.h>
+#include <sys/debug.h>
+
+static int cacl(int cmd, int nentries, void *aclbufp,
+ vnode_t *vp, int *rv);
+
+/*
+ * Get/Set ACL of a file.
+ */
+int
+acl(const char *fname, int cmd, int nentries, void *aclbufp)
+{
+ struct vnode *vp;
+ int error;
+ int rv = 0;
+
+ /* Sanity check arguments */
+ if (fname == NULL)
+ return (set_errno(EINVAL));
+lookup:
+ error = lookupname((char *)fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+
+ error = cacl(cmd, nentries, aclbufp, vp, &rv);
+ VN_RELE(vp);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (rv);
+}
+
+/*
+ * Get/Set ACL of a file with facl system call.
+ */
+int
+facl(int fdes, int cmd, int nentries, void *aclbufp)
+{
+ file_t *fp;
+ int error;
+ int rv = 0;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+#ifdef C2_AUDIT
+ if (fp->f_flag & FREVOKED) {
+ releasef(fdes);
+ return (set_errno(EBADF));
+ }
+#endif /* C2_AUDIT */
+
+ error = cacl(cmd, nentries, aclbufp, fp->f_vnode, &rv);
+ releasef(fdes);
+
+ if (error)
+ return (set_errno(error));
+ return (rv);
+}
+
+
+/*
+ * Common code for acl() and facl().
+ */
+static int
+cacl(int cmd, int nentries, void *aclbufp, vnode_t *vp, int *rv)
+{
+ int error;
+ int aclbsize; /* size of acl list in bytes */
+ int dfaclbsize; /* size of default acl list in bytes */
+ int numacls;
+ caddr_t uaddrp;
+ aclent_t *aclp, *aaclp;
+ vsecattr_t vsecattr;
+
+ ASSERT(vp);
+
+ bzero(&vsecattr, sizeof (vsecattr_t));
+
+ switch (cmd) {
+
+ case ACE_GETACLCNT:
+ case GETACLCNT:
+ if (cmd == GETACLCNT)
+ vsecattr.vsa_mask = VSA_ACLCNT | VSA_DFACLCNT;
+ else
+ vsecattr.vsa_mask = VSA_ACECNT;
+ if (error = VOP_GETSECATTR(vp, &vsecattr, 0, CRED()))
+ return (error);
+ *rv = vsecattr.vsa_aclcnt + vsecattr.vsa_dfaclcnt;
+ if (vsecattr.vsa_aclcnt && vsecattr.vsa_aclentp) {
+ kmem_free(vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt * sizeof (aclent_t));
+ }
+ if (vsecattr.vsa_dfaclcnt && vsecattr.vsa_dfaclentp) {
+ kmem_free(vsecattr.vsa_dfaclentp,
+ vsecattr.vsa_dfaclcnt * sizeof (aclent_t));
+ }
+ break;
+ case GETACL:
+ /*
+ * Minimum ACL size is three entries so might as well
+ * bail out here.
+ */
+ if (nentries < 3)
+ return (EINVAL);
+ /*
+ * NULL output buffer is also a pretty easy bail out.
+ */
+ if (aclbufp == NULL)
+ return (EFAULT);
+ vsecattr.vsa_mask = VSA_ACL | VSA_ACLCNT | VSA_DFACL |
+ VSA_DFACLCNT;
+ if (error = VOP_GETSECATTR(vp, &vsecattr, 0, CRED()))
+ return (error);
+ /* Check user's buffer is big enough */
+ numacls = vsecattr.vsa_aclcnt + vsecattr.vsa_dfaclcnt;
+ aclbsize = vsecattr.vsa_aclcnt * sizeof (aclent_t);
+ dfaclbsize = vsecattr.vsa_dfaclcnt * sizeof (aclent_t);
+ if (numacls > nentries) {
+ error = ENOSPC;
+ goto errout;
+ }
+ /* Sort the acl & default acl lists */
+ if (vsecattr.vsa_aclcnt > 1)
+ ksort((caddr_t)vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt, sizeof (aclent_t), cmp2acls);
+ if (vsecattr.vsa_dfaclcnt > 1)
+ ksort((caddr_t)vsecattr.vsa_dfaclentp,
+ vsecattr.vsa_dfaclcnt, sizeof (aclent_t), cmp2acls);
+ /* Copy out acl's */
+ uaddrp = (caddr_t)aclbufp;
+ if (aclbsize > 0) { /* bug #1262490 */
+ if (copyout(vsecattr.vsa_aclentp, uaddrp, aclbsize)) {
+ error = EFAULT;
+ goto errout;
+ }
+ }
+ /* Copy out default acl's */
+ if (dfaclbsize > 0) {
+ uaddrp += aclbsize;
+ if (copyout(vsecattr.vsa_dfaclentp,
+ uaddrp, dfaclbsize)) {
+ error = EFAULT;
+ goto errout;
+ }
+ }
+ *rv = numacls;
+ if (vsecattr.vsa_aclcnt) {
+ kmem_free(vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt * sizeof (aclent_t));
+ }
+ if (vsecattr.vsa_dfaclcnt) {
+ kmem_free(vsecattr.vsa_dfaclentp,
+ vsecattr.vsa_dfaclcnt * sizeof (aclent_t));
+ }
+ break;
+
+ case ACE_GETACL:
+ if (nentries < 3)
+ return (EINVAL);
+
+ if (aclbufp == NULL)
+ return (EFAULT);
+
+ vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
+ if (error = VOP_GETSECATTR(vp, &vsecattr, 0, CRED()))
+ return (error);
+
+ aclbsize = vsecattr.vsa_aclcnt * sizeof (ace_t);
+ if (vsecattr.vsa_aclcnt > nentries) {
+ error = ENOSPC;
+ goto errout;
+ }
+
+ if (aclbsize > 0) {
+ if ((error = copyout(vsecattr.vsa_aclentp,
+ aclbufp, aclbsize)) != 0) {
+ goto errout;
+ }
+ }
+
+ *rv = vsecattr.vsa_aclcnt;
+ if (vsecattr.vsa_aclcnt) {
+ kmem_free(vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt * sizeof (ace_t));
+ }
+ break;
+
+ case SETACL:
+ /*
+ * Minimum ACL size is three entries so might as well
+ * bail out here. Also limit request size to prevent user
+ * from allocating too much kernel memory. Maximum size
+ * is MAX_ACL_ENTRIES for the ACL part and MAX_ACL_ENTRIES
+ * for the default ACL part. (bug 4058667)
+ */
+ if (nentries < 3 || nentries > (MAX_ACL_ENTRIES * 2))
+ return (EINVAL);
+ /*
+ * NULL output buffer is also an easy bail out.
+ */
+ if (aclbufp == NULL)
+ return (EFAULT);
+ vsecattr.vsa_mask = VSA_ACL;
+ aclbsize = nentries * sizeof (aclent_t);
+ vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
+ aaclp = vsecattr.vsa_aclentp;
+ vsecattr.vsa_aclcnt = nentries;
+ uaddrp = (caddr_t)aclbufp;
+ if (copyin(uaddrp, vsecattr.vsa_aclentp, aclbsize)) {
+ kmem_free(aaclp, aclbsize);
+ return (EFAULT);
+ }
+ /* Sort the acl list */
+ ksort((caddr_t)vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt, sizeof (aclent_t), cmp2acls);
+
+ /* Break into acl and default acl lists */
+ for (numacls = 0, aclp = vsecattr.vsa_aclentp;
+ numacls < vsecattr.vsa_aclcnt;
+ aclp++, numacls++) {
+ if (aclp->a_type & ACL_DEFAULT)
+ break;
+ }
+
+ /* Find where defaults start (if any) */
+ if (numacls < vsecattr.vsa_aclcnt) {
+ vsecattr.vsa_mask |= VSA_DFACL;
+ vsecattr.vsa_dfaclcnt = nentries - numacls;
+ vsecattr.vsa_dfaclentp = aclp;
+ vsecattr.vsa_aclcnt = numacls;
+ }
+ /* Adjust if they're all defaults */
+ if (vsecattr.vsa_aclcnt == 0) {
+ vsecattr.vsa_mask &= ~VSA_ACL;
+ vsecattr.vsa_aclentp = NULL;
+ }
+ /* Only directories can have defaults */
+ if (vsecattr.vsa_dfaclcnt && vp->v_type != VDIR) {
+ kmem_free(aaclp, aclbsize);
+ return (ENOTDIR);
+ }
+ (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
+ if (error = VOP_SETSECATTR(vp, &vsecattr, 0, CRED())) {
+ kmem_free(aaclp, aclbsize);
+ VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
+ return (error);
+ }
+
+ /*
+ * Should return 0 upon success according to the man page
+ * and SVR4 semantics. (Bug #1214399: SETACL returns wrong rc)
+ */
+ *rv = 0;
+ kmem_free(aaclp, aclbsize);
+ VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
+ break;
+
+ case ACE_SETACL:
+ if (nentries < 3 || nentries > (MAX_ACL_ENTRIES * 2))
+ return (EINVAL);
+
+ if (aclbufp == NULL)
+ return (EFAULT);
+
+ vsecattr.vsa_mask = VSA_ACE;
+ aclbsize = nentries * sizeof (ace_t);
+ vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
+ aaclp = vsecattr.vsa_aclentp;
+ vsecattr.vsa_aclcnt = nentries;
+ uaddrp = (caddr_t)aclbufp;
+ if (copyin(uaddrp, vsecattr.vsa_aclentp, aclbsize)) {
+ kmem_free(aaclp, aclbsize);
+ return (EFAULT);
+ }
+ (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
+ if (error = VOP_SETSECATTR(vp, &vsecattr, 0, CRED())) {
+ kmem_free(aaclp, aclbsize);
+ VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
+ return (error);
+ }
+ *rv = 0;
+ kmem_free(aaclp, aclbsize);
+ VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+
+errout:
+ if (aclbsize && vsecattr.vsa_aclentp)
+ kmem_free(vsecattr.vsa_aclentp, aclbsize);
+ if (dfaclbsize && vsecattr.vsa_dfaclentp)
+ kmem_free(vsecattr.vsa_dfaclentp, dfaclbsize);
+ return (error);
+}
+
+
+/*
+ * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified.
+ * v = Ptr to array/vector of objs
+ * n = # objs in the array
+ * s = size of each obj (must be a multiple of the word size)
+ * f = ptr to function to compare two objs;
+ *     returns -1 (less than), 0 (equal), or 1 (greater than)
+ */
+void
+ksort(caddr_t v, int n, int s, int (*f)())
+{
+ int g, i, j, ii;
+ unsigned int *p1, *p2;
+ unsigned int tmp;
+
+ /* No work to do */
+ if (v == NULL || n <= 1)
+ return;
+
+ /* Sanity check on arguments */
+ ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
+ ASSERT(s > 0);
+ for (g = n / 2; g > 0; g /= 2) {
+ for (i = g; i < n; i++) {
+ for (j = i - g; j >= 0 &&
+ (*f)(v + j * s, v + (j + g) * s) == 1;
+ j -= g) {
+ p1 = (unsigned *)(v + j * s);
+ p2 = (unsigned *)(v + (j + g) * s);
+ for (ii = 0; ii < s / 4; ii++) {
+ tmp = *p1;
+ *p1++ = *p2;
+ *p2++ = tmp;
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Compare two acls, all fields. Returns:
+ * -1 (less than)
+ * 0 (equal)
+ * +1 (greater than)
+ */
+int
+cmp2acls(void *a, void *b)
+{
+ aclent_t *x = (aclent_t *)a;
+ aclent_t *y = (aclent_t *)b;
+
+ /* Compare types */
+ if (x->a_type < y->a_type)
+ return (-1);
+ if (x->a_type > y->a_type)
+ return (1);
+ /* Equal types; compare id's */
+ if (x->a_id < y->a_id)
+ return (-1);
+ if (x->a_id > y->a_id)
+ return (1);
+ /* Equal ids; compare perms */
+ if (x->a_perm < y->a_perm)
+ return (-1);
+ if (x->a_perm > y->a_perm)
+ return (1);
+ /* Totally equal */
+ return (0);
+}
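
The GETACLCNT/GETACL commands above support the usual two-step pattern:
size the buffer first, then fetch. A user-level sketch against the
acl(2) interface:

    #include <sys/acl.h>
    #include <stdlib.h>

    aclent_t *
    read_acl(char *path, int *countp)
    {
            int n;
            aclent_t *entries;

            /* First call: how many entries? (Minimum is three.) */
            if ((n = acl(path, GETACLCNT, 0, NULL)) < 0)
                    return (NULL);

            if ((entries = malloc(n * sizeof (aclent_t))) == NULL)
                    return (NULL);

            /* Second call: ENOSPC means the ACL grew in between. */
            if (acl(path, GETACL, n, entries) < 0) {
                    free(entries);
                    return (NULL);
            }
            *countp = n;
            return (entries);
    }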
diff --git a/usr/src/uts/common/syscall/adjtime.c b/usr/src/uts/common/syscall/adjtime.c
new file mode 100644
index 0000000000..dc2dde5306
--- /dev/null
+++ b/usr/src/uts/common/syscall/adjtime.c
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright 1999,2001-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/tuneable.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/time.h>
+#include <sys/debug.h>
+#include <sys/model.h>
+#include <sys/policy.h>
+
+int
+adjtime(struct timeval *delta, struct timeval *olddelta)
+{
+ struct timeval atv, oatv;
+ int64_t ndelta;
+ int64_t old_delta;
+ int s;
+ model_t datamodel = get_udatamodel();
+
+ if (secpolicy_settime(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyin(delta, &atv, sizeof (atv)))
+ return (set_errno(EFAULT));
+ } else {
+ struct timeval32 atv32;
+
+ if (copyin(delta, &atv32, sizeof (atv32)))
+ return (set_errno(EFAULT));
+ TIMEVAL32_TO_TIMEVAL(&atv, &atv32);
+ }
+
+ if (atv.tv_usec <= -MICROSEC || atv.tv_usec >= MICROSEC)
+ return (set_errno(EINVAL));
+
+ /*
+ * The SVID specifies that if delta is 0, then there is
+ * no effect upon time correction, just return olddelta.
+ */
+ ndelta = (int64_t)atv.tv_sec * NANOSEC + atv.tv_usec * 1000;
+ mutex_enter(&tod_lock);
+ s = hr_clock_lock();
+ old_delta = timedelta;
+ if (ndelta)
+ timedelta = ndelta;
+ /*
+ * Always set tod_needsync on all adjtime() calls, since it implies
+ * someone is watching over us and keeping the local clock in sync.
+ */
+ tod_needsync = 1;
+ hr_clock_unlock(s);
+ mutex_exit(&tod_lock);
+
+ if (olddelta) {
+ oatv.tv_sec = old_delta / NANOSEC;
+ oatv.tv_usec = (old_delta % NANOSEC) / 1000;
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyout(&oatv, olddelta, sizeof (oatv)))
+ return (set_errno(EFAULT));
+ } else {
+ struct timeval32 oatv32;
+
+ if (TIMEVAL_OVERFLOW(&oatv))
+ return (set_errno(EOVERFLOW));
+
+ TIMEVAL_TO_TIMEVAL32(&oatv32, &oatv);
+
+ if (copyout(&oatv32, olddelta, sizeof (oatv32)))
+ return (set_errno(EFAULT));
+ }
+ }
+ return (0);
+}
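
As the SVID note above implies, a zero delta turns adjtime(2) into a
read-only query of the pending correction. A small sketch; note that
privilege is still required, since secpolicy_settime() is checked
before the delta is examined:

    #include <sys/time.h>
    #include <stdio.h>

    int
    main(void)
    {
            struct timeval zero = { 0, 0 };
            struct timeval old;

            /* delta of 0: leave the correction alone, just read it */
            if (adjtime(&zero, &old) != 0) {
                    perror("adjtime");
                    return (1);
            }
            (void) printf("pending correction: %ld s %ld us\n",
                (long)old.tv_sec, (long)old.tv_usec);
            return (0);
    }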
diff --git a/usr/src/uts/common/syscall/alarm.c b/usr/src/uts/common/syscall/alarm.c
new file mode 100644
index 0000000000..15027cdd82
--- /dev/null
+++ b/usr/src/uts/common/syscall/alarm.c
@@ -0,0 +1,87 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright (c) 1999-2001 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/signal.h>
+#include <sys/proc.h>
+#include <sys/time.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+
+static void
+sigalarm2proc(void *arg)
+{
+ proc_t *p = arg;
+
+ mutex_enter(&p->p_lock);
+ p->p_alarmid = 0;
+ sigtoproc(p, NULL, SIGALRM);
+ mutex_exit(&p->p_lock);
+}
+
+int
+alarm(int deltat)
+{
+ proc_t *p = ttoproc(curthread);
+ clock_t del = 0;
+ clock_t ret;
+ timeout_id_t tmp_id;
+
+ /*
+ * We must single-thread this code relative to other
+ * lwps in the same process also performing an alarm().
+ * The mutex dance in the while loop is necessary because
+ * we cannot call untimeout() while holding a lock that
+ * is grabbed by the timeout function, sigalarm2proc().
+ * We can, however, hold p->p_lock across realtime_timeout().
+ */
+ mutex_enter(&p->p_lock);
+ while ((tmp_id = p->p_alarmid) != 0) {
+ p->p_alarmid = 0;
+ mutex_exit(&p->p_lock);
+ del = untimeout(tmp_id);
+ mutex_enter(&p->p_lock);
+ }
+
+ if (del < 0)
+ ret = 0;
+ else
+ ret = (del + hz - 1) / hz; /* convert to seconds */
+ if (deltat)
+ p->p_alarmid = realtime_timeout(sigalarm2proc, p, deltat * hz);
+ mutex_exit(&p->p_lock);
+ return (ret);
+}
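
A user-level sketch of the return-value contract implemented above:
alarm() returns the seconds left on any previous alarm, rounded up by
the (del + hz - 1) / hz conversion, and alarm(0) cancels without
arming a new one.

    #include <unistd.h>
    #include <stdio.h>

    int
    main(void)
    {
            unsigned int left;

            (void) alarm(30);       /* arm a 30-second alarm */
            left = alarm(0);        /* cancel; returns time left (~30) */
            (void) printf("cancelled with %u seconds left\n", left);
            return (0);
    }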
diff --git a/usr/src/uts/common/syscall/auditsys.c b/usr/src/uts/common/syscall/auditsys.c
new file mode 100644
index 0000000000..2beaf4fc7e
--- /dev/null
+++ b/usr/src/uts/common/syscall/auditsys.c
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1994,2002-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/policy.h>
+
+#include <c2/audit.h>
+
+/*ARGSUSED1*/
+int
+auditsys(struct auditcalls *uap, rval_t *rvp)
+{
+ int err;
+
+ /*
+	 * This ugly hack exists because auditsys() returns 0 in all
+	 * cases except when audit_active == 0 and uap->code is
+	 * BSM_AUDITCTL, BSM_AUDITON, or an unrecognized (default) code.
+ */
+
+ switch (uap->code) {
+ case BSM_GETAUID:
+ case BSM_SETAUID:
+ case BSM_GETAUDIT:
+ case BSM_SETAUDIT:
+ case BSM_AUDIT:
+ case BSM_AUDITSVC:
+ return (0);
+ case BSM_AUDITCTL:
+ case BSM_AUDITON:
+ if ((int)uap->a1 == A_GETCOND)
+ err = secpolicy_audit_getattr(CRED());
+ else
+ /* FALLTHROUGH */
+ default:
+ /* Return a different error when not privileged */
+ err = secpolicy_audit_config(CRED());
+ if (err == 0)
+ return (EINVAL);
+ else
+ return (err);
+ }
+}
diff --git a/usr/src/uts/common/syscall/chdir.c b/usr/src/uts/common/syscall/chdir.c
new file mode 100644
index 0000000000..a8b28f9589
--- /dev/null
+++ b/usr/src/uts/common/syscall/chdir.c
@@ -0,0 +1,247 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/user.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/pathname.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/ioreq.h>
+#include <sys/poll.h>
+#include <sys/kmem.h>
+#include <sys/filio.h>
+#include <sys/cmn_err.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+
+#include <sys/debug.h>
+#include <c2/audit.h>
+
+/*
+ * Change current working directory (".").
+ */
+static int chdirec(vnode_t *, int ischroot, int do_traverse);
+
+int
+chdir(char *fname)
+{
+ vnode_t *vp;
+ int error;
+
+lookup:
+ if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+
+ error = chdirec(vp, 0, 1);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+/*
+ * File-descriptor based version of 'chdir'.
+ */
+int
+fchdir(int fd)
+{
+ vnode_t *vp;
+ file_t *fp;
+ int error;
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+ vp = fp->f_vnode;
+ VN_HOLD(vp);
+ releasef(fd);
+ error = chdirec(vp, 0, 0);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Change notion of root ("/") directory.
+ */
+int
+chroot(char *fname)
+{
+ vnode_t *vp;
+ int error;
+
+lookup:
+ if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+
+ error = chdirec(vp, 1, 1);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+/*
+ * ++++++++++++++++++++++++
+ * ++ SunOS4.1 Buyback ++
+ * ++++++++++++++++++++++++
+ * Change root directory with a user given fd
+ */
+int
+fchroot(int fd)
+{
+ vnode_t *vp;
+ file_t *fp;
+ int error;
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+ vp = fp->f_vnode;
+ VN_HOLD(vp);
+ releasef(fd);
+ error = chdirec(vp, 1, 0);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+static int
+chdirec(vnode_t *vp, int ischroot, int do_traverse)
+{
+ int error;
+ vnode_t *oldvp;
+ proc_t *pp = curproc;
+ vnode_t **vpp;
+ refstr_t *cwd;
+ int newcwd = 1;
+
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad;
+ }
+ if (error = VOP_ACCESS(vp, VEXEC, 0, CRED()))
+ goto bad;
+
+ /*
+	 * The VOP_ACCESS() may have covered 'vp' with a new filesystem
+	 * if 'vp' is an autoFS vnode. Traverse the mountpoint so
+ * that we don't end up with a covered current directory.
+ */
+ if (vn_mountedvfs(vp) != NULL && do_traverse) {
+ if (error = traverse(&vp))
+ goto bad;
+ }
+
+ /*
+ * Special chroot semantics: chroot is allowed if privileged
+ * or if the target is really a loopback mount of the root (or
+ * root of the zone) as determined by comparing dev and inode
+ * numbers
+ */
+ if (ischroot) {
+ struct vattr tattr;
+ struct vattr rattr;
+ vnode_t *zonevp = curproc->p_zone->zone_rootvp;
+
+ tattr.va_mask = AT_FSID|AT_NODEID;
+ if (error = VOP_GETATTR(vp, &tattr, 0, CRED()))
+ goto bad;
+
+ rattr.va_mask = AT_FSID|AT_NODEID;
+ if (error = VOP_GETATTR(zonevp, &rattr, 0, CRED()))
+ goto bad;
+
+ if ((tattr.va_fsid != rattr.va_fsid ||
+ tattr.va_nodeid != rattr.va_nodeid) &&
+ (error = secpolicy_chroot(CRED())) != 0)
+ goto bad;
+
+ vpp = &PTOU(pp)->u_rdir;
+ } else {
+ vpp = &PTOU(pp)->u_cdir;
+ }
+
+#ifdef C2_AUDIT
+ if (audit_active) /* update abs cwd/root path see c2audit.c */
+ audit_chdirec(vp, vpp);
+#endif
+
+ mutex_enter(&pp->p_lock);
+ /*
+	 * If we are changing to the same directory, leave the cached
+	 * working-directory string alone; otherwise clear u_cwd below so
+	 * that the next call to getcwd() recomputes it.
+ */
+ if (!ischroot && *vpp != NULL && vp != NULL && VN_CMP(*vpp, vp))
+ newcwd = 0;
+
+ oldvp = *vpp;
+ *vpp = vp;
+ if ((cwd = PTOU(pp)->u_cwd) != NULL && newcwd)
+ PTOU(pp)->u_cwd = NULL;
+ mutex_exit(&pp->p_lock);
+
+ if (cwd && newcwd)
+ refstr_rele(cwd);
+ if (oldvp)
+ VN_RELE(oldvp);
+ return (0);
+
+bad:
+ VN_RELE(vp);
+ return (error);
+}
diff --git a/usr/src/uts/common/syscall/chmod.c b/usr/src/uts/common/syscall/chmod.c
new file mode 100644
index 0000000000..8fb42e0843
--- /dev/null
+++ b/usr/src/uts/common/syscall/chmod.c
@@ -0,0 +1,81 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1989 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/dirent.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/uio.h>
+#include <sys/filio.h>
+#include <sys/debug.h>
+
+extern int namesetattr(char *, enum symfollow, vattr_t *, int);
+extern int fdsetattr(int, vattr_t *);
+
+/*
+ * Change mode of file given path name.
+ */
+int
+chmod(char *fname, int fmode)
+{
+ struct vattr vattr;
+
+ vattr.va_mode = fmode & MODEMASK;
+ vattr.va_mask = AT_MODE;
+ return (namesetattr(fname, FOLLOW, &vattr, 0));
+}
+
+/*
+ * Change mode of file given file descriptor.
+ */
+int
+fchmod(int fd, int fmode)
+{
+ struct vattr vattr;
+
+ vattr.va_mode = fmode & MODEMASK;
+ vattr.va_mask = AT_MODE;
+ return (fdsetattr(fd, &vattr));
+}
diff --git a/usr/src/uts/common/syscall/chown.c b/usr/src/uts/common/syscall/chown.c
new file mode 100644
index 0000000000..7dc7fc663e
--- /dev/null
+++ b/usr/src/uts/common/syscall/chown.c
@@ -0,0 +1,181 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/pathname.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/filio.h>
+#include <sys/debug.h>
+#include <c2/audit.h>
+
+/*
+ * nmflag has the following values:
+ *
+ * 1 - Always look up the name, i.e. chown, lchown.
+ * 2 - The name is optional, i.e. fchownat.
+ * 0 - Don't look up the name; the vnode comes from the file_t, i.e. fchown.
+ *
+ */
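+/*
+ * The user-visible wrappers at the bottom of this file map onto these
+ * modes: chown() and lchown() pass 1, fchownat() passes 2, and fchown()
+ * passes 0.
+ */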
+int
+cfchownat(int fd, char *name, int nmflag, uid_t uid, gid_t gid, int flags)
+{
+ vnode_t *startvp, *vp;
+ file_t *filefp;
+ struct vattr vattr;
+ int error = 0;
+ char startchar;
+
+ if (uid < -1 || uid > MAXUID || gid < -1 || gid > MAXUID)
+ return (set_errno(EINVAL));
+ vattr.va_uid = uid;
+ vattr.va_gid = gid;
+ vattr.va_mask = 0;
+ if (vattr.va_uid != -1)
+ vattr.va_mask |= AT_UID;
+ if (vattr.va_gid != -1)
+ vattr.va_mask |= AT_GID;
+
+ if (fd == AT_FDCWD && name == NULL)
+ return (set_errno(EFAULT));
+
+ if (nmflag == 1 || (nmflag == 2 && name != NULL)) {
+ if (copyin(name, &startchar, sizeof (char)))
+ return (set_errno(EFAULT));
+ } else
+ startchar = '\0';
+
+ if (fd == AT_FDCWD)
+ startvp = NULL;
+ else {
+ /*
+ * only get fd if not doing absolute lookup
+ */
+ if (startchar != '/' || nmflag == 0) {
+ if ((filefp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ startvp = filefp->f_vnode;
+ VN_HOLD(startvp);
+ releasef(fd);
+ } else {
+ startvp = NULL;
+ }
+ }
+
+#ifdef C2_AUDIT
+ if ((nmflag == 2) && audit_active)
+ audit_setfsat_path(1);
+#endif /* C2_AUDIT */
+
+ /*
+ * Do lookups for chown, lchown and fchownat when name not NULL
+ */
+ if ((nmflag == 2 && name != NULL) || nmflag == 1) {
+ if (error = lookupnameat(name, UIO_USERSPACE,
+ (flags == AT_SYMLINK_NOFOLLOW) ?
+ NO_FOLLOW : FOLLOW,
+ NULLVPP, &vp, startvp)) {
+ if (startvp != NULL)
+ VN_RELE(startvp);
+ return (set_errno(error));
+ }
+ } else {
+ vp = startvp;
+ ASSERT(vp);
+ VN_HOLD(vp);
+ }
+
+ if (vn_is_readonly(vp)) {
+ error = EROFS;
+ } else {
+ error = VOP_SETATTR(vp, &vattr, 0, CRED(), NULL);
+ }
+
+ if (startvp != NULL)
+ VN_RELE(startvp);
+ if (vp != NULL)
+ VN_RELE(vp);
+
+ if (error != 0)
+ return (set_errno(error));
+ else
+ return (error);
+}
+/*
+ * Change ownership of file given file name.
+ */
+int
+chown(char *fname, uid_t uid, gid_t gid)
+{
+ return (cfchownat(AT_FDCWD, fname, 1, uid, gid, 0));
+}
+
+int
+lchown(char *fname, uid_t uid, gid_t gid)
+{
+ return (cfchownat(AT_FDCWD, fname, 1, uid, gid, AT_SYMLINK_NOFOLLOW));
+}
+
+/*
+ * Change ownership of file given file descriptor.
+ */
+int
+fchown(int fd, uid_t uid, gid_t gid)
+{
+ return (cfchownat(fd, NULL, 0, uid, gid, 0));
+}
+
+int
+fchownat(int fd, char *name, uid_t uid, gid_t gid, int flags)
+{
+ return (cfchownat(fd, name, 2, uid, gid, flags));
+}
diff --git a/usr/src/uts/common/syscall/cladm.c b/usr/src/uts/common/syscall/cladm.c
new file mode 100644
index 0000000000..e2e034d93d
--- /dev/null
+++ b/usr/src/uts/common/syscall/cladm.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1998 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/cladm.h>
+
+/*
+ * cladm(2) cluster administration system call.
+ */
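+/*
+ * A minimal user-level sketch, using only the constants visible in this
+ * file (hypothetical usage, not part of this file):
+ *
+ *	int bootflags;
+ *	if (cladm(CL_INITIALIZE, CL_GET_BOOTFLAG, &bootflags) == 0 &&
+ *	    (bootflags & CLUSTER_BOOTED))
+ *		...the system was booted as a cluster member...
+ */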
+int
+cladm(int fac, int cmd, void *arg)
+{
+ int error = 0;
+ int copyout_bootflags;
+
+ switch (fac) {
+ case CL_INITIALIZE:
+ if (cmd != CL_GET_BOOTFLAG) {
+ error = EINVAL;
+ break;
+ }
+
+ /*
+ * The CLUSTER_INSTALLING and CLUSTER_DCS_ENABLED bootflags are
+ * internal flags. We do not want to expose these to the user
+ * level.
+ */
+ copyout_bootflags = (cluster_bootflags &
+ ~(CLUSTER_INSTALLING | CLUSTER_DCS_ENABLED));
+ if (copyout(&copyout_bootflags, arg, sizeof (int))) {
+ error = EFAULT;
+ }
+ break;
+
+ case CL_CONFIG:
+ /*
+ * We handle CL_NODEID here so that the node number
+ * can be returned if the system is configured as part
+ * of a cluster but not booted as part of the cluster.
+ */
+ if (cmd == CL_NODEID) {
+ nodeid_t nid;
+
+ /* return error if not configured as a cluster */
+ if (!(cluster_bootflags & CLUSTER_CONFIGURED)) {
+ error = ENOSYS;
+ break;
+ }
+
+ nid = clconf_get_nodeid();
+ error = copyout(&nid, arg, sizeof (nid));
+ break;
+ }
+ /* FALLTHROUGH */
+
+ default:
+ if ((cluster_bootflags & (CLUSTER_CONFIGURED|CLUSTER_BOOTED)) !=
+ (CLUSTER_CONFIGURED|CLUSTER_BOOTED)) {
+ error = EINVAL;
+ break;
+ }
+ error = cladmin(fac, cmd, arg);
+ /*
+ * error will be -1 if the cladm module cannot be loaded;
+ * otherwise, it is the errno value returned
+ * (see {i86,sparc}/ml/modstubs.s).
+ */
+ if (error < 0)
+ error = ENOSYS;
+ break;
+ }
+
+ return (error ? set_errno(error) : 0);
+}
diff --git a/usr/src/uts/common/syscall/close.c b/usr/src/uts/common/syscall/close.c
new file mode 100644
index 0000000000..dd79ccb10e
--- /dev/null
+++ b/usr/src/uts/common/syscall/close.c
@@ -0,0 +1,58 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1998 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/debug.h>
+
+/*
+ * Close a file.
+ */
+
+int
+close(int fdes)
+{
+ int error;
+
+ if ((error = closeandsetf(fdes, NULL)) != 0)
+ return (set_errno(error));
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/corectl.c b/usr/src/uts/common/syscall/corectl.c
new file mode 100644
index 0000000000..9e67ae545a
--- /dev/null
+++ b/usr/src/uts/common/syscall/corectl.c
@@ -0,0 +1,558 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/atomic.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+#include <sys/procset.h>
+#include <sys/corectl.h>
+#include <sys/zone.h>
+#include <sys/cmn_err.h>
+#include <sys/policy.h>
+
+/*
+ * Core File Settings
+ * ------------------
+ *
+ * A process's core file path and content live in separate reference-counted
+ * structures. The corectl_content_t structure is fairly straightforward --
+ * the only subtlety is that we only really _need_ the mutex on architectures
+ * on which 64-bit memory operations are not atomic. The corectl_path_t
+ * structure is slightly trickier in that it contains a refstr_t rather than
+ * just a char * string. This is to allow consumers of the data in that
+ * structure (the core dumping sub-system for example) to safely use the
+ * string without holding any locks on it in light of updates.
+ *
+ * At system boot, init_core() sets init(1M)'s core file path and content to
+ * the same value as the fields core_default_path and core_default_content
+ * respectively (for the global zone). All subsequent children of init(1M)
+ * reference those same settings. During boot coreadm(1M) is invoked with
+ * the -u option to update the system settings from /etc/coreadm.conf. This
+ * has the effect of also changing the values in core_default_path and
+ * core_default_content which updates the core file settings for all
+ * processes in the zone. Each zone has different default settings; when
+ * processes enter a non-global zone, their core file path and content are
+ * set to the zone's default path and content.
+ *
+ * Processes that have their core file settings explicitly overridden using
+ * coreadm(1M) no longer reference core_default_path or core_default_content
+ * so subsequent changes to the default will not affect them.
+ */
+
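+/*
+ * For example (a hedged user-level sketch; coreadm(1M) is the supported
+ * administrative interface), a process could direct its own core dumps
+ * to a private pattern with:
+ *
+ *	char buf[] = "/var/cores/core.%f.%p";
+ *	(void) corectl(CC_SET_PROCESS_PATH, (uintptr_t)buf,
+ *	    sizeof (buf), (uintptr_t)getpid());
+ */
+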
+zone_key_t core_zone_key;
+
+static int set_proc_info(pid_t pid, const char *path, core_content_t content);
+
+static corectl_content_t *
+corectl_content_alloc(core_content_t cc)
+{
+ corectl_content_t *ccp;
+
+ ccp = kmem_zalloc(sizeof (corectl_content_t), KM_SLEEP);
+ ccp->ccc_content = cc;
+ ccp->ccc_refcnt = 1;
+
+ return (ccp);
+}
+
+core_content_t
+corectl_content_value(corectl_content_t *ccp)
+{
+ core_content_t content;
+
+ mutex_enter(&ccp->ccc_mtx);
+ content = ccp->ccc_content;
+ mutex_exit(&ccp->ccc_mtx);
+
+ return (content);
+}
+
+static void
+corectl_content_set(corectl_content_t *ccp, core_content_t content)
+{
+ mutex_enter(&ccp->ccc_mtx);
+ ccp->ccc_content = content;
+ mutex_exit(&ccp->ccc_mtx);
+}
+
+void
+corectl_content_hold(corectl_content_t *ccp)
+{
+ atomic_add_32(&ccp->ccc_refcnt, 1);
+}
+
+void
+corectl_content_rele(corectl_content_t *ccp)
+{
+ if (atomic_add_32_nv(&ccp->ccc_refcnt, -1) == 0)
+ kmem_free(ccp, sizeof (corectl_content_t));
+}
+
+
+static corectl_path_t *
+corectl_path_alloc(const char *path)
+{
+ corectl_path_t *ccp;
+
+ ccp = kmem_zalloc(sizeof (corectl_path_t), KM_SLEEP);
+ ccp->ccp_path = refstr_alloc(path);
+ ccp->ccp_refcnt = 1;
+
+ return (ccp);
+}
+
+refstr_t *
+corectl_path_value(corectl_path_t *ccp)
+{
+ refstr_t *path;
+
+ mutex_enter(&ccp->ccp_mtx);
+ refstr_hold(path = ccp->ccp_path);
+ mutex_exit(&ccp->ccp_mtx);
+
+ return (path);
+}
+
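+/*
+ * Typical consumer pattern (a sketch): snapshot the path, use the string
+ * without holding ccp_mtx, then release the hold taken here:
+ *
+ *	refstr_t *rp = corectl_path_value(ccp);
+ *	...use refstr_value(rp)...
+ *	refstr_rele(rp);
+ */
+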
+static void
+corectl_path_set(corectl_path_t *ccp, const char *path)
+{
+ refstr_t *npath = refstr_alloc(path);
+
+ mutex_enter(&ccp->ccp_mtx);
+ refstr_rele(ccp->ccp_path);
+ ccp->ccp_path = npath;
+ mutex_exit(&ccp->ccp_mtx);
+}
+
+void
+corectl_path_hold(corectl_path_t *ccp)
+{
+ atomic_add_32(&ccp->ccp_refcnt, 1);
+}
+
+void
+corectl_path_rele(corectl_path_t *ccp)
+{
+ if (atomic_add_32_nv(&ccp->ccp_refcnt, -1) == 0) {
+ refstr_rele(ccp->ccp_path);
+ kmem_free(ccp, sizeof (corectl_path_t));
+ }
+}
+
+/*
+ * Constructor routine to be called when a zone is created.
+ */
+/*ARGSUSED*/
+static void *
+core_init_zone(zoneid_t zoneid)
+{
+ struct core_globals *cg;
+
+ cg = kmem_alloc(sizeof (*cg), KM_SLEEP);
+ mutex_init(&cg->core_lock, NULL, MUTEX_DEFAULT, NULL);
+ cg->core_file = NULL;
+ cg->core_options = CC_PROCESS_PATH;
+ cg->core_content = CC_CONTENT_DEFAULT;
+ cg->core_rlimit = RLIM64_INFINITY;
+ cg->core_default_path = corectl_path_alloc("core");
+ cg->core_default_content = corectl_content_alloc(CC_CONTENT_DEFAULT);
+
+ return (cg);
+}
+
+/*
+ * Destructor routine to be called when a zone is destroyed.
+ */
+/*ARGSUSED*/
+static void
+core_free_zone(zoneid_t zoneid, void *arg)
+{
+ struct core_globals *cg = arg;
+
+ if (cg == NULL)
+ return;
+ if (cg->core_file != NULL)
+ refstr_rele(cg->core_file);
+ corectl_path_rele(cg->core_default_path);
+ corectl_content_rele(cg->core_default_content);
+ kmem_free(cg, sizeof (*cg));
+}
+
+/*
+ * Called once, from icode(), to set init's core file path and content.
+ */
+void
+init_core(void)
+{
+ struct core_globals *cg;
+
+ zone_key_create(&core_zone_key, core_init_zone, NULL, core_free_zone);
+
+ /*
+ * zone_key_create will have called core_init_zone for the
+ * global zone, which sets up the default path and content
+ * variables.
+ */
+ cg = zone_getspecific(core_zone_key, global_zone);
+ ASSERT(cg != NULL);
+
+ corectl_path_hold(cg->core_default_path);
+ corectl_content_hold(cg->core_default_content);
+
+ curproc->p_corefile = cg->core_default_path;
+ curproc->p_content = cg->core_default_content;
+}
+
+int
+corectl(int subcode, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
+{
+ int error = 0;
+ proc_t *p;
+ refstr_t *rp;
+ size_t size;
+ char *path;
+ core_content_t content = CC_CONTENT_INVALID;
+ struct core_globals *cg;
+ zone_t *zone = curproc->p_zone;
+
+ cg = zone_getspecific(core_zone_key, zone);
+ ASSERT(cg != NULL);
+
+ switch (subcode) {
+ case CC_SET_OPTIONS:
+ if ((error = secpolicy_coreadm(CRED())) == 0) {
+ if (arg1 & ~CC_OPTIONS)
+ error = EINVAL;
+ else
+ cg->core_options = (uint32_t)arg1;
+ }
+ break;
+
+ case CC_GET_OPTIONS:
+ return (cg->core_options);
+
+ case CC_GET_GLOBAL_PATH:
+ case CC_GET_DEFAULT_PATH:
+ case CC_GET_PROCESS_PATH:
+ if (subcode == CC_GET_GLOBAL_PATH) {
+ mutex_enter(&cg->core_lock);
+ if ((rp = cg->core_file) != NULL)
+ refstr_hold(rp);
+ mutex_exit(&cg->core_lock);
+ } else if (subcode == CC_GET_DEFAULT_PATH) {
+ rp = corectl_path_value(cg->core_default_path);
+ } else {
+ rp = NULL;
+ mutex_enter(&pidlock);
+ if ((p = prfind((pid_t)arg3)) == NULL ||
+ p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ error = ESRCH;
+ } else {
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+ mutex_enter(&p->p_crlock);
+ if (!hasprocperm(p->p_cred, CRED()))
+ error = EPERM;
+ else if (p->p_corefile != NULL)
+ rp = corectl_path_value(p->p_corefile);
+ mutex_exit(&p->p_crlock);
+ mutex_exit(&p->p_lock);
+ }
+ }
+ if (rp == NULL) {
+ if (error == 0 && suword8((void *)arg1, 0))
+ error = EFAULT;
+ } else {
+ error = copyoutstr(refstr_value(rp), (char *)arg1,
+ (size_t)arg2, NULL);
+ refstr_rele(rp);
+ }
+ break;
+
+ case CC_SET_GLOBAL_PATH:
+ case CC_SET_DEFAULT_PATH:
+ if ((error = secpolicy_coreadm(CRED())) != 0)
+ break;
+
+ /* FALLTHROUGH */
+ case CC_SET_PROCESS_PATH:
+ if ((size = MIN((size_t)arg2, MAXPATHLEN)) == 0) {
+ error = EINVAL;
+ break;
+ }
+ path = kmem_alloc(size, KM_SLEEP);
+ error = copyinstr((char *)arg1, path, size, NULL);
+ if (error == 0) {
+ if (subcode == CC_SET_PROCESS_PATH) {
+ error = set_proc_info((pid_t)arg3, path, 0);
+ } else if (subcode == CC_SET_DEFAULT_PATH) {
+ corectl_path_set(cg->core_default_path, path);
+ } else if (*path != '\0' && *path != '/') {
+ error = EINVAL;
+ } else {
+ refstr_t *nrp = refstr_alloc(path);
+
+ mutex_enter(&cg->core_lock);
+ rp = cg->core_file;
+ if (*path == '\0')
+ cg->core_file = NULL;
+ else
+ refstr_hold(cg->core_file = nrp);
+ mutex_exit(&cg->core_lock);
+
+ if (rp != NULL)
+ refstr_rele(rp);
+
+ refstr_rele(nrp);
+ }
+ }
+ kmem_free(path, size);
+ break;
+
+ case CC_SET_GLOBAL_CONTENT:
+ case CC_SET_DEFAULT_CONTENT:
+ if ((error = secpolicy_coreadm(CRED())) != 0)
+ break;
+
+ /* FALLTHROUGH */
+ case CC_SET_PROCESS_CONTENT:
+ error = copyin((void *)arg1, &content, sizeof (content));
+ if (error != 0)
+ break;
+
+ /*
+ * If any unknown bits are set, don't let this charade
+ * continue.
+ */
+ if (content & ~CC_CONTENT_ALL) {
+ error = EINVAL;
+ break;
+ }
+
+ if (subcode == CC_SET_PROCESS_CONTENT) {
+ error = set_proc_info((pid_t)arg2, NULL, content);
+ } else if (subcode == CC_SET_DEFAULT_CONTENT) {
+ corectl_content_set(cg->core_default_content, content);
+ } else {
+ mutex_enter(&cg->core_lock);
+ cg->core_content = content;
+ mutex_exit(&cg->core_lock);
+ }
+
+ break;
+
+ case CC_GET_GLOBAL_CONTENT:
+ content = cg->core_content;
+ error = copyout(&content, (void *)arg1, sizeof (content));
+ break;
+
+ case CC_GET_DEFAULT_CONTENT:
+ content = corectl_content_value(cg->core_default_content);
+ error = copyout(&content, (void *)arg1, sizeof (content));
+ break;
+
+ case CC_GET_PROCESS_CONTENT:
+ mutex_enter(&pidlock);
+ if ((p = prfind((pid_t)arg2)) == NULL || p->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ error = ESRCH;
+ break;
+ }
+
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+ mutex_enter(&p->p_crlock);
+ if (!hasprocperm(p->p_cred, CRED()))
+ error = EPERM;
+ else if (p->p_content == NULL)
+ content = CC_CONTENT_NONE;
+ else
+ content = corectl_content_value(p->p_content);
+ mutex_exit(&p->p_crlock);
+ mutex_exit(&p->p_lock);
+
+ if (error == 0)
+ error = copyout(&content, (void *)arg1,
+ sizeof (content));
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+typedef struct {
+ int cc_count;
+ corectl_path_t *cc_path;
+ corectl_content_t *cc_content;
+} counter_t;
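+
+/*
+ * cc_count records how many processes were actually updated; the callers
+ * below use a zero count to distinguish "matched processes but lacked
+ * permission on all of them" (EPERM) from "no such process" (ESRCH).
+ */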
+
+static int
+set_one_proc_info(proc_t *p, counter_t *counterp)
+{
+ corectl_path_t *corefile;
+ corectl_content_t *content;
+
+ mutex_enter(&p->p_crlock);
+
+ if (!(p->p_flag & SSYS) && hasprocperm(p->p_cred, CRED())) {
+ mutex_exit(&p->p_crlock);
+ counterp->cc_count++;
+ if (counterp->cc_path != NULL) {
+ corectl_path_hold(counterp->cc_path);
+ mutex_enter(&p->p_lock);
+ corefile = p->p_corefile;
+ p->p_corefile = counterp->cc_path;
+ mutex_exit(&p->p_lock);
+ if (corefile != NULL)
+ corectl_path_rele(corefile);
+ } else {
+ corectl_content_hold(counterp->cc_content);
+ mutex_enter(&p->p_lock);
+ content = p->p_content;
+ p->p_content = counterp->cc_content;
+ mutex_exit(&p->p_lock);
+ if (content != NULL)
+ corectl_content_rele(content);
+ }
+ } else {
+ mutex_exit(&p->p_crlock);
+ }
+
+ return (0);
+}
+
+static int
+set_proc_info(pid_t pid, const char *path, core_content_t content)
+{
+ proc_t *p;
+ counter_t counter;
+ int error = 0;
+
+ counter.cc_count = 0;
+ /*
+ * Only one of the core file path or content can be set at a time.
+ */
+ if (path != NULL) {
+ counter.cc_path = corectl_path_alloc(path);
+ counter.cc_content = NULL;
+ } else {
+ counter.cc_path = NULL;
+ counter.cc_content = corectl_content_alloc(content);
+ }
+
+ if (pid == -1) {
+ procset_t set;
+
+ setprocset(&set, POP_AND, P_ALL, P_MYID, P_ALL, P_MYID);
+ error = dotoprocs(&set, set_one_proc_info, (char *)&counter);
+ if (error == 0 && counter.cc_count == 0)
+ error = EPERM;
+ } else if (pid > 0) {
+ mutex_enter(&pidlock);
+ if ((p = prfind(pid)) == NULL || p->p_stat == SIDL) {
+ error = ESRCH;
+ } else {
+ (void) set_one_proc_info(p, &counter);
+ if (counter.cc_count == 0)
+ error = EPERM;
+ }
+ mutex_exit(&pidlock);
+ } else {
+ int nfound = 0;
+ pid_t pgid;
+
+ if (pid == 0)
+ pgid = curproc->p_pgrp;
+ else
+ pgid = -pid;
+
+ mutex_enter(&pidlock);
+ for (p = pgfind(pgid); p != NULL; p = p->p_pglink) {
+ if (p->p_stat != SIDL) {
+ nfound++;
+ (void) set_one_proc_info(p, &counter);
+ }
+ }
+ mutex_exit(&pidlock);
+ if (nfound == 0)
+ error = ESRCH;
+ else if (counter.cc_count == 0)
+ error = EPERM;
+ }
+
+ if (path != NULL)
+ corectl_path_rele(counter.cc_path);
+ else
+ corectl_content_rele(counter.cc_content);
+
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Give current process the default core settings for its current zone;
+ * used for processes entering a zone via zone_enter.
+ */
+void
+set_core_defaults(void)
+{
+ proc_t *p = curproc;
+ struct core_globals *cg;
+ corectl_path_t *oldpath, *newpath;
+ corectl_content_t *oldcontent, *newcontent;
+
+ cg = zone_getspecific(core_zone_key, p->p_zone);
+
+ /* make local copies of default values to protect against change */
+ newpath = cg->core_default_path;
+ newcontent = cg->core_default_content;
+
+ corectl_path_hold(newpath);
+ corectl_content_hold(newcontent);
+ mutex_enter(&p->p_lock);
+ oldpath = p->p_corefile;
+ p->p_corefile = newpath;
+ oldcontent = p->p_content;
+ p->p_content = newcontent;
+ mutex_exit(&p->p_lock);
+ if (oldpath != NULL)
+ corectl_path_rele(oldpath);
+ if (oldcontent != NULL)
+ corectl_content_rele(oldcontent);
+}
diff --git a/usr/src/uts/common/syscall/exacctsys.c b/usr/src/uts/common/syscall/exacctsys.c
new file mode 100644
index 0000000000..af54737c57
--- /dev/null
+++ b/usr/src/uts/common/syscall/exacctsys.c
@@ -0,0 +1,406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/acctctl.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/errno.h>
+#include <sys/exacct.h>
+#include <sys/modctl.h>
+#include <sys/procset.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/task.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/policy.h>
+
+/*
+ * getacct(2), putacct(2), and wracct(2) system calls
+ *
+ * The extended accounting subsystem provides three root-privileged system
+ * calls for interacting with the actual resource data associated with each
+ * task or process. getacct() copies a packed exacct record reflecting the
+ * resource usage out to the buffer provided by the user. wracct() writes a
+ * record to the appropriate extended accounting file. putacct() takes the
+ * buffer provided by the user, and appends a "tag" record associated with the
+ * specified task or project that encapsulates the user data. All three of
+ * these functions exit early if extended accounting is not active for the
+ * requested entity type.
+ *
+ * Locking
+ * Under the terminology introduced in os/task.c, all three of these system
+ * calls are task observers, when executing on an existing task.
+ */
+
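+/*
+ * A user-level sketch of the three entry points (hedged; see getacct(2),
+ * putacct(2) and wracct(2) for the supported interfaces):
+ *
+ *	ssize_t sz = getacct(P_TASKID, tkid, buf, bufsize);
+ *	(void) putacct(P_TASKID, tkid, ubuf, ubufsize, EP_EXACCT_OBJECT);
+ *	(void) wracct(P_TASKID, tkid, EW_INTERVAL);
+ */
+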
+/*
+ * getacct_callback() is used to copyout the buffer with accounting records
+ * from the kernel back to the user. It also sets actual to the size of the
+ * kernel buffer--the required minimum size for a successful outbound copy.
+ */
+/* ARGSUSED */
+static int
+getacct_callback(ac_info_t *unused, void *ubuf, size_t usize, void *kbuf,
+ size_t ksize, size_t *actual)
+{
+ size_t size = MIN(usize, ksize);
+
+ if (ubuf != NULL && copyout(kbuf, ubuf, size) != 0)
+ return (EFAULT);
+ *actual = ksize;
+ return (0);
+}
+
+static int
+getacct_task(ac_info_t *ac_task, taskid_t tkid, void *buf, size_t bufsize,
+ size_t *sizep)
+{
+ task_t *tk;
+ int error;
+
+ mutex_enter(&ac_task->ac_lock);
+ if (ac_task->ac_state == AC_OFF) {
+ mutex_exit(&ac_task->ac_lock);
+ return (ENOTACTIVE);
+ }
+ mutex_exit(&ac_task->ac_lock);
+
+ if ((tk = task_hold_by_id(tkid)) == NULL)
+ return (ESRCH);
+ error = exacct_assemble_task_usage(ac_task, tk,
+ getacct_callback, buf, bufsize, sizep, EW_PARTIAL);
+ task_rele(tk);
+
+ return (error);
+}
+
+static int
+getacct_proc(ac_info_t *ac_proc, pid_t pid, void *buf, size_t bufsize,
+ size_t *sizep)
+{
+ proc_t *p;
+ proc_usage_t *pu;
+ ulong_t mask[AC_MASK_SZ];
+ ulong_t *ac_mask = &mask[0];
+ int error;
+
+ mutex_enter(&ac_proc->ac_lock);
+ if (ac_proc->ac_state == AC_OFF) {
+ mutex_exit(&ac_proc->ac_lock);
+ return (ENOTACTIVE);
+ }
+ bt_copy(&ac_proc->ac_mask[0], ac_mask, AC_MASK_SZ);
+ mutex_exit(&ac_proc->ac_lock);
+
+ pu = kmem_zalloc(sizeof (proc_usage_t), KM_SLEEP);
+ pu->pu_command = kmem_zalloc(MAXCOMLEN + 1, KM_SLEEP);
+
+ mutex_enter(&pidlock);
+ if ((p = prfind(pid)) == NULL) {
+ mutex_exit(&pidlock);
+ kmem_free(pu->pu_command, MAXCOMLEN + 1);
+ kmem_free(pu, sizeof (proc_usage_t));
+ return (ESRCH);
+ }
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ exacct_calculate_proc_usage(p, pu, ac_mask, EW_PARTIAL, 0);
+ mutex_exit(&p->p_lock);
+
+ error = exacct_assemble_proc_usage(ac_proc, pu,
+ getacct_callback, buf, bufsize, sizep, EW_PARTIAL);
+
+ kmem_free(pu->pu_command, MAXCOMLEN + 1);
+ kmem_free(pu, sizeof (proc_usage_t));
+
+ return (error);
+}
+
+static ssize_t
+getacct(idtype_t idtype, id_t id, void *buf, size_t bufsize)
+{
+ size_t size = 0;
+ int error;
+ struct exacct_globals *acg;
+
+ if (bufsize > EXACCT_MAX_BUFSIZE)
+ bufsize = EXACCT_MAX_BUFSIZE;
+
+ acg = zone_getspecific(exacct_zone_key, curproc->p_zone);
+ switch (idtype) {
+ case P_PID:
+ error = getacct_proc(&acg->ac_proc, id, buf, bufsize, &size);
+ break;
+ case P_TASKID:
+ error = getacct_task(&acg->ac_task, id, buf, bufsize, &size);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error == 0 ? (ssize_t)size : set_errno(error));
+}
+
+static int
+putacct(idtype_t idtype, id_t id, void *buf, size_t bufsize, int flags)
+{
+ int error;
+ taskid_t tkid;
+ proc_t *p;
+ task_t *tk;
+ void *kbuf;
+ struct exacct_globals *acg;
+
+ if (bufsize == 0 || bufsize > EXACCT_MAX_BUFSIZE)
+ return (set_errno(EINVAL));
+
+ kbuf = kmem_alloc(bufsize, KM_SLEEP);
+ if (copyin(buf, kbuf, bufsize) != 0) {
+ error = EFAULT;
+ goto out;
+ }
+
+ acg = zone_getspecific(exacct_zone_key, curproc->p_zone);
+ switch (idtype) {
+ case P_PID:
+ mutex_enter(&pidlock);
+ if ((p = prfind(id)) == NULL) {
+ mutex_exit(&pidlock);
+ error = ESRCH;
+ } else {
+ zone_t *zone = p->p_zone;
+
+ tkid = p->p_task->tk_tkid;
+ zone_hold(zone);
+ mutex_exit(&pidlock);
+
+ error = exacct_tag_proc(&acg->ac_proc, id, tkid, kbuf,
+ bufsize, flags, zone->zone_nodename);
+ zone_rele(zone);
+ }
+ break;
+ case P_TASKID:
+ if ((tk = task_hold_by_id(id)) != NULL) {
+ error = exacct_tag_task(&acg->ac_task, tk, kbuf,
+ bufsize, flags);
+ task_rele(tk);
+ } else {
+ error = ESRCH;
+ }
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+out:
+ kmem_free(kbuf, bufsize);
+ return (error == 0 ? error : set_errno(error));
+}
+
+static int
+wracct_task(ac_info_t *ac_task, taskid_t tkid, int flag, size_t *sizep)
+{
+ task_t *tk;
+ int error;
+
+ mutex_enter(&ac_task->ac_lock);
+ if (ac_task->ac_state == AC_OFF || ac_task->ac_vnode == NULL) {
+ mutex_exit(&ac_task->ac_lock);
+ return (ENOTACTIVE);
+ }
+ mutex_exit(&ac_task->ac_lock);
+
+ if ((tk = task_hold_by_id(tkid)) == NULL)
+ return (ESRCH);
+ error = exacct_assemble_task_usage(ac_task, tk, exacct_commit_callback,
+ NULL, 0, sizep, flag);
+ task_rele(tk);
+
+ return (error);
+}
+
+static int
+wracct_proc(ac_info_t *ac_proc, pid_t pid, int flag, size_t *sizep)
+{
+ proc_t *p;
+ proc_usage_t *pu;
+ ulong_t mask[AC_MASK_SZ];
+ ulong_t *ac_mask = &mask[0];
+ int error;
+
+ mutex_enter(&ac_proc->ac_lock);
+ if (ac_proc->ac_state == AC_OFF || ac_proc->ac_vnode == NULL) {
+ mutex_exit(&ac_proc->ac_lock);
+ return (ENOTACTIVE);
+ }
+ bt_copy(&ac_proc->ac_mask[0], ac_mask, AC_MASK_SZ);
+ mutex_exit(&ac_proc->ac_lock);
+
+ pu = kmem_zalloc(sizeof (proc_usage_t), KM_SLEEP);
+ pu->pu_command = kmem_zalloc(MAXCOMLEN + 1, KM_SLEEP);
+
+ mutex_enter(&pidlock);
+ if ((p = prfind(pid)) == NULL) {
+ mutex_exit(&pidlock);
+ kmem_free(pu->pu_command, MAXCOMLEN + 1);
+ kmem_free(pu, sizeof (proc_usage_t));
+ return (ESRCH);
+ }
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+ exacct_calculate_proc_usage(p, pu, ac_mask, flag, 0);
+ mutex_exit(&p->p_lock);
+
+ error = exacct_assemble_proc_usage(ac_proc, pu,
+ exacct_commit_callback, NULL, 0, sizep, flag);
+
+ kmem_free(pu->pu_command, MAXCOMLEN + 1);
+ kmem_free(pu, sizeof (proc_usage_t));
+
+ return (error);
+}
+
+static int
+wracct(idtype_t idtype, id_t id, int flags)
+{
+ int error;
+ size_t size = 0;
+ struct exacct_globals *acg;
+
+ /*
+ * Validate flags.
+ */
+ switch (flags) {
+ case EW_PARTIAL:
+ case EW_INTERVAL:
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ acg = zone_getspecific(exacct_zone_key, curproc->p_zone);
+ switch (idtype) {
+ case P_PID:
+ if (flags == EW_INTERVAL)
+ return (set_errno(ENOTSUP));
+ error = wracct_proc(&acg->ac_proc, id, flags, &size);
+ break;
+ case P_TASKID:
+ error = wracct_task(&acg->ac_task, id, flags, &size);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error == 0 ? error : set_errno(error));
+}
+
+static long
+exacct(int code, idtype_t idtype, id_t id, void *buf, size_t bufsize,
+ int flags)
+{
+ if (secpolicy_acct(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ if (exacct_zone_key == ZONE_KEY_UNINITIALIZED)
+ return (set_errno(ENOTACTIVE));
+
+ switch (code) {
+ case 0:
+ return (getacct(idtype, id, buf, bufsize));
+ case 1:
+ return (putacct(idtype, id, buf, bufsize, flags));
+ case 2:
+ return (wracct(idtype, id, flags));
+ default:
+ return (set_errno(EINVAL));
+ }
+}
+
+#if defined(_LP64)
+#define SE_LRVAL SE_64RVAL
+#else
+#define SE_LRVAL SE_32RVAL1
+#endif
+
+static struct sysent exacctsys_sysent = {
+ 6,
+ SE_NOUNLOAD | SE_ARGC | SE_LRVAL,
+ (int (*)())exacct
+};
+
+static struct modlsys modlsys = {
+ &mod_syscallops,
+ "extended accounting facility",
+ &exacctsys_sysent
+};
+
+#ifdef _SYSCALL32_IMPL
+
+static struct sysent exacctsys_sysent32 = {
+ 6,
+ SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
+ (int (*)())exacct
+};
+
+static struct modlsys modlsys32 = {
+ &mod_syscallops32,
+ "32-bit extended accounting facility",
+ &exacctsys_sysent32
+};
+
+#endif
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modlsys,
+#ifdef _SYSCALL32_IMPL
+ &modlsys32,
+#endif
+ NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
+
+int
+_info(struct modinfo *mip)
+{
+ return (mod_info(&modlinkage, mip));
+}
diff --git a/usr/src/uts/common/syscall/fcntl.c b/usr/src/uts/common/syscall/fcntl.c
new file mode 100644
index 0000000000..39e0f7f6bd
--- /dev/null
+++ b/usr/src/uts/common/syscall/fcntl.c
@@ -0,0 +1,802 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* ONC_PLUS EXTRACT START */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+/* ONC_PLUS EXTRACT END */
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+/* ONC_PLUS EXTRACT START */
+#include <sys/flock.h>
+/* ONC_PLUS EXTRACT END */
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/proc.h>
+#include <sys/filio.h>
+#include <sys/share.h>
+#include <sys/debug.h>
+#include <sys/rctl.h>
+#include <sys/nbmlock.h>
+
+/* ONC_PLUS EXTRACT START */
+static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
+static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *);
+static void fd_too_big(proc_t *);
+
+/*
+ * File control.
+ */
+int
+fcntl(int fdes, int cmd, intptr_t arg)
+{
+ int iarg;
+ int error = 0;
+ int retval;
+ proc_t *p;
+ file_t *fp;
+ vnode_t *vp;
+ u_offset_t offset;
+ u_offset_t start;
+ struct vattr vattr;
+ int in_crit;
+ int flag;
+ struct flock sbf;
+ struct flock64 bf;
+ struct o_flock obf;
+ struct flock64_32 bf64_32;
+ struct fshare fsh;
+ struct shrlock shr;
+ struct shr_locowner shr_own;
+ offset_t maxoffset;
+ model_t datamodel;
+
+#if defined(_ILP32) && !defined(lint) && defined(_SYSCALL32)
+ ASSERT(sizeof (struct flock) == sizeof (struct flock32));
+ ASSERT(sizeof (struct flock64) == sizeof (struct flock64_32));
+#endif
+#if defined(_LP64) && !defined(lint) && defined(_SYSCALL32)
+ ASSERT(sizeof (struct flock) == sizeof (struct flock64_64));
+ ASSERT(sizeof (struct flock64) == sizeof (struct flock64_64));
+#endif
+
+ /*
+ * First, for speed, deal with the subset of cases
+ * that do not require getf() / releasef().
+ */
+ switch (cmd) {
+ case F_GETFD:
+ if ((error = f_getfd_error(fdes, &flag)) == 0)
+ retval = flag;
+ goto out;
+
+ case F_SETFD:
+ error = f_setfd_error(fdes, (int)arg);
+ retval = 0;
+ goto out;
+
+ case F_GETFL:
+ if ((error = f_getfl(fdes, &flag)) == 0)
+ retval = (flag & (FMASK | FASYNC)) + FOPEN;
+ goto out;
+
+ case F_GETXFL:
+ if ((error = f_getfl(fdes, &flag)) == 0)
+ retval = flag + FOPEN;
+ goto out;
+ }
+
+ /*
+ * Second, for speed, deal with the subset of cases that
+ * require getf() / releasef() but do not require copyin.
+ */
+ if ((fp = getf(fdes)) == NULL) {
+ error = EBADF;
+ goto out;
+ }
+ iarg = (int)arg;
+
+ switch (cmd) {
+/* ONC_PLUS EXTRACT END */
+
+ case F_DUPFD:
+ p = curproc;
+ if ((uint_t)iarg >= p->p_fno_ctl) {
+ if (iarg >= 0)
+ fd_too_big(p);
+ error = EINVAL;
+ } else if ((retval = ufalloc_file(iarg, fp)) == -1) {
+ error = EMFILE;
+ } else {
+ mutex_enter(&fp->f_tlock);
+ fp->f_count++;
+ mutex_exit(&fp->f_tlock);
+ }
+ goto done;
+
+ case F_DUP2FD:
+ p = curproc;
+ if (fdes == iarg) {
+ retval = iarg;
+ } else if ((uint_t)iarg >= p->p_fno_ctl) {
+ if (iarg >= 0)
+ fd_too_big(p);
+ error = EBADF;
+ } else {
+ /*
+ * We can't hold our getf(fdes) across the call to
+ * closeandsetf() because it creates a window for
+ * deadlock: if one thread is doing dup2(a, b) while
+ * another is doing dup2(b, a), each one will block
+ * waiting for the other to call releasef(). The
+ * solution is to increment the file reference count
+ * (which we have to do anyway), then releasef(fdes),
+ * then closeandsetf(). Incrementing f_count ensures
+ * that fp won't disappear after we call releasef().
+ */
+ mutex_enter(&fp->f_tlock);
+ fp->f_count++;
+ mutex_exit(&fp->f_tlock);
+ releasef(fdes);
+ (void) closeandsetf(iarg, fp);
+ retval = iarg;
+ goto out;
+ }
+ goto done;
+
+ case F_SETFL:
+ vp = fp->f_vnode;
+ flag = fp->f_flag;
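+		/*
+		 * If both FNONBLOCK and FNDELAY are requested, FNONBLOCK
+		 * semantics take precedence: clear FNDELAY.
+		 */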
+ if ((iarg & (FNONBLOCK|FNDELAY)) == (FNONBLOCK|FNDELAY))
+ iarg &= ~FNDELAY;
+ if ((error = VOP_SETFL(vp, flag, iarg, fp->f_cred)) == 0) {
+ iarg &= FMASK;
+ mutex_enter(&fp->f_tlock);
+ fp->f_flag &= ~FMASK | (FREAD|FWRITE);
+ fp->f_flag |= (iarg - FOPEN) & ~(FREAD|FWRITE);
+ mutex_exit(&fp->f_tlock);
+ }
+ retval = 0;
+ goto done;
+ }
+
+ /*
+ * Finally, deal with the expensive cases.
+ */
+ retval = 0;
+ in_crit = 0;
+ maxoffset = MAXOFF_T;
+ datamodel = DATAMODEL_NATIVE;
+#if defined(_SYSCALL32_IMPL)
+ if ((datamodel = get_udatamodel()) == DATAMODEL_ILP32)
+ maxoffset = MAXOFF32_T;
+#endif
+
+ vp = fp->f_vnode;
+ flag = fp->f_flag;
+ offset = fp->f_offset;
+
+ switch (cmd) {
+/* ONC_PLUS EXTRACT START */
+ /*
+ * The file system and vnode layers understand and implement
+ * locking with flock64 structures. So here once we pass through
+	 * the test for compatibility as defined by the LFS API (for
+	 * F_SETLK, F_SETLKW, F_GETLK and F_FREESP), we transform the
+	 * flock structure to a flock64 structure and send it to the
+	 * lower layers. Similarly, for F_GETLK the returned flock64
+	 * structure is transformed back to a flock structure if
+	 * everything fits; otherwise we return EOVERFLOW.
+ */
+
+ case F_GETLK:
+ case F_O_GETLK:
+ case F_SETLK:
+ case F_SETLKW:
+ case F_SETLK_NBMAND:
+
+ /*
+ * Copy in input fields only.
+ */
+
+ if (cmd == F_O_GETLK) {
+ if (datamodel != DATAMODEL_ILP32) {
+ error = EINVAL;
+ break;
+ }
+
+ if (copyin((void *)arg, &obf, sizeof (obf))) {
+ error = EFAULT;
+ break;
+ }
+ bf.l_type = obf.l_type;
+ bf.l_whence = obf.l_whence;
+ bf.l_start = (off64_t)obf.l_start;
+ bf.l_len = (off64_t)obf.l_len;
+ bf.l_sysid = (int)obf.l_sysid;
+ bf.l_pid = obf.l_pid;
+ } else if (datamodel == DATAMODEL_NATIVE) {
+ if (copyin((void *)arg, &sbf, sizeof (sbf))) {
+ error = EFAULT;
+ break;
+ }
+ /*
+ * XXX In an LP64 kernel with an LP64 application
+ * there's no need to do a structure copy here
+ * struct flock == struct flock64. However,
+ * we did it this way to avoid more conditional
+ * compilation.
+ */
+ bf.l_type = sbf.l_type;
+ bf.l_whence = sbf.l_whence;
+ bf.l_start = (off64_t)sbf.l_start;
+ bf.l_len = (off64_t)sbf.l_len;
+ bf.l_sysid = sbf.l_sysid;
+ bf.l_pid = sbf.l_pid;
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ struct flock32 sbf32;
+ if (copyin((void *)arg, &sbf32, sizeof (sbf32))) {
+ error = EFAULT;
+ break;
+ }
+ bf.l_type = sbf32.l_type;
+ bf.l_whence = sbf32.l_whence;
+ bf.l_start = (off64_t)sbf32.l_start;
+ bf.l_len = (off64_t)sbf32.l_len;
+ bf.l_sysid = sbf32.l_sysid;
+ bf.l_pid = sbf32.l_pid;
+ }
+#endif /* _SYSCALL32_IMPL */
+
+ /*
+ * 64-bit support: check for overflow for 32-bit lock ops
+ */
+ if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0)
+ break;
+
+ /*
+ * Not all of the filesystems understand F_O_GETLK, and
+ * there's no need for them to know. Map it to F_GETLK.
+ */
+ if ((error = VOP_FRLOCK(vp, (cmd == F_O_GETLK) ? F_GETLK : cmd,
+ &bf, flag, offset, NULL, fp->f_cred)) != 0)
+ break;
+
+ /*
+ * If command is GETLK and no lock is found, only
+ * the type field is changed.
+ */
+ if ((cmd == F_O_GETLK || cmd == F_GETLK) &&
+ bf.l_type == F_UNLCK) {
+ /* l_type always first entry, always a short */
+ if (copyout(&bf.l_type, &((struct flock *)arg)->l_type,
+ sizeof (bf.l_type)))
+ error = EFAULT;
+ break;
+ }
+
+ if (cmd == F_O_GETLK) {
+ /*
+ * Return an SVR3 flock structure to the user.
+ */
+ obf.l_type = (int16_t)bf.l_type;
+ obf.l_whence = (int16_t)bf.l_whence;
+ obf.l_start = (int32_t)bf.l_start;
+ obf.l_len = (int32_t)bf.l_len;
+ if (bf.l_sysid > SHRT_MAX || bf.l_pid > SHRT_MAX) {
+ /*
+ * One or both values for the above fields
+ * is too large to store in an SVR3 flock
+ * structure.
+ */
+ error = EOVERFLOW;
+ break;
+ }
+ obf.l_sysid = (int16_t)bf.l_sysid;
+ obf.l_pid = (int16_t)bf.l_pid;
+ if (copyout(&obf, (void *)arg, sizeof (obf)))
+ error = EFAULT;
+ } else if (cmd == F_GETLK) {
+ /*
+ * Copy out SVR4 flock.
+ */
+ int i;
+
+ if (bf.l_start > maxoffset || bf.l_len > maxoffset) {
+ error = EOVERFLOW;
+ break;
+ }
+
+ if (datamodel == DATAMODEL_NATIVE) {
+ for (i = 0; i < 4; i++)
+ sbf.l_pad[i] = 0;
+ /*
+ * XXX In an LP64 kernel with an LP64
+ * application there's no need to do a
+ * structure copy here as currently
+ * struct flock == struct flock64.
+ * We did it this way to avoid more
+ * conditional compilation.
+ */
+ sbf.l_type = bf.l_type;
+ sbf.l_whence = bf.l_whence;
+ sbf.l_start = (off_t)bf.l_start;
+ sbf.l_len = (off_t)bf.l_len;
+ sbf.l_sysid = bf.l_sysid;
+ sbf.l_pid = bf.l_pid;
+ if (copyout(&sbf, (void *)arg, sizeof (sbf)))
+ error = EFAULT;
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ struct flock32 sbf32;
+ if (bf.l_start > MAXOFF32_T ||
+ bf.l_len > MAXOFF32_T) {
+ error = EOVERFLOW;
+ break;
+ }
+ for (i = 0; i < 4; i++)
+ sbf32.l_pad[i] = 0;
+ sbf32.l_type = (int16_t)bf.l_type;
+ sbf32.l_whence = (int16_t)bf.l_whence;
+ sbf32.l_start = (off32_t)bf.l_start;
+ sbf32.l_len = (off32_t)bf.l_len;
+ sbf32.l_sysid = (int32_t)bf.l_sysid;
+ sbf32.l_pid = (pid32_t)bf.l_pid;
+ if (copyout(&sbf32,
+ (void *)arg, sizeof (sbf32)))
+ error = EFAULT;
+ }
+#endif
+ }
+ break;
+/* ONC_PLUS EXTRACT END */
+
+ case F_CHKFL:
+ /*
+ * This is for internal use only, to allow the vnode layer
+ * to validate a flags setting before applying it. User
+ * programs can't issue it.
+ */
+ error = EINVAL;
+ break;
+
+ case F_ALLOCSP:
+ case F_FREESP:
+ if ((flag & FWRITE) == 0) {
+ error = EBADF;
+ break;
+ }
+ if (vp->v_type != VREG) {
+ error = EINVAL;
+ break;
+ }
+
+#if defined(_ILP32) || defined(_SYSCALL32_IMPL)
+ if (datamodel == DATAMODEL_ILP32) {
+ struct flock32 sbf32;
+ /*
+ * For compatibility we overlay an SVR3 flock on an SVR4
+ * flock. This works because the input field offsets
+ * in "struct flock" were preserved.
+ */
+ if (copyin((void *)arg, &sbf32, sizeof (sbf32))) {
+ error = EFAULT;
+ break;
+ } else {
+ bf.l_type = sbf32.l_type;
+ bf.l_whence = sbf32.l_whence;
+ bf.l_start = (off64_t)sbf32.l_start;
+ bf.l_len = (off64_t)sbf32.l_len;
+ bf.l_sysid = sbf32.l_sysid;
+ bf.l_pid = sbf32.l_pid;
+ }
+ }
+#endif /* _ILP32 || _SYSCALL32_IMPL */
+
+#if defined(_LP64)
+ if (datamodel == DATAMODEL_LP64) {
+ if (copyin((void *)arg, &bf, sizeof (bf))) {
+ error = EFAULT;
+ break;
+ }
+ }
+#endif
+
+ if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0)
+ break;
+
+ if (vp->v_type == VREG && bf.l_len == 0 &&
+ bf.l_start > OFFSET_MAX(fp)) {
+ error = EFBIG;
+ break;
+ }
+
+ /*
+ * Make sure that there are no conflicting non-blocking
+ * mandatory locks in the region being manipulated. If
+ * there are such locks then return EACCES.
+ */
+ if ((error = flock_get_start(vp, &bf, offset, &start)) != 0)
+ break;
+
+ if (nbl_need_check(vp)) {
+ u_offset_t begin;
+ ssize_t length;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ vattr.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, 0, CRED())) != 0)
+ break;
+ begin = start > vattr.va_size ? vattr.va_size : start;
+ length = vattr.va_size > start ? vattr.va_size - start :
+ start - vattr.va_size;
+ if (nbl_conflict(vp, NBL_WRITE, begin, length, 0)) {
+ error = EACCES;
+ break;
+ }
+ }
+ error = VOP_SPACE(vp, cmd, &bf, flag, offset, fp->f_cred, NULL);
+ break;
+
+#if !defined(_LP64) || defined(_SYSCALL32_IMPL)
+/* ONC_PLUS EXTRACT START */
+ case F_GETLK64:
+ case F_SETLK64:
+ case F_SETLKW64:
+ case F_SETLK64_NBMAND:
+ /*
+		 * Large Files: Here we map each *LK64 command to its *LK
+		 * counterpart and send that to the lower layers; the *LK64
+		 * commands exist only for userland. Most of the comments
+		 * above for F_SETLK apply here too. Large file support is
+		 * only needed by ILP32 applications.
+ */
+ if (datamodel != DATAMODEL_ILP32) {
+ error = EINVAL;
+ break;
+ }
+
+ if (cmd == F_GETLK64)
+ cmd = F_GETLK;
+ else if (cmd == F_SETLK64)
+ cmd = F_SETLK;
+ else if (cmd == F_SETLKW64)
+ cmd = F_SETLKW;
+ else if (cmd == F_SETLK64_NBMAND)
+ cmd = F_SETLK_NBMAND;
+
+ /*
+ * Note that the size of flock64 is different in the ILP32
+		 * and LP64 models, due to the l_pad field.
+		 * We do not want to assume that the flock64 structure is
+		 * laid out the same in ILP32 and LP64 environments, so
+ * we will copy in the ILP32 version of flock64 explicitly
+ * and copy it to the native flock64 structure.
+ */
+
+ if (copyin((void *)arg, &bf64_32, sizeof (bf64_32))) {
+ error = EFAULT;
+ break;
+ }
+ bf.l_type = (short)bf64_32.l_type;
+ bf.l_whence = (short)bf64_32.l_whence;
+ bf.l_start = bf64_32.l_start;
+ bf.l_len = bf64_32.l_len;
+ bf.l_sysid = (int)bf64_32.l_sysid;
+ bf.l_pid = (pid_t)bf64_32.l_pid;
+
+ if ((error = flock_check(vp, &bf, offset, MAXOFFSET_T)) != 0)
+ break;
+
+ if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset,
+ NULL, fp->f_cred)) != 0)
+ break;
+
+ if ((cmd == F_GETLK) && bf.l_type == F_UNLCK) {
+ if (copyout(&bf.l_type, &((struct flock *)arg)->l_type,
+ sizeof (bf.l_type)))
+ error = EFAULT;
+ break;
+ }
+
+ if (cmd == F_GETLK) {
+ int i;
+
+ /*
+			 * is laid out the same in ILP32 and LP64
+ * is laid out in the same in ILP32 and LP64
+ * environments, so we will copy out the ILP32 version
+ * of flock64 explicitly after copying the native
+ * flock64 structure to it.
+ */
+ for (i = 0; i < 4; i++)
+ bf64_32.l_pad[i] = 0;
+ bf64_32.l_type = (int16_t)bf.l_type;
+ bf64_32.l_whence = (int16_t)bf.l_whence;
+ bf64_32.l_start = bf.l_start;
+ bf64_32.l_len = bf.l_len;
+ bf64_32.l_sysid = (int32_t)bf.l_sysid;
+ bf64_32.l_pid = (pid32_t)bf.l_pid;
+ if (copyout(&bf64_32, (void *)arg, sizeof (bf64_32)))
+ error = EFAULT;
+ }
+ break;
+/* ONC_PLUS EXTRACT END */
+
+ case F_FREESP64:
+ if (datamodel != DATAMODEL_ILP32) {
+ error = EINVAL;
+ break;
+ }
+ cmd = F_FREESP;
+ if ((flag & FWRITE) == 0)
+ error = EBADF;
+ else if (vp->v_type != VREG)
+ error = EINVAL;
+ else if (copyin((void *)arg, &bf64_32, sizeof (bf64_32)))
+ error = EFAULT;
+ else {
+ /*
+ * Note that the size of flock64 is different in
+ * the ILP32 and LP64 models, due to the l_pad field.
+ * We do not want to assume that the flock64 structure
+ * is laid out the same in ILP32 and LP64
+ * environments, so we will copy in the ILP32
+ * version of flock64 explicitly and copy it to
+ * the native flock64 structure.
+ */
+ bf.l_type = (short)bf64_32.l_type;
+ bf.l_whence = (short)bf64_32.l_whence;
+ bf.l_start = bf64_32.l_start;
+ bf.l_len = bf64_32.l_len;
+ bf.l_sysid = (int)bf64_32.l_sysid;
+ bf.l_pid = (pid_t)bf64_32.l_pid;
+
+ if ((error = flock_check(vp, &bf, offset,
+ MAXOFFSET_T)) != 0)
+ break;
+
+ if (vp->v_type == VREG && bf.l_len == 0 &&
+ bf.l_start > OFFSET_MAX(fp)) {
+ error = EFBIG;
+ break;
+ }
+ /*
+ * Make sure that there are no conflicting non-blocking
+ * mandatory locks in the region being manipulated. If
+ * there are such locks then return EACCES.
+ */
+ if ((error = flock_get_start(vp, &bf, offset,
+ &start)) != 0)
+ break;
+ if (nbl_need_check(vp)) {
+ u_offset_t begin;
+ ssize_t length;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ vattr.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, 0,
+ CRED())) != 0)
+ break;
+ begin = start > vattr.va_size ?
+ vattr.va_size : start;
+ length = vattr.va_size > start ?
+ vattr.va_size - start :
+ start - vattr.va_size;
+ if (nbl_conflict(vp, NBL_WRITE, begin,
+ length, 0)) {
+ error = EACCES;
+ break;
+ }
+ }
+ error = VOP_SPACE(vp, cmd, &bf, flag, offset,
+ fp->f_cred, NULL);
+ }
+ break;
+#endif /* !_LP64 || _SYSCALL32_IMPL */
+
+/* ONC_PLUS EXTRACT START */
+ case F_SHARE:
+ case F_SHARE_NBMAND:
+ case F_UNSHARE:
+
+ /*
+ * Copy in input fields only.
+ */
+ if (copyin((void *)arg, &fsh, sizeof (fsh))) {
+ error = EFAULT;
+ break;
+ }
+
+ /*
+ * Local share reservations always have this simple form
+ */
+ shr.s_access = fsh.f_access;
+ shr.s_deny = fsh.f_deny;
+ shr.s_sysid = 0;
+ shr.s_pid = ttoproc(curthread)->p_pid;
+ shr_own.sl_pid = shr.s_pid;
+ shr_own.sl_id = fsh.f_id;
+ shr.s_own_len = sizeof (shr_own);
+ shr.s_owner = (caddr_t)&shr_own;
+ error = VOP_SHRLOCK(vp, cmd, &shr, flag, fp->f_cred);
+/* ONC_PLUS EXTRACT END */
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (in_crit)
+ nbl_end_crit(vp);
+
+done:
+ releasef(fdes);
+out:
+ if (error)
+ return (set_errno(error));
+ return (retval);
+}
+
+int
+dup(int fd)
+{
+ return (fcntl(fd, F_DUPFD, 0));
+}
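+
+/*
+ * dup2(old, new) is expected to reach the F_DUP2FD case above in much
+ * the same way, roughly as fcntl(old, F_DUP2FD, new); a sketch of the
+ * mapping, not a statement about libc internals.
+ */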
+
+/* ONC_PLUS EXTRACT START */
+int
+flock_check(vnode_t *vp, flock64_t *flp, offset_t offset, offset_t max)
+{
+ struct vattr vattr;
+ int error;
+ u_offset_t start, end;
+
+ /*
+ * Determine the starting point of the request
+ */
+ switch (flp->l_whence) {
+ case 0: /* SEEK_SET */
+ start = (u_offset_t)flp->l_start;
+ if (start > max)
+ return (EINVAL);
+ break;
+ case 1: /* SEEK_CUR */
+ if (flp->l_start > (max - offset))
+ return (EOVERFLOW);
+ start = (u_offset_t)(flp->l_start + offset);
+ if (start > max)
+ return (EINVAL);
+ break;
+ case 2: /* SEEK_END */
+ vattr.va_mask = AT_SIZE;
+ if (error = VOP_GETATTR(vp, &vattr, 0, CRED()))
+ return (error);
+ if (flp->l_start > (max - (offset_t)vattr.va_size))
+ return (EOVERFLOW);
+ start = (u_offset_t)(flp->l_start + (offset_t)vattr.va_size);
+ if (start > max)
+ return (EINVAL);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /*
+ * Determine the range covered by the request.
+ */
+ if (flp->l_len == 0)
+ end = MAXEND;
+ else if ((offset_t)flp->l_len > 0) {
+ if (flp->l_len > (max - start + 1))
+ return (EOVERFLOW);
+ end = (u_offset_t)(start + (flp->l_len - 1));
+ ASSERT(end <= max);
+ } else {
+ /*
+		 * Negative length; why do we even allow this?
+		 * Because it allows easy specification of
+		 * the last n bytes of the file.
+ */
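+		/*
+		 * For example, l_len = -10 makes the locked range the
+		 * 10 bytes ending at (and including) the offset computed
+		 * for 'start' in the switch above.
+		 */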
+ end = start;
+ start += (u_offset_t)flp->l_len;
+ (start)++;
+ if (start > max)
+ return (EINVAL);
+ ASSERT(end <= max);
+ }
+ ASSERT(start <= max);
+ if (flp->l_type == F_UNLCK && flp->l_len > 0 &&
+ end == (offset_t)max) {
+ flp->l_len = 0;
+ }
+ if (start > end)
+ return (EINVAL);
+ return (0);
+}
+
+static int
+flock_get_start(vnode_t *vp, flock64_t *flp, offset_t offset, u_offset_t *start)
+{
+ struct vattr vattr;
+ int error;
+
+ /*
+ * Determine the starting point of the request. Assume that it is
+ * a valid starting point.
+ */
+ switch (flp->l_whence) {
+ case 0: /* SEEK_SET */
+ *start = (u_offset_t)flp->l_start;
+ break;
+ case 1: /* SEEK_CUR */
+ *start = (u_offset_t)(flp->l_start + offset);
+ break;
+ case 2: /* SEEK_END */
+ vattr.va_mask = AT_SIZE;
+ if (error = VOP_GETATTR(vp, &vattr, 0, CRED()))
+ return (error);
+ *start = (u_offset_t)(flp->l_start + (offset_t)vattr.va_size);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
+ * Take rctl action when the requested file descriptor is too big.
+ */
+static void
+fd_too_big(proc_t *p)
+{
+ mutex_enter(&p->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+ p->p_rctls, p, RCA_SAFE);
+ mutex_exit(&p->p_lock);
+}
+/* ONC_PLUS EXTRACT END */
diff --git a/usr/src/uts/common/syscall/fdsync.c b/usr/src/uts/common/syscall/fdsync.c
new file mode 100644
index 0000000000..9951eb8727
--- /dev/null
+++ b/usr/src/uts/common/syscall/fdsync.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1998 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/debug.h>
+
+/*
+ * Flush output pending for file.
+ */
+int
+fdsync(int fd, int flag)
+{
+ file_t *fp;
+ register int error;
+ int syncflag;
+
+ if ((fp = getf(fd)) != NULL) {
+		/*
+		 * The flag selects the kind of sync:
+		 * FSYNC requests a full file sync (data
+		 * and metadata), FDSYNC a data-only sync.
+		 */
+ syncflag = flag & (FSYNC|FDSYNC);
+
+ if (error = VOP_FSYNC(fp->f_vnode, syncflag, fp->f_cred))
+ (void) set_errno(error);
+ releasef(fd);
+ } else
+ error = set_errno(EBADF);
+ return (error);
+}
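
At user level the FSYNC/FDSYNC split surfaces as fsync(3C) versus fdatasync(3C), which presumably reach the kernel through this entry point. A small sketch using only standard POSIX calls:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = open("journal.dat", O_WRONLY | O_CREAT, 0644);

		if (fd == -1)
			return (1);
		(void) write(fd, "record\n", 7);

		/* data sync: the data plus only the metadata needed to read it */
		if (fdatasync(fd) == -1)
			perror("fdatasync");

		/* full sync: the data and all metadata (size, timestamps, ...) */
		if (fsync(fd) == -1)
			perror("fsync");

		return (close(fd) == -1);
	}
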
diff --git a/usr/src/uts/common/syscall/fsat.c b/usr/src/uts/common/syscall/fsat.c
new file mode 100644
index 0000000000..5e78a738c7
--- /dev/null
+++ b/usr/src/uts/common/syscall/fsat.c
@@ -0,0 +1,162 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/debug.h>
+
+extern int openat(int, char *, int, int);
+extern int renameat(int, char *, int, char *);
+extern int unlinkat(int, char *, int);
+extern int fchownat(int, char *, uid_t, gid_t, int);
+extern int fstatat(int, char *, struct stat *, int);
+extern int futimesat(int, char *, struct timeval *);
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+extern int fstatat64_32(int, char *, struct stat64_32 *, int);
+extern int fstatat32(int, char *, struct stat32 *, int);
+extern int openat32(int, char *, int, int);
+extern int fstatat64(int, char *, struct stat64 *, int);
+extern int openat64(int, char *, int, int);
+#endif
+
+
+/*
+ * Handle all of the *at system calls
+ *
+ * subcodes:
+ * 0 - openat
+ * 1 - openat64
+ * 2 - fstatat64
+ * 3 - fstatat
+ * 4 - fchownat
+ * 5 - unlinkat
+ * 6 - futimesat
+ * 7 - renameat
+ *
+ * The code for handling the at functionality exists in the file where the
+ * base syscall is defined. For example openat is in open.c
+ */
+
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+
+int
+fsat32(int code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
+ uintptr_t arg4, uintptr_t arg5)
+{
+ switch (code) {
+
+ case 0: /* openat */
+#if defined(_LP64)
+ return (openat32((int)arg1, (char *)arg2,
+ (int)arg3, (int)arg4));
+#else
+ return (openat((int)arg1, (char *)arg2,
+ (int)arg3, (int)arg4));
+#endif
+ case 1: /* openat64 */
+ return (openat64((int)arg1, (char *)arg2,
+ (int)arg3, (int)arg4));
+ case 2: /* fstatat64 */
+#if defined(_LP64)
+ return (fstatat64_32((int)arg1, (char *)arg2,
+ (struct stat64_32 *)arg3, (int)arg4));
+#else
+ return (fstatat64((int)arg1, (char *)arg2,
+ (struct stat64 *)arg3, (int)arg4));
+#endif
+ case 3: /* fstatat */
+#if defined(_LP64)
+ return (fstatat32((int)arg1, (char *)arg2,
+ (struct stat32 *)arg3, (int)arg4));
+#else
+ return (fstatat((int)arg1, (char *)arg2,
+ (struct stat *)arg3, (int)arg4));
+#endif
+ case 4: /* fchownat */
+ return (fchownat((int)arg1, (char *)arg2,
+ (uid_t)arg3, (gid_t)arg4, (int)arg5));
+ case 5: /* unlinkat */
+ return (unlinkat((int)arg1, (char *)arg2, (int)arg3));
+ case 6: /* futimesat */
+ return (futimesat((int)arg1,
+ (char *)arg2, (struct timeval *)arg3));
+ case 7: /* renameat */
+ return (renameat((int)arg1, (char *)arg2, (int)arg3,
+ (char *)arg4));
+ default:
+ return (set_errno(EINVAL));
+ }
+}
+
+#endif
+
+/*
+ * For 64-bit kernels, use fsat64
+ */
+
+#if defined(_LP64)
+
+int
+fsat64(int code, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
+ uintptr_t arg4, uintptr_t arg5)
+{
+ switch (code) {
+
+ case 0: /* openat */
+ return (openat((int)arg1, (char *)arg2,
+ (int)arg3, (int)arg4));
+ case 1: /* openat64 */
+ return (set_errno(ENOSYS));
+ case 2: /* fstatat64 */
+ return (set_errno(ENOSYS));
+ case 3: /* fstatat */
+ return (fstatat((int)arg1, (char *)arg2,
+ (struct stat *)arg3, (int)arg4));
+ case 4: /* fchownat */
+ return (fchownat((int)arg1, (char *)arg2,
+ (uid_t)arg3, (gid_t)arg4, (int)arg5));
+ case 5: /* unlinkat */
+ return (unlinkat((int)arg1, (char *)arg2, (int)arg3));
+ case 6: /* futimesat */
+ return (futimesat((int)arg1,
+ (char *)arg2, (struct timeval *)arg3));
+ case 7: /* renameat */
+ return (renameat((int)arg1, (char *)arg2, (int)arg3,
+ (char *)arg4));
+ default:
+ return (set_errno(EINVAL));
+ }
+}
+#endif
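
At user level the subcodes above are just the familiar POSIX interfaces. A short sketch exercising the openat (0), fstatat (3), and unlinkat (5) paths relative to a directory descriptor:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct stat st;
		int dfd, fd;

		if ((dfd = open("/tmp", O_RDONLY)) == -1)
			return (1);

		/* subcode 0: "demo.txt" resolves relative to dfd, not the cwd */
		if ((fd = openat(dfd, "demo.txt", O_CREAT | O_WRONLY, 0644)) != -1)
			(void) close(fd);

		/* subcode 3 */
		if (fstatat(dfd, "demo.txt", &st, 0) == 0)
			(void) printf("size %lld\n", (long long)st.st_size);

		/* subcode 5 */
		(void) unlinkat(dfd, "demo.txt", 0);

		(void) close(dfd);
		return (0);
	}
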
diff --git a/usr/src/uts/common/syscall/getcwd.c b/usr/src/uts/common/syscall/getcwd.c
new file mode 100644
index 0000000000..f0ce066115
--- /dev/null
+++ b/usr/src/uts/common/syscall/getcwd.c
@@ -0,0 +1,81 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/copyops.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/param.h>
+#include <sys/pathname.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/vnode.h>
+
+int
+getcwd(char *buf, size_t buflen)
+{
+ int ret;
+ char *kbuf;
+ size_t kbuflen;
+
+ /*
+ * The user should be able to specify any size buffer, but we don't want
+ * to arbitrarily allocate huge kernel buffers just because the user
+ * requests it. So we'll start with MAXPATHLEN (which should hold any
+ * normal path), and only increase it if we fail with ERANGE.
+ */
+ kbuflen = MIN(buflen, MAXPATHLEN);
+
+ for (;;) {
+ kbuf = kmem_alloc(kbuflen, KM_SLEEP);
+
+		if ((ret = dogetcwd(kbuf, kbuflen)) == 0 &&
+		    copyout(kbuf, buf, strlen(kbuf) + 1) != 0)
+			ret = EFAULT;
+
+ kmem_free(kbuf, kbuflen);
+
+ if (ret == ENAMETOOLONG) {
+ /*
+ * If the user's buffer really was too small, give up.
+ * For some reason, getcwd() uses ERANGE for this case.
+ */
+ if (kbuflen == buflen) {
+ ret = ERANGE;
+ break;
+ }
+ kbuflen = MIN(kbuflen * 2, buflen);
+ } else {
+ break;
+ }
+ }
+
+ if (ret)
+ return (set_errno(ret));
+
+ return (ret);
+}
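
The grow-and-retry loop above mirrors what a careful caller does with getcwd(3C) itself when its buffer proves too small. A user-level sketch; getcwd_alloc() is a hypothetical helper name:

	#include <errno.h>
	#include <stdlib.h>
	#include <unistd.h>

	/* Return the cwd in a malloc'ed buffer the caller frees, or NULL. */
	static char *
	getcwd_alloc(void)
	{
		size_t len = 256;
		char *buf;

		for (;;) {
			if ((buf = malloc(len)) == NULL)
				return (NULL);
			if (getcwd(buf, len) != NULL)
				return (buf);
			free(buf);
			if (errno != ERANGE)	/* real failure, not just size */
				return (NULL);
			len *= 2;		/* same grow-and-retry idea */
		}
	}
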
diff --git a/usr/src/uts/common/syscall/getdents.c b/usr/src/uts/common/syscall/getdents.c
new file mode 100644
index 0000000000..fe97a02621
--- /dev/null
+++ b/usr/src/uts/common/syscall/getdents.c
@@ -0,0 +1,236 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/inttypes.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/dirent.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/uio.h>
+#include <sys/ioreq.h>
+#include <sys/filio.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+
+/*
+ * Get directory entries in a file system-independent format.
+ *
+ * The 32-bit version of this function now allocates a buffer to grab the
+ * directory entries in dirent64 format from VOP_READDIR routines.
+ * The dirent64 structures are converted to dirent32 structures and
+ * copied to the user space.
+ *
+ * Both 32-bit and 64-bit versions of libc use getdents64() and therefore
+ * we don't expect any major performance impact due to the extra kmem_alloc's
+ * and copying done in this routine.
+ */
+
+#define MAXGETDENTS_SIZE (64 * 1024)
+
+/*
+ * Native 32-bit system call for non-large-file applications.
+ */
+int
+getdents32(int fd, void *buf, size_t count)
+{
+ vnode_t *vp;
+ file_t *fp;
+ struct uio auio;
+ struct iovec aiov;
+ register int error;
+ int sink;
+ char *newbuf;
+ char *obuf;
+ int bufsize;
+ int osize, nsize;
+ struct dirent64 *dp;
+ struct dirent32 *op;
+
+ if (count < sizeof (struct dirent32))
+ return (set_errno(EINVAL));
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+ vp = fp->f_vnode;
+ if (vp->v_type != VDIR) {
+ releasef(fd);
+ return (set_errno(ENOTDIR));
+ }
+
+ /*
+ * Don't let the user overcommit kernel resources.
+ */
+ if (count > MAXGETDENTS_SIZE)
+ count = MAXGETDENTS_SIZE;
+
+ bufsize = count;
+ newbuf = kmem_alloc(bufsize, KM_SLEEP);
+ obuf = kmem_alloc(bufsize, KM_SLEEP);
+
+ aiov.iov_base = newbuf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = fp->f_offset;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = count;
+ auio.uio_fmode = 0;
+ auio.uio_extflg = UIO_COPY_CACHED;
+ (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &sink);
+ VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
+ if (error)
+ goto out;
+ count = count - auio.uio_resid;
+ fp->f_offset = auio.uio_loffset;
+
+ dp = (struct dirent64 *)newbuf;
+ op = (struct dirent32 *)obuf;
+ osize = 0;
+ nsize = 0;
+
+ while (nsize < count) {
+ uint32_t reclen, namlen;
+
+ /*
+ * This check ensures that the 64 bit d_ino and d_off
+ * fields will fit into their 32 bit equivalents.
+ *
+ * Although d_off is a signed value, the check is done
+ * against the full 32 bits because certain file systems,
+ * NFS for one, allow directory cookies to use the full
+ * 32 bits. We use uint64_t because there is no exact
+ * unsigned analog to the off64_t type of dp->d_off.
+ */
+ if (dp->d_ino > (ino64_t)UINT32_MAX ||
+ dp->d_off > (uint64_t)UINT32_MAX) {
+ error = EOVERFLOW;
+ goto out;
+ }
+ op->d_ino = (ino32_t)dp->d_ino;
+ op->d_off = (off32_t)dp->d_off;
+ namlen = strlen(dp->d_name);
+ reclen = DIRENT32_RECLEN(namlen);
+ op->d_reclen = (uint16_t)reclen;
+
+ /* use strncpy(9f) to zero out uninitialized bytes */
+
+ (void) strncpy(op->d_name, dp->d_name,
+ DIRENT32_NAMELEN(reclen));
+ nsize += (uint_t)dp->d_reclen;
+ osize += (uint_t)op->d_reclen;
+ dp = (struct dirent64 *)((char *)dp + (uint_t)dp->d_reclen);
+ op = (struct dirent32 *)((char *)op + (uint_t)op->d_reclen);
+ }
+
+ ASSERT(osize <= count);
+ ASSERT((char *)op <= (char *)obuf + bufsize);
+ ASSERT((char *)dp <= (char *)newbuf + bufsize);
+
+ if ((error = copyout(obuf, buf, osize)) < 0)
+ error = EFAULT;
+out:
+ kmem_free(newbuf, bufsize);
+ kmem_free(obuf, bufsize);
+
+ if (error) {
+ releasef(fd);
+ return (set_errno(error));
+ }
+
+ releasef(fd);
+ return (osize);
+}
+
+#endif /* _SYSCALL32_IMPL || _ILP32 */
+
+int
+getdents64(int fd, void *buf, size_t count)
+{
+ vnode_t *vp;
+ file_t *fp;
+ struct uio auio;
+ struct iovec aiov;
+ register int error;
+ int sink;
+
+ if (count < sizeof (struct dirent64))
+ return (set_errno(EINVAL));
+
+ /*
+ * Don't let the user overcommit kernel resources.
+ */
+ if (count > MAXGETDENTS_SIZE)
+ count = MAXGETDENTS_SIZE;
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+ vp = fp->f_vnode;
+ if (vp->v_type != VDIR) {
+ releasef(fd);
+ return (set_errno(ENOTDIR));
+ }
+ aiov.iov_base = buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = fp->f_offset;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_resid = count;
+ auio.uio_fmode = 0;
+ auio.uio_extflg = UIO_COPY_CACHED;
+ (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &sink);
+ VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
+ if (error) {
+ releasef(fd);
+ return (set_errno(error));
+ }
+ count = count - auio.uio_resid;
+ fp->f_offset = auio.uio_loffset;
+ releasef(fd);
+ return (count);
+}
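
A consumer of the buffer filled in by getdents64() must step through the variable-length records using d_reclen, exactly as the conversion loop above advances dp and op. A hedged user-level sketch; it assumes the direct getdents(2) wrapper that Solaris libc provides (the declaration below is a simplification):

	#include <dirent.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	extern int getdents(int, struct dirent *, size_t);	/* simplified decl */

	int
	main(void)
	{
		union {
			struct dirent d;	/* forces proper alignment */
			char buf[8192];
		} u;
		int fd, n;

		if ((fd = open(".", O_RDONLY)) == -1)
			return (1);
		while ((n = getdents(fd, &u.d, sizeof (u))) > 0) {
			char *p = u.buf;

			while (p < u.buf + n) {
				struct dirent *d = (struct dirent *)(void *)p;

				(void) printf("%s\n", d->d_name);
				p += d->d_reclen;	/* variable-length step */
			}
		}
		(void) close(fd);
		return (0);
	}
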
diff --git a/usr/src/uts/common/syscall/getloadavg.c b/usr/src/uts/common/syscall/getloadavg.c
new file mode 100644
index 0000000000..c669f9b8ba
--- /dev/null
+++ b/usr/src/uts/common/syscall/getloadavg.c
@@ -0,0 +1,68 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/loadavg.h>
+#include <sys/zone.h>
+#include <sys/pool_pset.h>
+
+/*
+ * Extract elements of the raw avenrun array from the kernel for the
+ * implementation of getloadavg(3c)
+ */
+int
+getloadavg(int *buf, int nelem)
+{
+ int *loadbuf = &avenrun[0];
+ int loadavg[LOADAVG_NSTATS];
+ int error;
+
+ if (nelem < 0)
+ return (set_errno(EINVAL));
+ if (nelem > LOADAVG_NSTATS)
+ nelem = LOADAVG_NSTATS;
+
+ if (!INGLOBALZONE(curproc)) {
+ mutex_enter(&cpu_lock);
+ if (pool_pset_enabled()) {
+ psetid_t psetid = zone_pset_get(curproc->p_zone);
+
+ error = cpupart_get_loadavg(psetid, &loadavg[0], nelem);
+ ASSERT(error == 0); /* pset isn't going anywhere */
+ loadbuf = &loadavg[0];
+ }
+ mutex_exit(&cpu_lock);
+ }
+
+ error = copyout(loadbuf, buf, nelem * sizeof (avenrun[0]));
+ if (error)
+ return (set_errno(EFAULT));
+ return (nelem);
+}
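
The raw avenrun values copied out above are fixed-point; the getloadavg(3C) library routine scales them to doubles. A minimal user-level sketch, assuming the interface documented in <sys/loadavg.h>:

	#include <stdio.h>
	#include <sys/loadavg.h>

	int
	main(void)
	{
		double avg[LOADAVG_NSTATS];

		if (getloadavg(avg, LOADAVG_NSTATS) == -1)
			return (1);
		(void) printf("1m %.2f  5m %.2f  15m %.2f\n",
		    avg[LOADAVG_1MIN], avg[LOADAVG_5MIN], avg[LOADAVG_15MIN]);
		return (0);
	}
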
diff --git a/usr/src/uts/common/syscall/getpagesizes.c b/usr/src/uts/common/syscall/getpagesizes.c
new file mode 100644
index 0000000000..d53e9a9936
--- /dev/null
+++ b/usr/src/uts/common/syscall/getpagesizes.c
@@ -0,0 +1,122 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <vm/page.h>
+#include <sys/errno.h>
+
+/*
+ * Return supported page sizes.
+ */
+int
+getpagesizes(size_t *buf, int nelem)
+{
+ int i, pagesizes = page_num_user_pagesizes();
+ size_t *pgsza;
+
+ if (nelem < 0) {
+ return (set_errno(EINVAL));
+ }
+ if (nelem == 0 && buf != NULL) {
+ return (set_errno(EINVAL));
+ }
+ if (nelem == 0 && buf == NULL) {
+ return (pagesizes);
+ }
+ if (buf == NULL) {
+ return (set_errno(EINVAL));
+ }
+ if (nelem > pagesizes) {
+ nelem = pagesizes;
+ }
+ pgsza = kmem_alloc(sizeof (*pgsza) * nelem, KM_SLEEP);
+ for (i = 0; i < nelem; i++) {
+ pgsza[i] = page_get_user_pagesize(i);
+ }
+ if (copyout(pgsza, buf, nelem * sizeof (*pgsza)) != 0) {
+ kmem_free(pgsza, sizeof (*pgsza) * nelem);
+ return (set_errno(EFAULT));
+ }
+ kmem_free(pgsza, sizeof (*pgsza) * nelem);
+ return (nelem);
+}
+
+#if defined(_SYSCALL32_IMPL)
+
+/*
+ * Some future platforms may support page sizes larger than can be
+ * represented in 32 bits; such sizes are omitted for 32-bit callers.
+ */
+int
+getpagesizes32(size32_t *buf, int nelem)
+{
+ int i, pagesizes = page_num_user_pagesizes();
+ size32_t *pgsza32;
+ size_t pgsz;
+ int rc;
+
+ if (nelem < 0) {
+ return (set_errno(EINVAL));
+ }
+ if (nelem == 0 && buf != NULL) {
+ return (set_errno(EINVAL));
+ }
+
+ pgsza32 = kmem_alloc(sizeof (*pgsza32) * pagesizes, KM_SLEEP);
+ for (i = 0; i < pagesizes; i++) {
+ pgsz = page_get_user_pagesize(i);
+ pgsza32[i] = (size32_t)pgsz;
+ if (pgsz > (size32_t)-1) {
+			pagesizes = i;
+ break;
+ }
+ }
+ ASSERT(pagesizes > 0);
+ ASSERT(page_get_user_pagesize(pagesizes - 1) <= (size32_t)-1);
+ if (nelem > pagesizes) {
+ nelem = pagesizes;
+ }
+ if (nelem == 0 && buf == NULL) {
+ rc = pagesizes;
+ goto done;
+ }
+ if (buf == NULL) {
+ rc = set_errno(EINVAL);
+ goto done;
+ }
+ if (copyout(pgsza32, buf, nelem * sizeof (*pgsza32)) != 0) {
+ rc = set_errno(EFAULT);
+ goto done;
+ }
+ rc = nelem;
+done:
+ kmem_free(pgsza32, sizeof (*pgsza32) * page_num_user_pagesizes());
+ return (rc);
+}
+#endif
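
The nelem == 0, buf == NULL case above is a size query, so the natural calling pattern for getpagesizes(3C) is two calls: one to learn the count, one to fetch the sizes. A user-level sketch, assuming the declaration in <sys/mman.h>:

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>

	int
	main(void)
	{
		int i, n = getpagesizes(NULL, 0);	/* how many sizes? */
		size_t *sz;

		if (n <= 0)
			return (1);
		if ((sz = malloc(n * sizeof (size_t))) == NULL)
			return (1);
		if ((n = getpagesizes(sz, n)) > 0)
			for (i = 0; i < n; i++)
				(void) printf("%lu\n", (unsigned long)sz[i]);
		free(sz);
		return (0);
	}
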
diff --git a/usr/src/uts/common/syscall/getpid.c b/usr/src/uts/common/syscall/getpid.c
new file mode 100644
index 0000000000..d061fe3b8b
--- /dev/null
+++ b/usr/src/uts/common/syscall/getpid.c
@@ -0,0 +1,56 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/debug.h>
+#include <sys/zone.h>
+
+
+int64_t
+getpid(void)
+{
+ rval_t r;
+ proc_t *p;
+
+ p = ttoproc(curthread);
+ r.r_val1 = p->p_pid;
+ if (p->p_flag & SZONETOP)
+ r.r_val2 = curproc->p_zone->zone_zsched->p_pid;
+ else
+ r.r_val2 = p->p_ppid;
+ return (r.r_vals);
+}
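
getpid() above returns two values at once, the pid in r_val1 and (for ordinary processes) the parent's pid in r_val2, which is how one trap can back both the getpid() and getppid() stubs in libc. A hedged illustration of the packing; demo_rval_t is a hypothetical stand-in, and the real rval_t orders its halves by byte order:

	#include <stdio.h>
	#include <sys/types.h>

	typedef union {
		int64_t	r_vals;			/* what the trap returns */
		struct {
			int32_t	r_v1;		/* first result (pid) */
			int32_t	r_v2;		/* second result (ppid) */
		} r;
	} demo_rval_t;

	int
	main(void)
	{
		demo_rval_t rv;

		rv.r.r_v1 = 1234;		/* pretend pid */
		rv.r.r_v2 = 1;			/* pretend parent pid */
		(void) printf("pid %d ppid %d\n", rv.r.r_v1, rv.r.r_v2);
		return (0);
	}
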
diff --git a/usr/src/uts/common/syscall/gid.c b/usr/src/uts/common/syscall/gid.c
new file mode 100644
index 0000000000..1cd5a4fd24
--- /dev/null
+++ b/usr/src/uts/common/syscall/gid.c
@@ -0,0 +1,235 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1994,2001-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/cred_impl.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/debug.h>
+#include <sys/policy.h>
+
+
+int
+setgid(gid_t gid)
+{
+ register proc_t *p;
+ int error;
+ int do_nocd = 0;
+ cred_t *cr, *newcr;
+
+ if (gid < 0 || gid > MAXUID)
+ return (set_errno(EINVAL));
+
+ /*
+ * Need to pre-allocate the new cred structure before grabbing
+ * the p_crlock mutex.
+ */
+ newcr = cralloc();
+ p = ttoproc(curthread);
+ mutex_enter(&p->p_crlock);
+ cr = p->p_cred;
+
+ if ((gid == cr->cr_rgid || gid == cr->cr_sgid) &&
+ secpolicy_allow_setid(cr, -1, B_TRUE) != 0) {
+ error = 0;
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+ newcr->cr_gid = gid;
+ } else if ((error = secpolicy_allow_setid(cr, -1, B_FALSE)) == 0) {
+ /*
+ * A privileged process that makes itself look like a
+ * set-gid process must be marked to produce no core dump.
+ */
+ if (cr->cr_gid != gid ||
+ cr->cr_rgid != gid ||
+ cr->cr_sgid != gid)
+ do_nocd = 1;
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+ newcr->cr_gid = gid;
+ newcr->cr_rgid = gid;
+ newcr->cr_sgid = gid;
+ } else
+ crfree(newcr);
+
+ mutex_exit(&p->p_crlock);
+
+ if (error == 0) {
+ if (do_nocd) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ }
+ crset(p, newcr); /* broadcast to process threads */
+ return (0);
+ }
+ return (set_errno(error));
+}
+
+int64_t
+getgid(void)
+{
+ rval_t r;
+ cred_t *cr;
+
+ cr = curthread->t_cred;
+ r.r_val1 = cr->cr_rgid;
+ r.r_val2 = cr->cr_gid;
+ return (r.r_vals);
+}
+
+int
+setegid(gid_t gid)
+{
+ register proc_t *p;
+ register cred_t *cr, *newcr;
+ int error = EPERM;
+ int do_nocd = 0;
+
+ if (gid < 0 || gid > MAXUID)
+ return (set_errno(EINVAL));
+
+ /*
+ * Need to pre-allocate the new cred structure before grabbing
+ * the p_crlock mutex.
+ */
+ newcr = cralloc();
+ p = ttoproc(curthread);
+ mutex_enter(&p->p_crlock);
+ cr = p->p_cred;
+ if (gid == cr->cr_rgid || gid == cr->cr_gid || gid == cr->cr_sgid ||
+ (error = secpolicy_allow_setid(cr, -1, B_FALSE)) == 0) {
+ /*
+ * A privileged process that makes itself look like a
+ * set-gid process must be marked to produce no core dump.
+ */
+ if (cr->cr_gid != gid && error == 0)
+ do_nocd = 1;
+ error = 0;
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+ newcr->cr_gid = gid;
+ } else
+ crfree(newcr);
+
+ mutex_exit(&p->p_crlock);
+
+ if (error == 0) {
+ if (do_nocd) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ }
+ crset(p, newcr); /* broadcast to process threads */
+ return (0);
+ }
+ return (set_errno(error));
+}
+
+/*
+ * Buy-back from SunOS 4.x
+ *
+ * Like setgid() and setegid() combined -except- that non-root users
+ * can change cr_rgid to cr_gid, and the semantics of cr_sgid are
+ * subtly different.
+ */
+int
+setregid(gid_t rgid, gid_t egid)
+{
+ proc_t *p;
+ int error = EPERM;
+ int do_nocd = 0;
+ cred_t *cr, *newcr;
+
+ if ((rgid != -1 && (rgid < 0 || rgid > MAXUID)) ||
+ (egid != -1 && (egid < 0 || egid > MAXUID)))
+ return (set_errno(EINVAL));
+
+ /*
+ * Need to pre-allocate the new cred structure before grabbing
+ * the p_crlock mutex.
+ */
+ newcr = cralloc();
+
+ p = ttoproc(curthread);
+ mutex_enter(&p->p_crlock);
+ cr = p->p_cred;
+
+ if ((rgid == -1 ||
+ rgid == cr->cr_rgid || rgid == cr->cr_gid || rgid == cr->cr_sgid) &&
+ (egid == -1 || egid == cr->cr_rgid || egid == cr->cr_gid ||
+ egid == cr->cr_sgid) ||
+ (error = secpolicy_allow_setid(cr, -1, B_FALSE)) == 0) {
+ crhold(cr);
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+
+ if (egid != -1)
+ newcr->cr_gid = egid;
+ if (rgid != -1)
+ newcr->cr_rgid = rgid;
+ /*
+ * "If the real gid is being changed, or the effective gid is
+ * being changed to a value not equal to the real gid, the
+ * saved gid is set to the new effective gid."
+ */
+ if (rgid != -1 ||
+ (egid != -1 && newcr->cr_gid != newcr->cr_rgid))
+ newcr->cr_sgid = newcr->cr_gid;
+ /*
+ * A privileged process that makes itself look like a
+ * set-gid process must be marked to produce no core dump.
+ */
+ if ((cr->cr_gid != newcr->cr_gid ||
+ cr->cr_rgid != newcr->cr_rgid ||
+ cr->cr_sgid != newcr->cr_sgid) && error == 0)
+ do_nocd = 1;
+ error = 0;
+ crfree(cr);
+ }
+ mutex_exit(&p->p_crlock);
+
+ if (error == 0) {
+ if (do_nocd) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ }
+ crset(p, newcr); /* broadcast to process threads */
+ return (0);
+ }
+ crfree(newcr);
+ return (set_errno(error));
+}
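
A common user-level use of the combined semantics above is a set-gid program swapping its real and effective gids to toggle privilege. A small sketch with standard calls; note that, per the rule quoted in the code, changing the real gid also moves the saved gid to the new effective gid:

	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		gid_t rgid = getgid(), egid = getegid();

		/* swap real and effective gids */
		if (setregid(egid, rgid) == -1) {
			perror("setregid");
			return (1);
		}
		(void) printf("rgid %d egid %d\n", (int)getgid(), (int)getegid());
		return (0);
	}
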
diff --git a/usr/src/uts/common/syscall/groups.c b/usr/src/uts/common/syscall/groups.c
new file mode 100644
index 0000000000..88e3777afd
--- /dev/null
+++ b/usr/src/uts/common/syscall/groups.c
@@ -0,0 +1,128 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
+ * Copyright 2001-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/cred_impl.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+
+int
+setgroups(int gidsetsize, gid_t *gidset)
+{
+ proc_t *p;
+ cred_t *cr, *newcr;
+ int i;
+ int n = gidsetsize;
+ gid_t *groups = NULL;
+ int error;
+
+ /* Perform the cheapest tests before grabbing p_crlock */
+ if (n > ngroups_max || n < 0)
+ return (set_errno(EINVAL));
+
+ if (n != 0) {
+ groups = kmem_alloc(n * sizeof (gid_t), KM_SLEEP);
+
+ if (copyin(gidset, groups, n * sizeof (gid_t)) != 0) {
+ kmem_free(groups, n * sizeof (gid_t));
+ return (set_errno(EFAULT));
+ }
+
+ for (i = 0; i < n; i++) {
+ if (groups[i] < 0 || groups[i] > MAXUID) {
+ kmem_free(groups, n * sizeof (gid_t));
+ return (set_errno(EINVAL));
+ }
+ }
+ }
+
+ /*
+ * Need to pre-allocate the new cred structure before acquiring
+ * the p_crlock mutex.
+ */
+ newcr = cralloc();
+ p = ttoproc(curthread);
+ mutex_enter(&p->p_crlock);
+ cr = p->p_cred;
+
+ if ((error = secpolicy_allow_setid(cr, -1, B_FALSE)) != 0) {
+ mutex_exit(&p->p_crlock);
+ if (groups != NULL)
+ kmem_free(groups, n * sizeof (gid_t));
+ crfree(newcr);
+ return (set_errno(error));
+ }
+
+ crdup_to(cr, newcr);
+
+ if (n != 0) {
+ bcopy(groups, newcr->cr_groups, n * sizeof (gid_t));
+ kmem_free(groups, n * sizeof (gid_t));
+ }
+
+ newcr->cr_ngroups = n;
+
+ p->p_cred = newcr;
+ crhold(newcr); /* hold for the current thread */
+ crfree(cr); /* free the old one */
+ mutex_exit(&p->p_crlock);
+
+ /*
+ * Broadcast new cred to process threads (including the current one).
+ */
+ crset(p, newcr);
+
+ return (0);
+}
+
+int
+getgroups(int gidsetsize, gid_t *gidset)
+{
+ struct cred *cr;
+ int n;
+
+ cr = curthread->t_cred;
+ n = (int)cr->cr_ngroups;
+
+ if (gidsetsize != 0) {
+ if (gidsetsize < n)
+ return (set_errno(EINVAL));
+ if (copyout(cr->cr_groups, gidset, n * sizeof (gid_t)))
+ return (set_errno(EFAULT));
+ }
+
+ return (n);
+}
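
The gidsetsize == 0 case above is a size query, which yields the portable two-call pattern for getgroups(2) at user level:

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int
	main(void)
	{
		int i, n = getgroups(0, NULL);	/* gidsetsize == 0: count only */
		gid_t *g;

		if (n <= 0)
			return (n < 0);
		if ((g = malloc(n * sizeof (gid_t))) == NULL)
			return (1);
		if ((n = getgroups(n, g)) < 0)
			return (1);
		for (i = 0; i < n; i++)
			(void) printf("%d\n", (int)g[i]);
		free(g);
		return (0);
	}
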
diff --git a/usr/src/uts/common/syscall/ioctl.c b/usr/src/uts/common/syscall/ioctl.c
new file mode 100644
index 0000000000..c4b514d4de
--- /dev/null
+++ b/usr/src/uts/common/syscall/ioctl.c
@@ -0,0 +1,169 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/ttold.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/kmem.h>
+#include <sys/filio.h>
+#include <sys/sunddi.h>
+#include <sys/debug.h>
+#include <sys/int_limits.h>
+#include <sys/model.h>
+
+/*
+ * I/O control.
+ */
+
+int
+ioctl(int fdes, int cmd, intptr_t arg)
+{
+ file_t *fp;
+ int error = 0;
+ vnode_t *vp;
+ struct vattr vattr;
+ int32_t flag;
+ int rv = 0;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ vp = fp->f_vnode;
+
+ if (vp->v_type == VREG || vp->v_type == VDIR) {
+ /*
+ * Handle these two ioctls for regular files and
+ * directories. All others will usually be failed
+ * with ENOTTY by the VFS-dependent code. System V
+ * always failed all ioctls on regular files, but SunOS
+ * supported these.
+ */
+ switch (cmd) {
+ case FIONREAD: {
+ /*
+ * offset is int32_t because that is what FIONREAD
+ * is defined in terms of. We cap at INT_MAX as in
+ * other cases for this ioctl.
+ */
+ int32_t offset;
+
+ vattr.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred);
+ if (error) {
+ releasef(fdes);
+ return (set_errno(error));
+ }
+ offset = MIN(vattr.va_size - fp->f_offset, INT_MAX);
+ if (copyout(&offset, (caddr_t)arg, sizeof (offset))) {
+ releasef(fdes);
+ return (set_errno(EFAULT));
+ }
+ releasef(fdes);
+ return (0);
+ }
+
+ case FIONBIO:
+ if (copyin((caddr_t)arg, &flag, sizeof (flag))) {
+ releasef(fdes);
+ return (set_errno(EFAULT));
+ }
+ mutex_enter(&fp->f_tlock);
+ if (flag)
+ fp->f_flag |= FNDELAY;
+ else
+ fp->f_flag &= ~FNDELAY;
+ mutex_exit(&fp->f_tlock);
+ releasef(fdes);
+ return (0);
+
+ default:
+ break;
+ }
+ }
+
+ /*
+ * ioctl() now passes in the model information in some high bits.
+ */
+ flag = fp->f_flag | get_udatamodel();
+ error = VOP_IOCTL(fp->f_vnode, cmd, arg, flag, CRED(), &rv);
+ if (error != 0) {
+ releasef(fdes);
+ return (set_errno(error));
+ }
+ switch (cmd) {
+ case FIONBIO:
+ if (copyin((caddr_t)arg, &flag, sizeof (flag))) {
+ releasef(fdes);
+ return (set_errno(EFAULT));
+ }
+ mutex_enter(&fp->f_tlock);
+ if (flag)
+ fp->f_flag |= FNDELAY;
+ else
+ fp->f_flag &= ~FNDELAY;
+ mutex_exit(&fp->f_tlock);
+ break;
+
+ default:
+ break;
+ }
+ releasef(fdes);
+ return (rv);
+}
+
+/*
+ * Old stty and gtty. (Still.)
+ */
+int
+stty(int fdes, intptr_t arg)
+{
+ return (ioctl(fdes, TIOCSETP, arg));
+}
+
+int
+gtty(int fdes, intptr_t arg)
+{
+ return (ioctl(fdes, TIOCGETP, arg));
+}
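
The FIONREAD case above reports file size minus current offset, capped at INT_MAX. A user-level sketch; on Solaris-derived systems FIONREAD lives in <sys/filio.h>, and the location of the ioctl(2) prototype varies by platform:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/filio.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd, pending;

		if ((fd = open("/etc/hosts", O_RDONLY)) == -1)
			return (1);
		/* bytes readable before EOF at the current file offset */
		if (ioctl(fd, FIONREAD, &pending) == -1) {
			perror("ioctl(FIONREAD)");
			return (1);
		}
		(void) printf("%d bytes pending\n", pending);
		return (close(fd) == -1);
	}
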
diff --git a/usr/src/uts/common/syscall/issetugid.c b/usr/src/uts/common/syscall/issetugid.c
new file mode 100644
index 0000000000..4c734a784a
--- /dev/null
+++ b/usr/src/uts/common/syscall/issetugid.c
@@ -0,0 +1,40 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2001 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/proc.h>
+
+/*
+ * System call returns true if the process was the result of exec'ing a set-uid
+ * or set-gid executable or was exec'ed with a mismatch between real and
+ * effective uids or gids; false in all other cases.
+ */
+int
+issetugid(void)
+{
+ return ((curproc->p_flag & SUGID) != 0);
+}
diff --git a/usr/src/uts/common/syscall/lgrpsys.c b/usr/src/uts/common/syscall/lgrpsys.c
new file mode 100644
index 0000000000..09b9818ad6
--- /dev/null
+++ b/usr/src/uts/common/syscall/lgrpsys.c
@@ -0,0 +1,2105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * lgroup system calls
+ */
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/sunddi.h>
+#include <sys/systm.h>
+#include <sys/mman.h>
+#include <sys/cpupart.h>
+#include <sys/lgrp.h>
+#include <sys/lgrp_user.h>
+#include <sys/promif.h> /* for prom_printf() */
+#include <sys/sysmacros.h>
+
+#include <vm/as.h>
+
+
+/* definitions for mi_validity */
+#define VALID_ADDR 1
+#define VALID_REQ 2
+
+/*
+ * run through the given number of addresses and requests and return the
+ * corresponding memory information for each address
+ */
+static int
+meminfo(int addr_count, struct meminfo *mip)
+{
+ size_t in_size, out_size, req_size, val_size;
+ struct as *as;
+ struct hat *hat;
+ int i, j, out_idx, info_count;
+ lgrp_t *lgrp;
+ pfn_t pfn;
+ ssize_t pgsz;
+ int *req_array, *val_array;
+ uint64_t *in_array, *out_array;
+ uint64_t addr, paddr;
+ uintptr_t vaddr;
+ int ret = 0;
+ struct meminfo minfo;
+#if defined(_SYSCALL32_IMPL)
+ struct meminfo32 minfo32;
+#endif
+
+ /*
+ * Make sure that there is at least one address to translate and
+ * limit how many virtual addresses the kernel can do per call
+ */
+ if (addr_count < 1)
+ return (set_errno(EINVAL));
+ else if (addr_count > MAX_MEMINFO_CNT)
+ addr_count = MAX_MEMINFO_CNT;
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(mip, &minfo, sizeof (struct meminfo)))
+ return (set_errno(EFAULT));
+ }
+#if defined(_SYSCALL32_IMPL)
+ else {
+ bzero(&minfo, sizeof (minfo));
+ if (copyin(mip, &minfo32, sizeof (struct meminfo32)))
+ return (set_errno(EFAULT));
+ minfo.mi_inaddr = (const uint64_t *)(uintptr_t)
+ minfo32.mi_inaddr;
+ minfo.mi_info_req = (const uint_t *)(uintptr_t)
+ minfo32.mi_info_req;
+ minfo.mi_info_count = minfo32.mi_info_count;
+ minfo.mi_outdata = (uint64_t *)(uintptr_t)
+ minfo32.mi_outdata;
+ minfo.mi_validity = (uint_t *)(uintptr_t)
+ minfo32.mi_validity;
+ }
+#endif
+ /*
+ * all the input parameters have been copied in:-
+ * addr_count - number of input addresses
+ * minfo.mi_inaddr - array of input addresses
+ * minfo.mi_info_req - array of types of information requested
+ * minfo.mi_info_count - no. of pieces of info requested for each addr
+ * minfo.mi_outdata - array into which the results are placed
+ * minfo.mi_validity - array containing bitwise result codes; 0th bit
+ * evaluates validity of corresponding input
+ * address, 1st bit validity of response to first
+ * member of info_req, etc.
+ */
+
+ /* make sure mi_info_count is within limit */
+ info_count = minfo.mi_info_count;
+ if (info_count < 1 || info_count > MAX_MEMINFO_REQ)
+ return (set_errno(EINVAL));
+
+ /*
+ * allocate buffer in_array for the input addresses and copy them in
+ */
+ in_size = sizeof (uint64_t) * addr_count;
+ in_array = kmem_alloc(in_size, KM_SLEEP);
+ if (copyin(minfo.mi_inaddr, in_array, in_size)) {
+ kmem_free(in_array, in_size);
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * allocate buffer req_array for the input info_reqs and copy them in
+ */
+ req_size = sizeof (uint_t) * info_count;
+ req_array = kmem_alloc(req_size, KM_SLEEP);
+ if (copyin(minfo.mi_info_req, req_array, req_size)) {
+ kmem_free(req_array, req_size);
+ kmem_free(in_array, in_size);
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * allocate buffer out_array which holds the results and will have
+ * to be copied out later
+ */
+ out_size = sizeof (uint64_t) * addr_count * info_count;
+ out_array = kmem_alloc(out_size, KM_SLEEP);
+
+ /*
+ * allocate buffer val_array which holds the validity bits and will
+ * have to be copied out later
+ */
+ val_size = sizeof (uint_t) * addr_count;
+ val_array = kmem_alloc(val_size, KM_SLEEP);
+
+ if ((req_array[0] & MEMINFO_MASK) == MEMINFO_PLGRP) {
+ /* find the corresponding lgroup for each physical address */
+ for (i = 0; i < addr_count; i++) {
+ paddr = in_array[i];
+ pfn = btop(paddr);
+ lgrp = lgrp_pfn_to_lgrp(pfn);
+ if (lgrp) {
+ out_array[i] = lgrp->lgrp_id;
+ val_array[i] = VALID_ADDR | VALID_REQ;
+ } else {
+				out_array[i] = 0;
+ val_array[i] = 0;
+ }
+ }
+ } else {
+ /* get the corresponding memory info for each virtual address */
+ as = curproc->p_as;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ hat = as->a_hat;
+ for (i = out_idx = 0; i < addr_count; i++, out_idx +=
+ info_count) {
+ addr = in_array[i];
+ vaddr = (uintptr_t)(addr & ~PAGEOFFSET);
+ if (!as_segat(as, (caddr_t)vaddr)) {
+ val_array[i] = 0;
+ continue;
+ }
+ val_array[i] = VALID_ADDR;
+ pfn = hat_getpfnum(hat, (caddr_t)vaddr);
+ if (pfn != PFN_INVALID) {
+ paddr = (uint64_t)((pfn << PAGESHIFT) |
+ (addr & PAGEOFFSET));
+ for (j = 0; j < info_count; j++) {
+ switch (req_array[j] & MEMINFO_MASK) {
+ case MEMINFO_VPHYSICAL:
+ /*
+ * return the physical address
+ * corresponding to the input
+ * virtual address
+ */
+ out_array[out_idx + j] = paddr;
+ val_array[i] |= VALID_REQ << j;
+ break;
+ case MEMINFO_VLGRP:
+ /*
+ * return the lgroup of physical
+ * page corresponding to the
+ * input virtual address
+ */
+ lgrp = lgrp_pfn_to_lgrp(pfn);
+ if (lgrp) {
+ out_array[out_idx + j] =
+ lgrp->lgrp_id;
+ val_array[i] |=
+ VALID_REQ << j;
+ }
+ break;
+ case MEMINFO_VPAGESIZE:
+ /*
+ * return the size of physical
+ * page corresponding to the
+ * input virtual address
+ */
+ pgsz = hat_getpagesize(hat,
+ (caddr_t)vaddr);
+ if (pgsz != -1) {
+ out_array[out_idx + j] =
+ pgsz;
+ val_array[i] |=
+ VALID_REQ << j;
+ }
+ break;
+ case MEMINFO_VREPLCNT:
+ /*
+ * for future use:-
+					 * return the number of
+					 * replicated physical pages
+					 * for the input virtual
+					 * address; always 0 at the
+					 * moment
+ */
+ out_array[out_idx + j] = 0;
+ val_array[i] |= VALID_REQ << j;
+ break;
+ case MEMINFO_VREPL:
+ /*
+ * for future use:-
+ * return the nth physical
+ * replica of the specified
+ * virtual address
+ */
+ break;
+ case MEMINFO_VREPL_LGRP:
+ /*
+ * for future use:-
+ * return the lgroup of nth
+ * physical replica of the
+ * specified virtual address
+ */
+ break;
+ case MEMINFO_PLGRP:
+ /*
+ * this is for physical address
+ * only, shouldn't mix with
+ * virtual address
+ */
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ }
+
+ /* copy out the results and validity bits and free the buffers */
+ if ((copyout(out_array, minfo.mi_outdata, out_size) != 0) ||
+ (copyout(val_array, minfo.mi_validity, val_size) != 0))
+ ret = set_errno(EFAULT);
+
+ kmem_free(in_array, in_size);
+ kmem_free(out_array, out_size);
+ kmem_free(req_array, req_size);
+ kmem_free(val_array, val_size);
+
+ return (ret);
+}
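
From user level this is reachable as meminfo(2). The validity word encodes VALID_ADDR in bit 0 and VALID_REQ << j in bit j + 1, matching the kernel code above. A hedged sketch, assuming the wrapper declared in <sys/mman.h>:

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/types.h>

	int
	main(void)
	{
		static int anchor;		/* any mapped address will do */
		uint64_t inaddr = (uint64_t)(uintptr_t)&anchor;
		uint_t req[2] = { MEMINFO_VPHYSICAL, MEMINFO_VPAGESIZE };
		uint64_t out[2];
		uint_t valid;

		if (meminfo(&inaddr, 1, req, 2, out, &valid) == -1)
			return (1);
		if (valid & 1)			/* bit 0: address translatable */
			(void) printf("mapped\n");
		if (valid & 2)			/* bit 1: request 0 satisfied */
			(void) printf("paddr 0x%llx\n", (unsigned long long)out[0]);
		if (valid & 4)			/* bit 2: request 1 satisfied */
			(void) printf("pagesize %llu\n", (unsigned long long)out[1]);
		return (0);
	}
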
+
+
+/*
+ * Initialize lgroup affinities for thread
+ */
+void
+lgrp_affinity_init(lgrp_affinity_t **bufaddr)
+{
+ if (bufaddr)
+ *bufaddr = NULL;
+}
+
+
+/*
+ * Free lgroup affinities for thread and set to NULL
+ * just in case thread gets recycled
+ */
+void
+lgrp_affinity_free(lgrp_affinity_t **bufaddr)
+{
+ if (bufaddr && *bufaddr) {
+ kmem_free(*bufaddr, nlgrpsmax * sizeof (lgrp_affinity_t));
+ *bufaddr = NULL;
+ }
+}
+
+
+#define P_ANY -2 /* cookie specifying any ID */
+
+
+/*
+ * Find LWP with given ID in specified process and get its affinity for
+ * specified lgroup
+ */
+lgrp_affinity_t
+lgrp_affinity_get_thread(proc_t *p, id_t lwpid, lgrp_id_t lgrp)
+{
+ lgrp_affinity_t aff;
+ int found;
+ kthread_t *t;
+
+ ASSERT(MUTEX_HELD(&p->p_lock));
+
+ aff = LGRP_AFF_NONE;
+ found = 0;
+ t = p->p_tlist;
+ /*
+	 * The process may be executing in proc_exit() and its p->p_tlist may
+	 * already be NULL.
+ */
+ if (t == NULL)
+ return (set_errno(ESRCH));
+
+ do {
+ if (t->t_tid == lwpid || lwpid == P_ANY) {
+ thread_lock(t);
+ /*
+ * Check to see whether caller has permission to set
+ * affinity for LWP
+ */
+ if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
+ thread_unlock(t);
+ return (set_errno(EPERM));
+ }
+
+ if (t->t_lgrp_affinity)
+ aff = t->t_lgrp_affinity[lgrp];
+ thread_unlock(t);
+ found = 1;
+ break;
+ }
+ } while ((t = t->t_forw) != p->p_tlist);
+ if (!found)
+ aff = set_errno(ESRCH);
+
+ return (aff);
+}
+
+
+/*
+ * Get lgroup affinity for given LWP
+ */
+lgrp_affinity_t
+lgrp_affinity_get(lgrp_affinity_args_t *ap)
+{
+ lgrp_affinity_t aff;
+ lgrp_affinity_args_t args;
+ id_t id;
+ idtype_t idtype;
+ lgrp_id_t lgrp;
+ proc_t *p;
+ kthread_t *t;
+
+ /*
+ * Copyin arguments
+ */
+ if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0)
+ return (set_errno(EFAULT));
+
+ id = args.id;
+ idtype = args.idtype;
+ lgrp = args.lgrp;
+
+ /*
+ * Check for invalid lgroup
+ */
+ if (lgrp < 0 || lgrp == LGRP_NONE)
+ return (set_errno(EINVAL));
+
+ /*
+ * Check for existing lgroup
+ */
+ if (lgrp > lgrp_alloc_max)
+ return (set_errno(ESRCH));
+
+ /*
+ * Get lgroup affinity for given LWP or process
+ */
+ switch (idtype) {
+
+ case P_LWPID:
+ /*
+ * LWP in current process
+ */
+ p = curproc;
+ mutex_enter(&p->p_lock);
+ if (id != P_MYID) /* different thread */
+ aff = lgrp_affinity_get_thread(p, id, lgrp);
+ else { /* current thread */
+ aff = LGRP_AFF_NONE;
+ t = curthread;
+ thread_lock(t);
+ if (t->t_lgrp_affinity)
+ aff = t->t_lgrp_affinity[lgrp];
+ thread_unlock(t);
+ }
+ mutex_exit(&p->p_lock);
+ break;
+
+ case P_PID:
+ /*
+ * Process
+ */
+ mutex_enter(&pidlock);
+
+ if (id == P_MYID)
+ p = curproc;
+ else {
+ p = prfind(id);
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ return (set_errno(ESRCH));
+ }
+ }
+
+ mutex_enter(&p->p_lock);
+ aff = lgrp_affinity_get_thread(p, P_ANY, lgrp);
+ mutex_exit(&p->p_lock);
+
+ mutex_exit(&pidlock);
+ break;
+
+ default:
+ aff = set_errno(EINVAL);
+ break;
+ }
+
+ return (aff);
+}
+
+
+/*
+ * Find lgroup for which this thread has most affinity in specified partition
+ */
+lpl_t *
+lgrp_affinity_best(kthread_t *t, struct cpupart *cpupart, lgrp_id_t start)
+{
+ lgrp_affinity_t *affs;
+ lgrp_affinity_t best_aff;
+ lpl_t *best_lpl;
+ lgrp_id_t home;
+ lgrp_id_t lgrpid;
+ lpl_t *lpl;
+
+ ASSERT(t != NULL);
+ ASSERT((MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0) ||
+ (MUTEX_HELD(&ttoproc(t)->p_lock) && THREAD_LOCK_HELD(t)));
+ ASSERT(cpupart != NULL);
+
+ if (t->t_lgrp_affinity == NULL)
+ return (NULL);
+
+ affs = t->t_lgrp_affinity;
+
+ /*
+ * Thread bound to CPU
+ */
+ if (t->t_bind_cpu != PBIND_NONE) {
+ cpu_t *cp;
+
+ /*
+ * See whether thread has more affinity for root lgroup
+ * than lgroup containing CPU
+ */
+ cp = cpu[t->t_bind_cpu];
+ lpl = cp->cpu_lpl;
+ lgrpid = LGRP_ROOTID;
+ if (affs[lgrpid] > affs[lpl->lpl_lgrpid])
+ return (&cpupart->cp_lgrploads[lgrpid]);
+ return (lpl);
+ }
+
+ /*
+ * Start searching at given lgroup
+ */
+ ASSERT(start >= 0 && start <= lgrp_alloc_max);
+ lgrpid = start;
+
+ /*
+	 * Begin with home as the best lgroup if it's the root or in this pset.
+	 * Otherwise, use the starting lgroup given above as the best first.
+ */
+ home = t->t_lpl->lpl_lgrpid;
+ if (LGRP_CPUS_IN_PART(home, cpupart))
+ best_lpl = &cpupart->cp_lgrploads[home];
+ else
+ best_lpl = &cpupart->cp_lgrploads[lgrpid];
+
+ best_aff = affs[best_lpl->lpl_lgrpid];
+
+ do {
+ /*
+ * Skip any lgroups that don't have CPU resources
+ * in this processor set.
+ */
+ if (!LGRP_CPUS_IN_PART(lgrpid, cpupart)) {
+ if (++lgrpid > lgrp_alloc_max)
+ lgrpid = 0; /* wrap the search */
+ continue;
+ }
+
+ /*
+ * Find lgroup with most affinity
+ */
+ lpl = &cpupart->cp_lgrploads[lgrpid];
+ if (affs[lgrpid] > best_aff) {
+ best_aff = affs[lgrpid];
+ best_lpl = lpl;
+ }
+
+ if (++lgrpid > lgrp_alloc_max)
+ lgrpid = 0; /* wrap the search */
+
+ } while (lgrpid != start);
+
+ /*
+ * No lgroup (in this pset) with any affinity
+ */
+ if (best_aff == LGRP_AFF_NONE)
+ return (NULL);
+
+ lgrpid = best_lpl->lpl_lgrpid;
+ ASSERT(LGRP_CPUS_IN_PART(lgrpid, cpupart) && best_lpl->lpl_ncpu > 0);
+
+ return (best_lpl);
+}
+
+
+/*
+ * Set thread's affinity for given lgroup
+ */
+int
+lgrp_affinity_set_thread(kthread_t *t, lgrp_id_t lgrp, lgrp_affinity_t aff,
+ lgrp_affinity_t **aff_buf)
+{
+ lpl_t *best_lpl;
+ lgrp_id_t home;
+ int retval;
+ lgrp_id_t start;
+
+ ASSERT(t != NULL);
+ ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
+
+ retval = 0;
+
+ thread_lock(t);
+
+ /*
+ * Check to see whether caller has permission to set affinity for
+ * thread
+ */
+ if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
+ thread_unlock(t);
+ return (set_errno(EPERM));
+ }
+
+ if (t->t_lgrp_affinity == NULL) {
+ if (aff == LGRP_AFF_NONE) {
+ thread_unlock(t);
+ return (0);
+ }
+ ASSERT(aff_buf != NULL && *aff_buf != NULL);
+ t->t_lgrp_affinity = *aff_buf;
+ *aff_buf = NULL;
+ }
+
+ t->t_lgrp_affinity[lgrp] = aff;
+
+ /*
+ * Select a new home if the thread's affinity is being cleared
+ */
+ if (aff == LGRP_AFF_NONE) {
+ lgrp_move_thread(t, lgrp_choose(t, t->t_cpupart), 1);
+ thread_unlock(t);
+ return (retval);
+ }
+
+ /*
+ * Find lgroup for which thread has most affinity,
+ * starting after home
+ */
+ home = t->t_lpl->lpl_lgrpid;
+ start = home + 1;
+ if (start > lgrp_alloc_max)
+ start = 0;
+
+ best_lpl = lgrp_affinity_best(t, t->t_cpupart, start);
+
+ /*
+ * Rehome if found lgroup with more affinity than home
+ */
+ if (best_lpl != NULL && best_lpl != t->t_lpl)
+ lgrp_move_thread(t, best_lpl, 1);
+
+ thread_unlock(t);
+
+ return (retval);
+}
+
+
+/*
+ * Set process' affinity for specified lgroup
+ */
+int
+lgrp_affinity_set_proc(proc_t *p, lgrp_id_t lgrp, lgrp_affinity_t aff,
+ lgrp_affinity_t **aff_buf_array)
+{
+ lgrp_affinity_t *buf;
+ int err = 0;
+ int i;
+ int retval;
+ kthread_t *t;
+
+ ASSERT(MUTEX_HELD(&pidlock) && MUTEX_HELD(&p->p_lock));
+ ASSERT(aff_buf_array != NULL);
+
+ i = 0;
+ t = p->p_tlist;
+ if (t != NULL) {
+ do {
+ /*
+ * Set lgroup affinity for thread
+ */
+ buf = aff_buf_array[i];
+ retval = lgrp_affinity_set_thread(t, lgrp, aff, &buf);
+
+ if (err == 0 && retval != 0)
+ err = retval;
+
+ /*
+ * Advance pointer to next buffer
+ */
+ if (buf == NULL) {
+ ASSERT(i < p->p_lwpcnt);
+ aff_buf_array[i] = NULL;
+ i++;
+ }
+
+ } while ((t = t->t_forw) != p->p_tlist);
+ }
+ return (err);
+}
+
+
+/*
+ * Set LWP's or process' affinity for specified lgroup
+ *
+ * When setting affinities, pidlock, process p_lock, and thread_lock()
+ * need to be held in that order to protect target thread's pset, process,
+ * process contents, and thread contents. thread_lock() does splhigh(),
+ * so it ends up having a similar effect to kpreempt_disable(), so it will
+ * protect calls to lgrp_move_thread() and lgrp_choose() from pset changes.
+ */
+int
+lgrp_affinity_set(lgrp_affinity_args_t *ap)
+{
+ lgrp_affinity_t aff;
+ lgrp_affinity_t *aff_buf;
+ lgrp_affinity_args_t args;
+ id_t id;
+ idtype_t idtype;
+ lgrp_id_t lgrp;
+ int nthreads;
+ proc_t *p;
+ int retval;
+
+ /*
+ * Copyin arguments
+ */
+ if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0)
+ return (set_errno(EFAULT));
+
+ idtype = args.idtype;
+ id = args.id;
+ lgrp = args.lgrp;
+ aff = args.aff;
+
+ /*
+ * Check for invalid lgroup
+ */
+ if (lgrp < 0 || lgrp == LGRP_NONE)
+ return (set_errno(EINVAL));
+
+ /*
+ * Check for existing lgroup
+ */
+ if (lgrp > lgrp_alloc_max)
+ return (set_errno(ESRCH));
+
+ /*
+ * Check for legal affinity
+ */
+ if (aff != LGRP_AFF_NONE && aff != LGRP_AFF_WEAK &&
+ aff != LGRP_AFF_STRONG)
+ return (set_errno(EINVAL));
+
+ /*
+ * Must be process or LWP ID
+ */
+ if (idtype != P_LWPID && idtype != P_PID)
+ return (set_errno(EINVAL));
+
+ /*
+ * Set given LWP's or process' affinity for specified lgroup
+ */
+ switch (idtype) {
+
+ case P_LWPID:
+ /*
+ * Allocate memory for thread's lgroup affinities
+ * ahead of time w/o holding locks
+ */
+ aff_buf = kmem_zalloc(nlgrpsmax * sizeof (lgrp_affinity_t),
+ KM_SLEEP);
+
+ p = curproc;
+
+ /*
+ * Set affinity for thread
+ */
+ mutex_enter(&p->p_lock);
+ if (id == P_MYID) { /* current thread */
+ retval = lgrp_affinity_set_thread(curthread, lgrp, aff,
+ &aff_buf);
+ } else if (p->p_tlist == NULL) {
+ retval = set_errno(ESRCH);
+ } else { /* other thread */
+ int found = 0;
+ kthread_t *t;
+
+ t = p->p_tlist;
+ do {
+ if (t->t_tid == id) {
+ retval = lgrp_affinity_set_thread(t,
+ lgrp, aff, &aff_buf);
+ found = 1;
+ break;
+ }
+ } while ((t = t->t_forw) != p->p_tlist);
+ if (!found)
+ retval = set_errno(ESRCH);
+ }
+ mutex_exit(&p->p_lock);
+
+ /*
+ * Free memory for lgroup affinities,
+ * since thread didn't need it
+ */
+ if (aff_buf)
+ kmem_free(aff_buf,
+ nlgrpsmax * sizeof (lgrp_affinity_t));
+
+ break;
+
+ case P_PID:
+
+ do {
+ lgrp_affinity_t **aff_buf_array;
+ int i;
+ size_t size;
+
+ /*
+ * Get process
+ */
+ mutex_enter(&pidlock);
+
+ if (id == P_MYID)
+ p = curproc;
+ else
+ p = prfind(id);
+
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ return (set_errno(ESRCH));
+ }
+
+ /*
+ * Get number of threads in process
+ *
+ * NOTE: Only care about user processes,
+ * so p_lwpcnt should be number of threads.
+ */
+ mutex_enter(&p->p_lock);
+ nthreads = p->p_lwpcnt;
+ mutex_exit(&p->p_lock);
+
+ mutex_exit(&pidlock);
+
+ if (nthreads < 1)
+ return (set_errno(ESRCH));
+
+ /*
+ * Preallocate memory for lgroup affinities for
+ * each thread in process now to avoid holding
+ * any locks. Allocate an array to hold a buffer
+ * for each thread.
+ */
+ aff_buf_array = kmem_zalloc(nthreads *
+ sizeof (lgrp_affinity_t *), KM_SLEEP);
+
+ size = nlgrpsmax * sizeof (lgrp_affinity_t);
+ for (i = 0; i < nthreads; i++)
+ aff_buf_array[i] = kmem_zalloc(size, KM_SLEEP);
+
+ mutex_enter(&pidlock);
+
+ /*
+ * Get process again since dropped locks to allocate
+ * memory (except current process)
+ */
+ if (id != P_MYID)
+ p = prfind(id);
+
+ /*
+ * Process went away after we dropped locks and before
+ * reacquiring them, so drop locks, free memory, and
+ * return.
+ */
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ for (i = 0; i < nthreads; i++)
+ kmem_free(aff_buf_array[i], size);
+ kmem_free(aff_buf_array,
+ nthreads * sizeof (lgrp_affinity_t *));
+ return (set_errno(ESRCH));
+ }
+
+ mutex_enter(&p->p_lock);
+
+ /*
+ * See whether number of threads is same
+ * If not, drop locks, free memory, and try again
+ */
+ if (nthreads != p->p_lwpcnt) {
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+ for (i = 0; i < nthreads; i++)
+ kmem_free(aff_buf_array[i], size);
+ kmem_free(aff_buf_array,
+ nthreads * sizeof (lgrp_affinity_t *));
+ continue;
+ }
+
+ /*
+ * Set lgroup affinity for threads in process
+ */
+ retval = lgrp_affinity_set_proc(p, lgrp, aff,
+ aff_buf_array);
+
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ /*
+ * Free any leftover memory, since some threads may
+ * have already allocated memory and set lgroup
+ * affinities before
+ */
+ for (i = 0; i < nthreads; i++)
+ if (aff_buf_array[i] != NULL)
+ kmem_free(aff_buf_array[i], size);
+ kmem_free(aff_buf_array,
+ nthreads * sizeof (lgrp_affinity_t *));
+
+ break;
+
+ } while (nthreads != p->p_lwpcnt);
+
+ break;
+
+ default:
+ retval = set_errno(EINVAL);
+ break;
+ }
+
+ return (retval);
+}
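
The P_PID arm above follows a standard shape: take a snapshot of the thread count, drop all locks to perform the sleeping allocations, then reacquire and retry from the top if the count changed. A stripped-down, hypothetical user-level analogue of that pattern (none of these names are kernel APIs):

	#include <pthread.h>
	#include <stdlib.h>

	typedef struct {
		pthread_mutex_t	lock;
		int		count;		/* may change while unlocked */
	} obj_t;

	/* Allocate a per-item buffer without holding the lock across malloc. */
	static int *
	alloc_for_items(obj_t *o, int *nout)
	{
		int n, *buf;

		for (;;) {
			(void) pthread_mutex_lock(&o->lock);
			n = o->count;			/* snapshot */
			(void) pthread_mutex_unlock(&o->lock);

			if ((buf = malloc(n * sizeof (int))) == NULL)
				return (NULL);

			(void) pthread_mutex_lock(&o->lock);
			if (n == o->count) {		/* snapshot still valid? */
				*nout = n;
				/* ... fill buf under the lock here ... */
				(void) pthread_mutex_unlock(&o->lock);
				return (buf);
			}
			(void) pthread_mutex_unlock(&o->lock);
			free(buf);			/* raced; go around again */
		}
	}
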
+
+
+/*
+ * Return the latest generation number for the lgroup hierarchy
+ * with the given view
+ */
+lgrp_gen_t
+lgrp_generation(lgrp_view_t view)
+{
+ cpupart_t *cpupart;
+ uint_t gen;
+
+ kpreempt_disable();
+
+ /*
+ * Determine generation number for given view
+ */
+ if (view == LGRP_VIEW_OS)
+ /*
+ * Return generation number of lgroup hierarchy for OS view
+ */
+ gen = lgrp_gen;
+ else {
+ /*
+ * For caller's view, use generation numbers for lgroup
+ * hierarchy and caller's pset
+ * NOTE: Caller needs to check for change in pset ID
+ */
+ cpupart = curthread->t_cpupart;
+ ASSERT(cpupart);
+ gen = lgrp_gen + cpupart->cp_gen;
+ }
+
+ kpreempt_enable();
+
+ return (gen);
+}
+
+
+lgrp_id_t
+lgrp_home_thread(kthread_t *t)
+{
+ lgrp_id_t home;
+
+ ASSERT(t != NULL);
+ ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
+
+ thread_lock(t);
+
+ /*
+ * Check to see whether caller has permission to get the home
+ * lgroup of the thread
+ */
+ if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
+ thread_unlock(t);
+ return (set_errno(EPERM));
+ }
+
+ home = lgrp_home_id(t);
+
+ thread_unlock(t);
+ return (home);
+}
+
+
+/*
+ * Get home lgroup of given process or thread
+ */
+lgrp_id_t
+lgrp_home_get(idtype_t idtype, id_t id)
+{
+ proc_t *p;
+ lgrp_id_t retval;
+ kthread_t *t;
+
+ /*
+ * Get home lgroup of given LWP or process
+ */
+ switch (idtype) {
+
+ case P_LWPID:
+ p = curproc;
+
+ /*
+ * Get home lgroup of thread
+ */
+ mutex_enter(&p->p_lock);
+ if (id == P_MYID) { /* current thread */
+ retval = lgrp_home_thread(curthread);
+ } else if (p->p_tlist == NULL) {
+ retval = set_errno(ESRCH);
+ } else { /* other thread */
+ int found = 0;
+
+ t = p->p_tlist;
+ do {
+ if (t->t_tid == id) {
+ retval = lgrp_home_thread(t);
+ found = 1;
+ break;
+ }
+ } while ((t = t->t_forw) != p->p_tlist);
+ if (!found)
+ retval = set_errno(ESRCH);
+ }
+ mutex_exit(&p->p_lock);
+ break;
+
+ case P_PID:
+ /*
+ * Get process
+ */
+ mutex_enter(&pidlock);
+
+ if (id == P_MYID)
+ p = curproc;
+ else
+ p = prfind(id);
+
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ return (set_errno(ESRCH));
+ }
+
+ mutex_enter(&p->p_lock);
+ t = p->p_tlist;
+ if (t == NULL)
+ retval = set_errno(ESRCH);
+ else
+ retval = lgrp_home_thread(t);
+ mutex_exit(&p->p_lock);
+
+ mutex_exit(&pidlock);
+
+ break;
+
+ default:
+ retval = set_errno(EINVAL);
+ break;
+ }
+
+ return (retval);
+}
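+
+/*
+ * Userland sketch (assumes liblgrp's lgrp_home(3LGRP) wrapper, which
+ * reaches this code via the LGRP_SYS_HOME subcode):
+ *
+ *	lgrp_id_t home = lgrp_home(P_LWPID, P_MYID);
+ *	if (home == -1)
+ *		perror("lgrp_home");
+ */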
+
+
+/*
+ * Return latency between "from" and "to" lgroups
+ *
+ * This latency number can only be used for relative comparison
+ * between lgroups on the running system, cannot be used across platforms,
+ * and may not reflect the actual latency. It is platform and implementation
+ * specific, so the platform gets to decide its value. It would be nice if
+ * the number were at least proportional to actual latency, to make
+ * comparisons more meaningful.
+ */
+int
+lgrp_latency(lgrp_id_t from, lgrp_id_t to)
+{
+ lgrp_t *from_lgrp;
+ int i;
+ int latency;
+ int latency_max;
+ lgrp_t *to_lgrp;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ if (from < 0 || to < 0)
+ return (set_errno(EINVAL));
+
+ if (from > lgrp_alloc_max || to > lgrp_alloc_max)
+ return (set_errno(ESRCH));
+
+ from_lgrp = lgrp_table[from];
+ to_lgrp = lgrp_table[to];
+
+ if (!LGRP_EXISTS(from_lgrp) || !LGRP_EXISTS(to_lgrp)) {
+ return (set_errno(ESRCH));
+ }
+
+ /*
+ * Get latency for same lgroup
+ */
+ if (from == to) {
+ latency = from_lgrp->lgrp_latency;
+ return (latency);
+ }
+
+ /*
+ * Get latency between leaf lgroups
+ */
+ if (from_lgrp->lgrp_childcnt == 0 && to_lgrp->lgrp_childcnt == 0)
+ return (lgrp_plat_latency(from_lgrp->lgrp_plathand,
+ to_lgrp->lgrp_plathand));
+
+ /*
+ * Determine max latency between resources in two lgroups
+ */
+ latency_max = 0;
+ for (i = 0; i <= lgrp_alloc_max; i++) {
+ lgrp_t *from_rsrc;
+ int j;
+ lgrp_t *to_rsrc;
+
+ from_rsrc = lgrp_table[i];
+ if (!LGRP_EXISTS(from_rsrc) ||
+ !klgrpset_ismember(from_lgrp->lgrp_set[LGRP_RSRC_CPU], i))
+ continue;
+
+ for (j = 0; j <= lgrp_alloc_max; j++) {
+ to_rsrc = lgrp_table[j];
+ if (!LGRP_EXISTS(to_rsrc) ||
+ klgrpset_ismember(to_lgrp->lgrp_set[LGRP_RSRC_MEM],
+ j) == 0)
+ continue;
+ latency = lgrp_plat_latency(from_rsrc->lgrp_plathand,
+ to_rsrc->lgrp_plathand);
+ if (latency > latency_max)
+ latency_max = latency;
+ }
+ }
+ return (latency_max);
+}
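+
+/*
+ * Userland sketch (assumes liblgrp's lgrp_latency(3LGRP) wrapper, which
+ * reaches this code via the LGRP_SYS_LATENCY subcode):
+ *
+ *	int local = lgrp_latency(node, node);
+ *	int remote = lgrp_latency(node, other);
+ *
+ * Only the remote-to-local ratio is meaningful, and only on the system
+ * where it was measured.
+ */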
+
+
+/*
+ * Return lgroup interface version number
+ * 0 - none
+ * 1 - original
+ * 2 - lgrp_latency_cookie() and lgrp_resources() added
+ */
+int
+lgrp_version(int version)
+{
+ /*
+ * Return LGRP_VER_NONE when requested version isn't supported
+ */
+ if (version < LGRP_VER_NONE || version > LGRP_VER_CURRENT)
+ return (LGRP_VER_NONE);
+
+ /*
+ * Return current version when LGRP_VER_NONE passed in
+ */
+ if (version == LGRP_VER_NONE)
+ return (LGRP_VER_CURRENT);
+
+ /*
+ * Otherwise, return supported version.
+ */
+ return (version);
+}
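+
+/*
+ * Sketch of the version handshake a caller might do (hypothetical
+ * fallback shown):
+ *
+ *	if (lgrp_version(LGRP_VER_CURRENT) != LGRP_VER_CURRENT)
+ *		... fall back to an older interface or fail ...
+ */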
+
+
+/*
+ * Snapshot of lgroup hierarchy
+ *
+ * One snapshot is kept and is based on the kernel's native data model, so
+ * a 32-bit snapshot is kept for the 32-bit kernel and a 64-bit one for the
+ * 64-bit kernel. If a 32-bit user wants a snapshot from the 64-bit kernel,
+ * the kernel generates a 32-bit snapshot from the data in its 64-bit snapshot.
+ *
+ * The format is defined by lgroup snapshot header and the layout of
+ * the snapshot in memory is as follows:
+ * 1) lgroup snapshot header
+ * - specifies format of snapshot
+ * - defined by lgrp_snapshot_header_t
+ * 2) lgroup info array
+ * - contains information about each lgroup
+ * - one element for each lgroup
+ * - each element is defined by lgrp_info_t
+ * 3) lgroup CPU ID array
+ * - contains list (array) of CPU IDs for each lgroup
+ * - lgrp_info_t points into array and specifies how many CPUs belong to
+ * given lgroup
+ * 4) lgroup parents array
+ * - contains lgroup bitmask of parents for each lgroup
+ * - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
+ * 5) lgroup children array
+ * - contains lgroup bitmask of children for each lgroup
+ * - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
+ * 6) lgroup resources array
+ * - contains lgroup bitmask of resources for each lgroup
+ * - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
+ * 7) lgroup latency table
+ * - contains latency from each lgroup to each of other lgroups
+ *
+ * NOTE: Must use nlgrpsmax for per lgroup data structures because lgroups
+ * may be sparsely allocated.
+ */
+lgrp_snapshot_header_t *lgrp_snap = NULL; /* lgroup snapshot */
+static kmutex_t lgrp_snap_lock; /* snapshot lock */
+
+
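+/*
+ * Worked example of the sizing arithmetic used below (hypothetical
+ * configuration): with nlgrpsmax = 4 and ncpus_online = 8 on a 64-bit
+ * kernel,
+ *
+ *	hdr	 = P2ROUNDUP(sizeof (lgrp_snapshot_header_t), 8)
+ *	info	 = P2ROUNDUP(4 * sizeof (lgrp_info_t), 4)
+ *	cpuids	 = P2ROUNDUP(8 * sizeof (processorid_t), 8)
+ *	bitmasks = ((2 + LGRP_RSRC_COUNT) * 4 + 1) * BT_SIZEOFMAP(4)
+ *	lats	 = 4 * sizeof (int *) + 4 * 4 * sizeof (int)
+ *
+ * and the snapshot buffer is the sum of these regions, laid out in the
+ * order listed in the comment above.
+ */
+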
+/*
+ * Take a snapshot of lgroup hierarchy and return size of buffer
+ * needed to hold snapshot
+ */
+static int
+lgrp_snapshot(void)
+{
+ size_t bitmask_size;
+ size_t bitmasks_size;
+ size_t bufsize;
+ int cpu_index;
+ size_t cpuids_size;
+ int i;
+ int j;
+ size_t info_size;
+ size_t lats_size;
+ ulong_t *lgrp_children;
+ processorid_t *lgrp_cpuids;
+ lgrp_info_t *lgrp_info;
+ int **lgrp_lats;
+ ulong_t *lgrp_parents;
+ ulong_t *lgrp_rsets;
+ ulong_t *lgrpset;
+ int snap_ncpus;
+ int snap_nlgrps;
+ int snap_nlgrpsmax;
+ size_t snap_hdr_size;
+#ifdef _SYSCALL32_IMPL
+ model_t model = DATAMODEL_NATIVE;
+
+ /*
+ * If we already have an up-to-date snapshot, check whether the caller
+ * is a 32-bit program and we need to return the 32-bit snapshot's size now.
+ */
+ model = get_udatamodel();
+ if (model == DATAMODEL_ILP32 && lgrp_snap &&
+ lgrp_snap->ss_gen == lgrp_gen) {
+
+ snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
+
+ /*
+ * Calculate size of buffer needed for 32-bit snapshot,
+ * rounding up size of each object to allow for alignment
+ * of next object in buffer.
+ */
+ snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
+ sizeof (caddr32_t));
+ info_size =
+ P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
+ sizeof (processorid_t));
+ cpuids_size =
+ P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t),
+ sizeof (ulong_t));
+
+ /*
+ * lgroup bitmasks needed for parents, children, and resources
+ * for each lgroup and pset lgroup set
+ */
+ bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
+ bitmasks_size = (((2 + LGRP_RSRC_COUNT) *
+ snap_nlgrpsmax) + 1) * bitmask_size;
+
+ /*
+ * Size of latency table and buffer
+ */
+ lats_size = snap_nlgrpsmax * sizeof (caddr32_t) +
+ snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int);
+
+ bufsize = snap_hdr_size + info_size + cpuids_size +
+ bitmasks_size + lats_size;
+ return (bufsize);
+ }
+#endif /* _SYSCALL32_IMPL */
+
+ /*
+ * Check whether snapshot is up-to-date
+ * Free it and take another one if not
+ */
+ if (lgrp_snap) {
+ if (lgrp_snap->ss_gen == lgrp_gen)
+ return (lgrp_snap->ss_size);
+
+ kmem_free(lgrp_snap, lgrp_snap->ss_size);
+ lgrp_snap = NULL;
+ }
+
+ /*
+ * Allocate memory for snapshot
+ * w/o holding cpu_lock while waiting for memory
+ */
+ while (lgrp_snap == NULL) {
+ int old_generation;
+
+ /*
+ * Take snapshot of lgroup generation number
+ * and configuration size dependent information
+ * NOTE: Only count number of online CPUs,
+ * since only online CPUs appear in lgroups.
+ */
+ mutex_enter(&cpu_lock);
+ old_generation = lgrp_gen;
+ snap_ncpus = ncpus_online;
+ snap_nlgrps = nlgrps;
+ snap_nlgrpsmax = nlgrpsmax;
+ mutex_exit(&cpu_lock);
+
+ /*
+ * Calculate size of buffer needed for snapshot,
+ * rounding up size of each object to allow for alignment
+ * of next object in buffer.
+ */
+ snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t),
+ sizeof (void *));
+ info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t),
+ sizeof (processorid_t));
+ cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
+ sizeof (ulong_t));
+ /*
+ * lgroup bitmasks needed for pset lgroup set and parents,
+ * children, and resource sets for each lgroup
+ */
+ bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
+ bitmasks_size = (((2 + LGRP_RSRC_COUNT) *
+ snap_nlgrpsmax) + 1) * bitmask_size;
+
+ /*
+ * Size of latency table and buffer
+ */
+ lats_size = snap_nlgrpsmax * sizeof (int *) +
+ snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int);
+
+ bufsize = snap_hdr_size + info_size + cpuids_size +
+ bitmasks_size + lats_size;
+
+ /*
+ * Allocate memory for buffer
+ */
+ lgrp_snap = kmem_zalloc(bufsize, KM_NOSLEEP);
+ if (lgrp_snap == NULL)
+ return (set_errno(ENOMEM));
+
+ /*
+ * Check whether generation number has changed
+ */
+ mutex_enter(&cpu_lock);
+ if (lgrp_gen == old_generation)
+ break; /* hasn't changed, so done. */
+
+ /*
+ * Generation number changed, so free memory and try again.
+ */
+ mutex_exit(&cpu_lock);
+ kmem_free(lgrp_snap, bufsize);
+ lgrp_snap = NULL;
+ }
+
+ /*
+ * Fill in lgroup snapshot header
+ * (including pointers to tables of lgroup info, CPU IDs, and parents
+ * and children)
+ */
+ lgrp_snap->ss_version = LGRP_VER_CURRENT;
+
+ /*
+ * XXX For now, liblgrp only needs to know whether the hierarchy
+ * XXX has only one level or not
+ */
+ if (snap_nlgrps == 1)
+ lgrp_snap->ss_levels = 1;
+ else
+ lgrp_snap->ss_levels = 2;
+
+ lgrp_snap->ss_root = LGRP_ROOTID;
+
+ lgrp_snap->ss_nlgrps = lgrp_snap->ss_nlgrps_os = snap_nlgrps;
+ lgrp_snap->ss_nlgrps_max = snap_nlgrpsmax;
+ lgrp_snap->ss_ncpus = snap_ncpus;
+ lgrp_snap->ss_gen = lgrp_gen;
+ lgrp_snap->ss_view = LGRP_VIEW_OS;
+ lgrp_snap->ss_pset = 0; /* NOTE: caller should set if needed */
+ lgrp_snap->ss_size = bufsize;
+ lgrp_snap->ss_magic = (uintptr_t)lgrp_snap;
+
+ lgrp_snap->ss_info = lgrp_info =
+ (lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size);
+
+ lgrp_snap->ss_cpuids = lgrp_cpuids =
+ (processorid_t *)((uintptr_t)lgrp_info + info_size);
+
+ lgrp_snap->ss_lgrpset = lgrpset =
+ (ulong_t *)((uintptr_t)lgrp_cpuids + cpuids_size);
+
+ lgrp_snap->ss_parents = lgrp_parents =
+ (ulong_t *)((uintptr_t)lgrpset + bitmask_size);
+
+ lgrp_snap->ss_children = lgrp_children =
+ (ulong_t *)((uintptr_t)lgrp_parents + (snap_nlgrpsmax *
+ bitmask_size));
+
+ lgrp_snap->ss_rsets = lgrp_rsets =
+ (ulong_t *)((uintptr_t)lgrp_children + (snap_nlgrpsmax *
+ bitmask_size));
+
+ lgrp_snap->ss_latencies = lgrp_lats =
+ (int **)((uintptr_t)lgrp_rsets + (LGRP_RSRC_COUNT *
+ snap_nlgrpsmax * bitmask_size));
+
+ /*
+ * Fill in lgroup information
+ */
+ cpu_index = 0;
+ for (i = 0; i < snap_nlgrpsmax; i++) {
+ struct cpu *cp;
+ int cpu_count;
+ struct cpu *head;
+ int k;
+ lgrp_t *lgrp;
+
+ lgrp = lgrp_table[i];
+ if (!LGRP_EXISTS(lgrp)) {
+ bzero(&lgrp_info[i], sizeof (lgrp_info[i]));
+ lgrp_info[i].info_lgrpid = LGRP_NONE;
+ continue;
+ }
+
+ lgrp_info[i].info_lgrpid = i;
+ lgrp_info[i].info_latency = lgrp->lgrp_latency;
+
+ /*
+ * Fill in parents, children, and lgroup resources
+ */
+ lgrp_info[i].info_parents =
+ (ulong_t *)((uintptr_t)lgrp_parents + (i * bitmask_size));
+
+ if (lgrp->lgrp_parent)
+ BT_SET(lgrp_info[i].info_parents,
+ lgrp->lgrp_parent->lgrp_id);
+
+ lgrp_info[i].info_children =
+ (ulong_t *)((uintptr_t)lgrp_children + (i * bitmask_size));
+
+ for (j = 0; j < snap_nlgrpsmax; j++)
+ if (klgrpset_ismember(lgrp->lgrp_children, j))
+ BT_SET(lgrp_info[i].info_children, j);
+
+ lgrp_info[i].info_rset =
+ (ulong_t *)((uintptr_t)lgrp_rsets +
+ (i * LGRP_RSRC_COUNT * bitmask_size));
+
+ for (j = 0; j < LGRP_RSRC_COUNT; j++) {
+ ulong_t *rset;
+
+ rset = (ulong_t *)((uintptr_t)lgrp_info[i].info_rset +
+ (j * bitmask_size));
+ for (k = 0; k < snap_nlgrpsmax; k++)
+ if (klgrpset_ismember(lgrp->lgrp_set[j], k))
+ BT_SET(rset, k);
+ }
+
+ /*
+ * Fill in CPU IDs
+ */
+ cpu_count = 0;
+ lgrp_info[i].info_cpuids = NULL;
+ cp = head = lgrp->lgrp_cpu;
+ if (head != NULL) {
+ lgrp_info[i].info_cpuids = &lgrp_cpuids[cpu_index];
+ do {
+ lgrp_cpuids[cpu_index] = cp->cpu_id;
+ cpu_index++;
+ cpu_count++;
+ cp = cp->cpu_next_lgrp;
+ } while (cp != head);
+ }
+ ASSERT(cpu_count == lgrp->lgrp_cpucnt);
+ lgrp_info[i].info_ncpus = cpu_count;
+
+ /*
+ * Fill in memory sizes for lgroups that directly contain
+ * memory
+ */
+ if (klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], i)) {
+ lgrp_info[i].info_mem_free =
+ lgrp_mem_size(i, LGRP_MEM_SIZE_FREE);
+ lgrp_info[i].info_mem_install =
+ lgrp_mem_size(i, LGRP_MEM_SIZE_INSTALL);
+ }
+
+ /*
+ * Fill in latency table and buffer
+ */
+ lgrp_lats[i] = (int *)((uintptr_t)lgrp_lats + snap_nlgrpsmax *
+ sizeof (int *) + i * snap_nlgrpsmax * sizeof (int));
+ for (j = 0; j < snap_nlgrpsmax; j++) {
+ lgrp_t *to;
+
+ to = lgrp_table[j];
+ if (!LGRP_EXISTS(to))
+ continue;
+ lgrp_lats[i][j] = lgrp_latency(lgrp->lgrp_id,
+ to->lgrp_id);
+ }
+ }
+ ASSERT(cpu_index == snap_ncpus);
+
+
+ mutex_exit(&cpu_lock);
+
+#ifdef _SYSCALL32_IMPL
+ /*
+ * Check to see whether the caller is a 32-bit program and we need to
+ * return the size of the 32-bit snapshot now that the snapshot has
+ * been taken/updated. We may not have been able to do this earlier
+ * if the snapshot was out of date or didn't exist yet.
+ */
+ if (model == DATAMODEL_ILP32) {
+
+ snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
+
+ /*
+ * Calculate size of buffer needed for 32-bit snapshot,
+ * rounding up size of each object to allow for alignment
+ * of next object in buffer.
+ */
+ snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
+ sizeof (caddr32_t));
+ info_size =
+ P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
+ sizeof (processorid_t));
+ cpuids_size =
+ P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t),
+ sizeof (ulong_t));
+
+ bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
+ bitmasks_size = (((2 + LGRP_RSRC_COUNT) * snap_nlgrpsmax) +
+ 1) * bitmask_size;
+
+
+ /*
+ * Size of latency table and buffer
+ */
+ lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) +
+ (snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int));
+
+ bufsize = snap_hdr_size + info_size + cpuids_size +
+ bitmasks_size + lats_size;
+ return (bufsize);
+ }
+#endif /* _SYSCALL32_IMPL */
+
+ return (lgrp_snap->ss_size);
+}
+
+
+/*
+ * Copy snapshot into given user buffer, fix up any pointers in buffer to point
+ * into user instead of kernel address space, and return size of buffer
+ * needed to hold snapshot
+ */
+static int
+lgrp_snapshot_copy(char *buf, size_t bufsize)
+{
+ size_t bitmask_size;
+ int cpu_index;
+ size_t cpuids_size;
+ int i;
+ size_t info_size;
+ lgrp_info_t *lgrp_info;
+ int retval;
+ size_t snap_hdr_size;
+ int snap_ncpus;
+ int snap_nlgrpsmax;
+ lgrp_snapshot_header_t *user_snap;
+ lgrp_info_t *user_info;
+ lgrp_info_t *user_info_buffer;
+ processorid_t *user_cpuids;
+ ulong_t *user_lgrpset;
+ ulong_t *user_parents;
+ ulong_t *user_children;
+ int **user_lats;
+ int **user_lats_buffer;
+ ulong_t *user_rsets;
+
+ if (lgrp_snap == NULL)
+ return (0);
+
+ if (buf == NULL || bufsize <= 0)
+ return (lgrp_snap->ss_size);
+
+ /*
+ * The caller needs to ask for the buffer size again
+ * because the given buffer is too small.
+ * The lgroup hierarchy may have changed after the caller asked for the size
+ * but before the snapshot was taken.
+ */
+ if (bufsize < lgrp_snap->ss_size)
+ return (set_errno(EAGAIN));
+
+ snap_ncpus = lgrp_snap->ss_ncpus;
+ snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
+
+ /*
+ * Fill in lgrpset now because caller may have changed psets
+ */
+ kpreempt_disable();
+ for (i = 0; i < snap_nlgrpsmax; i++) {
+ if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
+ i)) {
+ BT_SET(lgrp_snap->ss_lgrpset, i);
+ }
+ }
+ kpreempt_enable();
+
+ /*
+ * Copy lgroup snapshot (snapshot header, lgroup info, and CPU IDs)
+ * into user buffer all at once
+ */
+ if (copyout(lgrp_snap, buf, lgrp_snap->ss_size) != 0)
+ return (set_errno(EFAULT));
+
+ /*
+ * Round up sizes of lgroup snapshot header and info for alignment
+ */
+ snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t),
+ sizeof (void *));
+ info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t),
+ sizeof (processorid_t));
+ cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
+ sizeof (ulong_t));
+
+ bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
+
+ /*
+ * Calculate pointers into user buffer for lgroup snapshot header,
+ * info, and CPU IDs
+ */
+ user_snap = (lgrp_snapshot_header_t *)buf;
+ user_info = (lgrp_info_t *)((uintptr_t)user_snap + snap_hdr_size);
+ user_cpuids = (processorid_t *)((uintptr_t)user_info + info_size);
+ user_lgrpset = (ulong_t *)((uintptr_t)user_cpuids + cpuids_size);
+ user_parents = (ulong_t *)((uintptr_t)user_lgrpset + bitmask_size);
+ user_children = (ulong_t *)((uintptr_t)user_parents +
+ (snap_nlgrpsmax * bitmask_size));
+ user_rsets = (ulong_t *)((uintptr_t)user_children +
+ (snap_nlgrpsmax * bitmask_size));
+ user_lats = (int **)((uintptr_t)user_rsets +
+ (LGRP_RSRC_COUNT * snap_nlgrpsmax * bitmask_size));
+
+ /*
+ * Copyout magic number (i.e., pointer to beginning of buffer)
+ */
+ if (copyout(&buf, &user_snap->ss_magic, sizeof (buf)) != 0)
+ return (set_errno(EFAULT));
+
+ /*
+ * Fix up pointers in user buffer to point into user buffer
+ * not kernel snapshot
+ */
+ if (copyout(&user_info, &user_snap->ss_info, sizeof (user_info)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_cpuids, &user_snap->ss_cpuids,
+ sizeof (user_cpuids)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_lgrpset, &user_snap->ss_lgrpset,
+ sizeof (user_lgrpset)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_parents, &user_snap->ss_parents,
+ sizeof (user_parents)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_children, &user_snap->ss_children,
+ sizeof (user_children)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_rsets, &user_snap->ss_rsets,
+ sizeof (user_rsets)) != 0)
+ return (set_errno(EFAULT));
+
+ if (copyout(&user_lats, &user_snap->ss_latencies,
+ sizeof (user_lats)) != 0)
+ return (set_errno(EFAULT));
+
+ /*
+ * Make copies of lgroup info and latency table, fix up pointers,
+ * and then copy them into user buffer
+ */
+ user_info_buffer = kmem_zalloc(info_size, KM_NOSLEEP);
+ if (user_info_buffer == NULL)
+ return (set_errno(ENOMEM));
+
+ user_lats_buffer = kmem_zalloc(snap_nlgrpsmax * sizeof (int *),
+ KM_NOSLEEP);
+ if (user_lats_buffer == NULL) {
+ kmem_free(user_info_buffer, info_size);
+ return (set_errno(ENOMEM));
+ }
+
+ lgrp_info = (lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size);
+ bcopy(lgrp_info, user_info_buffer, info_size);
+
+ cpu_index = 0;
+ for (i = 0; i < snap_nlgrpsmax; i++) {
+ ulong_t *snap_rset;
+
+ /*
+ * Skip non-existent lgroups
+ */
+ if (user_info_buffer[i].info_lgrpid == LGRP_NONE)
+ continue;
+
+ /*
+ * Update free memory size since it changes frequently
+ * Only do so for lgroups directly containing memory
+ *
+ * NOTE: This must be done before changing the pointers to
+ * point into user space since we need to dereference
+ * lgroup resource set
+ */
+ snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM *
+ BT_BITOUL(snap_nlgrpsmax)];
+ if (BT_TEST(snap_rset, i))
+ user_info_buffer[i].info_mem_free =
+ lgrp_mem_size(i, LGRP_MEM_SIZE_FREE);
+
+ /*
+ * Fix up pointers to parents, children, resources, and
+ * latencies
+ */
+ user_info_buffer[i].info_parents =
+ (ulong_t *)((uintptr_t)user_parents + (i * bitmask_size));
+ user_info_buffer[i].info_children =
+ (ulong_t *)((uintptr_t)user_children + (i * bitmask_size));
+ user_info_buffer[i].info_rset =
+ (ulong_t *)((uintptr_t)user_rsets +
+ (i * LGRP_RSRC_COUNT * bitmask_size));
+ user_lats_buffer[i] = (int *)((uintptr_t)user_lats +
+ (snap_nlgrpsmax * sizeof (int *)) + (i * snap_nlgrpsmax *
+ sizeof (int)));
+
+ /*
+ * Fix up pointer to CPU IDs
+ */
+ if (user_info_buffer[i].info_ncpus == 0) {
+ user_info_buffer[i].info_cpuids = NULL;
+ continue;
+ }
+ user_info_buffer[i].info_cpuids = &user_cpuids[cpu_index];
+ cpu_index += user_info_buffer[i].info_ncpus;
+ }
+ ASSERT(cpu_index == snap_ncpus);
+
+ /*
+ * Now copy the lgroup info and latency table, with their pointers
+ * fixed up to point into the user buffer, out to the user buffer
+ */
+ retval = lgrp_snap->ss_size;
+ if (copyout(user_info_buffer, user_info, info_size) != 0)
+ retval = set_errno(EFAULT);
+ kmem_free(user_info_buffer, info_size);
+
+ if (copyout(user_lats_buffer, user_lats, snap_nlgrpsmax *
+ sizeof (int *)) != 0)
+ retval = set_errno(EFAULT);
+ kmem_free(user_lats_buffer, snap_nlgrpsmax * sizeof (int *));
+
+ return (retval);
+}
+
+
+#ifdef _SYSCALL32_IMPL
+/*
+ * Make 32-bit copy of snapshot, fix up any pointers in buffer to point
+ * into user instead of kernel address space, copy 32-bit snapshot into
+ * given user buffer, and return size of buffer needed to hold snapshot
+ */
+static int
+lgrp_snapshot_copy32(caddr32_t buf, size32_t bufsize)
+{
+ size32_t bitmask_size;
+ size32_t bitmasks_size;
+ size32_t children_size;
+ int cpu_index;
+ size32_t cpuids_size;
+ int i;
+ int j;
+ size32_t info_size;
+ size32_t lats_size;
+ lgrp_info_t *lgrp_info;
+ lgrp_snapshot_header32_t *lgrp_snap32;
+ lgrp_info32_t *lgrp_info32;
+ processorid_t *lgrp_cpuids32;
+ caddr32_t *lgrp_lats32;
+ int **lgrp_lats32_kernel;
+ uint_t *lgrp_set32;
+ uint_t *lgrp_parents32;
+ uint_t *lgrp_children32;
+ uint_t *lgrp_rsets32;
+ size32_t parents_size;
+ size32_t rsets_size;
+ size32_t set_size;
+ size32_t snap_hdr_size;
+ int snap_ncpus;
+ int snap_nlgrpsmax;
+ size32_t snap_size;
+
+ if (lgrp_snap == NULL)
+ return (0);
+
+ snap_ncpus = lgrp_snap->ss_ncpus;
+ snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;
+
+ /*
+ * Calculate size of buffer needed for 32-bit snapshot,
+ * rounding up size of each object to allow for alignment
+ * of next object in buffer.
+ */
+ snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
+ sizeof (caddr32_t));
+ info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
+ sizeof (processorid_t));
+ cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
+ sizeof (ulong_t));
+
+ bitmask_size = BT_SIZEOFMAP32(snap_nlgrpsmax);
+
+ set_size = bitmask_size;
+ parents_size = snap_nlgrpsmax * bitmask_size;
+ children_size = snap_nlgrpsmax * bitmask_size;
+ rsets_size = P2ROUNDUP(LGRP_RSRC_COUNT * snap_nlgrpsmax *
+ (int)bitmask_size, sizeof (caddr32_t));
+
+ bitmasks_size = set_size + parents_size + children_size + rsets_size;
+
+ /*
+ * Size of latency table and buffer
+ */
+ lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) +
+ (snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int));
+
+ snap_size = snap_hdr_size + info_size + cpuids_size + bitmasks_size +
+ lats_size;
+
+ if (buf == NULL || bufsize <= 0) {
+ return (snap_size);
+ }
+
+ /*
+ * The caller needs to ask for the buffer size again
+ * because the given buffer is too small.
+ * The lgroup hierarchy may have changed after the caller asked for the size
+ * but before the snapshot was taken.
+ */
+ if (bufsize < snap_size)
+ return (set_errno(EAGAIN));
+
+ /*
+ * Make 32-bit copy of snapshot, fix up pointers to point into user
+ * buffer not kernel, and then copy whole thing into user buffer
+ */
+ lgrp_snap32 = kmem_zalloc(snap_size, KM_NOSLEEP);
+ if (lgrp_snap32 == NULL)
+ return (set_errno(ENOMEM));
+
+ /*
+ * Calculate pointers into 32-bit copy of snapshot
+ * for lgroup info, CPU IDs, pset lgroup bitmask, parents, children,
+ * resources, and latency table and buffer
+ */
+ lgrp_info32 = (lgrp_info32_t *)((uintptr_t)lgrp_snap32 +
+ snap_hdr_size);
+ lgrp_cpuids32 = (processorid_t *)((uintptr_t)lgrp_info32 + info_size);
+ lgrp_set32 = (uint_t *)((uintptr_t)lgrp_cpuids32 + cpuids_size);
+ lgrp_parents32 = (uint_t *)((uintptr_t)lgrp_set32 + set_size);
+ lgrp_children32 = (uint_t *)((uintptr_t)lgrp_parents32 + parents_size);
+ lgrp_rsets32 = (uint_t *)((uintptr_t)lgrp_children32 + children_size);
+ lgrp_lats32 = (caddr32_t *)((uintptr_t)lgrp_rsets32 + rsets_size);
+
+ /*
+ * Make a temporary table of latency row pointers for the kernel to
+ * use to fill in each lgroup's row of the latency table
+ */
+ lgrp_lats32_kernel = kmem_zalloc(snap_nlgrpsmax * sizeof (int *),
+ KM_NOSLEEP);
+ if (lgrp_lats32_kernel == NULL) {
+ kmem_free(lgrp_snap32, snap_size);
+ return (set_errno(ENOMEM));
+ }
+
+ /*
+ * Fill in 32-bit lgroup snapshot header
+ * (with pointers into user's buffer for lgroup info, CPU IDs,
+ * bit masks, and latencies)
+ */
+ lgrp_snap32->ss_version = lgrp_snap->ss_version;
+ lgrp_snap32->ss_levels = lgrp_snap->ss_levels;
+ lgrp_snap32->ss_nlgrps = lgrp_snap32->ss_nlgrps_os =
+ lgrp_snap->ss_nlgrps;
+ lgrp_snap32->ss_nlgrps_max = snap_nlgrpsmax;
+ lgrp_snap32->ss_root = lgrp_snap->ss_root;
+ lgrp_snap32->ss_ncpus = lgrp_snap->ss_ncpus;
+ lgrp_snap32->ss_gen = lgrp_snap->ss_gen;
+ lgrp_snap32->ss_view = LGRP_VIEW_OS;
+ lgrp_snap32->ss_size = snap_size;
+ lgrp_snap32->ss_magic = buf;
+ lgrp_snap32->ss_info = buf + snap_hdr_size;
+ lgrp_snap32->ss_cpuids = lgrp_snap32->ss_info + info_size;
+ lgrp_snap32->ss_lgrpset = lgrp_snap32->ss_cpuids + cpuids_size;
+ lgrp_snap32->ss_parents = lgrp_snap32->ss_lgrpset + bitmask_size;
+ lgrp_snap32->ss_children = lgrp_snap32->ss_parents +
+ (snap_nlgrpsmax * bitmask_size);
+ lgrp_snap32->ss_rsets = lgrp_snap32->ss_children +
+ (snap_nlgrpsmax * bitmask_size);
+ lgrp_snap32->ss_latencies = lgrp_snap32->ss_rsets +
+ (LGRP_RSRC_COUNT * snap_nlgrpsmax * bitmask_size);
+
+ /*
+ * Fill in lgrpset now because caller may have changed psets
+ */
+ kpreempt_disable();
+ for (i = 0; i < snap_nlgrpsmax; i++) {
+ if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
+ i)) {
+ BT_SET32(lgrp_set32, i);
+ }
+ }
+ kpreempt_enable();
+
+ /*
+ * Fill in 32-bit copy of lgroup info and fix up pointers
+ * to point into user's buffer instead of kernel's
+ */
+ cpu_index = 0;
+ lgrp_info = lgrp_snap->ss_info;
+ for (i = 0; i < snap_nlgrpsmax; i++) {
+ uint_t *children;
+ uint_t *lgrp_rset;
+ uint_t *parents;
+ ulong_t *snap_rset;
+
+ /*
+ * Skip non-existent lgroups
+ */
+ if (lgrp_info[i].info_lgrpid == LGRP_NONE) {
+ bzero(&lgrp_info32[i], sizeof (lgrp_info32[i]));
+ lgrp_info32[i].info_lgrpid = LGRP_NONE;
+ continue;
+ }
+
+ /*
+ * Fill in parents, children, lgroup resource set, and
+ * latencies from snapshot
+ */
+ parents = (uint_t *)((uintptr_t)lgrp_parents32 +
+ i * bitmask_size);
+ children = (uint_t *)((uintptr_t)lgrp_children32 +
+ i * bitmask_size);
+ snap_rset = (ulong_t *)((uintptr_t)lgrp_snap->ss_rsets +
+ (i * LGRP_RSRC_COUNT * BT_SIZEOFMAP(snap_nlgrpsmax)));
+ lgrp_rset = (uint_t *)((uintptr_t)lgrp_rsets32 +
+ (i * LGRP_RSRC_COUNT * bitmask_size));
+ lgrp_lats32_kernel[i] = (int *)((uintptr_t)lgrp_lats32 +
+ snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
+ sizeof (int));
+ for (j = 0; j < snap_nlgrpsmax; j++) {
+ int k;
+ uint_t *rset;
+
+ if (BT_TEST(&lgrp_snap->ss_parents[i], j))
+ BT_SET32(parents, j);
+
+ if (BT_TEST(&lgrp_snap->ss_children[i], j))
+ BT_SET32(children, j);
+
+ for (k = 0; k < LGRP_RSRC_COUNT; k++) {
+ rset = (uint_t *)((uintptr_t)lgrp_rset +
+ k * bitmask_size);
+ if (BT_TEST(&snap_rset[k], j))
+ BT_SET32(rset, j);
+ }
+
+ lgrp_lats32_kernel[i][j] =
+ lgrp_snap->ss_latencies[i][j];
+ }
+
+ /*
+ * Fix up pointer to latency buffer
+ */
+ lgrp_lats32[i] = lgrp_snap32->ss_latencies +
+ snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
+ sizeof (int);
+
+ /*
+ * Fix up pointers for parents, children, and resources
+ */
+ lgrp_info32[i].info_parents = lgrp_snap32->ss_parents +
+ (i * bitmask_size);
+ lgrp_info32[i].info_children = lgrp_snap32->ss_children +
+ (i * bitmask_size);
+ lgrp_info32[i].info_rset = lgrp_snap32->ss_rsets +
+ (i * LGRP_RSRC_COUNT * bitmask_size);
+
+ /*
+ * Fill in memory and CPU info
+ * Only fill in memory for lgroups directly containing memory
+ */
+ snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM *
+ BT_BITOUL(snap_nlgrpsmax)];
+ if (BT_TEST(snap_rset, i)) {
+ lgrp_info32[i].info_mem_free = lgrp_mem_size(i,
+ LGRP_MEM_SIZE_FREE);
+ lgrp_info32[i].info_mem_install =
+ lgrp_info[i].info_mem_install;
+ }
+
+ lgrp_info32[i].info_ncpus = lgrp_info[i].info_ncpus;
+
+ lgrp_info32[i].info_lgrpid = lgrp_info[i].info_lgrpid;
+ lgrp_info32[i].info_latency = lgrp_info[i].info_latency;
+
+ if (lgrp_info32[i].info_ncpus == 0) {
+ lgrp_info32[i].info_cpuids = 0;
+ continue;
+ }
+
+ /*
+ * Fix up pointer for CPU IDs
+ */
+ lgrp_info32[i].info_cpuids = lgrp_snap32->ss_cpuids +
+ (cpu_index * sizeof (processorid_t));
+ cpu_index += lgrp_info32[i].info_ncpus;
+ }
+ ASSERT(cpu_index == snap_ncpus);
+
+ /*
+ * Copy lgroup CPU IDs into 32-bit snapshot
+ * before copying it out into user's buffer
+ */
+ bcopy(lgrp_snap->ss_cpuids, lgrp_cpuids32, cpuids_size);
+
+ /*
+ * Copy 32-bit lgroup snapshot into user's buffer all at once
+ */
+ if (copyout(lgrp_snap32, (void *)(uintptr_t)buf, snap_size) != 0) {
+ kmem_free(lgrp_snap32, snap_size);
+ kmem_free(lgrp_lats32_kernel, snap_nlgrpsmax * sizeof (int *));
+ return (set_errno(EFAULT));
+ }
+
+ kmem_free(lgrp_snap32, snap_size);
+ kmem_free(lgrp_lats32_kernel, snap_nlgrpsmax * sizeof (int *));
+
+ return (snap_size);
+}
+#endif /* _SYSCALL32_IMPL */
+
+
+int
+lgrpsys(int subcode, long ia, void *ap)
+{
+ size_t bufsize;
+ int latency;
+
+ switch (subcode) {
+
+ case LGRP_SYS_AFFINITY_GET:
+ return (lgrp_affinity_get((lgrp_affinity_args_t *)ap));
+
+ case LGRP_SYS_AFFINITY_SET:
+ return (lgrp_affinity_set((lgrp_affinity_args_t *)ap));
+
+ case LGRP_SYS_GENERATION:
+ return (lgrp_generation(ia));
+
+ case LGRP_SYS_HOME:
+ return (lgrp_home_get((idtype_t)ia, (id_t)(uintptr_t)ap));
+
+ case LGRP_SYS_LATENCY:
+ mutex_enter(&cpu_lock);
+ latency = lgrp_latency(ia, (lgrp_id_t)(uintptr_t)ap);
+ mutex_exit(&cpu_lock);
+ return (latency);
+
+ case LGRP_SYS_MEMINFO:
+ return (meminfo(ia, (struct meminfo *)ap));
+
+ case LGRP_SYS_VERSION:
+ return (lgrp_version(ia));
+
+ case LGRP_SYS_SNAPSHOT:
+ mutex_enter(&lgrp_snap_lock);
+ bufsize = lgrp_snapshot();
+ if (ap && ia > 0) {
+ if (get_udatamodel() == DATAMODEL_NATIVE)
+ bufsize = lgrp_snapshot_copy(ap, ia);
+#ifdef _SYSCALL32_IMPL
+ else
+ bufsize = lgrp_snapshot_copy32(
+ (caddr32_t)(uintptr_t)ap, ia);
+#endif /* _SYSCALL32_IMPL */
+ }
+ mutex_exit(&lgrp_snap_lock);
+ return (bufsize);
+
+ default:
+ break;
+
+ }
+
+ return (set_errno(EINVAL));
+}
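+
+/*
+ * End-to-end userland sketch (assumes the public liblgrp(3LIB) wrappers,
+ * which enter the kernel through the subcodes dispatched above; lgrp_init()
+ * obtains a snapshot via LGRP_SYS_SNAPSHOT and lgrp_fini() frees the
+ * caller's copy):
+ *
+ *	lgrp_cookie_t c = lgrp_init(LGRP_VIEW_OS);
+ *	int nlgrps = lgrp_nlgrps(c);
+ *	lgrp_id_t root = lgrp_root(c);
+ *	...
+ *	lgrp_fini(c);
+ */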
diff --git a/usr/src/uts/common/syscall/link.c b/usr/src/uts/common/syscall/link.c
new file mode 100644
index 0000000000..a63b04f133
--- /dev/null
+++ b/usr/src/uts/common/syscall/link.c
@@ -0,0 +1,58 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1989 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/debug.h>
+
+/*
+ * Make a hard link.
+ */
+int
+link(char *from, char *to)
+{
+ int error;
+
+ if (error = vn_link(from, to, UIO_USERSPACE))
+ return (set_errno(error));
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/lseek.c b/usr/src/uts/common/syscall/lseek.c
new file mode 100644
index 0000000000..d03687eb68
--- /dev/null
+++ b/usr/src/uts/common/syscall/lseek.c
@@ -0,0 +1,380 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/filio.h>
+
+/*
+ * These are defined in unistd.h - but we can't include that
+ */
+#define SEEK_SET 0 /* Set file pointer to "offset" */
+#define SEEK_CUR 1 /* Set file pointer to current plus "offset" */
+#define SEEK_END 2 /* Set file pointer to EOF plus "offset" */
+#define SEEK_DATA 3 /* Set file pointer to next data past offset */
+#define SEEK_HOLE 4 /* Set file pointer to next hole past offset */
+
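+/*
+ * Userland sketch of the SEEK_DATA/SEEK_HOLE semantics implemented below
+ * (hypothetical fd; filesystems lacking the _FIO_SEEK_* ioctls fall back
+ * to treating the whole file as data with one virtual hole at the end):
+ *
+ *	off_t off = 0, hole;
+ *	while ((off = lseek(fd, off, SEEK_DATA)) != -1) {
+ *		hole = lseek(fd, off, SEEK_HOLE);
+ *		... data occupies [off, hole) ...
+ *		off = hole;
+ *	}
+ *	errno == ENXIO means no more data past the given offset.
+ */
+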
+/*
+ * Seek on a file
+ */
+
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+/*
+ * Workhorse for the 32-bit seek variants: lseek32 and llseek32
+ *
+ * 'max' represents the maximum possible representation of offset
+ * in the data type corresponding to lseek and llseek. It is
+ * MAXOFF32_T for off32_t and MAXOFFSET_T for off64_t.
+ * We return EOVERFLOW if we cannot represent the resulting offset
+ * in the data type.
+ * We support seeking character devices beyond MAXOFF32_T
+ * with lseek. To maintain compatibility in such cases, lseek passes
+ * the arguments carefully to lseek32_common when the file is not regular.
+ * (/dev/kmem is a good example of a > 2Gbyte seek!)
+ */
+static int
+lseek32_common(file_t *fp, int stype, offset_t off, offset_t max,
+ offset_t *retoff)
+{
+ vnode_t *vp;
+ struct vattr vattr;
+ int error;
+ u_offset_t noff;
+ offset_t curoff, newoff;
+ int reg;
+
+ vp = fp->f_vnode;
+ reg = (vp->v_type == VREG);
+
+ curoff = fp->f_offset;
+
+ switch (stype) {
+ case SEEK_SET:
+ noff = (u_offset_t)off;
+ if (reg && noff > max) {
+ error = EINVAL;
+ goto out;
+ }
+ break;
+
+ case SEEK_CUR:
+ if (reg && off > (max - curoff)) {
+ error = EOVERFLOW;
+ goto out;
+ }
+ noff = (u_offset_t)(off + curoff);
+ if (reg && noff > max) {
+ error = EINVAL;
+ goto out;
+ }
+ break;
+
+ case SEEK_END:
+ vattr.va_mask = AT_SIZE;
+ if (error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred)) {
+ goto out;
+ }
+ if (reg && (off > (max - (offset_t)vattr.va_size))) {
+ error = EOVERFLOW;
+ goto out;
+ }
+ noff = (u_offset_t)(off + (offset_t)vattr.va_size);
+ if (reg && noff > max) {
+ error = EINVAL;
+ goto out;
+ }
+ break;
+
+ case SEEK_DATA:
+ /*
+ * Get and set the file pointer to the offset of the next
+ * data past "off"
+ */
+ noff = (u_offset_t)off;
+ error = VOP_IOCTL(vp, _FIO_SEEK_DATA, (intptr_t)(&noff),
+ FKIOCTL, kcred, NULL);
+ if (error) {
+ if (error != ENOTTY)
+ return (error);
+ /*
+ * The ioctl is not supported; check that the supplied
+ * "off" is not past the end of the file
+ */
+ vattr.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred);
+ if (error)
+ return (error);
+ if (noff >= (u_offset_t)vattr.va_size)
+ return (ENXIO);
+ }
+ if (reg && (noff > max))
+ return (EOVERFLOW);
+
+ fp->f_offset = (offset_t)noff;
+ (*retoff) = (offset_t)noff;
+ return (0);
+
+ case SEEK_HOLE:
+ /*
+ * Get and set the file pointer to the offset of the next
+ * hole past "off"
+ */
+ noff = (u_offset_t)off;
+ error = VOP_IOCTL(vp, _FIO_SEEK_HOLE, (intptr_t)(&noff),
+ FKIOCTL, kcred, NULL);
+ if (error) {
+ if (error != ENOTTY)
+ return (error);
+ /*
+ * The ioctl is not supported; if the offset is valid, return
+ * the "virtual hole" at the end of the file.
+ */
+ vattr.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred);
+ if (error)
+ return (error);
+ if (off < (offset_t)vattr.va_size)
+ noff = (u_offset_t)vattr.va_size;
+ else
+ return (ENXIO);
+ }
+ if (reg && (noff > max))
+ return (EOVERFLOW);
+
+ fp->f_offset = (offset_t)noff;
+ (*retoff) = (offset_t)noff;
+ return (0);
+
+ default:
+ error = EINVAL;
+ goto out;
+ }
+
+ ASSERT((reg && noff <= max) || !reg);
+ newoff = (offset_t)noff;
+ if ((error = VOP_SEEK(vp, curoff, &newoff)) == 0) {
+ fp->f_offset = newoff;
+ (*retoff) = newoff;
+ return (0);
+ }
+out:
+ return (error);
+}
+
+off32_t
+lseek32(int32_t fdes, off32_t off, int32_t stype)
+{
+ file_t *fp;
+ int error;
+ offset_t retoff;
+
+ if ((fp = getf(fdes)) == NULL)
+ return ((off32_t)set_errno(EBADF));
+
+ /*
+ * lseek32 returns EOVERFLOW if we cannot represent the resulting
+ * offset from seek in a 32-bit off_t.
+ * The following code is sensitive to sign extension and
+ * arithmetic; if you ever change this, make sure it still works
+ * for special files.
+ *
+ * When the vnode is not VREG and stype is SEEK_SET, we pass the
+ * unsigned value to lseek32_common rather than the sign-extended
+ * value. (The maximum representable value is not checked by
+ * lseek32_common for special files.)
+ */
+ if (fp->f_vnode->v_type == VREG || stype != SEEK_SET)
+ error = lseek32_common(fp, stype, (offset_t)off,
+ (offset_t)MAXOFF32_T, &retoff);
+ else if (stype == SEEK_SET)
+ error = lseek32_common(fp, stype, (offset_t)(uint_t)off,
+ (offset_t)(uint_t)UINT_MAX, &retoff);
+
+ releasef(fdes);
+ if (!error)
+ return ((off32_t)retoff);
+ return ((off32_t)set_errno(error));
+}
+
+/*
+ * 64-bit seeks from 32-bit applications
+ */
+offset_t
+llseek32(int32_t fdes, uint32_t off1, uint32_t off2, int stype)
+{
+ file_t *fp;
+ int error;
+ offset_t retoff;
+#if defined(_LITTLE_ENDIAN)
+ offset_t off = ((u_offset_t)off2 << 32) | (u_offset_t)off1;
+#else
+ offset_t off = ((u_offset_t)off1 << 32) | (u_offset_t)off2;
+#endif
+
+ if ((fp = getf(fdes)) == NULL)
+ error = EBADF;
+ else {
+ error = lseek32_common(fp, stype, off, MAXOFFSET_T, &retoff);
+ releasef(fdes);
+ }
+
+ return (error ? (offset_t)set_errno(error) : retoff);
+}
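+
+/*
+ * Worked example of the offset assembly above (hypothetical value): a
+ * little-endian 32-bit caller seeking to 0x123456789 passes
+ * off1 = 0x23456789 (low word) and off2 = 0x1 (high word), so
+ *
+ *	off = ((u_offset_t)0x1 << 32) | 0x23456789 = 0x123456789
+ *
+ * On big-endian machines the two halves arrive swapped, hence the
+ * conditional above.
+ */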
+#endif /* _SYSCALL32_IMPL || _ILP32 */
+
+#ifdef _LP64
+/*
+ * Seek on a file.
+ *
+ * Life is almost simple again (at least until we do 128-bit files ;-)
+ * This is both 'lseek' and 'llseek' to a 64-bit application.
+ */
+off_t
+lseek64(int fdes, off_t off, int stype)
+{
+ file_t *fp;
+ vnode_t *vp;
+ struct vattr vattr;
+ int error;
+ off_t old_off;
+ offset_t new_off;
+
+ if ((fp = getf(fdes)) == NULL)
+ return ((off_t)set_errno(EBADF));
+
+ vp = fp->f_vnode;
+ new_off = off;
+
+ switch (stype) {
+ case SEEK_CUR:
+ new_off += fp->f_offset;
+ break;
+
+ case SEEK_END:
+ vattr.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred)) != 0)
+ goto lseek64error;
+ new_off += vattr.va_size;
+ break;
+
+ case SEEK_SET:
+ break;
+
+ case SEEK_DATA:
+ /*
+ * Get and set the file pointer to the offset of the next
+ * data past "off"
+ */
+ new_off = (offset_t)off;
+ error = VOP_IOCTL(vp, _FIO_SEEK_DATA, (intptr_t)(&new_off),
+ FKIOCTL, kcred, NULL);
+ if (error) {
+ if (error != ENOTTY) {
+ goto lseek64error;
+ }
+ /*
+ * The ioctl is not supported; check that the supplied off
+ * is not past the end of the file
+ */
+ vattr.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred);
+ if (error)
+ goto lseek64error;
+ if (new_off >= (offset_t)vattr.va_size) {
+ error = ENXIO;
+ goto lseek64error;
+ }
+ }
+ fp->f_offset = new_off;
+ releasef(fdes);
+ return (new_off);
+
+ case SEEK_HOLE:
+ /*
+ * Get and set the file pointer to the offset of the next
+ * hole past "off"
+ */
+ new_off = off;
+ error = VOP_IOCTL(vp, _FIO_SEEK_HOLE, (intptr_t)(&new_off),
+ FKIOCTL, kcred, NULL);
+ if (error) {
+ if (error != ENOTTY)
+ goto lseek64error;
+ /*
+ * The ioctl is not supported; if the offset is valid, return
+ * the "virtual hole" at the end of the file.
+ */
+ vattr.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred);
+ if (error)
+ goto lseek64error;
+ if (off < (offset_t)vattr.va_size) {
+ new_off = (offset_t)vattr.va_size;
+ } else {
+ error = ENXIO;
+ goto lseek64error;
+ }
+ }
+ fp->f_offset = new_off;
+ releasef(fdes);
+ return (new_off);
+
+ default:
+ error = EINVAL;
+ goto lseek64error;
+ }
+
+ old_off = fp->f_offset;
+ if ((error = VOP_SEEK(vp, old_off, &new_off)) == 0) {
+ fp->f_offset = new_off;
+ releasef(fdes);
+ return (new_off);
+ }
+
+lseek64error:
+ releasef(fdes);
+ return ((off_t)set_errno(error));
+}
+#endif /* _LP64 */
diff --git a/usr/src/uts/common/syscall/lwp_create.c b/usr/src/uts/common/syscall/lwp_create.c
new file mode 100644
index 0000000000..e0bf63c886
--- /dev/null
+++ b/usr/src/uts/common/syscall/lwp_create.c
@@ -0,0 +1,212 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/syscall.h>
+#include <sys/proc.h>
+#include <sys/processor.h>
+#include <sys/fault.h>
+#include <sys/ucontext.h>
+#include <sys/signal.h>
+#include <sys/unistd.h>
+#include <sys/procfs.h>
+#include <sys/prsystm.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/klwp.h>
+#include <sys/pool.h>
+
+/*
+ * System call to create an lwp.
+ *
+ * Notes on the LWP_DETACHED and LWP_DAEMON flags:
+ *
+ * A detached lwp (LWP_DETACHED) cannot be the specific target of
+ * lwp_wait() (it is not joinable), but lwp_wait(0, ...) is required
+ * to sleep until all non-daemon detached lwps have terminated before
+ * returning EDEADLK because a detached lwp might create a non-detached lwp
+ * that could then be returned by lwp_wait(0, ...). See also lwp_detach().
+ *
+ * A daemon lwp (LWP_DAEMON) is a detached lwp that has the additional
+ * property that it does not affect the termination condition of the
+ * process: The last non-daemon lwp to call lwp_exit() causes the process
+ * to exit and lwp_wait(0, ...) does not sleep waiting for daemon lwps
+ * to terminate. See the block comment before lwp_wait().
+ */
+int
+syslwp_create(ucontext_t *ucp, int flags, id_t *new_lwp)
+{
+ klwp_t *lwp;
+ proc_t *p = ttoproc(curthread);
+ kthread_t *t;
+ ucontext_t uc;
+#ifdef _SYSCALL32_IMPL
+ ucontext32_t uc32;
+#endif /* _SYSCALL32_IMPL */
+ k_sigset_t sigmask;
+ int tid;
+ model_t model = get_udatamodel();
+ uintptr_t thrptr = 0;
+
+ if (flags & ~(LWP_DAEMON|LWP_DETACHED|LWP_SUSPENDED))
+ return (set_errno(EINVAL));
+
+ /*
+ * lwp_create() is disallowed for the /proc agent lwp.
+ */
+ if (curthread == p->p_agenttp)
+ return (set_errno(ENOTSUP));
+
+ if (model == DATAMODEL_NATIVE) {
+ if (copyin(ucp, &uc, sizeof (ucontext_t)))
+ return (set_errno(EFAULT));
+ sigutok(&uc.uc_sigmask, &sigmask);
+#if defined(__i386)
+ /*
+ * libc stashed thrptr into unused kernel %sp.
+ * See setup_context() in libc.
+ */
+ thrptr = (uint32_t)uc.uc_mcontext.gregs[ESP];
+#endif
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ if (copyin(ucp, &uc32, sizeof (ucontext32_t)))
+ return (set_errno(EFAULT));
+ sigutok(&uc32.uc_sigmask, &sigmask);
+#if defined(__sparc)
+ ucontext_32ton(&uc32, &uc, NULL, NULL);
+#else /* __amd64 */
+ ucontext_32ton(&uc32, &uc);
+ /*
+ * libc stashed thrptr into unused kernel %sp.
+ * See setup_context() in libc.
+ */
+ thrptr = (uint32_t)uc32.uc_mcontext.gregs[ESP];
+#endif
+ }
+#endif /* _SYSCALL32_IMPL */
+
+ (void) save_syscall_args(); /* save args for tracing first */
+
+ mutex_enter(&curproc->p_lock);
+ pool_barrier_enter();
+ mutex_exit(&curproc->p_lock);
+ lwp = lwp_create(lwp_rtt, NULL, NULL, curproc, TS_STOPPED,
+ curthread->t_pri, &sigmask, curthread->t_cid, 0);
+ mutex_enter(&curproc->p_lock);
+ pool_barrier_exit();
+ mutex_exit(&curproc->p_lock);
+ if (lwp == NULL)
+ return (set_errno(EAGAIN));
+
+ lwp_load(lwp, uc.uc_mcontext.gregs, thrptr);
+
+ t = lwptot(lwp);
+ /*
+ * Copy the new lwp's lwpid into the caller's specified buffer.
+ */
+ if (new_lwp && copyout(&t->t_tid, new_lwp, sizeof (id_t))) {
+ /*
+ * caller's buffer is not writable, return
+ * EFAULT, and terminate new lwp.
+ */
+ mutex_enter(&p->p_lock);
+ t->t_proc_flag |= TP_EXITLWP;
+ t->t_sig_check = 1;
+ t->t_sysnum = 0;
+ t->t_proc_flag &= ~TP_HOLDLWP;
+ lwp_create_done(t);
+ mutex_exit(&p->p_lock);
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * Clone the caller's context, if any. Must be invoked
+ * while -not- holding p_lock.
+ */
+ if (curthread->t_ctx)
+ lwp_createctx(curthread, t);
+
+ /*
+ * copy current contract templates
+ */
+ lwp_ctmpl_copy(lwp, ttolwp(curthread));
+
+ mutex_enter(&p->p_lock);
+ /*
+ * Copy the syscall arguments to the new lwp's arg area
+ * for the benefit of debuggers.
+ */
+ t->t_sysnum = SYS_lwp_create;
+ lwp->lwp_ap = lwp->lwp_arg;
+ lwp->lwp_arg[0] = (long)ucp;
+ lwp->lwp_arg[1] = (long)flags;
+ lwp->lwp_arg[2] = (long)new_lwp;
+ lwp->lwp_argsaved = 1;
+
+ if (!(flags & (LWP_DETACHED|LWP_DAEMON)))
+ t->t_proc_flag |= TP_TWAIT;
+ if (flags & LWP_DAEMON) {
+ t->t_proc_flag |= TP_DAEMON;
+ p->p_lwpdaemon++;
+ }
+
+ tid = (int)t->t_tid; /* for /proc debuggers */
+
+ /*
+ * We now set the newly-created lwp running.
+ * If it is being created as LWP_SUSPENDED, we leave its
+ * TP_HOLDLWP flag set so it will stop in system call exit.
+ */
+ if (!(flags & LWP_SUSPENDED))
+ t->t_proc_flag &= ~TP_HOLDLWP;
+ lwp_create_done(t);
+ mutex_exit(&p->p_lock);
+
+ return (tid);
+}
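+
+/*
+ * Userland sketch of the LWP_DETACHED/LWP_DAEMON semantics described
+ * above (assumes the thr_create(3C) wrapper, whose THR_DETACHED and
+ * THR_DAEMON flags ultimately reach syslwp_create(); worker/arg are
+ * placeholders):
+ *
+ *	thread_t tid;
+ *	(void) thr_create(NULL, 0, worker, arg,
+ *	    THR_DETACHED | THR_DAEMON, &tid);
+ *
+ * Such a thread is not joinable and does not keep the process alive.
+ */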
+
+/*
+ * Exit the calling lwp
+ */
+void
+syslwp_exit()
+{
+ proc_t *p = ttoproc(curthread);
+
+ mutex_enter(&p->p_lock);
+ lwp_exit();
+ /* NOTREACHED */
+}
diff --git a/usr/src/uts/common/syscall/lwp_info.c b/usr/src/uts/common/syscall/lwp_info.c
new file mode 100644
index 0000000000..21ac0ca4c3
--- /dev/null
+++ b/usr/src/uts/common/syscall/lwp_info.c
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/time.h>
+#include <sys/debug.h>
+#include <sys/model.h>
+#include <sys/msacct.h>
+
+/*
+ * Get the time accounting information for the calling LWP.
+ */
+int
+lwp_info(timestruc_t *tvp)
+{
+ timestruc_t tv[2];
+ hrtime_t hrutime, hrstime;
+ klwp_t *lwp = ttolwp(curthread);
+
+ hrutime = lwp->lwp_mstate.ms_acct[LMS_USER];
+ hrstime = lwp->lwp_mstate.ms_acct[LMS_SYSTEM] +
+ lwp->lwp_mstate.ms_acct[LMS_TRAP];
+ scalehrtime(&hrutime);
+ scalehrtime(&hrstime);
+
+ hrt2ts(hrutime, &tv[0]);
+ hrt2ts(hrstime, &tv[1]);
+
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyout(tv, tvp, sizeof (tv)))
+ return (set_errno(EFAULT));
+ } else {
+ timestruc32_t tv32[2];
+
+ if (TIMESPEC_OVERFLOW(&tv[0]) ||
+ TIMESPEC_OVERFLOW(&tv[1]))
+ return (set_errno(EOVERFLOW)); /* unlikely */
+
+ TIMESPEC_TO_TIMESPEC32(&tv32[0], &tv[0]);
+ TIMESPEC_TO_TIMESPEC32(&tv32[1], &tv[1]);
+
+ if (copyout(tv32, tvp, sizeof (tv32)))
+ return (set_errno(EFAULT));
+ }
+ return (0);
+}
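+
+/*
+ * Userland sketch (assumes the _lwp_info(3C) wrapper, which receives the
+ * two timestructs copied out above in a struct lwpinfo):
+ *
+ *	struct lwpinfo li;
+ *	if (_lwp_info(&li) == 0)
+ *		(void) printf("user %ld.%09lds sys %ld.%09lds\n",
+ *		    li.lwp_utime.tv_sec, li.lwp_utime.tv_nsec,
+ *		    li.lwp_stime.tv_sec, li.lwp_stime.tv_nsec);
+ */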
diff --git a/usr/src/uts/common/syscall/lwp_self.c b/usr/src/uts/common/syscall/lwp_self.c
new file mode 100644
index 0000000000..bbd5b40632
--- /dev/null
+++ b/usr/src/uts/common/syscall/lwp_self.c
@@ -0,0 +1,39 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* Copyright (c) 1994 Sun Microsystems, Inc. */
+/* All Rights Reserved */
+
+
+#ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+
+int
+lwp_self()
+{
+ return (curthread->t_tid);
+}
diff --git a/usr/src/uts/common/syscall/lwp_sobj.c b/usr/src/uts/common/syscall/lwp_sobj.c
new file mode 100644
index 0000000000..5b255912a0
--- /dev/null
+++ b/usr/src/uts/common/syscall/lwp_sobj.c
@@ -0,0 +1,3119 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/cred.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/prsystm.h>
+#include <sys/kmem.h>
+#include <sys/sobject.h>
+#include <sys/fault.h>
+#include <sys/procfs.h>
+#include <sys/watchpoint.h>
+#include <sys/time.h>
+#include <sys/cmn_err.h>
+#include <sys/machlock.h>
+#include <sys/debug.h>
+#include <sys/synch.h>
+#include <sys/synch32.h>
+#include <sys/mman.h>
+#include <sys/class.h>
+#include <sys/schedctl.h>
+#include <sys/sleepq.h>
+#include <sys/policy.h>
+#include <sys/tnf_probe.h>
+#include <sys/lwpchan_impl.h>
+#include <sys/turnstile.h>
+#include <sys/atomic.h>
+#include <sys/lwp_timer_impl.h>
+#include <sys/lwp_upimutex_impl.h>
+#include <vm/as.h>
+#include <sys/sdt.h>
+
+static kthread_t *lwpsobj_owner(caddr_t);
+static void lwp_unsleep(kthread_t *t);
+static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
+static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
+
+extern int lwp_cond_signal(lwp_cond_t *cv);
+
+/*
+ * Maximum number of user prio inheritance locks that can be held by a thread.
+ * Used to limit kmem for each thread. This is a per-thread limit that
+ * can be administered on a system wide basis (using /etc/system).
+ *
+ * Also, if a limit, say maxlwps, is added for the number of lwps within a
+ * process, the per-thread limit automatically implies a process-wide limit
+ * on the maximum number of held upi locks within a process:
+ * maxheldupimx = maxnestupimx * maxlwps;
+ */
+static uint32_t maxnestupimx = 2000;
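+
+/*
+ * As noted above, this can be tuned system-wide; a sketch of the
+ * corresponding /etc/system entry (the value shown is only an example):
+ *
+ *	set maxnestupimx = 4000
+ */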
+
+/*
+ * The sobj_ops vector exports a set of functions needed when a thread
+ * is asleep on a synchronization object of this type.
+ */
+static sobj_ops_t lwp_sobj_ops = {
+ SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
+};
+
+static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
+
+static sobj_ops_t lwp_sobj_pi_ops = {
+ SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
+ turnstile_change_pri
+};
+
+static sleepq_head_t lwpsleepq[NSLEEPQ];
+upib_t upimutextab[UPIMUTEX_TABSIZE];
+
+#define LWPCHAN_LOCK_SHIFT 10 /* 1024 locks for each pool */
+#define LWPCHAN_LOCK_SIZE (1 << LWPCHAN_LOCK_SHIFT)
+
+/*
+ * We know that both lc_wchan and lc_wchan0 are addresses that most
+ * likely are 8-byte aligned, so we shift off the low-order 3 bits.
+ * 'pool' is either 0 or 1.
+ */
+#define LWPCHAN_LOCK_HASH(X, pool) \
+ (((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
+ (LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))
+
+static kmutex_t lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
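+
+/*
+ * Worked example of LWPCHAN_LOCK_HASH (hypothetical hash input): with
+ * X = 0x2f008 and pool = 0,
+ *
+ *	X >> 3					  = 0x5e01
+ *	X >> (LWPCHAN_LOCK_SHIFT + 3)		  = X >> 13 = 0x17
+ *	(0x5e01 ^ 0x17) & (LWPCHAN_LOCK_SIZE - 1) = 0x216
+ *
+ * so this lwpchan uses lwpchanlock[0x216]; pool == 1 would add
+ * LWPCHAN_LOCK_SIZE to select the second pool of locks.
+ */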
+
+/*
+ * Is this a POSIX threads user-level lock requiring priority inheritance?
+ */
+#define UPIMUTEX(type) ((type) & LOCK_PRIO_INHERIT)
+
+static sleepq_head_t *
+lwpsqhash(lwpchan_t *lwpchan)
+{
+ uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
+ return (&lwpsleepq[SQHASHINDEX(x)]);
+}
+
+/*
+ * Lock an lwpchan.
+ * Keep this in sync with lwpchan_unlock(), below.
+ */
+static void
+lwpchan_lock(lwpchan_t *lwpchan, int pool)
+{
+ uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
+ mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
+}
+
+/*
+ * Unlock an lwpchan.
+ * Keep this in sync with lwpchan_lock(), above.
+ */
+static void
+lwpchan_unlock(lwpchan_t *lwpchan, int pool)
+{
+ uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
+ mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
+}
+
+/*
+ * Delete mappings from the lwpchan cache for pages that are being
+ * unmapped by as_unmap(). Given a range of addresses, "start" to "end",
+ * all mappings within the range are deleted from the lwpchan cache.
+ */
+void
+lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
+{
+ lwpchan_data_t *lcp;
+ lwpchan_hashbucket_t *hashbucket;
+ lwpchan_hashbucket_t *endbucket;
+ lwpchan_entry_t *ent;
+ lwpchan_entry_t **prev;
+ caddr_t addr;
+
+ mutex_enter(&p->p_lcp_lock);
+ lcp = p->p_lcp;
+ hashbucket = lcp->lwpchan_cache;
+ endbucket = hashbucket + lcp->lwpchan_size;
+ for (; hashbucket < endbucket; hashbucket++) {
+ if (hashbucket->lwpchan_chain == NULL)
+ continue;
+ mutex_enter(&hashbucket->lwpchan_lock);
+ prev = &hashbucket->lwpchan_chain;
+ /* check entire chain */
+ while ((ent = *prev) != NULL) {
+ addr = ent->lwpchan_addr;
+ if (start <= addr && addr < end) {
+ *prev = ent->lwpchan_next;
+ if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
+ (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
+ lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
+ kmem_free(ent, sizeof (*ent));
+ atomic_add_32(&lcp->lwpchan_entries, -1);
+ } else {
+ prev = &ent->lwpchan_next;
+ }
+ }
+ mutex_exit(&hashbucket->lwpchan_lock);
+ }
+ mutex_exit(&p->p_lcp_lock);
+}
+
+/*
+ * Given an lwpchan cache pointer and a process virtual address,
+ * return a pointer to the corresponding lwpchan hash bucket.
+ */
+static lwpchan_hashbucket_t *
+lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
+{
+ uint_t i;
+
+ /*
+ * All user-level sync object addresses are 8-byte aligned.
+ * Ignore the lowest 3 bits of the address and use the
+ * higher-order 2*lwpchan_bits bits for the hash index.
+ */
+ addr >>= 3;
+ i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
+ return (lcp->lwpchan_cache + i);
+}
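+
+/*
+ * Illustrative sketch only: assuming a hypothetical lwpchan_bits of 8
+ * (so lwpchan_mask == 255), the address 0x2468 hashes as follows:
+ * addr >> 3 == 0x48d; 0x48d ^ (0x48d >> 8) == 0x48d ^ 0x4 == 0x489;
+ * and 0x489 & 255 == 0x89, selecting bucket 137. The actual number of
+ * bits is always taken from the live lwpchan_data_t, not assumed.
+ */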
+
+/*
+ * (Re)allocate the per-process lwpchan cache.
+ */
+static void
+lwpchan_alloc_cache(proc_t *p, uint_t bits)
+{
+ lwpchan_data_t *lcp;
+ lwpchan_data_t *old_lcp;
+ lwpchan_hashbucket_t *hashbucket;
+ lwpchan_hashbucket_t *endbucket;
+ lwpchan_hashbucket_t *newbucket;
+ lwpchan_entry_t *ent;
+ lwpchan_entry_t *next;
+ uint_t count;
+
+ ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
+
+ lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
+ lcp->lwpchan_bits = bits;
+ lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
+ lcp->lwpchan_mask = lcp->lwpchan_size - 1;
+ lcp->lwpchan_entries = 0;
+ lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
+ sizeof (lwpchan_hashbucket_t), KM_SLEEP);
+ lcp->lwpchan_next_data = NULL;
+
+ mutex_enter(&p->p_lcp_lock);
+ if ((old_lcp = p->p_lcp) != NULL) {
+ if (old_lcp->lwpchan_bits >= bits) {
+ /* someone beat us to it */
+ mutex_exit(&p->p_lcp_lock);
+ kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
+ sizeof (lwpchan_hashbucket_t));
+ kmem_free(lcp, sizeof (lwpchan_data_t));
+ return;
+ }
+ /*
+ * Acquire all of the old hash table locks.
+ */
+ hashbucket = old_lcp->lwpchan_cache;
+ endbucket = hashbucket + old_lcp->lwpchan_size;
+ for (; hashbucket < endbucket; hashbucket++)
+ mutex_enter(&hashbucket->lwpchan_lock);
+ /*
+ * Move all of the old hash table entries to the
+ * new hash table. The new hash table has not yet
+ * been installed so we don't need any of its locks.
+ */
+ count = 0;
+ hashbucket = old_lcp->lwpchan_cache;
+ for (; hashbucket < endbucket; hashbucket++) {
+ ent = hashbucket->lwpchan_chain;
+ while (ent != NULL) {
+ next = ent->lwpchan_next;
+ newbucket = lwpchan_bucket(lcp,
+ (uintptr_t)ent->lwpchan_addr);
+ ent->lwpchan_next = newbucket->lwpchan_chain;
+ newbucket->lwpchan_chain = ent;
+ ent = next;
+ count++;
+ }
+ hashbucket->lwpchan_chain = NULL;
+ }
+ lcp->lwpchan_entries = count;
+ }
+
+ /*
+ * Retire the old hash table. We can't actually kmem_free() it
+ * now because someone may still have a pointer to it. Instead,
+ * we link it onto the new hash table's list of retired hash tables.
+ * Each new hash table is double the size of the previous one, so the
+ * total size of all retired hash tables (a geometric series) is less
+ * than the size of the new one. exit() and exec() free the retired
+ * hash tables (see lwpchan_destroy_cache(), below).
+ */
+ lcp->lwpchan_next_data = old_lcp;
+
+ /*
+ * As soon as we store the new lcp, future locking operations will
+ * use it. Therefore, we must ensure that all the state we've just
+ * established reaches global visibility before the new lcp does.
+ */
+ membar_producer();
+ p->p_lcp = lcp;
+
+ if (old_lcp != NULL) {
+ /*
+ * Release all of the old hash table locks.
+ */
+ hashbucket = old_lcp->lwpchan_cache;
+ for (; hashbucket < endbucket; hashbucket++)
+ mutex_exit(&hashbucket->lwpchan_lock);
+ }
+ mutex_exit(&p->p_lcp_lock);
+}
+
+/*
+ * Deallocate the lwpchan cache, and any dynamically allocated mappings.
+ * Called when the process exits or execs. All lwps except one have
+ * exited so we need no locks here.
+ */
+void
+lwpchan_destroy_cache(int exec)
+{
+ proc_t *p = curproc;
+ lwpchan_hashbucket_t *hashbucket;
+ lwpchan_hashbucket_t *endbucket;
+ lwpchan_data_t *lcp;
+ lwpchan_entry_t *ent;
+ lwpchan_entry_t *next;
+ uint16_t lockflg;
+
+ lcp = p->p_lcp;
+ p->p_lcp = NULL;
+
+ lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
+ hashbucket = lcp->lwpchan_cache;
+ endbucket = hashbucket + lcp->lwpchan_size;
+ for (; hashbucket < endbucket; hashbucket++) {
+ ent = hashbucket->lwpchan_chain;
+ hashbucket->lwpchan_chain = NULL;
+ while (ent != NULL) {
+ next = ent->lwpchan_next;
+ if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
+ (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
+ lwp_mutex_cleanup(ent, lockflg);
+ kmem_free(ent, sizeof (*ent));
+ ent = next;
+ }
+ }
+
+ while (lcp != NULL) {
+ lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
+ kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
+ sizeof (lwpchan_hashbucket_t));
+ kmem_free(lcp, sizeof (lwpchan_data_t));
+ lcp = next_lcp;
+ }
+}
+
+/*
+ * Return zero when there is an entry in the lwpchan cache for the
+ * given process virtual address and non-zero when there is not.
+ * The returned non-zero value is the current length of the
+ * hash chain plus one. The caller holds the hash bucket lock.
+ */
+static uint_t
+lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
+ lwpchan_hashbucket_t *hashbucket)
+{
+ lwpchan_entry_t *ent;
+ uint_t count = 1;
+
+ for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
+ if (ent->lwpchan_addr == addr) {
+ if (ent->lwpchan_type != type ||
+ ent->lwpchan_pool != pool) {
+ /*
+ * This shouldn't happen, but might if the
+ * process reuses its memory for different
+ * types of sync objects. We test first to avoid
+ * grabbing the cache line for exclusive access
+ * when no store is actually needed.
+ */
+ ent->lwpchan_type = (uint16_t)type;
+ ent->lwpchan_pool = (uint16_t)pool;
+ }
+ *lwpchan = ent->lwpchan_lwpchan;
+ return (0);
+ }
+ count++;
+ }
+ return (count);
+}
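+
+/*
+ * For illustration: a lookup in an empty bucket returns 1, a miss on a
+ * chain of n entries returns n + 1, and a hit returns 0 after copying
+ * the cached mapping out through *lwpchan. The non-zero return value
+ * feeds the chain-length check in lwpchan_get_mapping(), below.
+ */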
+
+/*
+ * Return the cached lwpchan mapping if cached, otherwise insert
+ * a virtual address to lwpchan mapping into the cache.
+ */
+static int
+lwpchan_get_mapping(struct as *as, caddr_t addr,
+ int type, lwpchan_t *lwpchan, int pool)
+{
+ proc_t *p = curproc;
+ lwpchan_data_t *lcp;
+ lwpchan_hashbucket_t *hashbucket;
+ lwpchan_entry_t *ent;
+ memid_t memid;
+ uint_t count;
+ uint_t bits;
+
+top:
+ /* initialize the lwpchan cache, if necessary */
+ if ((lcp = p->p_lcp) == NULL) {
+ lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
+ goto top;
+ }
+ hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
+ mutex_enter(&hashbucket->lwpchan_lock);
+ if (lcp != p->p_lcp) {
+ /* someone resized the lwpchan cache; start over */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ goto top;
+ }
+ if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
+ /* it's in the cache */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ return (1);
+ }
+ mutex_exit(&hashbucket->lwpchan_lock);
+ if (as_getmemid(as, addr, &memid) != 0)
+ return (0);
+ lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
+ lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
+ ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
+ mutex_enter(&hashbucket->lwpchan_lock);
+ if (lcp != p->p_lcp) {
+ /* someone resized the lwpchan cache; start over */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ kmem_free(ent, sizeof (*ent));
+ goto top;
+ }
+ count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
+ if (count == 0) {
+ /* someone else added this entry to the cache */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ kmem_free(ent, sizeof (*ent));
+ return (1);
+ }
+ if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
+ (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
+ /* hash chain too long; reallocate the hash table */
+ mutex_exit(&hashbucket->lwpchan_lock);
+ kmem_free(ent, sizeof (*ent));
+ lwpchan_alloc_cache(p, bits + 1);
+ goto top;
+ }
+ ent->lwpchan_addr = addr;
+ ent->lwpchan_type = (uint16_t)type;
+ ent->lwpchan_pool = (uint16_t)pool;
+ ent->lwpchan_lwpchan = *lwpchan;
+ ent->lwpchan_next = hashbucket->lwpchan_chain;
+ hashbucket->lwpchan_chain = ent;
+ atomic_add_32(&lcp->lwpchan_entries, 1);
+ mutex_exit(&hashbucket->lwpchan_lock);
+ return (1);
+}
+
+/*
+ * Return a unique pair of identifiers that corresponds to a
+ * synchronization object's virtual address. Process-shared
+ * sync objects usually get vnode/offset from as_getmemid().
+ */
+static int
+get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
+{
+ /*
+ * If the lwp synch object is defined to be process-private,
+ * we just make the first field of the lwpchan be 'as' and
+ * the second field be the synch object's virtual address.
+ * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
+ * The lwpchan cache is used only for process-shared objects.
+ */
+ if ((type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) == 0) {
+ lwpchan->lc_wchan0 = (caddr_t)as;
+ lwpchan->lc_wchan = addr;
+ return (1);
+ }
+ /* check the lwpchan cache for mapping */
+ return (lwpchan_get_mapping(as, addr, type, lwpchan, pool));
+}
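+
+/*
+ * Illustrative consequence of the above (a sketch, not new behavior):
+ * two processes that map the same USYNC_PROCESS lock from a shared file
+ * get the same vnode/offset pair back from as_getmemid(), hence the
+ * same lwpchan, so their waiters meet on a common sleep queue. A
+ * process-private lock at the same virtual address in two processes
+ * instead yields distinct {as, addr} pairs, keeping the queues apart.
+ */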
+
+static void
+lwp_block(lwpchan_t *lwpchan)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ sleepq_head_t *sqh;
+
+ thread_lock(t);
+ t->t_flag |= T_WAKEABLE;
+ t->t_lwpchan = *lwpchan;
+ t->t_sobj_ops = &lwp_sobj_ops;
+ t->t_release = 0;
+ sqh = lwpsqhash(lwpchan);
+ disp_lock_enter_high(&sqh->sq_lock);
+ CL_SLEEP(t);
+ DTRACE_SCHED(sleep);
+ THREAD_SLEEP(t, &sqh->sq_lock);
+ sleepq_insert(&sqh->sq_queue, t);
+ thread_unlock(t);
+ lwp->lwp_asleep = 1;
+ lwp->lwp_sysabort = 0;
+ lwp->lwp_ru.nvcsw++;
+ (void) new_mstate(curthread, LMS_SLEEP);
+}
+
+static kthread_t *
+lwpsobj_pi_owner(upimutex_t *up)
+{
+ return (up->upi_owner);
+}
+
+static struct upimutex *
+upi_get(upib_t *upibp, lwpchan_t *lcp)
+{
+ struct upimutex *upip;
+
+ for (upip = upibp->upib_first; upip != NULL;
+ upip = upip->upi_nextchain) {
+ if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
+ upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
+ break;
+ }
+ return (upip);
+}
+
+static void
+upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
+{
+ ASSERT(MUTEX_HELD(&upibp->upib_lock));
+
+ /*
+ * Insert upimutex at front of list. Maybe a bit unfair
+ * but assume that not many lwpchans hash to the same
+ * upimutextab bucket, i.e. the list of upimutexes from
+ * upib_first is not too long.
+ */
+ upimutex->upi_nextchain = upibp->upib_first;
+ upibp->upib_first = upimutex;
+}
+
+static void
+upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
+{
+ struct upimutex **prev;
+
+ ASSERT(MUTEX_HELD(&upibp->upib_lock));
+
+ prev = &upibp->upib_first;
+ while (*prev != upimutex) {
+ prev = &(*prev)->upi_nextchain;
+ }
+ *prev = upimutex->upi_nextchain;
+ upimutex->upi_nextchain = NULL;
+}
+
+/*
+ * Add upimutex to chain of upimutexes held by curthread.
+ * Returns number of upimutexes held by curthread.
+ */
+static uint32_t
+upi_mylist_add(struct upimutex *upimutex)
+{
+ kthread_t *t = curthread;
+
+ /*
+ * Insert upimutex at front of list of upimutexes owned by t. This
+ * would match typical LIFO order in which nested locks are acquired
+ * and released.
+ */
+ upimutex->upi_nextowned = t->t_upimutex;
+ t->t_upimutex = upimutex;
+ t->t_nupinest++;
+ ASSERT(t->t_nupinest > 0);
+ return (t->t_nupinest);
+}
+
+/*
+ * Delete upimutex from list of upimutexes owned by curthread.
+ */
+static void
+upi_mylist_del(struct upimutex *upimutex)
+{
+ kthread_t *t = curthread;
+ struct upimutex **prev;
+
+ /*
+ * Since the order in which nested locks are acquired and released
+ * is typically LIFO, and typical nesting levels are not too deep, the
+ * following search should not be expensive in the general case.
+ */
+ prev = &t->t_upimutex;
+ while (*prev != upimutex) {
+ prev = &(*prev)->upi_nextowned;
+ }
+ *prev = upimutex->upi_nextowned;
+ upimutex->upi_nextowned = NULL;
+ ASSERT(t->t_nupinest > 0);
+ t->t_nupinest--;
+}
+
+/*
+ * Returns true if the upimutex is owned by curthread. Should be called
+ * only when upim points to kmem which cannot disappear from underneath us.
+ */
+static int
+upi_owned(upimutex_t *upim)
+{
+ return (upim->upi_owner == curthread);
+}
+
+/*
+ * Returns pointer to kernel object (upimutex_t *) if lp is owned.
+ */
+static struct upimutex *
+lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
+{
+ lwpchan_t lwpchan;
+ upib_t *upibp;
+ struct upimutex *upimutex;
+
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL))
+ return (NULL);
+
+ upibp = &UPI_CHAIN(lwpchan);
+ mutex_enter(&upibp->upib_lock);
+ upimutex = upi_get(upibp, &lwpchan);
+ if (upimutex == NULL || upimutex->upi_owner != curthread) {
+ mutex_exit(&upibp->upib_lock);
+ return (NULL);
+ }
+ mutex_exit(&upibp->upib_lock);
+ return (upimutex);
+}
+
+/*
+ * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
+ * no lock hand-off occurs.
+ */
+static void
+upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
+{
+ turnstile_t *ts;
+ upib_t *upibp;
+ kthread_t *newowner;
+
+ upi_mylist_del(upimutex);
+ upibp = upimutex->upi_upibp;
+ mutex_enter(&upibp->upib_lock);
+ if (upimutex->upi_waiter != 0) { /* if waiters */
+ ts = turnstile_lookup(upimutex);
+ if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
+ /* hand-off lock to highest prio waiter */
+ newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
+ upimutex->upi_owner = newowner;
+ if (ts->ts_waiters == 1)
+ upimutex->upi_waiter = 0;
+ turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
+ mutex_exit(&upibp->upib_lock);
+ return;
+ } else if (ts != NULL) {
+ /* LOCK_NOTRECOVERABLE: wakeup all */
+ turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
+ } else {
+ /*
+ * Misleading w bit. Waiters might have been
+ * interrupted. No need to clear the w bit (upimutex
+ * will soon be freed). Re-calculate PI from existing
+ * waiters.
+ */
+ turnstile_exit(upimutex);
+ turnstile_pi_recalc();
+ }
+ }
+ /*
+ * no waiters, or LOCK_NOTRECOVERABLE.
+ * remove from the bucket chain of upi mutexes.
+ * de-allocate kernel memory (upimutex).
+ */
+ upi_chain_del(upimutex->upi_upibp, upimutex);
+ mutex_exit(&upibp->upib_lock);
+ kmem_free(upimutex, sizeof (upimutex_t));
+}
+
+static int
+lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
+{
+ label_t ljb;
+ int error = 0;
+ lwpchan_t lwpchan;
+ uint16_t flag;
+ upib_t *upibp;
+ volatile struct upimutex *upimutex = NULL;
+ turnstile_t *ts;
+ uint32_t nupinest;
+ volatile int upilocked = 0;
+
+ if (on_fault(&ljb)) {
+ if (upilocked)
+ upimutex_unlock((upimutex_t *)upimutex, 0);
+ error = EFAULT;
+ goto out;
+ }
+ /*
+ * The other _lwp_* synch primitives were apparently implemented on the
+ * assumption that get_lwpchan() fails to return a unique cookie when
+ * two processes (one forked from the other) point at the same underlying
+ * object: one typed USYNC_PROCESS but mapped MAP_PRIVATE, where the
+ * object has not yet been written to in the child process.
+ *
+ * Since get_lwpchan() has been fixed, it is not necessary to do the
+ * dummy writes that force a COW fault, as is done in other places
+ * (which should be fixed).
+ */
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ upibp = &UPI_CHAIN(lwpchan);
+retry:
+ mutex_enter(&upibp->upib_lock);
+ upimutex = upi_get(upibp, &lwpchan);
+ if (upimutex == NULL) {
+ /* lock available since lwpchan has no upimutex */
+ upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
+ upi_chain_add(upibp, (upimutex_t *)upimutex);
+ upimutex->upi_owner = curthread; /* grab lock */
+ upimutex->upi_upibp = upibp;
+ upimutex->upi_vaddr = lp;
+ upimutex->upi_lwpchan = lwpchan;
+ mutex_exit(&upibp->upib_lock);
+ nupinest = upi_mylist_add((upimutex_t *)upimutex);
+ upilocked = 1;
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if (nupinest > maxnestupimx &&
+ secpolicy_resource(CRED()) != 0) {
+ upimutex_unlock((upimutex_t *)upimutex, flag);
+ error = ENOMEM;
+ goto out;
+ }
+ if (flag & LOCK_OWNERDEAD) {
+ /*
+ * Return with upimutex held.
+ */
+ error = EOWNERDEAD;
+ } else if (flag & LOCK_NOTRECOVERABLE) {
+ /*
+ * Since the setting of LOCK_NOTRECOVERABLE
+ * was done under the high-level upi mutex,
+ * in lwp_upimutex_unlock(), this flag needs to
+ * be checked while holding the upi mutex.
+ * If set, this thread should return without
+ * the lock held, and with the right error
+ * code.
+ */
+ upimutex_unlock((upimutex_t *)upimutex, flag);
+ upilocked = 0;
+ error = ENOTRECOVERABLE;
+ }
+ goto out;
+ }
+ /*
+ * If a upimutex object exists, it must have an owner.
+ * This is due to lock hand-off, and to the release of the upimutex
+ * when no waiters are present at unlock time.
+ */
+ ASSERT(upimutex->upi_owner != NULL);
+ if (upimutex->upi_owner == curthread) {
+ /*
+ * The user wrapper can check if the mutex type is
+ * ERRORCHECK: if not, it should stall at user-level.
+ * If so, it should return the error code.
+ */
+ mutex_exit(&upibp->upib_lock);
+ error = EDEADLK;
+ goto out;
+ }
+ if (try == UPIMUTEX_TRY) {
+ mutex_exit(&upibp->upib_lock);
+ error = EBUSY;
+ goto out;
+ }
+ /*
+ * Block for the lock.
+ * Put the lwp in an orderly state for debugging.
+ * Calling prstop() has to be done here, and not in
+ * turnstile_block(), since the preceding call to
+ * turnstile_lookup() raises the PIL to a level
+ * at which calls to prstop() should not be made.
+ */
+ if ((error = lwptp->lwpt_time_error) != 0) {
+ /*
+ * The SUSV3 Posix spec is very clear that we
+ * should get no error from validating the
+ * timer until we would actually sleep.
+ */
+ mutex_exit(&upibp->upib_lock);
+ goto out;
+ }
+ prstop(PR_REQUESTED, 0);
+ if (lwptp->lwpt_tsp != NULL) {
+ /*
+ * If we successfully queue the timeout
+ * (lwp_timer_enqueue() returns zero),
+ * then don't drop t_delay_lock until we are
+ * on the sleep queue (in turnstile_block()).
+ * Otherwise we will get an immediate timeout
+ * when we attempt to sleep in turnstile_block().
+ */
+ mutex_enter(&curthread->t_delay_lock);
+ if (lwp_timer_enqueue(lwptp) != 0)
+ mutex_exit(&curthread->t_delay_lock);
+ }
+ /*
+ * Now, set the waiter bit and block for the lock in turnstile_block().
+ * No need to preserve the previous wbit since a lock try is not
+ * attempted after setting the wait bit. Wait bit is set under
+ * the upib_lock, which is not released until the turnstile lock
+ * is acquired. Say, the upimutex is L:
+ *
+ * 1. upib_lock is held so the waiter does not have to retry L after
+ * setting the wait bit: since the owner has to grab the upib_lock
+ * to unlock L, it will certainly see the wait bit set.
+ * 2. upib_lock is not released until the turnstile lock is acquired.
+ * This is the key to preventing a missed wake-up. Otherwise, the
+ * owner could acquire the upib_lock, and the tc_lock, to call
+ * turnstile_wakeup(). All this, before the waiter gets tc_lock
+ * to sleep in turnstile_block(). turnstile_wakeup() will then not
+ * find this waiter, resulting in the missed wakeup.
+ * 3. The upib_lock, being a kernel mutex, cannot be released while
+ * holding the tc_lock (since mutex_exit() could need to acquire
+ * the same tc_lock)...and so is held when calling turnstile_block().
+ * The address of upib_lock is passed to turnstile_block() which
+ * releases it after releasing all turnstile locks, and before going
+ * to sleep in swtch().
+ * 4. The waiter value cannot be a count of waiters, because a waiter
+ * can be interrupted. The interrupt occurs under the tc_lock, at
+ * which point, the upib_lock cannot be locked, to decrement waiter
+ * count. So, just treat the waiter state as a bit, not a count.
+ */
+ ts = turnstile_lookup((upimutex_t *)upimutex);
+ upimutex->upi_waiter = 1;
+ error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
+ &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
+ /*
+ * Hand-off implies that we wakeup holding the lock, except when:
+ * - deadlock is detected
+ * - lock is not recoverable
+ * - we got an interrupt or timeout
+ * If we wake up due to an interrupt or timeout, we may
+ * or may not be holding the lock due to mutex hand-off.
+ * Use lwp_upimutex_owned() to check if we do hold the lock.
+ */
+ if (error != 0) {
+ if ((error == EINTR || error == ETIME) &&
+ (upimutex = lwp_upimutex_owned(lp, type))) {
+ /*
+ * Unlock and return - the re-startable syscall will
+ * try the lock again if we got EINTR.
+ */
+ (void) upi_mylist_add((upimutex_t *)upimutex);
+ upimutex_unlock((upimutex_t *)upimutex, 0);
+ }
+ /*
+ * The only other possible error is EDEADLK. If so, upimutex
+ * is valid, since its owner is deadlocked with curthread.
+ */
+ ASSERT(error == EINTR || error == ETIME ||
+ (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
+ ASSERT(!lwp_upimutex_owned(lp, type));
+ goto out;
+ }
+ if (lwp_upimutex_owned(lp, type)) {
+ ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
+ nupinest = upi_mylist_add((upimutex_t *)upimutex);
+ upilocked = 1;
+ }
+ /*
+ * Now, need to read the user-level lp->mutex_flag to do the following:
+ *
+ * - if lock is held, check if EOWNERDEAD should be returned
+ * - if lock isn't held, check if ENOTRECOVERABLE should be returned
+ *
+ * Now, either lp->mutex_flag is readable or it's not. If not
+ * readable, the on_fault path will cause a return with EFAULT as
+ * it should. If it is readable, the state of the flag encodes the
+ * robustness state of the lock:
+ *
+ * If the upimutex is locked here, the flag's LOCK_OWNERDEAD setting
+ * will influence the return code appropriately. If the upimutex is
+ * not locked here, this could be due to a spurious wake-up or a
+ * NOTRECOVERABLE event. The flag's setting can be used to distinguish
+ * between these two events.
+ */
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if (upilocked) {
+ /*
+ * If the thread wakes up from turnstile_block with the lock
+ * held, the flag could not be set to LOCK_NOTRECOVERABLE,
+ * since the lock would not have been handed off to it.
+ * So, no need to check for this case.
+ */
+ if (nupinest > maxnestupimx &&
+ secpolicy_resource(CRED()) != 0) {
+ upimutex_unlock((upimutex_t *)upimutex, flag);
+ upilocked = 0;
+ error = ENOMEM;
+ } else if (flag & LOCK_OWNERDEAD) {
+ error = EOWNERDEAD;
+ }
+ } else {
+ /*
+ * Wake-up without the upimutex held. Either this is a
+ * spurious wake-up (due to signals, forkall(), whatever), or
+ * it is a LOCK_NOTRECOVERABLE robustness event. The setting
+ * of the mutex flag can be used to distinguish between the
+ * two events.
+ */
+ if (flag & LOCK_NOTRECOVERABLE) {
+ error = ENOTRECOVERABLE;
+ } else {
+ /*
+ * Here, the flag could be set to LOCK_OWNERDEAD or
+ * not. In both cases, this is a spurious wakeup,
+ * since the upi lock is not held, but the thread
+ * has returned from turnstile_block().
+ *
+ * The user flag could be LOCK_OWNERDEAD if, at the
+ * same time as curthread having been woken up
+ * spuriously, the owner (say Tdead) has died, marked
+ * the mutex flag accordingly, and handed off the lock
+ * to some other waiter (say Tnew). curthread just
+ * happened to read the flag while Tnew has yet to deal
+ * with the owner-dead event.
+ *
+ * In this event, curthread should retry the lock.
+ * If Tnew is able to clean up the lock, curthread
+ * will eventually get the lock with a zero error code.
+ * If Tnew is unable to clean up, its eventual call to
+ * unlock the lock will result in the mutex flag being
+ * set to LOCK_NOTRECOVERABLE, and the wake-up of
+ * all waiters, including curthread, which will then
+ * eventually return ENOTRECOVERABLE due to the above
+ * check.
+ *
+ * Of course, if the user-flag is not set with
+ * LOCK_OWNERDEAD, retrying is the thing to do, since
+ * this is definitely a spurious wakeup.
+ */
+ goto retry;
+ }
+ }
+
+out:
+ no_fault();
+ return (error);
+}
+
+
+static int
+lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
+{
+ label_t ljb;
+ int error = 0;
+ lwpchan_t lwpchan;
+ uint16_t flag;
+ upib_t *upibp;
+ volatile struct upimutex *upimutex = NULL;
+ volatile int upilocked = 0;
+
+ if (on_fault(&ljb)) {
+ if (upilocked)
+ upimutex_unlock((upimutex_t *)upimutex, 0);
+ error = EFAULT;
+ goto out;
+ }
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ upibp = &UPI_CHAIN(lwpchan);
+ mutex_enter(&upibp->upib_lock);
+ upimutex = upi_get(upibp, &lwpchan);
+ /*
+ * If the lock is not held, or the owner is not curthread, return
+ * error. The user-level wrapper can return this error or stall,
+ * depending on whether mutex is of ERRORCHECK type or not.
+ */
+ if (upimutex == NULL || upimutex->upi_owner != curthread) {
+ mutex_exit(&upibp->upib_lock);
+ error = EPERM;
+ goto out;
+ }
+ mutex_exit(&upibp->upib_lock); /* release for user memory access */
+ upilocked = 1;
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if (flag & LOCK_OWNERDEAD) {
+ /*
+ * transition mutex to the LOCK_NOTRECOVERABLE state.
+ */
+ flag &= ~LOCK_OWNERDEAD;
+ flag |= LOCK_NOTRECOVERABLE;
+ suword16_noerr(&lp->mutex_flag, flag);
+ }
+ upimutex_unlock((upimutex_t *)upimutex, flag);
+ upilocked = 0;
+out:
+ no_fault();
+ return (error);
+}
+
+/*
+ * Mark user mutex state, corresponding to kernel upimutex, as LOCK_OWNERDEAD.
+ */
+static int
+upi_dead(upimutex_t *upip)
+{
+ label_t ljb;
+ int error = 0;
+ lwp_mutex_t *lp;
+ uint16_t flag;
+
+ if (on_fault(&ljb)) {
+ error = EFAULT;
+ goto out;
+ }
+
+ lp = upip->upi_vaddr;
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ flag |= LOCK_OWNERDEAD;
+ suword16_noerr(&lp->mutex_flag, flag);
+out:
+ no_fault();
+ return (error);
+}
+
+/*
+ * Unlock all upimutexes held by curthread, since curthread is dying.
+ * For each upimutex, attempt to mark its corresponding user mutex object as
+ * dead.
+ */
+void
+upimutex_cleanup()
+{
+ kthread_t *t = curthread;
+ struct upimutex *upip;
+
+ while ((upip = t->t_upimutex) != NULL) {
+ if (upi_dead(upip) != 0) {
+ /*
+ * If the user object associated with this upimutex is
+ * unmapped, unlock upimutex with the
+ * LOCK_NOTRECOVERABLE flag, so that all waiters are
+ * woken up. Since the user object is unmapped, it could
+ * not be marked as dead or notrecoverable.
+ * The waiters will now all wake up and return
+ * ENOTRECOVERABLE, since they would find that the lock
+ * has not been handed-off to them.
+ * See lwp_upimutex_lock().
+ */
+ upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
+ } else {
+ /*
+ * The user object has been updated as dead.
+ * Unlock the upimutex: if no waiters, upip kmem will
+ * be freed. If there is a waiter, the lock will be
+ * handed off. If exit() is in progress, each existing
+ * waiter will successively get the lock, as owners
+ * die, and each new owner will call this routine as
+ * it dies. The last owner will free kmem, since
+ * it will find the upimutex has no waiters. So,
+ * eventually, the kmem is guaranteed to be freed.
+ */
+ upimutex_unlock(upip, 0);
+ }
+ /*
+ * Note that the call to upimutex_unlock() above will delete
+ * upimutex from the t_upimutex chain. And so the
+ * while loop will eventually terminate.
+ */
+ }
+}
+
+int
+lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ lwp_timer_t lwpt;
+ caddr_t timedwait;
+ int error = 0;
+ int time_error;
+ clock_t tim = -1;
+ uchar_t waiters;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ label_t ljb;
+ volatile uint8_t type = 0;
+ lwpchan_t lwpchan;
+ sleepq_head_t *sqh;
+ static int iswanted();
+ uint16_t flag;
+ int imm_timeout = 0;
+
+ if ((caddr_t)lp >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ timedwait = (caddr_t)tsp;
+ if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
+ lwpt.lwpt_imm_timeout) {
+ imm_timeout = 1;
+ timedwait = NULL;
+ }
+
+ /*
+ * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
+ * this micro state is really a run state. If the thread indeed blocks,
+ * this state becomes valid. If not, the state is converted back to
+ * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
+ * when blocking.
+ */
+ (void) new_mstate(t, LMS_USER_LOCK);
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
+ if (UPIMUTEX(type)) {
+ no_fault();
+ error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
+ if ((error == 0 || error == EOWNERDEAD) &&
+ (type & USYNC_PROCESS))
+ (void) suword32(&lp->mutex_ownerpid, p->p_pid);
+ if (tsp && !time_error) /* copyout the residual time left */
+ error = lwp_timer_copyout(&lwpt, error);
+ if (error)
+ return (set_errno(error));
+ return (0);
+ }
+ /*
+ * Force Copy-on-write fault if lwp_mutex_t object is
+ * defined to be MAP_PRIVATE and it was initialized to
+ * USYNC_PROCESS.
+ */
+ suword8_noerr(&lp->mutex_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ fuword8_noerr(&lp->mutex_waiters, &waiters);
+ suword8_noerr(&lp->mutex_waiters, 1);
+ if (type & USYNC_PROCESS_ROBUST) {
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if (flag & LOCK_NOTRECOVERABLE) {
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = ENOTRECOVERABLE;
+ goto out;
+ }
+ }
+
+ /*
+ * If watchpoints are set, they must be temporarily disabled
+ * (and restored afterward), since atomic accesses of memory
+ * such as the call to ulock_try() below cannot be watched.
+ */
+
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+
+ while (!ulock_try(&lp->mutex_lockw)) {
+ if (time_error) {
+ /*
+ * The SUSV3 Posix spec is very clear that we
+ * should get no error from validating the
+ * timer until we would actually sleep.
+ */
+ error = time_error;
+ break;
+ }
+
+ if (watched) {
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ watched = 0;
+ }
+
+ /*
+ * Put the lwp in an orderly state for debugging.
+ */
+ prstop(PR_REQUESTED, 0);
+ if (timedwait) {
+ /*
+ * If we successfully queue the timeout,
+ * then don't drop t_delay_lock until
+ * we are on the sleep queue (below).
+ */
+ mutex_enter(&t->t_delay_lock);
+ if (lwp_timer_enqueue(&lwpt) != 0) {
+ mutex_exit(&t->t_delay_lock);
+ imm_timeout = 1;
+ timedwait = NULL;
+ }
+ }
+ lwp_block(&lwpchan);
+ /*
+ * Nothing should happen to cause the lwp to go to
+ * sleep again until after it returns from swtch().
+ */
+ if (timedwait)
+ mutex_exit(&t->t_delay_lock);
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
+ setrun(t);
+ swtch();
+ t->t_flag &= ~T_WAKEABLE;
+ if (timedwait)
+ tim = lwp_timer_dequeue(&lwpt);
+ setallwatch();
+ if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
+ error = EINTR;
+ else if (imm_timeout || (timedwait && tim == -1))
+ error = ETIME;
+ if (error) {
+ lwp->lwp_asleep = 0;
+ lwp->lwp_sysabort = 0;
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
+ S_WRITE);
+
+ /*
+ * Need to re-compute the waiters bit; the waiters field in
+ * the lock is not reliable. Either of two things could
+ * have occurred: no lwp may have called lwp_release()
+ * for me, but I have woken up due to a signal or
+ * timeout, in which case the waiter bit is incorrect
+ * since it is still set to 1 (set above); or an
+ * lwp_release() did occur for some other lwp on
+ * the same lwpchan, in which case the waiter bit is
+ * correct. One can't tell which event occurred,
+ * so recompute.
+ */
+ lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ sqh = lwpsqhash(&lwpchan);
+ disp_lock_enter(&sqh->sq_lock);
+ waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
+ disp_lock_exit(&sqh->sq_lock);
+ break;
+ }
+ lwp->lwp_asleep = 0;
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
+ S_WRITE);
+ lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ fuword8_noerr(&lp->mutex_waiters, &waiters);
+ suword8_noerr(&lp->mutex_waiters, 1);
+ if (type & USYNC_PROCESS_ROBUST) {
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if (flag & LOCK_NOTRECOVERABLE) {
+ error = ENOTRECOVERABLE;
+ break;
+ }
+ }
+ }
+
+ if (t->t_mstate == LMS_USER_LOCK)
+ (void) new_mstate(t, LMS_SYSTEM);
+
+ if (!error && (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST))) {
+ suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
+ if (type & USYNC_PROCESS_ROBUST) {
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if (flag & LOCK_OWNERDEAD)
+ error = EOWNERDEAD;
+ else if (flag & LOCK_UNMAPPED)
+ error = ELOCKUNMAPPED;
+ }
+ }
+ suword8_noerr(&lp->mutex_waiters, waiters);
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ if (tsp && !time_error) /* copyout the residual time left */
+ error = lwp_timer_copyout(&lwpt, error);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Obsolete lwp_mutex_lock() interface, no longer called from libc.
+ * libc now calls lwp_mutex_timedlock(lp, NULL).
+ * This system call trap continues to exist solely for the benefit
+ * of old statically-linked binaries from Solaris 9 and before.
+ * It should be removed from the system when we no longer care
+ * about such applications.
+ */
+int
+lwp_mutex_lock(lwp_mutex_t *lp)
+{
+ return (lwp_mutex_timedlock(lp, NULL));
+}
+
+static int
+iswanted(kthread_t *t, lwpchan_t *lwpchan)
+{
+ /*
+ * The caller holds the dispatcher lock on the sleep queue.
+ */
+ while (t != NULL) {
+ if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
+ t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
+ return (1);
+ t = t->t_link;
+ }
+ return (0);
+}
+
+/*
+ * Return the highest priority thread sleeping on this lwpchan.
+ */
+static kthread_t *
+lwp_queue_waiter(lwpchan_t *lwpchan)
+{
+ sleepq_head_t *sqh;
+ kthread_t *tp;
+
+ sqh = lwpsqhash(lwpchan);
+ disp_lock_enter(&sqh->sq_lock); /* lock the sleep queue */
+ for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
+ if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
+ tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
+ break;
+ }
+ disp_lock_exit(&sqh->sq_lock);
+ return (tp);
+}
+
+static int
+lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
+{
+ sleepq_head_t *sqh;
+ kthread_t *tp;
+ kthread_t **tpp;
+
+ sqh = lwpsqhash(lwpchan);
+ disp_lock_enter(&sqh->sq_lock); /* lock the sleep queue */
+ tpp = &sqh->sq_queue.sq_first;
+ while ((tp = *tpp) != NULL) {
+ if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
+ tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
+ /*
+ * The following is typically false. It could be true
+ * only if lwp_release() is called from
+ * lwp_mutex_wakeup() after reading the waiters field
+ * from memory in which the lwp lock used to be, but has
+ * since been re-used to hold a lwp cv or lwp semaphore.
+ * The thread "tp" found to match the lwp lock's wchan
+ * is actually sleeping for the cv or semaphore which
+ * now has the same wchan. In this case, lwp_release()
+ * should return failure.
+ */
+ if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
+ ASSERT(sync_type == 0);
+ /*
+ * The assertion above says that this can happen
+ * only for mutexes (i.e. sync_type == 0) in
+ * correctly written user programs.
+ */
+ disp_lock_exit(&sqh->sq_lock);
+ return (0);
+ }
+ *waiters = iswanted(tp->t_link, lwpchan);
+ sleepq_unlink(tpp, tp);
+ DTRACE_SCHED1(wakeup, kthread_t *, tp);
+ tp->t_wchan0 = NULL;
+ tp->t_wchan = NULL;
+ tp->t_sobj_ops = NULL;
+ tp->t_release = 1;
+ THREAD_TRANSITION(tp); /* drops sleepq lock */
+ CL_WAKEUP(tp);
+ thread_unlock(tp); /* drop run queue lock */
+ return (1);
+ }
+ tpp = &tp->t_link;
+ }
+ *waiters = 0;
+ disp_lock_exit(&sqh->sq_lock);
+ return (0);
+}
+
+static void
+lwp_release_all(lwpchan_t *lwpchan)
+{
+ sleepq_head_t *sqh;
+ kthread_t *tp;
+ kthread_t **tpp;
+
+ sqh = lwpsqhash(lwpchan);
+ disp_lock_enter(&sqh->sq_lock); /* lock the sleep queue */
+ tpp = &sqh->sq_queue.sq_first;
+ while ((tp = *tpp) != NULL) {
+ if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
+ tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
+ sleepq_unlink(tpp, tp);
+ DTRACE_SCHED1(wakeup, kthread_t *, tp);
+ tp->t_wchan0 = NULL;
+ tp->t_wchan = NULL;
+ tp->t_sobj_ops = NULL;
+ CL_WAKEUP(tp);
+ thread_unlock_high(tp); /* release run queue lock */
+ } else {
+ tpp = &tp->t_link;
+ }
+ }
+ disp_lock_exit(&sqh->sq_lock); /* drop sleep q lock */
+}
+
+/*
+ * Unblock an lwp that is trying to acquire this mutex. The blocked
+ * lwp resumes and retries to acquire the lock.
+ */
+int
+lwp_mutex_wakeup(lwp_mutex_t *lp)
+{
+ proc_t *p = ttoproc(curthread);
+ lwpchan_t lwpchan;
+ uchar_t waiters;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ volatile uint8_t type = 0;
+ label_t ljb;
+ int error = 0;
+
+ if ((caddr_t)lp >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ /*
+ * Force Copy-on-write fault if lwp_mutex_t object is
+ * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
+ */
+ fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
+ suword8_noerr(&lp->mutex_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ /*
+ * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
+ * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
+ * may fail; if it fails, do not write into the waiter bit.
+ * The call to lwp_release() might fail for one of three reasons:
+ *
+ * 1. The thread which set the waiter bit is not actually
+ * sleeping, since it got the lock on the re-try. The waiter
+ * bit will then be correctly updated by that thread. This
+ * window may be closed by reading the wait bit again here
+ * and not calling lwp_release() at all if it is zero.
+ * 2. The thread which set the waiter bit and went to sleep
+ * was woken up by a signal. In this case, the waiter recomputes
+ * the wait bit in the return-with-EINTR code path.
+ * 3. The waiter bit read by lwp_mutex_wakeup() was in
+ * memory that has been re-used after the lock was dropped.
+ * In this case, writing into the waiter bit would cause data
+ * corruption.
+ */
+ if (lwp_release(&lwpchan, &waiters, 0) == 1) {
+ suword8_noerr(&lp->mutex_waiters, waiters);
+ }
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * lwp_cond_wait() has four arguments, a pointer to a condition variable,
+ * a pointer to a mutex, a pointer to a timespec for a timed wait and
+ * a flag telling the kernel whether or not to honor the kernel/user
+ * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
+ * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
+ * lwpchan, returned by get_lwpchan(). If the timespec pointer is non-NULL,
+ * it is used as an in/out parameter. On entry, it contains the relative
+ * time until timeout. On exit, we copyout the residual time left to it.
+ */
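+/*
+ * A minimal user-level sketch of the protocol this trap assumes; the
+ * wrapper calls shown are hypothetical stand-ins for libc's wrappers:
+ *
+ *	(void) mutex_lock(mp);
+ *	while (!predicate)
+ *		(void) cond_wait(cv, mp);
+ *	(void) mutex_unlock(mp);
+ *
+ * Since the mutex is dropped in the kernel and re-acquired by the
+ * caller on return, the predicate must be re-checked in a loop; a
+ * spurious wakeup is expected and harmless under this usage.
+ */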
+int
+lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ lwp_timer_t lwpt;
+ lwpchan_t cv_lwpchan;
+ lwpchan_t m_lwpchan;
+ caddr_t timedwait;
+ volatile uint16_t type = 0;
+ volatile uint8_t mtype = 0;
+ uchar_t waiters;
+ volatile int error;
+ clock_t tim = -1;
+ volatile int locked = 0;
+ volatile int m_locked = 0;
+ volatile int cvwatched = 0;
+ volatile int mpwatched = 0;
+ label_t ljb;
+ volatile int no_lwpchan = 1;
+ int imm_timeout = 0;
+ int imm_unpark = 0;
+
+ if ((caddr_t)cv >= p->p_as->a_userlimit ||
+ (caddr_t)mp >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ timedwait = (caddr_t)tsp;
+ if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
+ return (set_errno(error));
+ if (lwpt.lwpt_imm_timeout) {
+ imm_timeout = 1;
+ timedwait = NULL;
+ }
+
+ (void) new_mstate(t, LMS_USER_LOCK);
+
+ if (on_fault(&ljb)) {
+ if (no_lwpchan) {
+ error = EFAULT;
+ goto out;
+ }
+ if (m_locked) {
+ m_locked = 0;
+ lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
+ }
+ if (locked) {
+ locked = 0;
+ lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
+ }
+ /*
+ * set up another on_fault() for a possible fault
+ * on the user lock accessed at "efault"
+ */
+ if (on_fault(&ljb)) {
+ if (m_locked) {
+ m_locked = 0;
+ lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
+ }
+ goto out;
+ }
+ error = EFAULT;
+ goto efault;
+ }
+
+ /*
+ * Force Copy-on-write fault if lwp_cond_t and lwp_mutex_t
+ * objects are defined to be MAP_PRIVATE, and are USYNC_PROCESS
+ */
+ fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
+ if (UPIMUTEX(mtype) == 0) {
+ suword8_noerr(&mp->mutex_type, mtype);
+ /* convert user level mutex, "mp", to a unique lwpchan */
+ /* check if mtype is ok to use below, instead of type from cv */
+ if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
+ &m_lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ }
+ fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
+ suword16_noerr(&cv->cond_type, type);
+ /* convert user level condition variable, "cv", to a unique lwpchan */
+ if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
+ &cv_lwpchan, LWPCHAN_CVPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ no_lwpchan = 0;
+ cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
+ if (UPIMUTEX(mtype) == 0)
+ mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
+ S_WRITE);
+
+ /*
+ * lwpchan_lock ensures that the calling lwp is put to sleep atomically
+ * with respect to a possible wakeup which is a result of either
+ * an lwp_cond_signal() or an lwp_cond_broadcast().
+ *
+ * What's misleading is that the lwp is put to sleep after the
+ * condition variable's mutex is released. This is OK as long as
+ * the release operation is also done while holding lwpchan_lock.
+ * The lwp is then put to sleep when the possibility of pagefaulting
+ * or sleeping is completely eliminated.
+ */
+ lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
+ locked = 1;
+ if (UPIMUTEX(mtype) == 0) {
+ lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
+ m_locked = 1;
+ suword8_noerr(&cv->cond_waiters_kernel, 1);
+ /*
+ * unlock the condition variable's mutex. (pagefaults are
+ * possible here.)
+ */
+ ulock_clear(&mp->mutex_lockw);
+ fuword8_noerr(&mp->mutex_waiters, &waiters);
+ if (waiters != 0) {
+ /*
+ * Given the locking of lwpchan_lock around the release
+ * of the mutex and checking for waiters, the following
+ * call to lwp_release() can fail ONLY if the lock
+ * acquirer is interrupted after setting the waiter bit,
+ * calling lwp_block() and releasing lwpchan_lock.
+ * In this case, it could get pulled off the lwp sleep
+ * q (via setrun()) before the following call to
+ * lwp_release() occurs. In this case, the lock
+ * requestor will update the waiter bit correctly by
+ * re-evaluating it.
+ */
+ if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
+ suword8_noerr(&mp->mutex_waiters, waiters);
+ }
+ m_locked = 0;
+ lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
+ } else {
+ suword8_noerr(&cv->cond_waiters_kernel, 1);
+ error = lwp_upimutex_unlock(mp, mtype);
+ if (error) { /* if the upimutex unlock failed */
+ locked = 0;
+ lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
+ goto out;
+ }
+ }
+ no_fault();
+
+ if (mpwatched) {
+ watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
+ mpwatched = 0;
+ }
+ if (cvwatched) {
+ watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
+ cvwatched = 0;
+ }
+
+ /*
+ * Put the lwp in an orderly state for debugging.
+ */
+ prstop(PR_REQUESTED, 0);
+ if (check_park && (!schedctl_is_park() || t->t_unpark)) {
+ /*
+ * We received a signal at user-level before calling here
+ * or another thread wants us to return immediately
+ * with EINTR. See lwp_unpark().
+ */
+ imm_unpark = 1;
+ t->t_unpark = 0;
+ timedwait = NULL;
+ } else if (timedwait) {
+ /*
+ * If we successfully queue the timeout,
+ * then don't drop t_delay_lock until
+ * we are on the sleep queue (below).
+ */
+ mutex_enter(&t->t_delay_lock);
+ if (lwp_timer_enqueue(&lwpt) != 0) {
+ mutex_exit(&t->t_delay_lock);
+ imm_timeout = 1;
+ timedwait = NULL;
+ }
+ }
+ t->t_flag |= T_WAITCVSEM;
+ lwp_block(&cv_lwpchan);
+ /*
+ * Nothing should happen to cause the lwp to go to sleep
+ * until after it returns from swtch().
+ */
+ if (timedwait)
+ mutex_exit(&t->t_delay_lock);
+ locked = 0;
+ lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
+ if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
+ (imm_timeout | imm_unpark))
+ setrun(t);
+ swtch();
+ t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
+ if (timedwait)
+ tim = lwp_timer_dequeue(&lwpt);
+ if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
+ MUSTRETURN(p, t) || imm_unpark)
+ error = EINTR;
+ else if (imm_timeout || (timedwait && tim == -1))
+ error = ETIME;
+ lwp->lwp_asleep = 0;
+ lwp->lwp_sysabort = 0;
+ setallwatch();
+
+ if (t->t_mstate == LMS_USER_LOCK)
+ (void) new_mstate(t, LMS_SYSTEM);
+
+ if (tsp && check_park) /* copyout the residual time left */
+ error = lwp_timer_copyout(&lwpt, error);
+
+ /* the mutex is reacquired by the caller on return to user level */
+ if (error) {
+ /*
+ * If we were concurrently lwp_cond_signal()d and we
+ * received a UNIX signal or got a timeout, then perform
+ * another lwp_cond_signal() to avoid consuming the wakeup.
+ */
+ if (t->t_release)
+ (void) lwp_cond_signal(cv);
+ return (set_errno(error));
+ }
+ return (0);
+
+efault:
+ /*
+ * Make sure that the user-level lock is dropped before
+ * returning to the caller, since the caller always re-acquires it.
+ */
+ if (UPIMUTEX(mtype) == 0) {
+ lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
+ m_locked = 1;
+ ulock_clear(&mp->mutex_lockw);
+ fuword8_noerr(&mp->mutex_waiters, &waiters);
+ if (waiters != 0) {
+ /*
+ * See comment above on lock clearing and lwp_release()
+ * success/failure.
+ */
+ if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
+ suword8_noerr(&mp->mutex_waiters, waiters);
+ }
+ m_locked = 0;
+ lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
+ } else {
+ (void) lwp_upimutex_unlock(mp, mtype);
+ }
+out:
+ no_fault();
+ if (mpwatched)
+ watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
+ if (cvwatched)
+ watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
+ if (t->t_mstate == LMS_USER_LOCK)
+ (void) new_mstate(t, LMS_SYSTEM);
+ return (set_errno(error));
+}
+
+/*
+ * Wake up one lwp that's blocked on this condition variable.
+ */
+int
+lwp_cond_signal(lwp_cond_t *cv)
+{
+ proc_t *p = ttoproc(curthread);
+ lwpchan_t lwpchan;
+ uchar_t waiters;
+ volatile uint16_t type = 0;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ label_t ljb;
+ int error = 0;
+
+ if ((caddr_t)cv >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ /*
+ * Force Copy-on-write fault if lwp_cond_t object is
+ * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
+ */
+ fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
+ suword16_noerr(&cv->cond_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
+ &lwpchan, LWPCHAN_CVPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 1;
+ fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
+ if (waiters != 0) {
+ /*
+ * The following call to lwp_release() might fail but it is
+ * OK to write into the waiters bit below, since the memory
+ * could not have been re-used or unmapped (for correctly
+ * written user programs) as in the case of lwp_mutex_wakeup().
+ * For an incorrect program, we should not care about data
+ * corruption since this is just one instance of other places
+ * where corruption can occur for such a program. Of course
+ * if the memory is unmapped, normal fault recovery occurs.
+ */
+ (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
+ suword8_noerr(&cv->cond_waiters_kernel, waiters);
+ }
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Wake up every lwp that's blocked on this condition variable.
+ */
+int
+lwp_cond_broadcast(lwp_cond_t *cv)
+{
+ proc_t *p = ttoproc(curthread);
+ lwpchan_t lwpchan;
+ volatile uint16_t type = 0;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ label_t ljb;
+ uchar_t waiters;
+ int error = 0;
+
+ if ((caddr_t)cv >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ /*
+ * Force Copy-on-write fault if lwp_cond_t object is
+ * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
+ */
+ fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
+ suword16_noerr(&cv->cond_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
+ &lwpchan, LWPCHAN_CVPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 1;
+ fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
+ if (waiters != 0) {
+ lwp_release_all(&lwpchan);
+ suword8_noerr(&cv->cond_waiters_kernel, 0);
+ }
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+int
+lwp_sema_trywait(lwp_sema_t *sp)
+{
+ kthread_t *t = curthread;
+ proc_t *p = ttoproc(t);
+ label_t ljb;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ volatile uint16_t type = 0;
+ int count;
+ lwpchan_t lwpchan;
+ uchar_t waiters;
+ int error = 0;
+
+ if ((caddr_t)sp >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ /*
+ * Force Copy-on-write fault if lwp_sema_t object is
+ * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
+ */
+ fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
+ suword16_noerr((void *)&sp->sema_type, type);
+ if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
+ &lwpchan, LWPCHAN_CVPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 1;
+ fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
+ if (count == 0)
+ error = EBUSY;
+ else
+ suword32_noerr((void *)&sp->sema_count, --count);
+ if (count != 0) {
+ fuword8_noerr(&sp->sema_waiters, &waiters);
+ if (waiters != 0) {
+ (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
+ suword8_noerr(&sp->sema_waiters, waiters);
+ }
+ }
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
+ */
+int
+lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ lwp_timer_t lwpt;
+ caddr_t timedwait;
+ clock_t tim = -1;
+ label_t ljb;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ volatile uint16_t type = 0;
+ int count;
+ lwpchan_t lwpchan;
+ uchar_t waiters;
+ int error = 0;
+ int time_error;
+ int imm_timeout = 0;
+ int imm_unpark = 0;
+
+ if ((caddr_t)sp >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ timedwait = (caddr_t)tsp;
+ if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
+ lwpt.lwpt_imm_timeout) {
+ imm_timeout = 1;
+ timedwait = NULL;
+ }
+
+ watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ /*
+ * Force Copy-on-write fault if lwp_sema_t object is
+ * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
+ */
+ fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
+ suword16_noerr((void *)&sp->sema_type, type);
+ if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
+ &lwpchan, LWPCHAN_CVPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 1;
+ fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
+ while (error == 0 && count == 0) {
+ if (time_error) {
+ /*
+ * The SUSV3 Posix spec is very clear that we
+ * should get no error from validating the
+ * timer until we would actually sleep.
+ */
+ error = time_error;
+ break;
+ }
+ suword8_noerr(&sp->sema_waiters, 1);
+ if (watched)
+ watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
+ /*
+ * Put the lwp in an orderly state for debugging.
+ */
+ prstop(PR_REQUESTED, 0);
+ if (check_park && (!schedctl_is_park() || t->t_unpark)) {
+ /*
+ * We received a signal at user-level before calling
+ * here or another thread wants us to return
+ * immediately with EINTR. See lwp_unpark().
+ */
+ imm_unpark = 1;
+ t->t_unpark = 0;
+ timedwait = NULL;
+ } else if (timedwait) {
+ /*
+ * If we successfully queue the timeout,
+ * then don't drop t_delay_lock until
+ * we are on the sleep queue (below).
+ */
+ mutex_enter(&t->t_delay_lock);
+ if (lwp_timer_enqueue(&lwpt) != 0) {
+ mutex_exit(&t->t_delay_lock);
+ imm_timeout = 1;
+ timedwait = NULL;
+ }
+ }
+ t->t_flag |= T_WAITCVSEM;
+ lwp_block(&lwpchan);
+ /*
+ * Nothing should happen to cause the lwp to sleep
+ * again until after it returns from swtch().
+ */
+ if (timedwait)
+ mutex_exit(&t->t_delay_lock);
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
+ (imm_timeout | imm_unpark))
+ setrun(t);
+ swtch();
+ t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
+ if (timedwait)
+ tim = lwp_timer_dequeue(&lwpt);
+ setallwatch();
+ if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
+ MUSTRETURN(p, t) || imm_unpark)
+ error = EINTR;
+ else if (imm_timeout || (timedwait && tim == -1))
+ error = ETIME;
+ lwp->lwp_asleep = 0;
+ lwp->lwp_sysabort = 0;
+ watched = watch_disable_addr((caddr_t)sp,
+ sizeof (*sp), S_WRITE);
+ lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 1;
+ fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
+ }
+ if (error == 0)
+ suword32_noerr((void *)&sp->sema_count, --count);
+ if (count != 0) {
+ (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
+ suword8_noerr(&sp->sema_waiters, waiters);
+ }
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
+ if (tsp && check_park && !time_error)
+ error = lwp_timer_copyout(&lwpt, error);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Obsolete lwp_sema_wait() interface, no longer called from libc.
+ * libc now calls lwp_sema_timedwait().
+ * This system call trap exists solely for the benefit of old
+ * statically linked applications from Solaris 9 and before.
+ * It should be removed when we no longer care about such applications.
+ */
+int
+lwp_sema_wait(lwp_sema_t *sp)
+{
+ return (lwp_sema_timedwait(sp, NULL, 0));
+}
+
+int
+lwp_sema_post(lwp_sema_t *sp)
+{
+ proc_t *p = ttoproc(curthread);
+ label_t ljb;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ volatile uint16_t type = 0;
+ int count;
+ lwpchan_t lwpchan;
+ uchar_t waiters;
+ int error = 0;
+
+ if ((caddr_t)sp >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ /*
+ * Force Copy-on-write fault if lwp_sema_t object is
+ * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
+ */
+ fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
+ suword16_noerr(&sp->sema_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
+ &lwpchan, LWPCHAN_CVPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 1;
+ fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
+ if (count == _SEM_VALUE_MAX)
+ error = EOVERFLOW;
+ else
+ suword32_noerr(&sp->sema_count, ++count);
+ if (count == 1) {
+ fuword8_noerr(&sp->sema_waiters, &waiters);
+ if (waiters) {
+ (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
+ suword8_noerr(&sp->sema_waiters, waiters);
+ }
+ }
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+#define TRW_WANT_WRITE 0x1
+#define TRW_LOCK_GRANTED 0x2
+
+#define READ_LOCK 0
+#define WRITE_LOCK 1
+#define TRY_FLAG 0x10
+#define READ_LOCK_TRY (READ_LOCK | TRY_FLAG)
+#define WRITE_LOCK_TRY (WRITE_LOCK | TRY_FLAG)
+
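+/*
+ * The userland rwstate word is used as follows: the bits covered by
+ * URW_READERS_MASK count the active readers (incremented once per
+ * reader), URW_WRITE_LOCKED marks a held write lock, and the
+ * URW_WRITE_WANTED and URW_HAS_WAITERS bits record blocked writers
+ * and waiters respectively, as manipulated by the functions below.
+ */
+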
+/*
+ * Release one writer or one or more readers. Compute the rwstate word to
+ * reflect the new state of the queue. For a safe hand-off we copy the new
+ * rwstate value back to userland before we wake any of the new lock holders.
+ *
+ * Note that sleepq_insert() implements a prioritized FIFO (with writers
+ * being given precedence over readers of the same priority).
+ *
+ * If the first thread is a reader we scan the queue releasing all readers
+ * until we hit a writer or the end of the queue. If the first thread is a
+ * writer we still need to check for another writer (i.e. URW_WRITE_WANTED).
+ */
+void
+lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
+{
+ sleepq_head_t *sqh;
+ kthread_t *tp;
+ kthread_t **tpp;
+ kthread_t *tpnext;
+ kthread_t *wakelist = NULL;
+ uint32_t rwstate = 0;
+ int wcount = 0;
+ int rcount = 0;
+
+ sqh = lwpsqhash(lwpchan);
+ disp_lock_enter(&sqh->sq_lock);
+ tpp = &sqh->sq_queue.sq_first;
+ while ((tp = *tpp) != NULL) {
+ if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
+ tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
+ if (tp->t_writer & TRW_WANT_WRITE) {
+ if ((wcount++ == 0) && (rcount == 0)) {
+ rwstate |= URW_WRITE_LOCKED;
+
+ /* Just one writer to wake. */
+ sleepq_unlink(tpp, tp);
+ wakelist = tp;
+
+ /* tpp already set for next thread. */
+ continue;
+ } else {
+ rwstate |=
+ (URW_WRITE_WANTED|URW_HAS_WAITERS);
+
+ /* We need look no further. */
+ break;
+ }
+ } else {
+ rcount++;
+ if (wcount == 0) {
+ rwstate++;
+
+ /* Add reader to wake list. */
+ sleepq_unlink(tpp, tp);
+ tp->t_link = wakelist;
+ wakelist = tp;
+
+ /* tpp already set for next thread. */
+ continue;
+ } else
+ rwstate |= URW_HAS_WAITERS;
+ }
+ }
+ tpp = &tp->t_link;
+ }
+
+ /* Copy the new rwstate back to userland. */
+ suword32_noerr(&rw->rwlock_readers, rwstate);
+
+ /* Wake the new lock holder(s) up. */
+ tp = wakelist;
+ while (tp != NULL) {
+ DTRACE_SCHED1(wakeup, kthread_t *, tp);
+ tp->t_wchan0 = NULL;
+ tp->t_wchan = NULL;
+ tp->t_sobj_ops = NULL;
+ tp->t_writer |= TRW_LOCK_GRANTED;
+ tpnext = tp->t_link;
+ tp->t_link = NULL;
+ CL_WAKEUP(tp);
+ thread_unlock_high(tp);
+ tp = tpnext;
+ }
+
+ disp_lock_exit(&sqh->sq_lock);
+}
+
+/*
+ * We enter here holding the user-level mutex, which we must release before
+ * returning or blocking. Based on lwp_cond_wait().
+ */
+static int
+lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
+{
+ lwp_mutex_t *mp = NULL;
+ kthread_t *t = curthread;
+ kthread_t *tp;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ lwp_timer_t lwpt;
+ lwpchan_t lwpchan;
+ lwpchan_t mlwpchan;
+ caddr_t timedwait;
+ volatile uint16_t type = 0;
+ volatile uint8_t mtype = 0;
+ uchar_t mwaiters;
+ volatile int error = 0;
+ int time_error;
+ clock_t tim = -1;
+ volatile int locked = 0;
+ volatile int mlocked = 0;
+ volatile int watched = 0;
+ volatile int mwatched = 0;
+ label_t ljb;
+ volatile int no_lwpchan = 1;
+ int imm_timeout = 0;
+ int try_flag;
+ uint32_t rwstate;
+ int acquired = 0;
+
+ /* We only check rw because the mutex is included in it. */
+ if ((caddr_t)rw >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ /* We must only report this error if we are about to sleep (later). */
+ timedwait = (caddr_t)tsp;
+ if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
+ lwpt.lwpt_imm_timeout) {
+ imm_timeout = 1;
+ timedwait = NULL;
+ }
+
+ (void) new_mstate(t, LMS_USER_LOCK);
+
+ if (on_fault(&ljb)) {
+ if (no_lwpchan) {
+ error = EFAULT;
+ goto out_nodrop;
+ }
+ if (mlocked) {
+ mlocked = 0;
+ lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
+ }
+ if (locked) {
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ }
+ /*
+ * Set up another on_fault() for a possible fault
+ * on the user lock accessed at "out_drop".
+ */
+ if (on_fault(&ljb)) {
+ if (mlocked) {
+ mlocked = 0;
+ lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
+ }
+ error = EFAULT;
+ goto out_nodrop;
+ }
+ error = EFAULT;
+		goto out_drop;
+ }
+
+ /* Process rd_wr (including sanity check). */
+ try_flag = (rd_wr & TRY_FLAG);
+ rd_wr &= ~TRY_FLAG;
+ if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
+ error = EINVAL;
+ goto out_nodrop;
+ }
+
+ /* We can only continue for simple USYNC_PROCESS locks. */
+ mp = &rw->mutex;
+ fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
+ fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
+ if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
+ error = EINVAL;
+ goto out_nodrop;
+ }
+
+	/* Force Copy-on-write fault in case objects are MAP_PRIVATE. */
+ suword8_noerr(&mp->mutex_type, mtype);
+ suword16_noerr(&rw->rwlock_type, type);
+
+ /* Convert user level mutex, "mp", to a unique lwpchan. */
+ if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
+ &mlwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out_nodrop;
+ }
+
+ /* Convert user level rwlock, "rw", to a unique lwpchan. */
+ if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
+ &lwpchan, LWPCHAN_CVPOOL)) {
+ error = EFAULT;
+ goto out_nodrop;
+ }
+
+ no_lwpchan = 0;
+ watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
+ mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
+
+ /*
+ * lwpchan_lock() ensures that the calling LWP is put to sleep
+ * atomically with respect to a possible wakeup which is a result
+ * of lwp_rwlock_unlock().
+ *
+ * What's misleading is that the LWP is put to sleep after the
+ * rwlock's mutex is released. This is OK as long as the release
+ * operation is also done while holding mlwpchan. The LWP is then
+ * put to sleep when the possibility of pagefaulting or sleeping
+ * has been completely eliminated.
+ */
+ lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 1;
+ lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
+ mlocked = 1;
+
+ /*
+ * Fetch the current rwlock state.
+ *
+ * The possibility of spurious wake-ups or killed waiters means that
+ * rwstate's URW_HAS_WAITERS and URW_WRITE_WANTED bits may indicate
+ * false positives. We only fix these if they are important to us.
+ *
+ * Although various error states can be observed here (e.g. the lock
+	 * is not held, but there are waiters) we assume these are application
+ * errors and so we take no corrective action.
+ */
+ fuword32_noerr(&rw->rwlock_readers, &rwstate);
+
+ /*
+ * If the lock is uncontended we can acquire it here. These tests
+ * should have already been done at user-level, we just need to be
+ * sure.
+ */
+ if (rd_wr == READ_LOCK) {
+ if ((rwstate & ~URW_READERS_MASK) == 0) {
+ rwstate++;
+ acquired = 1;
+ }
+ } else if (rwstate == 0) {
+ rwstate = URW_WRITE_LOCKED;
+ acquired = 1;
+ }
+
+ /*
+ * We can only try harder if the lock isn't held by a writer.
+ */
+ if (!acquired && !(rwstate & URW_WRITE_LOCKED)) {
+ tp = lwp_queue_waiter(&lwpchan);
+ if (tp == NULL) {
+ /*
+ * Hmmm, rwstate indicates waiters but there are
+ * none queued. This could just be the result of a
+ * spurious wakeup, so let's fix it.
+ */
+ rwstate &= URW_READERS_MASK;
+
+ /*
+ * We now have another chance to acquire the lock
+ * uncontended, but this is the last chance for a
+ * writer to acquire the lock without blocking.
+ */
+ if (rd_wr == READ_LOCK) {
+ rwstate++;
+ acquired = 1;
+ } else if (rwstate == 0) {
+ rwstate = URW_WRITE_LOCKED;
+ acquired = 1;
+ }
+ } else if (rd_wr == READ_LOCK) {
+ /*
+ * This is the last chance for a reader to acquire
+ * the lock now, but it can only do so if there is
+ * no writer of equal or greater priority at the
+			 * head of the queue.
+ *
+ * It is also just possible that there is a reader
+ * at the head of the queue. This may be the result
+ * of a spurious wakeup or an application failure.
+ * In this case we only acquire the lock if we have
+ * equal or greater priority. It is not our job to
+ * release spurious waiters.
+ */
+ pri_t our_pri = DISP_PRIO(t);
+ pri_t his_pri = DISP_PRIO(tp);
+
+ if ((our_pri > his_pri) || ((our_pri == his_pri) &&
+ !(tp->t_writer & TRW_WANT_WRITE))) {
+ rwstate++;
+ acquired = 1;
+ }
+ }
+ }
+
+ if (acquired || try_flag || time_error) {
+ /*
+ * We're not going to block this time!
+ */
+ suword32_noerr(&rw->rwlock_readers, rwstate);
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 0;
+
+ if (acquired) {
+ /*
+ * Got the lock!
+ */
+ error = 0;
+
+ } else if (try_flag) {
+ /*
+			 * We didn't get the lock and, since this is a
+			 * trylock, we return EBUSY instead of blocking.
+ */
+ error = EBUSY;
+
+ } else if (time_error) {
+ /*
+ * The SUSV3 POSIX spec is very clear that we should
+ * get no error from validating the timer (above)
+ * until we would actually sleep.
+ */
+ error = time_error;
+ }
+
+ goto out_drop;
+ }
+
+ /*
+ * We're about to block, so indicate what kind of waiter we are.
+ */
+ t->t_writer = 0;
+ rwstate |= URW_HAS_WAITERS;
+ if (rd_wr == WRITE_LOCK) {
+ t->t_writer = TRW_WANT_WRITE;
+ rwstate |= URW_WRITE_WANTED;
+ }
+ suword32_noerr(&rw->rwlock_readers, rwstate);
+
+ /*
+ * Unlock the rwlock's mutex (pagefaults are possible here).
+ */
+ ulock_clear(&mp->mutex_lockw);
+ fuword8_noerr(&mp->mutex_waiters, &mwaiters);
+ if (mwaiters != 0) {
+ /*
+ * Given the locking of mlwpchan around the release of
+ * the mutex and checking for waiters, the following
+ * call to lwp_release() can fail ONLY if the lock
+ * acquirer is interrupted after setting the waiter bit,
+ * calling lwp_block() and releasing mlwpchan.
+ * In this case, it could get pulled off the LWP sleep
+ * queue (via setrun()) before the following call to
+ * lwp_release() occurs, and the lock requestor will
+ * update the waiter bit correctly by re-evaluating it.
+ */
+ if (lwp_release(&mlwpchan, &mwaiters, 0) > 0)
+ suword8_noerr(&mp->mutex_waiters, mwaiters);
+ }
+ lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
+ mlocked = 0;
+ no_fault();
+
+ if (mwatched) {
+ watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
+ mwatched = 0;
+ }
+ if (watched) {
+ watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
+ watched = 0;
+ }
+
+ /*
+ * Put the LWP in an orderly state for debugging.
+ */
+ prstop(PR_REQUESTED, 0);
+ if (timedwait) {
+ /*
+ * If we successfully queue the timeout,
+ * then don't drop t_delay_lock until
+ * we are on the sleep queue (below).
+ */
+ mutex_enter(&t->t_delay_lock);
+ if (lwp_timer_enqueue(&lwpt) != 0) {
+ mutex_exit(&t->t_delay_lock);
+ imm_timeout = 1;
+ timedwait = NULL;
+ }
+ }
+ t->t_flag |= T_WAITCVSEM;
+ lwp_block(&lwpchan);
+
+ /*
+	 * Nothing should happen to cause the LWP to go to sleep until after
+ * it returns from swtch().
+ */
+ if (timedwait)
+ mutex_exit(&t->t_delay_lock);
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t))
+ setrun(t);
+ swtch();
+
+ /*
+ * We're back, but we need to work out why. Were we interrupted? Did
+ * we timeout? Were we granted the lock?
+ */
+ error = EAGAIN;
+ acquired = (t->t_writer & TRW_LOCK_GRANTED);
+ t->t_writer = 0;
+ t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
+ if (timedwait)
+ tim = lwp_timer_dequeue(&lwpt);
+ if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
+ error = EINTR;
+ else if (imm_timeout || (timedwait && tim == -1))
+ error = ETIME;
+ lwp->lwp_asleep = 0;
+ lwp->lwp_sysabort = 0;
+ setallwatch();
+
+ /*
+ * If we were granted the lock we don't care about EINTR or ETIME.
+ */
+ if (acquired)
+ error = 0;
+
+ if (t->t_mstate == LMS_USER_LOCK)
+ (void) new_mstate(t, LMS_SYSTEM);
+
+ if (error)
+ return (set_errno(error));
+ return (0);
+
+out_drop:
+ /*
+ * Make sure that the user level lock is dropped before returning
+ * to the caller.
+ */
+ if (!mlocked) {
+ lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
+ mlocked = 1;
+ }
+ suword32_noerr(&mp->mutex_ownerpid, 0);
+ ulock_clear(&mp->mutex_lockw);
+ fuword8_noerr(&mp->mutex_waiters, &mwaiters);
+ if (mwaiters != 0) {
+ /*
+ * See comment above on lock clearing and lwp_release()
+ * success/failure.
+ */
+ if (lwp_release(&mlwpchan, &mwaiters, 0) > 0)
+ suword8_noerr(&mp->mutex_waiters, mwaiters);
+ }
+ lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
+ mlocked = 0;
+
+out_nodrop:
+ no_fault();
+ if (mwatched)
+ watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
+ if (watched)
+ watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
+ if (t->t_mstate == LMS_USER_LOCK)
+ (void) new_mstate(t, LMS_SYSTEM);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
+ * we never drop the lock.
+ */
+static int
+lwp_rwlock_unlock(lwp_rwlock_t *rw)
+{
+ kthread_t *t = curthread;
+ proc_t *p = ttoproc(t);
+ lwpchan_t lwpchan;
+ volatile uint16_t type = 0;
+ volatile int error = 0;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ label_t ljb;
+ volatile int no_lwpchan = 1;
+ uint32_t rwstate;
+
+ /* We only check rw because the mutex is included in it. */
+ if ((caddr_t)rw >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ if (on_fault(&ljb)) {
+ if (no_lwpchan) {
+ error = EFAULT;
+ goto out_nodrop;
+ }
+ if (locked) {
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ }
+ error = EFAULT;
+ goto out_nodrop;
+ }
+
+ /* We can only continue for simple USYNC_PROCESS locks. */
+ fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
+ if (type != USYNC_PROCESS) {
+ error = EINVAL;
+ goto out_nodrop;
+ }
+
+	/* Force Copy-on-write fault in case objects are MAP_PRIVATE. */
+ suword16_noerr(&rw->rwlock_type, type);
+
+ /* Convert user level rwlock, "rw", to a unique lwpchan. */
+ if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
+ &lwpchan, LWPCHAN_CVPOOL)) {
+ error = EFAULT;
+ goto out_nodrop;
+ }
+
+ no_lwpchan = 0;
+ watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
+
+ lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 1;
+
+ /*
+ * We can resolve multiple readers (except the last reader) here.
+ * For the last reader or a writer we need lwp_rwlock_release(),
+ * to which we also delegate the task of copying the new rwstate
+ * back to userland (see the comment there).
+ */
+ fuword32_noerr(&rw->rwlock_readers, &rwstate);
+ if (rwstate & URW_WRITE_LOCKED)
+ lwp_rwlock_release(&lwpchan, rw);
+ else if ((rwstate & URW_READERS_MASK) > 0) {
+ rwstate--;
+ if ((rwstate & URW_READERS_MASK) == 0)
+ lwp_rwlock_release(&lwpchan, rw);
+ else
+ suword32_noerr(&rw->rwlock_readers, rwstate);
+ }
+
+ lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
+ locked = 0;
+ error = 0;
+
+out_nodrop:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
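+/*
+ * System call entry point: dispatch on the subcode supplied by the
+ * library to the read-lock, write-lock, try-read, try-write or
+ * unlock operation.
+ */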
+int
+lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
+{
+ switch (subcode) {
+ case 0:
+ return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
+ case 1:
+ return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
+ case 2:
+ return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
+ case 3:
+ return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
+ case 4:
+ return (lwp_rwlock_unlock(rwlp));
+ }
+ return (set_errno(EINVAL));
+}
+
+/*
+ * Return the owner of the user-level s-object.
+ * Since we can't really do this, return NULL.
+ */
+/* ARGSUSED */
+static kthread_t *
+lwpsobj_owner(caddr_t sobj)
+{
+ return ((kthread_t *)NULL);
+}
+
+/*
+ * Wake up a thread asleep on a user-level synchronization
+ * object.
+ */
+static void
+lwp_unsleep(kthread_t *t)
+{
+ ASSERT(THREAD_LOCK_HELD(t));
+ if (t->t_wchan0 != NULL) {
+ sleepq_head_t *sqh;
+ sleepq_t *sqp = t->t_sleepq;
+
+ if (sqp != NULL) {
+ sqh = lwpsqhash(&t->t_lwpchan);
+ ASSERT(&sqh->sq_queue == sqp);
+ sleepq_unsleep(t);
+ disp_lock_exit_high(&sqh->sq_lock);
+ CL_SETRUN(t);
+ return;
+ }
+ }
+ panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
+}
+
+/*
+ * Change the priority of a thread asleep on a user-level
+ * synchronization object. To maintain proper priority order,
+ * we:
+ * o dequeue the thread.
+ * o change its priority.
+ * o re-enqueue the thread.
+ * Assumption: the thread is locked on entry.
+ */
+static void
+lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
+{
+ ASSERT(THREAD_LOCK_HELD(t));
+ if (t->t_wchan0 != NULL) {
+ sleepq_t *sqp = t->t_sleepq;
+
+ sleepq_dequeue(t);
+ *t_prip = pri;
+ sleepq_insert(sqp, t);
+ } else
+ panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
+}
+
+/*
+ * Clean up a locked robust mutex
+ */
+static void
+lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
+{
+ uint16_t flag;
+ uchar_t waiters;
+ label_t ljb;
+ pid_t owner_pid;
+ lwp_mutex_t *lp;
+ volatile int locked = 0;
+ volatile int watched = 0;
+
+ ASSERT(ent->lwpchan_type & USYNC_PROCESS_ROBUST);
+
+ lp = (lwp_mutex_t *)ent->lwpchan_addr;
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
+ goto out;
+ }
+ fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
+ if (owner_pid != curproc->p_pid) {
+ goto out;
+ }
+ lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if ((flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) == 0) {
+ flag |= lockflg;
+ suword16_noerr(&lp->mutex_flag, flag);
+ }
+ suword32_noerr(&lp->mutex_ownerpid, 0);
+ ulock_clear(&lp->mutex_lockw);
+ fuword8_noerr(&lp->mutex_waiters, &waiters);
+ if (waiters && lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
+ suword8_noerr(&lp->mutex_waiters, waiters);
+ lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+}
+
+/*
+ * Register the mutex and initialize it if it is not already initialized
+ */
+int
+lwp_mutex_init(lwp_mutex_t *lp, int type)
+{
+ proc_t *p = curproc;
+ int error = 0;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ label_t ljb;
+ uint16_t flag;
+ lwpchan_t lwpchan;
+ pid_t owner_pid;
+
+ if ((caddr_t)lp >= (caddr_t)USERLIMIT)
+ return (set_errno(EFAULT));
+
+ if (type != USYNC_PROCESS_ROBUST)
+ return (set_errno(EINVAL));
+
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ /*
+ * Force Copy-on-write fault if lwp_mutex_t object is
+ * defined to be MAP_PRIVATE and it was initialized to
+ * USYNC_PROCESS.
+ */
+ suword8_noerr(&lp->mutex_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if (flag & LOCK_INITED) {
+ if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
+ fuword32_noerr(&lp->mutex_ownerpid,
+ (uint32_t *)&owner_pid);
+ if (owner_pid == p->p_pid) {
+ flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
+ suword16_noerr(&lp->mutex_flag, flag);
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ goto out;
+ }
+ }
+ error = EBUSY;
+ } else {
+ suword8_noerr(&lp->mutex_waiters, 0);
+ suword8_noerr(&lp->mutex_lockw, 0);
+ suword16_noerr(&lp->mutex_flag, LOCK_INITED);
+ suword32_noerr(&lp->mutex_ownerpid, 0);
+ }
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
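+/*
+ * Try to acquire the mutex without blocking; fail with EBUSY if it
+ * is already held.  UPI mutexes are handed off to lwp_upimutex_lock()
+ * with the UPIMUTEX_TRY flag.
+ */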
+int
+lwp_mutex_trylock(lwp_mutex_t *lp)
+{
+ kthread_t *t = curthread;
+ proc_t *p = ttoproc(t);
+ int error = 0;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ label_t ljb;
+ volatile uint8_t type = 0;
+ uint16_t flag;
+ lwpchan_t lwpchan;
+
+ if ((caddr_t)lp >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ (void) new_mstate(t, LMS_USER_LOCK);
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
+ if (UPIMUTEX(type)) {
+ no_fault();
+ error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
+ if ((error == 0 || error == EOWNERDEAD) &&
+ (type & USYNC_PROCESS))
+ (void) suword32(&lp->mutex_ownerpid, p->p_pid);
+ if (error)
+ return (set_errno(error));
+ return (0);
+ }
+ /*
+ * Force Copy-on-write fault if lwp_mutex_t object is
+ * defined to be MAP_PRIVATE and it was initialized to
+ * USYNC_PROCESS.
+ */
+ suword8_noerr(&lp->mutex_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ if (type & USYNC_PROCESS_ROBUST) {
+ fuword16_noerr((uint16_t *)(&lp->mutex_flag), &flag);
+ if (flag & LOCK_NOTRECOVERABLE) {
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = ENOTRECOVERABLE;
+ goto out;
+ }
+ }
+
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+
+ if (!ulock_try(&lp->mutex_lockw))
+ error = EBUSY;
+ else if (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
+ suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
+ if (type & USYNC_PROCESS_ROBUST) {
+ if (flag & LOCK_OWNERDEAD)
+ error = EOWNERDEAD;
+ else if (flag & LOCK_UNMAPPED)
+ error = ELOCKUNMAPPED;
+ }
+ }
+ locked = 0;
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+out:
+
+ if (t->t_mstate == LMS_USER_LOCK)
+ (void) new_mstate(t, LMS_SYSTEM);
+
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Unlock the mutex and unblock any lwps blocked trying to acquire it.
+ * The awakened lwps resume and retry acquiring the lock.
+ */
+int
+lwp_mutex_unlock(lwp_mutex_t *lp)
+{
+ proc_t *p = ttoproc(curthread);
+ lwpchan_t lwpchan;
+ uchar_t waiters;
+ volatile int locked = 0;
+ volatile int watched = 0;
+ volatile uint8_t type = 0;
+ label_t ljb;
+ uint16_t flag;
+ int error = 0;
+
+ if ((caddr_t)lp >= p->p_as->a_userlimit)
+ return (set_errno(EFAULT));
+
+ if (on_fault(&ljb)) {
+ if (locked)
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+ error = EFAULT;
+ goto out;
+ }
+ fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
+ if (UPIMUTEX(type)) {
+ no_fault();
+ error = lwp_upimutex_unlock(lp, type);
+ if (error)
+ return (set_errno(error));
+ return (0);
+ }
+
+ watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+
+ /*
+ * Force Copy-on-write fault if lwp_mutex_t object is
+ * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
+ */
+ suword8_noerr(&lp->mutex_type, type);
+ if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
+ &lwpchan, LWPCHAN_MPPOOL)) {
+ error = EFAULT;
+ goto out;
+ }
+ lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
+ locked = 1;
+ if (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
+ if (type & USYNC_PROCESS_ROBUST) {
+ fuword16_noerr(&lp->mutex_flag, &flag);
+ if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
+ flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
+ flag |= LOCK_NOTRECOVERABLE;
+ suword16_noerr(&lp->mutex_flag, flag);
+ }
+ }
+ suword32_noerr(&lp->mutex_ownerpid, 0);
+ }
+ ulock_clear(&lp->mutex_lockw);
+ /*
+ * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
+ * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
+ * may fail. If it fails, do not write into the waiter bit.
+ * The call to lwp_release() might fail due to one of three reasons:
+ *
+	 *	 1. the thread which set the waiter bit did not actually
+	 *	    sleep because it got the lock on the re-try. The waiter
+	 *	    bit will then be correctly updated by that thread. This
+	 *	    window may be closed by reading the waiter bit again here
+	 *	    and not calling lwp_release() at all if it is zero.
+	 *	 2. the thread which set the waiter bit and went to sleep
+	 *	    was woken up by a signal. In that case, the waiter
+	 *	    recomputes the waiter bit on the EINTR return path.
+ * 3. the waiter bit read by lwp_mutex_wakeup() was in
+ * memory that has been re-used after the lock was dropped.
+ * In this case, writing into the waiter bit would cause data
+ * corruption.
+ */
+ fuword8_noerr(&lp->mutex_waiters, &waiters);
+ if (waiters) {
+ if ((type & USYNC_PROCESS_ROBUST) &&
+ (flag & LOCK_NOTRECOVERABLE)) {
+ lwp_release_all(&lwpchan);
+ suword8_noerr(&lp->mutex_waiters, 0);
+ } else if (lwp_release(&lwpchan, &waiters, 0) == 1) {
+ suword8_noerr(&lp->mutex_waiters, waiters);
+ }
+ }
+
+ lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
+out:
+ no_fault();
+ if (watched)
+ watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/lwp_timer.c b/usr/src/uts/common/syscall/lwp_timer.c
new file mode 100644
index 0000000000..7c1d862bea
--- /dev/null
+++ b/usr/src/uts/common/syscall/lwp_timer.c
@@ -0,0 +1,216 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/debug.h>
+#include <sys/mutex.h>
+#include <sys/timer.h>
+#include <sys/lwp_timer_impl.h>
+
+/*
+ * lwp_timer_timeout() is called from a timeout set up in lwp_cond_wait(),
+ * lwp_mutex_timedlock(), lwp_sema_timedwait() or lwp_rwlock_lock().
+ *
+ * It recomputes the time remaining until the absolute time when the
+ * wait is supposed to timeout and either calls realtime_timeout()
+ * to reschedule itself or calls setrun() on the sleeping thread.
+ *
+ * This is done to ensure that the waiting thread does not wake up
+ * due to timer expiration until the absolute future time of the
+ * timeout has been reached. Until that time, the thread must
+ * remain on its sleep queue.
+ *
+ * An lwp_timer_t structure is used to pass information
+ * about the sleeping thread to the timeout function.
+ */
+
+static void
+lwp_timer_timeout(void *arg)
+{
+ lwp_timer_t *lwptp = arg;
+ kthread_t *t = lwptp->lwpt_thread;
+ timespec_t now;
+
+ mutex_enter(&t->t_delay_lock);
+ gethrestime(&now);
+ /*
+ * Requeue the timeout if no one has reset the system time
+ * and if the absolute future time has not been reached.
+ */
+ if (lwptp->lwpt_timecheck == timechanged &&
+ (lwptp->lwpt_rqtime.tv_sec > now.tv_sec ||
+ (lwptp->lwpt_rqtime.tv_sec == now.tv_sec &&
+ lwptp->lwpt_rqtime.tv_nsec > now.tv_nsec))) {
+ lwptp->lwpt_id = realtime_timeout(lwp_timer_timeout, lwptp,
+ timespectohz_adj(&lwptp->lwpt_rqtime, now));
+ } else {
+ /*
+ * Set the thread running only if it is asleep on
+ * its lwpchan sleep queue (not if it is asleep on
+ * the t_delay_lock mutex).
+ */
+ thread_lock(t);
+ if (t->t_state == TS_SLEEP &&
+ (t->t_flag & T_WAKEABLE) &&
+ t->t_wchan0 != NULL)
+ setrun_locked(t);
+ thread_unlock(t);
+ }
+ mutex_exit(&t->t_delay_lock);
+}
+
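+/*
+ * Callers of the routines below follow a common protocol, visible in
+ * lwp_sema_timedwait() for example:
+ *
+ *	(void) lwp_timer_copyin(&lwpt, tsp);	validate/convert timeout
+ *	mutex_enter(&t->t_delay_lock);
+ *	if (lwp_timer_enqueue(&lwpt) != 0)	arm the timeout, or
+ *		imm_timeout = 1;		time out immediately
+ *	... block on the lwpchan sleep queue ...
+ *	tim = lwp_timer_dequeue(&lwpt);		cancel the timeout
+ *	error = lwp_timer_copyout(&lwpt, error); copy out residual time
+ */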
+int
+lwp_timer_copyin(lwp_timer_t *lwptp, timespec_t *tsp)
+{
+ timespec_t now;
+ int error = 0;
+
+ if (tsp == NULL) /* not really an error, just need to bzero() */
+ goto err;
+ lwptp->lwpt_timecheck = timechanged; /* do this before gethrestime() */
+ gethrestime(&now); /* do this before copyin() */
+ if (curproc->p_model == DATAMODEL_NATIVE) {
+ if (copyin(tsp, &lwptp->lwpt_rqtime, sizeof (timespec_t))) {
+ error = EFAULT;
+ goto err;
+ }
+ } else {
+ timespec32_t ts32;
+ if (copyin(tsp, &ts32, sizeof (timespec32_t))) {
+ error = EFAULT;
+ goto err;
+ }
+ TIMESPEC32_TO_TIMESPEC(&lwptp->lwpt_rqtime, &ts32);
+ }
+ if (itimerspecfix(&lwptp->lwpt_rqtime)) {
+ error = EINVAL;
+ goto err;
+ }
+ /*
+ * Unless the requested timeout is zero,
+ * get the precise future (absolute) time at
+ * which we are to time out and return ETIME.
+ * We must not return ETIME before that time.
+ */
+ if (lwptp->lwpt_rqtime.tv_sec == 0 && lwptp->lwpt_rqtime.tv_nsec == 0) {
+ bzero(lwptp, sizeof (lwp_timer_t));
+ lwptp->lwpt_imm_timeout = 1;
+ } else {
+ lwptp->lwpt_thread = curthread;
+ lwptp->lwpt_tsp = tsp;
+ lwptp->lwpt_time_error = 0;
+ lwptp->lwpt_id = 0;
+ lwptp->lwpt_imm_timeout = 0;
+ timespecadd(&lwptp->lwpt_rqtime, &now);
+ }
+ return (0);
+err:
+ bzero(lwptp, sizeof (lwp_timer_t));
+ lwptp->lwpt_time_error = error;
+ return (error);
+}
+
+int
+lwp_timer_enqueue(lwp_timer_t *lwptp)
+{
+ timespec_t now;
+
+ ASSERT(lwptp->lwpt_thread == curthread);
+ ASSERT(MUTEX_HELD(&curthread->t_delay_lock));
+ gethrestime(&now);
+ if (lwptp->lwpt_timecheck == timechanged &&
+ (lwptp->lwpt_rqtime.tv_sec > now.tv_sec ||
+ (lwptp->lwpt_rqtime.tv_sec == now.tv_sec &&
+ lwptp->lwpt_rqtime.tv_nsec > now.tv_nsec))) {
+ /*
+ * Queue the timeout.
+ */
+ lwptp->lwpt_id = realtime_timeout(lwp_timer_timeout, lwptp,
+ timespectohz_adj(&lwptp->lwpt_rqtime, now));
+ return (0);
+ }
+
+ /*
+ * Time has already run out or someone reset the system time;
+ * just cause an immediate timeout.
+ */
+ lwptp->lwpt_imm_timeout = 1;
+ return (1);
+}
+
+clock_t
+lwp_timer_dequeue(lwp_timer_t *lwptp)
+{
+ kthread_t *t = curthread;
+ clock_t tim = -1;
+ timeout_id_t tmp_id;
+
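+	/*
+	 * Loop because lwp_timer_timeout() may reschedule itself
+	 * (setting lwpt_id again) while we drop t_delay_lock across
+	 * the call to untimeout().
+	 */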
+ mutex_enter(&t->t_delay_lock);
+ while ((tmp_id = lwptp->lwpt_id) != 0) {
+ lwptp->lwpt_id = 0;
+ mutex_exit(&t->t_delay_lock);
+ tim = untimeout(tmp_id);
+ mutex_enter(&t->t_delay_lock);
+ }
+ mutex_exit(&t->t_delay_lock);
+ return (tim);
+}
+
+int
+lwp_timer_copyout(lwp_timer_t *lwptp, int error)
+{
+ timespec_t rmtime;
+ timespec_t now;
+
+ if (lwptp->lwpt_tsp == NULL) /* nothing to do */
+ return (error);
+
+ rmtime.tv_sec = rmtime.tv_nsec = 0;
+ if (error != ETIME) {
+ gethrestime(&now);
+ if ((now.tv_sec < lwptp->lwpt_rqtime.tv_sec) ||
+ ((now.tv_sec == lwptp->lwpt_rqtime.tv_sec) &&
+ (now.tv_nsec < lwptp->lwpt_rqtime.tv_nsec))) {
+ rmtime = lwptp->lwpt_rqtime;
+ timespecsub(&rmtime, &now);
+ }
+ }
+ if (curproc->p_model == DATAMODEL_NATIVE) {
+ if (copyout(&rmtime, lwptp->lwpt_tsp, sizeof (timespec_t)))
+ error = EFAULT;
+ } else {
+ timespec32_t rmtime32;
+
+ TIMESPEC_TO_TIMESPEC32(&rmtime32, &rmtime);
+ if (copyout(&rmtime32, lwptp->lwpt_tsp, sizeof (timespec32_t)))
+ error = EFAULT;
+ }
+
+ return (error);
+}
diff --git a/usr/src/uts/common/syscall/lwpsys.c b/usr/src/uts/common/syscall/lwpsys.c
new file mode 100644
index 0000000000..8868468a44
--- /dev/null
+++ b/usr/src/uts/common/syscall/lwpsys.c
@@ -0,0 +1,563 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/prsystm.h>
+#include <sys/cred.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/signal.h>
+#include <sys/kmem.h>
+#include <sys/unistd.h>
+#include <sys/cmn_err.h>
+#include <sys/schedctl.h>
+#include <sys/debug.h>
+#include <sys/contract/process_impl.h>
+
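+/*
+ * Look up the kthread for the given lwpid in this process's lwp
+ * directory; returns NULL if there is no such lwp.  The caller is
+ * expected to hold p->p_lock.
+ */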
+kthread_t *
+idtot(proc_t *p, id_t lwpid)
+{
+ lwpdir_t *ldp;
+
+ if ((ldp = lwp_hash_lookup(p, lwpid)) != NULL)
+ return (ldp->ld_entry->le_thread);
+ return (NULL);
+}
+
+/*
+ * Stop an lwp of the current process
+ */
+int
+syslwp_suspend(id_t lwpid)
+{
+ kthread_t *t;
+ int error;
+ proc_t *p = ttoproc(curthread);
+
+ mutex_enter(&p->p_lock);
+ if ((t = idtot(p, lwpid)) == NULL)
+ error = ESRCH;
+ else
+ error = lwp_suspend(t);
+ mutex_exit(&p->p_lock);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+int
+syslwp_continue(id_t lwpid)
+{
+ kthread_t *t;
+ proc_t *p = ttoproc(curthread);
+
+ mutex_enter(&p->p_lock);
+ if ((t = idtot(p, lwpid)) == NULL) {
+ mutex_exit(&p->p_lock);
+ return (set_errno(ESRCH));
+ }
+ lwp_continue(t);
+ mutex_exit(&p->p_lock);
+ return (0);
+}
+
+int
+lwp_kill(id_t lwpid, int sig)
+{
+ sigqueue_t *sqp;
+ kthread_t *t;
+ proc_t *p = ttoproc(curthread);
+
+ if (sig < 0 || sig >= NSIG)
+ return (set_errno(EINVAL));
+ if (sig != 0)
+ sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
+ mutex_enter(&p->p_lock);
+ if ((t = idtot(p, lwpid)) == NULL) {
+ mutex_exit(&p->p_lock);
+ if (sig != 0)
+ kmem_free(sqp, sizeof (sigqueue_t));
+ return (set_errno(ESRCH));
+ }
+ if (sig == 0) {
+ mutex_exit(&p->p_lock);
+ return (0);
+ }
+ sqp->sq_info.si_signo = sig;
+ sqp->sq_info.si_code = SI_LWP;
+ sqp->sq_info.si_pid = p->p_pid;
+ sqp->sq_info.si_ctid = PRCTID(p);
+ sqp->sq_info.si_zoneid = getzoneid();
+ sqp->sq_info.si_uid = crgetruid(CRED());
+ sigaddqa(p, t, sqp);
+ mutex_exit(&p->p_lock);
+ return (0);
+}
+
+/*
+ * This is the specification of lwp_wait() from the _lwp_wait(2) manual page:
+ *
+ * The lwp_wait() function blocks the current lwp until the lwp specified
+ * by 'lwpid' terminates. If the specified lwp terminated prior to the call
+ * to lwp_wait(), then lwp_wait() returns immediately. If 'lwpid' is zero,
+ * then lwp_wait() waits for any undetached lwp in the current process.
+ * If 'lwpid' is not zero, then it must specify an undetached lwp in the
+ * current process. If 'departed' is not NULL, then it points to a location
+ * where the id of the exited lwp is stored.
+ *
+ * When an lwp exits and there are one or more lwps in the process waiting
+ * for this specific lwp to exit, then one of the waiting lwps is unblocked
+ * and it returns from lwp_wait() successfully. Any other lwps waiting for
+ * this same lwp to exit are also unblocked, however, they return from
+ * lwp_wait() with the error ESRCH. If there are no lwps in the process
+ * waiting for this specific lwp to exit but there are one or more lwps
+ * waiting for any lwp to exit, then one of the waiting lwps is unblocked
+ * and it returns from lwp_wait() successfully.
+ *
+ * If an lwp is waiting for any lwp to exit, it blocks until an undetached
+ * lwp for which no other lwp is waiting terminates, at which time it returns
+ * successfully, or until all other lwps in the process are either daemon
+ * lwps or lwps waiting in lwp_wait(), in which case it returns EDEADLK.
+ */
+int
+lwp_wait(id_t lwpid, id_t *departed)
+{
+ proc_t *p = ttoproc(curthread);
+ int error = 0;
+ int daemon = (curthread->t_proc_flag & TP_DAEMON)? 1 : 0;
+ lwpent_t *target_lep;
+ lwpdir_t *ldp;
+ lwpent_t *lep;
+
+ /*
+ * lwp_wait() is not supported for the /proc agent lwp.
+ */
+ if (curthread == p->p_agenttp)
+ return (set_errno(ENOTSUP));
+
+ mutex_enter(&p->p_lock);
+ prbarrier(p);
+
+ curthread->t_waitfor = lwpid;
+ p->p_lwpwait++;
+ p->p_lwpdwait += daemon;
+
+ if (lwpid != 0) {
+ if ((ldp = lwp_hash_lookup(p, lwpid)) == NULL)
+ target_lep = NULL;
+ else {
+ target_lep = ldp->ld_entry;
+ target_lep->le_waiters++;
+ target_lep->le_dwaiters += daemon;
+ }
+ }
+
+ while (error == 0) {
+ kthread_t *t;
+ id_t tid;
+ int i;
+
+ if (lwpid != 0) {
+ /*
+ * Look for a specific zombie lwp.
+ */
+ if (target_lep == NULL)
+ error = ESRCH;
+ else if ((t = target_lep->le_thread) != NULL) {
+ if (!(t->t_proc_flag & TP_TWAIT))
+ error = EINVAL;
+ } else {
+ /*
+ * We found the zombie we are waiting for.
+ */
+ ASSERT(p->p_zombcnt > 0);
+ p->p_zombcnt--;
+ p->p_lwpwait--;
+ p->p_lwpdwait -= daemon;
+ curthread->t_waitfor = -1;
+ lwp_hash_out(p, lwpid);
+ mutex_exit(&p->p_lock);
+ if (departed != NULL &&
+ copyout(&lwpid, departed, sizeof (id_t)))
+ return (set_errno(EFAULT));
+ return (0);
+ }
+ } else {
+ /*
+ * Look for any zombie lwp.
+ */
+ int some_non_daemon_will_return = 0;
+
+ /* for each entry in the lwp directory... */
+ ldp = p->p_lwpdir;
+ for (i = 0; i < p->p_lwpdir_sz; i++, ldp++) {
+
+ if ((lep = ldp->ld_entry) == NULL ||
+ lep->le_thread != NULL)
+ continue;
+
+ /*
+ * We found a zombie lwp. If there is some
+ * other thread waiting specifically for the
+ * zombie we just found, then defer to the other
+ * waiting thread and continue searching for
+ * another zombie. Also check to see if there
+ * is some non-daemon thread sleeping here in
+ * lwp_wait() that will succeed and return when
+ * we drop p->p_lock. This is tested below.
+ */
+ tid = lep->le_lwpid;
+ if (lep->le_waiters != 0) {
+ if (lep->le_waiters - lep->le_dwaiters)
+ some_non_daemon_will_return = 1;
+ continue;
+ }
+
+ /*
+ * We found a zombie that no one else
+ * is specifically waiting for.
+ */
+ ASSERT(p->p_zombcnt > 0);
+ p->p_zombcnt--;
+ p->p_lwpwait--;
+ p->p_lwpdwait -= daemon;
+ curthread->t_waitfor = -1;
+ lwp_hash_out(p, tid);
+ mutex_exit(&p->p_lock);
+ if (departed != NULL &&
+ copyout(&tid, departed, sizeof (id_t)))
+ return (set_errno(EFAULT));
+ return (0);
+ }
+
+ /*
+ * We are waiting for anyone. If all non-daemon lwps
+ * are waiting here, and if we determined above that
+ * no non-daemon lwp will return, we have deadlock.
+ */
+ if (!some_non_daemon_will_return &&
+ p->p_lwpcnt == p->p_lwpdaemon +
+ (p->p_lwpwait - p->p_lwpdwait))
+ error = EDEADLK;
+ }
+
+ if (error == 0 && lwpid != 0) {
+ /*
+ * We are waiting for a specific non-zombie lwp.
+ * Fail if there is a deadlock loop.
+ */
+ for (;;) {
+ if (t == curthread) {
+ error = EDEADLK;
+ break;
+ }
+				/* who is that lwp waiting for? */
+ if ((tid = t->t_waitfor) == -1)
+ break;
+ if (tid == 0) {
+ /*
+ * The lwp we are waiting for is
+ * waiting for anyone (transitively).
+ * If there are no zombies right now
+ * and if we would have deadlock due
+ * to all non-daemon lwps waiting here,
+ * wake up the lwp that is waiting for
+ * anyone so it can return EDEADLK.
+ */
+ if (p->p_zombcnt == 0 &&
+ p->p_lwpcnt == p->p_lwpdaemon +
+ p->p_lwpwait - p->p_lwpdwait)
+ cv_broadcast(&p->p_lwpexit);
+ break;
+ }
+ if ((ldp = lwp_hash_lookup(p, tid)) == NULL ||
+ (t = ldp->ld_entry->le_thread) == NULL)
+ break;
+ }
+ }
+
+ if (error)
+ break;
+
+ /*
+ * Wait for some lwp to terminate.
+ */
+ if (!cv_wait_sig(&p->p_lwpexit, &p->p_lock))
+ error = EINTR;
+ prbarrier(p);
+
+ if (lwpid != 0) {
+ if ((ldp = lwp_hash_lookup(p, lwpid)) == NULL)
+ target_lep = NULL;
+ else
+ target_lep = ldp->ld_entry;
+ }
+ }
+
+ if (lwpid != 0 && target_lep != NULL) {
+ target_lep->le_waiters--;
+ target_lep->le_dwaiters -= daemon;
+ }
+ p->p_lwpwait--;
+ p->p_lwpdwait -= daemon;
+ curthread->t_waitfor = -1;
+ mutex_exit(&p->p_lock);
+ return (set_errno(error));
+}
+
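+/*
+ * Detach the specified lwp: clear its TP_TWAIT flag so that no one
+ * can wait for it (waking any current waiters to re-evaluate), or,
+ * if it is already a zombie, reap it immediately.
+ */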
+int
+lwp_detach(id_t lwpid)
+{
+ kthread_t *t;
+ proc_t *p = ttoproc(curthread);
+ lwpdir_t *ldp;
+ int error = 0;
+
+ mutex_enter(&p->p_lock);
+ prbarrier(p);
+ if ((ldp = lwp_hash_lookup(p, lwpid)) == NULL)
+ error = ESRCH;
+ else if ((t = ldp->ld_entry->le_thread) != NULL) {
+ if (!(t->t_proc_flag & TP_TWAIT))
+ error = EINVAL;
+ else {
+ t->t_proc_flag &= ~TP_TWAIT;
+ cv_broadcast(&p->p_lwpexit);
+ }
+ } else {
+ ASSERT(p->p_zombcnt > 0);
+ p->p_zombcnt--;
+ lwp_hash_out(p, lwpid);
+ }
+ mutex_exit(&p->p_lock);
+
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Unpark the specified lwp.
+ */
+static int
+lwp_unpark(id_t lwpid)
+{
+ proc_t *p = ttoproc(curthread);
+ kthread_t *t;
+ int error = 0;
+
+ mutex_enter(&p->p_lock);
+ if ((t = idtot(p, lwpid)) == NULL)
+ error = ESRCH;
+ else {
+ mutex_enter(&t->t_delay_lock);
+ t->t_unpark = 1;
+ cv_signal(&t->t_delay_cv);
+ mutex_exit(&t->t_delay_lock);
+ }
+ mutex_exit(&p->p_lock);
+ return (error);
+}
+
+/*
+ * Sleep until we are set running by lwp_unpark() or until we are
+ * interrupted by a signal or until we exhaust our timeout.
+ * timeoutp is an in/out parameter. On entry, it contains the relative
+ * time until timeout. On exit, we copyout the residual time left to it.
+ */
+static int
+lwp_park(timespec_t *timeoutp, id_t lwpid)
+{
+ timespec_t rqtime;
+ timespec_t rmtime;
+ timespec_t now;
+ timespec_t *rqtp = NULL;
+ kthread_t *t = curthread;
+ int timecheck = 0;
+ int error = 0;
+ model_t datamodel = ttoproc(t)->p_model;
+
+ if (lwpid != 0) /* unpark the other lwp, if any */
+ (void) lwp_unpark(lwpid);
+
+ if (timeoutp) {
+ timecheck = timechanged;
+ gethrestime(&now);
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyin(timeoutp, &rqtime, sizeof (timespec_t))) {
+ error = EFAULT;
+ goto out;
+ }
+ } else {
+ timespec32_t timeout32;
+
+ if (copyin(timeoutp, &timeout32, sizeof (timeout32))) {
+ error = EFAULT;
+ goto out;
+ }
+ TIMESPEC32_TO_TIMESPEC(&rqtime, &timeout32)
+ }
+
+ if (itimerspecfix(&rqtime)) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * Convert the timespec value into absolute time.
+ */
+ timespecadd(&rqtime, &now);
+ rqtp = &rqtime;
+ }
+
+ (void) new_mstate(t, LMS_USER_LOCK);
+
+ mutex_enter(&t->t_delay_lock);
+ if (!schedctl_is_park())
+ error = EINTR;
+ while (error == 0 && t->t_unpark == 0) {
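+		/*
+		 * cv_waituntil_sig() returns 0 if we were interrupted
+		 * by a signal, -1 if the timeout expired, and a positive
+		 * value on a normal wakeup, in which case we loop around
+		 * to re-check t_unpark.
+		 */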
+ switch (cv_waituntil_sig(&t->t_delay_cv,
+ &t->t_delay_lock, rqtp, timecheck)) {
+ case 0:
+ error = EINTR;
+ break;
+ case -1:
+ error = ETIME;
+ break;
+ }
+ }
+ t->t_unpark = 0;
+ mutex_exit(&t->t_delay_lock);
+
+ if (timeoutp != NULL) {
+ rmtime.tv_sec = rmtime.tv_nsec = 0;
+ if (error != ETIME) {
+ gethrestime(&now);
+ if ((now.tv_sec < rqtime.tv_sec) ||
+ ((now.tv_sec == rqtime.tv_sec) &&
+ (now.tv_nsec < rqtime.tv_nsec))) {
+ rmtime = rqtime;
+ timespecsub(&rmtime, &now);
+ }
+ }
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyout(&rmtime, timeoutp, sizeof (rmtime)))
+ error = EFAULT;
+ } else {
+ timespec32_t rmtime32;
+
+ TIMESPEC_TO_TIMESPEC32(&rmtime32, &rmtime);
+ if (copyout(&rmtime32, timeoutp, sizeof (rmtime32)))
+ error = EFAULT;
+ }
+ }
+out:
+ schedctl_unpark();
+ if (t->t_mstate == LMS_USER_LOCK)
+ (void) new_mstate(t, LMS_SYSTEM);
+ return (error);
+}
+
+#define MAXLWPIDS 1024
+
+/*
+ * Unpark all of the specified lwps.
+ * Do it in chunks of MAXLWPIDS to avoid allocating too much memory.
+ */
+static int
+lwp_unpark_all(id_t *lwpidp, int nids)
+{
+ proc_t *p = ttoproc(curthread);
+ kthread_t *t;
+ int error = 0;
+ id_t *lwpid;
+ size_t lwpidsz;
+ int n;
+ int i;
+
+ if (nids <= 0)
+ return (EINVAL);
+
+ lwpidsz = MIN(nids, MAXLWPIDS) * sizeof (id_t);
+ lwpid = kmem_alloc(lwpidsz, KM_SLEEP);
+ while (nids > 0) {
+ n = MIN(nids, MAXLWPIDS);
+ if (copyin(lwpidp, lwpid, n * sizeof (id_t))) {
+ error = EFAULT;
+ break;
+ }
+ mutex_enter(&p->p_lock);
+ for (i = 0; i < n; i++) {
+ if ((t = idtot(p, lwpid[i])) == NULL)
+ error = ESRCH;
+ else {
+ mutex_enter(&t->t_delay_lock);
+ t->t_unpark = 1;
+ cv_signal(&t->t_delay_cv);
+ mutex_exit(&t->t_delay_lock);
+ }
+ }
+ mutex_exit(&p->p_lock);
+ lwpidp += n;
+ nids -= n;
+ }
+ kmem_free(lwpid, lwpidsz);
+ return (error);
+}
+
+/*
+ * SYS_lwp_park() system call.
+ */
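+/*
+ * Subcode 0 parks the caller (optionally unparking one peer first),
+ * subcode 1 unparks a single lwp, and subcode 2 unparks a list.
+ */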
+int
+syslwp_park(int which, uintptr_t arg1, uintptr_t arg2)
+{
+ int error;
+
+ switch (which) {
+ case 0:
+ error = lwp_park((timespec_t *)arg1, (id_t)arg2);
+ break;
+ case 1:
+ error = lwp_unpark((id_t)arg1);
+ break;
+ case 2:
+ error = lwp_unpark_all((id_t *)arg1, (int)arg2);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c
new file mode 100644
index 0000000000..5e162c2002
--- /dev/null
+++ b/usr/src/uts/common/syscall/memcntl.c
@@ -0,0 +1,394 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/bitmap.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/unistd.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/mman.h>
+#include <sys/tuneable.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/vmsystm.h>
+#include <sys/debug.h>
+#include <sys/policy.h>
+
+#include <vm/as.h>
+#include <vm/seg.h>
+
+static uint_t mem_getpgszc(size_t);
+
+/*
+ * Memory control operations
+ */
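+/*
+ * memcntl() validates its arguments and hands most commands off to
+ * as_ctl(); MC_HAT_ADVISE (preferred page size selection) is handled
+ * in-line below.
+ */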
+int
+memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask)
+{
+ struct as *as = ttoproc(curthread)->p_as;
+ struct proc *p = ttoproc(curthread);
+ size_t pgsz;
+ uint_t szc, oszc, pgcmd;
+ int error = 0;
+ faultcode_t fc;
+ uintptr_t iarg;
+ STRUCT_DECL(memcntl_mha, mha);
+
+ if (mask)
+ return (set_errno(EINVAL));
+ if ((cmd == MC_LOCKAS) || (cmd == MC_UNLOCKAS)) {
+ if ((addr != 0) || (len != 0)) {
+ return (set_errno(EINVAL));
+ }
+ } else if (cmd != MC_HAT_ADVISE) {
+ if (((uintptr_t)addr & PAGEOFFSET) != 0 || len == 0) {
+ return (set_errno(EINVAL));
+ }
+ /*
+ * We're only concerned with the address range
+ * here, not the protections. The protections
+ * are only used as a "filter" in this code,
+ * they aren't set or modified here.
+ */
+ if (valid_usr_range(addr, len, 0, as,
+ as->a_userlimit) != RANGE_OKAY) {
+ return (set_errno(ENOMEM));
+ }
+ }
+
+ if (cmd == MC_HAT_ADVISE) {
+ if (attr != 0 || mask != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ } else {
+ if ((VALID_ATTR & attr) != attr) {
+ return (set_errno(EINVAL));
+ }
+ if ((attr & SHARED) && (attr & PRIVATE)) {
+ return (set_errno(EINVAL));
+ }
+ if (((cmd == MC_LOCKAS) || (cmd == MC_LOCK) ||
+ (cmd == MC_UNLOCKAS) || (cmd == MC_UNLOCK)) &&
+ (error = secpolicy_lock_memory(CRED())) != 0)
+ return (set_errno(error));
+ }
+ if (attr) {
+ attr |= PROT_USER;
+ }
+
+ switch (cmd) {
+ case MC_SYNC:
+ /*
+ * MS_SYNC used to be defined to be zero but is now non-zero.
+ * For binary compatibility we still accept zero
+ * (the absence of MS_ASYNC) to mean the same thing.
+ */
+ iarg = (uintptr_t)arg;
+ if ((iarg & ~MS_INVALIDATE) == 0)
+ iarg |= MS_SYNC;
+
+ if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) ||
+ ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) {
+ error = set_errno(EINVAL);
+ } else {
+ error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0);
+ if (error) {
+ (void) set_errno(error);
+ }
+ }
+ return (error);
+ case MC_LOCKAS:
+ if ((uintptr_t)arg & ~(MCL_FUTURE|MCL_CURRENT) ||
+ (uintptr_t)arg == 0) {
+ return (set_errno(EINVAL));
+ }
+ break;
+ case MC_LOCK:
+ case MC_UNLOCKAS:
+ case MC_UNLOCK:
+ break;
+ case MC_HAT_ADVISE:
+ /*
+		 * Set preferred page size.
+ */
+ STRUCT_INIT(mha, get_udatamodel());
+ if (copyin(arg, STRUCT_BUF(mha), STRUCT_SIZE(mha))) {
+ return (set_errno(EFAULT));
+ }
+
+ pgcmd = STRUCT_FGET(mha, mha_cmd);
+
+ /*
+ * Currently only MHA_MAPSIZE_VA, MHA_MAPSIZE_STACK
+ * and MHA_MAPSIZE_BSSBRK are supported. Only one
+ * command may be specified at a time.
+ */
+ if ((~(MHA_MAPSIZE_VA|MHA_MAPSIZE_STACK|MHA_MAPSIZE_BSSBRK) &
+ pgcmd) || pgcmd == 0 || !ISP2(pgcmd) ||
+ STRUCT_FGET(mha, mha_flags))
+ return (set_errno(EINVAL));
+
+ pgsz = STRUCT_FGET(mha, mha_pagesize);
+
+ /*
+ * call platform specific map_pgsz() routine to get the
+ * optimal pgsz if pgsz is 0.
+ *
+ * For stack and heap operations addr and len must be zero.
+ */
+ if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
+ if (addr != NULL || len != 0) {
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * Disable autompss for this process unless pgsz == 0,
+ * which means the system should pick. In the
+ * pgsz == 0 case, leave the SAUTOLPG setting alone, as
+ * we don't want to enable it when someone has
+ * disabled automatic large page selection for the
+ * whole system.
+ */
+ mutex_enter(&p->p_lock);
+ if (pgsz != 0) {
+ p->p_flag &= ~SAUTOLPG;
+ }
+ mutex_exit(&p->p_lock);
+
+ as_rangelock(as);
+
+ if (pgsz == 0) {
+ int type;
+
+ if (pgcmd == MHA_MAPSIZE_BSSBRK)
+ type = MAPPGSZ_HEAP;
+ else
+ type = MAPPGSZ_STK;
+
+ pgsz = map_pgsz(type, p, 0, 0, NULL);
+ }
+ } else {
+ /*
+ * Note that we don't disable automatic large page
+ * selection for anon segments based on use of
+ * memcntl().
+ */
+ if (pgsz == 0) {
+ pgsz = map_pgsz(MAPPGSZ_VA, p, addr, len,
+ NULL);
+ }
+
+ /*
+			 * addr and len must be preferred page size aligned
+ * and valid for range specified.
+ */
+ if (!IS_P2ALIGNED(addr, pgsz) ||
+ !IS_P2ALIGNED(len, pgsz)) {
+ return (set_errno(EINVAL));
+ }
+ if (valid_usr_range(addr, len, 0, as,
+ as->a_userlimit) != RANGE_OKAY) {
+ return (set_errno(ENOMEM));
+ }
+ }
+
+ szc = mem_getpgszc(pgsz);
+ if (szc == (uint_t)-1) {
+ if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK))
+ != 0) {
+ as_rangeunlock(as);
+ }
+ return (set_errno(EINVAL));
+ }
+
+ /*
+ * For stack and heap operations we first need to pad
+ * out existing range (create new mappings) to the new
+		 * preferred page size boundary. Also the start of the
+		 * .bss for the heap or user's stack base may not be on
+		 * the new preferred page size boundary. For these cases
+		 * we align the base of the request on the new preferred
+ * page size.
+ */
+ if (pgcmd & MHA_MAPSIZE_BSSBRK) {
+ if (szc == p->p_brkpageszc) {
+ as_rangeunlock(as);
+ return (0);
+ }
+ if (szc > p->p_brkpageszc) {
+ error = brk_internal(p->p_brkbase
+ + p->p_brksize, szc);
+ if (error) {
+ as_rangeunlock(as);
+ return (set_errno(error));
+ }
+ }
+ oszc = p->p_brkpageszc;
+ p->p_brkpageszc = szc;
+
+ ASSERT(IS_P2ALIGNED(p->p_brkbase + p->p_brksize, pgsz));
+ addr = (caddr_t)P2ROUNDUP((uintptr_t)p->p_bssbase,
+ pgsz);
+ len = (p->p_brkbase + p->p_brksize) - addr;
+ ASSERT(IS_P2ALIGNED(len, pgsz));
+ /*
+ * Perhaps no existing pages to promote.
+ */
+ if (len == 0) {
+ as_rangeunlock(as);
+ return (0);
+ }
+ }
+ /*
+ * The code below, as does grow.c, assumes stacks always grow
+ * downward.
+ */
+ if (pgcmd & MHA_MAPSIZE_STACK) {
+ /*
+ * Some boxes (x86) have a top of stack that
+ * is not large page aligned. Since stacks are
+ * usually small we'll just return and do nothing
+			 * for these cases. Preferred page size is advisory
+ * so no need to return an error.
+ */
+ if (szc == p->p_stkpageszc ||
+ !IS_P2ALIGNED(p->p_usrstack, pgsz)) {
+ as_rangeunlock(as);
+ return (0);
+ }
+
+ if (szc > p->p_stkpageszc) {
+ error = grow_internal(p->p_usrstack
+ - p->p_stksize, szc);
+ if (error) {
+ as_rangeunlock(as);
+ return (set_errno(error));
+ }
+ }
+ oszc = p->p_stkpageszc;
+ p->p_stkpageszc = szc;
+
+ ASSERT(IS_P2ALIGNED(p->p_usrstack, pgsz));
+ addr = p->p_usrstack - p->p_stksize;
+ len = p->p_stksize;
+
+ /*
+ * Perhaps nothing to promote, we wrapped around
+			 * or grow_internal() did not grow the stack to a large
+ * page boundary.
+ */
+ if (!IS_P2ALIGNED(len, pgsz) || len == 0 ||
+ addr >= p->p_usrstack || (addr + len) < addr) {
+ as_rangeunlock(as);
+ return (0);
+ }
+ }
+ ASSERT(IS_P2ALIGNED(addr, pgsz));
+ ASSERT(IS_P2ALIGNED(len, pgsz));
+ error = as_setpagesize(as, addr, len, szc, B_TRUE);
+
+ /*
+ * On stack or heap failures restore original
+ * pg size code.
+ */
+ if (error) {
+ if ((pgcmd & MHA_MAPSIZE_BSSBRK) != 0) {
+ p->p_brkpageszc = oszc;
+ }
+ if ((pgcmd & MHA_MAPSIZE_STACK) != 0) {
+ p->p_stkpageszc = oszc;
+ }
+ (void) set_errno(error);
+ }
+ if ((pgcmd & (MHA_MAPSIZE_BSSBRK|MHA_MAPSIZE_STACK)) != 0) {
+ as_rangeunlock(as);
+ }
+ return (error);
+ case MC_ADVISE:
+ switch ((uintptr_t)arg) {
+ case MADV_WILLNEED:
+ fc = as_faulta(as, addr, len);
+ if (fc) {
+ if (FC_CODE(fc) == FC_OBJERR)
+ error = set_errno(FC_ERRNO(fc));
+ else if (FC_CODE(fc) == FC_NOMAP)
+ error = set_errno(ENOMEM);
+ else
+ error = set_errno(EINVAL);
+ return (error);
+ }
+ break;
+
+ case MADV_DONTNEED:
+ /*
+ * For now, don't need is turned into an as_ctl(MC_SYNC)
+ * operation flagged for async invalidate.
+ */
+ error = as_ctl(as, addr, len, MC_SYNC, attr,
+ MS_ASYNC | MS_INVALIDATE, NULL, 0);
+ if (error)
+ (void) set_errno(error);
+ return (error);
+
+ default:
+ error = as_ctl(as, addr, len, cmd, attr,
+ (uintptr_t)arg, NULL, 0);
+ if (error)
+ (void) set_errno(error);
+ return (error);
+ }
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ error = as_ctl(as, addr, len, cmd, attr, (uintptr_t)arg, NULL, 0);
+
+ if (error)
+ (void) set_errno(error);
+ return (error);
+}
+
+/*
+ * Return page size code for page size passed in. If
+ * matching page size not found return -1.
+ */
+static uint_t
+mem_getpgszc(size_t pgsz)
+{
+ return ((uint_t)page_user_szc(pgsz));
+}
diff --git a/usr/src/uts/common/syscall/mkdir.c b/usr/src/uts/common/syscall/mkdir.c
new file mode 100644
index 0000000000..fc9262b0a3
--- /dev/null
+++ b/usr/src/uts/common/syscall/mkdir.c
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1996 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/debug.h>
+
+/*
+ * Make a directory.
+ */
+int
+mkdir(char *dname, int dmode)
+{
+ vnode_t *vp;
+ struct vattr vattr;
+ int error;
+
+ vattr.va_type = VDIR;
+ vattr.va_mode = dmode & PERMMASK;
+ vattr.va_mask = AT_TYPE|AT_MODE;
+ error = vn_create(dname, UIO_USERSPACE, &vattr, EXCL, 0, &vp, CRMKDIR,
+ 0, u.u_cmask);
+ if (error)
+ return (set_errno(error));
+ VN_RELE(vp);
+ return (0);
+}
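
A minimal userland call into the handler above; the mode is masked with PERMMASK and further narrowed by the process umask (u_cmask) before vn_create() runs, so the path and mode here are illustrative:

    #include <sys/stat.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* 0777 is narrowed by the caller's umask before vn_create(). */
        if (mkdir("/tmp/example.d", 0777) != 0) {
            perror("mkdir");
            return (1);
        }
        return (0);
    }
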
diff --git a/usr/src/uts/common/syscall/mknod.c b/usr/src/uts/common/syscall/mknod.c
new file mode 100644
index 0000000000..26250387e4
--- /dev/null
+++ b/usr/src/uts/common/syscall/mknod.c
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/uio.h>
+#include <sys/mkdev.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+
+/*
+ * Create a special file, a regular file, or a FIFO.
+ * fname - pathname passed by user
+ * fmode - mode of pathname
+ *	dev - device number (block/character special files only)
+ */
+int
+mknod(char *fname, mode_t fmode, dev_t dev)
+{
+ vnode_t *vp;
+ struct vattr vattr;
+ int error;
+ enum create why;
+
+ /*
+ * Zero type is equivalent to a regular file.
+ */
+ if ((fmode & S_IFMT) == 0)
+ fmode |= S_IFREG;
+
+ /*
+ * Must be privileged unless making a FIFO node.
+ */
+ if (((fmode & S_IFMT) != S_IFIFO) && secpolicy_sys_devices(CRED()) != 0)
+ return (set_errno(EPERM));
+ /*
+ * Set up desired attributes and vn_create the file.
+ */
+ vattr.va_type = IFTOVT(fmode);
+ vattr.va_mode = fmode & MODEMASK;
+ vattr.va_mask = AT_TYPE|AT_MODE;
+ if (vattr.va_type == VCHR || vattr.va_type == VBLK) {
+ if (get_udatamodel() != DATAMODEL_NATIVE)
+ dev = expldev(dev);
+ if (dev == NODEV || (getemajor(dev)) == (major_t)NODEV)
+ return (set_errno(EINVAL));
+ vattr.va_rdev = dev;
+ vattr.va_mask |= AT_RDEV;
+ }
+ why = ((fmode & S_IFMT) == S_IFDIR) ? CRMKDIR : CRMKNOD;
+ if (error = vn_create(fname, UIO_USERSPACE, &vattr, EXCL, 0, &vp,
+ why, 0, u.u_cmask))
+ return (set_errno(error));
+ VN_RELE(vp);
+ return (0);
+}
+
+#if defined(__i386) || defined(__i386_COMPAT)
+
+/*ARGSUSED*/
+int
+xmknod(int version, char *fname, mode_t fmode, dev_t dev)
+{
+ return (mknod(fname, fmode, dev));
+}
+
+#endif
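
A hedged userland sketch: per the secpolicy_sys_devices() check above, creating a FIFO needs no privilege, while block and character specials do, so this example sticks to S_IFIFO; the path is illustrative:

    #include <sys/stat.h>
    #include <stdio.h>

    int
    main(void)
    {
        /* dev is 0: va_rdev is only used for VCHR/VBLK nodes. */
        if (mknod("/tmp/example.fifo", S_IFIFO | 0644, 0) != 0) {
            perror("mknod");
            return (1);
        }
        return (0);
    }
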
diff --git a/usr/src/uts/common/syscall/mount.c b/usr/src/uts/common/syscall/mount.c
new file mode 100644
index 0000000000..ef681ea586
--- /dev/null
+++ b/usr/src/uts/common/syscall/mount.c
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/user.h>
+#include <sys/fstyp.h>
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/vfs.h>
+#include <sys/cred.h>
+#include <sys/vnode.h>
+#include <sys/dnlc.h>
+#include <sys/file.h>
+#include <sys/time.h>
+#include <sys/cmn_err.h>
+#include <sys/swap.h>
+#include <sys/debug.h>
+#include <sys/pathname.h>
+#include <sys/cladm.h>
+
+/*
+ * System calls.
+ */
+
+/*
+ * "struct mounta" defined in sys/vfs.h.
+ */
+
+/* ARGSUSED */
+int
+mount(long *lp, rval_t *rp)
+{
+ vnode_t *vp = NULL;
+ struct vfs *vfsp; /* dummy argument */
+ int error;
+ struct mounta *uap;
+#if defined(_LP64)
+ struct mounta native;
+
+ /*
+ * Make a struct mounta if we are DATAMODEL_LP64
+ */
+ uap = &native;
+ uap->spec = (char *)*lp++;
+ uap->dir = (char *)*lp++;
+ uap->flags = (int)*lp++;
+ uap->fstype = (char *)*lp++;
+ uap->dataptr = (char *)*lp++;
+ uap->datalen = (int)*lp++;
+ uap->optptr = (char *)*lp++;
+ uap->optlen = (int)*lp++;
+#else /* !defined(_LP64) */
+ /*
+	 * 32-bit kernels can take a shortcut and just cast
+ * the args array to the structure.
+ */
+ uap = (struct mounta *)lp;
+#endif /* _LP64 */
+ /*
+ * Resolve second path name (mount point).
+ */
+ if (error = lookupname(uap->dir, UIO_USERSPACE, FOLLOW, NULLVPP, &vp))
+ return (set_errno(error));
+
+ /*
+ * Some mount flags are disallowed through the system call interface.
+ */
+ uap->flags &= MS_MASK;
+
+ if ((vp->v_flag & VPXFS) && ((uap->flags & MS_GLOBAL) != MS_GLOBAL)) {
+ /*
+ * Clustering: if we're doing a mount onto the global
+ * namespace, and the mount is not a global mount, return
+ * an error.
+ */
+ error = ENOTSUP;
+ } else if (uap->flags & MS_GLOBAL) {
+ /*
+ * Clustering: global mount specified.
+ */
+ if ((cluster_bootflags & CLUSTER_BOOTED) == 0) {
+ /*
+ * If we're not booted as a cluster,
+ * global mounts are not allowed.
+ */
+ error = ENOTSUP;
+ } else {
+ error = domount("pxfs", uap, vp, CRED(), &vfsp);
+ if (!error)
+ VFS_RELE(vfsp);
+ }
+ } else {
+ error = domount(NULL, uap, vp, CRED(), &vfsp);
+ if (!error)
+ VFS_RELE(vfsp);
+ }
+ VN_RELE(vp);
+ rp->r_val2 = error;
+ return (error ? set_errno(error) : 0);
+}
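
The eight words unpacked into struct mounta above line up with the userland mount(2) wrapper. A sketch, assuming the Solaris eight-argument prototype in <sys/mount.h>; the file system type and mount point are illustrative, and the call needs sufficient privilege to succeed:

    #include <sys/mount.h>
    #include <stdio.h>

    int
    main(void)
    {
        /*
         * spec, dir, flags, fstype, dataptr, datalen, optptr, optlen:
         * the same order the kernel copies them out of the args array.
         */
        if (mount("swap", "/mnt", MS_DATA, "tmpfs", NULL, 0, NULL, 0) != 0) {
            perror("mount");
            return (1);
        }
        return (0);
    }
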
diff --git a/usr/src/uts/common/syscall/nice.c b/usr/src/uts/common/syscall/nice.c
new file mode 100644
index 0000000000..55db136f7b
--- /dev/null
+++ b/usr/src/uts/common/syscall/nice.c
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+/* Copyright (c) 1994 Sun Microsystems, Inc. */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.15 */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/proc.h>
+#include <sys/debug.h>
+#include <sys/class.h>
+#include <sys/mutex.h>
+
+/*
+ * We support the nice system call for compatibility although
+ * the priocntl system call supports a superset of nice's functionality.
+ * We support nice only for time sharing threads. It will fail
+ * if called by a thread from another class.
+ */
+
+int
+nice(int niceness)
+{
+ int error = 0;
+ int err, retval;
+ kthread_id_t t;
+ proc_t *p = curproc;
+
+ mutex_enter(&p->p_lock);
+ t = p->p_tlist;
+ do {
+ err = CL_DONICE(t, CRED(), niceness, &retval);
+ if (error == 0 && err)
+ error = set_errno(err);
+ } while ((t = t->t_forw) != p->p_tlist);
+ mutex_exit(&p->p_lock);
+ if (error)
+ return (error);
+ return (retval);
+}
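
Because nice(2) returns the new nice value, a caller cannot tell a legitimate -1 apart from an error without clearing errno first; a short sketch of the usual pattern (the increment is illustrative, and the call may fail for threads outside the time-sharing class, as noted above):

    #include <unistd.h>
    #include <errno.h>
    #include <stdio.h>

    int
    main(void)
    {
        int newval;

        errno = 0;
        newval = nice(4);
        if (newval == -1 && errno != 0) {
            perror("nice");
            return (1);
        }
        (void) printf("new nice value: %d\n", newval);
        return (0);
    }
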
diff --git a/usr/src/uts/common/syscall/ntptime.c b/usr/src/uts/common/syscall/ntptime.c
new file mode 100644
index 0000000000..7f38b65db4
--- /dev/null
+++ b/usr/src/uts/common/syscall/ntptime.c
@@ -0,0 +1,218 @@
+/*
+ * Copyright 1994,1996-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) David L. Mills 1993, 1994
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * documentation for any purpose and without fee is hereby granted, provided
+ * that the above copyright notice appears in all copies and that both the
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name University of Delaware not be used in
+ * advertising or publicity pertaining to distribution of the software
+ * without specific, written prior permission. The University of Delaware
+ * makes no representations about the suitability this software for any
+ * purpose. It is provided "as is" without express or implied warranty.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Modification history kern_ntptime.c
+ *
+ * 24 Sep 94 David L. Mills
+ * Tightened code at exits.
+ *
+ * 24 Mar 94 David L. Mills
+ * Revised syscall interface to include new variables for PPS
+ * time discipline.
+ *
+ * 14 Feb 94 David L. Mills
+ * Added code for external clock
+ *
+ * 28 Nov 93 David L. Mills
+ * Revised frequency scaling to conform with adjusted parameters
+ *
+ * 17 Sep 93 David L. Mills
+ * Created file
+ */
+/*
+ * ntp_gettime(), ntp_adjtime() - precision time interface
+ *
+ * These routines constitute the Network Time Protocol (NTP) interfaces
+ * for user and daemon application programs. The ntp_gettime() routine
+ * provides the time, maximum error (synch distance) and estimated error
+ * (dispersion) to client user application programs. The ntp_adjtime()
+ * routine is used by the NTP daemon to adjust the system clock to an
+ * externally derived time. The time offset and related variables set by
+ * this routine are used by clock() to adjust the phase and
+ * frequency of the phase-lock loop which controls the system clock.
+ */
+#include <sys/param.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/cpuvar.h>
+#include <sys/timer.h>
+#include <sys/debug.h>
+#include <sys/timex.h>
+#include <sys/model.h>
+#include <sys/policy.h>
+
+/*
+ * ntp_gettime() - NTP user application interface
+ */
+int
+ntp_gettime(struct ntptimeval *tp)
+{
+ timestruc_t tod;
+ struct ntptimeval ntv;
+ model_t datamodel = get_udatamodel();
+
+ gethrestime(&tod);
+ if (tod.tv_sec > TIME32_MAX)
+ return (set_errno(EOVERFLOW));
+ ntv.time.tv_sec = tod.tv_sec;
+ ntv.time.tv_usec = tod.tv_nsec / (NANOSEC / MICROSEC);
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyout(&ntv, tp, sizeof (ntv)))
+ return (set_errno(EFAULT));
+ } else {
+ struct ntptimeval32 ntv32;
+
+ if (TIMEVAL_OVERFLOW(&ntv.time))
+ return (set_errno(EOVERFLOW));
+
+ TIMEVAL_TO_TIMEVAL32(&ntv32.time, &ntv.time);
+
+ ntv32.maxerror = ntv.maxerror;
+ ntv32.esterror = ntv.esterror;
+
+ if (copyout(&ntv32, tp, sizeof (ntv32)))
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * Status word error decode. If any of these conditions
+	 * occur, an error is returned instead of the status
+	 * word. Most applications will care only about the fact
+	 * that the system clock may not be trusted, not about the
+ * details.
+ *
+ * Hardware or software error
+ */
+ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
+ /*
+ * PPS signal lost when either time or frequency
+ * synchronization requested
+ */
+ (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL)) ||
+
+ /*
+ * PPS jitter exceeded when time synchronization
+ * requested
+ */
+ (time_status & STA_PPSTIME && time_status & STA_PPSJITTER) ||
+
+ /*
+ * PPS wander exceeded or calibration error when
+ * frequency synchronization requested
+ */
+ (time_status & STA_PPSFREQ && time_status &
+ (STA_PPSWANDER | STA_PPSERROR)))
+ return (TIME_ERROR);
+
+ return (time_state);
+}
+
+/*
+ * ntp_adjtime() - NTP daemon application interface
+ */
+int
+ntp_adjtime(struct timex *tp)
+{
+ struct timex ntv;
+ int modes;
+
+ if (copyin(tp, &ntv, sizeof (ntv)))
+ return (set_errno(EFAULT));
+
+ /*
+ * Update selected clock variables - only privileged users can
+ * change anything. Note that there is no error checking here on
+ * the assumption privileged users know what they're doing.
+ */
+ modes = ntv.modes;
+
+ if (modes != 0 && secpolicy_settime(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ if (ntv.constant < 0 || ntv.constant > 30)
+ return (set_errno(EINVAL));
+
+ mutex_enter(&tod_lock);
+ if (modes & MOD_FREQUENCY)
+ time_freq = ntv.freq - pps_freq;
+ if (modes & MOD_MAXERROR)
+ time_maxerror = ntv.maxerror;
+ if (modes & MOD_ESTERROR)
+ time_esterror = ntv.esterror;
+ if (modes & MOD_STATUS) {
+ time_status &= STA_RONLY;
+ time_status |= ntv.status & ~STA_RONLY;
+ }
+ if (modes & MOD_TIMECONST)
+ time_constant = ntv.constant;
+
+ if (modes & MOD_OFFSET)
+ clock_update(ntv.offset);
+
+ /*
+ * Retrieve all clock variables
+ */
+ ntv.offset = time_offset / SCALE_UPDATE;
+ ntv.freq = time_freq + pps_freq;
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+ ntv.status = time_status;
+ ntv.constant = time_constant;
+ ntv.precision = time_precision;
+ ntv.tolerance = time_tolerance;
+ ntv.shift = pps_shift;
+ ntv.ppsfreq = pps_freq;
+ ntv.jitter = pps_jitter >> PPS_AVG;
+ ntv.stabil = pps_stabil;
+ ntv.calcnt = pps_calcnt;
+ ntv.errcnt = pps_errcnt;
+ ntv.jitcnt = pps_jitcnt;
+ ntv.stbcnt = pps_stbcnt;
+ mutex_exit(&tod_lock);
+
+ if (copyout(&ntv, tp, sizeof (ntv)))
+ return (set_errno(EFAULT));
+
+ /*
+ * Status word error decode. See comments in
+ * ntp_gettime() routine.
+ */
+ if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
+ (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL)) ||
+ (time_status & STA_PPSTIME &&
+ time_status & STA_PPSJITTER) ||
+ (time_status & STA_PPSFREQ &&
+ time_status & (STA_PPSWANDER | STA_PPSERROR)))
+ return (TIME_ERROR);
+
+ return (time_state);
+}
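
A userland sketch of ntp_gettime(), assuming the declarations in <sys/timex.h>; the return value is the clock state, or TIME_ERROR when one of the status conditions decoded above holds, so it is checked separately from the copied-out structure:

    #include <sys/timex.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct ntptimeval ntv;
        int state = ntp_gettime(&ntv);

        if (state == TIME_ERROR)
            (void) printf("clock is not synchronized\n");
        (void) printf("time %ld.%06ld, maxerror %ld us, esterror %ld us\n",
            (long)ntv.time.tv_sec, (long)ntv.time.tv_usec,
            (long)ntv.maxerror, (long)ntv.esterror);
        return (state == -1 ? 1 : 0);
    }
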
diff --git a/usr/src/uts/common/syscall/open.c b/usr/src/uts/common/syscall/open.c
new file mode 100644
index 0000000000..40e7faa097
--- /dev/null
+++ b/usr/src/uts/common/syscall/open.c
@@ -0,0 +1,305 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/uio.h>
+#include <sys/debug.h>
+#include <c2/audit.h>
+
+/*
+ * Common code for open()/openat() and creat(). Check permissions, allocate
+ * an open file structure, and call the device open routine (if any).
+ */
+
+static int
+copen(int startfd, char *fname, int filemode, int createmode)
+{
+ struct pathname pn;
+ vnode_t *vp, *sdvp;
+ file_t *fp, *startfp;
+ enum vtype type;
+ int error;
+ int fd, dupfd;
+ vnode_t *startvp;
+ proc_t *p = curproc;
+
+ if (startfd == AT_FDCWD) {
+ /*
+ * Regular open()
+ */
+ startvp = NULL;
+ } else {
+ /*
+ * We're here via openat()
+ */
+ char startchar;
+
+ if (copyin(fname, &startchar, sizeof (char)))
+ return (set_errno(EFAULT));
+
+ /*
+ * if startchar is / then startfd is ignored
+ */
+ if (startchar == '/')
+ startvp = NULL;
+ else {
+ if ((startfp = getf(startfd)) == NULL)
+ return (set_errno(EBADF));
+ startvp = startfp->f_vnode;
+ VN_HOLD(startvp);
+ releasef(startfd);
+ }
+ }
+
+ if (filemode & FXATTR) {
+
+ /*
+ * Make sure we have a valid request.
+ * We must either have a real fd or AT_FDCWD
+ */
+
+ if (startfd != AT_FDCWD && startvp == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (error = pn_get(fname, UIO_USERSPACE, &pn)) {
+ goto out;
+ }
+
+ if (startfd == AT_FDCWD) {
+ mutex_enter(&p->p_lock);
+ startvp = PTOU(p)->u_cdir;
+ VN_HOLD(startvp);
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * Verify permission to put attributes on file
+ */
+
+ if ((VOP_ACCESS(startvp, VREAD, 0, CRED()) != 0) &&
+ (VOP_ACCESS(startvp, VWRITE, 0, CRED()) != 0) &&
+ (VOP_ACCESS(startvp, VEXEC, 0, CRED()) != 0)) {
+ error = EACCES;
+ pn_free(&pn);
+ goto out;
+ }
+
+ if ((startvp->v_vfsp->vfs_flag & VFS_XATTR) != 0) {
+ error = VOP_LOOKUP(startvp, "", &sdvp, &pn,
+ LOOKUP_XATTR|CREATE_XATTR_DIR, rootvp, CRED());
+ } else {
+ error = EINVAL;
+ }
+ pn_free(&pn);
+ if (error != 0)
+ goto out;
+
+ VN_RELE(startvp);
+ startvp = sdvp;
+ }
+
+ if ((filemode & (FREAD|FWRITE)) != 0) {
+ if ((filemode & (FNONBLOCK|FNDELAY)) == (FNONBLOCK|FNDELAY))
+ filemode &= ~FNDELAY;
+ error = falloc((vnode_t *)NULL, filemode, &fp, &fd);
+ if (error == 0) {
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_setfsat_path(1);
+#endif /* C2_AUDIT */
+ /*
+ * Last arg is a don't-care term if
+ * !(filemode & FCREAT).
+ */
+ error = vn_openat(fname, UIO_USERSPACE, filemode,
+ (int)(createmode & MODEMASK), &vp, CRCREAT,
+ u.u_cmask, startvp);
+
+ if (startvp != NULL)
+ VN_RELE(startvp);
+ if (error == 0) {
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_copen(fd, fp, vp);
+#endif /* C2_AUDIT */
+ if ((vp->v_flag & VDUP) == 0) {
+ fp->f_vnode = vp;
+ mutex_exit(&fp->f_tlock);
+ /*
+ * We must now fill in the slot
+ * falloc reserved.
+ */
+ setf(fd, fp);
+ return (fd);
+ } else {
+ /*
+ * Special handling for /dev/fd.
+ * Give up the file pointer
+ * and dup the indicated file descriptor
+ * (in v_rdev). This is ugly, but I've
+ * seen worse.
+ */
+ unfalloc(fp);
+ dupfd = getminor(vp->v_rdev);
+ type = vp->v_type;
+ mutex_enter(&vp->v_lock);
+ vp->v_flag &= ~VDUP;
+ mutex_exit(&vp->v_lock);
+ VN_RELE(vp);
+ if (type != VCHR)
+ return (set_errno(EINVAL));
+ if ((fp = getf(dupfd)) == NULL) {
+ setf(fd, NULL);
+ return (set_errno(EBADF));
+ }
+ mutex_enter(&fp->f_tlock);
+ fp->f_count++;
+ mutex_exit(&fp->f_tlock);
+ setf(fd, fp);
+ releasef(dupfd);
+ }
+ return (fd);
+ } else {
+ setf(fd, NULL);
+ unfalloc(fp);
+ return (set_errno(error));
+ }
+ }
+ } else {
+ error = EINVAL;
+ }
+out:
+ if (startvp != NULL)
+ VN_RELE(startvp);
+ return (set_errno(error));
+}
+
+#define OPENMODE32(fmode) ((int)((fmode)-FOPEN))
+#define CREATMODE32 (FWRITE|FCREAT|FTRUNC)
+#define OPENMODE64(fmode) (OPENMODE32(fmode) | FOFFMAX)
+#define CREATMODE64 (CREATMODE32 | FOFFMAX)
+#ifdef _LP64
+#define OPENMODE(fmode) OPENMODE64(fmode)
+#define CREATMODE CREATMODE64
+#else
+#define OPENMODE OPENMODE32
+#define CREATMODE CREATMODE32
+#endif
+
+/*
+ * Open a file.
+ */
+int
+open(char *fname, int fmode, int cmode)
+{
+ return (copen(AT_FDCWD, fname, OPENMODE(fmode), cmode));
+}
+
+/*
+ * Create a file.
+ */
+int
+creat(char *fname, int cmode)
+{
+ return (copen(AT_FDCWD, fname, CREATMODE, cmode));
+}
+
+int
+openat(int fd, char *path, int fmode, int cmode)
+{
+ return (copen(fd, path, OPENMODE(fmode), cmode));
+}
+
+#if defined(_ILP32) || defined(_SYSCALL32_IMPL)
+/*
+ * Open and Creat for large files in 32-bit environment. Sets the FOFFMAX flag.
+ */
+int
+open64(char *fname, int fmode, int cmode)
+{
+ return (copen(AT_FDCWD, fname, OPENMODE64(fmode), cmode));
+}
+
+int
+creat64(char *fname, int cmode)
+{
+ return (copen(AT_FDCWD, fname, CREATMODE64, cmode));
+}
+
+int
+openat64(int fd, char *path, int fmode, int cmode)
+{
+ return (copen(fd, path, OPENMODE64(fmode), cmode));
+}
+
+#endif /* _ILP32 || _SYSCALL32_IMPL */
+
+#ifdef _SYSCALL32_IMPL
+/*
+ * Open and Creat for 32-bit compatibility on 64-bit kernel
+ */
+int
+open32(char *fname, int fmode, int cmode)
+{
+ return (copen(AT_FDCWD, fname, OPENMODE32(fmode), cmode));
+}
+
+int
+creat32(char *fname, int cmode)
+{
+ return (copen(AT_FDCWD, fname, CREATMODE32, cmode));
+}
+
+int
+openat32(int fd, char *path, int fmode, int cmode)
+{
+ return (copen(fd, path, OPENMODE32(fmode), cmode));
+}
+#endif /* _SYSCALL32_IMPL */
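
A sketch of the openat() entry handled by copen() above: AT_FDCWD gives plain open() behavior, and an absolute path makes the directory fd irrelevant, exactly as the startchar check implements; the names here are illustrative:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        int dfd, fd;

        if ((dfd = open("/tmp", O_RDONLY)) == -1) {
            perror("open /tmp");
            return (1);
        }
        /* A relative path resolves against dfd rather than the cwd. */
        fd = openat(dfd, "example.txt", O_CREAT | O_WRONLY, 0644);
        if (fd == -1) {
            perror("openat");
            (void) close(dfd);
            return (1);
        }
        (void) close(fd);
        (void) close(dfd);
        return (0);
    }
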
diff --git a/usr/src/uts/common/syscall/p_online.c b/usr/src/uts/common/syscall/p_online.c
new file mode 100644
index 0000000000..004627569c
--- /dev/null
+++ b/usr/src/uts/common/syscall/p_online.c
@@ -0,0 +1,244 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/var.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/kstat.h>
+#include <sys/uadmin.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/procset.h>
+#include <sys/processor.h>
+#include <sys/debug.h>
+#include <sys/policy.h>
+
+/*
+ * CPU state diagram
+ *
+ * P_SPARE
+ * P_POWEROFF <---> P_OFFLINE <---> P_ONLINE <---> P_NOINTR
+ * P_FAULTED
+ */
+int
+p_online_internal(processorid_t cpun, int new_status, int *old_status)
+{
+ cpu_t *cp;
+ int status;
+ int error = 0;
+ int flags = 0;
+
+ /*
+ * Try to get a pointer to the requested CPU structure.
+ */
+ mutex_enter(&cpu_lock); /* protects CPU states */
+ if ((cp = cpu_get(cpun)) == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (new_status & P_FORCED)
+ flags = CPU_FORCED;
+ *old_status = status = cpu_get_state(cp); /* get processor status */
+ new_status &= ~P_FORCED;
+
+ /*
+ * Perform credentials check.
+ */
+ switch (new_status) {
+ case P_STATUS:
+ goto out;
+ case P_ONLINE:
+ case P_OFFLINE:
+ case P_NOINTR:
+ case P_FAULTED:
+ case P_SPARE:
+ if (secpolicy_ponline(CRED()) != 0)
+ error = EPERM;
+ break;
+ default:
+ error = EINVAL;
+ }
+
+ if (error)
+ goto out;
+
+ /*
+	 * Return 0 if the CPU is already in the desired new state.
+ */
+ if (status == new_status)
+ goto out;
+
+ switch (new_status) {
+ case P_ONLINE:
+ switch (status) {
+ case P_POWEROFF:
+ /*
+ * If CPU is powered off, power it on.
+ */
+ if (error = cpu_poweron(cp))
+ break;
+ ASSERT(cpu_get_state(cp) == P_OFFLINE);
+ /* FALLTHROUGH */
+ case P_OFFLINE:
+ case P_FAULTED:
+ case P_SPARE:
+ /*
+ * If CPU is in one of the offline states,
+ * bring it online.
+ */
+ error = cpu_online(cp);
+ break;
+ case P_NOINTR:
+ cpu_intr_enable(cp);
+ break;
+ }
+ break;
+
+ case P_OFFLINE:
+ switch (status) {
+ case P_NOINTR:
+ /*
+ * Before we take the CPU offline, we first enable I/O
+ * interrupts.
+ */
+ cpu_intr_enable(cp);
+ /* FALLTHROUGH */
+ case P_ONLINE:
+ case P_FAULTED:
+ case P_SPARE:
+ /*
+ * CPU is online, or in a special offline state.
+ * Take it offline.
+ */
+ error = cpu_offline(cp, flags);
+ break;
+ case P_POWEROFF:
+ /*
+ * If CPU is powered off, power it on.
+ */
+ error = cpu_poweron(cp);
+ }
+ break;
+
+ case P_NOINTR:
+ switch (status) {
+ case P_POWEROFF:
+ /*
+			 * If CPU is powered off, power it on.
+ */
+ if (error = cpu_poweron(cp))
+ break;
+ ASSERT(cpu_get_state(cp) == P_OFFLINE);
+ /* FALLTHROUGH */
+ case P_OFFLINE:
+ case P_FAULTED:
+ case P_SPARE:
+ /*
+ * First, bring the CPU online.
+ */
+ if (error = cpu_online(cp))
+ break;
+ /* FALLTHROUGH */
+ case P_ONLINE:
+ /*
+ * CPU is now online. Try to disable interrupts.
+ */
+ error = cpu_intr_disable(cp);
+ }
+ break;
+
+ case P_FAULTED:
+ switch (status) {
+ case P_POWEROFF:
+ /*
+ * If CPU is powered off, power it on.
+ */
+ if (error = cpu_poweron(cp))
+ break;
+ ASSERT(cpu_get_state(cp) == P_OFFLINE);
+ /*FALLTHROUGH*/
+ case P_OFFLINE:
+ case P_SPARE:
+ case P_ONLINE:
+ case P_NOINTR:
+ /*
+ * Mark this CPU as faulted.
+ */
+ error = cpu_faulted(cp, flags);
+ }
+ break;
+
+ case P_SPARE:
+ switch (status) {
+ case P_POWEROFF:
+ /*
+ * If CPU is powered off, power it on.
+ */
+ if (error = cpu_poweron(cp))
+ break;
+ ASSERT(cpu_get_state(cp) == P_OFFLINE);
+ /*FALLTHROUGH*/
+ case P_OFFLINE:
+ case P_FAULTED:
+ case P_ONLINE:
+ case P_NOINTR:
+ /*
+ * Mark this CPU as a spare.
+ */
+ error = cpu_spare(cp, flags);
+ }
+ break;
+ }
+out:
+ mutex_exit(&cpu_lock);
+ return (error);
+}
+
+/*
+ * p_online(2) - get/change processor operational status.
+ *
+ * As noted in os/cpu.c, the P_ONLINE and other state constants are for use
+ * only in this system call path and other paths conveying CPU state to
+ * userland. In general, other kernel consumers should be using the accessor
+ * functions in uts/common/os/cpu.c.
+ */
+int
+p_online(processorid_t cpun, int new_status)
+{
+ int ret;
+ int old_status;
+
+ ret = p_online_internal(cpun, new_status, &old_status);
+ if (ret != 0)
+ return (set_errno(ret));
+ return (old_status);
+}
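
A userland sketch of the status-query path; P_STATUS returns early, before the credentials check above, so no privilege is needed. Processor id 0 is illustrative, and the prototype is assumed from <sys/processor.h>:

    #include <sys/types.h>
    #include <sys/processor.h>
    #include <stdio.h>

    int
    main(void)
    {
        int status = p_online(0, P_STATUS);

        if (status == -1) {
            perror("p_online");
            return (1);
        }
        (void) printf("cpu 0 is %s\n",
            status == P_ONLINE ? "online" : "not online");
        return (0);
    }
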
diff --git a/usr/src/uts/common/syscall/pathconf.c b/usr/src/uts/common/syscall/pathconf.c
new file mode 100644
index 0000000000..788076d25e
--- /dev/null
+++ b/usr/src/uts/common/syscall/pathconf.c
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/file.h>
+#include <sys/uio.h>
+#include <sys/debug.h>
+
+/*
+ * Common code for pathconf(), fpathconf() system calls
+ */
+static long
+cpathconf(register vnode_t *vp, int cmd, struct cred *cr)
+{
+ int error;
+ ulong_t val;
+
+ switch (cmd) {
+ case _PC_2_SYMLINKS:
+ if (error = VOP_PATHCONF(vp, _PC_SYMLINK_MAX, &val, cr))
+ return ((long)set_errno(error));
+ return ((long)(val > 0));
+
+ case _PC_ALLOC_SIZE_MIN:
+ case _PC_REC_INCR_XFER_SIZE:
+ case _PC_REC_MAX_XFER_SIZE:
+ case _PC_REC_MIN_XFER_SIZE:
+ case _PC_REC_XFER_ALIGN:
+ return ((long)set_errno(EINVAL));
+
+ case _PC_ASYNC_IO:
+		return (1L);
+
+ case _PC_PRIO_IO:
+ return ((long)set_errno(EINVAL));
+
+ case _PC_SYNC_IO:
+ if (!(error = VOP_FSYNC(vp, FSYNC, cr)))
+			return (1L);
+ return ((long)set_errno(error));
+
+ case _PC_XATTR_ENABLED:
+ return ((vp->v_vfsp->vfs_flag & VFS_XATTR) ? 1 : 0);
+
+ default:
+ if (error = VOP_PATHCONF(vp, cmd, &val, cr))
+ return ((long)set_errno(error));
+ return (val);
+ }
+ /* NOTREACHED */
+}
+
+/* fpathconf/pathconf interfaces */
+
+long
+fpathconf(int fdes, int name)
+{
+ file_t *fp;
+ long retval;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ retval = cpathconf(fp->f_vnode, name, fp->f_cred);
+ releasef(fdes);
+ return (retval);
+}
+
+long
+pathconf(char *fname, int name)
+{
+ vnode_t *vp;
+ long retval;
+ int error;
+
+lookup:
+ if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ return ((long)set_errno(error));
+ }
+
+ retval = cpathconf(vp, name, CRED());
+ VN_RELE(vp);
+ return (retval);
+}
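
Userland usage of the entry points above; since an unsupported name comes back as -1 with errno set, the usual pattern clears errno first to tell "no limit" apart from an error:

    #include <unistd.h>
    #include <errno.h>
    #include <stdio.h>

    int
    main(void)
    {
        long v;

        errno = 0;
        v = pathconf("/tmp", _PC_NAME_MAX);
        if (v == -1 && errno != 0) {
            perror("pathconf");
            return (1);
        }
        (void) printf("_PC_NAME_MAX on /tmp: %ld\n", v);
        return (0);
    }
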
diff --git a/usr/src/uts/common/syscall/pause.c b/usr/src/uts/common/syscall/pause.c
new file mode 100644
index 0000000000..3c621859e7
--- /dev/null
+++ b/usr/src/uts/common/syscall/pause.c
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright (c) 1994-2000 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/debug.h>
+
+/*
+ * Indefinite wait.
+ */
+int
+pause()
+{
+ mutex_enter(&curthread->t_delay_lock);
+ while (cv_wait_sig_swap(&curthread->t_delay_cv,
+ &curthread->t_delay_lock))
+ ;
+ mutex_exit(&curthread->t_delay_lock);
+ return (set_errno(EINTR));
+}
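
As the set_errno(EINTR) above shows, pause(2) only ever returns -1 with EINTR, once a caught signal interrupts the wait; a minimal sketch:

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void
    handler(int sig)
    {
        (void) sig;    /* nothing to do; delivery alone ends the wait */
    }

    int
    main(void)
    {
        struct sigaction sa;

        sa.sa_handler = handler;
        sa.sa_flags = 0;
        (void) sigemptyset(&sa.sa_mask);
        (void) sigaction(SIGINT, &sa, NULL);

        (void) pause();    /* returns -1/EINTR after SIGINT arrives */
        (void) printf("interrupted\n");
        return (0);
    }
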
diff --git a/usr/src/uts/common/syscall/pgrpsys.c b/usr/src/uts/common/syscall/pgrpsys.c
new file mode 100644
index 0000000000..e8be876537
--- /dev/null
+++ b/usr/src/uts/common/syscall/pgrpsys.c
@@ -0,0 +1,163 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/session.h>
+#include <sys/debug.h>
+
+/* ARGSUSED */
+int
+setpgrp(int flag, int pid, int pgid)
+{
+ register proc_t *p = ttoproc(curthread);
+ register int retval = 0;
+
+ switch (flag) {
+
+ case 1: /* setpgrp() */
+ mutex_enter(&pidlock);
+ if (p->p_sessp->s_sidp != p->p_pidp && !pgmembers(p->p_pid)) {
+ mutex_exit(&pidlock);
+ sess_create();
+ } else
+ mutex_exit(&pidlock);
+ return (p->p_sessp->s_sid);
+
+ case 3: /* setsid() */
+ mutex_enter(&pidlock);
+ if (p->p_pgidp == p->p_pidp || pgmembers(p->p_pid)) {
+ mutex_exit(&pidlock);
+ return (set_errno(EPERM));
+ }
+ mutex_exit(&pidlock);
+ sess_create();
+ return (p->p_sessp->s_sid);
+
+ case 5: /* setpgid() */
+ {
+ mutex_enter(&pidlock);
+ if (pid == 0)
+ pid = p->p_pid;
+ else if (pid < 0 || pid >= maxpid) {
+ mutex_exit(&pidlock);
+ return (set_errno(EINVAL));
+ } else if (pid != p->p_pid) {
+ for (p = p->p_child; /* empty */; p = p->p_sibling) {
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ return (set_errno(ESRCH));
+ }
+ if (p->p_pid == pid)
+ break;
+ }
+ if (p->p_flag & SEXECED) {
+ mutex_exit(&pidlock);
+ return (set_errno(EACCES));
+ }
+ if (p->p_sessp != ttoproc(curthread)->p_sessp) {
+ mutex_exit(&pidlock);
+ return (set_errno(EPERM));
+ }
+ }
+
+ if (p->p_sessp->s_sid == pid) {
+ mutex_exit(&pidlock);
+ return (set_errno(EPERM));
+ }
+
+ if (pgid == 0)
+ pgid = p->p_pid;
+ else if (pgid < 0 || pgid >= maxpid) {
+ mutex_exit(&pidlock);
+ return (set_errno(EINVAL));
+ }
+
+ if (p->p_pgrp == pgid) {
+ mutex_exit(&pidlock);
+ break;
+ } else if (p->p_pid == pgid) {
+ /*
+ * We need to protect p_pgidp with p_lock because
+ * /proc looks at it while holding only p_lock.
+ */
+ mutex_enter(&p->p_lock);
+ pgexit(p);
+ pgjoin(p, p->p_pidp);
+ mutex_exit(&p->p_lock);
+ } else {
+ register proc_t *q;
+
+ if ((q = pgfind(pgid)) == NULL ||
+ q->p_sessp != p->p_sessp) {
+ mutex_exit(&pidlock);
+ return (set_errno(EPERM));
+ }
+ /*
+ * See comment above about p_lock and /proc
+ */
+ mutex_enter(&p->p_lock);
+ pgexit(p);
+ pgjoin(p, q->p_pgidp);
+ mutex_exit(&p->p_lock);
+ }
+ mutex_exit(&pidlock);
+ break;
+ }
+
+ case 0: /* getpgrp() */
+ mutex_enter(&pidlock);
+ retval = p->p_pgrp;
+ mutex_exit(&pidlock);
+ break;
+
+ case 2: /* getsid() */
+ case 4: /* getpgid() */
+ if (pid < 0 || pid >= maxpid) {
+ return (set_errno(EINVAL));
+ }
+ mutex_enter(&pidlock);
+ if (pid != 0 && p->p_pid != pid &&
+ ((p = prfind(pid)) == NULL || p->p_stat == SIDL)) {
+ mutex_exit(&pidlock);
+ return (set_errno(ESRCH));
+ }
+ if (flag == 2)
+ retval = p->p_sessp->s_sid;
+ else
+ retval = p->p_pgrp;
+ mutex_exit(&pidlock);
+ break;
+
+ }
+ return (retval);
+}
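
The flag argument above multiplexes six libc entry points through one syscall: 0 getpgrp(), 1 setpgrp(), 2 getsid(), 3 setsid(), 4 getpgid(), 5 setpgid(). From userland only the wrappers are visible; a sketch of the flag-3 case, which per the code above fails with EPERM when the caller is already a process group leader:

    #include <sys/types.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        pid_t sid = setsid();

        if (sid == (pid_t)-1) {
            perror("setsid");    /* EPERM if already a group leader */
            return (1);
        }
        (void) printf("new session id: %ld\n", (long)sid);
        return (0);
    }
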
diff --git a/usr/src/uts/common/syscall/pipe.c b/usr/src/uts/common/syscall/pipe.c
new file mode 100644
index 0000000000..c980270a55
--- /dev/null
+++ b/usr/src/uts/common/syscall/pipe.c
@@ -0,0 +1,178 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.11 */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cred.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/errno.h>
+#include <sys/debug.h>
+#include <sys/fs/fifonode.h>
+
+/*
+ * This is the loadable module wrapper.
+ */
+#include <sys/modctl.h>
+#include <sys/syscall.h>
+
+char _depends_on[] = "fs/fifofs";
+
+longlong_t pipe();
+
+static struct sysent pipe_sysent = {
+ 0,
+ SE_32RVAL1 | SE_32RVAL2 | SE_NOUNLOAD | SE_ARGC,
+ (int (*)())pipe
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+static struct modlsys modlsys = {
+ &mod_syscallops, "pipe(2) syscall", &pipe_sysent
+};
+
+#ifdef _SYSCALL32_IMPL
+static struct modlsys modlsys32 = {
+ &mod_syscallops32, "32-bit pipe(2) syscall", &pipe_sysent
+};
+#endif
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modlsys,
+#ifdef _SYSCALL32_IMPL
+ &modlsys32,
+#endif
+ NULL
+};
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_fini(void)
+{
+ return (EBUSY);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * pipe(2) system call.
+ * Create a pipe by connecting two streams together. Associate
+ * each end of the pipe with a vnode, a file descriptor and
+ * one of the streams.
+ */
+longlong_t
+pipe()
+{
+ vnode_t *vp1, *vp2;
+ struct file *fp1, *fp2;
+ int error = 0;
+ int fd1, fd2;
+ rval_t r;
+
+ /*
+ * Allocate and initialize two vnodes.
+ */
+ makepipe(&vp1, &vp2);
+
+ /*
+ * Allocate and initialize two file table entries and two
+ * file pointers. Each file pointer is open for read and
+ * write.
+ */
+ if (error = falloc(vp1, FWRITE|FREAD, &fp1, &fd1)) {
+ VN_RELE(vp1);
+ VN_RELE(vp2);
+ return ((longlong_t)set_errno(error));
+ }
+
+ if (error = falloc(vp2, FWRITE|FREAD, &fp2, &fd2))
+ goto out2;
+
+ /*
+ * Create two stream heads and attach to each vnode.
+ */
+ if (error = fifo_stropen(&vp1, FWRITE|FREAD, fp1->f_cred, 0, 0))
+ goto out;
+
+ if (error = fifo_stropen(&vp2, FWRITE|FREAD, fp2->f_cred, 0, 0)) {
+ (void) VOP_CLOSE(vp1, FWRITE|FREAD, 1, (offset_t)0,
+ fp1->f_cred);
+ goto out;
+ }
+
+ strmate(vp1, vp2);
+
+ VTOF(vp1)->fn_ino = VTOF(vp2)->fn_ino = fifogetid();
+
+ /*
+ * Now fill in the entries that falloc reserved
+ */
+ mutex_exit(&fp1->f_tlock);
+ mutex_exit(&fp2->f_tlock);
+ setf(fd1, fp1);
+ setf(fd2, fp2);
+
+ /*
+ * Return the file descriptors to the user. They now
+ * point to two different vnodes which have different
+ * stream heads.
+ */
+ r.r_val1 = fd1;
+ r.r_val2 = fd2;
+ return (r.r_vals);
+out:
+ unfalloc(fp2);
+ setf(fd2, NULL);
+out2:
+ unfalloc(fp1);
+ setf(fd1, NULL);
+ VN_RELE(vp1);
+ VN_RELE(vp2);
+ return ((longlong_t)set_errno(error));
+}
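
A classic userland use of the two descriptors returned in r_val1/r_val2. Both ends are opened FREAD|FWRITE above (SVR4 pipes are bidirectional), but the conventional pattern still treats fds[0] as the read end:

    #include <sys/types.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        int fds[2];
        char buf[32];
        ssize_t n;
        pid_t pid;

        if (pipe(fds) != 0) {
            perror("pipe");
            return (1);
        }
        if ((pid = fork()) == -1) {
            perror("fork");
            return (1);
        }
        if (pid == 0) {        /* child: write one message and exit */
            (void) write(fds[1], "hello", 5);
            _exit(0);
        }
        n = read(fds[0], buf, sizeof (buf));
        if (n > 0)
            (void) printf("read: %.*s\n", (int)n, buf);
        return (0);
    }
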
diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c
new file mode 100644
index 0000000000..4d3c2f1060
--- /dev/null
+++ b/usr/src/uts/common/syscall/poll.c
@@ -0,0 +1,2776 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/poll_impl.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/bitmap.h>
+#include <sys/kstat.h>
+#include <sys/rctl.h>
+#include <sys/port_kernel.h>
+#include <sys/schedctl.h>
+
+#define NPHLOCKS 64 /* Number of locks; must be power of 2 */
+#define PHLOCKADDR(php) &plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)]
+#define PHLOCK(php) PHLOCKADDR(php).pp_lock
+#define PH_ENTER(php) mutex_enter(PHLOCK(php))
+#define PH_EXIT(php) mutex_exit(PHLOCK(php))
+#define VALID_POLL_EVENTS (POLLIN | POLLPRI | POLLOUT | POLLRDNORM \
+ | POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL)
+
+/*
+ * global counters to collect some stats
+ */
+static struct {
+ kstat_named_t polllistmiss; /* failed to find a cached poll list */
+ kstat_named_t pollcachehit; /* list matched 100% w/ cached one */
+ kstat_named_t pollcachephit; /* list matched < 100% w/ cached one */
+	kstat_named_t	pollcachemiss;	/* every list entry differs from cache */
+} pollstats = {
+ { "polllistmiss", KSTAT_DATA_UINT64 },
+ { "pollcachehit", KSTAT_DATA_UINT64 },
+ { "pollcachephit", KSTAT_DATA_UINT64 },
+ { "pollcachemiss", KSTAT_DATA_UINT64 }
+};
+
+kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats;
+uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t);
+
+struct pplock {
+ kmutex_t pp_lock;
+ short pp_flag;
+ kcondvar_t pp_wait_cv;
+ int32_t pp_pad; /* to a nice round 16 bytes */
+};
+
+static struct pplock plocks[NPHLOCKS]; /* Hash array of pollhead locks */
+
+#ifdef DEBUG
+static int pollchecksanity(pollstate_t *, nfds_t);
+static int pollcheckxref(pollstate_t *, int);
+static void pollcheckphlist(void);
+static int pollcheckrevents(pollstate_t *, int, int, int);
+static void checkpolldat(pollstate_t *);
+#endif /* DEBUG */
+static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int,
+ int *);
+
+/*
+ * Data structure overview:
+ * The per-thread poll state consists of
+ * one pollstate_t
+ * one pollcache_t
+ * one bitmap with one event bit per fd
+ * a (two-dimensional) hashed array of polldat_t structures - one entry
+ * per fd
+ *
+ * This conglomerate of data structures interacts with
+ * the pollhead which is used by VOP_POLL and pollwakeup
+ * (protected by the PHLOCK, cached array of plocks), and
+ * the fpollinfo list hanging off the fi_list which is used to notify
+ * poll when a cached fd is closed. This is protected by uf_lock.
+ *
+ * Invariants:
+ * pd_php (pollhead pointer) is set iff (if and only if) the polldat
+ * is on that pollhead. This is modified atomically under pc_lock.
+ *
+ * pd_fp (file_t pointer) is set iff the thread is on the fpollinfo
+ * list for that open file.
+ * This is modified atomically under pc_lock.
+ *
+ * pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt.
+ * Iff pd_ref[i].xf_refcnt >= 1 then
+ * ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd
+ * Iff pd_ref[i].xf_refcnt > 1 then
+ * In ps_pcacheset[i].pcs_pollfd between index
+ *	pd_ref[i].xf_position and the end of the list
+ * there are xf_refcnt entries with .fd == pd_fd
+ *
+ * Locking design:
+ * Whenever possible the design relies on the fact that the poll cache state
+ * is per thread, so for both poll and exit it is self-synchronizing.
+ * Thus the key interactions where other threads access the state are:
+ * pollwakeup (and polltime), and
+ * close cleaning up the cached references to an open file
+ *
+ * The two key locks in poll proper are ps_lock and pc_lock.
+ *
+ * The ps_lock is used for synchronization between poll, (lwp_)exit and close
+ * to ensure that modifications to pollcacheset structure are serialized.
+ * This lock is held through most of poll() except where poll sleeps
+ * since there is little need to handle closes concurrently with the execution
+ * of poll.
+ * The pc_lock protects most of the fields in pollcache structure and polldat
+ * structures (which are accessed by poll, pollwakeup, and polltime)
+ * with the exception of fields that are only modified when only one thread
+ * can access this per-thread state.
+ * Those exceptions occur in poll when first allocating the per-thread state,
+ * when poll grows the number of polldat (never shrinks), and when
+ * exit/pollcleanup has ensured that there are no references from either
+ * pollheads or fpollinfo to the threads poll state.
+ *
+ * Poll(2) system call is the only path which ps_lock and pc_lock are both
+ * held, in that order. It needs ps_lock to synchronize with close and
+ * lwp_exit; and pc_lock with pollwakeup.
+ *
+ * The locking interaction between pc_lock and PHLOCK take into account
+ * that poll acquires these locks in the order of pc_lock and then PHLOCK
+ * while pollwakeup does it in the reverse order. Thus pollwakeup implements
+ * deadlock avoidance by dropping the locks and reacquiring them in the
+ * reverse order. For this to work pollwakeup needs to prevent the thread
+ * from exiting and freeing all of the poll related state. This is done
+ * using
+ * the pc_no_exit lock
+ * the pc_busy counter
+ * the pc_busy_cv condition variable
+ *
+ * The locking interaction between pc_lock and uf_lock has similar
+ * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef
+ * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock
+ * to prevent poll or exit from doing a delfpollinfo after which the thread
+ * might exit. But the cleanup needs to acquire pc_lock when modifying
+ * the poll cache state. The solution is to use pc_busy and do the close
+ * cleanup in two phases:
+ * First close calls pollblockexit which increments pc_busy.
+ * This prevents the per-thread poll related state from being freed.
+ * Then close drops uf_lock and calls pollcacheclean.
+ * This routine can then acquire pc_lock and remove any references
+ * to the closing fd (as well as recording that it has been closed
+ * so that a POLLNVAL can be generated even if the fd is reused before
+ * poll has been woken up and checked getf() again).
+ *
+ * When removing a polled fd from poll cache, the fd is always removed
+ * from pollhead list first and then from fpollinfo list, i.e.,
+ * pollhead_delete() is called before delfpollinfo().
+ *
+ *
+ * Locking hierarchy:
+ * pc_no_exit is a leaf level lock.
+ * ps_lock is held when acquiring pc_lock (except when pollwakeup
+ * acquires pc_lock).
+ * pc_lock might be held when acquiring PHLOCK (pollhead_insert/
+ * pollhead_delete)
+ * pc_lock is always held (but this is not required)
+ * when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called
+ * from pcache_clean_entry).
+ * pc_lock is held across addfpollinfo/delfpollinfo which acquire
+ * uf_lock.
+ * pc_lock is held across getf/releasef which acquire uf_lock.
+ * ps_lock might be held across getf/releasef which acquire uf_lock.
+ * pollwakeup tries to acquire pc_lock while holding PHLOCK
+ *	but drops the locks and reacquires them in reverse order to avoid
+ * deadlock.
+ *
+ * Note also that there is deadlock avoidance support for VOP_POLL routines
+ * and pollwakeup involving a file system or driver lock.
+ * See below.
+ */
+
+/*
+ * Deadlock avoidance support for VOP_POLL() routines. This is
+ * sometimes necessary to prevent deadlock between polling threads
+ * (which hold poll locks on entry to xx_poll(), then acquire foo)
+ * and pollwakeup() threads (which hold foo, then acquire poll locks).
+ *
+ * pollunlock(void) releases whatever poll locks the current thread holds,
+ * returning a cookie for use by pollrelock();
+ *
+ * pollrelock(cookie) reacquires previously dropped poll locks;
+ *
+ * polllock(php, mutex) does the common case: pollunlock(),
+ * acquire the problematic mutex, pollrelock().
+ */
+int
+pollunlock(void)
+{
+ pollcache_t *pcp;
+ int lockstate = 0;
+
+ /*
+ * t_pollcache is set by /dev/poll and event ports (port_fd.c).
+ * If the pollrelock/pollunlock is called as a result of poll(2),
+ * the t_pollcache should be NULL.
+ */
+ if (curthread->t_pollcache == NULL)
+ pcp = curthread->t_pollstate->ps_pcache;
+ else
+ pcp = curthread->t_pollcache;
+
+ if (mutex_owned(&pcp->pc_lock)) {
+ lockstate = 1;
+ mutex_exit(&pcp->pc_lock);
+ }
+ return (lockstate);
+}
+
+void
+pollrelock(int lockstate)
+{
+ pollcache_t *pcp;
+
+ /*
+ * t_pollcache is set by /dev/poll and event ports (port_fd.c).
+ * If the pollrelock/pollunlock is called as a result of poll(2),
+ * the t_pollcache should be NULL.
+ */
+ if (curthread->t_pollcache == NULL)
+ pcp = curthread->t_pollstate->ps_pcache;
+ else
+ pcp = curthread->t_pollcache;
+
+ if (lockstate > 0)
+ mutex_enter(&pcp->pc_lock);
+}
+
+/* ARGSUSED */
+void
+polllock(pollhead_t *php, kmutex_t *lp)
+{
+ if (!mutex_tryenter(lp)) {
+ int lockstate = pollunlock();
+ mutex_enter(lp);
+ pollrelock(lockstate);
+ }
+}
+
+static int
+poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ int fdcnt = 0;
+ int rval;
+ int i;
+ timespec_t *rqtp = NULL;
+ int timecheck = 0;
+ int imm_timeout = 0;
+ pollfd_t *pollfdp;
+ pollstate_t *ps;
+ pollcache_t *pcp;
+ int error = 0;
+ nfds_t old_nfds;
+ int cacheindex = 0; /* which cache set is used */
+
+ /*
+ * Determine the precise future time of the requested timeout, if any.
+ */
+ if (tsp != NULL) {
+ if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
+ imm_timeout = 1;
+ else {
+ timespec_t now;
+ timecheck = timechanged;
+ gethrestime(&now);
+ rqtp = tsp;
+ timespecadd(rqtp, &now);
+ }
+ }
+
+ /*
+ * Reset our signal mask, if requested.
+ */
+ if (ksetp != NULL) {
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(t);
+ lwp->lwp_sigoldmask = t->t_hold;
+ t->t_hold = *ksetp;
+ t->t_flag |= T_TOMASK;
+ /*
+ * Call cv_timedwait_sig() just to check for signals.
+ * We will return immediately with either 0 or -1.
+ */
+ if (!cv_timedwait_sig(&t->t_delay_cv, &p->p_lock, lbolt)) {
+ mutex_exit(&p->p_lock);
+ error = EINTR;
+ goto pollout;
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * Check to see if this guy just wants to use poll() as a timeout.
+ * If yes then bypass all the other stuff and make him sleep.
+ */
+ if (nfds == 0) {
+ /*
+ * Sleep until we have passed the requested future
+ * time or until interrupted by a signal.
+ * Do not check for signals if we have a zero timeout.
+ */
+ if (!imm_timeout) {
+ mutex_enter(&t->t_delay_lock);
+ while ((rval = cv_waituntil_sig(&t->t_delay_cv,
+ &t->t_delay_lock, rqtp, timecheck)) > 0)
+ continue;
+ mutex_exit(&t->t_delay_lock);
+ if (rval == 0)
+ error = EINTR;
+ }
+ goto pollout;
+ }
+
+ if (nfds > p->p_fno_ctl) {
+ mutex_enter(&p->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
+ p->p_rctls, p, RCA_SAFE);
+ mutex_exit(&p->p_lock);
+ error = EINVAL;
+ goto pollout;
+ }
+
+ /*
+ * Need to allocate memory for pollstate before anything because
+ * the mutex and cv are created in this space
+ */
+ if ((ps = t->t_pollstate) == NULL) {
+ t->t_pollstate = pollstate_create();
+ ps = t->t_pollstate;
+ }
+
+ if (ps->ps_pcache == NULL)
+ ps->ps_pcache = pcache_alloc();
+ pcp = ps->ps_pcache;
+
+ /*
+ * NOTE: for performance, buffers are saved across poll() calls.
+ * The theory is that if a process polls heavily, it tends to poll
+ * on the same set of descriptors. Therefore, we only reallocate
+ * buffers when nfds changes. There is no hysteresis control,
+ * because there is no data to suggest that this is necessary;
+ * the penalty of reallocating is not *that* great in any event.
+ */
+ old_nfds = ps->ps_nfds;
+ if (nfds != old_nfds) {
+
+ kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
+ pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
+ ps->ps_pollfd = pollfdp;
+ ps->ps_nfds = nfds;
+ }
+
+ pollfdp = ps->ps_pollfd;
+ if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
+ error = EFAULT;
+ goto pollout;
+ }
+
+ if (fds == NULL) {
+ /*
+ * If the process has page 0 mapped, then the copyin() above
+ * will succeed even if fds is NULL. However, our cached
+ * poll lists are keyed by the address of the passed-in fds
+ * structure, and we use the value NULL to indicate an unused
+ * poll cache list entry. As such, we elect not to support
+ * NULL as a valid (user) memory address and fail the poll()
+ * call.
+ */
+ error = EINVAL;
+ goto pollout;
+ }
+
+ /*
+ * If this thread polls for the first time, allocate ALL poll
+ * cache data structures and cache the poll fd list. This
+	 * allocation is delayed till now because lwps polling zero fds
+	 * (i.e. using poll() purely as a timeout) don't need this memory.
+ */
+ mutex_enter(&ps->ps_lock);
+ pcp = ps->ps_pcache;
+ ASSERT(pcp != NULL);
+ if (pcp->pc_bitmap == NULL) {
+ pcache_create(pcp, nfds);
+ /*
+ * poll and cache this poll fd list in ps_pcacheset[0].
+ */
+ error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
+ if (fdcnt || error) {
+ mutex_exit(&ps->ps_lock);
+ goto pollout;
+ }
+ } else {
+ pollcacheset_t *pcset = ps->ps_pcacheset;
+
+ /*
+ * Not first time polling. Select a cached poll list by
+ * matching user pollfd list buffer address.
+ */
+ for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) {
+ if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) {
+ if ((++pcset[cacheindex].pcs_count) == 0) {
+ /*
+ * counter is wrapping around.
+ */
+ pcacheset_reset_count(ps, cacheindex);
+ }
+ /*
+				 * Examine and resolve possible
+				 * differences between the current poll
+				 * list and the previously cached one.
+				 * If there is an error during resolve(),
+				 * the callee will guarantee the consistency
+				 * of the cached poll list and cache content.
+ */
+ error = pcacheset_resolve(ps, nfds, &fdcnt,
+ cacheindex);
+ if (error) {
+ mutex_exit(&ps->ps_lock);
+ goto pollout;
+ }
+ break;
+ }
+
+ /*
+			 * Note that the pcs_usradr field of a used entry
+			 * won't be NULL, because it stores the address of
+			 * the passed-in fds and NULL fds are never cached
+			 * (that is either the special timeout case when
+			 * nfds is 0, or poll() fails directly).
+ */
+ if (pcset[cacheindex].pcs_usradr == NULL) {
+ /*
+ * found an unused entry. Use it to cache
+ * this poll list.
+ */
+ error = pcacheset_cache_list(ps, fds, &fdcnt,
+ cacheindex);
+ if (fdcnt || error) {
+ mutex_exit(&ps->ps_lock);
+ goto pollout;
+ }
+ break;
+ }
+ }
+ if (cacheindex == ps->ps_nsets) {
+ /*
+			 * We failed to find a matching cached poll fd list,
+			 * so replace an old list.
+ */
+ pollstats.polllistmiss.value.ui64++;
+ cacheindex = pcacheset_replace(ps);
+ ASSERT(cacheindex < ps->ps_nsets);
+ pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
+ error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
+ if (error) {
+ mutex_exit(&ps->ps_lock);
+ goto pollout;
+ }
+ }
+ }
+
+ /*
+ * Always scan the bitmap with the lock on the pollcache held.
+ * This is to make sure that a wakeup does not come undetected.
+ * If the lock is not held, a pollwakeup could have come for an
+ * fd we already checked but before this thread sleeps, in which
+ * case the wakeup is missed. Now we hold the pcache lock and
+ * check the bitmap again. This will prevent wakeup from happening
+ * while we hold pcache lock since pollwakeup() will also lock
+ * the pcache before updating poll bitmap.
+ */
+ mutex_enter(&pcp->pc_lock);
+ for (;;) {
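+		/*
+		 * Clear any pending T_POLLWAKE from a previous iteration so
+		 * that a wakeup performed during this scan is caught by the
+		 * T_POLLWAKE check below.
+		 */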
+ pcp->pc_flag = 0;
+ error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
+ if (fdcnt || error) {
+ mutex_exit(&pcp->pc_lock);
+ mutex_exit(&ps->ps_lock);
+ break;
+ }
+
+ /*
+ * If T_POLLWAKE is set, a pollwakeup() was performed on
+ * one of the file descriptors. This can happen only if
+ * one of the VOP_POLL() functions dropped pcp->pc_lock.
+		 * The only current cases of this are in procfs (prpoll())
+ * and STREAMS (strpoll()).
+ */
+ if (pcp->pc_flag & T_POLLWAKE)
+ continue;
+
+ /*
+ * If you get here, the poll of fds was unsuccessful.
+ * Wait until some fd becomes readable, writable, or gets
+ * an exception, or until a signal or a timeout occurs.
+ * Do not check for signals if we have a zero timeout.
+ */
+ mutex_exit(&ps->ps_lock);
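+		/*
+		 * An immediate timeout is treated as if cv_waituntil_sig()
+		 * had already timed out (rval < 0).
+		 */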
+ if (imm_timeout)
+ rval = -1;
+ else
+ rval = cv_waituntil_sig(&pcp->pc_cv, &pcp->pc_lock,
+ rqtp, timecheck);
+ mutex_exit(&pcp->pc_lock);
+ /*
+ * If we have received a signal or timed out
+ * then break out and return.
+ */
+ if (rval <= 0) {
+ if (rval == 0)
+ error = EINTR;
+ break;
+ }
+ /*
+ * We have not received a signal or timed out.
+ * Continue around and poll fds again.
+ */
+ mutex_enter(&ps->ps_lock);
+ mutex_enter(&pcp->pc_lock);
+ }
+
+pollout:
+ /*
+ * If we changed the signal mask but we received
+ * no signal then restore the signal mask.
+ * Otherwise psig() will deal with the signal mask.
+ */
+ if (ksetp != NULL) {
+ mutex_enter(&p->p_lock);
+ if (lwp->lwp_cursig == 0) {
+ t->t_hold = lwp->lwp_sigoldmask;
+ t->t_flag &= ~T_TOMASK;
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ if (error)
+ return (set_errno(error));
+
+ /*
+ * Copy out the events and return the fdcnt to the user.
+ */
+ if (nfds != 0 &&
+ copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
+ return (set_errno(EFAULT));
+
+#ifdef DEBUG
+ /*
+ * Another sanity check:
+ */
+ if (fdcnt) {
+ int reventcnt = 0;
+
+ for (i = 0; i < nfds; i++) {
+ if (pollfdp[i].fd < 0) {
+ ASSERT(pollfdp[i].revents == 0);
+ continue;
+ }
+ if (pollfdp[i].revents) {
+ reventcnt++;
+ }
+ }
+ ASSERT(fdcnt == reventcnt);
+ } else {
+ for (i = 0; i < nfds; i++) {
+ ASSERT(pollfdp[i].revents == 0);
+ }
+ }
+#endif /* DEBUG */
+
+ return (fdcnt);
+}
+
+/*
+ * This system call trap exists solely for binary compatibility with
+ * old statically-linked applications. It is not called from libc.
+ * It should be removed in the next release.
+ */
+int
+poll(pollfd_t *fds, nfds_t nfds, int time_out)
+{
+ timespec_t ts;
+ timespec_t *tsp;
+
+ if (time_out < 0)
+ tsp = NULL;
+ else {
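+		/* Split the millisecond timeout into seconds and nanoseconds. */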
+ ts.tv_sec = time_out / MILLISEC;
+ ts.tv_nsec = (time_out % MILLISEC) * MICROSEC;
+ tsp = &ts;
+ }
+
+ return (poll_common(fds, nfds, tsp, NULL));
+}
+
+/*
+ * This is the system call trap that poll(),
+ * select() and pselect() are built upon.
+ * It is a private interface between libc and the kernel.
+ */
+int
+pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
+{
+ timespec_t ts;
+ timespec_t *tsp;
+ sigset_t set;
+ k_sigset_t kset;
+ k_sigset_t *ksetp;
+ model_t datamodel = get_udatamodel();
+
+ if (timeoutp == NULL)
+ tsp = NULL;
+ else {
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyin(timeoutp, &ts, sizeof (ts)))
+ return (set_errno(EFAULT));
+ } else {
+ timespec32_t ts32;
+
+ if (copyin(timeoutp, &ts32, sizeof (ts32)))
+ return (set_errno(EFAULT));
+ TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
+ }
+
+ if (itimerspecfix(&ts))
+ return (set_errno(EINVAL));
+ tsp = &ts;
+ }
+
+ if (setp == NULL)
+ ksetp = NULL;
+ else {
+ if (copyin(setp, &set, sizeof (set)))
+ return (set_errno(EFAULT));
+ sigutok(&set, &kset);
+ ksetp = &kset;
+ }
+
+ return (poll_common(fds, nfds, tsp, ksetp));
+}
+
+/*
+ * Clean up any state left around by poll(2). Called when a thread exits.
+ */
+void
+pollcleanup()
+{
+ pollstate_t *ps = curthread->t_pollstate;
+ pollcache_t *pcp;
+
+ if (ps == NULL)
+ return;
+ pcp = ps->ps_pcache;
+ /*
+ * free up all cached poll fds
+ */
+ if (pcp == NULL) {
+ /* this pollstate is used by /dev/poll */
+ goto pollcleanout;
+ }
+
+ if (pcp->pc_bitmap != NULL) {
+ ASSERT(MUTEX_NOT_HELD(&ps->ps_lock));
+ /*
+		 * a closing lwp can race with us when cleaning up a polldat
+		 * entry. We hold the ps_lock while cleaning the hash table.
+ * Since this pollcache is going away anyway, there is no
+ * need to hold the pc_lock.
+ */
+ mutex_enter(&ps->ps_lock);
+ pcache_clean(pcp);
+ mutex_exit(&ps->ps_lock);
+#ifdef DEBUG
+ /*
+ * At this point, all fds cached by this lwp should be
+		 * cleaned up. There should be no fd in fi_list still
+		 * referencing this thread.
+ */
+ checkfpollinfo(); /* sanity check */
+ pollcheckphlist(); /* sanity check */
+#endif /* DEBUG */
+ }
+ /*
+ * Be sure no one is referencing thread before exiting
+ */
+ mutex_enter(&pcp->pc_no_exit);
+ ASSERT(pcp->pc_busy >= 0);
+ while (pcp->pc_busy > 0)
+ cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
+ mutex_exit(&pcp->pc_no_exit);
+pollcleanout:
+ pollstate_destroy(ps);
+ curthread->t_pollstate = NULL;
+}
+
+/*
+ * pollwakeup() - poke threads waiting in poll() for some event
+ * on a particular object.
+ *
+ * The threads hanging off of the specified pollhead structure are scanned.
+ * If their event mask matches the specified event(s), then pollnotify() is
+ * called to poke the thread.
+ *
+ * Multiple events may be specified. When POLLHUP or POLLERR are specified,
+ * all waiting threads are poked.
+ *
+ * It is important that pollnotify() not drop the lock protecting the list
+ * of threads.
+ */
+void
+pollwakeup(pollhead_t *php, short events_arg)
+{
+ polldat_t *pdp;
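+	/*
+	 * The cast through ushort_t keeps the short events argument from
+	 * being sign-extended when it is widened to an int.
+	 */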
+ int events = (ushort_t)events_arg;
+
+retry:
+ PH_ENTER(php);
+
+ /*
+ * About half of all pollwakeups don't do anything, because the
+	 * pollhead list is empty (i.e., nobody is interested in the event).
+ * For this common case, we can optimize out locking overhead.
+ */
+ if (php->ph_list == NULL) {
+ PH_EXIT(php);
+ return;
+ }
+
+ for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) {
+ if ((pdp->pd_events & events) ||
+ (events & (POLLHUP | POLLERR))) {
+
+ pollcache_t *pcp;
+
+ if (pdp->pd_portev != NULL) {
+ port_kevent_t *pkevp = pdp->pd_portev;
+ /*
+				 * Object (fd) is associated with an event
+				 * port, so send the event notification to
+				 * the port.
+ */
+ pkevp->portkev_events |= events &
+ (pdp->pd_events | POLLHUP | POLLERR);
+ if (pkevp->portkev_flags & PORT_KEV_VALID) {
+ pkevp->portkev_flags &= ~PORT_KEV_VALID;
+ (void) port_send_event(pdp->pd_portev);
+ }
+ continue;
+ }
+
+ pcp = pdp->pd_pcache;
+
+ /*
+ * Try to grab the lock for this thread. If
+ * we don't get it then we may deadlock so
+ * back out and restart all over again. Note
+ * that the failure rate is very very low.
+ */
+ if (mutex_tryenter(&pcp->pc_lock)) {
+ pollnotify(pcp, pdp->pd_fd);
+ mutex_exit(&pcp->pc_lock);
+ } else {
+ /*
+ * We are here because:
+				 * 1) This thread has been woken up
+ * and is trying to get out of poll().
+ * 2) Some other thread is also here
+ * but with a different pollhead lock.
+ *
+ * So, we need to drop the lock on pollhead
+ * because of (1) but we want to prevent
+ * that thread from doing lwp_exit() or
+ * devpoll close. We want to ensure that
+				 * the pollcache pointer is still valid.
+ *
+ * Solution: Grab the pcp->pc_no_exit lock,
+ * increment the pc_busy counter, drop every
+ * lock in sight. Get out of the way and wait
+ * for type (2) threads to finish.
+ */
+
+ mutex_enter(&pcp->pc_no_exit);
+ pcp->pc_busy++; /* prevents exit()'s */
+ mutex_exit(&pcp->pc_no_exit);
+
+ PH_EXIT(php);
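+				/*
+				 * Entering and immediately exiting pc_lock
+				 * simply waits for the current holder to
+				 * release it.
+				 */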
+ mutex_enter(&pcp->pc_lock);
+ mutex_exit(&pcp->pc_lock);
+ mutex_enter(&pcp->pc_no_exit);
+ pcp->pc_busy--;
+ if (pcp->pc_busy == 0) {
+ /*
+ * Wakeup the thread waiting in
+ * thread_exit().
+ */
+ cv_signal(&pcp->pc_busy_cv);
+ }
+ mutex_exit(&pcp->pc_no_exit);
+ goto retry;
+ }
+ }
+ }
+ PH_EXIT(php);
+}
+
+/*
+ * This function is called to inform a thread that
+ * an event being polled for has occurred.
+ * The pollstate lock on the thread should be held on entry.
+ */
+void
+pollnotify(pollcache_t *pcp, int fd)
+{
+ ASSERT(fd < pcp->pc_mapsize);
+ ASSERT(MUTEX_HELD(&pcp->pc_lock));
+ BT_SET(pcp->pc_bitmap, fd);
+ pcp->pc_flag |= T_POLLWAKE;
+ cv_signal(&pcp->pc_cv);
+}
+
+/*
+ * add a polldat entry to the pollhead's ph_list. The polldat struct is
+ * used by pollwakeup to wake sleeping pollers when polled events have
+ * happened.
+ */
+void
+pollhead_insert(pollhead_t *php, polldat_t *pdp)
+{
+ PH_ENTER(php);
+ ASSERT(pdp->pd_next == NULL);
+#ifdef DEBUG
+ {
+ /*
+		 * the polldat should not already be on the list
+ */
+ polldat_t *wp;
+ for (wp = php->ph_list; wp; wp = wp->pd_next) {
+ ASSERT(wp != pdp);
+ }
+ }
+#endif /* DEBUG */
+ pdp->pd_next = php->ph_list;
+ php->ph_list = pdp;
+ PH_EXIT(php);
+}
+
+/*
+ * Delete the polldat entry from ph_list.
+ */
+void
+pollhead_delete(pollhead_t *php, polldat_t *pdp)
+{
+ polldat_t *wp;
+ polldat_t **wpp;
+
+ PH_ENTER(php);
+ for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) {
+ if (wp == pdp) {
+ *wpp = pdp->pd_next;
+ pdp->pd_next = NULL;
+ break;
+ }
+ }
+#ifdef DEBUG
+ /* assert that pdp is no longer in the list */
+ for (wp = *wpp; wp; wp = wp->pd_next) {
+ ASSERT(wp != pdp);
+ }
+#endif /* DEBUG */
+ PH_EXIT(php);
+}
+
+/*
+ * walk through the poll fd lists to see if they are identical. This is an
+ * expensive operation and should not be done more than once for each poll()
+ * call.
+ *
+ * As an optimization (i.e., not having to go through the lists more than
+ * once), this routine also clears the revents field of each pollfd in
+ * 'current'. Zeroing out the revents field of each entry in the current
+ * poll list is required by the poll(2) man page.
+ *
+ * Since the events field of the cached list has illegal poll events
+ * filtered out, the same filtering is applied to the current list before
+ * the comparison.
+ *
+ * The routine stops when it detects a meaningful difference, or when it
+ * exhausts the lists.
+ */
+int
+pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n)
+{
+ int ix;
+
+ for (ix = 0; ix < n; ix++) {
+ if (current[ix].fd == cached[ix].fd) {
+ /*
+			 * Filter out invalid poll events while we are
+			 * inside the loop.
+ */
+ if (current[ix].events & ~VALID_POLL_EVENTS) {
+ current[ix].events &= VALID_POLL_EVENTS;
+ if (newlist != NULL)
+ newlist[ix].events = current[ix].events;
+ }
+ if (current[ix].events == cached[ix].events) {
+ current[ix].revents = 0;
+ continue;
+ }
+ }
+ if ((current[ix].fd < 0) && (cached[ix].fd < 0)) {
+ current[ix].revents = 0;
+ continue;
+ }
+ return (ix);
+ }
+ return (ix);
+}
+
+/*
+ * This routine returns a pointer to a cached poll fd entry, or NULL if it
+ * does not find it in the hash table.
+ */
+polldat_t *
+pcache_lookup_fd(pollcache_t *pcp, int fd)
+{
+ int hashindex;
+ polldat_t *pdp;
+
+ hashindex = POLLHASH(pcp->pc_hashsize, fd);
+ pdp = pcp->pc_hash[hashindex];
+ while (pdp != NULL) {
+ if (pdp->pd_fd == fd)
+ break;
+ pdp = pdp->pd_hashnext;
+ }
+ return (pdp);
+}
+
+polldat_t *
+pcache_alloc_fd(int nsets)
+{
+ polldat_t *pdp;
+
+ pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP);
+ if (nsets > 0) {
+ pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP);
+ pdp->pd_nsets = nsets;
+ }
+ return (pdp);
+}
+
+/*
+ * This routine inserts a polldat into the pollcache's hash table. It
+ * may be necessary to grow the size of the hash table.
+ */
+void
+pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds)
+{
+ int hashindex;
+ int fd;
+
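+	/*
+	 * Grow the table when the average hash chain length would otherwise
+	 * exceed POLLHASHTHRESHOLD entries.
+	 */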
+ if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) ||
+ (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) {
+ pcache_grow_hashtbl(pcp, nfds);
+ }
+ fd = pdp->pd_fd;
+ hashindex = POLLHASH(pcp->pc_hashsize, fd);
+ pdp->pd_hashnext = pcp->pc_hash[hashindex];
+ pcp->pc_hash[hashindex] = pdp;
+ pcp->pc_fdcount++;
+
+#ifdef DEBUG
+ {
+ /*
+ * same fd should not appear on a hash list twice
+ */
+ polldat_t *pdp1;
+ for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) {
+ ASSERT(pdp->pd_fd != pdp1->pd_fd);
+ }
+ }
+#endif /* DEBUG */
+}
+
+/*
+ * Grow the hash table -- either double the table size or round it up to
+ * the nearest multiple of POLLHASHCHUNKSZ, whichever is bigger. Rehash
+ * all the elements in the hash table.
+ */
+void
+pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds)
+{
+ int oldsize;
+ polldat_t **oldtbl;
+ polldat_t *pdp, *pdp1;
+ int i;
+#ifdef DEBUG
+ int count = 0;
+#endif
+
+ ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0);
+ oldsize = pcp->pc_hashsize;
+ oldtbl = pcp->pc_hash;
+ if (nfds > pcp->pc_hashsize * POLLHASHINC) {
+ pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
+ ~(POLLHASHCHUNKSZ - 1);
+ } else {
+ pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC;
+ }
+ pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
+ KM_SLEEP);
+ /*
+ * rehash existing elements
+ */
+ pcp->pc_fdcount = 0;
+ for (i = 0; i < oldsize; i++) {
+ pdp = oldtbl[i];
+ while (pdp != NULL) {
+ pdp1 = pdp->pd_hashnext;
+ pcache_insert_fd(pcp, pdp, nfds);
+ pdp = pdp1;
+#ifdef DEBUG
+ count++;
+#endif
+ }
+ }
+ kmem_free(oldtbl, oldsize * sizeof (polldat_t *));
+ ASSERT(pcp->pc_fdcount == count);
+}
+
+void
+pcache_grow_map(pollcache_t *pcp, int fd)
+{
+ int newsize;
+ ulong_t *newmap;
+
+ /*
+	 * grow to the nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK
+	 * is a power of 2.
+ */
+ newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1);
+ newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t),
+ KM_SLEEP);
+ /*
+ * don't want pollwakeup to set a bit while growing the bitmap.
+ */
+ ASSERT(mutex_owned(&pcp->pc_lock) == 0);
+ mutex_enter(&pcp->pc_lock);
+ bcopy(pcp->pc_bitmap, newmap,
+ (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t));
+ kmem_free(pcp->pc_bitmap,
+ (pcp->pc_mapsize /BT_NBIPUL) * sizeof (ulong_t));
+ pcp->pc_bitmap = newmap;
+ pcp->pc_mapsize = newsize;
+ mutex_exit(&pcp->pc_lock);
+}
+
+/*
+ * remove all the references from the pollhead and fpollinfo lists.
+ */
+void
+pcache_clean(pollcache_t *pcp)
+{
+ int i;
+ polldat_t **hashtbl;
+ polldat_t *pdp;
+
+ ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock));
+ hashtbl = pcp->pc_hash;
+ for (i = 0; i < pcp->pc_hashsize; i++) {
+ for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
+ if (pdp->pd_php != NULL) {
+ pollhead_delete(pdp->pd_php, pdp);
+ pdp->pd_php = NULL;
+ }
+ if (pdp->pd_fp != NULL) {
+ delfpollinfo(pdp->pd_fd);
+ pdp->pd_fp = NULL;
+ }
+ }
+ }
+}
+
+void
+pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp)
+{
+ int i;
+ int fd = pdp->pd_fd;
+
+ /*
+	 * we come here because of an earlier close() on this cached poll fd.
+ */
+ ASSERT(pdp->pd_fp == NULL);
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+ pdp->pd_events = 0;
+ for (i = 0; i < ps->ps_nsets; i++) {
+ xref_t *refp;
+ pollcacheset_t *pcsp;
+
+ ASSERT(pdp->pd_ref != NULL);
+ refp = &pdp->pd_ref[i];
+ if (refp->xf_refcnt) {
+ ASSERT(refp->xf_position >= 0);
+ pcsp = &ps->ps_pcacheset[i];
+ if (refp->xf_refcnt == 1) {
+ pcsp->pcs_pollfd[refp->xf_position].fd = -1;
+ refp->xf_refcnt = 0;
+ pdp->pd_count--;
+ } else if (refp->xf_refcnt > 1) {
+ int j;
+
+ /*
+ * turn off every appearance in pcs_pollfd list
+ */
+ for (j = refp->xf_position;
+ j < pcsp->pcs_nfds; j++) {
+ if (pcsp->pcs_pollfd[j].fd == fd) {
+ pcsp->pcs_pollfd[j].fd = -1;
+ refp->xf_refcnt--;
+ pdp->pd_count--;
+ }
+ }
+ }
+ ASSERT(refp->xf_refcnt == 0);
+ refp->xf_position = POLLPOSINVAL;
+ }
+ }
+ ASSERT(pdp->pd_count == 0);
+}
+
+/*
+ * Insert poll fd into the pollcache, and add poll registration.
+ * This routine is called after getf() and before releasef(). So the vnode
+ * can not disappear even if we block here.
+ * If there is an error, the polled fd is not cached.
+ */
+int
+pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp,
+ ssize_t pos, int which)
+{
+ pollcache_t *pcp = ps->ps_pcache;
+ polldat_t *pdp;
+ int error;
+ int fd;
+ pollhead_t *memphp = NULL;
+ xref_t *refp;
+ int newpollfd = 0;
+
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+ /*
+	 * The poll caching uses the existing VOP_POLL interface. If there
+	 * are no polled events, we want the polled device to set its
+	 * "someone is sleeping in poll" flag. When the polled events happen
+	 * later, the driver will call pollwakeup(). We achieve this by
+	 * always passing 0 in the third parameter ("anyyet") when calling
+	 * VOP_POLL. This parameter is not looked at by drivers when the
+	 * polled events exist. If a driver chooses to ignore this parameter
+	 * and call pollwakeup whenever the polled events happen, that is
+	 * OK too.
+ */
+ ASSERT(curthread->t_pollcache == NULL);
+ error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents,
+ &memphp);
+ if (error) {
+ return (error);
+ }
+ if (pollfdp->revents) {
+ (*fdcntp)++;
+ }
+ /*
+ * polling the underlying device succeeded. Now we can cache it.
+ * A close can't come in here because we have not done a releasef()
+ * yet.
+ */
+ fd = pollfdp->fd;
+ pdp = pcache_lookup_fd(pcp, fd);
+ if (pdp == NULL) {
+ ASSERT(ps->ps_nsets > 0);
+ pdp = pcache_alloc_fd(ps->ps_nsets);
+ newpollfd = 1;
+ }
+ /*
+ * If this entry was used to cache a poll fd which was closed, and
+ * this entry has not been cleaned, do it now.
+ */
+ if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) {
+ pcacheset_invalidate(ps, pdp);
+ ASSERT(pdp->pd_next == NULL);
+ }
+ if (pdp->pd_count == 0) {
+ pdp->pd_fd = fd;
+ pdp->pd_fp = fp;
+ addfpollinfo(fd);
+ pdp->pd_thread = curthread;
+ pdp->pd_pcache = pcp;
+ /*
+		 * the entry is either never used, or was cleared by removing
+		 * a cached pollfd (pcache_delete_fd). So all the fields
+		 * should be clear.
+ */
+ ASSERT(pdp->pd_next == NULL);
+ }
+
+ /*
+	 * A polled fd is considered cached. So there should be an fpollinfo
+	 * entry on the uf_fpollinfo list.
+ */
+ ASSERT(infpollinfo(fd));
+ /*
+ * If there is an inconsistency, we want to know it here.
+ */
+ ASSERT(pdp->pd_fp == fp);
+
+ /*
+ * XXX pd_events is a union of all polled events on this fd, possibly
+ * by different threads. Unless this is a new first poll(), pd_events
+ * never shrinks. If an event is no longer polled by a process, there
+	 * is no way to cancel that event. In that case, poll degrades to its
+	 * old form -- polling on this fd every time poll() is called. The
+	 * assumption is that an app always polls the same type of events.
+ */
+ pdp->pd_events |= pollfdp->events;
+
+ pdp->pd_count++;
+ /*
+	 * There is not much special handling for multiple appearances of the
+	 * same fd other than xf_position always recording the first
+	 * appearance in the poll list. If this is called from pcacheset_cache_list,
+ * a VOP_POLL is called on every pollfd entry; therefore each
+ * revents and fdcnt should be set correctly. If this is called from
+ * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will
+	 * pick up the right count and handle the revents field of each pollfd
+ * entry.
+ */
+ ASSERT(pdp->pd_ref != NULL);
+ refp = &pdp->pd_ref[which];
+ if (refp->xf_refcnt == 0) {
+ refp->xf_position = pos;
+ } else {
+ /*
+ * xf_position records the fd's first appearance in poll list
+ */
+ if (pos < refp->xf_position) {
+ refp->xf_position = pos;
+ }
+ }
+ ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd);
+ refp->xf_refcnt++;
+ if (fd >= pcp->pc_mapsize) {
+ pcache_grow_map(pcp, fd);
+ }
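+	/*
+	 * pc_mapend tracks the highest fd ever polled; pcache_poll() uses it
+	 * to bound its bitmap scan.
+	 */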
+ if (fd > pcp->pc_mapend) {
+ pcp->pc_mapend = fd;
+ }
+ if (newpollfd != 0) {
+ pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds);
+ }
+ if (memphp) {
+ if (pdp->pd_php == NULL) {
+ pollhead_insert(memphp, pdp);
+ pdp->pd_php = memphp;
+ } else {
+ if (memphp != pdp->pd_php) {
+ /*
+ * layered devices (e.g. console driver)
+ * may change the vnode and thus the pollhead
+ * pointer out from underneath us.
+ */
+ pollhead_delete(pdp->pd_php, pdp);
+ pollhead_insert(memphp, pdp);
+ pdp->pd_php = memphp;
+ }
+ }
+ }
+ /*
+ * Since there is a considerable window between VOP_POLL and when
+ * we actually put the polldat struct on the pollhead list, we could
+ * miss a pollwakeup. In the case of polling additional events, we
+ * don't update the events until after VOP_POLL. So we could miss
+ * pollwakeup there too. So we always set the bit here just to be
+ * safe. The real performance gain is in subsequent pcache_poll.
+ */
+ mutex_enter(&pcp->pc_lock);
+ BT_SET(pcp->pc_bitmap, fd);
+ mutex_exit(&pcp->pc_lock);
+ return (0);
+}
+
+/*
+ * The entry is not really deleted. The fields are cleared so that the
+ * entry is no longer useful, but it will remain in the hash table for reuse
+ * later. It will be freed when the polling lwp exits.
+ */
+int
+pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent)
+{
+ pollcache_t *pcp = ps->ps_pcache;
+ polldat_t *pdp;
+ xref_t *refp;
+
+ ASSERT(fd < pcp->pc_mapsize);
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+
+ pdp = pcache_lookup_fd(pcp, fd);
+ ASSERT(pdp != NULL);
+ ASSERT(pdp->pd_count > 0);
+ ASSERT(pdp->pd_ref != NULL);
+ refp = &pdp->pd_ref[which];
+ if (pdp->pd_count == 1) {
+ pdp->pd_events = 0;
+ refp->xf_position = POLLPOSINVAL;
+ ASSERT(refp->xf_refcnt == 1);
+ refp->xf_refcnt = 0;
+ if (pdp->pd_php) {
+ /*
+ * It is possible for a wakeup thread to get ahead
+ * of the following pollhead_delete and set the bit in
+ * bitmap. It is OK because the bit will be cleared
+ * here anyway.
+ */
+ pollhead_delete(pdp->pd_php, pdp);
+ pdp->pd_php = NULL;
+ }
+ pdp->pd_count = 0;
+ if (pdp->pd_fp != NULL) {
+ pdp->pd_fp = NULL;
+ delfpollinfo(fd);
+ }
+ mutex_enter(&pcp->pc_lock);
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ mutex_exit(&pcp->pc_lock);
+ return (0);
+ }
+ if ((cevent & POLLCLOSED) == POLLCLOSED) {
+ /*
+ * fd cached here has been closed. This is the first
+ * pcache_delete_fd called after the close. Clean up the
+ * entire entry.
+ */
+ pcacheset_invalidate(ps, pdp);
+ ASSERT(pdp->pd_php == NULL);
+ mutex_enter(&pcp->pc_lock);
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ mutex_exit(&pcp->pc_lock);
+ return (0);
+ }
+#ifdef DEBUG
+ if (getf(fd) != NULL) {
+ ASSERT(infpollinfo(fd));
+ releasef(fd);
+ }
+#endif /* DEBUG */
+ pdp->pd_count--;
+ ASSERT(refp->xf_refcnt > 0);
+ if (--refp->xf_refcnt == 0) {
+ refp->xf_position = POLLPOSINVAL;
+ } else {
+ ASSERT(pos >= refp->xf_position);
+ if (pos == refp->xf_position) {
+ /*
+ * The xref position is no longer valid.
+			 * Reset it to a special value and let the
+			 * caller know it needs to update the xref
+			 * with a new xf_position value.
+ */
+ refp->xf_position = POLLPOSTRANS;
+ return (1);
+ }
+ }
+ return (0);
+}
+
+void
+pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which)
+{
+ polldat_t *pdp;
+
+ pdp = pcache_lookup_fd(pcp, fd);
+ ASSERT(pdp != NULL);
+ ASSERT(pdp->pd_ref != NULL);
+ pdp->pd_ref[which].xf_position = pos;
+}
+
+#ifdef DEBUG
+/*
+ * For each polled fd, it's either in the bitmap or cached in the
+ * pcache hash table. If this routine returns 0, something is wrong.
+ */
+static int
+pollchecksanity(pollstate_t *ps, nfds_t nfds)
+{
+ int i;
+ int fd;
+ pollcache_t *pcp = ps->ps_pcache;
+ polldat_t *pdp;
+ pollfd_t *pollfdp = ps->ps_pollfd;
+ file_t *fp;
+
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+ for (i = 0; i < nfds; i++) {
+ fd = pollfdp[i].fd;
+ if (fd < 0) {
+ ASSERT(pollfdp[i].revents == 0);
+ continue;
+ }
+ if (pollfdp[i].revents == POLLNVAL)
+ continue;
+ if ((fp = getf(fd)) == NULL)
+ continue;
+ pdp = pcache_lookup_fd(pcp, fd);
+ ASSERT(pdp != NULL);
+ ASSERT(infpollinfo(fd));
+ ASSERT(pdp->pd_fp == fp);
+ releasef(fd);
+ if (BT_TEST(pcp->pc_bitmap, fd))
+ continue;
+ if (pdp->pd_php == NULL)
+ return (0);
+ }
+ return (1);
+}
+#endif /* DEBUG */
+
+/*
+ * resolve the difference between the current poll list and a cached one.
+ */
+int
+pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which)
+{
+ int i;
+ pollcache_t *pcp = ps->ps_pcache;
+ pollfd_t *newlist = NULL;
+ pollfd_t *current = ps->ps_pollfd;
+ pollfd_t *cached;
+ pollcacheset_t *pcsp;
+ int common;
+ int count = 0;
+ int offset;
+ int remain;
+ int fd;
+ file_t *fp;
+ int fdcnt = 0;
+ int cnt = 0;
+ nfds_t old_nfds;
+ int error = 0;
+ int mismatch = 0;
+
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+#ifdef DEBUG
+ checkpolldat(ps);
+#endif
+ pcsp = &ps->ps_pcacheset[which];
+ old_nfds = pcsp->pcs_nfds;
+ common = (nfds > old_nfds) ? old_nfds : nfds;
+ if (nfds != old_nfds) {
+ /*
+		 * the length of the poll list has changed; allocate a new
+		 * pollfd list.
+ */
+ newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
+ bcopy(current, newlist, sizeof (pollfd_t) * nfds);
+ }
+ /*
+ * Compare the overlapping part of the current fd list with the
+ * cached one. Whenever a difference is found, resolve it.
+ * The comparison is done on the current poll list and the
+ * cached list. But we may be setting up the newlist to be the
+ * cached list for next poll.
+ */
+ cached = pcsp->pcs_pollfd;
+ remain = common;
+
+ while (count < common) {
+ int tmpfd;
+ pollfd_t *np;
+
+ np = (newlist != NULL) ? &newlist[count] : NULL;
+ offset = pcacheset_cmp(&current[count], &cached[count], np,
+ remain);
+ /*
+		 * Collect stats. If the lists match in full on the first
+		 * pass, it's a hit. Otherwise, it's a partial hit or a miss.
+ */
+ if ((count == 0) && (offset == common)) {
+ pollstats.pollcachehit.value.ui64++;
+ } else {
+ mismatch++;
+ }
+ count += offset;
+ if (offset < remain) {
+ ASSERT(count < common);
+ ASSERT((current[count].fd != cached[count].fd) ||
+ (current[count].events != cached[count].events));
+ /*
+ * Filter out invalid events.
+ */
+ if (current[count].events & ~VALID_POLL_EVENTS) {
+ if (newlist != NULL) {
+ newlist[count].events =
+ current[count].events &=
+ VALID_POLL_EVENTS;
+ } else {
+ current[count].events &=
+ VALID_POLL_EVENTS;
+ }
+ }
+ /*
+ * when resolving a difference, we always remove the
+ * fd from cache before inserting one into cache.
+ */
+ if (cached[count].fd >= 0) {
+ tmpfd = cached[count].fd;
+ if (pcache_delete_fd(ps, tmpfd, count, which,
+ (uint_t)cached[count].events)) {
+ /*
+ * This should be rare but needed for
+ * correctness.
+ *
+					 * The first appearance in the cached
+					 * list is being "turned off". The
+					 * same fd appears more than once in
+					 * the cached
+ * poll list. Find the next one on the
+ * list and update the cached
+ * xf_position field.
+ */
+ for (i = count + 1; i < old_nfds; i++) {
+ if (cached[i].fd == tmpfd) {
+ pcache_update_xref(pcp,
+ tmpfd, (ssize_t)i,
+ which);
+ break;
+ }
+ }
+ ASSERT(i <= old_nfds);
+ }
+ /*
+ * In case a new cache list is allocated,
+ * need to keep both cache lists in sync
+ * b/c the new one can be freed if we have
+ * an error later.
+ */
+ cached[count].fd = -1;
+ if (newlist != NULL) {
+ newlist[count].fd = -1;
+ }
+ }
+ if ((tmpfd = current[count].fd) >= 0) {
+ /*
+ * add to the cached fd tbl and bitmap.
+ */
+ if ((fp = getf(tmpfd)) == NULL) {
+ current[count].revents = POLLNVAL;
+ if (newlist != NULL) {
+ newlist[count].fd = -1;
+ }
+ cached[count].fd = -1;
+ fdcnt++;
+ } else {
+ /*
+ * Here we don't care about the
+ * fdcnt. We will examine the bitmap
+ * later and pick up the correct
+					 * fdcnt there. So we never bother
+					 * to check the value of 'cnt'.
+ */
+ error = pcache_insert(ps, fp,
+ &current[count], &cnt,
+ (ssize_t)count, which);
+ /*
+					 * if there is no error, we want to
+					 * do the releasef after we have
+					 * updated the cached poll list entry,
+					 * so that close() won't race with us.
+ */
+ if (error) {
+ /*
+ * If we encountered an error,
+ * we have invalidated an
+ * entry in cached poll list
+ * (in pcache_delete_fd() above)
+ * but failed to add one here.
+ * This is OK b/c what's in the
+ * cached list is consistent
+ * with content of cache.
+ * It will not have any ill
+ * effect on next poll().
+ */
+ releasef(tmpfd);
+ if (newlist != NULL) {
+ kmem_free(newlist,
+ nfds *
+ sizeof (pollfd_t));
+ }
+ return (error);
+ }
+ /*
+ * If we have allocated a new(temp)
+ * cache list, we need to keep both
+ * in sync b/c the new one can be freed
+ * if we have an error later.
+ */
+ if (newlist != NULL) {
+ newlist[count].fd =
+ current[count].fd;
+ newlist[count].events =
+ current[count].events;
+ }
+ cached[count].fd = current[count].fd;
+ cached[count].events =
+ current[count].events;
+ releasef(tmpfd);
+ }
+ } else {
+ current[count].revents = 0;
+ }
+ count++;
+ remain = common - count;
+ }
+ }
+ if (mismatch != 0) {
+ if (mismatch == common) {
+ pollstats.pollcachemiss.value.ui64++;
+ } else {
+ pollstats.pollcachephit.value.ui64++;
+ }
+ }
+ /*
+	 * take care of the non-overlapping part of the list
+ */
+ if (nfds > old_nfds) {
+ ASSERT(newlist != NULL);
+ for (i = old_nfds; i < nfds; i++) {
+ /* filter out invalid events */
+ if (current[i].events & ~VALID_POLL_EVENTS) {
+ newlist[i].events = current[i].events =
+ current[i].events & VALID_POLL_EVENTS;
+ }
+ if ((fd = current[i].fd) < 0) {
+ current[i].revents = 0;
+ continue;
+ }
+ /*
+ * add to the cached fd tbl and bitmap.
+ */
+ if ((fp = getf(fd)) == NULL) {
+ current[i].revents = POLLNVAL;
+ newlist[i].fd = -1;
+ fdcnt++;
+ continue;
+ }
+ /*
+ * Here we don't care about the
+ * fdcnt. We will examine the bitmap
+ * later and pick up the correct
+ * fdcnt there. So we never bother to
+ * check 'cnt'.
+ */
+ error = pcache_insert(ps, fp, &current[i], &cnt,
+ (ssize_t)i, which);
+ releasef(fd);
+ if (error) {
+ /*
+			 * Here we are halfway through adding newly
+			 * polled fds. Undo enough to keep the cache
+			 * list consistent with the cache content.
+ */
+ pcacheset_remove_list(ps, current, old_nfds,
+ i, which, 0);
+ kmem_free(newlist, nfds * sizeof (pollfd_t));
+ return (error);
+ }
+ }
+ }
+ if (old_nfds > nfds) {
+ /*
+	 * remove the fds which are no longer polled.
+ */
+ pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds,
+ which, 1);
+ }
+ /*
+	 * set difference resolved. Update nfds and the cached list
+	 * in the pollstate struct.
+ */
+ if (newlist != NULL) {
+ kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t));
+ /*
+ * By now, the pollfd.revents field should
+ * all be zeroed.
+ */
+ pcsp->pcs_pollfd = newlist;
+ pcsp->pcs_nfds = nfds;
+ }
+ ASSERT(*fdcntp == 0);
+ *fdcntp = fdcnt;
+ /*
+ * By now for every fd in pollfdp, one of the following should be
+ * true. Otherwise we will miss a polled event.
+ *
+ * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL
+ * will be called on this fd in next poll.
+ * 2. the fd is cached in the pcache (i.e. pd_php is set). So
+ * pollnotify will happen.
+ */
+ ASSERT(pollchecksanity(ps, nfds));
+ /*
+	 * make sure the cross references between the cached poll lists and
+	 * the cached poll fds are correct.
+ */
+ ASSERT(pollcheckxref(ps, which));
+ /*
+	 * ensure each polldat in the pollcache references a polled fd in
+	 * the pollcacheset.
+ */
+#ifdef DEBUG
+ checkpolldat(ps);
+#endif
+ return (0);
+}
+
+#ifdef DEBUG
+static int
+pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds)
+{
+ int i;
+ int reventcnt = 0;
+
+ for (i = 0; i < nfds; i++) {
+ if (pollfdp[i].fd < 0) {
+ ASSERT(pollfdp[i].revents == 0);
+ continue;
+ }
+ if (pollfdp[i].revents) {
+ reventcnt++;
+ }
+ if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) {
+ ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd));
+ }
+ }
+ return (reventcnt);
+}
+#endif /* DEBUG */
+
+/*
+ * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock
+ * is held upon entry.
+ */
+int
+pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp,
+ int which)
+{
+ int i;
+ pollcache_t *pcp;
+ int fd;
+ int begin, end, done;
+ pollhead_t *php;
+ int fdcnt;
+ int error = 0;
+ file_t *fp;
+ polldat_t *pdp;
+ xref_t *refp;
+ int entry;
+
+ pcp = ps->ps_pcache;
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+ ASSERT(MUTEX_HELD(&pcp->pc_lock));
+retry:
+ done = 0;
+ begin = 0;
+ fdcnt = 0;
+ end = pcp->pc_mapend;
+ while ((fdcnt < nfds) && !done) {
+ php = NULL;
+ /*
+ * only poll fds which may have events
+ */
+ fd = bt_getlowbit(pcp->pc_bitmap, begin, end);
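+		/* bt_getlowbit() returns the lowest set bit in [begin, end], or -1. */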
+ ASSERT(fd <= end);
+ if (fd >= 0) {
+ ASSERT(pollcheckrevents(ps, begin, fd, which));
+ /*
+ * adjust map pointers for next round
+ */
+ if (fd == end) {
+ done = 1;
+ } else {
+ begin = fd + 1;
+ }
+ /*
+ * A bitmap caches poll state information of
+ * multiple poll lists. Call VOP_POLL only if
+ * the bit corresponds to an fd in this poll
+ * list.
+ */
+ pdp = pcache_lookup_fd(pcp, fd);
+ ASSERT(pdp != NULL);
+ ASSERT(pdp->pd_ref != NULL);
+ refp = &pdp->pd_ref[which];
+ if (refp->xf_refcnt == 0)
+ continue;
+ entry = refp->xf_position;
+ ASSERT((entry >= 0) && (entry < nfds));
+ ASSERT(pollfdp[entry].fd == fd);
+ /*
+			 * being in this routine implies that we have
+			 * successfully polled this fd in the past.
+			 * Check to see if this fd was closed while we
+			 * were blocked in poll. This ensures that we
+			 * don't miss a close on the fd in the case the
+			 * fd is reused.
+ */
+ if (pdp->pd_fp == NULL) {
+ ASSERT(pdp->pd_count > 0);
+ pollfdp[entry].revents = POLLNVAL;
+ fdcnt++;
+ if (refp->xf_refcnt > 1) {
+ /*
+				 * this fd appeared multiple times
+ * in the poll list. Find all of them.
+ */
+ for (i = entry + 1; i < nfds; i++) {
+ if (pollfdp[i].fd == fd) {
+ pollfdp[i].revents =
+ POLLNVAL;
+ fdcnt++;
+ }
+ }
+ }
+ pcacheset_invalidate(ps, pdp);
+ continue;
+ }
+ /*
+ * We can be here polling a device that is being
+ * closed (i.e. the file pointer is set to NULL,
+ * but pollcacheclean has not happened yet).
+ */
+ if ((fp = getf(fd)) == NULL) {
+ pollfdp[entry].revents = POLLNVAL;
+ fdcnt++;
+ if (refp->xf_refcnt > 1) {
+ /*
+				 * this fd appeared multiple times
+ * in the poll list. Find all of them.
+ */
+ for (i = entry + 1; i < nfds; i++) {
+ if (pollfdp[i].fd == fd) {
+ pollfdp[i].revents =
+ POLLNVAL;
+ fdcnt++;
+ }
+ }
+ }
+ continue;
+ }
+ ASSERT(pdp->pd_fp == fp);
+ ASSERT(infpollinfo(fd));
+ /*
+			 * Since we no longer hold the pollhead lock across
+			 * VOP_POLL, the pollunlock logic can be simplified.
+ */
+ ASSERT(pdp->pd_php == NULL ||
+ MUTEX_NOT_HELD(PHLOCK(pdp->pd_php)));
+ /*
+			 * the underlying file system may set a "pollpending"
+			 * flag when it sees that the poll may block.
+			 * Pollwakeup() is called by the wakeup thread if
+			 * pollpending is set.
+			 * Pass a 0 fdcnt so that the underlying file system
+			 * will set the "pollpending" flag when there are
+			 * no polled events.
+			 *
+			 * Use pollfdp[].events for the actual polling because
+			 * pd_events is the union of all cached poll events
+ * on this fd. The events parameter also affects
+ * how the polled device sets the "poll pending"
+ * flag.
+ */
+ ASSERT(curthread->t_pollcache == NULL);
+ error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0,
+ &pollfdp[entry].revents, &php);
+ /*
+			 * releasef only after we are completely done with
+			 * this cached poll entry, to prevent a close() from
+			 * coming in and clearing the entry.
+ */
+ if (error) {
+ releasef(fd);
+ break;
+ }
+ /*
+ * layered devices (e.g. console driver)
+ * may change the vnode and thus the pollhead
+ * pointer out from underneath us.
+ */
+ if (php != NULL && pdp->pd_php != NULL &&
+ php != pdp->pd_php) {
+ releasef(fd);
+ pollhead_delete(pdp->pd_php, pdp);
+ pdp->pd_php = php;
+ pollhead_insert(php, pdp);
+ /*
+ * We could have missed a wakeup on the new
+ * target device. Make sure the new target
+ * gets polled once.
+ */
+ BT_SET(pcp->pc_bitmap, fd);
+ goto retry;
+ }
+
+ if (pollfdp[entry].revents) {
+ ASSERT(refp->xf_refcnt >= 1);
+ fdcnt++;
+ if (refp->xf_refcnt > 1) {
+ /*
+					 * this fd appeared multiple times
+ * in the poll list. This is rare but
+ * we have to look at all of them for
+ * correctness.
+ */
+ error = plist_chkdupfd(fp, pdp, ps,
+ pollfdp, entry, &fdcnt);
+ if (error > 0) {
+ releasef(fd);
+ break;
+ }
+ if (error < 0) {
+ goto retry;
+ }
+ }
+ releasef(fd);
+ } else {
+ /*
+ * VOP_POLL didn't return any revents. We can
+ * clear the bit in bitmap only if we have the
+ * pollhead ptr cached and no other cached
+ * entry is polling different events on this fd.
+ * VOP_POLL may have dropped the ps_lock. Make
+				 * sure a pollwakeup has not happened before
+				 * clearing the bit.
+ */
+ if ((pdp->pd_php != NULL) &&
+ (pollfdp[entry].events == pdp->pd_events) &&
+ ((pcp->pc_flag & T_POLLWAKE) == 0)) {
+ BT_CLEAR(pcp->pc_bitmap, fd);
+ }
+ /*
+ * if the fd can be cached now but not before,
+ * do it now.
+ */
+ if ((pdp->pd_php == NULL) && (php != NULL)) {
+ pdp->pd_php = php;
+ pollhead_insert(php, pdp);
+ /*
+ * We are inserting a polldat struct for
+ * the first time. We may have missed a
+ * wakeup on this device. Re-poll once.
+ * This should be a rare event.
+ */
+ releasef(fd);
+ goto retry;
+ }
+ if (refp->xf_refcnt > 1) {
+ /*
+					 * this fd appeared multiple times
+ * in the poll list. This is rare but
+ * we have to look at all of them for
+ * correctness.
+ */
+ error = plist_chkdupfd(fp, pdp, ps,
+ pollfdp, entry, &fdcnt);
+ if (error > 0) {
+ releasef(fd);
+ break;
+ }
+ if (error < 0) {
+ goto retry;
+ }
+ }
+ releasef(fd);
+ }
+ } else {
+ done = 1;
+ ASSERT(pollcheckrevents(ps, begin, end + 1, which));
+ }
+ }
+ if (!error) {
+ ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds));
+ *fdcntp += fdcnt;
+ }
+ return (error);
+}
+
+/*
+ * Going through the poll list without much locking. Poll all fds and
+ * cache all valid fds in the pollcache.
+ */
+int
+pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which)
+{
+ pollfd_t *pollfdp = ps->ps_pollfd;
+ pollcacheset_t *pcacheset = ps->ps_pcacheset;
+ pollfd_t *newfdlist;
+ int i;
+ int fd;
+ file_t *fp;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+ ASSERT(which < ps->ps_nsets);
+ ASSERT(pcacheset != NULL);
+ ASSERT(pcacheset[which].pcs_pollfd == NULL);
+ newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP);
+ /*
+	 * cache the new poll list in the pollcacheset.
+ */
+ bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds);
+
+ pcacheset[which].pcs_pollfd = newfdlist;
+ pcacheset[which].pcs_nfds = ps->ps_nfds;
+ pcacheset[which].pcs_usradr = (uintptr_t)fds;
+
+ /*
+	 * We have saved a copy of the current poll fd list in one
+	 * pollcacheset. The 'revents' field of the new list is not yet set
+	 * to 0. Looping through the new list just to do that would be
+	 * expensive, so we do it while polling the list.
+ */
+ for (i = 0; i < ps->ps_nfds; i++) {
+ fd = pollfdp[i].fd;
+ /*
+ * We also filter out the illegal poll events in the event
+ * field for the cached poll list/set.
+ */
+ if (pollfdp[i].events & ~VALID_POLL_EVENTS) {
+ newfdlist[i].events = pollfdp[i].events =
+ pollfdp[i].events & VALID_POLL_EVENTS;
+ }
+ if (fd < 0) {
+ pollfdp[i].revents = 0;
+ continue;
+ }
+ if ((fp = getf(fd)) == NULL) {
+ pollfdp[i].revents = POLLNVAL;
+ /*
+ * invalidate this cache entry in the cached poll list
+ */
+ newfdlist[i].fd = -1;
+ (*fdcntp)++;
+ continue;
+ }
+ /*
+ * cache this fd.
+ */
+ error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i,
+ which);
+ releasef(fd);
+ if (error) {
+ /*
+			 * Here we are halfway through caching a new
+			 * poll list. Undo everything.
+ */
+ pcacheset_remove_list(ps, pollfdp, 0, i, which, 0);
+ kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t));
+ pcacheset[which].pcs_pollfd = NULL;
+ pcacheset[which].pcs_usradr = NULL;
+ break;
+ }
+ }
+ return (error);
+}
+
+/*
+ * called by pollcacheclean() to set the fp to NULL. It also sets the polled
+ * events in pcacheset entries to the special event 'POLLCLOSED'. Do a
+ * pollwakeup to wake any sleeping poller, then remove the polldat from the
+ * driver. The routine is called with ps_lock held.
+ */
+void
+pcache_clean_entry(pollstate_t *ps, int fd)
+{
+ pollcache_t *pcp;
+ polldat_t *pdp;
+ int i;
+
+ ASSERT(ps != NULL);
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+ pcp = ps->ps_pcache;
+ ASSERT(pcp);
+ pdp = pcache_lookup_fd(pcp, fd);
+ ASSERT(pdp != NULL);
+ /*
+ * the corresponding fpollinfo in fi_list has been removed by
+ * a close on this fd. Reset the cached fp ptr here.
+ */
+ pdp->pd_fp = NULL;
+ /*
+ * XXX - This routine also touches data in pcacheset struct.
+ *
+	 * set the event in the cached poll lists to POLLCLOSED. This
+	 * invalidates the cached poll fd entry in that poll list, which will
+	 * force a removal of this cached entry in the next poll(). The
+	 * cleanup is done at removal time.
+ */
+ ASSERT(pdp->pd_ref != NULL);
+ for (i = 0; i < ps->ps_nsets; i++) {
+ xref_t *refp;
+ pollcacheset_t *pcsp;
+
+ refp = &pdp->pd_ref[i];
+ if (refp->xf_refcnt) {
+ ASSERT(refp->xf_position >= 0);
+ pcsp = &ps->ps_pcacheset[i];
+ if (refp->xf_refcnt == 1) {
+ pcsp->pcs_pollfd[refp->xf_position].events =
+ (short)POLLCLOSED;
+ }
+ if (refp->xf_refcnt > 1) {
+ int j;
+ /*
+ * mark every matching entry in pcs_pollfd
+ */
+ for (j = refp->xf_position;
+ j < pcsp->pcs_nfds; j++) {
+ if (pcsp->pcs_pollfd[j].fd == fd) {
+ pcsp->pcs_pollfd[j].events =
+ (short)POLLCLOSED;
+ }
+ }
+ }
+ }
+ }
+ if (pdp->pd_php) {
+ pollwakeup(pdp->pd_php, POLLHUP);
+ pollhead_delete(pdp->pd_php, pdp);
+ pdp->pd_php = NULL;
+ }
+}
+
+/*
+ * This is the first time this thread has ever polled,
+ * so we have to create its pollstate structure.
+ * This will persist for the life of the thread,
+ * until it calls pollcleanup().
+ */
+pollstate_t *
+pollstate_create(void)
+{
+ pollstate_t *ps;
+
+ ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP);
+ ps->ps_nsets = POLLFDSETS;
+ ps->ps_pcacheset = pcacheset_create(ps->ps_nsets);
+ return (ps);
+}
+
+void
+pollstate_destroy(pollstate_t *ps)
+{
+ if (ps->ps_pollfd != NULL) {
+ kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t));
+ ps->ps_pollfd = NULL;
+ }
+ if (ps->ps_pcache != NULL) {
+ pcache_destroy(ps->ps_pcache);
+ ps->ps_pcache = NULL;
+ }
+ pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets);
+ ps->ps_pcacheset = NULL;
+ if (ps->ps_dpbuf != NULL) {
+ kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t));
+ ps->ps_dpbuf = NULL;
+ }
+ mutex_destroy(&ps->ps_lock);
+ kmem_free(ps, sizeof (pollstate_t));
+}
+
+/*
+ * We are holding the appropriate uf_lock when entering this routine.
+ * Bump up the pc_busy count to prevent the thread from exiting.
+ */
+void
+pollblockexit(fpollinfo_t *fpip)
+{
+ for (; fpip; fpip = fpip->fp_next) {
+ pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache;
+
+ mutex_enter(&pcp->pc_no_exit);
+ pcp->pc_busy++; /* prevents exit()'s */
+ mutex_exit(&pcp->pc_no_exit);
+ }
+}
+
+/*
+ * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark
+ * the pcacheset events field POLLCLOSED to force the next poll() to remove
+ * this cache entry. We can't clean up the polldat entry here because an lwp
+ * blocked in poll() needs the info to return. Wake up anyone blocked in
+ * poll and let the exiting lwp go. No lock is held upon entry, so it's OK
+ * for pcache_clean_entry to call pollwakeup().
+ */
+void
+pollcacheclean(fpollinfo_t *fip, int fd)
+{
+ struct fpollinfo *fpip, *fpip2;
+
+ fpip = fip;
+ while (fpip) {
+ pollstate_t *ps = fpip->fp_thread->t_pollstate;
+ pollcache_t *pcp = ps->ps_pcache;
+
+ mutex_enter(&ps->ps_lock);
+ pcache_clean_entry(ps, fd);
+ mutex_exit(&ps->ps_lock);
+ mutex_enter(&pcp->pc_no_exit);
+ pcp->pc_busy--;
+ if (pcp->pc_busy == 0) {
+ /*
+ * Wakeup the thread waiting in
+ * thread_exit().
+ */
+ cv_signal(&pcp->pc_busy_cv);
+ }
+ mutex_exit(&pcp->pc_no_exit);
+
+ fpip2 = fpip;
+ fpip = fpip->fp_next;
+ kmem_free(fpip2, sizeof (fpollinfo_t));
+ }
+}
+
+/*
+ * one of the cache lines' counters is wrapping around. Reset all cache line
+ * counters to zero except one. This is simplistic, but probably works
+ * effectively.
+ */
+void
+pcacheset_reset_count(pollstate_t *ps, int index)
+{
+ int i;
+
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+ for (i = 0; i < ps->ps_nsets; i++) {
+ if (ps->ps_pcacheset[i].pcs_pollfd != NULL) {
+ ps->ps_pcacheset[i].pcs_count = 0;
+ }
+ }
+ ps->ps_pcacheset[index].pcs_count = 1;
+}
+
+/*
+ * this routine implements the poll cache list replacement policy.
+ * It currently chooses the "least used" list.
+ */
+int
+pcacheset_replace(pollstate_t *ps)
+{
+ int i;
+ int index = 0;
+
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+ for (i = 1; i < ps->ps_nsets; i++) {
+ if (ps->ps_pcacheset[index].pcs_count >
+ ps->ps_pcacheset[i].pcs_count) {
+ index = i;
+ }
+ }
+ ps->ps_pcacheset[index].pcs_count = 0;
+ return (index);
+}
+
+/*
+ * this routine is called by strclose to remove any remaining polldat structs
+ * on the pollhead list of the device being closed. There are two reasons why
+ * the polldat structures still remain on the pollhead list:
+ *
+ * (1) The layered device (e.g. the console driver).
+ * In this case, the existence of a polldat implies that the thread putting
+ * the polldat on this list has not exited yet. Before the thread exits, it
+ * will have to hold this pollhead lock to remove the polldat. So holding the
+ * pollhead lock here effectively prevents the thread which put the polldat
+ * on this list from exiting.
+ *
+ * (2) /dev/poll.
+ * When a polled fd is cached in /dev/poll, its polldat will remain on the
+ * pollhead list if the process has not done a POLLREMOVE before closing the
+ * polled fd. We just unlink it here.
+ */
+void
+pollhead_clean(pollhead_t *php)
+{
+ polldat_t *pdp;
+
+ /*
+	 * In case (1), while we must prevent the thread in question from
+ * exiting, we must also obey the proper locking order, i.e.
+ * (ps_lock -> phlock).
+ */
+ PH_ENTER(php);
+ while (php->ph_list != NULL) {
+ pollstate_t *ps;
+ pollcache_t *pcp;
+
+ pdp = php->ph_list;
+ ASSERT(pdp->pd_php == php);
+ if (pdp->pd_thread == NULL) {
+ /*
+			 * This is case (2). Since the ph_lock is sufficient
+ * to synchronize this lwp with any other /dev/poll
+ * lwp, just unlink the polldat.
+ */
+ php->ph_list = pdp->pd_next;
+ pdp->pd_php = NULL;
+ pdp->pd_next = NULL;
+ continue;
+ }
+ ps = pdp->pd_thread->t_pollstate;
+ ASSERT(ps != NULL);
+ pcp = pdp->pd_pcache;
+ ASSERT(pcp != NULL);
+ mutex_enter(&pcp->pc_no_exit);
+ pcp->pc_busy++; /* prevents exit()'s */
+ mutex_exit(&pcp->pc_no_exit);
+ /*
+ * Now get the locks in proper order to avoid deadlock.
+ */
+ PH_EXIT(php);
+ mutex_enter(&ps->ps_lock);
+ /*
+		 * while we dropped the pollhead lock, the element could have
+		 * been taken off the list already.
+ */
+ PH_ENTER(php);
+ if (pdp->pd_php == php) {
+ ASSERT(pdp == php->ph_list);
+ php->ph_list = pdp->pd_next;
+ pdp->pd_php = NULL;
+ pdp->pd_next = NULL;
+ }
+ PH_EXIT(php);
+ mutex_exit(&ps->ps_lock);
+ mutex_enter(&pcp->pc_no_exit);
+ pcp->pc_busy--;
+ if (pcp->pc_busy == 0) {
+ /*
+ * Wakeup the thread waiting in
+ * thread_exit().
+ */
+ cv_signal(&pcp->pc_busy_cv);
+ }
+ mutex_exit(&pcp->pc_no_exit);
+ PH_ENTER(php);
+ }
+ PH_EXIT(php);
+}
+
+/*
+ * The remove_list is called to clean up a partially cached 'current' list or
+ * to remove a partial list which is no longer cached. A flag value of 1
+ * indicates the second case.
+ */
+void
+pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end,
+ int cacheindex, int flag)
+{
+ int i;
+
+ ASSERT(MUTEX_HELD(&ps->ps_lock));
+ for (i = start; i < end; i++) {
+ if ((pollfdp[i].fd >= 0) &&
+ (flag || !(pollfdp[i].revents & POLLNVAL))) {
+ if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex,
+ (uint_t)pollfdp[i].events)) {
+ int j;
+ int fd = pollfdp[i].fd;
+
+ for (j = i + 1; j < end; j++) {
+ if (pollfdp[j].fd == fd) {
+ pcache_update_xref(
+ ps->ps_pcache, fd,
+ (ssize_t)j, cacheindex);
+ break;
+ }
+ }
+ ASSERT(j <= end);
+ }
+ }
+ }
+}
+
+#ifdef DEBUG
+
+#include <sys/strsubr.h>
+/*
+ * make sure curthread is not on anyone's pollhead list any more.
+ */
+static void
+pollcheckphlist()
+{
+ int i;
+ file_t *fp;
+ uf_entry_t *ufp;
+ uf_info_t *fip = P_FINFO(curproc);
+ struct stdata *stp;
+ polldat_t *pdp;
+
+ mutex_enter(&fip->fi_lock);
+ for (i = 0; i < fip->fi_nfiles; i++) {
+ UF_ENTER(ufp, fip, i);
+ if ((fp = ufp->uf_file) != NULL) {
+ if ((stp = fp->f_vnode->v_stream) != NULL) {
+ PH_ENTER(&stp->sd_pollist);
+ pdp = stp->sd_pollist.ph_list;
+ while (pdp) {
+ ASSERT(pdp->pd_thread != curthread);
+ pdp = pdp->pd_next;
+ }
+ PH_EXIT(&stp->sd_pollist);
+ }
+ }
+ UF_EXIT(ufp);
+ }
+ mutex_exit(&fip->fi_lock);
+}
+
+/*
+ * for a resolved set's poll list, the xref info in the pcache should be
+ * consistent with this poll list.
+ */
+static int
+pollcheckxref(pollstate_t *ps, int cacheindex)
+{
+ pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd;
+ pollcache_t *pcp = ps->ps_pcache;
+ polldat_t *pdp;
+ int i;
+ xref_t *refp;
+
+ for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) {
+ if (pollfdp[i].fd < 0) {
+ continue;
+ }
+ pdp = pcache_lookup_fd(pcp, pollfdp[i].fd);
+ ASSERT(pdp != NULL);
+ ASSERT(pdp->pd_ref != NULL);
+ refp = &pdp->pd_ref[cacheindex];
+ if (refp->xf_position >= 0) {
+ ASSERT(refp->xf_refcnt >= 1);
+ ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd);
+ if (refp->xf_refcnt > 1) {
+ int j;
+ int count = 0;
+
+ for (j = refp->xf_position;
+ j < ps->ps_pcacheset[cacheindex].pcs_nfds;
+ j++) {
+ if (pollfdp[j].fd == pdp->pd_fd) {
+ count++;
+ }
+ }
+ ASSERT(count == refp->xf_refcnt);
+ }
+ }
+ }
+ return (1);
+}
+
+/*
+ * For every cached pollfd, its polldat struct should be consistent with
+ * what is in the pcacheset lists.
+ */
+static void
+checkpolldat(pollstate_t *ps)
+{
+ pollcache_t *pcp = ps->ps_pcache;
+ polldat_t **hashtbl;
+ int i;
+
+ hashtbl = pcp->pc_hash;
+ for (i = 0; i < pcp->pc_hashsize; i++) {
+ polldat_t *pdp;
+
+ for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
+ ASSERT(pdp->pd_ref != NULL);
+ if (pdp->pd_count > 0) {
+ xref_t *refp;
+ int j;
+ pollcacheset_t *pcsp;
+ pollfd_t *pollfd;
+
+ for (j = 0; j < ps->ps_nsets; j++) {
+ refp = &pdp->pd_ref[j];
+ if (refp->xf_refcnt > 0) {
+ pcsp = &ps->ps_pcacheset[j];
+ ASSERT(refp->xf_position < pcsp->pcs_nfds);
+ pollfd = pcsp->pcs_pollfd;
+ ASSERT(pdp->pd_fd == pollfd[refp->xf_position].fd);
+ }
+ }
+ }
+ }
+ }
+}
+
+/*
+ * every wfd element on ph_list must have a corresponding fpollinfo on the
+ * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks.
+ */
+void
+checkwfdlist(vnode_t *vp, fpollinfo_t *fpip)
+{
+ stdata_t *stp;
+ polldat_t *pdp;
+ fpollinfo_t *fpip2;
+
+ if ((stp = vp->v_stream) == NULL) {
+ return;
+ }
+ PH_ENTER(&stp->sd_pollist);
+ for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) {
+ if (pdp->pd_thread->t_procp == curthread->t_procp) {
+ for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) {
+ if (pdp->pd_thread == fpip2->fp_thread) {
+ break;
+ }
+ }
+ ASSERT(fpip2 != NULL);
+ }
+ }
+ PH_EXIT(&stp->sd_pollist);
+}
+
+/*
+ * For each cached fd whose bit is not set in the bitmap, its revents field
+ * in the current poll list should be 0.
+ */
+static int
+pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex)
+{
+ pollcache_t *pcp = ps->ps_pcache;
+ pollfd_t *pollfdp = ps->ps_pollfd;
+ int i;
+
+ for (i = begin; i < end; i++) {
+ polldat_t *pdp;
+
+ ASSERT(!BT_TEST(pcp->pc_bitmap, i));
+ pdp = pcache_lookup_fd(pcp, i);
+ if (pdp && pdp->pd_fp != NULL) {
+ xref_t *refp;
+ int entry;
+
+ ASSERT(pdp->pd_ref != NULL);
+ refp = &pdp->pd_ref[cacheindex];
+ if (refp->xf_refcnt == 0) {
+ continue;
+ }
+ entry = refp->xf_position;
+ ASSERT(entry >= 0);
+ ASSERT(pollfdp[entry].revents == 0);
+ if (refp->xf_refcnt > 1) {
+ int j;
+
+ for (j = entry + 1; j < ps->ps_nfds; j++) {
+ if (pollfdp[j].fd == i) {
+ ASSERT(pollfdp[j].revents == 0);
+ }
+ }
+ }
+ }
+ }
+ return (1);
+}
+
+#endif /* DEBUG */
+
+pollcache_t *
+pcache_alloc()
+{
+ return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP));
+}
+
+void
+pcache_create(pollcache_t *pcp, nfds_t nfds)
+{
+ size_t mapsize;
+
+ /*
+ * allocate enough bits for the poll fd list
+ */
+ if ((mapsize = POLLMAPCHUNK) <= nfds) {
+ mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1);
+ }
+ pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t),
+ KM_SLEEP);
+ pcp->pc_mapsize = mapsize;
+ /*
+	 * The hash size is at least POLLHASHCHUNKSZ. If the user polls a
+	 * large number of fds to start with, allocate a bigger hash table (to the
+ * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a
+ * hash table is expensive.
+ */
+ if (nfds < POLLHASHCHUNKSZ) {
+ pcp->pc_hashsize = POLLHASHCHUNKSZ;
+ } else {
+ pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
+ ~(POLLHASHCHUNKSZ - 1);
+ }
+ pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
+ KM_SLEEP);
+}
+
+void
+pcache_destroy(pollcache_t *pcp)
+{
+ polldat_t **hashtbl;
+ int i;
+
+ hashtbl = pcp->pc_hash;
+ for (i = 0; i < pcp->pc_hashsize; i++) {
+ if (hashtbl[i] != NULL) {
+ polldat_t *pdp, *pdp2;
+
+ pdp = hashtbl[i];
+ while (pdp != NULL) {
+ pdp2 = pdp->pd_hashnext;
+ if (pdp->pd_ref != NULL) {
+ kmem_free(pdp->pd_ref, sizeof (xref_t) *
+ pdp->pd_nsets);
+ }
+ kmem_free(pdp, sizeof (polldat_t));
+ pdp = pdp2;
+ pcp->pc_fdcount--;
+ }
+ }
+ }
+ ASSERT(pcp->pc_fdcount == 0);
+ kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize);
+ kmem_free(pcp->pc_bitmap,
+ sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL));
+ mutex_destroy(&pcp->pc_no_exit);
+ mutex_destroy(&pcp->pc_lock);
+ cv_destroy(&pcp->pc_cv);
+ cv_destroy(&pcp->pc_busy_cv);
+ kmem_free(pcp, sizeof (pollcache_t));
+}
+
+pollcacheset_t *
+pcacheset_create(int nsets)
+{
+ return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP));
+}
+
+void
+pcacheset_destroy(pollcacheset_t *pcsp, int nsets)
+{
+ int i;
+
+ for (i = 0; i < nsets; i++) {
+ if (pcsp[i].pcs_pollfd != NULL) {
+ kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds *
+ sizeof (pollfd_t));
+ }
+ }
+ kmem_free(pcsp, sizeof (pollcacheset_t) * nsets);
+}
+
+/*
+ * Check each duplicated poll fd in the poll list. It may be necessary to
+ * VOP_POLL the same fd again using different poll events. getf() has been
+ * done by caller. This routine returns 0 if it can sucessfully process the
+ * entire poll fd list. It returns -1 if underlying vnode has changed during
+ * a VOP_POLL, in which case the caller has to repoll. It returns a positive
+ * value if VOP_POLL failed.
+ */
+static int
+plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp,
+ int entry, int *fdcntp)
+{
+ int i;
+ int fd;
+ nfds_t nfds = psp->ps_nfds;
+
+ fd = pollfdp[entry].fd;
+ for (i = entry + 1; i < nfds; i++) {
+ if (pollfdp[i].fd == fd) {
+ if (pollfdp[i].events == pollfdp[entry].events) {
+ if ((pollfdp[i].revents =
+ pollfdp[entry].revents) != 0) {
+ (*fdcntp)++;
+ }
+ } else {
+
+ int error;
+ pollhead_t *php;
+ pollcache_t *pcp = psp->ps_pcache;
+
+ /*
+				 * the events are different. Call VOP_POLL on
+				 * this fd so that we don't miss any revents.
+ */
+ php = NULL;
+ ASSERT(curthread->t_pollcache == NULL);
+ error = VOP_POLL(fp->f_vnode,
+ pollfdp[i].events, 0,
+ &pollfdp[i].revents, &php);
+ if (error) {
+ return (error);
+ }
+ /*
+				 * layered devices (e.g. console driver)
+ * may change the vnode and thus the pollhead
+ * pointer out from underneath us.
+ */
+ if (php != NULL && pdp->pd_php != NULL &&
+ php != pdp->pd_php) {
+ pollhead_delete(pdp->pd_php, pdp);
+ pdp->pd_php = php;
+ pollhead_insert(php, pdp);
+ /*
+ * We could have missed a wakeup on the
+ * new target device. Make sure the new
+ * target gets polled once.
+ */
+ BT_SET(pcp->pc_bitmap, fd);
+ return (-1);
+ }
+ if (pollfdp[i].revents) {
+ (*fdcntp)++;
+ }
+ }
+ }
+ }
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/ppriv.c b/usr/src/uts/common/syscall/ppriv.c
new file mode 100644
index 0000000000..817c4fc83b
--- /dev/null
+++ b/usr/src/uts/common/syscall/ppriv.c
@@ -0,0 +1,333 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/cred_impl.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/debug.h>
+#include <sys/priv_impl.h>
+#include <sys/policy.h>
+#include <sys/ddi.h>
+#include <sys/thread.h>
+#include <c2/audit.h>
+
+/*
+ * System call support for manipulating privileges.
+ *
+ * setppriv(2) - set process privilege set
+ * getppriv(2) - get process privilege set
+ * getprivimplinfo(2) - get process privilege implementation information
+ * setpflags(2) - set process (privilege) flags
+ * getpflags(2) - get process (privilege) flags
+ */
+
+/*
+ * setppriv (priv_op_t, priv_ptype_t, priv_set_t)
+ */
+static int
+setppriv(priv_op_t op, priv_ptype_t type, priv_set_t *in_pset)
+{
+ priv_set_t pset, *target;
+ cred_t *cr, *pcr;
+ proc_t *p;
+ boolean_t donocd;
+
+ if (!PRIV_VALIDSET(type) || !PRIV_VALIDOP(op))
+ return (set_errno(EINVAL));
+
+ if (copyin(in_pset, &pset, sizeof (priv_set_t)))
+ return (set_errno(EFAULT));
+
+ p = ttoproc(curthread);
+ cr = cralloc();
+ mutex_enter(&p->p_crlock);
+
+ pcr = p->p_cred;
+
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_setppriv(op, type, &pset, pcr);
+#endif
+
+ /*
+ * Filter out disallowed requests (bad op or bad type).
+ */
+ switch (op) {
+ case PRIV_ON:
+ case PRIV_SET:
+ /*
+ * Turning on privileges; the limit set cannot grow,
+ * other sets can but only as long as they remain subsets
+ * of P. Only immediately after exec holds that P <= L.
+ */
+ if (((type == PRIV_LIMIT &&
+ !priv_issubset(&pset, &CR_LPRIV(pcr))) ||
+ !priv_issubset(&pset, &CR_OPPRIV(pcr))) &&
+ !priv_issubset(&pset, priv_getset(pcr, type))) {
+ mutex_exit(&p->p_crlock);
+ crfree(cr);
+ return (set_errno(EPERM));
+ }
+ break;
+
+ case PRIV_OFF:
+ /* PRIV_OFF is always allowed */
+ break;
+ }
+
+ /*
+ * All checks passed; do the cred copy-on-write.
+ */
+ crcopy_to(pcr, cr);
+
+ /*
+ * If we change the effective, permitted or limit set, we attain
+ * "privilege awareness".
+ */
+ if (type != PRIV_INHERITABLE)
+ priv_set_PA(cr);
+
+ target = &(CR_PRIVS(cr)->crprivs[type]);
+
+ switch (op) {
+ case PRIV_ON:
+ priv_union(&pset, target);
+ break;
+ case PRIV_OFF:
+ priv_inverse(&pset);
+ priv_intersect(target, &pset);
+
+ /*
+ * Fall-thru to set target and change other process
+ * privilege sets.
+ */
+ /*FALLTHRU*/
+
+ case PRIV_SET:
+ *target = pset;
+
+ /*
+ * Take privileges no longer permitted out
+ * of other effective sets as well.
+ * Limit set is enforced at exec() time.
+ */
+ if (type == PRIV_PERMITTED)
+ priv_intersect(&pset, &CR_EPRIV(cr));
+ break;
+ }
+
+ /*
+ * When we give up privileges not in the inheritable set,
+ * set SNOCD if not already set; first we compute the
+ * privileges removed from P using Diff = (~P') & P
+ * and then we check whether the removed privileges are
+ * a subset of I. If we retain uid 0, all privileges
+ * are required anyway so don't set SNOCD.
+ */
+ if (type == PRIV_PERMITTED && (p->p_flag & SNOCD) == 0 &&
+ cr->cr_uid != 0 && cr->cr_ruid != 0 && cr->cr_suid != 0) {
+ priv_set_t diff = CR_OPPRIV(cr);
+ priv_inverse(&diff);
+ priv_intersect(&CR_OPPRIV(pcr), &diff);
+ donocd = !priv_issubset(&diff, &CR_IPRIV(cr));
+ } else {
+ donocd = B_FALSE;
+ }
+
+ p->p_cred = cr;
+ mutex_exit(&p->p_crlock);
+
+ if (donocd) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ }
+
+ crset(p, cr); /* broadcast to process threads */
+
+ return (0);
+}
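
The Diff = (~P') & P step above is plain bitwise set difference. A standalone sketch with machine words standing in for priv_set_t (the values are made up for illustration):

#include <stdio.h>

int
main(void)
{
	unsigned int p_old = 0x0f;	/* P, old permitted set */
	unsigned int p_new = 0x03;	/* P', new permitted set */
	unsigned int i_set = 0x01;	/* I, inheritable set */
	/* Privileges removed from P: Diff = (~P') & P */
	unsigned int diff = ~p_new & p_old;

	/* SNOCD would be set iff Diff is not a subset of I. */
	(void) printf("diff 0x%x, subset of I: %d\n",
	    diff, (diff & ~i_set) == 0);
	return (0);
}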
+
+/*
+ * getppriv (priv_ptype_t, priv_set_t *)
+ */
+static int
+getppriv(priv_ptype_t type, priv_set_t *pset)
+{
+ if (!PRIV_VALIDSET(type))
+ return (set_errno(EINVAL));
+
+ if (copyout(priv_getset(CRED(), type), pset, sizeof (priv_set_t)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+static int
+getprivimplinfo(void *buf, size_t bufsize)
+{
+ int err;
+
+ err = copyout(priv_hold_implinfo(), buf, min(bufsize, privinfosize));
+
+ priv_release_implinfo();
+
+ if (err)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+/*
+ * Set privilege flags
+ *
+ * For now we cheat: the flags are actually bit masks, which lets us
+ * simplify the code somewhat; we do make sure that the arguments are
+ * valid, though.
+ */
+
+static int
+setpflags(uint_t flag, uint_t val)
+{
+ cred_t *cr, *pcr;
+ proc_t *p = curproc;
+ uint_t newflags;
+
+ if (val > 1 || (flag != PRIV_DEBUG && flag != PRIV_AWARE &&
+ flag != __PROC_PROTECT)) {
+ return (set_errno(EINVAL));
+ }
+
+ if (flag == __PROC_PROTECT) {
+ mutex_enter(&p->p_lock);
+ if (val == 0)
+ p->p_flag &= ~SNOCD;
+ else
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ return (0);
+ }
+
+ cr = cralloc();
+
+ mutex_enter(&p->p_crlock);
+
+ pcr = p->p_cred;
+
+ newflags = CR_FLAGS(pcr);
+
+ if (val != 0)
+ newflags |= flag;
+ else
+ newflags &= ~flag;
+
+ /* No change */
+ if (CR_FLAGS(pcr) == newflags) {
+ mutex_exit(&p->p_crlock);
+ crfree(cr);
+ return (0);
+ }
+
+ /* Trying to unset PA; if we can't, return an error */
+ if (flag == PRIV_AWARE && val == 0 && !priv_can_clear_PA(pcr)) {
+ mutex_exit(&p->p_crlock);
+ crfree(cr);
+ return (set_errno(EPERM));
+ }
+
+ /* Committed to changing the flag */
+ crcopy_to(pcr, cr);
+ if (flag == PRIV_AWARE) {
+ if (val != 0)
+ priv_set_PA(cr);
+ else
+ priv_adjust_PA(cr);
+ } else {
+ CR_FLAGS(cr) = newflags;
+ }
+
+ p->p_cred = cr;
+
+ mutex_exit(&p->p_crlock);
+
+ crset(p, cr);
+
+ return (0);
+}
+
+/*
+ * Getpflags. Currently implements only single-bit flags.
+ */
+static uint_t
+getpflags(uint_t flag)
+{
+ if (flag != PRIV_DEBUG && flag != PRIV_AWARE)
+ return (set_errno(EINVAL));
+
+ return ((CR_FLAGS(CRED()) & flag) != 0);
+}
+
+/*
+ * Privilege system call entry point
+ */
+int
+privsys(int code, priv_op_t op, priv_ptype_t type, void *buf, size_t bufsize)
+{
+ switch (code) {
+ case PRIVSYS_SETPPRIV:
+ if (bufsize < sizeof (priv_set_t))
+ return (set_errno(ENOMEM));
+ return (setppriv(op, type, buf));
+ case PRIVSYS_GETPPRIV:
+ if (bufsize < sizeof (priv_set_t))
+ return (set_errno(ENOMEM));
+ return (getppriv(type, buf));
+ case PRIVSYS_GETIMPLINFO:
+ return (getprivimplinfo(buf, bufsize));
+ case PRIVSYS_SETPFLAGS:
+ return (setpflags((uint_t)op, (uint_t)type));
+ case PRIVSYS_GETPFLAGS:
+ return ((int)getpflags((uint_t)op));
+
+ }
+ return (set_errno(EINVAL));
+}
+
+#ifdef _SYSCALL32_IMPL
+int
+privsys32(int code, priv_op_t op, priv_ptype_t type, caddr32_t *buf,
+ size32_t bufsize)
+{
+ return (privsys(code, op, type, (void *)buf, (size_t)bufsize));
+}
+#endif
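
A userland sketch of these interfaces via the priv_allocset(3C) and setppriv(2) wrappers; the privilege chosen is arbitrary and error handling is trimmed:

#include <priv.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	priv_set_t *pset = priv_allocset();

	/* Drop file_dac_read from the effective set. */
	priv_emptyset(pset);
	(void) priv_addset(pset, PRIV_FILE_DAC_READ);
	if (setppriv(PRIV_OFF, PRIV_EFFECTIVE, pset) != 0)
		perror("setppriv");

	/* Read back the effective set and print it. */
	if (getppriv(PRIV_EFFECTIVE, pset) == 0) {
		char *str = priv_set_to_str(pset, ',', 0);
		(void) printf("E = %s\n", str);
		free(str);
	}
	priv_freeset(pset);
	return (0);
}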
diff --git a/usr/src/uts/common/syscall/processor_bind.c b/usr/src/uts/common/syscall/processor_bind.c
new file mode 100644
index 0000000000..10ca1178d5
--- /dev/null
+++ b/usr/src/uts/common/syscall/processor_bind.c
@@ -0,0 +1,375 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/var.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/kstat.h>
+#include <sys/uadmin.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/procset.h>
+#include <sys/processor.h>
+#include <sys/debug.h>
+#include <sys/task.h>
+#include <sys/project.h>
+#include <sys/zone.h>
+#include <sys/contract_impl.h>
+#include <sys/contract/process_impl.h>
+
+/*
+ * Bind all the threads of a process to a CPU.
+ */
+static int
+cpu_bind_process(proc_t *pp, processorid_t bind, processorid_t *obind,
+ int *error)
+{
+ kthread_t *tp;
+ kthread_t *fp;
+ int err = 0;
+ int i;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ /* skip kernel processes */
+ if (pp->p_flag & SSYS) {
+ *obind = PBIND_NONE;
+ return (0);
+ }
+
+ mutex_enter(&pp->p_lock);
+ tp = pp->p_tlist;
+ if (tp != NULL) {
+ fp = tp;
+ do {
+ i = cpu_bind_thread(tp, bind, obind, error);
+ if (err == 0)
+ err = i;
+ } while ((tp = tp->t_forw) != fp);
+ }
+
+ mutex_exit(&pp->p_lock);
+ return (err);
+}
+
+/*
+ * Bind all the processes of a task to a CPU.
+ */
+static int
+cpu_bind_task(task_t *tk, processorid_t bind, processorid_t *obind,
+ int *error)
+{
+ proc_t *p;
+ int err = 0;
+ int i;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ if ((p = tk->tk_memb_list) == NULL)
+ return (ESRCH);
+
+ do {
+ i = cpu_bind_process(p, bind, obind, error);
+ if (err == 0)
+ err = i;
+ } while ((p = p->p_tasknext) != tk->tk_memb_list);
+
+ return (err);
+}
+
+/*
+ * Bind all the processes in a project to a CPU.
+ */
+static int
+cpu_bind_project(kproject_t *kpj, processorid_t bind, processorid_t *obind,
+ int *error)
+{
+ proc_t *p;
+ int err = 0;
+ int i;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ for (p = practive; p != NULL; p = p->p_next) {
+ if (p->p_tlist == NULL)
+ continue;
+ if (p->p_task->tk_proj == kpj) {
+ i = cpu_bind_process(p, bind, obind, error);
+ if (err == 0)
+ err = i;
+ }
+ }
+ return (err);
+}
+
+/*
+ * Bind all the processes in a zone to a CPU.
+ */
+int
+cpu_bind_zone(zone_t *zptr, processorid_t bind, processorid_t *obind,
+ int *error)
+{
+ proc_t *p;
+ int err = 0;
+ int i;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ for (p = practive; p != NULL; p = p->p_next) {
+ if (p->p_tlist == NULL)
+ continue;
+ if (p->p_zone == zptr) {
+ i = cpu_bind_process(p, bind, obind, error);
+ if (err == 0)
+ err = i;
+ }
+ }
+ return (err);
+}
+
+/*
+ * Bind all the processes in a process contract to a CPU.
+ */
+int
+cpu_bind_contract(cont_process_t *ctp, processorid_t bind, processorid_t *obind,
+ int *error)
+{
+ proc_t *p;
+ int err = 0;
+ int i;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ for (p = practive; p != NULL; p = p->p_next) {
+ if (p->p_tlist == NULL)
+ continue;
+ if (p->p_ct_process == ctp) {
+ i = cpu_bind_process(p, bind, obind, error);
+ if (err == 0)
+ err = i;
+ }
+ }
+ return (err);
+}
+
+/*
+ * processor_bind(2) - Processor binding interfaces.
+ */
+int
+processor_bind(idtype_t idtype, id_t id, processorid_t bind,
+ processorid_t *obindp)
+{
+ processorid_t obind = PBIND_NONE;
+ int ret = 0;
+ int err = 0;
+ cpu_t *cp;
+ kthread_id_t tp;
+ proc_t *pp;
+ task_t *tk;
+ kproject_t *kpj;
+ zone_t *zptr;
+ contract_t *ct;
+
+ /*
+ * Since we might be making a binding to a processor, hold the
+ * cpu_lock so that the processor cannot be taken offline while
+ * we do this.
+ */
+ mutex_enter(&cpu_lock);
+
+ /*
+ * Check that the binding processor ID is valid.
+ */
+ switch (bind) {
+ default:
+ if ((cp = cpu_get(bind)) == NULL ||
+ (cp->cpu_flags & (CPU_QUIESCED | CPU_OFFLINE)))
+ ret = EINVAL;
+ else if ((cp->cpu_flags & CPU_READY) == 0)
+ ret = EIO;
+ break;
+
+ case PBIND_NONE:
+ case PBIND_QUERY:
+ break;
+ }
+
+ if (ret) {
+ mutex_exit(&cpu_lock);
+ return (set_errno(ret));
+ }
+
+ switch (idtype) {
+ case P_LWPID:
+ pp = curproc;
+ mutex_enter(&pp->p_lock);
+ if (id == P_MYID) {
+ ret = cpu_bind_thread(curthread, bind, &obind, &err);
+ } else {
+ int found = 0;
+
+ tp = pp->p_tlist;
+ do {
+ if (tp->t_tid == id) {
+ ret = cpu_bind_thread(tp,
+ bind, &obind, &err);
+ found = 1;
+ break;
+ }
+ } while ((tp = tp->t_forw) != pp->p_tlist);
+ if (!found)
+ ret = ESRCH;
+ }
+ mutex_exit(&pp->p_lock);
+ break;
+
+ case P_PID:
+ /*
+ * Note: we cannot use dotoprocs() here because it doesn't find
+ * system class processes, which are legal to query.
+ */
+ mutex_enter(&pidlock);
+ if (id == P_MYID) {
+ ret = cpu_bind_process(curproc, bind, &obind, &err);
+ } else if ((pp = prfind(id)) != NULL) {
+ ret = cpu_bind_process(pp, bind, &obind, &err);
+ } else {
+ ret = ESRCH;
+ }
+ mutex_exit(&pidlock);
+ break;
+
+ case P_TASKID:
+ mutex_enter(&pidlock);
+ if (id == P_MYID) {
+ proc_t *p = curproc;
+ id = p->p_task->tk_tkid;
+ }
+
+ if ((tk = task_hold_by_id(id)) != NULL) {
+ ret = cpu_bind_task(tk, bind, &obind, &err);
+ mutex_exit(&pidlock);
+ task_rele(tk);
+ } else {
+ mutex_exit(&pidlock);
+ ret = ESRCH;
+ }
+ break;
+
+ case P_PROJID:
+ if (id == P_MYID)
+ id = curprojid();
+ if ((kpj = project_hold_by_id(id, getzoneid(),
+ PROJECT_HOLD_FIND)) == NULL) {
+ ret = ESRCH;
+ } else {
+ mutex_enter(&pidlock);
+ ret = cpu_bind_project(kpj, bind, &obind, &err);
+ mutex_exit(&pidlock);
+ project_rele(kpj);
+ }
+ break;
+
+ case P_ZONEID:
+ if (id == P_MYID)
+ id = getzoneid();
+
+ if ((zptr = zone_find_by_id(id)) == NULL) {
+ ret = ESRCH;
+ } else {
+ mutex_enter(&pidlock);
+ ret = cpu_bind_zone(zptr, bind, &obind, &err);
+ mutex_exit(&pidlock);
+ zone_rele(zptr);
+ }
+ break;
+
+ case P_CTID:
+ if (id == P_MYID)
+ id = PRCTID(curproc);
+
+ if ((ct = contract_type_ptr(process_type, id,
+ curproc->p_zone->zone_uniqid)) == NULL) {
+ ret = ESRCH;
+ } else {
+ mutex_enter(&pidlock);
+ ret = cpu_bind_contract(ct->ct_data,
+ bind, &obind, &err);
+ mutex_exit(&pidlock);
+ contract_rele(ct);
+ }
+ break;
+
+ case P_CPUID:
+ if (id == P_MYID || bind != PBIND_NONE || cpu_get(id) == NULL)
+ ret = EINVAL;
+ else
+ ret = cpu_unbind(id);
+ break;
+
+ case P_ALL:
+ if (id == P_MYID || bind != PBIND_NONE) {
+ ret = EINVAL;
+ } else {
+ int i;
+ cpu_t *cp = cpu_list;
+ do {
+ if ((cp->cpu_flags & CPU_EXISTS) == 0)
+ continue;
+ i = cpu_unbind(cp->cpu_id);
+ if (ret == 0)
+ ret = i;
+ } while ((cp = cp->cpu_next) != cpu_list);
+ }
+ break;
+
+ default:
+ /*
+ * Spec says this is invalid, even though we could
+ * handle other idtypes.
+ */
+ ret = EINVAL;
+ break;
+ }
+ mutex_exit(&cpu_lock);
+
+ /*
+ * If no search error occurred, see if any permissions errors did.
+ */
+ if (ret == 0)
+ ret = err;
+
+ if (ret == 0 && obindp != NULL)
+ if (copyout((caddr_t)&obind, (caddr_t)obindp,
+ sizeof (obind)) == -1)
+ ret = EFAULT;
+ return (ret ? set_errno(ret) : 0); /* return success or failure */
+}
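
A userland sketch of the corresponding processor_bind(2) call, assuming processor 0 exists and is online on the machine at hand:

#include <sys/types.h>
#include <sys/processor.h>
#include <sys/procset.h>
#include <stdio.h>

int
main(void)
{
	processorid_t obind;

	/* Bind every LWP of this process to processor 0. */
	if (processor_bind(P_PID, P_MYID, 0, &obind) != 0) {
		perror("processor_bind");
		return (1);
	}
	(void) printf("previous binding: %d\n", (int)obind);

	/* Remove the binding again. */
	(void) processor_bind(P_PID, P_MYID, PBIND_NONE, NULL);
	return (0);
}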
diff --git a/usr/src/uts/common/syscall/processor_info.c b/usr/src/uts/common/syscall/processor_info.c
new file mode 100644
index 0000000000..d080f08e02
--- /dev/null
+++ b/usr/src/uts/common/syscall/processor_info.c
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1992, 1994, 1998 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/var.h>
+#include <sys/thread.h>
+#include <sys/cpuvar.h>
+#include <sys/kstat.h>
+#include <sys/uadmin.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/procset.h>
+#include <sys/processor.h>
+#include <sys/debug.h>
+
+/*
+ * processor_info(2) - return information on a processor.
+ */
+int
+processor_info(processorid_t cpun, processor_info_t *infop)
+{
+ cpu_t *cp;
+ processor_info_t temp;
+
+ mutex_enter(&cpu_lock);
+ if ((cp = cpu_get(cpun)) == NULL) {
+ mutex_exit(&cpu_lock);
+ return (set_errno(EINVAL));
+ }
+ bcopy(&cp->cpu_type_info, &temp, sizeof (temp));
+ mutex_exit(&cpu_lock);
+
+ /*
+ * The spec indicates that the rest of the information is meaningless
+ * if the CPU is offline, but if the machine-dependent layer provides
+ * it, it is probably still accurate. It seems safe to copy it all in
+ * either case.
+ */
+ if (copyout((caddr_t)&temp, (caddr_t)infop,
+ sizeof (processor_info_t)))
+ return (set_errno(EFAULT));
+
+ return (0);
+}
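
A userland sketch of processor_info(2), assuming CPU id 0 is present:

#include <sys/types.h>
#include <sys/processor.h>
#include <stdio.h>

int
main(void)
{
	processor_info_t pi;

	if (processor_info(0, &pi) != 0) {
		perror("processor_info");
		return (1);
	}
	(void) printf("type %s, %d MHz, state %d\n",
	    pi.pi_processor_type, pi.pi_clock, pi.pi_state);
	return (0);
}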
diff --git a/usr/src/uts/common/syscall/profil.c b/usr/src/uts/common/syscall/profil.c
new file mode 100644
index 0000000000..e74ea39824
--- /dev/null
+++ b/usr/src/uts/common/syscall/profil.c
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1998, Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/debug.h>
+
+/*
+ * Profiling.
+ */
+int
+profil(unsigned short *bufbase, size_t bufsize, u_long pcoffset, u_int pcscale)
+{
+ struct proc *p = ttoproc(curthread);
+
+ if (pcscale == 1)
+ pcscale = 0;
+
+ mutex_enter(&p->p_pflock);
+ p->p_prof.pr_base = bufbase;
+ p->p_prof.pr_size = bufsize;
+ p->p_prof.pr_off = pcoffset;
+ p->p_prof.pr_scale = pcscale;
+
+ /* pcsample and profil are mutually exclusive */
+ p->p_prof.pr_samples = 0;
+
+ mutex_exit(&p->p_pflock);
+ mutex_enter(&p->p_lock);
+ set_proc_post_sys(p); /* activate post_syscall profiling code */
+ mutex_exit(&p->p_lock);
+ return (0);
+}
+
+/*
+ * PC Sampling
+ */
+long
+pcsample(void *buf, long nsamples)
+{
+ struct proc *p = ttoproc(curthread);
+ long count = 0;
+
+ if (nsamples < 0 ||
+ ((get_udatamodel() != DATAMODEL_NATIVE) && (nsamples > INT32_MAX)))
+ return (set_errno(EINVAL));
+
+ mutex_enter(&p->p_pflock);
+ p->p_prof.pr_base = buf;
+ p->p_prof.pr_size = nsamples;
+ p->p_prof.pr_scale = 1;
+ count = p->p_prof.pr_samples;
+ p->p_prof.pr_samples = 0;
+ mutex_exit(&p->p_pflock);
+
+ mutex_enter(&p->p_lock);
+ set_proc_post_sys(p); /* activate post_syscall profiling code */
+ mutex_exit(&p->p_lock);
+
+ return (count);
+}
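
A userland sketch of profil(2); the buffer size and the use of main's address as the pc offset are arbitrary choices for illustration:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

#define	NBUCKETS	4096

int
main(void)
{
	unsigned short *buf = calloc(NBUCKETS, sizeof (unsigned short));

	/*
	 * A scale of 0x10000 maps the pc range onto the buffer 1:1;
	 * each bucket counts clock ticks spent at the matching pc.
	 */
	profil(buf, NBUCKETS * sizeof (unsigned short),
	    (unsigned long)(uintptr_t)main, 0x10000);

	/* ... run the code being profiled ... */

	profil(NULL, 0, 0, 0);	/* a scale of 0 disables profiling */
	free(buf);
	return (0);
}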
diff --git a/usr/src/uts/common/syscall/pset.c b/usr/src/uts/common/syscall/pset.c
new file mode 100644
index 0000000000..73b45c88be
--- /dev/null
+++ b/usr/src/uts/common/syscall/pset.c
@@ -0,0 +1,797 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/cmn_err.h>
+#include <sys/cpuvar.h>
+#include <sys/thread.h>
+#include <sys/disp.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/cpupart.h>
+#include <sys/pset.h>
+#include <sys/modctl.h>
+#include <sys/syscall.h>
+#include <sys/task.h>
+#include <sys/loadavg.h>
+#include <sys/fss.h>
+#include <sys/pool.h>
+#include <sys/pool_pset.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/contract/process_impl.h>
+
+static int pset(int, long, long, long, long);
+
+static struct sysent pset_sysent = {
+ 5,
+ SE_ARGC | SE_NOUNLOAD,
+ (int (*)())pset,
+};
+
+static struct modlsys modlsys = {
+ &mod_syscallops, "processor sets", &pset_sysent
+};
+
+#ifdef _SYSCALL32_IMPL
+static struct modlsys modlsys32 = {
+ &mod_syscallops32, "32-bit pset(2) syscall", &pset_sysent
+};
+#endif
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modlsys,
+#ifdef _SYSCALL32_IMPL
+ &modlsys32,
+#endif
+ NULL
+};
+
+#define PSET_BADATTR(attr) ((~PSET_NOESCAPE) & (attr))
+
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+static int
+pset_create(psetid_t *psetp)
+{
+ psetid_t newpset;
+ int error;
+
+ if (secpolicy_pset(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ pool_lock();
+ if (pool_state == POOL_ENABLED) {
+ pool_unlock();
+ return (set_errno(ENOTSUP));
+ }
+ error = cpupart_create(&newpset);
+ if (error) {
+ pool_unlock();
+ return (set_errno(error));
+ }
+ if (copyout(&newpset, psetp, sizeof (psetid_t)) != 0) {
+ (void) cpupart_destroy(newpset);
+ pool_unlock();
+ return (set_errno(EFAULT));
+ }
+ pool_unlock();
+ return (error);
+}
+
+static int
+pset_destroy(psetid_t pset)
+{
+ int error;
+
+ if (secpolicy_pset(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ pool_lock();
+ if (pool_state == POOL_ENABLED) {
+ pool_unlock();
+ return (set_errno(ENOTSUP));
+ }
+ error = cpupart_destroy(pset);
+ pool_unlock();
+ if (error)
+ return (set_errno(error));
+ else
+ return (0);
+}
+
+static int
+pset_assign(psetid_t pset, processorid_t cpuid, psetid_t *opset, int forced)
+{
+ psetid_t oldpset;
+ int error = 0;
+ cpu_t *cp;
+
+ if (pset != PS_QUERY && secpolicy_pset(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ pool_lock();
+ if (pset != PS_QUERY && pool_state == POOL_ENABLED) {
+ pool_unlock();
+ return (set_errno(ENOTSUP));
+ }
+
+ mutex_enter(&cpu_lock);
+ if ((cp = cpu_get(cpuid)) == NULL) {
+ mutex_exit(&cpu_lock);
+ pool_unlock();
+ return (set_errno(EINVAL));
+ }
+
+ oldpset = cpupart_query_cpu(cp);
+
+ if (pset != PS_QUERY)
+ error = cpupart_attach_cpu(pset, cp, forced);
+ mutex_exit(&cpu_lock);
+ pool_unlock();
+
+ if (error)
+ return (set_errno(error));
+
+ if (opset != NULL)
+ if (copyout(&oldpset, opset, sizeof (psetid_t)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+static int
+pset_info(psetid_t pset, int *typep, uint_t *numcpusp,
+ processorid_t *cpulistp)
+{
+ int pset_type;
+ uint_t user_ncpus = 0, real_ncpus, copy_ncpus;
+ processorid_t *pset_cpus = NULL;
+ int error = 0;
+
+ if (numcpusp != NULL) {
+ if (copyin(numcpusp, &user_ncpus, sizeof (uint_t)) != 0)
+ return (set_errno(EFAULT));
+ }
+
+ if (user_ncpus > max_ncpus) /* sanity check */
+ user_ncpus = max_ncpus;
+ if (user_ncpus != 0 && cpulistp != NULL)
+ pset_cpus = kmem_alloc(sizeof (processorid_t) * user_ncpus,
+ KM_SLEEP);
+
+ real_ncpus = user_ncpus;
+ if ((error = cpupart_get_cpus(&pset, pset_cpus, &real_ncpus)) != 0)
+ goto out;
+
+ /*
+ * Now copyout the information about this processor set.
+ */
+
+ /*
+ * Get the number of cpus to copy back. If the user didn't pass in
+ * a big enough buffer, only copy back as many cpus as fit in the
+ * buffer, but still report the real number of cpus.
+ */
+
+ if (user_ncpus != 0 && cpulistp != NULL) {
+ copy_ncpus = MIN(real_ncpus, user_ncpus);
+ if (copyout(pset_cpus, cpulistp,
+ sizeof (processorid_t) * copy_ncpus) != 0) {
+ error = EFAULT;
+ goto out;
+ }
+ }
+ if (pset_cpus != NULL)
+ kmem_free(pset_cpus, sizeof (processorid_t) * user_ncpus);
+ if (typep != NULL) {
+ if (pset == PS_NONE)
+ pset_type = PS_NONE;
+ else
+ pset_type = PS_PRIVATE;
+ if (copyout(&pset_type, typep, sizeof (int)) != 0)
+ return (set_errno(EFAULT));
+ }
+ if (numcpusp != NULL)
+ if (copyout(&real_ncpus, numcpusp, sizeof (uint_t)) != 0)
+ return (set_errno(EFAULT));
+ return (0);
+
+out:
+ if (pset_cpus != NULL)
+ kmem_free(pset_cpus, sizeof (processorid_t) * user_ncpus);
+ return (set_errno(error));
+}
+
+static int
+pset_bind_thread(kthread_t *tp, psetid_t pset, psetid_t *oldpset, void *projbuf,
+ void *zonebuf)
+{
+ int error = 0;
+
+ ASSERT(pool_lock_held());
+ ASSERT(MUTEX_HELD(&cpu_lock));
+ ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
+
+ *oldpset = tp->t_bind_pset;
+ if (pset != PS_QUERY) {
+ /*
+ * Must have the same UID as the target process or
+ * have PRIV_PROC_OWNER privilege.
+ */
+ if (!hasprocperm(tp->t_cred, CRED()))
+ return (EPERM);
+ /*
+ * Unbinding of an unbound thread should always succeed.
+ */
+ if (*oldpset == PS_NONE && pset == PS_NONE)
+ return (0);
+ /*
+ * Only privileged processes can move threads from psets with
+ * PSET_NOESCAPE attribute.
+ */
+ if ((tp->t_cpupart->cp_attr & PSET_NOESCAPE) &&
+ secpolicy_pset(CRED()) != 0)
+ return (EPERM);
+ if ((error = cpupart_bind_thread(tp, pset, 0,
+ projbuf, zonebuf)) == 0)
+ tp->t_bind_pset = pset;
+ }
+ return (error);
+}
+
+static int
+pset_bind_process(proc_t *pp, psetid_t pset, psetid_t *oldpset, void *projbuf,
+ void *zonebuf)
+{
+ int error = 0;
+ kthread_t *tp;
+
+ /* skip kernel processes */
+ if (pset != PS_QUERY && pp->p_flag & SSYS) {
+ *oldpset = PS_NONE;
+ return (0);
+ }
+
+ mutex_enter(&pp->p_lock);
+ tp = pp->p_tlist;
+ if (tp != NULL) {
+ do {
+ int rval;
+
+ rval = pset_bind_thread(tp, pset, oldpset, projbuf,
+ zonebuf);
+ if (error == 0)
+ error = rval;
+ } while ((tp = tp->t_forw) != pp->p_tlist);
+ } else
+ error = ESRCH;
+ mutex_exit(&pp->p_lock);
+
+ return (error);
+}
+
+static int
+pset_bind_task(task_t *tk, psetid_t pset, psetid_t *oldpset, void *projbuf,
+ void *zonebuf)
+{
+ int error = 0;
+ proc_t *pp;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ if ((pp = tk->tk_memb_list) == NULL) {
+ return (ESRCH);
+ }
+
+ do {
+ int rval;
+
+ rval = pset_bind_process(pp, pset, oldpset, projbuf, zonebuf);
+ if (error == 0)
+ error = rval;
+ } while ((pp = pp->p_tasknext) != tk->tk_memb_list);
+
+ return (error);
+}
+
+static int
+pset_bind_project(kproject_t *kpj, psetid_t pset, psetid_t *oldpset,
+ void *projbuf, void *zonebuf)
+{
+ int error = 0;
+ proc_t *pp;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ for (pp = practive; pp != NULL; pp = pp->p_next) {
+ if (pp->p_tlist == NULL)
+ continue;
+ if (pp->p_task->tk_proj == kpj) {
+ int rval;
+
+ rval = pset_bind_process(pp, pset, oldpset, projbuf,
+ zonebuf);
+ if (error == 0)
+ error = rval;
+ }
+ }
+
+ return (error);
+}
+
+static int
+pset_bind_zone(zone_t *zptr, psetid_t pset, psetid_t *oldpset, void *projbuf,
+ void *zonebuf)
+{
+ int error = 0;
+ proc_t *pp;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ for (pp = practive; pp != NULL; pp = pp->p_next) {
+ if (pp->p_zone == zptr) {
+ int rval;
+
+ rval = pset_bind_process(pp, pset, oldpset, projbuf,
+ zonebuf);
+ if (error == 0)
+ error = rval;
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Unbind all threads from the specified processor set, or from all
+ * processor sets.
+ */
+static int
+pset_unbind(psetid_t pset, void *projbuf, void *zonebuf, idtype_t idtype)
+{
+ psetid_t olbind;
+ kthread_t *tp;
+ int error = 0;
+ int rval;
+ proc_t *pp;
+
+ ASSERT(MUTEX_HELD(&cpu_lock));
+
+ if (idtype == P_PSETID && cpupart_find(pset) == NULL)
+ return (EINVAL);
+
+ mutex_enter(&pidlock);
+ for (pp = practive; pp != NULL; pp = pp->p_next) {
+ mutex_enter(&pp->p_lock);
+ tp = pp->p_tlist;
+ /*
+ * Skip zombies and kernel processes, and processes in
+ * other zones, if called from a non-global zone.
+ */
+ if (tp == NULL || (pp->p_flag & SSYS) ||
+ !HASZONEACCESS(curproc, pp->p_zone->zone_id)) {
+ mutex_exit(&pp->p_lock);
+ continue;
+ }
+ do {
+ if ((idtype == P_PSETID && tp->t_bind_pset != pset) ||
+ (idtype == P_ALL && tp->t_bind_pset == PS_NONE))
+ continue;
+ rval = pset_bind_thread(tp, PS_NONE, &olbind,
+ projbuf, zonebuf);
+ if (error == 0)
+ error = rval;
+ } while ((tp = tp->t_forw) != pp->p_tlist);
+ mutex_exit(&pp->p_lock);
+ }
+ mutex_exit(&pidlock);
+ return (error);
+}
+
+static int
+pset_bind_contract(cont_process_t *ctp, psetid_t pset, psetid_t *oldpset,
+ void *projbuf, void *zonebuf)
+{
+ int error = 0;
+ proc_t *pp;
+
+ ASSERT(MUTEX_HELD(&pidlock));
+
+ for (pp = practive; pp != NULL; pp = pp->p_next) {
+ if (pp->p_ct_process == ctp) {
+ int rval;
+
+ rval = pset_bind_process(pp, pset, oldpset, projbuf,
+ zonebuf);
+ if (error == 0)
+ error = rval;
+ }
+ }
+
+ return (error);
+}
+
+static int
+pset_bind(psetid_t pset, idtype_t idtype, id_t id, psetid_t *opset)
+{
+ kthread_t *tp;
+ proc_t *pp;
+ task_t *tk;
+ kproject_t *kpj;
+ contract_t *ct;
+ zone_t *zptr;
+ psetid_t oldpset;
+ int error = 0;
+ void *projbuf, *zonebuf;
+
+ pool_lock();
+ if (pset != PS_QUERY) {
+ /*
+ * Check if the set actually exists before checking
+ * permissions. This is the historical error
+ * precedence. Note that if pset was PS_MYID, the
+ * cpupart_get_cpus call will change it to the
+ * processor set id of the caller (or PS_NONE if the
+ * caller is not bound to a processor set).
+ */
+ if (pool_state == POOL_ENABLED) {
+ pool_unlock();
+ return (set_errno(ENOTSUP));
+ }
+ if (cpupart_get_cpus(&pset, NULL, NULL) != 0) {
+ pool_unlock();
+ return (set_errno(EINVAL));
+ } else if (pset != PS_NONE && secpolicy_pset(CRED()) != 0) {
+ pool_unlock();
+ return (set_errno(EPERM));
+ }
+ }
+
+ /*
+ * Pre-allocate enough buffers for FSS for all active projects
+ * and for all active zones on the system. Unused buffers will
+ * be freed later by fss_freebuf().
+ */
+ mutex_enter(&cpu_lock);
+ projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
+ zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
+
+ switch (idtype) {
+ case P_LWPID:
+ pp = curproc;
+ mutex_enter(&pidlock);
+ mutex_enter(&pp->p_lock);
+ if (id == P_MYID) {
+ tp = curthread;
+ } else {
+ if ((tp = idtot(pp, id)) == NULL) {
+ mutex_exit(&pp->p_lock);
+ mutex_exit(&pidlock);
+ error = ESRCH;
+ break;
+ }
+ }
+ error = pset_bind_thread(tp, pset, &oldpset, projbuf, zonebuf);
+ mutex_exit(&pp->p_lock);
+ mutex_exit(&pidlock);
+ break;
+
+ case P_PID:
+ mutex_enter(&pidlock);
+ if (id == P_MYID) {
+ pp = curproc;
+ } else if ((pp = prfind(id)) == NULL) {
+ mutex_exit(&pidlock);
+ error = ESRCH;
+ break;
+ }
+ error = pset_bind_process(pp, pset, &oldpset, projbuf, zonebuf);
+ mutex_exit(&pidlock);
+ break;
+
+ case P_TASKID:
+ mutex_enter(&pidlock);
+ if (id == P_MYID)
+ id = curproc->p_task->tk_tkid;
+ if ((tk = task_hold_by_id(id)) == NULL) {
+ mutex_exit(&pidlock);
+ error = ESRCH;
+ break;
+ }
+ error = pset_bind_task(tk, pset, &oldpset, projbuf, zonebuf);
+ mutex_exit(&pidlock);
+ task_rele(tk);
+ break;
+
+ case P_PROJID:
+ if (id == P_MYID)
+ id = curprojid();
+ if ((kpj = project_hold_by_id(id, getzoneid(),
+ PROJECT_HOLD_FIND)) == NULL) {
+ error = ESRCH;
+ break;
+ }
+ mutex_enter(&pidlock);
+ error = pset_bind_project(kpj, pset, &oldpset, projbuf,
+ zonebuf);
+ mutex_exit(&pidlock);
+ project_rele(kpj);
+ break;
+
+ case P_ZONEID:
+ if (id == P_MYID)
+ id = getzoneid();
+ if ((zptr = zone_find_by_id(id)) == NULL) {
+ error = ESRCH;
+ break;
+ }
+ mutex_enter(&pidlock);
+ error = pset_bind_zone(zptr, pset, &oldpset, projbuf, zonebuf);
+ mutex_exit(&pidlock);
+ zone_rele(zptr);
+ break;
+
+ case P_CTID:
+ if (id == P_MYID)
+ id = PRCTID(curproc);
+ if ((ct = contract_type_ptr(process_type, id,
+ curproc->p_zone->zone_uniqid)) == NULL) {
+ error = ESRCH;
+ break;
+ }
+ mutex_enter(&pidlock);
+ error = pset_bind_contract(ct->ct_data, pset, &oldpset, projbuf,
+ zonebuf);
+ mutex_exit(&pidlock);
+ contract_rele(ct);
+ break;
+
+ case P_PSETID:
+ if (id == P_MYID || pset != PS_NONE || !INGLOBALZONE(curproc)) {
+ error = EINVAL;
+ break;
+ }
+ error = pset_unbind(id, projbuf, zonebuf, idtype);
+ break;
+
+ case P_ALL:
+ if (id == P_MYID || pset != PS_NONE || !INGLOBALZONE(curproc)) {
+ error = EINVAL;
+ break;
+ }
+ error = pset_unbind(PS_NONE, projbuf, zonebuf, idtype);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ fss_freebuf(projbuf, FSS_ALLOC_PROJ);
+ fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
+ mutex_exit(&cpu_lock);
+ pool_unlock();
+
+ if (error != 0)
+ return (set_errno(error));
+ if (opset != NULL) {
+ if (copyout(&oldpset, opset, sizeof (psetid_t)) != 0)
+ return (set_errno(EFAULT));
+ }
+ return (0);
+}
+
+/*
+ * Report load average statistics for the specified processor set.
+ */
+static int
+pset_getloadavg(psetid_t pset, int *buf, int nelem)
+{
+ int *loadbuf;
+ int error = 0;
+
+ if (nelem < 0)
+ return (set_errno(EINVAL));
+
+ /*
+ * We keep the same number of load average statistics for processor
+ * sets as we do for the system as a whole.
+ */
+ if (nelem > LOADAVG_NSTATS)
+ nelem = LOADAVG_NSTATS;
+
+ loadbuf = kmem_alloc(nelem * sizeof (int), KM_SLEEP);
+
+ mutex_enter(&cpu_lock);
+ error = cpupart_get_loadavg(pset, loadbuf, nelem);
+ mutex_exit(&cpu_lock);
+ if (!error && nelem && copyout(loadbuf, buf, nelem * sizeof (int)) != 0)
+ error = EFAULT;
+
+ kmem_free(loadbuf, nelem * sizeof (int));
+
+ if (error)
+ return (set_errno(error));
+ else
+ return (0);
+}
+
+/*
+ * Return list of active processor sets, up to a maximum indicated by
+ * numpsets. The total number of processor sets is stored in the
+ * location pointed to by numpsets.
+ */
+static int
+pset_list(psetid_t *psetlist, uint_t *numpsets)
+{
+ uint_t user_npsets = 0;
+ uint_t real_npsets;
+ psetid_t *psets = NULL;
+ int error = 0;
+
+ if (numpsets != NULL) {
+ if (copyin(numpsets, &user_npsets, sizeof (uint_t)) != 0)
+ return (set_errno(EFAULT));
+ }
+
+ /*
+ * Get the list of all processor sets. First we need to find
+ * out how many there are, so we can allocate a large enough
+ * buffer.
+ */
+ mutex_enter(&cpu_lock);
+ if (!INGLOBALZONE(curproc) && pool_pset_enabled()) {
+ psetid_t psetid = zone_pset_get(curproc->p_zone);
+
+ if (psetid == PS_NONE) {
+ real_npsets = 0;
+ } else {
+ real_npsets = 1;
+ psets = kmem_alloc(real_npsets * sizeof (psetid_t),
+ KM_SLEEP);
+ psets[0] = psetid;
+ }
+ } else {
+ real_npsets = cpupart_list(0, NULL, CP_ALL);
+ if (real_npsets) {
+ psets = kmem_alloc(real_npsets * sizeof (psetid_t),
+ KM_SLEEP);
+ (void) cpupart_list(psets, real_npsets, CP_ALL);
+ }
+ }
+ mutex_exit(&cpu_lock);
+
+ if (user_npsets > real_npsets)
+ user_npsets = real_npsets;
+
+ if (numpsets != NULL) {
+ if (copyout(&real_npsets, numpsets, sizeof (uint_t)) != 0)
+ error = EFAULT;
+ else if (psetlist != NULL && user_npsets != 0) {
+ if (copyout(psets, psetlist,
+ user_npsets * sizeof (psetid_t)) != 0)
+ error = EFAULT;
+ }
+ }
+
+ if (real_npsets)
+ kmem_free(psets, real_npsets * sizeof (psetid_t));
+
+ if (error)
+ return (set_errno(error));
+ else
+ return (0);
+}
+
+static int
+pset_setattr(psetid_t pset, uint_t attr)
+{
+ int error;
+
+ if (secpolicy_pset(CRED()) != 0)
+ return (set_errno(EPERM));
+ pool_lock();
+ if (pool_state == POOL_ENABLED) {
+ pool_unlock();
+ return (set_errno(ENOTSUP));
+ }
+ if (pset == PS_QUERY || PSET_BADATTR(attr)) {
+ pool_unlock();
+ return (set_errno(EINVAL));
+ }
+ if ((error = cpupart_setattr(pset, attr)) != 0) {
+ pool_unlock();
+ return (set_errno(error));
+ }
+ pool_unlock();
+ return (0);
+}
+
+static int
+pset_getattr(psetid_t pset, uint_t *attrp)
+{
+ int error = 0;
+ uint_t attr;
+
+ if (pset == PS_QUERY)
+ return (set_errno(EINVAL));
+ if ((error = cpupart_getattr(pset, &attr)) != 0)
+ return (set_errno(error));
+ if (copyout(&attr, attrp, sizeof (uint_t)) != 0)
+ return (set_errno(EFAULT));
+ return (0);
+}
+
+static int
+pset(int subcode, long arg1, long arg2, long arg3, long arg4)
+{
+ switch (subcode) {
+ case PSET_CREATE:
+ return (pset_create((psetid_t *)arg1));
+ case PSET_DESTROY:
+ return (pset_destroy((psetid_t)arg1));
+ case PSET_ASSIGN:
+ return (pset_assign((psetid_t)arg1,
+ (processorid_t)arg2, (psetid_t *)arg3, 0));
+ case PSET_INFO:
+ return (pset_info((psetid_t)arg1, (int *)arg2,
+ (uint_t *)arg3, (processorid_t *)arg4));
+ case PSET_BIND:
+ return (pset_bind((psetid_t)arg1, (idtype_t)arg2,
+ (id_t)arg3, (psetid_t *)arg4));
+ case PSET_GETLOADAVG:
+ return (pset_getloadavg((psetid_t)arg1, (int *)arg2,
+ (int)arg3));
+ case PSET_LIST:
+ return (pset_list((psetid_t *)arg1, (uint_t *)arg2));
+ case PSET_SETATTR:
+ return (pset_setattr((psetid_t)arg1, (uint_t)arg2));
+ case PSET_GETATTR:
+ return (pset_getattr((psetid_t)arg1, (uint_t *)arg2));
+ case PSET_ASSIGN_FORCED:
+ return (pset_assign((psetid_t)arg1,
+ (processorid_t)arg2, (psetid_t *)arg3, 1));
+ default:
+ return (set_errno(EINVAL));
+ }
+}
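
A userland sketch exercising the pset_create(3C), pset_assign(3C), and pset_bind(3C) wrappers over this syscall; it assumes sufficient privilege and that processor 1 exists:

#include <sys/types.h>
#include <sys/pset.h>
#include <sys/procset.h>
#include <stdio.h>

int
main(void)
{
	psetid_t pset, opset;

	if (pset_create(&pset) != 0) {
		perror("pset_create");
		return (1);
	}
	/* Move processor 1 into the new set, then bind ourselves to it. */
	if (pset_assign(pset, 1, &opset) != 0)
		perror("pset_assign");
	if (pset_bind(pset, P_PID, P_MYID, &opset) != 0)
		perror("pset_bind");

	/* Undo the binding and tear the set down again. */
	(void) pset_bind(PS_NONE, P_PID, P_MYID, NULL);
	(void) pset_destroy(pset);
	return (0);
}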
diff --git a/usr/src/uts/common/syscall/rctlsys.c b/usr/src/uts/common/syscall/rctlsys.c
new file mode 100644
index 0000000000..03617b5d44
--- /dev/null
+++ b/usr/src/uts/common/syscall/rctlsys.c
@@ -0,0 +1,871 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/errno.h>
+#include <sys/rctl.h>
+#include <sys/rctl_impl.h>
+#include <sys/strlog.h>
+#include <sys/syslog.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/policy.h>
+#include <sys/proc.h>
+#include <sys/task.h>
+
+/*
+ * setrctl(2), getrctl(2), and private rctlsys(2*) system calls
+ *
+ * Resource control block (rctlblk_ptr_t, rctl_opaque_t)
+ * The resource control system call interfaces present the resource control
+ * values and flags via the resource control block abstraction, made manifest
+ * via an opaque data type with strict type definitions. Keeping the formal
+ * definitions in the resource control block allows us to be clever in
+ * the kernel, combining attributes where appropriate in the current
+ * implementation while preserving binary compatibility in the face of
+ * implementation changes.
+ */
+
+#define RBX_TO_BLK 0x1
+#define RBX_FROM_BLK 0x2
+#define RBX_VAL 0x4
+#define RBX_CTL 0x8
+
+static void
+rctlsys_rblk_xfrm(rctl_opaque_t *blk, rctl_dict_entry_t *rde,
+ rctl_val_t *val, int flags)
+{
+ if (flags & RBX_FROM_BLK) {
+ if (flags & RBX_VAL) {
+ /*
+ * Firing time cannot be set.
+ */
+ val->rcv_privilege = blk->rcq_privilege;
+ val->rcv_value = blk->rcq_value;
+ val->rcv_flagaction = blk->rcq_local_flagaction;
+ val->rcv_action_signal = blk->rcq_local_signal;
+ val->rcv_action_recip_pid =
+ blk->rcq_local_recipient_pid;
+ }
+ if (flags & RBX_CTL) {
+ rde->rcd_flagaction = blk->rcq_global_flagaction;
+ rde->rcd_syslog_level = blk->rcq_global_syslog_level;
+
+ /*
+ * Because the strlog() interface supports fewer options
+ * than are made available via the syslog() interface to
+ * userland, we map the syslog level down to a smaller
+ * set of distinct logging behaviours.
+ */
+ rde->rcd_strlog_flags = 0;
+ switch (blk->rcq_global_syslog_level) {
+ case LOG_EMERG:
+ case LOG_ALERT:
+ case LOG_CRIT:
+ rde->rcd_strlog_flags |= SL_CONSOLE;
+ /*FALLTHROUGH*/
+ case LOG_ERR:
+ rde->rcd_strlog_flags |= SL_ERROR;
+ /*FALLTHROUGH*/
+ case LOG_WARNING:
+ rde->rcd_strlog_flags |= SL_WARN;
+ break;
+ case LOG_NOTICE:
+ rde->rcd_strlog_flags |= SL_CONSOLE;
+ /*FALLTHROUGH*/
+ case LOG_INFO: /* informational */
+ case LOG_DEBUG: /* debug-level messages */
+ default:
+ rde->rcd_strlog_flags |= SL_NOTE;
+ break;
+ }
+ }
+ } else {
+ bzero(blk, sizeof (rctl_opaque_t));
+ if (flags & RBX_VAL) {
+ blk->rcq_privilege = val->rcv_privilege;
+ blk->rcq_value = val->rcv_value;
+ blk->rcq_enforced_value = rctl_model_value(rde,
+ curproc, val->rcv_value);
+ blk->rcq_local_flagaction = val->rcv_flagaction;
+ blk->rcq_local_signal = val->rcv_action_signal;
+ blk->rcq_firing_time = val->rcv_firing_time;
+ blk->rcq_local_recipient_pid =
+ val->rcv_action_recip_pid;
+ }
+ if (flags & RBX_CTL) {
+ blk->rcq_global_flagaction = rde->rcd_flagaction;
+ blk->rcq_global_syslog_level = rde->rcd_syslog_level;
+ }
+ }
+}
+
+/*
+ * int rctl_invalid_value(rctl_dict_entry_t *, rctl_val_t *)
+ *
+ * Overview
+ * Perform basic validation of a proposed new resource control value
+ * against the global properties set on the control. Any system call
+ * operation presented
+ * with an invalid resource control value should return -1 and set errno to
+ * EINVAL.
+ *
+ * Return values
+ * 0 if valid, 1 if invalid.
+ *
+ * Caller's context
+ * No restriction on context.
+ */
+int
+rctl_invalid_value(rctl_dict_entry_t *rde, rctl_val_t *rval)
+{
+ rctl_val_t *sys_rval;
+
+ if (rval->rcv_privilege != RCPRIV_BASIC &&
+ rval->rcv_privilege != RCPRIV_PRIVILEGED &&
+ rval->rcv_privilege != RCPRIV_SYSTEM)
+ return (1);
+
+ if (rval->rcv_flagaction & ~RCTL_LOCAL_MASK)
+ return (1);
+
+ if (rval->rcv_privilege == RCPRIV_BASIC &&
+ (rde->rcd_flagaction & RCTL_GLOBAL_NOBASIC) != 0)
+ return (1);
+
+ if ((rval->rcv_flagaction & RCTL_LOCAL_DENY) == 0 &&
+ (rde->rcd_flagaction & RCTL_GLOBAL_DENY_ALWAYS) != 0)
+ return (1);
+
+ if ((rval->rcv_flagaction & RCTL_LOCAL_DENY) &&
+ (rde->rcd_flagaction & RCTL_GLOBAL_DENY_NEVER))
+ return (1);
+
+ if ((rval->rcv_flagaction & RCTL_LOCAL_SIGNAL) &&
+ (rde->rcd_flagaction & RCTL_GLOBAL_SIGNAL_NEVER))
+ return (1);
+
+ if ((rval->rcv_flagaction & RCTL_LOCAL_SIGNAL) &&
+ rval->rcv_action_signal == 0)
+ return (1);
+
+ if (rval->rcv_action_signal == SIGXCPU &&
+ (rde->rcd_flagaction & RCTL_GLOBAL_CPU_TIME) == 0)
+ return (1);
+ else if (rval->rcv_action_signal == SIGXFSZ &&
+ (rde->rcd_flagaction & RCTL_GLOBAL_FILE_SIZE) == 0)
+ return (1);
+ else if (rval->rcv_action_signal != SIGHUP &&
+ rval->rcv_action_signal != SIGABRT &&
+ rval->rcv_action_signal != SIGKILL &&
+ rval->rcv_action_signal != SIGTERM &&
+ rval->rcv_action_signal != SIGSTOP &&
+ rval->rcv_action_signal != SIGXCPU &&
+ rval->rcv_action_signal != SIGXFSZ &&
+ rval->rcv_action_signal != SIGXRES &&
+ rval->rcv_action_signal != 0) /* That is, no signal is ok. */
+ return (1);
+
+ sys_rval = rde->rcd_default_value;
+ while (sys_rval->rcv_privilege != RCPRIV_SYSTEM)
+ sys_rval = sys_rval->rcv_next;
+
+ if (rval->rcv_value > sys_rval->rcv_value)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * static long rctlsys_get(char *name, rctl_opaque_t *old_rblk,
+ * rctl_opaque_t *new_rblk, int flags)
+ *
+ * Overview
+ * rctlsys_get() is the implementation of the core logic of getrctl(2), the
+ * public system call for fetching resource control values. Two mutually
+ * exclusive flag values are supported: RCTL_FIRST and RCTL_NEXT. When
+ * RCTL_FIRST is presented, the value of old_rblk is ignored, and the first
+ * value in the resource control value sequence for the named control is
+ * transformed and placed in the user memory location at new_rblk. In the
+ * RCTL_NEXT case, the value of old_rblk is examined, and the next value in
+ * the sequence is transformed and placed at new_rblk.
+ */
+static long
+rctlsys_get(char *name, rctl_opaque_t *old_rblk, rctl_opaque_t *new_rblk,
+ int flags)
+{
+ rctl_val_t *nval;
+ rctl_opaque_t *nblk;
+ rctl_hndl_t hndl;
+ char *kname;
+ size_t klen;
+ rctl_dict_entry_t *krde;
+ int ret;
+ int action = flags & (~RCTLSYS_ACTION_MASK);
+
+ if (flags & (~RCTLSYS_MASK))
+ return (set_errno(EINVAL));
+
+ if (action != RCTL_FIRST && action != RCTL_NEXT &&
+ action != RCTL_USAGE)
+ return (set_errno(EINVAL));
+
+ if (new_rblk == NULL || name == NULL)
+ return (set_errno(EFAULT));
+
+ kname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ krde = kmem_alloc(sizeof (rctl_dict_entry_t), KM_SLEEP);
+
+ if (copyinstr(name, kname, MAXPATHLEN, &klen) != 0) {
+ kmem_free(kname, MAXPATHLEN);
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ return (set_errno(EFAULT));
+ }
+
+ if ((hndl = rctl_hndl_lookup(kname)) == -1) {
+ kmem_free(kname, MAXPATHLEN);
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ return (set_errno(EINVAL));
+ }
+
+ if (rctl_global_get(kname, krde) == -1) {
+ kmem_free(kname, MAXPATHLEN);
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ return (set_errno(ESRCH));
+ }
+
+ kmem_free(kname, MAXPATHLEN);
+
+ nval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+
+ if (action == RCTL_USAGE) {
+ kmem_cache_free(rctl_val_cache, nval);
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ return (set_errno(ENOTSUP));
+ } else if (action == RCTL_FIRST) {
+
+ mutex_enter(&curproc->p_lock);
+ if (ret = rctl_local_get(hndl, NULL, nval, curproc)) {
+ mutex_exit(&curproc->p_lock);
+ kmem_cache_free(rctl_val_cache, nval);
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ return (set_errno(ret));
+ }
+ mutex_exit(&curproc->p_lock);
+ } else {
+ /*
+ * RCTL_NEXT
+ */
+ rctl_val_t *oval;
+ rctl_opaque_t *oblk;
+
+ oblk = kmem_alloc(sizeof (rctl_opaque_t), KM_SLEEP);
+
+ if (copyin(old_rblk, oblk, sizeof (rctl_opaque_t)) == -1) {
+ kmem_cache_free(rctl_val_cache, nval);
+ kmem_free(oblk, sizeof (rctl_opaque_t));
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ return (set_errno(EFAULT));
+ }
+
+ oval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+
+ rctlsys_rblk_xfrm(oblk, NULL, oval, RBX_FROM_BLK | RBX_VAL);
+ mutex_enter(&curproc->p_lock);
+ ret = rctl_local_get(hndl, oval, nval, curproc);
+ mutex_exit(&curproc->p_lock);
+
+ kmem_cache_free(rctl_val_cache, oval);
+ kmem_free(oblk, sizeof (rctl_opaque_t));
+
+ if (ret != 0) {
+ kmem_cache_free(rctl_val_cache, nval);
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ return (set_errno(ret));
+ }
+ }
+
+ nblk = kmem_alloc(sizeof (rctl_opaque_t), KM_SLEEP);
+
+ rctlsys_rblk_xfrm(nblk, krde, nval, RBX_TO_BLK | RBX_VAL | RBX_CTL);
+
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ kmem_cache_free(rctl_val_cache, nval);
+
+ if (copyout(nblk, new_rblk, sizeof (rctl_opaque_t)) == -1) {
+ kmem_free(nblk, sizeof (rctl_opaque_t));
+ return (set_errno(EFAULT));
+ }
+
+ kmem_free(nblk, sizeof (rctl_opaque_t));
+
+ return (0);
+}
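
A userland sketch of the getrctl(2) RCTL_FIRST/RCTL_NEXT iteration described above, using the rctlblk_*(3C) accessors; the control name is one standard example:

#include <rctl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	const char *name = "process.max-file-descriptor";
	rctlblk_t *cur = malloc(rctlblk_size());
	rctlblk_t *next = malloc(rctlblk_size());
	rctlblk_t *tmp;

	if (getrctl(name, NULL, cur, RCTL_FIRST) != 0) {
		perror("getrctl");
		return (1);
	}
	for (;;) {
		(void) printf("value %llu privilege 0x%x\n",
		    (unsigned long long)rctlblk_get_value(cur),
		    (unsigned int)rctlblk_get_privilege(cur));
		if (getrctl(name, cur, next, RCTL_NEXT) != 0)
			break;
		tmp = cur;
		cur = next;
		next = tmp;
	}
	free(cur);
	free(next);
	return (0);
}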
+
+/*
+ * static long rctlsys_set(char *name, rctl_opaque_t *old_rblk,
+ * rctl_opaque_t *new_rblk, int flags)
+ *
+ * Overview
+ * rctlsys_set() is the implementation of the core logic of setrctl(2), which
+ * allows the establishment of resource control values. Flags may take on any
+ * of three exclusive values: RCTL_INSERT, RCTL_DELETE, and RCTL_REPLACE.
+ * RCTL_INSERT ignores old_rblk and inserts the value in the appropriate
+ * position in the ordered sequence of resource control values. RCTL_DELETE
+ * ignores old_rblk and deletes the first resource control value matching
+ * (value, priority) in the given resource block. If no matching value is
+ * found, -1 is returned and errno is set to ENOENT. Finally, in the case of
+ * RCTL_REPLACE, old_rblk is used to match (value, priority); the matching
+ * resource control value in the sequence is replaced with the contents of
+ * new_rblk. Again, if no match is found, -1 is returned and errno is set to
+ * ENOENT.
+ *
+ * rctlsys_set() causes a cursor test, which can reactivate resource controls
+ * that have previously fired.
+ */
+static long
+rctlsys_set(char *name, rctl_opaque_t *old_rblk, rctl_opaque_t *new_rblk,
+ int flags)
+{
+ rctl_val_t *nval;
+ rctl_dict_entry_t *rde;
+ rctl_opaque_t *nblk;
+ rctl_hndl_t hndl;
+ char *kname;
+ size_t klen;
+ long ret = 0;
+ proc_t *pp = NULL;
+ pid_t pid;
+ int action = flags & (~RCTLSYS_ACTION_MASK);
+ rctl_val_t *oval;
+ rctl_val_t *rval1;
+ rctl_val_t *rval2;
+ rctl_val_t *tval;
+ rctl_opaque_t *oblk;
+
+ if (flags & (~RCTLSYS_MASK))
+ return (set_errno(EINVAL));
+
+ if (action != RCTL_INSERT &&
+ action != RCTL_DELETE &&
+ action != RCTL_REPLACE)
+ return (set_errno(EINVAL));
+
+ if (new_rblk == NULL || name == NULL)
+ return (set_errno(EFAULT));
+
+ kname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ if (copyinstr(name, kname, MAXPATHLEN, &klen) != 0) {
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(EFAULT));
+ }
+
+ if ((hndl = rctl_hndl_lookup(kname)) == -1) {
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(EINVAL));
+ }
+
+ kmem_free(kname, MAXPATHLEN);
+
+ rde = rctl_dict_lookup_hndl(hndl);
+
+ nblk = kmem_alloc(sizeof (rctl_opaque_t), KM_SLEEP);
+
+ if (copyin(new_rblk, nblk, sizeof (rctl_opaque_t)) == -1) {
+ kmem_free(nblk, sizeof (rctl_opaque_t));
+ return (set_errno(EFAULT));
+ }
+
+ nval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+
+ rctlsys_rblk_xfrm(nblk, NULL, nval, RBX_FROM_BLK | RBX_VAL);
+
+ if (rctl_invalid_value(rde, nval)) {
+ kmem_free(nblk, sizeof (rctl_opaque_t));
+ kmem_cache_free(rctl_val_cache, nval);
+ return (set_errno(EINVAL));
+ }
+
+ /* allocate what we might need before potentially grabbing p_lock */
+ oblk = kmem_alloc(sizeof (rctl_opaque_t), KM_SLEEP);
+ oval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+ rval1 = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+ rval2 = kmem_cache_alloc(rctl_val_cache, KM_SLEEP);
+
+ if (nval->rcv_privilege == RCPRIV_BASIC) {
+ if (flags & RCTL_USE_RECIPIENT_PID) {
+ pid = nval->rcv_action_recip_pid;
+
+ /* case for manipulating rctl values on other procs */
+ if (pid != curproc->p_pid) {
+ /* cannot be other pid on process rctls */
+ if (rde->rcd_entity == RCENTITY_PROCESS) {
+ ret = set_errno(EINVAL);
+ goto rctlsys_out;
+ }
+ /*
+ * must have privilege to manipulate controls
+ * on other processes
+ */
+ if (secpolicy_rctlsys(CRED(), B_FALSE) != 0) {
+ ret = set_errno(EACCES);
+ goto rctlsys_out;
+ }
+
+ pid = nval->rcv_action_recip_pid;
+ mutex_enter(&pidlock);
+ pp = prfind(pid);
+ if (!pp) {
+ mutex_exit(&pidlock);
+ ret = set_errno(ESRCH);
+ goto rctlsys_out;
+ }
+
+ /*
+ * idle or zombie procs have either not yet
+ * set up their rctls or have already done
+ * their rctl_set_tearoffs.
+ */
+ if (pp->p_stat == SZOMB ||
+ pp->p_stat == SIDL) {
+ mutex_exit(&pidlock);
+ ret = set_errno(ESRCH);
+ goto rctlsys_out;
+ }
+
+ /*
+ * hold this pp's p_lock to ensure that
+ * it does not do it's rctl_set_tearoff
+ * If we did not do this, we could
+ * potentially add rctls to the entity
+ * with a recipient that is a process
+ * that has exited.
+ */
+ mutex_enter(&pp->p_lock);
+ mutex_exit(&pidlock);
+
+ /*
+ * We know that curproc's task, project,
+ * and zone pointers will not change
+ * because functions that change them
+ * call holdlwps(SHOLDFORK1) first.
+ */
+
+ /*
+ * verify that the found pp is in the
+ * current task. If it is, then it
+ * is also within the current project
+ * and zone.
+ */
+ if (rde->rcd_entity == RCENTITY_TASK &&
+ pp->p_task != curproc->p_task) {
+ ret = set_errno(ESRCH);
+ goto rctlsys_out;
+ }
+
+ ASSERT(pp->p_task->tk_proj ==
+ curproc->p_task->tk_proj);
+ ASSERT(pp->p_zone == curproc->p_zone);
+
+ nval->rcv_action_recipient = pp;
+ nval->rcv_action_recip_pid = pid;
+
+ } else {
+ /* for manipulating rctl values on this proc */
+ mutex_enter(&curproc->p_lock);
+ pp = curproc;
+ nval->rcv_action_recipient = curproc;
+ nval->rcv_action_recip_pid = curproc->p_pid;
+ }
+
+ } else {
+ /* RCTL_USE_RECIPIENT_PID not set, use this proc */
+ mutex_enter(&curproc->p_lock);
+ pp = curproc;
+ nval->rcv_action_recipient = curproc;
+ nval->rcv_action_recip_pid = curproc->p_pid;
+ }
+
+ } else {
+ /* privileged controls have no recipient pid */
+ mutex_enter(&curproc->p_lock);
+ pp = curproc;
+ nval->rcv_action_recipient = NULL;
+ nval->rcv_action_recip_pid = -1;
+ }
+
+ nval->rcv_firing_time = 0;
+
+ if (action == RCTL_REPLACE) {
+
+ if (copyin(old_rblk, oblk, sizeof (rctl_opaque_t)) == -1) {
+ ret = set_errno(EFAULT);
+ goto rctlsys_out;
+ }
+
+ rctlsys_rblk_xfrm(oblk, NULL, oval, RBX_FROM_BLK | RBX_VAL);
+
+ if (rctl_invalid_value(rde, oval)) {
+ ret = set_errno(EINVAL);
+ goto rctlsys_out;
+ }
+
+ if (oval->rcv_privilege == RCPRIV_BASIC) {
+ if (!(flags & RCTL_USE_RECIPIENT_PID)) {
+ oval->rcv_action_recipient = curproc;
+ oval->rcv_action_recip_pid = curproc->p_pid;
+ }
+ } else {
+ oval->rcv_action_recipient = NULL;
+ oval->rcv_action_recip_pid = -1;
+ }
+
+ /*
+ * Find the real value we're attempting to replace on the
+ * sequence, rather than trusting the one delivered from
+ * userland.
+ */
+ if (ret = rctl_local_get(hndl, NULL, rval1, pp)) {
+ (void) set_errno(ret);
+ goto rctlsys_out;
+ }
+
+ do {
+ if (rval1->rcv_privilege == RCPRIV_SYSTEM ||
+ rctl_val_cmp(oval, rval1, 0) == 0)
+ break;
+
+ tval = rval1;
+ rval1 = rval2;
+ rval2 = tval;
+ } while (rctl_local_get(hndl, rval2, rval1, pp) == 0);
+
+ if (rval1->rcv_privilege == RCPRIV_SYSTEM) {
+ if (rctl_val_cmp(oval, rval1, 1) == 0)
+ ret = set_errno(EPERM);
+ else
+ ret = set_errno(ESRCH);
+
+ goto rctlsys_out;
+ }
+
+ bcopy(rval1, oval, sizeof (rctl_val_t));
+
+ /*
+ * System controls are immutable.
+ */
+ if (nval->rcv_privilege == RCPRIV_SYSTEM) {
+ ret = set_errno(EPERM);
+ goto rctlsys_out;
+ }
+
+ /*
+ * Only privileged processes in the global zone can modify
+ * privileged rctls of type RCENTITY_ZONE; replacing privileged
+ * controls with basic ones is not allowed either. Lowering a
+ * lowerable one might be OK for privileged processes in a
+ * non-global zone, but lowerable rctls probably don't make
+ * sense for zones (hence, not modifiable from within a zone).
+ */
+ if (rde->rcd_entity == RCENTITY_ZONE &&
+ (nval->rcv_privilege == RCPRIV_PRIVILEGED ||
+ oval->rcv_privilege == RCPRIV_PRIVILEGED) &&
+ secpolicy_rctlsys(CRED(), B_TRUE) != 0) {
+ ret = set_errno(EACCES);
+ goto rctlsys_out;
+ }
+
+ /*
+ * Must be privileged to replace a privileged control with
+ * a basic one.
+ */
+ if (oval->rcv_privilege == RCPRIV_PRIVILEGED &&
+ nval->rcv_privilege != RCPRIV_PRIVILEGED &&
+ secpolicy_rctlsys(CRED(), B_FALSE) != 0) {
+ ret = set_errno(EACCES);
+ goto rctlsys_out;
+ }
+
+ /*
+ * Must have lowerable global property for non-privileged
+ * to lower the value of a privileged control; otherwise must
+ * have sufficient privileges to modify privileged controls
+ * at all.
+ */
+ if (oval->rcv_privilege == RCPRIV_PRIVILEGED &&
+ nval->rcv_privilege == RCPRIV_PRIVILEGED &&
+ ((((rde->rcd_flagaction & RCTL_GLOBAL_LOWERABLE) == 0) ||
+ oval->rcv_flagaction != nval->rcv_flagaction ||
+ oval->rcv_action_signal != nval->rcv_action_signal ||
+ oval->rcv_value < nval->rcv_value)) &&
+ secpolicy_rctlsys(CRED(), B_FALSE) != 0) {
+ ret = set_errno(EACCES);
+ goto rctlsys_out;
+ }
+
+ if (ret = rctl_local_replace(hndl, oval, nval, pp)) {
+ (void) set_errno(ret);
+ goto rctlsys_out;
+ }
+
+ /* ensure that nval is not freed */
+ nval = NULL;
+
+ } else if (action == RCTL_INSERT) {
+ /*
+ * System controls are immutable.
+ */
+ if (nval->rcv_privilege == RCPRIV_SYSTEM) {
+ ret = set_errno(EPERM);
+ goto rctlsys_out;
+ }
+
+ /*
+ * Only privileged processes in the global zone may add
+ * privileged zone.* rctls. Only privileged processes
+ * may add other privileged rctls.
+ */
+ if (nval->rcv_privilege == RCPRIV_PRIVILEGED) {
+ if ((rde->rcd_entity == RCENTITY_ZONE &&
+ secpolicy_rctlsys(CRED(), B_TRUE) != 0) ||
+ (rde->rcd_entity != RCENTITY_ZONE &&
+ secpolicy_rctlsys(CRED(), B_FALSE) != 0)) {
+ ret = set_errno(EACCES);
+ goto rctlsys_out;
+ }
+ }
+
+ /*
+ * Only one basic control is allowed per rctl.
+ * If a basic control is being inserted, delete
+ * any other basic control.
+ */
+ if ((nval->rcv_privilege == RCPRIV_BASIC) &&
+ (rctl_local_get(hndl, NULL, rval1, pp) == 0)) {
+ do {
+ if (rval1->rcv_privilege == RCPRIV_BASIC &&
+ rval1->rcv_action_recipient == curproc) {
+ (void) rctl_local_delete(hndl, rval1,
+ pp);
+ if (rctl_local_get(hndl, NULL, rval1,
+ pp) != 0)
+ break;
+ }
+
+ tval = rval1;
+ rval1 = rval2;
+ rval2 = tval;
+ } while (rctl_local_get(hndl, rval2, rval1, pp)
+ == 0);
+ }
+
+
+ if (ret = rctl_local_insert(hndl, nval, pp)) {
+ (void) set_errno(ret);
+ goto rctlsys_out;
+ }
+
+ /* ensure that nval is not freed */
+ nval = NULL;
+
+ } else {
+ /*
+ * RCTL_DELETE
+ */
+ if (nval->rcv_privilege == RCPRIV_SYSTEM) {
+ ret = set_errno(EPERM);
+ goto rctlsys_out;
+ }
+
+ if (nval->rcv_privilege == RCPRIV_PRIVILEGED) {
+ if ((rde->rcd_entity == RCENTITY_ZONE &&
+ secpolicy_rctlsys(CRED(), B_TRUE) != 0) ||
+ (rde->rcd_entity != RCENTITY_ZONE &&
+ secpolicy_rctlsys(CRED(), B_FALSE) != 0)) {
+ ret = set_errno(EACCES);
+ goto rctlsys_out;
+ }
+ }
+
+ if (ret = rctl_local_delete(hndl, nval, pp)) {
+ (void) set_errno(ret);
+ goto rctlsys_out;
+ }
+ }
+
+rctlsys_out:
+
+ if (pp)
+ mutex_exit(&pp->p_lock);
+
+ kmem_free(nblk, sizeof (rctl_opaque_t));
+ kmem_free(oblk, sizeof (rctl_opaque_t));
+
+ /* only free nval if we did not rctl_local_insert it */
+ if (nval)
+ kmem_cache_free(rctl_val_cache, nval);
+
+ kmem_cache_free(rctl_val_cache, oval);
+ kmem_cache_free(rctl_val_cache, rval1);
+ kmem_cache_free(rctl_val_cache, rval2);
+
+ return (ret);
+}
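+
+/*
+ * A minimal user-level sketch of how the RCTL_REPLACE path above is
+ * normally reached, via setrctl(2); the control name and value are
+ * examples only, and error handling is elided:
+ *
+ *	rctlblk_t *oblk = malloc(rctlblk_size());
+ *	rctlblk_t *nblk = malloc(rctlblk_size());
+ *	(void) getrctl("process.max-file-descriptor", NULL, oblk,
+ *	    RCTL_FIRST);
+ *	bcopy(oblk, nblk, rctlblk_size());
+ *	rctlblk_set_value(nblk, 1024);
+ *	if (setrctl("process.max-file-descriptor", oblk, nblk,
+ *	    RCTL_REPLACE) == -1)
+ *		perror("setrctl");
+ */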
+
+static long
+rctlsys_lst(char *ubuf, size_t ubufsz)
+{
+ char *kbuf;
+ size_t kbufsz;
+
+ kbufsz = rctl_build_name_buf(&kbuf);
+
+ if (kbufsz <= ubufsz &&
+ copyout(kbuf, ubuf, kbufsz) != 0) {
+ kmem_free(kbuf, kbufsz);
+ return (set_errno(EFAULT));
+ }
+
+ kmem_free(kbuf, kbufsz);
+
+ return (kbufsz);
+}
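+
+/*
+ * This is the private interface behind rctl_walk(3C); when the user
+ * buffer is too small, the required size is still returned so that
+ * libc can retry with a larger buffer.  A minimal sketch of the
+ * public wrapper, assuming a callback that just prints each name:
+ *
+ *	static int
+ *	print_name(const char *name, void *unused)
+ *	{
+ *		(void) printf("%s\n", name);
+ *		return (0);
+ *	}
+ *
+ *	(void) rctl_walk(print_name, NULL);
+ */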
+
+static long
+rctlsys_ctl(char *name, rctl_opaque_t *rblk, int flags)
+{
+ rctl_dict_entry_t *krde;
+ rctl_opaque_t *krblk;
+ char *kname;
+ size_t klen;
+
+ kname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ if (name == NULL || copyinstr(name, kname, MAXPATHLEN, &klen) != 0) {
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(EFAULT));
+ }
+
+ switch (flags) {
+ case RCTLCTL_GET:
+ krde = kmem_alloc(sizeof (rctl_dict_entry_t), KM_SLEEP);
+ krblk = kmem_zalloc(sizeof (rctl_opaque_t), KM_SLEEP);
+
+ if (rctl_global_get(kname, krde) == -1) {
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ kmem_free(krblk, sizeof (rctl_opaque_t));
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(ESRCH));
+ }
+
+ rctlsys_rblk_xfrm(krblk, krde, NULL, RBX_TO_BLK | RBX_CTL);
+
+ if (copyout(krblk, rblk, sizeof (rctl_opaque_t)) != 0) {
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ kmem_free(krblk, sizeof (rctl_opaque_t));
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(EFAULT));
+ }
+
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ kmem_free(krblk, sizeof (rctl_opaque_t));
+ kmem_free(kname, MAXPATHLEN);
+ break;
+ case RCTLCTL_SET:
+ if (secpolicy_rctlsys(CRED(), B_TRUE) != 0) {
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(EPERM));
+ }
+
+ krde = kmem_alloc(sizeof (rctl_dict_entry_t), KM_SLEEP);
+ krblk = kmem_zalloc(sizeof (rctl_opaque_t), KM_SLEEP);
+
+ if (rctl_global_get(kname, krde) == -1) {
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ kmem_free(krblk, sizeof (rctl_opaque_t));
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(ESRCH));
+ }
+
+ if (copyin(rblk, krblk, sizeof (rctl_opaque_t)) != 0) {
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ kmem_free(krblk, sizeof (rctl_opaque_t));
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(EFAULT));
+ }
+
+ rctlsys_rblk_xfrm(krblk, krde, NULL, RBX_FROM_BLK | RBX_CTL);
+
+ if (rctl_global_set(kname, krde) == -1) {
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ kmem_free(krblk, sizeof (rctl_opaque_t));
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(ESRCH));
+ }
+
+ kmem_free(krde, sizeof (rctl_dict_entry_t));
+ kmem_free(krblk, sizeof (rctl_opaque_t));
+ kmem_free(kname, MAXPATHLEN);
+
+ break;
+ default:
+ kmem_free(kname, MAXPATHLEN);
+ return (set_errno(EINVAL));
+ }
+
+ return (0);
+}
+
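+/*
+ * System entry point.  The subcodes map onto the user-level wrappers:
+ * 0 backs getrctl(2), 1 backs setrctl(2), 2 backs the private
+ * rctl_walk(3C) listing interface, and 3 backs rctladm(1M).
+ */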
+long
+rctlsys(int code, char *name, void *obuf, void *nbuf, size_t obufsz, int flags)
+{
+ switch (code) {
+ case 0:
+ return (rctlsys_get(name, obuf, nbuf, flags));
+
+ case 1:
+ return (rctlsys_set(name, obuf, nbuf, flags));
+
+ case 2:
+ /*
+ * Private call for rctl_walk(3C).
+ */
+ return (rctlsys_lst(obuf, obufsz));
+
+ case 3:
+ /*
+ * Private code for rctladm(1M): "rctlctl".
+ */
+ return (rctlsys_ctl(name, obuf, flags));
+
+ default:
+ return (set_errno(EINVAL));
+ }
+}
diff --git a/usr/src/uts/common/syscall/readlink.c b/usr/src/uts/common/syscall/readlink.c
new file mode 100644
index 0000000000..a1e8475787
--- /dev/null
+++ b/usr/src/uts/common/syscall/readlink.c
@@ -0,0 +1,119 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/uio.h>
+#include <sys/ioreq.h>
+#include <sys/debug.h>
+
+/*
+ * Read the contents of a symbolic link.
+ */
+ssize_t
+readlink(char *name, char *buf, size_t count)
+{
+ vnode_t *vp;
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+ struct vattr vattr;
+ ssize_t cnt;
+
+ if ((cnt = (ssize_t)count) < 0)
+ return (set_errno(EINVAL));
+
+lookup:
+ if (error = lookupname(name, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+
+ if (vp->v_type != VLNK) {
+ /*
+ * Ask the underlying filesystem if it wants this
+ * object to look like a symlink at user-level.
+ */
+ vattr.va_mask = AT_TYPE;
+ error = VOP_GETATTR(vp, &vattr, 0, CRED());
+ if (error || vattr.va_type != VLNK) {
+ VN_RELE(vp);
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(EINVAL));
+ }
+ }
+ aiov.iov_base = buf;
+ aiov.iov_len = cnt;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_loffset = 0;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_extflg = UIO_COPY_CACHED;
+ auio.uio_resid = cnt;
+ error = VOP_READLINK(vp, &auio, CRED());
+ VN_RELE(vp);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ return ((ssize_t)(cnt - auio.uio_resid));
+}
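+
+/*
+ * Illustrative user-level sketch (not part of this file): readlink(2)
+ * does not NUL-terminate the result, so callers must terminate it
+ * themselves using the returned length:
+ *
+ *	char target[PATH_MAX];
+ *	ssize_t n = readlink("/a/link", target, sizeof (target) - 1);
+ *	if (n >= 0)
+ *		target[n] = '\0';
+ */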
+
+#ifdef _SYSCALL32_IMPL
+/*
+ * readlink32() intentionally returns a ssize_t rather than ssize32_t;
+ * see the comments above read32 for details.
+ */
+
+ssize_t
+readlink32(caddr32_t name, caddr32_t buf, size32_t count)
+{
+ return ((ssize32_t)readlink((char *)(uintptr_t)name,
+ (char *)(uintptr_t)buf, (ssize32_t)count));
+}
+
+#endif /* _SYSCALL32_IMPL */
diff --git a/usr/src/uts/common/syscall/rename.c b/usr/src/uts/common/syscall/rename.c
new file mode 100644
index 0000000000..4d8d5270ed
--- /dev/null
+++ b/usr/src/uts/common/syscall/rename.c
@@ -0,0 +1,139 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/debug.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+
+/*
+ * Rename or move an existing file.
+ */
+int
+rename(char *from, char *to)
+{
+ int error;
+
+ if (error = vn_rename(from, to, UIO_USERSPACE))
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Rename a file relative to a given directory
+ */
+int
+renameat(int fromfd, char *old, int tofd, char *new)
+{
+ file_t *fromfp;
+ file_t *tofp;
+ vnode_t *fromvp, *tovp;
+ int error;
+ proc_t *p = curproc;
+ char oldstart, newstart;
+
+ tovp = fromvp = NULL;
+
+ if ((fromfd == AT_FDCWD && old == NULL) ||
+ (tofd == AT_FDCWD && new == NULL))
+ return (set_errno(EFAULT));
+
+ if (fromfd == AT_FDCWD || tofd == AT_FDCWD) {
+ mutex_enter(&p->p_lock);
+ if (fromfd == AT_FDCWD) {
+ fromvp = PTOU(p)->u_cdir;
+ VN_HOLD(fromvp);
+ }
+ if (tofd == AT_FDCWD) {
+ tovp = PTOU(p)->u_cdir;
+ VN_HOLD(tovp);
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ /*
+ * Drop any directory holds taken above before bailing out
+ * on a bad user address.
+ */
+ if (copyin(old, &oldstart, sizeof (char)) ||
+     copyin(new, &newstart, sizeof (char))) {
+ if (fromvp != NULL)
+ VN_RELE(fromvp);
+ if (tovp != NULL)
+ VN_RELE(tovp);
+ return (set_errno(EFAULT));
+ }
+
+ if (fromvp == NULL) {
+ if (oldstart != '/') {
+ if ((fromfp = getf(fromfd)) == NULL) {
+ if (tovp != NULL)
+ VN_RELE(tovp);
+ return (set_errno(EBADF));
+ }
+ fromvp = fromfp->f_vnode;
+ VN_HOLD(fromvp);
+ releasef(fromfd);
+ } else {
+ fromvp = NULL;
+ }
+ }
+
+ if (tovp == NULL) {
+ if (newstart != '/') {
+ if ((tofp = getf(tofd)) == NULL) {
+ if (fromvp != NULL)
+ VN_RELE(fromvp);
+ return (set_errno(EBADF));
+ }
+ tovp = tofp->f_vnode;
+ VN_HOLD(tovp);
+ releasef(tofd);
+ } else {
+ tovp = NULL;
+ }
+ }
+
+ error = vn_renameat(fromvp, old, tovp, new, UIO_USERSPACE);
+
+ if (fromvp != NULL)
+ VN_RELE(fromvp);
+ if (tovp != NULL)
+ VN_RELE(tovp);
+ if (error != 0)
+ return (set_errno(error));
+ return (error);
+}
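+
+/*
+ * Illustrative sketch (not part of this file): with AT_FDCWD for both
+ * descriptors, renameat() degenerates to rename(); with directory
+ * descriptors, relative names resolve against those directories:
+ *
+ *	int dfd = open("/var/tmp", O_RDONLY);
+ *	if (renameat(dfd, "old.txt", dfd, "new.txt") == -1)
+ *		perror("renameat");
+ */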
diff --git a/usr/src/uts/common/syscall/resolvepath.c b/usr/src/uts/common/syscall/resolvepath.c
new file mode 100644
index 0000000000..e6cb678761
--- /dev/null
+++ b/usr/src/uts/common/syscall/resolvepath.c
@@ -0,0 +1,60 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+#ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Copyright 1997 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/pathname.h>
+
+int
+resolvepath(char *path, char *buf, size_t count)
+{
+ struct pathname lookpn;
+ struct pathname resolvepn;
+ int error;
+
+ if (count == 0)
+ return (0);
+ if (error = pn_get(path, UIO_USERSPACE, &lookpn))
+ return (set_errno(error));
+ pn_alloc(&resolvepn);
+ error = lookuppn(&lookpn, &resolvepn, FOLLOW, NULL, NULL);
+ if (error == 0) {
+ if (count > resolvepn.pn_pathlen)
+ count = resolvepn.pn_pathlen;
+ if (copyout(resolvepn.pn_path, buf, count))
+ error = EFAULT;
+ }
+ pn_free(&resolvepn);
+ pn_free(&lookpn);
+
+ if (error)
+ return (set_errno(error));
+ return ((int)count);
+}
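+
+/*
+ * Illustrative user-level sketch (not part of this file): like
+ * readlink(2), resolvepath(2) returns a byte count and does not
+ * NUL-terminate:
+ *
+ *	char real[PATH_MAX];
+ *	int n = resolvepath("/etc/../etc/passwd", real,
+ *	    sizeof (real) - 1);
+ *	if (n >= 0)
+ *		real[n] = '\0';
+ */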
diff --git a/usr/src/uts/common/syscall/rlimit.c b/usr/src/uts/common/syscall/rlimit.c
new file mode 100644
index 0000000000..eac3584764
--- /dev/null
+++ b/usr/src/uts/common/syscall/rlimit.c
@@ -0,0 +1,487 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/inttypes.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/tuneable.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/resource.h>
+#include <sys/ulimit.h>
+#include <sys/debug.h>
+#include <sys/rctl.h>
+
+#include <vm/as.h>
+
+/*
+ * Perhaps ulimit could be implemented in a user library, as calls
+ * to getrlimit and setrlimit, were it not for binary compatibility
+ * restrictions.
+ */
+long
+ulimit(int cmd, long arg)
+{
+ proc_t *p = curproc;
+ long retval;
+
+ switch (cmd) {
+
+ case UL_GFILLIM: /* Return current file size limit. */
+ {
+ rlim64_t filesize;
+
+ mutex_enter(&p->p_lock);
+ filesize = rctl_enforced_value(rctlproc_legacy[RLIMIT_FSIZE],
+ p->p_rctls, p);
+ mutex_exit(&p->p_lock);
+
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ /*
+ * File size is returned in blocks for ulimit.
+ * This function is deprecated, so the LFS API does not
+ * define its behaviour for large files.
+ * Here we return the maximum representable file size
+ * so that applications that do not check for errors
+ * continue to work.
+ */
+ if (filesize > MAXOFF32_T)
+ filesize = MAXOFF32_T;
+ retval = ((int)filesize >> SCTRSHFT);
+ } else
+ retval = filesize >> SCTRSHFT;
+ break;
+ }
+
+ case UL_SFILLIM: /* Set new file size limit. */
+ {
+ int error = 0;
+ rlim64_t lim = (rlim64_t)arg;
+ struct rlimit64 rl64;
+ rctl_alloc_gp_t *gp = rctl_rlimit_set_prealloc(1);
+
+ if (lim >= (((rlim64_t)MAXOFFSET_T) >> SCTRSHFT))
+ lim = (rlim64_t)RLIM64_INFINITY;
+ else
+ lim <<= SCTRSHFT;
+
+ rl64.rlim_max = rl64.rlim_cur = lim;
+ mutex_enter(&p->p_lock);
+ if (error = rctl_rlimit_set(rctlproc_legacy[RLIMIT_FSIZE], p,
+ &rl64, gp, RCTL_LOCAL_DENY | RCTL_LOCAL_SIGNAL, SIGXFSZ,
+ CRED())) {
+ mutex_exit(&p->p_lock);
+ rctl_prealloc_destroy(gp);
+ return (set_errno(error));
+ }
+ mutex_exit(&p->p_lock);
+ rctl_prealloc_destroy(gp);
+ retval = arg;
+ break;
+ }
+
+ case UL_GMEMLIM: /* Return maximum possible break value. */
+ {
+ struct seg *seg;
+ struct seg *nextseg;
+ struct as *as = p->p_as;
+ caddr_t brkend;
+ caddr_t brkbase;
+ size_t size;
+ rlim64_t size_ctl;
+ rlim64_t vmem_ctl;
+
+ /*
+ * Find the segment with a virtual address
+ * greater than the end of the current break.
+ */
+ nextseg = NULL;
+ mutex_enter(&p->p_lock);
+ brkbase = (caddr_t)p->p_brkbase;
+ brkend = (caddr_t)p->p_brkbase + p->p_brksize;
+ mutex_exit(&p->p_lock);
+
+ /*
+ * Since we can't return less than the current break,
+ * initialize the return value to the current break
+ */
+ retval = (long)brkend;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ for (seg = as_findseg(as, brkend, 0); seg != NULL;
+ seg = AS_SEGNEXT(as, seg)) {
+ if (seg->s_base >= brkend) {
+ nextseg = seg;
+ break;
+ }
+ }
+
+ mutex_enter(&p->p_lock);
+ size_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_DATA],
+ p->p_rctls, p);
+ vmem_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_VMEM],
+ p->p_rctls, p);
+ mutex_exit(&p->p_lock);
+
+ /*
+ * First, calculate the maximum break value based on
+ * the user's RLIMIT_DATA, but also taking into account
+ * that this value cannot be greater than as->a_userlimit.
+ * We also take care to make sure that we don't overflow
+ * in the calculation.
+ */
+ /*
+ * Since we cast the RLIMIT_DATA value to a ulong
+ * (a 32-bit value in the 32-bit kernel), the following
+ * assertion must hold.
+ */
+ ASSERT32((size_t)size_ctl <= UINT32_MAX);
+
+ size = (size_t)size_ctl;
+ if (as->a_userlimit - brkbase > size)
+ retval = MAX((size_t)retval, (size_t)(brkbase + size));
+ /* don't return less than current */
+ else
+ retval = (long)as->a_userlimit;
+
+ /*
+ * The max break cannot extend into the next segment
+ */
+ if (nextseg != NULL)
+ retval = MIN((uintptr_t)retval,
+ (uintptr_t)nextseg->s_base);
+
+ /*
+ * Handle the case where there is a limit on RLIMIT_VMEM.
+ */
+ if (vmem_ctl < UINT64_MAX) {
+ /* calculate brkend based on the end of page */
+ caddr_t brkendpg = (caddr_t)roundup((uintptr_t)brkend,
+ PAGESIZE);
+ /*
+ * Large Files: the following assertion must hold
+ * for the cast below to be correct.
+ */
+ ASSERT32(vmem_ctl <= UINT32_MAX);
+
+ size = (size_t)(vmem_ctl & PAGEMASK);
+
+ if (as->a_size < size)
+ size -= as->a_size;
+ else
+ size = 0;
+ /*
+ * Take care to not overflow the calculation
+ */
+ if (as->a_userlimit - brkendpg > size)
+ retval = MIN((size_t)retval,
+ (size_t)(brkendpg + size));
+ }
+
+ AS_LOCK_EXIT(as, &as->a_lock);
+
+ /* truncate to same boundary as sbrk */
+
+ switch (get_udatamodel()) {
+ default:
+ case DATAMODEL_ILP32:
+ retval = retval & ~(8-1);
+ break;
+ case DATAMODEL_LP64:
+ retval = retval & ~(16-1);
+ break;
+ }
+ break;
+ }
+
+ case UL_GDESLIM: /* Return approximate number of open files */
+ {
+ rlim64_t fdno_ctl;
+
+ mutex_enter(&curproc->p_lock);
+ fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
+ curproc->p_rctls, curproc);
+ ASSERT(fdno_ctl <= INT_MAX);
+ retval = (rlim_t)fdno_ctl;
+ mutex_exit(&curproc->p_lock);
+ break;
+ }
+
+ default:
+ return (set_errno(EINVAL));
+
+ }
+ return (retval);
+}
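+
+/*
+ * Illustrative sketch (not part of this file): unlike getrlimit(2),
+ * the file size limit here is expressed in 512-byte blocks (hence
+ * the SCTRSHFT shifts above):
+ *
+ *	long blocks = ulimit(UL_GFILLIM, 0);
+ *	long bytes = blocks << 9;	(assuming 512-byte units)
+ */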
+
+#ifdef _SYSCALL32_IMPL
+
+int
+ulimit32(int cmd, int arg)
+{
+ return ((int)ulimit(cmd, (long)arg));
+}
+
+#endif /* _SYSCALL32_IMPL */
+
+#if defined(_ILP32) || defined(_SYSCALL32_IMPL)
+
+/*
+ * Large Files: getrlimit returns RLIM_SAVED_CUR or RLIM_SAVED_MAX when
+ * rlim_cur or rlim_max is not representable in 32-bit rlim_t. These
+ * values are just tokens which will be used in setrlimit to set the
+ * correct limits. The current limits are saved in the saved_rlimit members
+ * in user structures when the token is returned. setrlimit restores
+ * the limit values to these saved values when the token is passed.
+ * Consider the following common scenario of the apps:
+ *
+ * limit = getrlimit();
+ * savedlimit = limit;
+ * limit = limit1;
+ * setrlimit(limit)
+ * // execute all processes in the new rlimit state.
+ * setrlimit(savedlimit) // restore the old values.
+ *
+ * Most apps don't check error returns from getrlimit or setrlimit
+ * and this is why we return tokens when the correct value
+ * cannot be represented in rlim_t. For more discussion refer to
+ * the LFS API document.
+ *
+ * In the 64-bit kernel, all existing resource limits are treated in this
+ * manner. In the 32-bit kernel, CPU time is treated equivalently to the
+ * file size limit above; the VM-related limits are not. The macro,
+ * RLIM_SAVED(x), returns true if the resource limit should be handled in
+ * this way on the current kernel.
+ */
+int
+getrlimit32(int resource, struct rlimit32 *rlp)
+{
+ struct rlimit32 rlim32;
+ struct rlimit64 rlim64;
+ struct proc *p = curproc;
+ struct user *up = PTOU(p);
+ int savecur = 0;
+ int savemax = 0;
+
+ if (resource < 0 || resource >= RLIM_NLIMITS)
+ return (set_errno(EINVAL));
+
+ mutex_enter(&p->p_lock);
+ (void) rctl_rlimit_get(rctlproc_legacy[resource], p, &rlim64);
+ mutex_exit(&p->p_lock);
+
+ if (rlim64.rlim_max > (rlim64_t)UINT32_MAX) {
+
+ if (rlim64.rlim_max == RLIM64_INFINITY)
+ rlim32.rlim_max = RLIM32_INFINITY;
+ else {
+ savemax = 1;
+ rlim32.rlim_max = RLIM32_SAVED_MAX;
+ /*CONSTCOND*/
+ ASSERT(RLIM_SAVED(resource));
+ }
+
+ if (rlim64.rlim_cur == RLIM64_INFINITY)
+ rlim32.rlim_cur = RLIM32_INFINITY;
+ else if (rlim64.rlim_cur == rlim64.rlim_max) {
+ savecur = 1;
+ rlim32.rlim_cur = RLIM32_SAVED_MAX;
+ /*CONSTCOND*/
+ ASSERT(RLIM_SAVED(resource));
+ } else if (rlim64.rlim_cur > (rlim64_t)UINT32_MAX) {
+ savecur = 1;
+ rlim32.rlim_cur = RLIM32_SAVED_CUR;
+ /*CONSTCOND*/
+ ASSERT(RLIM_SAVED(resource));
+ } else
+ rlim32.rlim_cur = rlim64.rlim_cur;
+
+ /*
+ * save the current limits in user structure.
+ */
+ /*CONSTCOND*/
+ if (RLIM_SAVED(resource)) {
+ mutex_enter(&p->p_lock);
+ if (savemax)
+ up->u_saved_rlimit[resource].rlim_max =
+ rlim64.rlim_max;
+ if (savecur)
+ up->u_saved_rlimit[resource].rlim_cur =
+ rlim64.rlim_cur;
+ mutex_exit(&p->p_lock);
+ }
+ } else {
+ ASSERT(rlim64.rlim_cur <= (rlim64_t)UINT32_MAX);
+ rlim32.rlim_max = rlim64.rlim_max;
+ rlim32.rlim_cur = rlim64.rlim_cur;
+ }
+
+ if (copyout(&rlim32, rlp, sizeof (rlim32)))
+ return (set_errno(EFAULT));
+
+ return (0);
+}
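+
+/*
+ * Illustrative sketch of the save/restore pattern the tokens enable
+ * for a 32-bit application (values are examples only):
+ *
+ *	struct rlimit saved, r;
+ *	(void) getrlimit(RLIMIT_FSIZE, &saved);
+ *	r = saved;
+ *	r.rlim_cur = 1024 * 1024;
+ *	(void) setrlimit(RLIMIT_FSIZE, &r);
+ *	...
+ *	(void) setrlimit(RLIMIT_FSIZE, &saved);
+ *
+ * Even if getrlimit() returned RLIM_SAVED_MAX or RLIM_SAVED_CUR, the
+ * final setrlimit() restores the full 64-bit limits saved in the
+ * user structure.
+ */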
+
+/*
+ * See comments above getrlimit32(). When the tokens are passed in the
+ * rlimit structure, the values are taken to be the values stored in
+ * the saved_rlimit members of the user structure.
+ * When the user passes RLIM32_INFINITY to set a resource limit to
+ * unlimited, we treat it internally as RLIM64_INFINITY and let the
+ * common setrlimit code do the job.
+ */
+int
+setrlimit32(int resource, struct rlimit32 *rlp)
+{
+ struct rlimit32 rlim32;
+ struct rlimit64 rlim64;
+ struct rlimit64 saved_rlim;
+ int error;
+ struct proc *p = ttoproc(curthread);
+ struct user *up = PTOU(p);
+ rctl_alloc_gp_t *gp;
+
+ if (resource < 0 || resource >= RLIM_NLIMITS)
+ return (set_errno(EINVAL));
+ if (copyin(rlp, &rlim32, sizeof (rlim32)))
+ return (set_errno(EFAULT));
+
+ gp = rctl_rlimit_set_prealloc(1);
+
+ /*
+ * Disallow resource limit tunnelling
+ */
+ /*CONSTCOND*/
+ if (RLIM_SAVED(resource)) {
+ mutex_enter(&p->p_lock);
+ saved_rlim = up->u_saved_rlimit[resource];
+ mutex_exit(&p->p_lock);
+ } else {
+ saved_rlim.rlim_max = (rlim64_t)rlim32.rlim_max;
+ saved_rlim.rlim_cur = (rlim64_t)rlim32.rlim_cur;
+ }
+
+ switch (rlim32.rlim_cur) {
+ case RLIM32_INFINITY:
+ rlim64.rlim_cur = RLIM64_INFINITY;
+ break;
+ case RLIM32_SAVED_CUR:
+ rlim64.rlim_cur = saved_rlim.rlim_cur;
+ break;
+ case RLIM32_SAVED_MAX:
+ rlim64.rlim_cur = saved_rlim.rlim_max;
+ break;
+ default:
+ rlim64.rlim_cur = (rlim64_t)rlim32.rlim_cur;
+ break;
+ }
+
+ switch (rlim32.rlim_max) {
+ case RLIM32_INFINITY:
+ rlim64.rlim_max = RLIM64_INFINITY;
+ break;
+ case RLIM32_SAVED_MAX:
+ rlim64.rlim_max = saved_rlim.rlim_max;
+ break;
+ case RLIM32_SAVED_CUR:
+ rlim64.rlim_max = saved_rlim.rlim_cur;
+ break;
+ default:
+ rlim64.rlim_max = (rlim64_t)rlim32.rlim_max;
+ break;
+ }
+
+ mutex_enter(&p->p_lock);
+ if (error = rctl_rlimit_set(rctlproc_legacy[resource], p, &rlim64, gp,
+ rctlproc_flags[resource], rctlproc_signals[resource], CRED())) {
+ mutex_exit(&p->p_lock);
+ rctl_prealloc_destroy(gp);
+ return (set_errno(error));
+ }
+ mutex_exit(&p->p_lock);
+ rctl_prealloc_destroy(gp);
+
+ return (0);
+}
+
+#endif /* _ILP32 && _SYSCALL32_IMPL */
+
+int
+getrlimit64(int resource, struct rlimit64 *rlp)
+{
+ struct rlimit64 rlim64;
+ struct proc *p = ttoproc(curthread);
+
+ if (resource < 0 || resource >= RLIM_NLIMITS)
+ return (set_errno(EINVAL));
+
+ mutex_enter(&p->p_lock);
+ (void) rctl_rlimit_get(rctlproc_legacy[resource], p, &rlim64);
+ mutex_exit(&p->p_lock);
+
+ if (copyout(&rlim64, rlp, sizeof (rlim64)))
+ return (set_errno(EFAULT));
+ return (0);
+}
+
+int
+setrlimit64(int resource, struct rlimit64 *rlp)
+{
+ struct rlimit64 rlim64;
+ struct proc *p = ttoproc(curthread);
+ int error;
+ rctl_alloc_gp_t *gp;
+
+ if (resource < 0 || resource >= RLIM_NLIMITS)
+ return (set_errno(EINVAL));
+ if (copyin(rlp, &rlim64, sizeof (rlim64)))
+ return (set_errno(EFAULT));
+
+ gp = rctl_rlimit_set_prealloc(1);
+
+ mutex_enter(&p->p_lock);
+ if (error = rctl_rlimit_set(rctlproc_legacy[resource], p, &rlim64, gp,
+ rctlproc_flags[resource], rctlproc_signals[resource], CRED())) {
+ mutex_exit(&p->p_lock);
+ rctl_prealloc_destroy(gp);
+ return (set_errno(error));
+ }
+ mutex_exit(&p->p_lock);
+ rctl_prealloc_destroy(gp);
+ return (0);
+
+}
diff --git a/usr/src/uts/common/syscall/rmdir.c b/usr/src/uts/common/syscall/rmdir.c
new file mode 100644
index 0000000000..0a0ad7e2cd
--- /dev/null
+++ b/usr/src/uts/common/syscall/rmdir.c
@@ -0,0 +1,60 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1989 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/filio.h>
+
+#include <sys/debug.h>
+
+/*
+ * Remove a directory.
+ */
+int
+rmdir(char *dname)
+{
+ int error;
+
+ if (error = vn_remove(dname, UIO_USERSPACE, RMDIRECTORY))
+ return (set_errno(error));
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c
new file mode 100644
index 0000000000..5b66f2fa41
--- /dev/null
+++ b/usr/src/uts/common/syscall/rusagesys.c
@@ -0,0 +1,294 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Implement fast getrusage call
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/time.h>
+#include <sys/errno.h>
+#include <sys/resource.h>
+
+static int
+getrusage(void *user_rusage)
+{
+ struct rusage r;
+ kthread_t *t = curthread;
+ proc_t *p = ttoproc(t);
+ hrtime_t snsecs, unsecs;
+ klwp_t *lwp;
+
+ r.ru_maxrss = 0; /* always 0 */
+ r.ru_ixrss = 0; /* always 0 */
+ r.ru_idrss = 0; /* always 0 */
+ r.ru_isrss = 0; /* always 0 */
+
+ r.ru_utime.tv_sec = 0;
+ r.ru_utime.tv_usec = 0;
+ r.ru_stime.tv_sec = 0;
+ r.ru_stime.tv_usec = 0;
+
+ /*
+ * Zero the counters before accumulating: the per-lwp loop
+ * below adds into them, and the defunct-lwp totals are only
+ * assigned when defunct lwps exist.
+ */
+ r.ru_majflt = r.ru_minflt = r.ru_nswap = 0;
+ r.ru_inblock = r.ru_oublock = 0;
+ r.ru_msgsnd = r.ru_msgrcv = 0;
+ r.ru_nsignals = r.ru_nvcsw = r.ru_nivcsw = 0;
+
+ mutex_enter(&p->p_lock);
+
+ if (p->p_defunct > 0) {
+ r.ru_majflt = p->p_ru.majflt;
+ r.ru_minflt = p->p_ru.minflt;
+ r.ru_nswap = p->p_ru.nswap;
+ r.ru_inblock = p->p_ru.inblock;
+ r.ru_oublock = p->p_ru.oublock;
+ r.ru_msgsnd = p->p_ru.msgsnd;
+ r.ru_msgrcv = p->p_ru.msgrcv;
+ r.ru_nsignals = p->p_ru.nsignals;
+ r.ru_nvcsw = p->p_ru.nvcsw;
+ r.ru_nivcsw = p->p_ru.nivcsw;
+ }
+
+ unsecs = mstate_aggr_state(p, LMS_USER);
+ snsecs = mstate_aggr_state(p, LMS_SYSTEM);
+
+ do {
+ if (t->t_proc_flag & TP_LWPEXIT)
+ continue;
+
+ lwp = ttolwp(t);
+
+ r.ru_majflt += lwp->lwp_ru.majflt;
+ r.ru_minflt += lwp->lwp_ru.minflt;
+ r.ru_nswap += lwp->lwp_ru.nswap;
+ r.ru_inblock += lwp->lwp_ru.inblock;
+ r.ru_oublock += lwp->lwp_ru.oublock;
+ r.ru_msgsnd += lwp->lwp_ru.msgsnd;
+ r.ru_msgrcv += lwp->lwp_ru.msgrcv;
+ r.ru_nsignals += lwp->lwp_ru.nsignals;
+ r.ru_nvcsw += lwp->lwp_ru.nvcsw;
+ r.ru_nivcsw += lwp->lwp_ru.nivcsw;
+
+ } while ((t = t->t_forw) != curthread);
+
+ mutex_exit(&p->p_lock);
+
+ hrt2tv(unsecs, &r.ru_utime);
+ hrt2tv(snsecs, &r.ru_stime);
+
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ struct rusage32 r32;
+
+ r32.ru_maxrss = 0; /* always 0 */
+ r32.ru_ixrss = 0; /* always 0 */
+ r32.ru_idrss = 0; /* always 0 */
+ r32.ru_isrss = 0; /* always 0 */
+
+ r32.ru_utime.tv_sec = r.ru_utime.tv_sec;
+ r32.ru_utime.tv_usec = r.ru_utime.tv_usec;
+ r32.ru_stime.tv_sec = r.ru_stime.tv_sec;
+ r32.ru_stime.tv_usec = r.ru_stime.tv_usec;
+
+ r32.ru_majflt = (int32_t)r.ru_majflt;
+ r32.ru_minflt = (int32_t)r.ru_minflt;
+ r32.ru_nswap = (int32_t)r.ru_nswap;
+ r32.ru_inblock = (int32_t)r.ru_inblock;
+ r32.ru_oublock = (int32_t)r.ru_oublock;
+ r32.ru_msgsnd = (int32_t)r.ru_msgsnd;
+ r32.ru_msgrcv = (int32_t)r.ru_msgrcv;
+ r32.ru_nsignals = (int32_t)r.ru_nsignals;
+ r32.ru_nvcsw = (int32_t)r.ru_nvcsw;
+ r32.ru_nivcsw = (int32_t)r.ru_nivcsw;
+ if (copyout(&r32, user_rusage, sizeof (r32)) != 0)
+ return (set_errno(EFAULT));
+ } else
+#endif /* _SYSCALL32_IMPL */
+
+ if (copyout(&r, user_rusage, sizeof (r)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+static int
+getrusage_chld(void *user_rusage)
+{
+ struct rusage r;
+ kthread_t *t = curthread;
+ proc_t *p = ttoproc(t);
+
+ hrtime_t snsecs, unsecs;
+
+ r.ru_maxrss = 0; /* always 0 */
+ r.ru_ixrss = 0; /* always 0 */
+ r.ru_idrss = 0; /* always 0 */
+ r.ru_isrss = 0; /* always 0 */
+
+ mutex_enter(&p->p_lock);
+
+ unsecs = p->p_cacct[LMS_USER];
+ snsecs = p->p_cacct[LMS_SYSTEM] + p->p_cacct[LMS_TRAP];
+ r.ru_utime.tv_sec = 0;
+ r.ru_utime.tv_usec = 0;
+ r.ru_stime.tv_sec = 0;
+ r.ru_stime.tv_usec = 0;
+
+ r.ru_majflt = p->p_cru.majflt;
+ r.ru_minflt = p->p_cru.minflt;
+ r.ru_nswap = p->p_cru.nswap;
+ r.ru_inblock = p->p_cru.inblock;
+ r.ru_oublock = p->p_cru.oublock;
+ r.ru_msgsnd = p->p_cru.msgsnd;
+ r.ru_msgrcv = p->p_cru.msgrcv;
+ r.ru_nsignals = p->p_cru.nsignals;
+ r.ru_nvcsw = p->p_cru.nvcsw;
+ r.ru_nivcsw = p->p_cru.nivcsw;
+
+ mutex_exit(&p->p_lock);
+
+ hrt2tv(unsecs, &r.ru_utime);
+ hrt2tv(snsecs, &r.ru_stime);
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ struct rusage32 r32;
+
+ r32.ru_maxrss = 0; /* always 0 */
+ r32.ru_ixrss = 0; /* always 0 */
+ r32.ru_idrss = 0; /* always 0 */
+ r32.ru_isrss = 0; /* always 0 */
+
+ r32.ru_utime.tv_sec = r.ru_utime.tv_sec;
+ r32.ru_utime.tv_usec = r.ru_utime.tv_usec;
+ r32.ru_stime.tv_sec = r.ru_stime.tv_sec;
+ r32.ru_stime.tv_usec = r.ru_stime.tv_usec;
+
+ r32.ru_majflt = (int32_t)r.ru_majflt;
+ r32.ru_minflt = (int32_t)r.ru_minflt;
+ r32.ru_nswap = (int32_t)r.ru_nswap;
+ r32.ru_inblock = (int32_t)r.ru_inblock;
+ r32.ru_oublock = (int32_t)r.ru_oublock;
+ r32.ru_msgsnd = (int32_t)r.ru_msgsnd;
+ r32.ru_msgrcv = (int32_t)r.ru_msgrcv;
+ r32.ru_nsignals = (int32_t)r.ru_nsignals;
+ r32.ru_nvcsw = (int32_t)r.ru_nvcsw;
+ r32.ru_nivcsw = (int32_t)r.ru_nivcsw;
+ if (copyout(&r32, user_rusage, sizeof (r32)) != 0)
+ return (set_errno(EFAULT));
+ } else
+#endif /* _SYSCALL32_IMPL */
+
+ if (copyout(&r, user_rusage, sizeof (r)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+static int
+getrusage_lwp(void *user_rusage)
+{
+ struct rusage r;
+ kthread_t *t = curthread;
+ klwp_t *lwp;
+ hrtime_t snsecs, unsecs;
+ struct mstate *ms;
+
+ r.ru_maxrss = 0; /* always 0 */
+ r.ru_ixrss = 0; /* always 0 */
+ r.ru_idrss = 0; /* always 0 */
+ r.ru_isrss = 0; /* always 0 */
+ r.ru_utime.tv_sec = 0;
+ r.ru_utime.tv_usec = 0;
+ r.ru_stime.tv_sec = 0;
+ r.ru_stime.tv_usec = 0;
+
+ lwp = ttolwp(t);
+ ms = &lwp->lwp_mstate;
+ unsecs = ms->ms_acct[LMS_USER];
+ snsecs = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP];
+ scalehrtime(&unsecs);
+ scalehrtime(&snsecs);
+ r.ru_majflt = lwp->lwp_ru.majflt;
+ r.ru_minflt = lwp->lwp_ru.minflt;
+ r.ru_nswap = lwp->lwp_ru.nswap;
+ r.ru_inblock = lwp->lwp_ru.inblock;
+ r.ru_oublock = lwp->lwp_ru.oublock;
+ r.ru_msgsnd = lwp->lwp_ru.msgsnd;
+ r.ru_msgrcv = lwp->lwp_ru.msgrcv;
+ r.ru_nsignals = lwp->lwp_ru.nsignals;
+ r.ru_nvcsw = lwp->lwp_ru.nvcsw;
+ r.ru_nivcsw = lwp->lwp_ru.nivcsw;
+
+ hrt2tv(unsecs, &r.ru_utime);
+ hrt2tv(snsecs, &r.ru_stime);
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ struct rusage32 r32;
+
+ r32.ru_maxrss = 0; /* always 0 */
+ r32.ru_ixrss = 0; /* always 0 */
+ r32.ru_idrss = 0; /* always 0 */
+ r32.ru_isrss = 0; /* always 0 */
+
+ r32.ru_utime.tv_sec = r.ru_utime.tv_sec;
+ r32.ru_utime.tv_usec = r.ru_utime.tv_usec;
+ r32.ru_stime.tv_sec = r.ru_stime.tv_sec;
+ r32.ru_stime.tv_usec = r.ru_stime.tv_usec;
+
+ r32.ru_majflt = (int32_t)r.ru_majflt;
+ r32.ru_minflt = (int32_t)r.ru_minflt;
+ r32.ru_nswap = (int32_t)r.ru_nswap;
+ r32.ru_inblock = (int32_t)r.ru_inblock;
+ r32.ru_oublock = (int32_t)r.ru_oublock;
+ r32.ru_msgsnd = (int32_t)r.ru_msgsnd;
+ r32.ru_msgrcv = (int32_t)r.ru_msgrcv;
+ r32.ru_nsignals = (int32_t)r.ru_nsignals;
+ r32.ru_nvcsw = (int32_t)r.ru_nvcsw;
+ r32.ru_nivcsw = (int32_t)r.ru_nivcsw;
+ if (copyout(&r32, user_rusage, sizeof (r32)) != 0)
+ return (set_errno(EFAULT));
+ } else
+#endif /* _SYSCALL32_IMPL */
+
+ if (copyout(&r, user_rusage, sizeof (r)) != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+int
+rusagesys(int code, void * arg)
+{
+ switch (code) {
+
+ case _RUSAGESYS_GETRUSAGE:
+ return (getrusage(arg));
+ case _RUSAGESYS_GETRUSAGE_CHLD:
+ return (getrusage_chld(arg));
+ case _RUSAGESYS_GETRUSAGE_LWP:
+ return (getrusage_lwp(arg));
+ default:
+ return (set_errno(EINVAL));
+ }
+}
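+
+/*
+ * These subcodes back getrusage(3C): RUSAGE_SELF, RUSAGE_CHILDREN and
+ * the Solaris-specific RUSAGE_LWP map onto _RUSAGESYS_GETRUSAGE,
+ * _RUSAGESYS_GETRUSAGE_CHLD and _RUSAGESYS_GETRUSAGE_LWP respectively.
+ * Illustrative user-level call:
+ *
+ *	struct rusage ru;
+ *	if (getrusage(RUSAGE_SELF, &ru) == 0)
+ *		(void) printf("%ld voluntary switches\n", ru.ru_nvcsw);
+ */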
diff --git a/usr/src/uts/common/syscall/rw.c b/usr/src/uts/common/syscall/rw.c
new file mode 100644
index 0000000000..d2f35e2051
--- /dev/null
+++ b/usr/src/uts/common/syscall/rw.c
@@ -0,0 +1,1223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/inttypes.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/user.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/cpuvar.h>
+#include <sys/uio.h>
+#include <sys/ioreq.h>
+#include <sys/debug.h>
+#include <sys/rctl.h>
+#include <sys/nbmlock.h>
+
+#define COPYOUT_MIN_SIZE (1<<17) /* 128K */
+
+static size_t copyout_min_size = COPYOUT_MIN_SIZE;
+
+/*
+ * read, write, pread, pwrite, readv, and writev syscalls.
+ *
+ * 64-bit open: all opens are large-file opens.
+ * Large Files: the behaviour of read depends on whether the fd
+ * corresponds to a large-file open or not.
+ * 32-bit open: the FOFFMAX flag is not set.
+ * reads succeed up to MAXOFF32_T - 1; a read at MAXOFF32_T
+ * returns EOVERFLOW if count is non-zero and the file size
+ * is > MAXOFF32_T. If the file size is <= MAXOFF32_T, a read
+ * at >= MAXOFF32_T returns EOF.
+ */
+
+/*
+ * Native system call
+ */
+ssize_t
+read(int fdes, void *cbuf, size_t count)
+{
+ struct uio auio;
+ struct iovec aiov;
+ file_t *fp;
+ register vnode_t *vp;
+ struct cpu *cp;
+ int fflag, ioflag, rwflag;
+ ssize_t cnt, bcount;
+ int error = 0;
+ u_offset_t fileoff;
+ int in_crit = 0;
+
+ if ((cnt = (ssize_t)count) < 0)
+ return (set_errno(EINVAL));
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & FREAD) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ vp = fp->f_vnode;
+
+ if (vp->v_type == VREG && cnt == 0) {
+ goto out;
+ }
+
+ rwflag = 0;
+ aiov.iov_base = cbuf;
+ aiov.iov_len = cnt;
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with write() calls.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_READ, fp->f_offset, cnt, svmand)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+
+ /*
+ * We do the following checks inside VOP_RWLOCK so as to
+ * prevent the file size from changing while the checks are
+ * being made. We also load fp's offset into the local
+ * variable fileoff because a parallel lseek (f_offset is
+ * not protected by any lock) could change f_offset at any
+ * time. We must read the value exactly once and base the
+ * decision on that snapshot; reading it more than once
+ * could yield inconsistent results.
+ */
+
+ fileoff = (u_offset_t)fp->f_offset;
+ if (fileoff >= OFFSET_MAX(fp) && (vp->v_type == VREG)) {
+ struct vattr va;
+ va.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred))) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ goto out;
+ }
+ if (fileoff >= va.va_size) {
+ cnt = 0;
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ goto out;
+ } else {
+ error = EOVERFLOW;
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ goto out;
+ }
+ }
+ if ((vp->v_type == VREG) &&
+ (fileoff + cnt > OFFSET_MAX(fp))) {
+ cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
+ }
+ auio.uio_loffset = fileoff;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = bcount = cnt;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = fflag;
+ /*
+ * Only bypass the cache when the count is large enough.
+ */
+ if (bcount < copyout_min_size)
+ auio.uio_extflg = UIO_COPY_CACHED;
+ else
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ /* If read sync is not asked for, filter sync flags */
+ if ((ioflag & FRSYNC) == 0)
+ ioflag &= ~(FSYNC|FDSYNC);
+ error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
+ cnt -= auio.uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, sysread, 1);
+ CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)cnt);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
+
+ if (vp->v_type == VFIFO) /* Backward compatibility */
+ fp->f_offset = cnt;
+ else if (((fp->f_flag & FAPPEND) == 0) ||
+ (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
+ fp->f_offset = auio.uio_loffset;
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ if (error == EINTR && cnt != 0)
+ error = 0;
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (cnt);
+}
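+
+/*
+ * Illustrative user-level sketch (not part of this file): since an
+ * interrupted read() that has already transferred data returns the
+ * partial count (see the EINTR handling above), robust callers loop
+ * until they have everything they asked for:
+ *
+ *	size_t done = 0;
+ *	ssize_t n;
+ *
+ *	while (done < len &&
+ *	    (n = read(fd, buf + done, len - done)) > 0)
+ *		done += n;
+ */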
+
+/*
+ * Native system call
+ */
+ssize_t
+write(int fdes, void *cbuf, size_t count)
+{
+ struct uio auio;
+ struct iovec aiov;
+ file_t *fp;
+ register vnode_t *vp;
+ struct cpu *cp;
+ int fflag, ioflag, rwflag;
+ ssize_t cnt, bcount;
+ int error = 0;
+ u_offset_t fileoff;
+ int in_crit = 0;
+
+ if ((cnt = (ssize_t)count) < 0)
+ return (set_errno(EINVAL));
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ vp = fp->f_vnode;
+
+ if (vp->v_type == VREG && cnt == 0) {
+ goto out;
+ }
+
+ rwflag = 1;
+ aiov.iov_base = cbuf;
+ aiov.iov_len = cnt;
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, cnt, svmand)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+
+ fileoff = fp->f_offset;
+ if (vp->v_type == VREG) {
+
+ /*
+ * We post the resource-control signal if a write of more
+ * than zero bytes would start at or beyond the file size limit.
+ */
+ if (fileoff >= curproc->p_fsz_ctl) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+ curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
+ mutex_exit(&curproc->p_lock);
+
+ error = EFBIG;
+ goto out;
+ }
+ /*
+ * We return EFBIG if the write begins at or beyond the
+ * offset maximum for this open file structure.
+ */
+
+ if (fileoff >= OFFSET_MAX(fp)) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ error = EFBIG;
+ goto out;
+ }
+ /*
+ * Limit the number of bytes written so that we stay within
+ * the offset maximum for this open file structure.
+ */
+ if (fileoff + cnt > OFFSET_MAX(fp))
+ cnt = (ssize_t)(OFFSET_MAX(fp) - fileoff);
+ }
+ auio.uio_loffset = fileoff;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = bcount = cnt;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
+ cnt -= auio.uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, syswrite, 1);
+ CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)cnt);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
+
+ if (vp->v_type == VFIFO) /* Backward compatibility */
+ fp->f_offset = cnt;
+ else if (((fp->f_flag & FAPPEND) == 0) ||
+ (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
+ fp->f_offset = auio.uio_loffset;
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ if (error == EINTR && cnt != 0)
+ error = 0;
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (cnt);
+}
+
+ssize_t
+pread(int fdes, void *cbuf, size_t count, off_t offset)
+{
+ struct uio auio;
+ struct iovec aiov;
+ file_t *fp;
+ register vnode_t *vp;
+ struct cpu *cp;
+ int fflag, ioflag, rwflag;
+ ssize_t bcount;
+ int error = 0;
+ u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
+#ifdef _SYSCALL32_IMPL
+ u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
+ MAXOFF32_T : MAXOFFSET_T;
+#else
+ const u_offset_t maxoff = MAXOFF32_T;
+#endif
+ int in_crit = 0;
+
+ if ((bcount = (ssize_t)count) < 0)
+ return (set_errno(EINVAL));
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & (FREAD)) == 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ rwflag = 0;
+ vp = fp->f_vnode;
+
+ if (vp->v_type == VREG) {
+
+ if (bcount == 0)
+ goto out;
+
+ /*
+ * Return EINVAL if an invalid offset is passed to pread;
+ * a negative offset from the user arrives here as a huge
+ * unsigned value and fails this check.
+ */
+
+ if (fileoff > maxoff) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * Limit the transfer so that we don't read or write
+ * the file beyond the maximum offset representable in
+ * an off_t.
+ */
+ if (fileoff + bcount > maxoff)
+ bcount = (ssize_t)((offset_t)maxoff - fileoff);
+ } else if (vp->v_type == VFIFO) {
+ error = ESPIPE;
+ goto out;
+ }
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ aiov.iov_base = cbuf;
+ aiov.iov_len = bcount;
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+ if (vp->v_type == VREG && fileoff == (u_offset_t)maxoff) {
+ struct vattr va;
+ va.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred))) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ goto out;
+ }
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ /*
+ * We have to return EOF if fileoff is >= file size.
+ */
+ if (fileoff >= va.va_size) {
+ bcount = 0;
+ goto out;
+ }
+
+ /*
+ * The file extends to or beyond maxoff, and therefore
+ * we return EOVERFLOW.
+ */
+ error = EOVERFLOW;
+ goto out;
+ }
+ auio.uio_loffset = fileoff;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = bcount;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ /* If read sync is not asked for, filter sync flags */
+ if ((ioflag & FRSYNC) == 0)
+ ioflag &= ~(FSYNC|FDSYNC);
+ error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
+ bcount -= auio.uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, sysread, 1);
+ CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ if (error == EINTR && bcount != 0)
+ error = 0;
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (bcount);
+}
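+
+/*
+ * Illustrative sketch (not part of this file): pread() never updates
+ * f_offset, so threads sharing one descriptor can read disjoint
+ * regions without serializing around lseek():
+ *
+ *	(void) pread(fd, hdr, sizeof (hdr), 0);
+ *	(void) pread(fd, blk, blksz, (off_t)recno * blksz);
+ */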
+
+ssize_t
+pwrite(int fdes, void *cbuf, size_t count, off_t offset)
+{
+ struct uio auio;
+ struct iovec aiov;
+ file_t *fp;
+ register vnode_t *vp;
+ struct cpu *cp;
+ int fflag, ioflag, rwflag;
+ ssize_t bcount;
+ int error = 0;
+ u_offset_t fileoff = (u_offset_t)(ulong_t)offset;
+#ifdef _SYSCALL32_IMPL
+ u_offset_t maxoff = get_udatamodel() == DATAMODEL_ILP32 ?
+ MAXOFF32_T : MAXOFFSET_T;
+#else
+ const u_offset_t maxoff = MAXOFF32_T;
+#endif
+ int in_crit = 0;
+
+ if ((bcount = (ssize_t)count) < 0)
+ return (set_errno(EINVAL));
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ rwflag = 1;
+ vp = fp->f_vnode;
+
+ if (vp->v_type == VREG) {
+
+ if (bcount == 0)
+ goto out;
+
+ /*
+ * return EINVAL for offsets that cannot be
+ * represented in an off_t.
+ */
+ if (fileoff > maxoff) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * Take appropriate action if we are trying to write above the
+ * resource limit.
+ */
+ if (fileoff >= curproc->p_fsz_ctl) {
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+ curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
+ mutex_exit(&curproc->p_lock);
+
+ error = EFBIG;
+ goto out;
+ }
+ /*
+ * Don't allow pwrite to cause file sizes to exceed
+ * maxoff.
+ */
+ if (fileoff == maxoff) {
+ error = EFBIG;
+ goto out;
+ }
+ if (fileoff + count > maxoff)
+ bcount = (ssize_t)((u_offset_t)maxoff - fileoff);
+ } else if (vp->v_type == VFIFO) {
+ error = ESPIPE;
+ goto out;
+ }
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ aiov.iov_base = cbuf;
+ aiov.iov_len = bcount;
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+ auio.uio_loffset = fileoff;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = bcount;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
+ bcount -= auio.uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, syswrite, 1);
+ CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ if (error == EINTR && bcount != 0)
+ error = 0;
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (bcount);
+}
+
+/*
+ * XXX -- The SVID refers to IOV_MAX, but doesn't define it. Grrrr....
+ * XXX -- However, SVVS expects readv() and writev() to fail if
+ * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source),
+ * XXX -- so I guess that's the "interface".
+ */
+#define DEF_IOV_MAX 16
+
+ssize_t
+readv(int fdes, struct iovec *iovp, int iovcnt)
+{
+ struct uio auio;
+ struct iovec aiov[DEF_IOV_MAX];
+ file_t *fp;
+ register vnode_t *vp;
+ struct cpu *cp;
+ int fflag, ioflag, rwflag;
+ ssize_t count, bcount;
+ int error = 0;
+ int i;
+ u_offset_t fileoff;
+ int in_crit = 0;
+
+ if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+ return (set_errno(EINVAL));
+
+#ifdef _SYSCALL32_IMPL
+ /*
+ * 32-bit callers need to have their iovec expanded,
+ * while ensuring that they can't move more than 2Gbytes
+ * of data in a single call.
+ */
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ struct iovec32 aiov32[DEF_IOV_MAX];
+ ssize32_t count32;
+
+ if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+ return (set_errno(EFAULT));
+
+ count32 = 0;
+ for (i = 0; i < iovcnt; i++) {
+ ssize32_t iovlen32 = aiov32[i].iov_len;
+ count32 += iovlen32;
+ if (iovlen32 < 0 || count32 < 0)
+ return (set_errno(EINVAL));
+ aiov[i].iov_len = iovlen32;
+ aiov[i].iov_base =
+ (caddr_t)(uintptr_t)aiov32[i].iov_base;
+ }
+ } else
+#endif
+ if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+ return (set_errno(EFAULT));
+
+ count = 0;
+ for (i = 0; i < iovcnt; i++) {
+ ssize_t iovlen = aiov[i].iov_len;
+ count += iovlen;
+ if (iovlen < 0 || count < 0)
+ return (set_errno(EINVAL));
+ }
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & FREAD) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type == VREG && count == 0) {
+ goto out;
+ }
+
+ rwflag = 0;
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_READ, fp->f_offset, count, svmand)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+ fileoff = fp->f_offset;
+
+ /*
+ * The behaviour is the same as read(); see the comments there.
+ */
+
+ if ((vp->v_type == VREG) && (fileoff >= OFFSET_MAX(fp))) {
+ struct vattr va;
+ va.va_mask = AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &va, 0, fp->f_cred))) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ goto out;
+ }
+ if (fileoff >= va.va_size) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ count = 0;
+ goto out;
+ } else {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ error = EOVERFLOW;
+ goto out;
+ }
+ }
+ if ((vp->v_type == VREG) && (fileoff + count > OFFSET_MAX(fp))) {
+ count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
+ }
+ auio.uio_loffset = fileoff;
+ auio.uio_iov = aiov;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_resid = bcount = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = fflag;
+ if (bcount < copyout_min_size)
+ auio.uio_extflg = UIO_COPY_CACHED;
+ else
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+
+ ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ /* If read sync is not asked for, filter sync flags */
+ if ((ioflag & FRSYNC) == 0)
+ ioflag &= ~(FSYNC|FDSYNC);
+ error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
+ count -= auio.uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, sysread, 1);
+ CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)count);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
+
+ if (vp->v_type == VFIFO) /* Backward compatibility */
+ fp->f_offset = count;
+ else if (((fp->f_flag & FAPPEND) == 0) ||
+ (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
+ fp->f_offset = auio.uio_loffset;
+
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ if (error == EINTR && count != 0)
+ error = 0;
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (count);
+}
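+
+/*
+ * Illustrative user-level sketch (not part of this file): gathering a
+ * fixed header and a payload in one readv() call, within the
+ * DEF_IOV_MAX limit noted above:
+ *
+ *	struct iovec iov[2];
+ *
+ *	iov[0].iov_base = (caddr_t)&hdr;
+ *	iov[0].iov_len = sizeof (hdr);
+ *	iov[1].iov_base = payload;
+ *	iov[1].iov_len = paylen;
+ *	if (readv(fd, iov, 2) == -1)
+ *		perror("readv");
+ */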
+
+ssize_t
+writev(int fdes, struct iovec *iovp, int iovcnt)
+{
+ struct uio auio;
+ struct iovec aiov[DEF_IOV_MAX];
+ file_t *fp;
+ register vnode_t *vp;
+ struct cpu *cp;
+ int fflag, ioflag, rwflag;
+ ssize_t count, bcount;
+ int error = 0;
+ int i;
+ u_offset_t fileoff;
+ int in_crit = 0;
+
+ if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX)
+ return (set_errno(EINVAL));
+
+#ifdef _SYSCALL32_IMPL
+ /*
+ * 32-bit callers need to have their iovec expanded,
+ * while ensuring that they can't move more than 2Gbytes
+ * of data in a single call.
+ */
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ struct iovec32 aiov32[DEF_IOV_MAX];
+ ssize32_t count32;
+
+ if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32)))
+ return (set_errno(EFAULT));
+
+ count32 = 0;
+ for (i = 0; i < iovcnt; i++) {
+ ssize32_t iovlen = aiov32[i].iov_len;
+ count32 += iovlen;
+ if (iovlen < 0 || count32 < 0)
+ return (set_errno(EINVAL));
+ aiov[i].iov_len = iovlen;
+ aiov[i].iov_base =
+ (caddr_t)(uintptr_t)aiov32[i].iov_base;
+ }
+ } else
+#endif
+ if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec)))
+ return (set_errno(EFAULT));
+
+ count = 0;
+ for (i = 0; i < iovcnt; i++) {
+ ssize_t iovlen = aiov[i].iov_len;
+ count += iovlen;
+ if (iovlen < 0 || count < 0)
+ return (set_errno(EINVAL));
+ }
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & FWRITE) == 0) {
+ error = EBADF;
+ goto out;
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type == VREG && count == 0) {
+ goto out;
+ }
+
+ rwflag = 1;
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_WRITE, fp->f_offset, count, svmand)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+
+ fileoff = fp->f_offset;
+
+ /*
+	 * Behaviour is the same as in write(): enforce the file-size
+	 * resource control and OFFSET_MAX(fp) (EFBIG) before starting,
+	 * and trim the transfer so it does not cross OFFSET_MAX(fp).
+ */
+
+ if (vp->v_type == VREG) {
+ if (fileoff >= curproc->p_fsz_ctl) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+ curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
+ mutex_exit(&curproc->p_lock);
+ error = EFBIG;
+ goto out;
+ }
+ if (fileoff >= OFFSET_MAX(fp)) {
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ error = EFBIG;
+ goto out;
+ }
+ if (fileoff + count > OFFSET_MAX(fp))
+ count = (ssize_t)(OFFSET_MAX(fp) - fileoff);
+ }
+ auio.uio_loffset = fileoff;
+ auio.uio_iov = aiov;
+ auio.uio_iovcnt = iovcnt;
+ auio.uio_resid = bcount = count;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+
+ ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
+ count -= auio.uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, syswrite, 1);
+ CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)count);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)count;
+
+ if (vp->v_type == VFIFO) /* Backward compatibility */
+ fp->f_offset = count;
+ else if (((fp->f_flag & FAPPEND) == 0) ||
+ (vp->v_type != VREG) || (bcount != 0)) /* POSIX */
+ fp->f_offset = auio.uio_loffset;
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ if (error == EINTR && count != 0)
+ error = 0;
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (count);
+}
+
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+
+/*
+ * This syscall supplies 64-bit file offsets to 32-bit applications only.
+ */
+ssize32_t
+pread64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
+ uint32_t offset_2)
+{
+ struct uio auio;
+ struct iovec aiov;
+ file_t *fp;
+ register vnode_t *vp;
+ struct cpu *cp;
+ int fflag, ioflag, rwflag;
+ ssize_t bcount;
+ int error = 0;
+ u_offset_t fileoff;
+ int in_crit = 0;
+
+#if defined(_LITTLE_ENDIAN)
+ fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
+#else
+ fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
+#endif
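+
+	/*
+	 * Worked example (illustrative, not from the original sources):
+	 * a 32-bit caller reading at offset 0x123456789 passes that
+	 * offset as two 32-bit halves.  On a little-endian machine the
+	 * low word arrives in offset_1, so offset_1 == 0x23456789 and
+	 * offset_2 == 0x00000001; big-endian reverses the two.  Either
+	 * way the expression above reassembles fileoff == 0x123456789.
+	 */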
+
+ if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
+ return (set_errno(EINVAL));
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & (FREAD)) == 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ rwflag = 0;
+ vp = fp->f_vnode;
+
+ if (vp->v_type == VREG) {
+
+ if (bcount == 0)
+ goto out;
+
+ /*
+ * Same as pread. See comments in pread.
+ */
+
+ if (fileoff > MAXOFFSET_T) {
+ error = EINVAL;
+ goto out;
+ }
+ if (fileoff + bcount > MAXOFFSET_T)
+ bcount = (ssize_t)(MAXOFFSET_T - fileoff);
+ } else if (vp->v_type == VFIFO) {
+ error = ESPIPE;
+ goto out;
+ }
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_READ, fileoff, bcount, svmand)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ aiov.iov_base = cbuf;
+ aiov.iov_len = bcount;
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+ auio.uio_loffset = fileoff;
+
+ /*
+	 * Note: file size can never be greater than MAXOFFSET_T.
+	 * If we ever start supporting 128-bit files, code similar
+	 * to that in pread() would belong here.  We avoid the
+	 * otherwise unnecessary VOP_GETATTR() because fileoff ==
+	 * MAXOFFSET_T already implies that fileoff is greater than
+	 * or equal to the file size.
+ */
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = bcount;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ /* If read sync is not asked for, filter sync flags */
+ if ((ioflag & FRSYNC) == 0)
+ ioflag &= ~(FSYNC|FDSYNC);
+ error = VOP_READ(vp, &auio, ioflag, fp->f_cred, NULL);
+ bcount -= auio.uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, sysread, 1);
+ CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)bcount);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ if (error == EINTR && bcount != 0)
+ error = 0;
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (bcount);
+}
+
+/*
+ * This syscall supplies 64-bit file offsets to 32-bit applications only.
+ */
+ssize32_t
+pwrite64(int fdes, void *cbuf, size32_t count, uint32_t offset_1,
+ uint32_t offset_2)
+{
+ struct uio auio;
+ struct iovec aiov;
+ file_t *fp;
+ register vnode_t *vp;
+ struct cpu *cp;
+ int fflag, ioflag, rwflag;
+ ssize_t bcount;
+ int error = 0;
+ u_offset_t fileoff;
+ int in_crit = 0;
+
+#if defined(_LITTLE_ENDIAN)
+ fileoff = ((u_offset_t)offset_2 << 32) | (u_offset_t)offset_1;
+#else
+ fileoff = ((u_offset_t)offset_1 << 32) | (u_offset_t)offset_2;
+#endif
+
+ if ((bcount = (ssize_t)count) < 0 || bcount > INT32_MAX)
+ return (set_errno(EINVAL));
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ if (((fflag = fp->f_flag) & (FWRITE)) == 0) {
+ error = EBADF;
+ goto out;
+ }
+
+ rwflag = 1;
+ vp = fp->f_vnode;
+
+ if (vp->v_type == VREG) {
+
+ if (bcount == 0)
+ goto out;
+
+ /*
+ * See comments in pwrite.
+ */
+ if (fileoff > MAXOFFSET_T) {
+ error = EINVAL;
+ goto out;
+ }
+ if (fileoff >= curproc->p_fsz_ctl) {
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
+ curproc->p_rctls, curproc, RCA_SAFE);
+ mutex_exit(&curproc->p_lock);
+ error = EFBIG;
+ goto out;
+ }
+ if (fileoff == MAXOFFSET_T) {
+ error = EFBIG;
+ goto out;
+ }
+ if (fileoff + bcount > MAXOFFSET_T)
+ bcount = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
+ } else if (vp->v_type == VFIFO) {
+ error = ESPIPE;
+ goto out;
+ }
+
+ /*
+ * We have to enter the critical region before calling VOP_RWLOCK
+ * to avoid a deadlock with ufs.
+ */
+ if (nbl_need_check(vp)) {
+ int svmand;
+
+ nbl_start_crit(vp, RW_READER);
+ in_crit = 1;
+ error = nbl_svmand(vp, fp->f_cred, &svmand);
+ if (error != 0)
+ goto out;
+ if (nbl_conflict(vp, NBL_WRITE, fileoff, bcount, svmand)) {
+ error = EACCES;
+ goto out;
+ }
+ }
+
+ aiov.iov_base = cbuf;
+ aiov.iov_len = bcount;
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+ auio.uio_loffset = fileoff;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = bcount;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ auio.uio_extflg = UIO_COPY_CACHED;
+
+ ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ error = VOP_WRITE(vp, &auio, ioflag, fp->f_cred, NULL);
+ bcount -= auio.uio_resid;
+ CPU_STATS_ENTER_K();
+ cp = CPU;
+ CPU_STATS_ADDQ(cp, sys, syswrite, 1);
+ CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)bcount);
+ CPU_STATS_EXIT_K();
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)bcount;
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+
+ if (error == EINTR && bcount != 0)
+ error = 0;
+out:
+ if (in_crit)
+ nbl_end_crit(vp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (bcount);
+}
+
+#endif /* _SYSCALL32_IMPL || _ILP32 */
+
+#ifdef _SYSCALL32_IMPL
+/*
+ * Tail-call elimination of xxx32() down to xxx()
+ *
+ * A number of xxx32 system calls take a len (or count) argument and
+ * return a number in the range [0,len] or -1 on error.
+ * Given an ssize32_t input len, the downcall xxx() will return
+ * a 64-bit value that is -1 or in the range [0,len] which actually
+ * is a proper return value for the xxx32 call. So even if the xxx32
+ * calls can be considered as returning a ssize32_t, they are currently
+ * declared as returning a ssize_t as this enables tail-call elimination.
+ *
+ * The cast of len (or count) to ssize32_t is needed to ensure we pass
+ * down negative input values as such and let the downcall handle error
+ * reporting. Functions covered by this comment are:
+ *
+ * rw.c: read32, write32, pread32, pwrite32, readv32, writev32.
+ * socksyscall.c: recv32, recvfrom32, send32, sendto32.
+ * readlink.c: readlink32.
+ */
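+
+/*
+ * Worked example (illustrative, not part of the original comment): a
+ * 32-bit caller invoking read(fd, buf, 0xffffffffU) reaches read32()
+ * with count == 0xffffffff.  The (ssize32_t)count cast turns this into
+ * -1, which is sign-extended on the way down, so read()'s own count
+ * validation rejects it with set_errno(EINVAL).  Truncated back to 32
+ * bits, the resulting 64-bit -1 is still -1: exactly the return value
+ * the 32-bit caller expects, with no fixup needed.
+ */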
+
+ssize_t
+read32(int32_t fdes, caddr32_t cbuf, size32_t count)
+{
+ return (read(fdes,
+ (void *)(uintptr_t)cbuf, (ssize32_t)count));
+}
+
+ssize_t
+write32(int32_t fdes, caddr32_t cbuf, size32_t count)
+{
+ return (write(fdes,
+ (void *)(uintptr_t)cbuf, (ssize32_t)count));
+}
+
+ssize_t
+pread32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
+{
+ return (pread(fdes,
+ (void *)(uintptr_t)cbuf, (ssize32_t)count,
+ (off_t)(uint32_t)offset));
+}
+
+ssize_t
+pwrite32(int32_t fdes, caddr32_t cbuf, size32_t count, off32_t offset)
+{
+ return (pwrite(fdes,
+ (void *)(uintptr_t)cbuf, (ssize32_t)count,
+ (off_t)(uint32_t)offset));
+}
+
+ssize_t
+readv32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
+{
+ return (readv(fdes, (void *)(uintptr_t)iovp, iovcnt));
+}
+
+ssize_t
+writev32(int32_t fdes, caddr32_t iovp, int32_t iovcnt)
+{
+ return (writev(fdes, (void *)(uintptr_t)iovp, iovcnt));
+}
+
+#endif /* _SYSCALL32_IMPL */
diff --git a/usr/src/uts/common/syscall/sem.c b/usr/src/uts/common/syscall/sem.c
new file mode 100644
index 0000000000..5498418a27
--- /dev/null
+++ b/usr/src/uts/common/syscall/sem.c
@@ -0,0 +1,1208 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Inter-Process Communication Semaphore Facility.
+ *
+ * See os/ipc.c for a description of common IPC functionality.
+ *
+ * Resource controls
+ * -----------------
+ *
+ * Control: project.max-sem-ids (rc_project_semmni)
+ * Description: Maximum number of semaphore ids allowed a project.
+ *
+ * When semget() is used to allocate a semaphore set, one id is
+ * allocated. If the id allocation doesn't succeed, semget() fails
+ * and errno is set to ENOSPC. Upon a successful semctl(..., IPC_RMID)
+ * the id is deallocated.
+ *
+ * Control: process.max-sem-nsems (rc_process_semmsl)
+ * Description: Maximum number of semaphores allowed per semaphore set.
+ *
+ * When semget() is used to allocate a semaphore set, the size of the
+ * set is compared with this limit. If the number of semaphores
+ * exceeds the limit, semget() fails and errno is set to EINVAL.
+ *
+ * Control: process.max-sem-ops (rc_process_semopm)
+ * Description: Maximum number of semaphore operations allowed per
+ * semop call.
+ *
+ * When semget() successfully allocates a semaphore set, the minimum
+ * enforced value of this limit is used to initialize the
+ * "system-imposed maximum" number of operations a semop() call for
+ * this set can perform.
+ *
+ * Undo structures
+ * ---------------
+ *
+ * Removing the undo structure tunables involved a serious redesign of
+ * how they were implemented. There is now one undo structure for
+ * every process/semaphore array combination (lazily allocated, of
+ * course), and each is equal in size to the semaphore it corresponds
+ * to. To avoid scalability and performance problems, the undo
+ * structures are stored in two places: a per-process AVL tree sorted
+ * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
+ * per-semaphore linked list (sem_undos, protected by the semaphore's
+ * ID lock). The former is used by semop, where a lookup is performed
+ * once and cached if SEM_UNDO is specified for any of the operations,
+ * and at process exit where the undoable operations are rolled back.
+ * The latter is used when removing the semaphore, so the undo
+ * structures can be removed from the appropriate processes' trees.
+ *
+ * The undo structure itself contains pointers to the ksemid and proc
+ * to which it corresponds, a list node, an AVL node, and an array of
+ * adjust-on-exit (AOE) values. When an undo structure is allocated it
+ * is immediately added to both the process's tree and the semaphore's
+ * list. Lastly, the reference count on the semaphore is increased.
+ *
+ * A lock ordering violation between p_lock and the ID lock is apt to
+ * occur when a process exit races with the removal of a semaphore;
+ * avoiding it mandates the delicate dance that exists between semexit
+ * and sem_rmid.
+ *
+ * sem_rmid, holding the ID lock, iterates through all undo structures
+ * and for each takes the appropriate process's p_lock and checks to
+ * see if p_semacct is NULL. If it is, it skips that undo structure
+ * and continues to the next. Otherwise, it removes the undo structure
+ * from both the AVL tree and the semaphore's list, and releases the
+ * hold that the undo structure had on the semaphore.
+ *
+ * The important other half of this is semexit, which will immediately
+ * take p_lock, obtain the AVL pointer, clear p_semacct, and drop
+ * p_lock. From this point on it is semexit's responsibility to clean
+ * up all undo structures found in the tree -- a coexecuting sem_rmid
+ * will see the NULL p_semacct and skip that undo structure. It walks
+ * the AVL tree (using avl_destroy_nodes) and for each undo structure
+ * takes the appropriate semaphore's ID lock (always legal since the
+ * undo structure has a hold on the semaphore), updates all semaphores
+ * with non-zero AOE values, and removes the structure from the
+ * semaphore's list. It then drops the structure's reference on the
+ * semaphore, drops the ID lock, and frees the undo structure.
+ */
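+
+/*
+ * Illustrative userland sketch (not part of the original sources) of
+ * the SEM_UNDO behaviour the undo structures above implement:
+ *
+ *	int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
+ *	struct sembuf op = { 0, 1, SEM_UNDO };
+ *	(void) semop(id, &op, 1);
+ *
+ * The semop() raises semaphore 0 by one and records an adjust-on-exit
+ * value of -1 in this process's undo structure for the set; should the
+ * process exit without lowering the semaphore itself, semexit()
+ * applies the -1 and the semaphore value returns to 0.
+ */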
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/vmem.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/time.h>
+#include <sys/ipc.h>
+#include <sys/ipc_impl.h>
+#include <sys/sem.h>
+#include <sys/sem_impl.h>
+#include <sys/user.h>
+#include <sys/proc.h>
+#include <sys/cpuvar.h>
+#include <sys/debug.h>
+#include <sys/var.h>
+#include <sys/cmn_err.h>
+#include <sys/modctl.h>
+#include <sys/syscall.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/zone.h>
+
+#include <c2/audit.h>
+
+extern rctl_hndl_t rc_project_semmni;
+extern rctl_hndl_t rc_process_semmsl;
+extern rctl_hndl_t rc_process_semopm;
+static ipc_service_t *sem_svc;
+static zone_key_t sem_zone_key;
+
+/*
+ * The following tunables are obsolete. Though for compatibility we
+ * still read and interpret seminfo_semmsl, seminfo_semopm and
+ * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
+ * mechanism for administrating the IPC Semaphore facility is through
+ * the resource controls described at the top of this file.
+ */
+int seminfo_semaem = 16384; /* (obsolete) */
+int seminfo_semmap = 10; /* (obsolete) */
+int seminfo_semmni = 10; /* (obsolete) */
+int seminfo_semmns = 60; /* (obsolete) */
+int seminfo_semmnu = 30; /* (obsolete) */
+int seminfo_semmsl = 25; /* (obsolete) */
+int seminfo_semopm = 10; /* (obsolete) */
+int seminfo_semume = 10; /* (obsolete) */
+int seminfo_semusz = 96; /* (obsolete) */
+int seminfo_semvmx = 32767; /* (obsolete) */
+
+#define SEM_MAXUCOPS 4096 /* max # of unchecked ops per semop call */
+#define SEM_UNDOSZ(n) (sizeof (struct sem_undo) + (n - 1) * sizeof (int))
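+
+/*
+ * For example (illustrative): a set with nsems == 4 needs
+ * SEM_UNDOSZ(4) == sizeof (struct sem_undo) + 3 * sizeof (int) bytes;
+ * the (n - 1) term suggests struct sem_undo already embeds the first
+ * un_aoe element (the usual pre-C99 flexible-array idiom).
+ */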
+
+static int semsys(int opcode, uintptr_t a0, uintptr_t a1,
+ uintptr_t a2, uintptr_t a3);
+static void sem_dtor(kipc_perm_t *);
+static void sem_rmid(kipc_perm_t *);
+static void sem_remove_zone(zoneid_t, void *);
+
+static struct sysent ipcsem_sysent = {
+ 5,
+ SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
+ semsys
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+static struct modlsys modlsys = {
+ &mod_syscallops, "System V semaphore facility", &ipcsem_sysent
+};
+
+#ifdef _SYSCALL32_IMPL
+static struct modlsys modlsys32 = {
+ &mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
+};
+#endif
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modlsys,
+#ifdef _SYSCALL32_IMPL
+ &modlsys32,
+#endif
+ NULL
+};
+
+
+int
+_init(void)
+{
+ int result;
+
+ sem_svc = ipcs_create("semids", rc_project_semmni, sizeof (ksemid_t),
+ sem_dtor, sem_rmid, AT_IPC_SEM,
+ offsetof(kproject_data_t, kpd_semmni));
+ zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);
+
+ if ((result = mod_install(&modlinkage)) == 0)
+ return (0);
+
+ (void) zone_key_delete(sem_zone_key);
+ ipcs_destroy(sem_svc);
+
+ return (result);
+}
+
+int
+_fini(void)
+{
+ return (EBUSY);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+static void
+sem_dtor(kipc_perm_t *perm)
+{
+ ksemid_t *sp = (ksemid_t *)perm;
+
+ kmem_free(sp->sem_base,
+ P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64));
+ list_destroy(&sp->sem_undos);
+}
+
+/*
+ * sem_undo_add - Create or update adjust on exit entry.
+ */
+static int
+sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
+{
+ int newval = undo->un_aoe[num] - val;
+
+ if (newval > USHRT_MAX || newval < -USHRT_MAX)
+ return (ERANGE);
+ undo->un_aoe[num] = newval;
+
+ return (0);
+}
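+
+/*
+ * For example (illustrative): successive undoable operations of +3 and
+ * then -2 on semaphore num leave un_aoe[num] == 0 - 3 + 2 == -1,
+ * exactly the adjustment semexit() must apply to restore the semaphore
+ * if the process exits without undoing its work.
+ */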
+
+/*
+ * sem_undo_clear - clears all undo entries for specified semaphores
+ *
+ * Used when semaphores are reset by SETVAL or SETALL.
+ */
+static void
+sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
+{
+ struct sem_undo *undo;
+ int i;
+
+ ASSERT(low <= high);
+ ASSERT(high < sp->sem_nsems);
+
+ for (undo = list_head(&sp->sem_undos); undo;
+ undo = list_next(&sp->sem_undos, undo))
+ for (i = low; i <= high; i++)
+ undo->un_aoe[i] = 0;
+}
+
+/*
+ * sem_rollback - roll back work done so far if unable to complete operation
+ */
+static void
+sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
+{
+ struct sem *semp; /* semaphore ptr */
+
+ for (op += n - 1; n--; op--) {
+ if (op->sem_op == 0)
+ continue;
+ semp = &sp->sem_base[op->sem_num];
+ semp->semval -= op->sem_op;
+ if (op->sem_flg & SEM_UNDO) {
+ ASSERT(undo != NULL);
+ (void) sem_undo_add(-op->sem_op, op->sem_num, undo);
+ }
+ }
+}
+
+static void
+sem_rmid(kipc_perm_t *perm)
+{
+ ksemid_t *sp = (ksemid_t *)perm;
+ struct sem *semp;
+ struct sem_undo *undo;
+ size_t size = SEM_UNDOSZ(sp->sem_nsems);
+ int i;
+
+ /*LINTED*/
+ while (undo = list_head(&sp->sem_undos)) {
+ list_remove(&sp->sem_undos, undo);
+ mutex_enter(&undo->un_proc->p_lock);
+ if (undo->un_proc->p_semacct == NULL) {
+ mutex_exit(&undo->un_proc->p_lock);
+ continue;
+ }
+ avl_remove(undo->un_proc->p_semacct, undo);
+ mutex_exit(&undo->un_proc->p_lock);
+ kmem_free(undo, size);
+ ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
+ }
+
+ for (i = 0; i < sp->sem_nsems; i++) {
+ semp = &sp->sem_base[i];
+ semp->semval = semp->sempid = 0;
+ if (semp->semncnt) {
+ cv_broadcast(&semp->semncnt_cv);
+ semp->semncnt = 0;
+ }
+ if (semp->semzcnt) {
+ cv_broadcast(&semp->semzcnt_cv);
+ semp->semzcnt = 0;
+ }
+ }
+}
+
+/*
+ * semctl - Semctl system call.
+ */
+static int
+semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
+{
+ ksemid_t *sp; /* ptr to semaphore header */
+ struct sem *p; /* ptr to semaphore */
+ unsigned int i; /* loop control */
+ ushort_t *vals, *vp;
+ size_t vsize = 0;
+ int error = 0;
+ int retval = 0;
+ struct cred *cr;
+ kmutex_t *lock;
+ model_t mdl = get_udatamodel();
+ STRUCT_DECL(semid_ds, sid);
+ struct semid_ds64 ds64;
+
+ STRUCT_INIT(sid, mdl);
+ cr = CRED();
+
+ /*
+ * Perform pre- or non-lookup actions (e.g. copyins, RMID).
+ */
+ switch (cmd) {
+ case IPC_SET:
+ if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
+ return (set_errno(EFAULT));
+ break;
+
+ case IPC_SET64:
+ if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
+ return (set_errno(EFAULT));
+ break;
+
+ case SETALL:
+ if ((lock = ipc_lookup(sem_svc, semid,
+ (kipc_perm_t **)&sp)) == NULL)
+ return (set_errno(EINVAL));
+ vsize = sp->sem_nsems * sizeof (*vals);
+ mutex_exit(lock);
+
+ /* allocate space to hold all semaphore values */
+ vals = kmem_alloc(vsize, KM_SLEEP);
+
+ if (copyin((void *)arg, vals, vsize)) {
+ kmem_free(vals, vsize);
+ return (set_errno(EFAULT));
+ }
+ break;
+
+ case IPC_RMID:
+ if (error = ipc_rmid(sem_svc, semid, cr))
+ return (set_errno(error));
+ return (0);
+ }
+
+ if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
+ if (vsize != 0)
+ kmem_free(vals, vsize);
+ return (set_errno(EINVAL));
+ }
+ switch (cmd) {
+ /* Set ownership and permissions. */
+ case IPC_SET:
+
+ if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
+ &STRUCT_BUF(sid)->sem_perm, mdl)) {
+ mutex_exit(lock);
+ return (set_errno(error));
+ }
+ sp->sem_ctime = gethrestime_sec();
+ mutex_exit(lock);
+ return (0);
+
+ /* Get semaphore data structure. */
+ case IPC_STAT:
+
+ if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
+ mutex_exit(lock);
+ return (set_errno(error));
+ }
+
+ ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
+ STRUCT_FSETP(sid, sem_base, NULL); /* kernel addr */
+ STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
+ STRUCT_FSET(sid, sem_otime, sp->sem_otime);
+ STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
+ STRUCT_FSET(sid, sem_binary, sp->sem_binary);
+ mutex_exit(lock);
+
+ if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
+ return (set_errno(EFAULT));
+ return (0);
+
+ case IPC_SET64:
+
+ if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
+ &ds64.semx_perm)) {
+ mutex_exit(lock);
+ return (set_errno(error));
+ }
+ sp->sem_ctime = gethrestime_sec();
+ mutex_exit(lock);
+ return (0);
+
+ case IPC_STAT64:
+
+ ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
+ ds64.semx_nsems = sp->sem_nsems;
+ ds64.semx_otime = sp->sem_otime;
+ ds64.semx_ctime = sp->sem_ctime;
+
+ mutex_exit(lock);
+ if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
+ return (set_errno(EFAULT));
+
+ return (0);
+
+ /* Get # of processes sleeping for greater semval. */
+ case GETNCNT:
+ if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
+ mutex_exit(lock);
+ return (set_errno(error));
+ }
+ if (semnum >= sp->sem_nsems) {
+ mutex_exit(lock);
+ return (set_errno(EINVAL));
+ }
+ retval = sp->sem_base[semnum].semncnt;
+ mutex_exit(lock);
+ return (retval);
+
+ /* Get pid of last process to operate on semaphore. */
+ case GETPID:
+ if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
+ mutex_exit(lock);
+ return (set_errno(error));
+ }
+ if (semnum >= sp->sem_nsems) {
+ mutex_exit(lock);
+ return (set_errno(EINVAL));
+ }
+ retval = sp->sem_base[semnum].sempid;
+ mutex_exit(lock);
+ return (retval);
+
+ /* Get semval of one semaphore. */
+ case GETVAL:
+ if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
+ mutex_exit(lock);
+ return (set_errno(error));
+ }
+ if (semnum >= sp->sem_nsems) {
+ mutex_exit(lock);
+ return (set_errno(EINVAL));
+ }
+ retval = sp->sem_base[semnum].semval;
+ mutex_exit(lock);
+ return (retval);
+
+ /* Get all semvals in set. */
+ case GETALL:
+ if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
+ mutex_exit(lock);
+ return (set_errno(error));
+ }
+
+ /* allocate space to hold all semaphore values */
+ vsize = sp->sem_nsems * sizeof (*vals);
+ vals = vp = kmem_alloc(vsize, KM_SLEEP);
+
+ for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
+ bcopy(&p->semval, vp, sizeof (p->semval));
+
+ mutex_exit(lock);
+
+ if (copyout((void *)vals, (void *)arg, vsize)) {
+ kmem_free(vals, vsize);
+ return (set_errno(EFAULT));
+ }
+
+ kmem_free(vals, vsize);
+ return (0);
+
+ /* Get # of processes sleeping for semval to become zero. */
+ case GETZCNT:
+ if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
+ mutex_exit(lock);
+ return (set_errno(error));
+ }
+ if (semnum >= sp->sem_nsems) {
+ mutex_exit(lock);
+ return (set_errno(EINVAL));
+ }
+ retval = sp->sem_base[semnum].semzcnt;
+ mutex_exit(lock);
+ return (retval);
+
+ /* Set semval of one semaphore. */
+ case SETVAL:
+ if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
+ mutex_exit(lock);
+ return (set_errno(error));
+ }
+ if (semnum >= sp->sem_nsems) {
+ mutex_exit(lock);
+ return (set_errno(EINVAL));
+ }
+ if ((uint_t)arg > USHRT_MAX) {
+ mutex_exit(lock);
+ return (set_errno(ERANGE));
+ }
+ p = &sp->sem_base[semnum];
+ if ((p->semval = (ushort_t)arg) != 0) {
+ if (p->semncnt) {
+ cv_broadcast(&p->semncnt_cv);
+ }
+ } else if (p->semzcnt) {
+ cv_broadcast(&p->semzcnt_cv);
+ }
+ p->sempid = curproc->p_pid;
+ sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
+ mutex_exit(lock);
+ return (0);
+
+ /* Set semvals of all semaphores in set. */
+ case SETALL:
+		/*
+		 * Fail if the semaphore set was deleted and reallocated
+		 * (with a different size) after the copyin above.
+		 */
+ if (sp->sem_nsems * sizeof (*vals) != vsize) {
+ error = set_errno(EINVAL);
+ goto seterr;
+ }
+ if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
+ error = set_errno(error);
+ goto seterr;
+ }
+ sem_undo_clear(sp, 0, sp->sem_nsems - 1);
+ for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
+ (p++)->sempid = curproc->p_pid) {
+ if ((p->semval = vals[i++]) != 0) {
+ if (p->semncnt) {
+ cv_broadcast(&p->semncnt_cv);
+ }
+ } else if (p->semzcnt) {
+ cv_broadcast(&p->semzcnt_cv);
+ }
+ }
+seterr:
+ mutex_exit(lock);
+ kmem_free(vals, vsize);
+ return (error);
+
+ default:
+ mutex_exit(lock);
+ return (set_errno(EINVAL));
+ }
+
+ /* NOTREACHED */
+}
+
+/*
+ * semexit - Called by exit() to clean up on process exit.
+ */
+void
+semexit(proc_t *pp)
+{
+ avl_tree_t *tree;
+ struct sem_undo *undo;
+ void *cookie = NULL;
+
+ mutex_enter(&pp->p_lock);
+ tree = pp->p_semacct;
+ pp->p_semacct = NULL;
+ mutex_exit(&pp->p_lock);
+
+ while (undo = avl_destroy_nodes(tree, &cookie)) {
+ ksemid_t *sp = undo->un_sp;
+ size_t size = SEM_UNDOSZ(sp->sem_nsems);
+ int i;
+
+ (void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
+ if (!IPC_FREE(&sp->sem_perm)) {
+ for (i = 0; i < sp->sem_nsems; i++) {
+ int adj = undo->un_aoe[i];
+ if (adj) {
+ struct sem *semp = &sp->sem_base[i];
+ int v = (int)semp->semval + adj;
+
+ if (v < 0 || v > USHRT_MAX)
+ continue;
+ semp->semval = (ushort_t)v;
+ if (v == 0 && semp->semzcnt)
+ cv_broadcast(&semp->semzcnt_cv);
+ if (adj > 0 && semp->semncnt)
+ cv_broadcast(&semp->semncnt_cv);
+ }
+ }
+ list_remove(&sp->sem_undos, undo);
+ }
+ ipc_rele(sem_svc, (kipc_perm_t *)sp);
+ kmem_free(undo, size);
+ }
+
+ avl_destroy(tree);
+ kmem_free(tree, sizeof (avl_tree_t));
+}
+
+/*
+ * Remove all semaphores associated with a given zone. Called by
+ * zone_shutdown when the zone is halted.
+ */
+/*ARGSUSED1*/
+static void
+sem_remove_zone(zoneid_t zoneid, void *arg)
+{
+ ipc_remove_zone(sem_svc, zoneid);
+}
+
+/*
+ * semget - Semget system call.
+ */
+static int
+semget(key_t key, int nsems, int semflg)
+{
+ ksemid_t *sp;
+ kmutex_t *lock;
+ int id, error;
+ proc_t *pp = curproc;
+
+top:
+ if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
+ return (set_errno(error));
+
+ if (!IPC_FREE(&sp->sem_perm)) {
+ /*
+ * A semaphore with the requested key exists.
+ */
+ if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
+ mutex_exit(lock);
+ return (set_errno(EINVAL));
+ }
+ } else {
+ /*
+ * This is a new semaphore set. Finish initialization.
+ */
+ if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
+ nsems, RCA_SAFE) & RCT_DENY)) {
+ mutex_exit(lock);
+ mutex_exit(&pp->p_lock);
+ ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
+ return (set_errno(EINVAL));
+ }
+ mutex_exit(lock);
+ mutex_exit(&pp->p_lock);
+
+ /*
+ * We round the allocation up to coherency granularity
+ * so that multiple semaphore allocations won't result
+ * in the false sharing of their sem structures.
+ */
+ sp->sem_base =
+ kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
+ KM_SLEEP);
+ sp->sem_binary = (nsems == 1);
+ sp->sem_nsems = (ushort_t)nsems;
+ sp->sem_ctime = gethrestime_sec();
+ sp->sem_otime = 0;
+ list_create(&sp->sem_undos, sizeof (struct sem_undo),
+ offsetof(struct sem_undo, un_list));
+
+ if (error = ipc_commit_begin(sem_svc, key, semflg,
+ (kipc_perm_t *)sp)) {
+ if (error == EAGAIN)
+ goto top;
+ return (set_errno(error));
+ }
+ sp->sem_maxops =
+ rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
+ if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
+ RCA_SAFE) & RCT_DENY) {
+ ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
+ return (set_errno(EINVAL));
+ }
+ lock = ipc_commit_end(sem_svc, &sp->sem_perm);
+ }
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_ipcget(AT_IPC_SEM, (void *)sp);
+#endif
+ id = sp->sem_perm.ipc_id;
+ mutex_exit(lock);
+ return (id);
+}
+
+/*
+ * semids system call.
+ */
+static int
+semids(int *buf, uint_t nids, uint_t *pnids)
+{
+ int error;
+
+ if (error = ipc_ids(sem_svc, buf, nids, pnids))
+ return (set_errno(error));
+
+ return (0);
+}
+
+/*
+ * Helper function for semop - copies in the provided timespec and
+ * computes the absolute future time after which we must return.
+ */
+static int
+compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
+ timespec_t *timeout)
+{
+ model_t datamodel = get_udatamodel();
+
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyin(timeout, ts, sizeof (timespec_t)))
+ return (EFAULT);
+ } else {
+ timespec32_t ts32;
+
+ if (copyin(timeout, &ts32, sizeof (timespec32_t)))
+ return (EFAULT);
+ TIMESPEC32_TO_TIMESPEC(ts, &ts32)
+ }
+
+ if (itimerspecfix(ts))
+ return (EINVAL);
+
+ /*
+ * Convert the timespec value into absolute time.
+ */
+ timespecadd(ts, now);
+ *tsp = ts;
+
+ return (0);
+}
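+
+/*
+ * For example (illustrative): with *now == {1000, 0} and a copied-in
+ * relative timeout of {1, 500000000}, timespecadd() leaves *ts ==
+ * {1001, 500000000}, the absolute deadline that semop() later hands to
+ * cv_waituntil_sig().
+ */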
+
+/*
+ * Undo structure comparator. We sort based on ksemid_t pointer.
+ */
+static int
+sem_undo_compar(const void *x, const void *y)
+{
+ struct sem_undo *undo1 = (struct sem_undo *)x;
+ struct sem_undo *undo2 = (struct sem_undo *)y;
+
+ if (undo1->un_sp < undo2->un_sp)
+ return (-1);
+ if (undo1->un_sp > undo2->un_sp)
+ return (1);
+ return (0);
+}
+
+/*
+ * Helper function for semop - creates an undo structure and adds it to
+ * the process's avl tree and the semaphore's list.
+ */
+static int
+sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
+ struct sem_undo *template, struct sem_undo **un)
+{
+ size_t size;
+ struct sem_undo *undo;
+ avl_tree_t *tree = NULL;
+ avl_index_t where;
+
+ mutex_exit(*lock);
+
+ size = SEM_UNDOSZ(sp->sem_nsems);
+ undo = kmem_zalloc(size, KM_SLEEP);
+ undo->un_proc = pp;
+ undo->un_sp = sp;
+
+ if (pp->p_semacct == NULL)
+ tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+
+ *lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
+ if (IPC_FREE(&sp->sem_perm)) {
+ kmem_free(undo, size);
+ if (tree)
+ kmem_free(tree, sizeof (avl_tree_t));
+ return (EIDRM);
+ }
+
+ mutex_enter(&pp->p_lock);
+ if (tree) {
+ if (pp->p_semacct == NULL) {
+ avl_create(tree, sem_undo_compar,
+ sizeof (struct sem_undo),
+ offsetof(struct sem_undo, un_avl));
+ pp->p_semacct = tree;
+ } else {
+ kmem_free(tree, sizeof (avl_tree_t));
+ }
+ }
+
+ if (*un = avl_find(pp->p_semacct, template, &where)) {
+ mutex_exit(&pp->p_lock);
+ kmem_free(undo, size);
+ } else {
+ *un = undo;
+ avl_insert(pp->p_semacct, undo, where);
+ mutex_exit(&pp->p_lock);
+ list_insert_head(&sp->sem_undos, undo);
+ ipc_hold(sem_svc, (kipc_perm_t *)sp);
+ }
+
+ return (0);
+}
+
+/*
+ * semop - Semop system call.
+ */
+static int
+semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
+{
+ ksemid_t *sp = NULL;
+ kmutex_t *lock;
+ struct sembuf *op; /* ptr to operation */
+ int i; /* loop control */
+ struct sem *semp; /* ptr to semaphore */
+ int error = 0;
+ struct sembuf *uops; /* ptr to copy of user ops */
+ struct sembuf x_sem; /* avoid kmem_alloc's */
+ timespec_t now, ts, *tsp = NULL;
+ int timecheck = 0;
+ int cvres, needundo, mode;
+ struct sem_undo *undo;
+ proc_t *pp = curproc;
+ int held = 0;
+
+ CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */
+
+ /*
+ * To avoid the cost of copying in 'timeout' in the common
+ * case, we could only grab the time here and defer the copyin
+ * and associated computations until we are about to block.
+ *
+ * The down side to this is that we would then have to spin
+ * some goto top nonsense to avoid the copyin behind the semid
+ * lock. As a common use of timed semaphores is as an explicit
+ * blocking mechanism, this could incur a greater penalty.
+ *
+ * If we eventually decide that this would be a wise route to
+ * take, the deferrable functionality is completely contained
+ * in 'compute_timeout', and the interface is defined such that
+ * we can legally not validate 'timeout' if it is unused.
+ */
+ if (timeout != NULL) {
+ timecheck = timechanged;
+ gethrestime(&now);
+ if (error = compute_timeout(&tsp, &ts, &now, timeout))
+ return (set_errno(error));
+ }
+
+ /*
+ * Allocate space to hold the vector of semaphore ops. If
+ * there is only 1 operation we use a preallocated buffer on
+ * the stack for speed.
+ *
+ * Since we don't want to allow the user to allocate an
+ * arbitrary amount of kernel memory, we need to check against
+ * the number of operations allowed by the semaphore. We only
+ * bother doing this if the number of operations is larger than
+ * SEM_MAXUCOPS.
+ */
+ if (nsops == 1)
+ uops = &x_sem;
+ else if (nsops == 0)
+ return (0);
+ else if (nsops <= SEM_MAXUCOPS)
+ uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
+
+ if (nsops > SEM_MAXUCOPS) {
+ if ((lock = ipc_lookup(sem_svc, semid,
+ (kipc_perm_t **)&sp)) == NULL)
+ return (set_errno(EFAULT));
+
+ if (nsops > sp->sem_maxops) {
+ mutex_exit(lock);
+ return (set_errno(E2BIG));
+ }
+ held = 1;
+ ipc_hold(sem_svc, (kipc_perm_t *)sp);
+ mutex_exit(lock);
+
+ uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
+ if (copyin(sops, uops, nsops * sizeof (*op))) {
+ error = EFAULT;
+ (void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
+ goto semoperr;
+ }
+
+ lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
+ if (IPC_FREE(&sp->sem_perm)) {
+ error = EIDRM;
+ goto semoperr;
+ }
+ } else {
+ /*
+ * This could be interleaved with the above code, but
+ * keeping them separate improves readability.
+ */
+ if (copyin(sops, uops, nsops * sizeof (*op))) {
+ error = EFAULT;
+ goto semoperr_unlocked;
+ }
+
+ if ((lock = ipc_lookup(sem_svc, semid,
+ (kipc_perm_t **)&sp)) == NULL) {
+ error = EINVAL;
+ goto semoperr_unlocked;
+ }
+
+ if (nsops > sp->sem_maxops) {
+ error = E2BIG;
+ goto semoperr;
+ }
+ }
+
+ /*
+ * Scan all operations. Verify that sem #s are in range and
+ * this process is allowed the requested operations. If any
+ * operations are marked SEM_UNDO, find (or allocate) the undo
+ * structure for this process and semaphore.
+ */
+ needundo = 0;
+ mode = 0;
+ for (i = 0, op = uops; i++ < nsops; op++) {
+ mode |= op->sem_op ? SEM_A : SEM_R;
+ if (op->sem_num >= sp->sem_nsems) {
+ error = EFBIG;
+ goto semoperr;
+ }
+ if ((op->sem_flg & SEM_UNDO) && op->sem_op)
+ needundo = 1;
+ }
+ if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
+ goto semoperr;
+
+ if (needundo) {
+ struct sem_undo template;
+
+ template.un_sp = sp;
+ mutex_enter(&pp->p_lock);
+ if (pp->p_semacct)
+ undo = avl_find(pp->p_semacct, &template, NULL);
+ else
+ undo = NULL;
+ mutex_exit(&pp->p_lock);
+ if (undo == NULL) {
+ if (error = sem_undo_alloc(pp, sp, &lock, &template,
+ &undo))
+ goto semoperr;
+
+ /* sem_undo_alloc unlocks the semaphore */
+ if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
+ goto semoperr;
+ }
+ }
+
+check:
+ /*
+ * Loop waiting for the operations to be satisfied atomically.
+ * Actually, do the operations and undo them if a wait is needed
+ * or an error is detected.
+ */
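+	/*
+	 * For example (illustrative): given ops {+1 on sem 0, -1 on
+	 * sem 1} with semaphore 1 currently 0, the +1 is applied first,
+	 * the -1 cannot be, sem_rollback() backs out the +1, and the
+	 * thread sleeps on semaphore 1's semncnt_cv before retrying
+	 * from check.
+	 */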
+ for (i = 0; i < nsops; i++) {
+ op = &uops[i];
+ semp = &sp->sem_base[op->sem_num];
+
+ /*
+ * Raise the semaphore (i.e. sema_v)
+ */
+ if (op->sem_op > 0) {
+ if (op->sem_op + (int)semp->semval > USHRT_MAX ||
+ ((op->sem_flg & SEM_UNDO) &&
+ (error = sem_undo_add(op->sem_op, op->sem_num,
+ undo)))) {
+ if (i)
+ sem_rollback(sp, uops, i, undo);
+ if (error == 0)
+ error = ERANGE;
+ goto semoperr;
+ }
+ semp->semval += op->sem_op;
+ /*
+ * If we are only incrementing the semaphore value
+ * by one on a binary semaphore, we can cv_signal.
+ */
+ if (semp->semncnt) {
+ if (op->sem_op == 1 && sp->sem_binary)
+ cv_signal(&semp->semncnt_cv);
+ else
+ cv_broadcast(&semp->semncnt_cv);
+ }
+ if (semp->semzcnt && !semp->semval)
+ cv_broadcast(&semp->semzcnt_cv);
+ continue;
+ }
+
+ /*
+ * Lower the semaphore (i.e. sema_p)
+ */
+ if (op->sem_op < 0) {
+ if (semp->semval >= (unsigned)(-op->sem_op)) {
+ if ((op->sem_flg & SEM_UNDO) &&
+ (error = sem_undo_add(op->sem_op,
+ op->sem_num, undo))) {
+ if (i)
+ sem_rollback(sp, uops, i, undo);
+ goto semoperr;
+ }
+ semp->semval += op->sem_op;
+ if (semp->semzcnt && !semp->semval)
+ cv_broadcast(&semp->semzcnt_cv);
+ continue;
+ }
+ if (i)
+ sem_rollback(sp, uops, i, undo);
+ if (op->sem_flg & IPC_NOWAIT) {
+ error = EAGAIN;
+ goto semoperr;
+ }
+
+ /*
+ * Mark the semaphore set as not a binary type
+ * if we are decrementing the value by more than 1.
+ *
+ * V operations will resort to cv_broadcast
+ * for this set because there are too many weird
+ * cases that have to be caught.
+ */
+ if (op->sem_op < -1)
+ sp->sem_binary = 0;
+ if (!held) {
+ held = 1;
+ ipc_hold(sem_svc, (kipc_perm_t *)sp);
+ }
+ semp->semncnt++;
+ cvres = cv_waituntil_sig(&semp->semncnt_cv, lock,
+ tsp, timecheck);
+ lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
+
+ if (!IPC_FREE(&sp->sem_perm)) {
+ ASSERT(semp->semncnt != 0);
+ semp->semncnt--;
+ if (cvres > 0) /* normal wakeup */
+ goto check;
+ }
+
+ /* EINTR or EAGAIN overrides EIDRM */
+ if (cvres == 0)
+ error = EINTR;
+ else if (cvres < 0)
+ error = EAGAIN;
+ else
+ error = EIDRM;
+ goto semoperr;
+ }
+
+ /*
+ * Wait for zero value
+ */
+ if (semp->semval) {
+ if (i)
+ sem_rollback(sp, uops, i, undo);
+ if (op->sem_flg & IPC_NOWAIT) {
+ error = EAGAIN;
+ goto semoperr;
+ }
+
+ if (!held) {
+ held = 1;
+ ipc_hold(sem_svc, (kipc_perm_t *)sp);
+ }
+ semp->semzcnt++;
+ cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock,
+ tsp, timecheck);
+ lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
+
+ /*
+ * Don't touch semp if the semaphores have been removed.
+ */
+ if (!IPC_FREE(&sp->sem_perm)) {
+ ASSERT(semp->semzcnt != 0);
+ semp->semzcnt--;
+ if (cvres > 0) /* normal wakeup */
+ goto check;
+ }
+
+ /* EINTR or EAGAIN overrides EIDRM */
+ if (cvres == 0)
+ error = EINTR;
+ else if (cvres < 0)
+ error = EAGAIN;
+ else
+ error = EIDRM;
+ goto semoperr;
+ }
+ }
+
+ /* All operations succeeded. Update sempid for accessed semaphores. */
+ for (i = 0, op = uops; i++ < nsops;
+ sp->sem_base[(op++)->sem_num].sempid = pp->p_pid)
+ ;
+ sp->sem_otime = gethrestime_sec();
+ if (held)
+ ipc_rele(sem_svc, (kipc_perm_t *)sp);
+ else
+ mutex_exit(lock);
+
+ /* Before leaving, deallocate the buffer that held the user semops */
+ if (nsops != 1)
+ kmem_free(uops, sizeof (*uops) * nsops);
+ return (0);
+
+ /*
+ * Error return labels
+ */
+semoperr:
+ if (held)
+ ipc_rele(sem_svc, (kipc_perm_t *)sp);
+ else
+ mutex_exit(lock);
+
+semoperr_unlocked:
+
+ /* Before leaving, deallocate the buffer that held the user semops */
+ if (nsops != 1)
+ kmem_free(uops, sizeof (*uops) * nsops);
+ return (set_errno(error));
+}
+
+/*
+ * semsys - System entry point for semctl, semget, and semop system calls.
+ */
+static int
+semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
+{
+ int error;
+
+ switch (opcode) {
+ case SEMCTL:
+ error = semctl((int)a1, (uint_t)a2, (int)a3, a4);
+ break;
+ case SEMGET:
+ error = semget((key_t)a1, (int)a2, (int)a3);
+ break;
+ case SEMOP:
+ error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0);
+ break;
+ case SEMIDS:
+ error = semids((int *)a1, (uint_t)a2, (uint_t *)a3);
+ break;
+ case SEMTIMEDOP:
+ error = semop((int)a1, (struct sembuf *)a2, (size_t)a3,
+ (timespec_t *)a4);
+ break;
+ default:
+ error = set_errno(EINVAL);
+ break;
+ }
+ return (error);
+}
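+
+/*
+ * Illustrative example (an assumption about the userland wrappers, not
+ * taken from this file): a libc call such as
+ *
+ *	semop(id, ops, nops);
+ *
+ * is expected to enter the kernel as semsys(SEMOP, id, (uintptr_t)ops,
+ * nops), while semtimedop() would use the SEMTIMEDOP opcode to carry
+ * its additional timeout argument.
+ */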
diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c
new file mode 100644
index 0000000000..2f504af827
--- /dev/null
+++ b/usr/src/uts/common/syscall/sendfile.c
@@ -0,0 +1,1186 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/time.h>
+#include <sys/file.h>
+#include <sys/open.h>
+#include <sys/user.h>
+#include <sys/termios.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/esunddi.h>
+#include <sys/flock.h>
+#include <sys/modctl.h>
+#include <sys/cmn_err.h>
+#include <sys/vmsystm.h>
+
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <netinet/in.h>
+#include <sys/sendfile.h>
+#include <sys/un.h>
+#include <inet/nca/ncadoorhdr.h>
+#include <inet/nca/ncaio.h>
+#include <sys/tihdr.h>
+#include <sys/atomic.h>
+
+#include <inet/common.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/tcp.h>
+
+extern int nca_sendfilev(file_t *, struct sendfilevec *, int, ssize_t *);
+extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
+ ssize32_t *);
+extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
+ int);
+
+#define SEND_MAX_CHUNK 16
+
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+/*
+ * 64-bit offsets for 32-bit applications only, running on either a
+ * 64-bit or a 32-bit kernel. For 32-bit apps, we can't transfer
+ * more than 2GB of data.
+ */
+int
+sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv,
+ int copy_cnt, ssize32_t *count)
+{
+ struct vnode *vp;
+ ushort_t fflag;
+ int ioflag;
+ size32_t cnt;
+ ssize32_t sfv_len;
+ ssize32_t tmpcount;
+ u_offset_t sfv_off;
+ struct uio auio;
+ struct iovec aiov;
+ int i, error;
+
+ fflag = fp->f_flag;
+ vp = fp->f_vnode;
+ for (i = 0; i < copy_cnt; i++) {
+
+ if (ISSIG(curthread, JUSTLOOKING))
+ return (EINTR);
+
+ /*
+ * Do similar checks as "write" as we are writing
+ * sfv_len bytes into "vp".
+ */
+ sfv_len = (ssize32_t)sfv->sfv_len;
+
+ if (sfv_len == 0)
+ continue;
+
+ if (sfv_len < 0)
+ return (EINVAL);
+
+ if (vp->v_type == VREG) {
+ if (*fileoff >= curproc->p_fsz_ctl) {
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_action(
+ rctlproc_legacy[RLIMIT_FSIZE],
+ curproc->p_rctls, curproc, RCA_SAFE);
+ mutex_exit(&curproc->p_lock);
+ return (EFBIG);
+ }
+
+ if (*fileoff >= OFFSET_MAX(fp))
+ return (EFBIG);
+
+ if (*fileoff + sfv_len > OFFSET_MAX(fp))
+ return (EINVAL);
+ }
+
+ tmpcount = *count + sfv_len;
+ if (tmpcount < 0)
+ return (EINVAL);
+
+ sfv_off = sfv->sfv_off;
+
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+ if (sfv->sfv_fd == SFV_FD_SELF) {
+ aiov.iov_len = sfv_len;
+ aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
+ auio.uio_loffset = *fileoff;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = sfv_len;
+ auio.uio_iov = &aiov;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ ioflag = auio.uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+ while (sfv_len > 0) {
+ error = VOP_WRITE(vp, &auio, ioflag,
+ fp->f_cred, NULL);
+ cnt = sfv_len - auio.uio_resid;
+ sfv_len -= cnt;
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
+ if (vp->v_type == VREG)
+ *fileoff += cnt;
+ *count += cnt;
+ if (error != 0)
+ return (error);
+ }
+ } else {
+ file_t *ffp;
+ vnode_t *readvp;
+ int readflg = 0;
+ size_t size;
+ caddr_t ptr;
+
+ if ((ffp = getf(sfv->sfv_fd)) == NULL)
+ return (EBADF);
+
+ if ((ffp->f_flag & FREAD) == 0) {
+ releasef(sfv->sfv_fd);
+ return (EBADF);
+ }
+
+ readvp = ffp->f_vnode;
+ if (readvp->v_type != VREG) {
+ releasef(sfv->sfv_fd);
+ return (EINVAL);
+ }
+
+ /*
+ * No point reading and writing to same vp,
+ * as long as both are regular files. readvp is not
+ * locked; but since we got it from an open file the
+ * contents will be valid during the time of access.
+ */
+ if (VN_CMP(vp, readvp)) {
+ releasef(sfv->sfv_fd);
+ return (EINVAL);
+ }
+
+ /*
+ * Note: we assume readvp != vp. "vp" is already
+ * locked, and "readvp" must not be.
+ */
+ (void) VOP_RWLOCK(readvp, readflg, NULL);
+
+ /*
+ * Same checks as in pread64.
+ */
+ if (sfv_off > MAXOFFSET_T) {
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ return (EINVAL);
+ }
+
+ if (sfv_off + sfv_len > MAXOFFSET_T)
+ sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
+
+ /* Find the native blocksize to transfer data */
+ size = MIN(vp->v_vfsp->vfs_bsize,
+ readvp->v_vfsp->vfs_bsize);
+ size = sfv_len < size ? sfv_len : size;
+ ptr = kmem_alloc(size, KM_SLEEP);
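+
+			/*
+			 * For example (illustrative): with an 8K-block
+			 * source filesystem and a 4K-block destination,
+			 * size is 4K (further clamped to sfv_len), so
+			 * the loop below shuttles at most 4K per
+			 * VOP_READ/VOP_WRITE pair.
+			 */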
+
+ while (sfv_len > 0) {
+ size_t iov_len;
+
+ iov_len = MIN(size, sfv_len);
+ aiov.iov_base = ptr;
+ aiov.iov_len = iov_len;
+ auio.uio_loffset = sfv_off;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = iov_len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = ffp->f_flag;
+ ioflag = auio.uio_fmode &
+ (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ /*
+ * If read sync is not asked for,
+ * filter sync flags
+ */
+ if ((ioflag & FRSYNC) == 0)
+ ioflag &= ~(FSYNC|FDSYNC);
+ error = VOP_READ(readvp, &auio, ioflag,
+ fp->f_cred, NULL);
+ if (error) {
+ kmem_free(ptr, size);
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ return (error);
+ }
+
+ /*
+				 * Check how much data was really read.
+ * Decrement the 'len' and increment the
+ * 'off' appropriately.
+ */
+ cnt = iov_len - auio.uio_resid;
+ if (cnt == 0) {
+ /*
+ * If we were reading a pipe (currently
+ * not implemented), we may now lose
+ * data.
+ */
+ kmem_free(ptr, size);
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ return (EINVAL);
+ }
+ sfv_len -= cnt;
+ sfv_off += cnt;
+
+ aiov.iov_base = ptr;
+ aiov.iov_len = cnt;
+ auio.uio_loffset = *fileoff;
+ auio.uio_resid = cnt;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ ioflag = auio.uio_fmode &
+ (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+ error = VOP_WRITE(vp, &auio, ioflag,
+ fp->f_cred, NULL);
+
+ /*
+ * Check how much data was written. Increment
+ * the 'len' and decrement the 'off' if all
+ * the data was not written.
+ */
+ cnt -= auio.uio_resid;
+ sfv_len += auio.uio_resid;
+ sfv_off -= auio.uio_resid;
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)cnt;
+ if (vp->v_type == VREG)
+ *fileoff += cnt;
+ *count += cnt;
+ if (error != 0) {
+ kmem_free(ptr, size);
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ return (error);
+ }
+ }
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ kmem_free(ptr, size);
+ }
+ sfv++;
+ }
+ return (0);
+}
+
+ssize32_t
+sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt,
+ size32_t *xferred, int fildes)
+{
+ int rwflag;
+ u_offset_t fileoff;
+ int copy_cnt;
+ const struct ksendfilevec64 *copy_vec;
+ struct ksendfilevec64 sfv[SEND_MAX_CHUNK];
+ struct vnode *vp;
+ int error;
+ ssize32_t count = 0;
+ int osfvcnt;
+
+ rwflag = 1;
+ vp = fp->f_vnode;
+ (void) VOP_RWLOCK(vp, rwflag, NULL);
+
+ copy_vec = vec;
+ fileoff = fp->f_offset;
+ osfvcnt = sfvcnt;
+
+ do {
+ copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
+ if (copyin(copy_vec, sfv, copy_cnt *
+ sizeof (struct ksendfilevec64))) {
+ error = EFAULT;
+ break;
+ }
+
+ /*
+		 * Optimize the common case of sending a single
+		 * regular file over a socket.
+ */
+ if (vp->v_type == VSOCK && osfvcnt == 1 &&
+ sfv->sfv_fd != SFV_FD_SELF) {
+ file_t *rfp;
+ vnode_t *rvp;
+
+ if ((rfp = getf(sfv->sfv_fd)) == NULL) {
+ error = EBADF;
+ break;
+ }
+ if ((rfp->f_flag & FREAD) == 0) {
+ releasef(sfv->sfv_fd);
+ error = EBADF;
+ break;
+ }
+ rvp = rfp->f_vnode;
+ if (rvp->v_type == VREG) {
+ error = sosendfile64(fp, rfp, sfv, &count);
+ break;
+ }
+ releasef(sfv->sfv_fd);
+ }
+ error = sendvec_chunk64(fp, &fileoff, sfv, copy_cnt, &count);
+ if (error != 0)
+ break;
+
+ copy_vec += copy_cnt;
+ sfvcnt -= copy_cnt;
+ } while (sfvcnt > 0);
+
+ if (vp->v_type == VREG)
+ fp->f_offset += count;
+
+ VOP_RWUNLOCK(vp, rwflag, NULL);
+ if (copyout(&count, xferred, sizeof (count)))
+ error = EFAULT;
+ releasef(fildes);
+ if (error != 0)
+ return (set_errno(error));
+ return (count);
+}
+#endif
+
+int
+sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
+ int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count)
+{
+ struct vnode *vp;
+ struct uio auio;
+ struct iovec aiov;
+ ushort_t fflag;
+ int ioflag;
+ int i, error;
+ size_t cnt;
+ ssize_t sfv_len;
+ u_offset_t sfv_off;
+#ifdef _SYSCALL32_IMPL
+ model_t model = get_udatamodel();
+ u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
+ MAXOFF32_T : MAXOFFSET_T;
+#else
+ const u_offset_t maxoff = MAXOFF32_T;
+#endif
+ mblk_t *dmp = NULL;
+ int wroff;
+ int buf_left = 0;
+ size_t iov_len;
+ mblk_t *head, *tmp;
+ size_t size = total_size;
+
+ fflag = fp->f_flag;
+ vp = fp->f_vnode;
+
+ ASSERT(vp->v_type == VSOCK);
+ ASSERT(maxblk > 0);
+
+ wroff = (int)vp->v_stream->sd_wroff;
+ buf_left = MIN(total_size, maxblk);
+ head = dmp = allocb(buf_left + wroff, BPRI_HI);
+ if (head == NULL)
+ return (ENOMEM);
+ head->b_wptr = head->b_rptr = head->b_rptr + wroff;
+
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+ for (i = 0; i < copy_cnt; i++) {
+		if (ISSIG(curthread, JUSTLOOKING)) {
+			freemsg(head);	/* don't leak the partial chain */
+			return (EINTR);
+		}
+
+ /*
+ * Do similar checks as "write" as we are writing
+ * sfv_len bytes into "vp".
+ */
+ sfv_len = (ssize_t)sfv->sfv_len;
+
+ if (sfv_len == 0) {
+ sfv++;
+ continue;
+ }
+
+ /* Make sure sfv_len is not negative */
+#ifdef _SYSCALL32_IMPL
+		if (model == DATAMODEL_ILP32) {
+			if ((ssize32_t)sfv_len < 0) {
+				freemsg(head);
+				return (EINVAL);
+			}
+		} else
+#endif
+			if (sfv_len < 0) {
+				freemsg(head);
+				return (EINVAL);
+			}
+
+		/* Check for overflow */
+#ifdef _SYSCALL32_IMPL
+		if (model == DATAMODEL_ILP32) {
+			if (((ssize32_t)(*count + sfv_len)) < 0) {
+				freemsg(head);
+				return (EINVAL);
+			}
+		} else
+#endif
+			if ((*count + sfv_len) < 0) {
+				freemsg(head);
+				return (EINVAL);
+			}
+
+ sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
+
+ if (sfv->sfv_fd == SFV_FD_SELF) {
+ while (sfv_len > 0) {
+ if (buf_left == 0) {
+ tmp = dmp;
+ buf_left = MIN(total_size, maxblk);
+ iov_len = MIN(buf_left, sfv_len);
+ dmp = allocb(buf_left + wroff, BPRI_HI);
+ if (dmp == NULL) {
+ freemsg(head);
+ return (ENOMEM);
+ }
+ dmp->b_wptr = dmp->b_rptr =
+ dmp->b_rptr + wroff;
+ tmp->b_cont = dmp;
+ } else {
+ iov_len = MIN(buf_left, sfv_len);
+ }
+
+ aiov.iov_len = iov_len;
+ aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
+ auio.uio_loffset = *fileoff;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = iov_len;
+ auio.uio_iov = &aiov;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+
+ buf_left -= iov_len;
+ total_size -= iov_len;
+ sfv_len -= iov_len;
+ sfv_off += iov_len;
+
+ error = uiomove((caddr_t)dmp->b_wptr,
+ iov_len, UIO_WRITE, &auio);
+ if (error != 0) {
+ freemsg(head);
+ return (error);
+ }
+ dmp->b_wptr += iov_len;
+ }
+ } else {
+ file_t *ffp;
+ vnode_t *readvp;
+ int readflg = 0;
+
+ if ((ffp = getf(sfv->sfv_fd)) == NULL) {
+ freemsg(head);
+ return (EBADF);
+ }
+
+ if ((ffp->f_flag & FREAD) == 0) {
+ releasef(sfv->sfv_fd);
+ freemsg(head);
+ return (EACCES);
+ }
+
+ readvp = ffp->f_vnode;
+ if (readvp->v_type != VREG) {
+ releasef(sfv->sfv_fd);
+ freemsg(head);
+ return (EINVAL);
+ }
+
+ /*
+ * No point reading and writing to same vp,
+ * as long as both are regular files. readvp is not
+ * locked; but since we got it from an open file the
+ * contents will be valid during the time of access.
+ */
+
+ if (VN_CMP(vp, readvp)) {
+ releasef(sfv->sfv_fd);
+ freemsg(head);
+ return (EINVAL);
+ }
+
+ /*
+ * Note: we assume readvp != vp. "vp" is already
+ * locked, and "readvp" must not be.
+ */
+
+ (void) VOP_RWLOCK(readvp, readflg, NULL);
+
+ /* Same checks as in pread */
+ if (sfv_off > maxoff) {
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ freemsg(head);
+ return (EINVAL);
+ }
+ if (sfv_off + sfv_len > maxoff) {
+ sfv_len = (ssize_t)((offset_t)maxoff -
+ sfv_off);
+ }
+
+ while (sfv_len > 0) {
+ if (buf_left == 0) {
+ tmp = dmp;
+ buf_left = MIN(total_size, maxblk);
+ iov_len = MIN(buf_left, sfv_len);
+ dmp = allocb(buf_left + wroff, BPRI_HI);
+ if (dmp == NULL) {
+ VOP_RWUNLOCK(readvp, readflg,
+ NULL);
+ releasef(sfv->sfv_fd);
+ freemsg(head);
+ return (ENOMEM);
+ }
+ dmp->b_wptr = dmp->b_rptr =
+ dmp->b_rptr + wroff;
+ tmp->b_cont = dmp;
+ } else {
+ iov_len = MIN(buf_left, sfv_len);
+ }
+ aiov.iov_base = (caddr_t)dmp->b_wptr;
+ aiov.iov_len = iov_len;
+ auio.uio_loffset = sfv_off;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = iov_len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = ffp->f_flag;
+ ioflag = auio.uio_fmode &
+ (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ /*
+ * If read sync is not asked for,
+ * filter sync flags
+ */
+ if ((ioflag & FRSYNC) == 0)
+ ioflag &= ~(FSYNC|FDSYNC);
+ error = VOP_READ(readvp, &auio, ioflag,
+ fp->f_cred, NULL);
+ if (error != 0) {
+ /*
+ * If we were reading a pipe (currently
+					 * not implemented), we may now lose
+ * data.
+ */
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ freemsg(head);
+ return (error);
+ }
+
+ /*
+ * Check how much data was really read.
+ * Decrement the 'len' and increment the
+ * 'off' appropriately.
+ */
+ cnt = iov_len - auio.uio_resid;
+ if (cnt == 0) {
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ freemsg(head);
+ return (EINVAL);
+ }
+ sfv_len -= cnt;
+ sfv_off += cnt;
+ total_size -= cnt;
+ buf_left -= cnt;
+
+ dmp->b_wptr += cnt;
+ }
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ }
+ sfv++;
+ }
+
+ ASSERT(total_size == 0);
+ error = kstrwritemp(vp, head, fflag);
+ if (error != 0) {
+ freemsg(head);
+ return (error);
+ }
+ ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
+ *count += size;
+
+ return (0);
+}
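+
+/*
+ * For example (illustrative): sending 5000 bytes over a stream whose
+ * maxblk is 1460 builds a chain of four mblks through the b_cont links
+ * above (1460 + 1460 + 1460 + 620 bytes), which kstrwritemp() then
+ * hands to the transport in a single call.
+ */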
+
+int
+sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
+ int copy_cnt, ssize_t *count)
+{
+ struct vnode *vp;
+ struct uio auio;
+ struct iovec aiov;
+ ushort_t fflag;
+ int ioflag;
+ int i, error;
+ size_t cnt;
+ ssize_t sfv_len;
+ u_offset_t sfv_off;
+#ifdef _SYSCALL32_IMPL
+ model_t model = get_udatamodel();
+ u_offset_t maxoff = (model == DATAMODEL_ILP32) ?
+ MAXOFF32_T : MAXOFFSET_T;
+#else
+ const u_offset_t maxoff = MAXOFF32_T;
+#endif
+ mblk_t *dmp;
+
+ fflag = fp->f_flag;
+ vp = fp->f_vnode;
+
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+ for (i = 0; i < copy_cnt; i++) {
+ if (ISSIG(curthread, JUSTLOOKING))
+ return (EINTR);
+
+ /*
+		 * Do the same checks as "write", since we are writing
+ * sfv_len bytes into "vp".
+ */
+ sfv_len = (ssize_t)sfv->sfv_len;
+
+ if (sfv_len == 0) {
+ sfv++;
+ continue;
+ }
+
+ /* Make sure sfv_len is not negative */
+#ifdef _SYSCALL32_IMPL
+ if (model == DATAMODEL_ILP32) {
+ if ((ssize32_t)sfv_len < 0)
+ return (EINVAL);
+ } else
+#endif
+ if (sfv_len < 0)
+ return (EINVAL);
+
+ if (vp->v_type == VREG) {
+ if (*fileoff >= curproc->p_fsz_ctl) {
+ mutex_enter(&curproc->p_lock);
+ (void) rctl_action(
+ rctlproc_legacy[RLIMIT_FSIZE],
+ curproc->p_rctls, curproc, RCA_SAFE);
+ mutex_exit(&curproc->p_lock);
+
+ return (EFBIG);
+ }
+
+ if (*fileoff >= maxoff)
+ return (EFBIG);
+
+ if (*fileoff + sfv_len > maxoff)
+ return (EINVAL);
+ }
+
+ /* Check for overflow */
+#ifdef _SYSCALL32_IMPL
+ if (model == DATAMODEL_ILP32) {
+ if (((ssize32_t)(*count + sfv_len)) < 0)
+ return (EINVAL);
+ } else
+#endif
+ if ((*count + sfv_len) < 0)
+ return (EINVAL);
+
+ sfv_off = (u_offset_t)(ulong_t)sfv->sfv_off;
+
+ if (sfv->sfv_fd == SFV_FD_SELF) {
+ aiov.iov_len = sfv_len;
+ aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
+ auio.uio_loffset = *fileoff;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = sfv_len;
+ auio.uio_iov = &aiov;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+
+ if (vp->v_type == VSOCK) {
+
+ /*
+ * Optimize for the socket case
+ */
+ int wroff = (int)vp->v_stream->sd_wroff;
+
+ dmp = allocb(sfv_len + wroff, BPRI_HI);
+ if (dmp == NULL)
+ return (ENOMEM);
+ dmp->b_wptr = dmp->b_rptr = dmp->b_rptr + wroff;
+ error = uiomove((caddr_t)dmp->b_wptr,
+ sfv_len, UIO_WRITE, &auio);
+ if (error != 0) {
+ freeb(dmp);
+ return (error);
+ }
+ dmp->b_wptr += sfv_len;
+ error = kstrwritemp(vp, dmp, fflag);
+ if (error != 0) {
+ freeb(dmp);
+ return (error);
+ }
+ ttolwp(curthread)->lwp_ru.ioch +=
+ (ulong_t)sfv_len;
+ *count += sfv_len;
+ } else {
+ ioflag = auio.uio_fmode &
+ (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+ while (sfv_len > 0) {
+ error = VOP_WRITE(vp, &auio, ioflag,
+ fp->f_cred, NULL);
+ cnt = sfv_len - auio.uio_resid;
+ sfv_len -= cnt;
+ ttolwp(curthread)->lwp_ru.ioch +=
+ (ulong_t)cnt;
+ *fileoff += cnt;
+ *count += cnt;
+ if (error != 0)
+ return (error);
+ }
+ }
+ } else {
+ file_t *ffp;
+ vnode_t *readvp;
+ int readflg = 0;
+ size_t size;
+ caddr_t ptr;
+
+ if ((ffp = getf(sfv->sfv_fd)) == NULL)
+ return (EBADF);
+
+ if ((ffp->f_flag & FREAD) == 0) {
+ releasef(sfv->sfv_fd);
+ return (EBADF);
+ }
+
+ readvp = ffp->f_vnode;
+ if (readvp->v_type != VREG) {
+ releasef(sfv->sfv_fd);
+ return (EINVAL);
+ }
+
+			/*
+			 * There is no point in reading from and writing to
+			 * the same vp when both are regular files, so we
+			 * disallow it. readvp is not locked; but since we
+			 * got it from an open file the contents will be
+			 * valid during the time of access.
+			 */
+ if (VN_CMP(vp, readvp)) {
+ releasef(sfv->sfv_fd);
+ return (EINVAL);
+ }
+
+ /*
+ * Note: we assume readvp != vp. "vp" is already
+ * locked, and "readvp" must not be.
+ */
+ (void) VOP_RWLOCK(readvp, readflg, NULL);
+
+ /* Same checks as in pread */
+ if (sfv_off > maxoff) {
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ return (EINVAL);
+ }
+ if (sfv_off + sfv_len > maxoff) {
+ sfv_len = (ssize_t)((offset_t)maxoff -
+ sfv_off);
+ }
+ /* Find the native blocksize to transfer data */
+ size = MIN(vp->v_vfsp->vfs_bsize,
+ readvp->v_vfsp->vfs_bsize);
+ size = sfv_len < size ? sfv_len : size;
+
+ while (sfv_len > 0) {
+ size_t iov_len;
+
+ iov_len = MIN(size, sfv_len);
+
+ dmp = allocb(iov_len, BPRI_HI);
+ if (dmp == NULL) {
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ return (ENOMEM);
+ }
+ ptr = (caddr_t)dmp->b_rptr;
+
+ aiov.iov_base = ptr;
+ aiov.iov_len = iov_len;
+ auio.uio_loffset = sfv_off;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = iov_len;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_llimit = MAXOFFSET_T;
+ auio.uio_fmode = ffp->f_flag;
+ ioflag = auio.uio_fmode &
+ (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+
+ /*
+ * If read sync is not asked for,
+ * filter sync flags
+ */
+ if ((ioflag & FRSYNC) == 0)
+ ioflag &= ~(FSYNC|FDSYNC);
+ error = VOP_READ(readvp, &auio, ioflag,
+ fp->f_cred, NULL);
+ if (error != 0) {
+ /*
+ * If we were reading a pipe (currently
+ * not implemented), we may now lose
+ * data.
+ */
+ freeb(dmp);
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ return (error);
+ }
+
+ /*
+ * Check how much data was really read.
+ * Decrement the 'len' and increment the
+ * 'off' appropriately.
+ */
+ cnt = iov_len - auio.uio_resid;
+ if (cnt == 0) {
+ freeb(dmp);
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ return (EINVAL);
+ }
+ sfv_len -= cnt;
+ sfv_off += cnt;
+
+ if (vp->v_type == VSOCK) {
+ dmp->b_wptr = dmp->b_rptr + cnt;
+
+ error = kstrwritemp(vp, dmp, fflag);
+ if (error != 0) {
+ freeb(dmp);
+ VOP_RWUNLOCK(readvp, readflg,
+ NULL);
+ releasef(sfv->sfv_fd);
+ return (error);
+ }
+
+ ttolwp(curthread)->lwp_ru.ioch +=
+ (ulong_t)cnt;
+ *count += cnt;
+ } else {
+
+ aiov.iov_base = ptr;
+ aiov.iov_len = cnt;
+ auio.uio_loffset = *fileoff;
+ auio.uio_resid = cnt;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_llimit = curproc->p_fsz_ctl;
+ auio.uio_fmode = fflag;
+ ioflag = auio.uio_fmode &
+ (FAPPEND|FSYNC|FDSYNC|FRSYNC);
+ error = VOP_WRITE(vp, &auio, ioflag,
+ fp->f_cred, NULL);
+
+ /*
+ * Check how much data was written.
+ * Increment the 'len' and decrement the
+ * 'off' if all the data was not
+ * written.
+ */
+ cnt -= auio.uio_resid;
+ sfv_len += auio.uio_resid;
+ sfv_off -= auio.uio_resid;
+ ttolwp(curthread)->lwp_ru.ioch +=
+ (ulong_t)cnt;
+ *fileoff += cnt;
+ *count += cnt;
+ freeb(dmp);
+ if (error != 0) {
+ VOP_RWUNLOCK(readvp, readflg,
+ NULL);
+ releasef(sfv->sfv_fd);
+ return (error);
+ }
+ }
+ }
+ VOP_RWUNLOCK(readvp, readflg, NULL);
+ releasef(sfv->sfv_fd);
+ }
+ sfv++;
+ }
+ return (0);
+}
+
+ssize_t
+sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
+ size_t *xferred)
+{
+ int error;
+ file_t *fp;
+ struct vnode *vp;
+ struct sonode *so;
+ u_offset_t fileoff;
+ int copy_cnt;
+ const struct sendfilevec *copy_vec;
+ struct sendfilevec sfv[SEND_MAX_CHUNK];
+ ssize_t count = 0;
+#ifdef _SYSCALL32_IMPL
+ struct ksendfilevec32 sfv32[SEND_MAX_CHUNK];
+#endif
+ ssize_t total_size = 0;
+ int i;
+ boolean_t is_sock = B_FALSE;
+ int maxblk = 0;
+
+ if (sfvcnt <= 0)
+ return (set_errno(EINVAL));
+
+ if ((fp = getf(fildes)) == NULL)
+ return (set_errno(EBADF));
+
+ if (((fp->f_flag) & FWRITE) == 0) {
+ error = EBADF;
+ goto err;
+ }
+
+ fileoff = fp->f_offset;
+ vp = fp->f_vnode;
+
+ switch (vp->v_type) {
+ case VSOCK:
+ so = VTOSO(vp);
+ /* sendfile not supported for SCTP */
+ if (so->so_protocol == IPPROTO_SCTP) {
+ error = EPROTONOSUPPORT;
+ goto err;
+ }
+ is_sock = B_TRUE;
+ switch (so->so_family) {
+ case AF_NCA:
+ case AF_INET:
+ case AF_INET6:
+ /*
+			 * Make the same checks as are done in SOP_WRITE().
+ */
+ if (so->so_state & SS_CANTSENDMORE) {
+ tsignal(curthread, SIGPIPE);
+ error = EPIPE;
+ goto err;
+ }
+ if (so->so_type != SOCK_STREAM) {
+ error = EOPNOTSUPP;
+ goto err;
+ }
+
+ if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
+ (SS_ISCONNECTED|SS_ISBOUND)) {
+ error = ENOTCONN;
+ goto err;
+ }
+
+ if ((so->so_state & SS_TCP_FAST_ACCEPT) &&
+ (so->so_priv != NULL)) {
+ maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
+ } else {
+ maxblk = (int)vp->v_stream->sd_maxblk;
+ }
+ break;
+ default:
+ error = EAFNOSUPPORT;
+ goto err;
+ }
+ break;
+ case VREG:
+ break;
+ default:
+ error = EINVAL;
+ goto err;
+ }
+
+ switch (opcode) {
+ case SENDFILEV :
+ break;
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+ case SENDFILEV64 :
+ return (sendvec64(fp, (struct ksendfilevec64 *)vec, sfvcnt,
+ (size32_t *)xferred, fildes));
+#endif
+ default :
+ error = ENOSYS;
+		goto err;
+ }
+
+ (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
+ copy_vec = vec;
+
+ do {
+ copy_cnt = MIN(sfvcnt, SEND_MAX_CHUNK);
+#ifdef _SYSCALL32_IMPL
+ /* 32-bit callers need to have their iovec expanded. */
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ if (copyin(copy_vec, sfv32,
+ copy_cnt * sizeof (ksendfilevec32_t))) {
+ error = EFAULT;
+ break;
+ }
+
+ for (i = 0; i < copy_cnt; i++) {
+ sfv[i].sfv_fd = sfv32[i].sfv_fd;
+ sfv[i].sfv_off =
+ (off_t)(uint32_t)sfv32[i].sfv_off;
+ sfv[i].sfv_len = (size_t)sfv32[i].sfv_len;
+ total_size += sfv[i].sfv_len;
+ sfv[i].sfv_flag = sfv32[i].sfv_flag;
+ }
+ } else {
+#endif
+ if (copyin(copy_vec, sfv,
+ copy_cnt * sizeof (sendfilevec_t))) {
+ error = EFAULT;
+ break;
+ }
+
+ for (i = 0; i < copy_cnt; i++) {
+ total_size += sfv[i].sfv_len;
+ }
+#ifdef _SYSCALL32_IMPL
+ }
+#endif
+
+		/*
+		 * The choice between sendvec_small_chunk and
+		 * sendvec_chunk depends on multiple things:
+		 *
+		 * i) latency is important for smaller files. So if the
+		 * data is smaller than 'tcp_slow_start_initial' times
+		 * maxblk, then use sendvec_small_chunk which creates
+		 * maxblk size mblks, chains them together and sends
+		 * them to TCP in one shot. It also leaves 'wroff' size
+		 * space for the headers in each mblk.
+		 *
+		 * ii) for a total size bigger than 'tcp_slow_start_initial'
+		 * times maxblk, it is probably real file data which is
+		 * dominating. So it is better to use sendvec_chunk because
+		 * performance suffers badly if we don't do pagesize reads.
+		 * sendvec_chunk will do pagesize reads and write them
+		 * in pagesize mblks to TCP.
+		 *
+		 * Side Notes: A write to file has not been optimized.
+		 * Future zero copy code will plug into sendvec_chunk
+		 * only, because doing zero copy for files smaller than
+		 * pagesize is useless.
+		 *
+		 * Note, if the socket has NL7C enabled then call NL7C's
+		 * sendfilev() function to give NL7C a chance to copy
+		 * the vec for caching, then continue processing as
+		 * normal.
+		 */
+ if (is_sock) {
+ switch (so->so_family) {
+ case AF_INET:
+ case AF_INET6:
+ if (so->so_nl7c_flags != 0) {
+ nl7c_sendfilev(so, fileoff,
+ sfv, copy_cnt);
+ }
+ if (total_size <= (4 * maxblk))
+ error = sendvec_small_chunk(fp,
+ &fileoff, sfv, copy_cnt,
+ total_size, maxblk, &count);
+ else
+ error = sendvec_chunk(fp, &fileoff,
+ sfv, copy_cnt, &count);
+ break;
+ case AF_NCA:
+ error = nca_sendfilev(fp, sfv, copy_cnt,
+ &count);
+ break;
+ }
+ } else {
+ ASSERT(vp->v_type == VREG);
+ error = sendvec_chunk(fp, &fileoff, sfv, copy_cnt,
+ &count);
+ }
+
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() == DATAMODEL_ILP32)
+ copy_vec = (const struct sendfilevec *)((char *)copy_vec +
+ (copy_cnt * sizeof (ksendfilevec32_t)));
+ else
+#endif
+ copy_vec += copy_cnt;
+ sfvcnt -= copy_cnt;
+ } while (sfvcnt > 0);
+
+ if (vp->v_type == VREG)
+ fp->f_offset += count;
+
+ VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
+
+#ifdef _SYSCALL32_IMPL
+ if (get_udatamodel() == DATAMODEL_ILP32) {
+ ssize32_t count32 = (ssize32_t)count;
+ if (copyout(&count32, xferred, sizeof (count32)))
+ error = EFAULT;
+ releasef(fildes);
+ if (error != 0)
+ return (set_errno(error));
+ return (count32);
+ }
+#endif
+ if (copyout(&count, xferred, sizeof (count)))
+ error = EFAULT;
+ releasef(fildes);
+ if (error != 0)
+ return (set_errno(error));
+ return (count);
+err:
+ ASSERT(error != 0);
+ releasef(fildes);
+ return (set_errno(error));
+}
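For illustration only, not part of this change: a userland-style sketch of
the chunking decision described in the comment above. choose_strategy()
and the enum values are hypothetical names; the 4x multiplier mirrors the
hardcoded threshold in sendfilev().

#include <stdio.h>

enum strategy { SMALL_CHUNK, LARGE_CHUNK };

static enum strategy
choose_strategy(long total_size, int maxblk)
{
	/* latency wins for short transfers: chain maxblk-size mblks */
	if (total_size <= 4L * maxblk)
		return (SMALL_CHUNK);
	/* throughput wins for large transfers: pagesize reads */
	return (LARGE_CHUNK);
}

int
main(void)
{
	int maxblk = 1460;	/* e.g. a typical TCP MSS */

	printf("4KB -> %s\n", choose_strategy(4096, maxblk) == SMALL_CHUNK ?
	    "sendvec_small_chunk" : "sendvec_chunk");
	printf("1MB -> %s\n", choose_strategy(1L << 20, maxblk) == SMALL_CHUNK ?
	    "sendvec_small_chunk" : "sendvec_chunk");
	return (0);
}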
diff --git a/usr/src/uts/common/syscall/sigaction.c b/usr/src/uts/common/syscall/sigaction.c
new file mode 100644
index 0000000000..8a38de67b8
--- /dev/null
+++ b/usr/src/uts/common/syscall/sigaction.c
@@ -0,0 +1,231 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/fault.h>
+#include <sys/signal.h>
+#include <sys/siginfo.h>
+#include <sys/debug.h>
+
+int
+sigaction(int sig, struct sigaction *actp, struct sigaction *oactp)
+{
+ struct sigaction act;
+ struct sigaction oact;
+ k_sigset_t set;
+ proc_t *p;
+ int sigcld_look = 0;
+
+ if (sig <= 0 || sig >= NSIG ||
+ (actp != NULL && sigismember(&cantmask, sig)))
+ return (set_errno(EINVAL));
+
+ /*
+ * act and oact might be the same address, so copyin act first.
+ */
+ if (actp) {
+#if defined(__sparc)
+ void (*handler)();
+#endif
+ if (copyin(actp, &act, sizeof (act)))
+ return (set_errno(EFAULT));
+#if defined(__sparc)
+ /*
+ * Check alignment of handler
+ */
+ handler = act.sa_handler;
+ if (handler != SIG_IGN && handler != SIG_DFL &&
+ ((uintptr_t)handler & 0x3) != 0)
+ return (set_errno(EINVAL));
+#endif
+ }
+
+ p = curproc;
+ mutex_enter(&p->p_lock);
+
+ if (oactp) {
+ int flags;
+ void (*disp)();
+
+ disp = u.u_signal[sig - 1];
+
+ flags = 0;
+ if (disp != SIG_DFL && disp != SIG_IGN) {
+ set = u.u_sigmask[sig-1];
+ if (sigismember(&p->p_siginfo, sig))
+ flags |= SA_SIGINFO;
+ if (sigismember(&u.u_sigrestart, sig))
+ flags |= SA_RESTART;
+ if (sigismember(&u.u_sigonstack, sig))
+ flags |= SA_ONSTACK;
+ if (sigismember(&u.u_sigresethand, sig))
+ flags |= SA_RESETHAND;
+ if (sigismember(&u.u_signodefer, sig))
+ flags |= SA_NODEFER;
+ } else
+ sigemptyset(&set);
+
+ if (sig == SIGCLD) {
+ if (p->p_flag & SNOWAIT)
+ flags |= SA_NOCLDWAIT;
+ if (!(p->p_flag & SJCTL))
+ flags |= SA_NOCLDSTOP;
+ }
+
+ oact.sa_handler = disp;
+ oact.sa_flags = flags;
+ sigktou(&set, &oact.sa_mask);
+ }
+
+ if (actp) {
+ if (sig == SIGCLD &&
+ act.sa_handler != SIG_IGN &&
+ act.sa_handler != SIG_DFL)
+ sigcld_look = 1;
+
+ sigutok(&act.sa_mask, &set);
+ setsigact(sig, act.sa_handler, set, act.sa_flags);
+ }
+
+ mutex_exit(&p->p_lock);
+
+ if (sigcld_look)
+ sigcld_repost();
+
+ if (oactp &&
+ copyout(&oact, oactp, sizeof (oact)))
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+
+#ifdef _SYSCALL32_IMPL
+
+int
+sigaction32(int sig, struct sigaction32 *actp, struct sigaction32 *oactp)
+{
+ struct sigaction32 act32;
+ struct sigaction32 oact32;
+ k_sigset_t set;
+ proc_t *p;
+ int sigcld_look = 0;
+
+ if (sig <= 0 || sig >= NSIG ||
+ (actp != NULL && sigismember(&cantmask, sig)))
+ return (set_errno(EINVAL));
+
+ /*
+ * act and oact might be the same address, so copyin act first.
+ */
+ if (actp) {
+#if defined(__sparc)
+ void (*handler)();
+#endif
+ if (copyin(actp, &act32, sizeof (act32)))
+ return (set_errno(EFAULT));
+#if defined(__sparc)
+ /*
+ * Check alignment of handler
+ */
+ handler = (void (*)())act32.sa_handler;
+ if (handler != SIG_IGN && handler != SIG_DFL &&
+ ((uintptr_t)handler & 0x3) != 0)
+ return (set_errno(EINVAL));
+#endif
+ }
+
+ p = curproc;
+ mutex_enter(&p->p_lock);
+
+ if (oactp) {
+ int flags;
+ void (*disp)();
+
+ disp = u.u_signal[sig - 1];
+
+ flags = 0;
+ if (disp != SIG_DFL && disp != SIG_IGN) {
+ set = u.u_sigmask[sig-1];
+ if (sigismember(&p->p_siginfo, sig))
+ flags |= SA_SIGINFO;
+ if (sigismember(&u.u_sigrestart, sig))
+ flags |= SA_RESTART;
+ if (sigismember(&u.u_sigonstack, sig))
+ flags |= SA_ONSTACK;
+ if (sigismember(&u.u_sigresethand, sig))
+ flags |= SA_RESETHAND;
+ if (sigismember(&u.u_signodefer, sig))
+ flags |= SA_NODEFER;
+ } else
+ sigemptyset(&set);
+
+ if (sig == SIGCLD) {
+ if (p->p_flag & SNOWAIT)
+ flags |= SA_NOCLDWAIT;
+ if (!(p->p_flag & SJCTL))
+ flags |= SA_NOCLDSTOP;
+ }
+
+ oact32.sa_handler = (caddr32_t)(uintptr_t)disp;
+ oact32.sa_flags = flags;
+ sigktou(&set, &oact32.sa_mask);
+ }
+
+ if (actp) {
+ if (sig == SIGCLD &&
+ act32.sa_handler != (caddr32_t)SIG_IGN &&
+ act32.sa_handler != (caddr32_t)SIG_DFL)
+ sigcld_look = 1;
+
+ sigutok(&act32.sa_mask, &set);
+ setsigact(sig, (void (*)())(uintptr_t)act32.sa_handler, set,
+ act32.sa_flags);
+ }
+
+ mutex_exit(&p->p_lock);
+
+ if (sigcld_look)
+ sigcld_repost();
+
+ if (oactp &&
+ copyout(&oact32, oactp, sizeof (oact32)))
+ return (set_errno(EFAULT));
+
+ return (0);
+}
+#endif /* _SYSCALL32_IMPL */
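For illustration only, not part of this change: a minimal userland sketch
assuming the standard libc sigaction(3C) wrapper over this entry point. It
installs a SIGCHLD handler with SA_SIGINFO and reads back the previous
disposition; act and oact may even be the same address, per the comment
above.

#include <signal.h>
#include <stdio.h>
#include <string.h>

static void
chld_handler(int sig, siginfo_t *sip, void *ctx)
{
	/* async-signal-safe work only */
	(void) sig; (void) sip; (void) ctx;
}

int
main(void)
{
	struct sigaction act, oact;

	(void) memset(&act, 0, sizeof (act));
	act.sa_sigaction = chld_handler;
	act.sa_flags = SA_SIGINFO | SA_RESTART | SA_NOCLDSTOP;
	(void) sigemptyset(&act.sa_mask);

	if (sigaction(SIGCHLD, &act, &oact) != 0) {
		perror("sigaction");
		return (1);
	}
	printf("old flags: 0x%x\n", oact.sa_flags);
	return (0);
}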
diff --git a/usr/src/uts/common/syscall/sigaltstack.c b/usr/src/uts/common/syscall/sigaltstack.c
new file mode 100644
index 0000000000..4c310390b3
--- /dev/null
+++ b/usr/src/uts/common/syscall/sigaltstack.c
@@ -0,0 +1,121 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/fault.h>
+#include <sys/signal.h>
+#include <sys/siginfo.h>
+#include <sys/debug.h>
+
+int
+sigaltstack(struct sigaltstack *ssp, struct sigaltstack *oss)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ struct sigaltstack ss;
+
+ /*
+ * User's oss and ss might be the same address, so copyin first and
+ * save before copying out.
+ */
+ if (ssp) {
+ if (lwp->lwp_sigaltstack.ss_flags & SS_ONSTACK)
+ return (set_errno(EPERM));
+ if (copyin(ssp, &ss, sizeof (ss)))
+ return (set_errno(EFAULT));
+ if (ss.ss_flags & ~SS_DISABLE)
+ return (set_errno(EINVAL));
+ if (!(ss.ss_flags & SS_DISABLE) && ss.ss_size < MINSIGSTKSZ)
+ return (set_errno(ENOMEM));
+ }
+
+ if (oss) {
+ if (copyout(&lwp->lwp_sigaltstack,
+ oss, sizeof (struct sigaltstack)))
+ return (set_errno(EFAULT));
+ }
+
+ if (ssp)
+ lwp->lwp_sigaltstack = ss;
+
+ return (0);
+}
+
+#ifdef _LP64
+int
+sigaltstack32(struct sigaltstack32 *ssp, struct sigaltstack32 *oss)
+{
+ klwp_t *lwp = ttolwp(curthread);
+ struct sigaltstack *ss;
+ struct sigaltstack32 ss32, oss32;
+
+ /*
+ * User's oss and ss might be the same address, so copyin first and
+ * save before copying out.
+ */
+ if (ssp) {
+ if (lwp->lwp_sigaltstack.ss_flags & SS_ONSTACK)
+ return (set_errno(EPERM));
+ if (copyin(ssp, &ss32, sizeof (ss32)))
+ return (set_errno(EFAULT));
+ if (ss32.ss_flags & ~SS_DISABLE)
+ return (set_errno(EINVAL));
+ if (!(ss32.ss_flags & SS_DISABLE) && ss32.ss_size < MINSIGSTKSZ)
+ return (set_errno(ENOMEM));
+ }
+
+ if (oss) {
+ /*
+ * copy to ILP32 struct before copyout.
+ */
+ ss = &lwp->lwp_sigaltstack;
+ oss32.ss_sp = (caddr32_t)(uintptr_t)ss->ss_sp;
+ oss32.ss_size = (size32_t)ss->ss_size;
+ oss32.ss_flags = ss->ss_flags;
+
+ if (copyout(&oss32, oss, sizeof (oss32)))
+ return (set_errno(EFAULT));
+ }
+
+ if (ssp) {
+ ss = &lwp->lwp_sigaltstack;
+ ss->ss_sp = (void *)(uintptr_t)ss32.ss_sp;
+ ss->ss_size = (size_t)ss32.ss_size;
+ ss->ss_flags = ss32.ss_flags;
+ }
+
+ return (0);
+}
+#endif /* _LP64 */
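For illustration only, not part of this change: a userland sketch assuming
the standard libc sigaltstack(2) wrapper; it exercises the SS_DISABLE and
MINSIGSTKSZ checks made above.

#include <signal.h>
#include <stdlib.h>
#include <stdio.h>

int
main(void)
{
	stack_t ss, oss;

	ss.ss_sp = malloc(SIGSTKSZ);
	ss.ss_size = SIGSTKSZ;	/* must be >= MINSIGSTKSZ unless disabled */
	ss.ss_flags = 0;	/* only SS_DISABLE may be set on input */
	if (ss.ss_sp == NULL || sigaltstack(&ss, &oss) != 0) {
		perror("sigaltstack");
		return (1);
	}
	printf("previous alternate stack %s\n",
	    (oss.ss_flags & SS_DISABLE) ? "disabled" : "enabled");
	return (0);
}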
diff --git a/usr/src/uts/common/syscall/signotify.c b/usr/src/uts/common/syscall/signotify.c
new file mode 100644
index 0000000000..0c32a0cd5e
--- /dev/null
+++ b/usr/src/uts/common/syscall/signotify.c
@@ -0,0 +1,226 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/procset.h>
+#include <sys/fault.h>
+#include <sys/signal.h>
+#include <sys/siginfo.h>
+#include <vm/as.h>
+#include <sys/debug.h>
+#include <sys/contract/process_impl.h>
+
+/*ARGSUSED*/
+static int
+copyin_siginfo(model_t datamodel, void *uaddr, k_siginfo_t *ksip)
+{
+#ifdef _SYSCALL32_IMPL
+ int ret;
+
+ if (datamodel == DATAMODEL_NATIVE) {
+#endif
+ return (copyin(uaddr, ksip, sizeof (k_siginfo_t)));
+#ifdef _SYSCALL32_IMPL
+ } else {
+ siginfo32_t si32;
+
+ if (ret = copyin(uaddr, &si32, sizeof (si32)))
+ return (ret);
+
+ siginfo_32tok(&si32, ksip);
+ }
+
+ return (0);
+#endif
+}
+
+/*
+ * Find a secure 64-bit id for the signotify() call.
+ * This depends upon as_getmemid(), which returns a
+ * unique vnode/offset pair for a user virtual address.
+ */
+static u_longlong_t
+get_sigid(proc_t *p, caddr_t addr)
+{
+ u_longlong_t snid = 0;
+ memid_t memid;
+ quad_t *tquad = (quad_t *)&snid;
+
+ if (!as_getmemid(p->p_as, addr, &memid)) {
+ tquad->val[0] = (int)memid.val[0];
+ tquad->val[1] = (int)memid.val[1];
+ }
+ return (snid);
+}
+
+#define SIGN_PTR(p, n) (&((signotifyq_t *)(&p->p_signhdr[1]))[n])
+
+int
+signotify(int cmd, siginfo_t *siginfo, signotify_id_t *sn_id)
+{
+ k_siginfo_t info;
+ signotify_id_t id;
+ proc_t *p;
+ proc_t *cp = curproc;
+ signotifyq_t *snqp;
+ struct cred *cr;
+ sigqueue_t *sqp;
+ sigqhdr_t *sqh;
+ u_longlong_t sid;
+ model_t datamodel = get_udatamodel();
+
+ if (copyin(sn_id, &id, sizeof (signotify_id_t)))
+ return (set_errno(EFAULT));
+
+ if (id.sn_index >= _SIGNOTIFY_MAX || id.sn_index < 0)
+ return (set_errno(EINVAL));
+
+ switch (cmd) {
+ case SN_PROC:
+ /* get snid for the given user address of signotifyid_t */
+ sid = get_sigid(cp, (caddr_t)sn_id);
+
+ if (id.sn_pid > 0) {
+ mutex_enter(&pidlock);
+ if ((p = prfind(id.sn_pid)) != NULL) {
+ mutex_enter(&p->p_lock);
+ if (p->p_signhdr != NULL) {
+ snqp = SIGN_PTR(p, id.sn_index);
+ if (snqp->sn_snid == sid) {
+ mutex_exit(&p->p_lock);
+ mutex_exit(&pidlock);
+ return (set_errno(EBUSY));
+ }
+ }
+ mutex_exit(&p->p_lock);
+ }
+ mutex_exit(&pidlock);
+ }
+
+ if (copyin_siginfo(datamodel, siginfo, &info))
+ return (set_errno(EFAULT));
+
+ /* The si_code value must indicate the signal will be queued */
+ if (!sigwillqueue(info.si_signo, info.si_code))
+ return (set_errno(EINVAL));
+
+ if (cp->p_signhdr == NULL) {
+ /* Allocate signotify pool first time */
+ sqh = sigqhdralloc(sizeof (signotifyq_t),
+ _SIGNOTIFY_MAX);
+ mutex_enter(&cp->p_lock);
+ if (cp->p_signhdr == NULL) {
+ /* hang the pool head on proc */
+ cp->p_signhdr = sqh;
+ } else {
+ /* another lwp allocated the pool, free ours */
+ sigqhdrfree(sqh);
+ }
+ } else {
+ mutex_enter(&cp->p_lock);
+ }
+
+ sqp = sigqalloc(cp->p_signhdr);
+ if (sqp == NULL) {
+ mutex_exit(&cp->p_lock);
+ return (set_errno(EAGAIN));
+ }
+ cr = CRED();
+ sqp->sq_info = info;
+ sqp->sq_info.si_pid = cp->p_pid;
+ sqp->sq_info.si_ctid = PRCTID(cp);
+ sqp->sq_info.si_zoneid = getzoneid();
+ sqp->sq_info.si_uid = crgetruid(cr);
+
+ /* fill the signotifyq_t fields */
+ ((signotifyq_t *)sqp)->sn_snid = sid;
+
+ mutex_exit(&cp->p_lock);
+
+ /* complete the signotify_id_t fields */
+ id.sn_index = (signotifyq_t *)sqp - SIGN_PTR(cp, 0);
+ id.sn_pid = cp->p_pid;
+
+ break;
+
+ case SN_CANCEL:
+ case SN_SEND:
+
+ mutex_enter(&pidlock);
+ if ((id.sn_pid <= 0) || ((p = prfind(id.sn_pid)) == NULL)) {
+ mutex_exit(&pidlock);
+ return (set_errno(EINVAL));
+ }
+ mutex_enter(&p->p_lock);
+ mutex_exit(&pidlock);
+
+ if (p->p_signhdr == NULL) {
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINVAL));
+ }
+
+ snqp = SIGN_PTR(p, id.sn_index);
+
+ if (snqp->sn_snid == 0) {
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINVAL));
+ }
+
+ if (snqp->sn_snid != get_sigid(cp, (caddr_t)sn_id)) {
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINVAL));
+ }
+
+ snqp->sn_snid = 0;
+
+ /* cmd == SN_CANCEL or signo == 0 (SIGEV_NONE) */
+ if (((sigqueue_t *)snqp)->sq_info.si_signo <= 0)
+ cmd = SN_CANCEL;
+
+ sigqsend(cmd, p, 0, (sigqueue_t *)snqp);
+ mutex_exit(&p->p_lock);
+
+ id.sn_pid = 0;
+ id.sn_index = 0;
+
+ break;
+
+ default :
+ return (set_errno(EINVAL));
+ }
+
+ if (copyout(&id, sn_id, sizeof (signotify_id_t)))
+ return (set_errno(EFAULT));
+
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/sigpending.c b/usr/src/uts/common/syscall/sigpending.c
new file mode 100644
index 0000000000..5801d09d26
--- /dev/null
+++ b/usr/src/uts/common/syscall/sigpending.c
@@ -0,0 +1,72 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1994-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/fault.h>
+#include <sys/signal.h>
+#include <sys/schedctl.h>
+#include <sys/debug.h>
+
+int
+sigpending(int flag, sigset_t *setp)
+{
+ sigset_t set;
+ k_sigset_t kset;
+ proc_t *p;
+
+ switch (flag) {
+ case 1: /* sigpending */
+ p = ttoproc(curthread);
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(curthread);
+ kset = p->p_sig;
+ sigorset(&kset, &curthread->t_sig);
+ sigandset(&kset, &curthread->t_hold);
+ mutex_exit(&p->p_lock);
+ break;
+ case 2: /* sigfillset */
+ kset = fillset;
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ sigktou(&kset, &set);
+ if (copyout((caddr_t)&set, (caddr_t)setp, sizeof (sigset_t)))
+ return (set_errno(EFAULT));
+ return (0);
+}
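For illustration only, not part of this change: a sketch of the flag == 1
path via the libc sigpending(3C) wrapper (flag == 2 backs the sigfillset
support). A blocked, then raised, SIGINT shows up in the returned set.

#include <signal.h>
#include <stdio.h>

int
main(void)
{
	sigset_t blocked, pending;

	(void) sigemptyset(&blocked);
	(void) sigaddset(&blocked, SIGINT);
	(void) sigprocmask(SIG_BLOCK, &blocked, NULL);

	(void) raise(SIGINT);	/* stays pending while blocked */

	if (sigpending(&pending) != 0) {
		perror("sigpending");
		return (1);
	}
	printf("SIGINT pending: %d\n", sigismember(&pending, SIGINT));
	return (0);
}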
diff --git a/usr/src/uts/common/syscall/sigprocmask.c b/usr/src/uts/common/syscall/sigprocmask.c
new file mode 100644
index 0000000000..8f7cf6113d
--- /dev/null
+++ b/usr/src/uts/common/syscall/sigprocmask.c
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+
+
+/*
+ * Copyright 1994-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/fault.h>
+#include <sys/signal.h>
+#include <sys/schedctl.h>
+#include <sys/debug.h>
+
+int64_t
+lwp_sigmask(int how, uint_t bits0, uint_t bits1)
+{
+ kthread_t *t = curthread;
+ proc_t *p = ttoproc(t);
+ rval_t rv;
+
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(t);
+
+ bits0 &= (FILLSET0 & ~CANTMASK0);
+ bits1 &= (FILLSET1 & ~CANTMASK1);
+
+ rv.r_val1 = t->t_hold.__sigbits[0];
+ rv.r_val2 = t->t_hold.__sigbits[1];
+
+ switch (how) {
+ case SIG_BLOCK:
+ t->t_hold.__sigbits[0] |= bits0;
+ t->t_hold.__sigbits[1] |= bits1;
+ break;
+ case SIG_UNBLOCK:
+ t->t_hold.__sigbits[0] &= ~bits0;
+ t->t_hold.__sigbits[1] &= ~bits1;
+ if (sigcheck(p, t))
+ t->t_sig_check = 1;
+ break;
+ case SIG_SETMASK:
+ t->t_hold.__sigbits[0] = bits0;
+ t->t_hold.__sigbits[1] = bits1;
+ if (sigcheck(p, t))
+ t->t_sig_check = 1;
+ break;
+ }
+
+ mutex_exit(&p->p_lock);
+ return (rv.r_vals);
+}
+
+/*
+ * This system call is no longer called from libc.
+ * It exists solely for the benefit of statically-linked
+ * binaries from the past. It should be eliminated.
+ */
+int
+sigprocmask(int how, sigset_t *setp, sigset_t *osetp)
+{
+ sigset_t set;
+ k_sigset_t kset;
+ rval_t rv;
+
+ /*
+ * User's oset and set might be the same address, so copyin first and
+ * save before copying out.
+ */
+ if (setp) {
+ switch (how) {
+ case SIG_BLOCK:
+ case SIG_UNBLOCK:
+ case SIG_SETMASK:
+ break;
+ default:
+ return (set_errno(EINVAL));
+ }
+ if (copyin((caddr_t)setp, (caddr_t)&set, sizeof (sigset_t)))
+ return (set_errno(EFAULT));
+ sigutok(&set, &kset);
+ } else {
+ /* none of SIG_BLOCK, SIG_UNBLOCK, SIG_SETMASK equals 0 */
+ how = 0;
+ sigemptyset(&kset);
+ }
+
+ rv.r_vals = lwp_sigmask(how, kset.__sigbits[0], kset.__sigbits[1]);
+
+ if (osetp) {
+ kset.__sigbits[0] = rv.r_val1;
+ kset.__sigbits[1] = rv.r_val2;
+ sigktou(&kset, &set);
+ if (copyout((caddr_t)&set, (caddr_t)osetp, sizeof (sigset_t)))
+ return (set_errno(EFAULT));
+ }
+
+ return (0);
+}
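For illustration only, not part of this change: the classic save/restore
pattern over the libc sigprocmask(2) wrapper. The saved mask corresponds
to the old word pair that lwp_sigmask() returns in r_val1/r_val2.

#include <signal.h>
#include <stdio.h>

int
main(void)
{
	sigset_t block, saved;

	(void) sigemptyset(&block);
	(void) sigaddset(&block, SIGTERM);

	/* block SIGTERM, remembering the previous mask */
	if (sigprocmask(SIG_BLOCK, &block, &saved) != 0) {
		perror("sigprocmask");
		return (1);
	}

	/* ... critical section with SIGTERM held ... */

	(void) sigprocmask(SIG_SETMASK, &saved, NULL);	/* restore */
	return (0);
}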
diff --git a/usr/src/uts/common/syscall/sigqueue.c b/usr/src/uts/common/syscall/sigqueue.c
new file mode 100644
index 0000000000..38c5b91202
--- /dev/null
+++ b/usr/src/uts/common/syscall/sigqueue.c
@@ -0,0 +1,185 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1998-2000 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/procset.h>
+#include <sys/fault.h>
+#include <sys/signal.h>
+#include <sys/siginfo.h>
+#include <sys/debug.h>
+
+static int
+sigqkill(pid_t pid, int signo, sigsend_t *sigsend)
+{
+ register proc_t *p;
+ int error;
+
+ if (signo < 0 || signo >= NSIG)
+ return (set_errno(EINVAL));
+
+ if (pid == -1) {
+ procset_t set;
+
+ setprocset(&set, POP_AND, P_ALL, P_MYID, P_ALL, P_MYID);
+ error = sigsendset(&set, sigsend);
+ } else if (pid > 0) {
+ mutex_enter(&pidlock);
+ if ((p = prfind(pid)) == NULL || p->p_stat == SIDL)
+ error = ESRCH;
+ else {
+ error = sigsendproc(p, sigsend);
+ if (error == 0 && sigsend->perm == 0)
+ error = EPERM;
+ }
+ mutex_exit(&pidlock);
+ } else {
+ int nfound = 0;
+ pid_t pgid;
+
+ if (pid == 0)
+ pgid = ttoproc(curthread)->p_pgrp;
+ else
+ pgid = -pid;
+
+ error = 0;
+ mutex_enter(&pidlock);
+ for (p = pgfind(pgid); p && !error; p = p->p_pglink) {
+ if (p->p_stat != SIDL) {
+ nfound++;
+ error = sigsendproc(p, sigsend);
+ }
+ }
+ mutex_exit(&pidlock);
+ if (nfound == 0)
+ error = ESRCH;
+ else if (error == 0 && sigsend->perm == 0)
+ error = EPERM;
+ }
+
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+
+/*
+ * For implementations that don't require binary compatibility,
+ * the kill system call may be made into a library call to the
+ * sigsend system call.
+ */
+int
+kill(pid_t pid, int sig)
+{
+ sigsend_t v;
+
+ bzero(&v, sizeof (v));
+ v.sig = sig;
+ v.checkperm = 1;
+ v.sicode = SI_USER;
+
+ return (sigqkill(pid, sig, &v));
+}
+
+/*
+ * The handling of small unions, like the sigval argument to sigqueue,
+ * is architecture dependent. We have adopted the convention that the
+ * value itself is passed in the storage which crosses the kernel
+ * protection boundary. This procedure will accept a scalar argument,
+ * and store it in the appropriate value member of the sigsend_t structure.
+ */
+int
+sigqueue(pid_t pid, int signo, /* union sigval */ void *value, int si_code)
+{
+ sigsend_t v;
+ sigqhdr_t *sqh;
+ proc_t *p = curproc;
+
+ /* The si_code value must indicate the signal will be queued */
+ if (pid <= 0 || !sigwillqueue(signo, si_code))
+ return (set_errno(EINVAL));
+
+ if (p->p_sigqhdr == NULL) {
+ /* Allocate sigqueue pool first time */
+ sqh = sigqhdralloc(sizeof (sigqueue_t), _SIGQUEUE_MAX);
+ mutex_enter(&p->p_lock);
+ if (p->p_sigqhdr == NULL) {
+ /* hang the pool head on proc */
+ p->p_sigqhdr = sqh;
+ } else {
+ /* another lwp allocated the pool, free ours */
+ sigqhdrfree(sqh);
+ }
+ mutex_exit(&p->p_lock);
+ }
+
+ bzero(&v, sizeof (v));
+ v.sig = signo;
+ v.checkperm = 1;
+ v.sicode = si_code;
+ v.value.sival_ptr = value;
+
+ return (sigqkill(pid, signo, &v));
+}
+
+#ifdef _SYSCALL32_IMPL
+/*
+ * sigqueue32 - System call entry point for 32-bit callers on LP64 kernel,
+ * needed to handle the 32-bit sigvals as correctly as we can. We always
+ * assume that a 32-bit caller is passing an int. A 64-bit recipient
+ * that expects an int will therefore get it correctly. A 32-bit
+ * recipient will also get it correctly since siginfo_kto32() uses
+ * sival_int in the conversion. Since a 32-bit pointer has the same
+ * size and address in the sigval, it also converts correctly so that
+ * two 32-bit apps can exchange a pointer value. However, this means
+ * that a pointer sent by a 32-bit caller will be seen in the upper half
+ * by a 64-bit recipient, and only the upper half of a 64-bit pointer will
+ * be seen by a 32-bit recipient. This is the best solution that does
+ * not require severe hacking of the sigval union. In any case, what it
+ * means to send pointers between processes with dissimilar
+ * models is unclear.
+ */
+int
+sigqueue32(pid_t pid, int signo, /* union sigval32 */ caddr32_t value,
+ int si_code)
+{
+ union sigval sv;
+
+ bzero(&sv, sizeof (sv));
+ sv.sival_int = (int)value;
+ return (sigqueue(pid, signo, sv.sival_ptr, si_code));
+}
+#endif
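For illustration only, not part of this change: a userland sketch assuming
the libc sigqueue(3RT) wrapper. The scalar payload rides in the sigval, as
the comment above describes; SIGUSR1 is blocked first so its default
disposition doesn't terminate the example.

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	sigset_t set;
	union sigval sv;

	(void) sigemptyset(&set);
	(void) sigaddset(&set, SIGUSR1);
	(void) sigprocmask(SIG_BLOCK, &set, NULL);

	sv.sival_int = 42;	/* value crosses the kernel boundary whole */
	if (sigqueue(getpid(), SIGUSR1, sv) != 0) {
		perror("sigqueue");
		return (1);
	}
	printf("queued SIGUSR1 with value %d\n", sv.sival_int);
	return (0);
}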
diff --git a/usr/src/uts/common/syscall/sigsendset.c b/usr/src/uts/common/syscall/sigsendset.c
new file mode 100644
index 0000000000..fa40d76502
--- /dev/null
+++ b/usr/src/uts/common/syscall/sigsendset.c
@@ -0,0 +1,67 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* Copyright (c) 1994 Sun Microsystems, Inc. */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright (c) 1998 by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/fault.h>
+#include <sys/procset.h>
+#include <sys/signal.h>
+#include <sys/siginfo.h>
+#include <sys/debug.h>
+
+int
+sigsendsys(procset_t *psp, int sig)
+{
+ int error;
+ procset_t set;
+ sigsend_t v;
+
+ if (sig < 0 || sig >= NSIG)
+ return (set_errno(EINVAL));
+
+ bzero(&v, sizeof (v));
+ v.sig = sig;
+ v.checkperm = 1;
+ v.sicode = SI_USER;
+
+ if (copyin((caddr_t)psp, (caddr_t)&set, sizeof (procset_t)))
+ return (set_errno(EFAULT));
+ if (error = sigsendset(&set, &v))
+ return (set_errno(error));
+ return (0);
+}
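For illustration only, not part of this change: a userland sketch assuming
the Solaris sigsend(2) wrapper, which reaches sigsendsys() with a procset
describing a single process (P_PID). SIGUSR1 is blocked first so its
default disposition doesn't terminate the example.

#include <sys/types.h>
#include <sys/procset.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	sigset_t set;

	(void) sigemptyset(&set);
	(void) sigaddset(&set, SIGUSR1);
	(void) sigprocmask(SIG_BLOCK, &set, NULL);

	/* send SIGUSR1 to ourselves, addressed by process id */
	if (sigsend(P_PID, (id_t)getpid(), SIGUSR1) != 0) {
		perror("sigsend");
		return (1);
	}
	return (0);
}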
diff --git a/usr/src/uts/common/syscall/sigsuspend.c b/usr/src/uts/common/syscall/sigsuspend.c
new file mode 100644
index 0000000000..819bf787fc
--- /dev/null
+++ b/usr/src/uts/common/syscall/sigsuspend.c
@@ -0,0 +1,66 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright 1994-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/fault.h>
+#include <sys/procset.h>
+#include <sys/signal.h>
+#include <sys/schedctl.h>
+#include <sys/debug.h>
+
+int
+sigsuspend(sigset_t *setp)
+{
+ sigset_t set;
+ k_sigset_t kset;
+ proc_t *p = curproc;
+
+ if (copyin((caddr_t)setp, (caddr_t)&set, sizeof (sigset_t)))
+ return (set_errno(EFAULT));
+ sigutok(&set, &kset);
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(curthread);
+ ttolwp(curthread)->lwp_sigoldmask = curthread->t_hold;
+ curthread->t_hold = kset;
+ curthread->t_sig_check = 1; /* so post-syscall will re-evaluate */
+ curthread->t_flag |= T_TOMASK;
+ /* pause() */
+ while (cv_wait_sig_swap(&curthread->t_delay_cv, &p->p_lock))
+ ;
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINTR));
+}
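For illustration only, not part of this change: a userland sketch of the
race-free wait pattern this entry point exists for, assuming the standard
libc sigprocmask/sigsuspend wrappers. Run it and send SIGUSR1 from another
shell to wake it; sigsuspend() always returns with EINTR, as above.

#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t got_usr1;

static void
usr1_handler(int sig)
{
	(void) sig;
	got_usr1 = 1;
}

int
main(void)
{
	struct sigaction sa;
	sigset_t block, waitmask;

	sa.sa_handler = usr1_handler;
	sa.sa_flags = 0;
	(void) sigemptyset(&sa.sa_mask);
	(void) sigaction(SIGUSR1, &sa, NULL);

	/* block SIGUSR1, then test-and-wait without a window */
	(void) sigemptyset(&block);
	(void) sigaddset(&block, SIGUSR1);
	(void) sigprocmask(SIG_BLOCK, &block, &waitmask);
	(void) sigdelset(&waitmask, SIGUSR1);

	while (!got_usr1)
		(void) sigsuspend(&waitmask);	/* atomic unblock + sleep */

	printf("got SIGUSR1\n");
	return (0);
}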
diff --git a/usr/src/uts/common/syscall/sigtimedwait.c b/usr/src/uts/common/syscall/sigtimedwait.c
new file mode 100644
index 0000000000..ad4d79b763
--- /dev/null
+++ b/usr/src/uts/common/syscall/sigtimedwait.c
@@ -0,0 +1,207 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/bitmap.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/fault.h>
+#include <sys/procset.h>
+#include <sys/signal.h>
+#include <sys/siginfo.h>
+#include <sys/time.h>
+#include <sys/kmem.h>
+#include <sys/schedctl.h>
+#include <sys/debug.h>
+#include <sys/condvar_impl.h>
+#include <sys/model.h>
+#include <sys/sdt.h>
+#include <sys/zone.h>
+
+static int
+copyout_siginfo(model_t datamodel, k_siginfo_t *ksip, void *uaddr)
+{
+ zoneid_t zoneid = getzoneid();
+
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (SI_FROMUSER(ksip) && zoneid != GLOBAL_ZONEID &&
+ zoneid != ksip->si_zoneid) {
+ k_siginfo_t sani_sip = *ksip;
+ sani_sip.si_pid = curproc->p_zone->zone_zsched->p_pid;
+ sani_sip.si_uid = 0;
+ sani_sip.si_ctid = -1;
+ sani_sip.si_zoneid = zoneid;
+ if (copyout(&sani_sip, uaddr, sizeof (sani_sip)))
+ return (set_errno(EFAULT));
+ } else {
+ if (copyout(ksip, uaddr, sizeof (*ksip)))
+ return (set_errno(EFAULT));
+ }
+ }
+#ifdef _SYSCALL32_IMPL
+ else {
+ siginfo32_t si32;
+
+ siginfo_kto32(ksip, &si32);
+ if (SI_FROMUSER(ksip) && zoneid != GLOBAL_ZONEID &&
+ zoneid != ksip->si_zoneid) {
+ si32.si_pid = curproc->p_zone->zone_zsched->p_pid;
+ si32.si_uid = 0;
+ si32.si_ctid = -1;
+ si32.si_zoneid = zoneid;
+ }
+ if (copyout(&si32, uaddr, sizeof (si32)))
+ return (set_errno(EFAULT));
+ }
+#endif
+ return (ksip->si_signo);
+}
+
+/*
+ * Wait until a signal within a specified set is posted, or until the
+ * time interval 'timeout' elapses, if one is specified. The signal is
+ * caught but not delivered. The value of the signal is returned to
+ * the caller.
+ */
+int
+sigtimedwait(sigset_t *setp, siginfo_t *siginfop, timespec_t *timeoutp)
+{
+ sigset_t set;
+ k_sigset_t kset;
+ k_sigset_t oldmask;
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+ proc_t *p = ttoproc(t);
+ timespec_t sig_timeout;
+ timespec_t *rqtp = NULL;
+ int timecheck = 0;
+ int ret;
+ int error = 0;
+ k_siginfo_t info, *infop;
+ model_t datamodel = get_udatamodel();
+
+ if (timeoutp) {
+ timespec_t now;
+
+ timecheck = timechanged;
+ gethrestime(&now);
+ if (datamodel == DATAMODEL_NATIVE) {
+ if (copyin(timeoutp, &sig_timeout,
+ sizeof (sig_timeout)))
+ return (set_errno(EFAULT));
+ } else {
+ timespec32_t timeout32;
+
+ if (copyin(timeoutp, &timeout32, sizeof (timeout32)))
+ return (set_errno(EFAULT));
+ TIMESPEC32_TO_TIMESPEC(&sig_timeout, &timeout32)
+ }
+
+ if (itimerspecfix(&sig_timeout))
+ return (set_errno(EINVAL));
+ /*
+ * Convert the timespec value into absolute time.
+ */
+ timespecadd(&sig_timeout, &now);
+ rqtp = &sig_timeout;
+ }
+ if (copyin(setp, &set, sizeof (set)))
+ return (set_errno(EFAULT));
+ sigutok(&set, &kset);
+ if (sigisempty(&kset))
+ return (set_errno(EINVAL));
+
+ mutex_enter(&p->p_lock);
+ /*
+ * set the thread's signal mask to unmask
+ * those signals in the specified set.
+ */
+ schedctl_finish_sigblock(t);
+ oldmask = t->t_hold;
+ sigdiffset(&t->t_hold, &kset);
+
+ /*
+ * Wait until we take a signal or until
+ * the absolute future time is passed.
+ */
+ while ((ret = cv_waituntil_sig(&t->t_delay_cv, &p->p_lock,
+ rqtp, timecheck)) > 0)
+ continue;
+ if (ret == -1)
+ error = EAGAIN;
+
+ /*
+ * Restore thread's signal mask to its previous value.
+ */
+ t->t_hold = oldmask;
+ t->t_sig_check = 1; /* so post_syscall sees new t_hold mask */
+
+ if (error) {
+ mutex_exit(&p->p_lock);
+ return (set_errno(error)); /* timer expired */
+ }
+ /*
+ * Don't bother with signal if it is not in request set.
+ */
+ if (lwp->lwp_cursig == 0 || !sigismember(&kset, lwp->lwp_cursig)) {
+ mutex_exit(&p->p_lock);
+ /*
+ * lwp_cursig is zero if pokelwps() awakened cv_wait_sig().
+ * This happens if some other thread in this process called
+ * forkall() or exit().
+ */
+ return (set_errno(EINTR));
+ }
+
+ if (lwp->lwp_curinfo)
+ infop = &lwp->lwp_curinfo->sq_info;
+ else {
+ infop = &info;
+ bzero(infop, sizeof (info));
+ infop->si_signo = lwp->lwp_cursig;
+ infop->si_code = SI_NOINFO;
+ }
+
+ lwp->lwp_ru.nsignals++;
+ ret = lwp->lwp_cursig;
+ DTRACE_PROC2(signal__clear, int, ret, ksiginfo_t *, infop);
+ lwp->lwp_cursig = 0;
+ lwp->lwp_extsig = 0;
+ mutex_exit(&p->p_lock);
+
+ if (siginfop)
+ ret = copyout_siginfo(datamodel, infop, siginfop);
+ if (lwp->lwp_curinfo) {
+ siginfofree(lwp->lwp_curinfo);
+ lwp->lwp_curinfo = NULL;
+ }
+ return (ret);
+}
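For illustration only, not part of this change: a userland sketch assuming
the standard libc sigtimedwait(3RT) wrapper. The relative timeout copied
in above is converted to absolute time by the kernel; EAGAIN reports
expiry.

#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <errno.h>

int
main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec ts = { 5, 0 };	/* wait at most five seconds */
	int sig;

	/* the awaited signal must be blocked, or it may be delivered */
	(void) sigemptyset(&set);
	(void) sigaddset(&set, SIGUSR1);
	(void) sigprocmask(SIG_BLOCK, &set, NULL);

	sig = sigtimedwait(&set, &info, &ts);
	if (sig < 0) {
		if (errno == EAGAIN)
			printf("timed out\n");
		else
			perror("sigtimedwait");
		return (1);
	}
	printf("caught signal %d from pid %ld\n", sig, (long)info.si_pid);
	return (0);
}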
diff --git a/usr/src/uts/common/syscall/ssig.c b/usr/src/uts/common/syscall/ssig.c
new file mode 100644
index 0000000000..e0998f474b
--- /dev/null
+++ b/usr/src/uts/common/syscall/ssig.c
@@ -0,0 +1,169 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/fault.h>
+#include <sys/procset.h>
+#include <sys/signal.h>
+#include <sys/schedctl.h>
+#include <sys/debug.h>
+
+
+/*
+ * ssig() is the old common entry for signal, sigset, sighold,
+ * sigrelse, sigignore and sigpause.
+ *
+ * All of these interfaces have been reimplemented in libc using
+ * calls to sigaction, sigsuspend and sigprocmask.
+ *
+ * This kernel interface is no longer called by any application
+ * that is dynamically linked with libc. It exists solely for
+ * the benefit of really old statically-linked applications.
+ * It should be removed from the system.
+ */
+
+int
+ssig(int signo, void (*func)())
+{
+ int sig;
+ struct proc *p;
+ int flags;
+ int retval = 0;
+ int sigcld_look = 0;
+
+ sig = signo & SIGNO_MASK;
+
+ if (sig <= 0 || sig >= NSIG || sigismember(&cantmask, sig))
+ return (set_errno(EINVAL));
+
+ p = ttoproc(curthread);
+ mutex_enter(&p->p_lock);
+ schedctl_finish_sigblock(curthread);
+ switch (signo & ~SIGNO_MASK) {
+
+ case SIGHOLD: /* sighold */
+ sigaddset(&curthread->t_hold, sig);
+ mutex_exit(&p->p_lock);
+ return (0);
+
+ case SIGRELSE: /* sigrelse */
+ sigdelset(&curthread->t_hold, sig);
+ curthread->t_sig_check = 1; /* so ISSIG will see release */
+ mutex_exit(&p->p_lock);
+ return (0);
+
+ case SIGPAUSE: /* sigpause */
+ sigdelset(&curthread->t_hold, sig);
+ curthread->t_sig_check = 1; /* so ISSIG will see release */
+ /* pause() */
+ while (cv_wait_sig_swap(&curthread->t_delay_cv, &p->p_lock))
+ ;
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINTR));
+
+	case SIGIGNORE: /* sigignore */
+ sigdelset(&curthread->t_hold, sig);
+ curthread->t_sig_check = 1; /* so ISSIG will see release */
+ func = SIG_IGN;
+ flags = 0;
+ break;
+
+ case SIGDEFER: /* sigset */
+ if (sigismember(&curthread->t_hold, sig))
+ retval = (int)SIG_HOLD;
+ else
+ retval = (int)(uintptr_t)u.u_signal[sig-1];
+ if (func == SIG_HOLD) {
+ sigaddset(&curthread->t_hold, sig);
+ mutex_exit(&p->p_lock);
+ return (retval);
+ }
+
+#if defined(__sparc)
+ /*
+ * Check alignment of handler
+ */
+ if (func != SIG_IGN && func != SIG_DFL &&
+ ((uintptr_t)func & 0x3) != 0) {
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINVAL));
+ }
+#endif
+ sigdelset(&curthread->t_hold, sig);
+ curthread->t_sig_check = 1; /* so post_syscall sees it */
+ flags = 0;
+ break;
+
+ case 0: /* signal */
+#if defined(__sparc)
+ /*
+ * Check alignment of handler
+ */
+ if (func != SIG_IGN && func != SIG_DFL &&
+ ((uintptr_t)func & 0x3) != 0) {
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINVAL));
+ }
+#endif
+ retval = (int)(uintptr_t)u.u_signal[sig-1];
+ flags = SA_RESETHAND|SA_NODEFER;
+ break;
+
+ default: /* error */
+ mutex_exit(&p->p_lock);
+ return (set_errno(EINVAL));
+ }
+
+ if (sigismember(&stopdefault, sig))
+ flags |= SA_RESTART;
+ else if (sig == SIGCLD) {
+ flags |= SA_NOCLDSTOP;
+ if (func == SIG_IGN)
+ flags |= SA_NOCLDWAIT;
+ else if (func != SIG_DFL)
+ sigcld_look = 1;
+ }
+
+ setsigact(sig, func, nullsmask, flags);
+ mutex_exit(&p->p_lock);
+
+ if (sigcld_look)
+ sigcld_repost();
+
+ return (retval);
+}
diff --git a/usr/src/uts/common/syscall/stat.c b/usr/src/uts/common/syscall/stat.c
new file mode 100644
index 0000000000..b9505ebaa1
--- /dev/null
+++ b/usr/src/uts/common/syscall/stat.c
@@ -0,0 +1,675 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Get file attribute information through a file name or a file descriptor.
+ */
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/fcntl.h>
+#include <sys/pathname.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/ioreq.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <c2/audit.h>
+
+/*
+ * Get the vp to be stated and the cred to be used for the call
+ * to VOP_GETATTR
+ */
+
+/*
+ * nmflag has the following values:
+ *
+ * 1 - Always do the lookup, i.e. stat, lstat.
+ * 2 - Name is optional, i.e. fstatat.
+ * 0 - Don't look up the name; vp is in file_p, i.e. fstat.
+ */
+static int
+cstatat_getvp(int fd, char *name, int nmflag,
+ int follow, vnode_t **vp, cred_t **cred)
+{
+ vnode_t *startvp;
+ file_t *fp;
+ int error;
+ cred_t *cr;
+
+ *vp = NULL;
+
+ /*
+ * Only return EFAULT for fstatat when fd == AT_FDCWD && name == NULL
+ */
+
+ if (fd == AT_FDCWD) {
+ if (name != NULL || nmflag != 2) {
+ startvp = NULL;
+ cr = CRED();
+ crhold(cr);
+ } else
+ return (EFAULT);
+ } else {
+ char startchar;
+
+ if (nmflag == 1 || (nmflag == 2 && name != NULL)) {
+ if (copyin(name, &startchar, sizeof (char)))
+ return (EFAULT);
+ } else {
+ startchar = '\0';
+ }
+ if (startchar != '/' || nmflag == 0) {
+ if ((fp = getf(fd)) == NULL) {
+ return (EBADF);
+ }
+ startvp = fp->f_vnode;
+ cr = fp->f_cred;
+ crhold(cr);
+ VN_HOLD(startvp);
+ releasef(fd);
+ } else {
+ startvp = NULL;
+ cr = CRED();
+ crhold(cr);
+ }
+ }
+ *cred = cr;
+
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_setfsat_path(1);
+#endif /* C2_AUDIT */
+
+
+ if (nmflag == 1 || (nmflag == 2 && name != NULL)) {
+lookup:
+ if (error = lookupnameat(name, UIO_USERSPACE, follow, NULLVPP,
+ vp, startvp)) {
+ if (error == ESTALE)
+ goto lookup;
+ if (startvp != NULL)
+ VN_RELE(startvp);
+ crfree(cr);
+ return (error);
+ }
+ if (startvp != NULL)
+ VN_RELE(startvp);
+ } else {
+ *vp = startvp;
+ }
+
+ return (0);
+}
+
+/*
+ * Native syscall interfaces:
+ *
+ * N-bit kernel, N-bit applications, N-bit file offsets
+ */
+
+static int cstatat(int, char *, int, struct stat *, int, int);
+static int cstat(vnode_t *vp, struct stat *, int, cred_t *);
+
+int
+stat(char *fname, struct stat *sb)
+{
+ return (cstatat(AT_FDCWD, fname, 1, sb, 0, ATTR_REAL));
+}
+
+int
+lstat(char *fname, struct stat *sb)
+{
+ return (cstatat(AT_FDCWD, fname, 1, sb, AT_SYMLINK_NOFOLLOW, 0));
+}
+
+/*
+ * fstat can and should be fast, so do an inline implementation here.
+ */
+#define FSTAT_BODY(fd, sb, statfn) \
+ { \
+ file_t *fp; \
+ int error; \
+ \
+ if ((fp = getf(fd)) == NULL) \
+ return (set_errno(EBADF)); \
+ if (audit_active) \
+ audit_setfsat_path(1); \
+ error = statfn(fp->f_vnode, sb, 0, fp->f_cred); \
+ releasef(fd); \
+ if (error) \
+ return (set_errno(error)); \
+ return (0); \
+ }
+
+int
+fstat(int fd, struct stat *sb)
+{
+ FSTAT_BODY(fd, sb, cstat)
+}
+
+int
+fstatat(int fd, char *name, struct stat *sb, int flags)
+{
+ return (cstatat(fd, name, 2, sb, flags, 0));
+}
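For illustration only, not part of this change: a userland sketch assuming
the libc fstatat(2) wrapper. Passing AT_FDCWD with a non-NULL name takes
the nmflag == 2, name-present path through cstatat_getvp() above.

#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>

int
main(void)
{
	struct stat sb;

	if (fstatat(AT_FDCWD, "/etc/passwd", &sb, AT_SYMLINK_NOFOLLOW) != 0) {
		perror("fstatat");
		return (1);
	}
	printf("size %lld bytes, mode 0%o\n",
	    (long long)sb.st_size, (unsigned int)(sb.st_mode & 0777));
	return (0);
}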
+
+#if defined(__i386) || defined(__i386_COMPAT)
+
+/*
+ * Handle all the "extended" stat operations in the same way;
+ * validate the version, then call the real handler.
+ */
+
+#define XSTAT_BODY(ver, f, s, fn) \
+ return (ver != _STAT_VER ? set_errno(EINVAL) : fn(f, s));
+
+#endif /* __i386 || __i386_COMPAT */
+
+#if defined(__i386)
+
+/*
+ * Syscalls for i386 applications that issue {,l,f}xstat() directly
+ */
+int
+xstat(int version, char *fname, struct stat *sb)
+{
+ XSTAT_BODY(version, fname, sb, stat)
+}
+
+int
+lxstat(int version, char *fname, struct stat *sb)
+{
+ XSTAT_BODY(version, fname, sb, lstat)
+}
+
+int
+fxstat(int version, int fd, struct stat *sb)
+{
+ XSTAT_BODY(version, fd, sb, fstat)
+}
+
+#endif /* __i386 */
+
+/*
+ * Common code for stat(), lstat(), and fstat().
+ * (32-bit kernel, 32-bit applications, 32-bit files)
+ * (64-bit kernel, 64-bit applications, 64-bit files)
+ */
+static int
+cstat(vnode_t *vp, struct stat *ubp, int flag, cred_t *cr)
+{
+ struct vfssw *vswp;
+ struct stat sb;
+ vattr_t vattr;
+ int error;
+
+ vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE;
+ if ((error = VOP_GETATTR(vp, &vattr, flag, cr)) != 0)
+ return (error);
+#ifdef _ILP32
+ /*
+ * (32-bit kernel, 32-bit applications, 32-bit files)
+	 * NOTE: 32-bit kernel maintains a 64-bit unsigned va_size.
+ *
+ * st_size of devices (VBLK and VCHR special files) is a special case.
+ * POSIX does not define size behavior for special files, so the
+ * following Solaris specific behavior is not a violation. Solaris
+ * returns the size of the device.
+ *
+	 * For compatibility with 32-bit programs which happen to do stat() on
+	 * a device node (mknod) bigger than 2GB, we suppress the large-file
+	 * EOVERFLOW and instead return the value MAXOFF32_T (LONG_MAX).
+ *
+ * 32-bit applications that care about the size of devices should be
+ * built 64-bit or use a large file interface (lfcompile(5) or lf64(5)).
+ */
+ if ((vattr.va_size > MAXOFF32_T) &&
+ ((vp->v_type == VBLK) || (vp->v_type == VCHR))) {
+ /* OVERFLOW | UNKNOWN_SIZE */
+ vattr.va_size = MAXOFF32_T;
+ }
+#endif /* _ILP32 */
+ if (vattr.va_size > MAXOFF_T || vattr.va_nblocks > LONG_MAX ||
+ vattr.va_nodeid > ULONG_MAX)
+ return (EOVERFLOW);
+
+ bzero(&sb, sizeof (sb));
+ sb.st_dev = vattr.va_fsid;
+ sb.st_ino = (ino_t)vattr.va_nodeid;
+ sb.st_mode = VTTOIF(vattr.va_type) | vattr.va_mode;
+ sb.st_nlink = vattr.va_nlink;
+ sb.st_uid = vattr.va_uid;
+ sb.st_gid = vattr.va_gid;
+ sb.st_rdev = vattr.va_rdev;
+ sb.st_size = (off_t)vattr.va_size;
+ sb.st_atim = vattr.va_atime;
+ sb.st_mtim = vattr.va_mtime;
+ sb.st_ctim = vattr.va_ctime;
+ sb.st_blksize = vattr.va_blksize;
+ sb.st_blocks = (blkcnt_t)vattr.va_nblocks;
+ if (vp->v_vfsp != NULL) {
+ vswp = &vfssw[vp->v_vfsp->vfs_fstype];
+ if (vswp->vsw_name && *vswp->vsw_name)
+ (void) strcpy(sb.st_fstype, vswp->vsw_name);
+ }
+ if (copyout(&sb, ubp, sizeof (sb)))
+ return (EFAULT);
+ return (0);
+}
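
The observable effect of the _ILP32 device clamp above, sketched from user space (hedged illustration: a 32-bit compilation environment is assumed and the device path is hypothetical): stat() of a 2GB-or-larger block device succeeds with st_size == LONG_MAX instead of failing with EOVERFLOW.

	#include <limits.h>
	#include <sys/stat.h>

	static int
	device_size_clamped(const char *dev)	/* e.g. "/dev/dsk/c0t0d0s2" */
	{
		struct stat sb;

		if (stat(dev, &sb) != 0)
			return (-1);
		/* A VBLK/VCHR node of 2GB or more reports exactly LONG_MAX. */
		return (sb.st_size == LONG_MAX);
	}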
+
+static int
+cstatat(int fd, char *name, int nmflag, struct stat *sb, int follow, int flags)
+{
+ vnode_t *vp;
+ int error;
+ cred_t *cred;
+ int link_follow;
+
+ link_follow = (follow == AT_SYMLINK_NOFOLLOW) ? NO_FOLLOW : FOLLOW;
+lookup:
+ if (error = cstatat_getvp(fd, name, nmflag, link_follow, &vp, &cred))
+ return (set_errno(error));
+ error = cstat(vp, sb, flags, cred);
+ crfree(cred);
+ VN_RELE(vp);
+ if (error != 0) {
+ if (error == ESTALE &&
+ (nmflag == 1 || (nmflag == 2 && name != NULL)))
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+#if defined(_SYSCALL32_IMPL)
+
+/*
+ * 64-bit kernel, 32-bit applications, 32-bit file offsets
+ */
+static int cstatat32(int, char *, int, struct stat32 *, int, int);
+static int cstat32(vnode_t *, struct stat32 *, int, cred_t *);
+
+int
+stat32(char *fname, struct stat32 *sb)
+{
+ return (cstatat32(AT_FDCWD, fname, 1, sb, 0, ATTR_REAL));
+}
+
+int
+lstat32(char *fname, struct stat32 *sb)
+{
+ return (cstatat32(AT_FDCWD, fname, 1, sb, AT_SYMLINK_NOFOLLOW, 0));
+}
+
+int
+fstat32(int fd, struct stat32 *sb)
+{
+ FSTAT_BODY(fd, sb, cstat32)
+}
+
+int
+fstatat32(int fd, char *name, struct stat32 *sb, int flag)
+{
+ return (cstatat32(fd, name, 2, sb, flag, 0));
+}
+
+#if defined(__i386_COMPAT)
+
+/*
+ * Syscalls for i386 applications that issue {,l,f}xstat() directly
+ */
+int
+xstat32(int version, char *fname, struct stat32 *sb)
+{
+ XSTAT_BODY(version, fname, sb, stat32)
+}
+
+int
+lxstat32(int version, char *fname, struct stat32 *sb)
+{
+ XSTAT_BODY(version, fname, sb, lstat32)
+}
+
+int
+fxstat32(int version, int fd, struct stat32 *sb)
+{
+ XSTAT_BODY(version, fd, sb, fstat32)
+}
+
+#endif /* __i386_COMPAT */
+
+static int
+cstat32(vnode_t *vp, struct stat32 *ubp, int flag, struct cred *cr)
+{
+ struct vfssw *vswp;
+ struct stat32 sb;
+ vattr_t vattr;
+ int error;
+ dev32_t st_dev, st_rdev;
+
+ vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE;
+ if (error = VOP_GETATTR(vp, &vattr, flag, cr))
+ return (error);
+
+ /* devices are a special case, see comments in cstat */
+ if ((vattr.va_size > MAXOFF32_T) &&
+ ((vp->v_type == VBLK) || (vp->v_type == VCHR))) {
+ /* OVERFLOW | UNKNOWN_SIZE */
+ vattr.va_size = MAXOFF32_T;
+ }
+
+ /* check for large values */
+ if (!cmpldev(&st_dev, vattr.va_fsid) ||
+ !cmpldev(&st_rdev, vattr.va_rdev) ||
+ vattr.va_size > MAXOFF32_T ||
+ vattr.va_nblocks > INT32_MAX ||
+ vattr.va_nodeid > UINT32_MAX ||
+ TIMESPEC_OVERFLOW(&(vattr.va_atime)) ||
+ TIMESPEC_OVERFLOW(&(vattr.va_mtime)) ||
+ TIMESPEC_OVERFLOW(&(vattr.va_ctime)))
+ return (EOVERFLOW);
+
+ bzero(&sb, sizeof (sb));
+ sb.st_dev = st_dev;
+ sb.st_ino = (ino32_t)vattr.va_nodeid;
+ sb.st_mode = VTTOIF(vattr.va_type) | vattr.va_mode;
+ sb.st_nlink = vattr.va_nlink;
+ sb.st_uid = vattr.va_uid;
+ sb.st_gid = vattr.va_gid;
+ sb.st_rdev = st_rdev;
+ sb.st_size = (off32_t)vattr.va_size;
+ TIMESPEC_TO_TIMESPEC32(&(sb.st_atim), &(vattr.va_atime));
+ TIMESPEC_TO_TIMESPEC32(&(sb.st_mtim), &(vattr.va_mtime));
+ TIMESPEC_TO_TIMESPEC32(&(sb.st_ctim), &(vattr.va_ctime));
+ sb.st_blksize = vattr.va_blksize;
+ sb.st_blocks = (blkcnt32_t)vattr.va_nblocks;
+ if (vp->v_vfsp != NULL) {
+ vswp = &vfssw[vp->v_vfsp->vfs_fstype];
+ if (vswp->vsw_name && *vswp->vsw_name)
+ (void) strcpy(sb.st_fstype, vswp->vsw_name);
+ }
+ if (copyout(&sb, ubp, sizeof (sb)))
+ return (EFAULT);
+ return (0);
+}
+
+static int
+cstatat32(int fd, char *name, int nmflag, struct stat32 *sb,
+ int follow, int flags)
+{
+ vnode_t *vp;
+ int error;
+ cred_t *cred;
+ int link_follow;
+
+ link_follow = (follow == AT_SYMLINK_NOFOLLOW) ? NO_FOLLOW : FOLLOW;
+lookup:
+ if (error = cstatat_getvp(fd, name, nmflag, link_follow, &vp, &cred))
+ return (set_errno(error));
+ error = cstat32(vp, sb, flags, cred);
+ crfree(cred);
+ VN_RELE(vp);
+ if (error != 0) {
+ if (error == ESTALE &&
+ (nmflag == 1 || (nmflag == 2 && name != NULL)))
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+#endif /* _SYSCALL32_IMPL */
+
+#if defined(_ILP32)
+
+/*
+ * 32-bit kernel, 32-bit applications, 64-bit file offsets.
+ *
+ * These routines are implemented differently on 64-bit kernels.
+ */
+static int cstatat64(int, char *, int, struct stat64 *, int, int);
+static int cstat64(vnode_t *, struct stat64 *, int, cred_t *);
+
+int
+stat64(char *fname, struct stat64 *sb)
+{
+ return (cstatat64(AT_FDCWD, fname, 1, sb, 0, ATTR_REAL));
+}
+
+int
+lstat64(char *fname, struct stat64 *sb)
+{
+ return (cstatat64(AT_FDCWD, fname, 1, sb, AT_SYMLINK_NOFOLLOW, 0));
+}
+
+int
+fstat64(int fd, struct stat64 *sb)
+{
+ FSTAT_BODY(fd, sb, cstat64)
+}
+
+int
+fstatat64(int fd, char *name, struct stat64 *sb, int flags)
+{
+ return (cstatat64(fd, name, 2, sb, flags, 0));
+}
+
+static int
+cstat64(vnode_t *vp, struct stat64 *ubp, int flag, cred_t *cr)
+{
+ struct vfssw *vswp;
+ struct stat64 lsb;
+ vattr_t vattr;
+ int error;
+
+ vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE;
+ if (error = VOP_GETATTR(vp, &vattr, flag, cr))
+ return (error);
+
+ bzero(&lsb, sizeof (lsb));
+ lsb.st_dev = vattr.va_fsid;
+ lsb.st_ino = vattr.va_nodeid;
+ lsb.st_mode = VTTOIF(vattr.va_type) | vattr.va_mode;
+ lsb.st_nlink = vattr.va_nlink;
+ lsb.st_uid = vattr.va_uid;
+ lsb.st_gid = vattr.va_gid;
+ lsb.st_rdev = vattr.va_rdev;
+ lsb.st_size = vattr.va_size;
+ lsb.st_atim = vattr.va_atime;
+ lsb.st_mtim = vattr.va_mtime;
+ lsb.st_ctim = vattr.va_ctime;
+ lsb.st_blksize = vattr.va_blksize;
+ lsb.st_blocks = vattr.va_nblocks;
+ if (vp->v_vfsp != NULL) {
+ vswp = &vfssw[vp->v_vfsp->vfs_fstype];
+ if (vswp->vsw_name && *vswp->vsw_name)
+ (void) strcpy(lsb.st_fstype, vswp->vsw_name);
+ }
+ if (copyout(&lsb, ubp, sizeof (lsb)))
+ return (EFAULT);
+ return (0);
+}
+
+static int
+cstatat64(int fd, char *name, int nmflag, struct stat64 *sb,
+ int follow, int flags)
+{
+ vnode_t *vp;
+ int error;
+ cred_t *cred;
+ int link_follow;
+
+ link_follow = (follow == AT_SYMLINK_NOFOLLOW) ? NO_FOLLOW : FOLLOW;
+lookup:
+ if (error = cstatat_getvp(fd, name, nmflag, link_follow, &vp, &cred))
+ return (set_errno(error));
+ error = cstat64(vp, sb, flags, cred);
+ crfree(cred);
+ VN_RELE(vp);
+ if (error != 0) {
+ if (error == ESTALE &&
+ (nmflag == 1 || (nmflag == 2 && name != NULL)))
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+#endif /* _ILP32 */
+
+#if defined(_SYSCALL32_IMPL)
+
+/*
+ * 64-bit kernel, 32-bit applications, 64-bit file offsets.
+ *
+ * We'd really like to call the "native" stat calls for these ones,
+ * but the problem is that the 64-bit ABI defines the 'stat64' structure
+ * differently from the way the 32-bit ABI defines it.
+ */
+
+static int cstatat64_32(int, char *, int, struct stat64_32 *, int, int);
+static int cstat64_32(vnode_t *, struct stat64_32 *, int, cred_t *);
+
+int
+stat64_32(char *fname, struct stat64_32 *sb)
+{
+ return (cstatat64_32(AT_FDCWD, fname, 1, sb, 0, ATTR_REAL));
+}
+
+int
+lstat64_32(char *fname, struct stat64_32 *sb)
+{
+ return (cstatat64_32(AT_FDCWD, fname, 1, sb, AT_SYMLINK_NOFOLLOW, 0));
+}
+
+int
+fstat64_32(int fd, struct stat64_32 *sb)
+{
+ FSTAT_BODY(fd, sb, cstat64_32)
+}
+
+int
+fstatat64_32(int fd, char *name, struct stat64_32 *sb, int flag)
+{
+ return (cstatat64_32(fd, name, 2, sb, flag, 0));
+}
+
+static int
+cstat64_32(vnode_t *vp, struct stat64_32 *ubp, int flag, cred_t *cr)
+{
+ struct vfssw *vswp;
+ struct stat64_32 lsb;
+ vattr_t vattr;
+ int error;
+ dev32_t st_dev, st_rdev;
+
+ vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE;
+ if (error = VOP_GETATTR(vp, &vattr, flag, cr))
+ return (error);
+
+ if (!cmpldev(&st_dev, vattr.va_fsid) ||
+ !cmpldev(&st_rdev, vattr.va_rdev) ||
+ TIMESPEC_OVERFLOW(&(vattr.va_atime)) ||
+ TIMESPEC_OVERFLOW(&(vattr.va_mtime)) ||
+ TIMESPEC_OVERFLOW(&(vattr.va_ctime)))
+ return (EOVERFLOW);
+
+ bzero(&lsb, sizeof (lsb));
+ lsb.st_dev = st_dev;
+ lsb.st_ino = vattr.va_nodeid;
+ lsb.st_mode = VTTOIF(vattr.va_type) | vattr.va_mode;
+ lsb.st_nlink = vattr.va_nlink;
+ lsb.st_uid = vattr.va_uid;
+ lsb.st_gid = vattr.va_gid;
+ lsb.st_rdev = st_rdev;
+ lsb.st_size = vattr.va_size;
+ TIMESPEC_TO_TIMESPEC32(&(lsb.st_atim), &(vattr.va_atime));
+ TIMESPEC_TO_TIMESPEC32(&(lsb.st_mtim), &(vattr.va_mtime));
+ TIMESPEC_TO_TIMESPEC32(&(lsb.st_ctim), &(vattr.va_ctime));
+ lsb.st_blksize = vattr.va_blksize;
+ lsb.st_blocks = vattr.va_nblocks;
+ if (vp->v_vfsp != NULL) {
+ vswp = &vfssw[vp->v_vfsp->vfs_fstype];
+ if (vswp->vsw_name && *vswp->vsw_name)
+ (void) strcpy(lsb.st_fstype, vswp->vsw_name);
+ }
+ if (copyout(&lsb, ubp, sizeof (lsb)))
+ return (EFAULT);
+ return (0);
+}
+
+static int
+cstatat64_32(int fd, char *name, int nmflag, struct stat64_32 *sb,
+ int follow, int flags)
+{
+ vnode_t *vp;
+ int error;
+ cred_t *cred;
+ int link_follow;
+
+ link_follow = (follow == AT_SYMLINK_NOFOLLOW) ? NO_FOLLOW : FOLLOW;
+lookup:
+ if (error = cstatat_getvp(fd, name, nmflag, link_follow, &vp, &cred))
+ return (set_errno(error));
+ error = cstat64_32(vp, sb, flags, cred);
+ crfree(cred);
+ VN_RELE(vp);
+ if (error != 0) {
+ if (error == ESTALE &&
+ (nmflag == 1 || (nmflag == 2 && name != NULL)))
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+#endif /* _SYSCALL32_IMPL */
diff --git a/usr/src/uts/common/syscall/statfs.c b/usr/src/uts/common/syscall/statfs.c
new file mode 100644
index 0000000000..5d8c2cd395
--- /dev/null
+++ b/usr/src/uts/common/syscall/statfs.c
@@ -0,0 +1,164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/fstyp.h>
+#include <sys/systm.h>
+#include <sys/vfs.h>
+#include <sys/statfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/pathname.h>
+
+#include <vm/page.h>
+
+#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
+
+/*
+ * statfs(2) and fstatfs(2) have been replaced by statvfs(2) and
+ * fstatvfs(2) and will be removed from the system in a near-future
+ * release.
+ *
+ * Supported here purely for 32-bit compatibility.
+ */
+
+static int cstatfs(struct vfs *, struct statfs32 *, int);
+
+int
+statfs32(char *fname, struct statfs32 *sbp, int32_t len, int32_t fstyp)
+{
+ vnode_t *vp;
+ int error;
+
+lookup:
+ if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ if (fstyp != 0)
+ error = EINVAL;
+ else
+ error = cstatfs(vp->v_vfsp, sbp, len);
+ VN_RELE(vp);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+int
+fstatfs32(int32_t fdes, struct statfs32 *sbp, int32_t len, int32_t fstyp)
+{
+ struct file *fp;
+ int error;
+
+ if (fstyp != 0)
+ return (set_errno(EINVAL));
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ error = cstatfs(fp->f_vnode->v_vfsp, sbp, len);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Common routine for fstatfs and statfs.
+ */
+static int
+cstatfs(struct vfs *vfsp, struct statfs32 *sbp, int len)
+{
+ struct statfs32 sfs;
+ struct statvfs64 svfs;
+ int error, i;
+ char *cp, *cp2;
+ struct vfssw *vswp;
+
+ if (len < 0 || len > sizeof (struct statfs))
+ return (EINVAL);
+ if (error = VFS_STATVFS(vfsp, &svfs))
+ return (error);
+
+ if (svfs.f_blocks > UINT32_MAX || svfs.f_bfree > UINT32_MAX ||
+ svfs.f_files > UINT32_MAX || svfs.f_ffree > UINT32_MAX)
+ return (EOVERFLOW);
+ /*
+ * Map statvfs fields into the old statfs structure.
+ */
+ bzero(&sfs, sizeof (sfs));
+ sfs.f_bsize = svfs.f_bsize;
+ sfs.f_frsize = (svfs.f_frsize == svfs.f_bsize) ? 0 : svfs.f_frsize;
+ sfs.f_blocks = svfs.f_blocks * (svfs.f_frsize / 512);
+ sfs.f_bfree = svfs.f_bfree * (svfs.f_frsize / 512);
+ sfs.f_files = svfs.f_files;
+ sfs.f_ffree = svfs.f_ffree;
+
+ cp = svfs.f_fstr;
+ cp2 = sfs.f_fname;
+ i = 0;
+ while (i++ < sizeof (sfs.f_fname))
+ if (*cp != '\0')
+ *cp2++ = *cp++;
+ else
+ *cp2++ = '\0';
+ while (*cp != '\0' &&
+ i++ < (sizeof (svfs.f_fstr) - sizeof (sfs.f_fpack)))
+ cp++;
+ (void) strncpy(sfs.f_fpack, cp + 1, sizeof (sfs.f_fpack));
+ if ((vswp = vfs_getvfssw(svfs.f_basetype)) == NULL)
+ sfs.f_fstyp = 0;
+ else {
+ sfs.f_fstyp = vswp - vfssw;
+ vfs_unrefvfssw(vswp);
+ }
+
+ if (copyout(&sfs, sbp, len))
+ return (EFAULT);
+
+ return (0);
+}
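
A worked example of the unit conversion in cstatfs (the figures are illustrative assumptions): statvfs counts blocks in f_frsize units while the legacy statfs structure counts 512-byte blocks, so a file system reporting 1000 fragments of 4096 bytes maps as

	f_blocks = 1000 * (4096 / 512) = 8000	/* 512-byte blocks */

and both views describe the same 4,096,000 bytes.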
+
+#endif /* _SYSCALL32_IMPL || _ILP32 */
diff --git a/usr/src/uts/common/syscall/statvfs.c b/usr/src/uts/common/syscall/statvfs.c
new file mode 100644
index 0000000000..dc0a98153a
--- /dev/null
+++ b/usr/src/uts/common/syscall/statvfs.c
@@ -0,0 +1,366 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Get file system statistics (statvfs and fstatvfs).
+ */
+
+#include <sys/types.h>
+#include <sys/inttypes.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/fstyp.h>
+#include <sys/systm.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/pathname.h>
+
+#include <vm/page.h>
+
+#define STATVFSCOPY(dst, src) \
+ (dst)->f_bsize = (src)->f_bsize; \
+ (dst)->f_frsize = (src)->f_frsize; \
+ (dst)->f_blocks = (src)->f_blocks; \
+ (dst)->f_bfree = (src)->f_bfree; \
+ (dst)->f_bavail = (src)->f_bavail; \
+ (dst)->f_files = (src)->f_files; \
+ (dst)->f_ffree = (src)->f_ffree; \
+ (dst)->f_favail = (src)->f_favail; \
+ (dst)->f_fsid = (src)->f_fsid; \
+ bcopy((src)->f_basetype, (dst)->f_basetype, \
+ sizeof ((dst)->f_basetype)); \
+ (dst)->f_flag = (src)->f_flag; \
+ (dst)->f_namemax = (src)->f_namemax; \
+ bcopy((src)->f_fstr, (dst)->f_fstr, \
+ sizeof ((dst)->f_fstr))
+
+/*
+ * Common routines for statvfs and fstatvfs.
+ */
+
+static int
+cstatvfs32(struct vfs *vfsp, struct statvfs32 *ubp)
+{
+ struct statvfs64 ds64;
+ struct statvfs32 ds32;
+ int error;
+
+#if !defined(lint)
+ ASSERT32(sizeof (struct statvfs) == sizeof (struct statvfs32));
+ ASSERT32(sizeof (struct statvfs64) == sizeof (struct statvfs64_32));
+#endif
+
+ bzero(&ds64, sizeof (ds64));
+ if ((error = VFS_STATVFS(vfsp, &ds64)) != 0)
+ return (error);
+
+ /*
+ * VFS_STATVFS can return data that is incompatible with the space
+	 * available in the 32-bit statvfs structure. Check here whether
+	 * it will fit into the 32-bit structure; if not, return EOVERFLOW.
+ *
+ * The check for -1 is because some file systems return -1 in the
+ * fields that are irrelevant or nonessential, and we do not want
+ * to return EOVERFLOW for them. For example: df is expected to
+ * show -1 in the output for some of these fields on NFS mounted
+ * filesystems.
+ */
+ if (ds64.f_files == (fsfilcnt64_t)-1)
+ ds64.f_files = UINT32_MAX;
+ if (ds64.f_ffree == (fsfilcnt64_t)-1)
+ ds64.f_ffree = UINT32_MAX;
+ if (ds64.f_favail == (fsfilcnt64_t)-1)
+ ds64.f_favail = UINT32_MAX;
+ if (ds64.f_bavail == (fsblkcnt64_t)-1)
+ ds64.f_bavail = UINT32_MAX;
+ if (ds64.f_bfree == (fsblkcnt64_t)-1)
+ ds64.f_bfree = UINT32_MAX;
+
+ if (ds64.f_blocks > UINT32_MAX || ds64.f_bfree > UINT32_MAX ||
+ ds64.f_bavail > UINT32_MAX || ds64.f_files > UINT32_MAX ||
+ ds64.f_ffree > UINT32_MAX || ds64.f_favail > UINT32_MAX)
+ return (EOVERFLOW);
+#ifdef _LP64
+ /*
+ * On the 64-bit kernel, even these fields grow to 64-bit
+ * quantities in the statvfs64 structure.
+ */
+ if (ds64.f_namemax == (ulong_t)-1l)
+ ds64.f_namemax = UINT32_MAX;
+
+ if (ds64.f_bsize > UINT32_MAX || ds64.f_frsize > UINT32_MAX ||
+ ds64.f_fsid > UINT32_MAX || ds64.f_flag > UINT32_MAX ||
+ ds64.f_namemax > UINT32_MAX)
+ return (EOVERFLOW);
+#endif
+
+ bzero(&ds32, sizeof (ds32));
+ STATVFSCOPY(&ds32, &ds64);
+ if (copyout(&ds32, ubp, sizeof (ds32)) != 0)
+ return (EFAULT);
+ return (0);
+}
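
Why the -1 markers must be normalized before the overflow test: as unsigned 64-bit values they compare greater than UINT32_MAX. A minimal stand-alone illustration (not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t f_files = (uint64_t)-1;	/* "not meaningful" marker */

		(void) printf("%d\n", f_files > UINT32_MAX);	/* 1: EOVERFLOW */
		f_files = UINT32_MAX;			/* normalized, as above */
		(void) printf("%d\n", f_files > UINT32_MAX);	/* 0: passes */
		return (0);
	}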
+
+static int
+cstatvfs64(struct vfs *vfsp, struct statvfs64 *ubp)
+{
+ struct statvfs64 ds64;
+ int error;
+
+#if !defined(lint)
+ ASSERT64(sizeof (struct statvfs) == sizeof (struct statvfs64));
+#endif
+ bzero(&ds64, sizeof (ds64));
+ if ((error = VFS_STATVFS(vfsp, &ds64)) != 0)
+ return (error);
+ if (copyout(&ds64, ubp, sizeof (ds64)) != 0)
+ return (EFAULT);
+ return (0);
+}
+
+/*
+ * Native system calls
+ */
+int
+statvfs(char *fname, struct statvfs *sbp)
+{
+ vnode_t *vp;
+ int error;
+
+lookup:
+ if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+#ifdef _LP64
+ error = cstatvfs64(vp->v_vfsp, (struct statvfs64 *)sbp);
+#else
+ error = cstatvfs32(vp->v_vfsp, (struct statvfs32 *)sbp);
+#endif
+ VN_RELE(vp);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+int
+fstatvfs(int fdes, struct statvfs *sbp)
+{
+ struct file *fp;
+ int error;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+#ifdef _LP64
+ error = cstatvfs64(fp->f_vnode->v_vfsp, (struct statvfs64 *)sbp);
+#else
+ error = cstatvfs32(fp->f_vnode->v_vfsp, (struct statvfs32 *)sbp);
+#endif
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+#if defined(_ILP32)
+
+/*
+ * Large File system calls.
+ *
+ * (We deliberately don't have special "large file" system calls in the
+ * 64-bit kernel -- we just use the native versions, since they're just
+ * as functional.)
+ */
+int
+statvfs64(char *fname, struct statvfs64 *sbp)
+{
+ vnode_t *vp;
+ int error;
+
+lookup:
+ if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ error = cstatvfs64(vp->v_vfsp, sbp);
+ VN_RELE(vp);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+int
+fstatvfs64(int fdes, struct statvfs64 *sbp)
+{
+ struct file *fp;
+ int error;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ error = cstatvfs64(fp->f_vnode->v_vfsp, sbp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+#endif /* _ILP32 */
+
+#ifdef _SYSCALL32_IMPL
+
+static int
+cstatvfs64_32(struct vfs *vfsp, struct statvfs64_32 *ubp)
+{
+ struct statvfs64 ds64;
+ struct statvfs64_32 ds64_32;
+ int error;
+
+ bzero(&ds64, sizeof (ds64));
+ if ((error = VFS_STATVFS(vfsp, &ds64)) != 0)
+ return (error);
+
+ /*
+ * On the 64-bit kernel, even these fields grow to 64-bit
+ * quantities in the statvfs64 structure.
+ */
+ if (ds64.f_namemax == (ulong_t)-1l)
+ ds64.f_namemax = UINT32_MAX;
+
+ if (ds64.f_bsize > UINT32_MAX || ds64.f_frsize > UINT32_MAX ||
+ ds64.f_fsid > UINT32_MAX || ds64.f_flag > UINT32_MAX ||
+ ds64.f_namemax > UINT32_MAX)
+ return (EOVERFLOW);
+
+ STATVFSCOPY(&ds64_32, &ds64);
+ if (copyout(&ds64_32, ubp, sizeof (ds64_32)) != 0)
+ return (EFAULT);
+ return (0);
+}
+
+/*
+ * ILP32 "small file" system calls on LP64 kernel
+ */
+int
+statvfs32(char *fname, struct statvfs32 *sbp)
+{
+ vnode_t *vp;
+ int error;
+
+lookup:
+ if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ error = cstatvfs32(vp->v_vfsp, sbp);
+ VN_RELE(vp);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+int
+fstatvfs32(int fdes, struct statvfs32 *sbp)
+{
+ struct file *fp;
+ int error;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ error = cstatvfs32(fp->f_vnode->v_vfsp, sbp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * ILP32 Large File system calls on LP64 kernel
+ */
+int
+statvfs64_32(char *fname, struct statvfs64_32 *sbp)
+{
+ vnode_t *vp;
+ int error;
+
+lookup:
+ if (error = lookupname(fname, UIO_USERSPACE, FOLLOW, NULLVPP, &vp)) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ error = cstatvfs64_32(vp->v_vfsp, sbp);
+ VN_RELE(vp);
+ if (error) {
+ if (error == ESTALE)
+ goto lookup;
+ return (set_errno(error));
+ }
+ return (0);
+}
+
+int
+fstatvfs64_32(int fdes, struct statvfs64_32 *sbp)
+{
+ struct file *fp;
+ int error;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (set_errno(EBADF));
+ error = cstatvfs64_32(fp->f_vnode->v_vfsp, sbp);
+ releasef(fdes);
+ if (error)
+ return (set_errno(error));
+ return (0);
+}
+
+#endif /* _SYSCALL32_IMPL */
diff --git a/usr/src/uts/common/syscall/strcalls.c b/usr/src/uts/common/syscall/strcalls.c
new file mode 100644
index 0000000000..bdde97a39d
--- /dev/null
+++ b/usr/src/uts/common/syscall/strcalls.c
@@ -0,0 +1,537 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/stropts.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/fs/fifonode.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/debug.h>
+
+/*
+ * STREAMS system calls.
+ */
+
+int getmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int *flagsp);
+int putmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int flags);
+int getpmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int *prip,
+ int *flagsp);
+int putpmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int pri,
+ int flags);
+
+static int msgio(int fdes, struct strbuf *ctl, struct strbuf *data, int *rval,
+ int mode, unsigned char *prip, int *flagsp);
+
+int
+getmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int *flagsp)
+{
+ int error;
+ int localflags;
+ int realflags = 0;
+ unsigned char pri = 0;
+ int rv = 0;
+
+ /*
+ * Convert between old flags (localflags) and new flags (realflags).
+ */
+ if (copyin(flagsp, &localflags, sizeof (*flagsp)))
+ return (set_errno(EFAULT));
+ switch (localflags) {
+ case 0:
+ realflags = MSG_ANY;
+ break;
+
+ case RS_HIPRI:
+ realflags = MSG_HIPRI;
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ if ((error = msgio(fdes, ctl, data, &rv, FREAD, &pri,
+ &realflags)) == 0) {
+ /*
+		 * Translate realflags back into the old localflags form.
+ */
+ if (realflags == MSG_HIPRI)
+ localflags = RS_HIPRI;
+ else
+ localflags = 0;
+ if (copyout(&localflags, flagsp, sizeof (*flagsp)))
+ error = EFAULT;
+ }
+ if (error != 0)
+ return (set_errno(error));
+ return (rv);
+}
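
For reference, a hedged user-space sketch of the old-style flag convention getmsg() accepts (0 for a message of any priority, RS_HIPRI for high-priority only); the buffers and descriptor are the caller's concern:

	#include <stropts.h>

	static int
	read_any_msg(int fd, struct strbuf *ctl, struct strbuf *dat)
	{
		int flags = 0;	/* 0 -> MSG_ANY; RS_HIPRI -> MSG_HIPRI */

		return (getmsg(fd, ctl, dat, &flags));
	}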
+
+int
+putmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int flags)
+{
+ unsigned char pri = 0;
+ int realflags;
+ int error;
+ int rv = 0;
+
+ switch (flags) {
+ case RS_HIPRI:
+ realflags = MSG_HIPRI;
+ break;
+ case (RS_HIPRI|MSG_XPG4):
+ realflags = MSG_HIPRI|MSG_XPG4;
+ break;
+ case MSG_XPG4:
+ realflags = MSG_BAND|MSG_XPG4;
+ break;
+ case 0:
+ realflags = MSG_BAND;
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+ error = msgio(fdes, ctl, data, &rv, FWRITE, &pri, &realflags);
+ if (error != 0)
+ return (set_errno(error));
+ return (rv);
+}
+
+
+int
+getpmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int *prip,
+ int *flagsp)
+{
+ int error;
+ int flags;
+ int intpri;
+ unsigned char pri;
+ int rv = 0;
+
+ if (copyin(flagsp, &flags, sizeof (flags)))
+ return (set_errno(EFAULT));
+ if (copyin(prip, &intpri, sizeof (intpri)))
+ return (set_errno(EFAULT));
+ if ((intpri > 255) || (intpri < 0))
+ return (set_errno(EINVAL));
+ pri = (unsigned char)intpri;
+ error = msgio(fdes, ctl, data, &rv, FREAD, &pri, &flags);
+ if (error != 0)
+ return (set_errno(error));
+ if (copyout(&flags, flagsp, sizeof (flags)))
+ return (set_errno(EFAULT));
+ intpri = (int)pri;
+ if (copyout(&intpri, prip, sizeof (intpri)))
+ return (set_errno(EFAULT));
+ return (rv);
+}
+
+int
+putpmsg(int fdes, struct strbuf *ctl, struct strbuf *data, int intpri,
+ int flags)
+{
+ unsigned char pri;
+ int rv = 0;
+ int error;
+
+ if ((intpri > 255) || (intpri < 0))
+ return (set_errno(EINVAL));
+ pri = (unsigned char)intpri;
+ error = msgio(fdes, ctl, data, &rv, FWRITE, &pri, &flags);
+ if (error != 0)
+ return (set_errno(error));
+ return (rv);
+}
+
+/*
+ * Common code for getmsg and putmsg calls: check permissions,
+ * copy in args, do preliminary setup, and switch to
+ * appropriate stream routine.
+ */
+static int
+msgio(int fdes, struct strbuf *ctl, struct strbuf *data, int *rval,
+ int mode, unsigned char *prip, int *flagsp)
+{
+ file_t *fp;
+ vnode_t *vp;
+ struct strbuf msgctl, msgdata;
+ int error;
+ int flag;
+ klwp_t *lwp = ttolwp(curthread);
+ rval_t rv;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (EBADF);
+ if ((fp->f_flag & mode) == 0) {
+ releasef(fdes);
+ return (EBADF);
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type == VFIFO) {
+ if (vp->v_stream) {
+ /*
+			 * Must use sd_vnode; this could be a named pipe.
+ */
+ (void) fifo_vfastoff(vp->v_stream->sd_vnode);
+ } else {
+ releasef(fdes);
+ return (ENOSTR);
+ }
+ } else if ((vp->v_type != VCHR && vp->v_type != VSOCK) ||
+ vp->v_stream == NULL) {
+ releasef(fdes);
+ return (ENOSTR);
+ }
+ if ((ctl != NULL) &&
+ copyin(ctl, &msgctl, sizeof (struct strbuf))) {
+ releasef(fdes);
+ return (EFAULT);
+ }
+ if ((data != NULL) &&
+ copyin(data, &msgdata, sizeof (struct strbuf))) {
+ releasef(fdes);
+ return (EFAULT);
+ }
+
+ if (mode == FREAD) {
+ if (ctl == NULL)
+ msgctl.maxlen = -1;
+ if (data == NULL)
+ msgdata.maxlen = -1;
+ flag = fp->f_flag;
+ rv.r_val1 = 0;
+ if (vp->v_type == VSOCK) {
+ error = sock_getmsg(vp, &msgctl, &msgdata, prip,
+ flagsp, flag, &rv);
+ } else {
+ error = strgetmsg(vp, &msgctl, &msgdata, prip,
+ flagsp, flag, &rv);
+ }
+ *rval = rv.r_val1;
+ if (error != 0) {
+ releasef(fdes);
+ return (error);
+ }
+ if (lwp != NULL)
+ lwp->lwp_ru.msgrcv++;
+ if (((ctl != NULL) &&
+ copyout(&msgctl, ctl, sizeof (struct strbuf))) ||
+ ((data != NULL) &&
+ copyout(&msgdata, data, sizeof (struct strbuf)))) {
+ releasef(fdes);
+ return (EFAULT);
+ }
+ releasef(fdes);
+ return (0);
+ }
+
+ /*
+ * FWRITE case
+ */
+ if (ctl == NULL)
+ msgctl.len = -1;
+ if (data == NULL)
+ msgdata.len = -1;
+ flag = fp->f_flag;
+ if (vp->v_type == VSOCK) {
+ error = sock_putmsg(vp, &msgctl, &msgdata, *prip, *flagsp,
+ flag);
+ } else {
+ error = strputmsg(vp, &msgctl, &msgdata, *prip, *flagsp, flag);
+ }
+ releasef(fdes);
+ if (error == 0 && lwp != NULL)
+ lwp->lwp_ru.msgsnd++;
+ return (error);
+}
+
+
+#if defined(_LP64) && defined(_SYSCALL32)
+
+static int msgio32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data,
+ int *rval, int mode, unsigned char *prip, int *flagsp);
+
+int
+getmsg32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int32_t *flagsp)
+{
+ int error;
+ int32_t localflags;
+ int realflags = 0;
+ unsigned char pri = 0;
+ int rv = 0;
+
+ /*
+ * Convert between old flags (localflags) and new flags (realflags).
+ */
+ if (copyin(flagsp, &localflags, sizeof (*flagsp)))
+ return (set_errno(EFAULT));
+ switch (localflags) {
+ case 0:
+ realflags = MSG_ANY;
+ break;
+
+ case RS_HIPRI:
+ realflags = MSG_HIPRI;
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+
+ if ((error = msgio32(fdes, ctl, data, &rv, FREAD, &pri,
+ &realflags)) == 0) {
+ /*
+		 * Translate realflags back into the old localflags form.
+ */
+ if (realflags == MSG_HIPRI)
+ localflags = RS_HIPRI;
+ else
+ localflags = 0;
+ if (copyout(&localflags, flagsp, sizeof (*flagsp)))
+ error = EFAULT;
+ }
+ if (error != 0)
+ return (set_errno(error));
+ return (rv);
+}
+
+int
+putmsg32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int32_t flags)
+{
+ unsigned char pri = 0;
+ int realflags;
+ int error;
+ int rv = 0;
+
+ switch (flags) {
+ case RS_HIPRI:
+ realflags = MSG_HIPRI;
+ break;
+ case (RS_HIPRI|MSG_XPG4):
+ realflags = MSG_HIPRI|MSG_XPG4;
+ break;
+ case MSG_XPG4:
+ realflags = MSG_BAND|MSG_XPG4;
+ break;
+ case 0:
+ realflags = MSG_BAND;
+ break;
+
+ default:
+ return (set_errno(EINVAL));
+ }
+ error = msgio32(fdes, ctl, data, &rv, FWRITE, &pri, &realflags);
+ if (error != 0)
+ return (set_errno(error));
+ return (rv);
+}
+
+
+int
+getpmsg32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int32_t *prip,
+ int32_t *flagsp)
+{
+ int error;
+ int32_t flags;
+ int32_t intpri;
+ unsigned char pri;
+ int rv = 0;
+
+ if (copyin(flagsp, &flags, sizeof (*flagsp)))
+ return (set_errno(EFAULT));
+ if (copyin(prip, &intpri, sizeof (intpri)))
+ return (set_errno(EFAULT));
+ if ((intpri > 255) || (intpri < 0))
+ return (set_errno(EINVAL));
+ pri = (unsigned char)intpri;
+ error = msgio32(fdes, ctl, data, &rv, FREAD, &pri, &flags);
+ if (error != 0)
+ return (set_errno(error));
+ if (copyout(&flags, flagsp, sizeof (flags)))
+ return (set_errno(EFAULT));
+ intpri = (int)pri;
+ if (copyout(&intpri, prip, sizeof (intpri)))
+ return (set_errno(EFAULT));
+ return (rv);
+}
+
+int
+putpmsg32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int32_t intpri,
+ int32_t flags)
+{
+ unsigned char pri;
+ int rv = 0;
+ int error;
+
+ if ((intpri > 255) || (intpri < 0))
+ return (set_errno(EINVAL));
+ pri = (unsigned char)intpri;
+ error = msgio32(fdes, ctl, data, &rv, FWRITE, &pri, &flags);
+ if (error != 0)
+ return (set_errno(error));
+ return (rv);
+}
+
+/*
+ * Common code for getmsg and putmsg calls: check permissions,
+ * copy in args, do preliminary setup, and switch to
+ * appropriate stream routine.
+ */
+static int
+msgio32(int fdes, struct strbuf32 *ctl, struct strbuf32 *data, int *rval,
+ int mode, unsigned char *prip, int *flagsp)
+{
+ file_t *fp;
+ vnode_t *vp;
+ struct strbuf32 msgctl32, msgdata32;
+ struct strbuf msgctl, msgdata;
+ int error;
+ int flag;
+ klwp_t *lwp = ttolwp(curthread);
+ rval_t rv;
+
+ if ((fp = getf(fdes)) == NULL)
+ return (EBADF);
+ if ((fp->f_flag & mode) == 0) {
+ releasef(fdes);
+ return (EBADF);
+ }
+ vp = fp->f_vnode;
+ if (vp->v_type == VFIFO) {
+ if (vp->v_stream) {
+ /*
+			 * Must use sd_vnode; this could be a named pipe.
+ */
+ (void) fifo_vfastoff(vp->v_stream->sd_vnode);
+ } else {
+ releasef(fdes);
+ return (ENOSTR);
+ }
+ } else if ((vp->v_type != VCHR && vp->v_type != VSOCK) ||
+ vp->v_stream == NULL) {
+ releasef(fdes);
+ return (ENOSTR);
+ }
+ if (ctl != NULL) {
+ if (copyin(ctl, &msgctl32, sizeof (msgctl32))) {
+ releasef(fdes);
+ return (EFAULT);
+ }
+ msgctl.len = msgctl32.len;
+ msgctl.maxlen = msgctl32.maxlen;
+ msgctl.buf = (caddr_t)(uintptr_t)msgctl32.buf;
+ }
+ if (data != NULL) {
+ if (copyin(data, &msgdata32, sizeof (msgdata32))) {
+ releasef(fdes);
+ return (EFAULT);
+ }
+ msgdata.len = msgdata32.len;
+ msgdata.maxlen = msgdata32.maxlen;
+ msgdata.buf = (caddr_t)(uintptr_t)msgdata32.buf;
+ }
+
+ if (mode == FREAD) {
+ if (ctl == NULL)
+ msgctl.maxlen = -1;
+ if (data == NULL)
+ msgdata.maxlen = -1;
+ flag = fp->f_flag;
+ rv.r_val1 = 0;
+ if (vp->v_type == VSOCK) {
+ error = sock_getmsg(vp, &msgctl, &msgdata, prip,
+ flagsp, flag, &rv);
+ } else {
+ error = strgetmsg(vp, &msgctl, &msgdata, prip,
+ flagsp, flag, &rv);
+ }
+ *rval = rv.r_val1;
+ if (error != 0) {
+ releasef(fdes);
+ return (error);
+ }
+ if (lwp != NULL)
+ lwp->lwp_ru.msgrcv++;
+ if (ctl != NULL) {
+ /* XX64 - range check */
+ msgctl32.len = msgctl.len;
+ msgctl32.maxlen = msgctl.maxlen;
+ msgctl32.buf = (caddr32_t)(uintptr_t)msgctl.buf;
+ if (copyout(&msgctl32, ctl, sizeof (msgctl32))) {
+ releasef(fdes);
+ return (EFAULT);
+ }
+ }
+ if (data != NULL) {
+ /* XX64 - range check */
+ msgdata32.len = msgdata.len;
+ msgdata32.maxlen = msgdata.maxlen;
+ msgdata32.buf = (caddr32_t)(uintptr_t)msgdata.buf;
+ if (copyout(&msgdata32, data, sizeof (msgdata32))) {
+ releasef(fdes);
+ return (EFAULT);
+ }
+ }
+ releasef(fdes);
+ return (0);
+ }
+
+ /*
+ * FWRITE case
+ */
+ if (ctl == NULL)
+ msgctl.len = -1;
+ if (data == NULL)
+ msgdata.len = -1;
+ flag = fp->f_flag;
+ if (vp->v_type == VSOCK) {
+ error = sock_putmsg(vp, &msgctl, &msgdata, *prip, *flagsp,
+ flag);
+ } else {
+ error = strputmsg(vp, &msgctl, &msgdata, *prip, *flagsp, flag);
+ }
+ releasef(fdes);
+ if (error == 0 && lwp != NULL)
+ lwp->lwp_ru.msgsnd++;
+ return (error);
+}
+
+#endif /* _LP64 && _SYSCALL32 */
diff --git a/usr/src/uts/common/syscall/symlink.c b/usr/src/uts/common/syscall/symlink.c
new file mode 100644
index 0000000000..2ce51d24cd
--- /dev/null
+++ b/usr/src/uts/common/syscall/symlink.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/debug.h>
+#include <c2/audit.h>
+
+/*
+ * Create a symbolic link. Similar to link or rename except target
+ * name is passed as string argument, not converted to vnode reference.
+ */
+int
+symlink(char *target, char *linkname)
+{
+ vnode_t *dvp;
+ struct vattr vattr;
+ struct pathname lpn;
+ char *tbuf;
+ size_t tlen;
+ int error;
+
+top:
+ if (error = pn_get(linkname, UIO_USERSPACE, &lpn))
+ return (set_errno(error));
+ if (error = lookuppn(&lpn, NULL, NO_FOLLOW, &dvp, NULLVPP)) {
+ pn_free(&lpn);
+ if (error == ESTALE)
+ goto top;
+ return (set_errno(error));
+ }
+ if (vn_is_readonly(dvp))
+ error = EROFS;
+ else if (pn_fixslash(&lpn))
+ error = ENOTDIR;
+ else {
+ tbuf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ if ((error = copyinstr(target, tbuf, MAXPATHLEN, &tlen)) == 0) {
+ vattr.va_type = VLNK;
+ vattr.va_mode = 0777;
+ vattr.va_mask = AT_TYPE|AT_MODE;
+ error = VOP_SYMLINK(dvp, lpn.pn_path, &vattr,
+ tbuf, CRED());
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_symlink_create(dvp, lpn.pn_path,
+ tbuf, error);
+#endif /* C2_AUDIT */
+ }
+ kmem_free(tbuf, MAXPATHLEN);
+ }
+ pn_free(&lpn);
+ VN_RELE(dvp);
+ if (error) {
+ if (error == ESTALE)
+ goto top;
+ return (set_errno(error));
+ }
+ return (0);
+}
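
Because the target is stored verbatim rather than resolved, a link to a nonexistent path is legal; a hedged illustration (paths are arbitrary):

	#include <unistd.h>

	static void
	make_dangling_link(void)
	{
		/* Succeeds even though the target does not exist. */
		(void) symlink("/no/such/target", "/tmp/dangling");
	}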
diff --git a/usr/src/uts/common/syscall/sync.c b/usr/src/uts/common/syscall/sync.c
new file mode 100644
index 0000000000..14945aa1d8
--- /dev/null
+++ b/usr/src/uts/common/syscall/sync.c
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* Copyright (c) 1994 Sun Microsystems, Inc. */
+/* All Rights Reserved */
+
+
+
+#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4 1.42 */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/vfs.h>
+
+int
+syssync()
+{
+ vfs_sync(0);
+ return (0);
+}
diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c
new file mode 100644
index 0000000000..222fcd5739
--- /dev/null
+++ b/usr/src/uts/common/syscall/sysconfig.c
@@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/tuneable.h>
+#include <sys/errno.h>
+#include <sys/var.h>
+#include <sys/signal.h>
+#include <sys/time.h>
+#include <sys/sysconfig.h>
+#include <sys/resource.h>
+#include <sys/ulimit.h>
+#include <sys/unistd.h>
+#include <sys/debug.h>
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/timer.h>
+#include <sys/zone.h>
+
+long
+sysconfig(int which)
+{
+ switch (which) {
+
+ /*
+	 * If it is not handled by mach_sysconfig either,
+	 * it must be EINVAL.
+ */
+ default:
+ return (mach_sysconfig(which)); /* `uname -i`/os */
+
+ case _CONFIG_CLK_TCK:
+ return ((long)hz); /* clock frequency per second */
+
+ case _CONFIG_PROF_TCK:
+ return ((long)hz); /* profiling clock freq per sec */
+
+ case _CONFIG_NGROUPS:
+ /*
+ * Maximum number of supplementary groups.
+ */
+ return (ngroups_max);
+
+ case _CONFIG_OPEN_FILES:
+ /*
+ * Maximum number of open files (soft limit).
+ */
+ {
+ rlim64_t fd_ctl;
+ mutex_enter(&curproc->p_lock);
+ fd_ctl = rctl_enforced_value(
+ rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls,
+ curproc);
+ mutex_exit(&curproc->p_lock);
+ return ((ulong_t)fd_ctl);
+ }
+
+ case _CONFIG_CHILD_MAX:
+ /*
+ * Maximum number of processes.
+ */
+ return (v.v_maxup);
+
+ case _CONFIG_POSIX_VER:
+ return (_POSIX_VERSION); /* current POSIX version */
+
+ case _CONFIG_PAGESIZE:
+ return (PAGESIZE);
+
+ case _CONFIG_XOPEN_VER:
+ return (_XOPEN_VERSION); /* current XOPEN version */
+
+ case _CONFIG_NPROC_CONF:
+ return (zone_ncpus_get(curproc->p_zone));
+
+ case _CONFIG_NPROC_ONLN:
+ return (zone_ncpus_online_get(curproc->p_zone));
+
+ case _CONFIG_NPROC_MAX:
+ return (max_ncpus);
+
+ case _CONFIG_STACK_PROT:
+ return (curproc->p_stkprot & ~PROT_USER);
+
+ case _CONFIG_AIO_LISTIO_MAX:
+ return (_AIO_LISTIO_MAX);
+
+ case _CONFIG_AIO_MAX:
+ return (_AIO_MAX);
+
+ case _CONFIG_AIO_PRIO_DELTA_MAX:
+ return (0);
+
+ case _CONFIG_DELAYTIMER_MAX:
+ return (INT_MAX);
+
+ case _CONFIG_MQ_OPEN_MAX:
+ return (_MQ_OPEN_MAX);
+
+ case _CONFIG_MQ_PRIO_MAX:
+ return (_MQ_PRIO_MAX);
+
+ case _CONFIG_RTSIG_MAX:
+ return (_SIGRTMAX - _SIGRTMIN + 1);
+
+ case _CONFIG_SEM_NSEMS_MAX:
+ return (_SEM_NSEMS_MAX);
+
+ case _CONFIG_SEM_VALUE_MAX:
+ return (_SEM_VALUE_MAX);
+
+ case _CONFIG_SIGQUEUE_MAX:
+ return (_SIGQUEUE_MAX);
+
+ case _CONFIG_SIGRT_MIN:
+ return (_SIGRTMIN);
+
+ case _CONFIG_SIGRT_MAX:
+ return (_SIGRTMAX);
+
+ case _CONFIG_TIMER_MAX:
+ return (_TIMER_MAX);
+
+ case _CONFIG_PHYS_PAGES:
+ return (physinstalled);
+
+ case _CONFIG_AVPHYS_PAGES:
+ return (freemem);
+
+ case _CONFIG_MAXPID:
+ return (maxpid);
+
+ case _CONFIG_CPUID_MAX:
+ return (max_cpuid);
+
+ case _CONFIG_SYMLOOP_MAX:
+ return (MAXSYMLINKS);
+ }
+}
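
User programs normally reach this table through sysconf(3C); assuming the usual Solaris libc mapping of _SC_* names onto _CONFIG_* commands, an illustrative sketch:

	#include <stdio.h>
	#include <unistd.h>

	static void
	show_config(void)
	{
		/* Each _SC_ name is serviced by a matching _CONFIG_ case above. */
		(void) printf("CLK_TCK    = %ld\n", sysconf(_SC_CLK_TCK));
		(void) printf("NPROC_ONLN = %ld\n", sysconf(_SC_NPROCESSORS_ONLN));
		(void) printf("PHYS_PAGES = %ld\n", sysconf(_SC_PHYS_PAGES));
	}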
diff --git a/usr/src/uts/common/syscall/sysfs.c b/usr/src/uts/common/syscall/sysfs.c
new file mode 100644
index 0000000000..58b760bf29
--- /dev/null
+++ b/usr/src/uts/common/syscall/sysfs.c
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1993 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/fstyp.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/cmn_err.h>
+#include <sys/buf.h>
+#include <sys/debug.h>
+#include <sys/pathname.h>
+
+/*
+ * System call to map fstype numbers to names, and vice versa.
+ */
+
+static int sysfsind(char *);
+static int sysfstyp(int, char *);
+
+int
+sysfs(int opcode, long a1, long a2)
+{
+ int error;
+
+ switch (opcode) {
+ case GETFSIND:
+ error = sysfsind((char *)a1);
+ break;
+ case GETFSTYP:
+ error = sysfstyp((int)a1, (char *)a2);
+ break;
+ case GETNFSTYP:
+ /*
+ * Return number of fstypes configured in the system.
+ */
+ return (nfstype - 1);
+ default:
+ error = set_errno(EINVAL);
+ }
+
+ return (error);
+}
+
+static int
+sysfsind(char *fsname)
+{
+ /*
+ * Translate fs identifier to an index into the vfssw structure.
+ */
+ struct vfssw *vswp;
+ char fsbuf[FSTYPSZ];
+ int retval;
+ size_t len = 0;
+
+ retval = copyinstr(fsname, fsbuf, FSTYPSZ, &len);
+ if (retval == ENOENT) /* XXX */
+ retval = EINVAL; /* XXX */
+ if (len == 1) /* Includes null byte */
+ retval = EINVAL;
+ if (retval)
+ return (set_errno(retval));
+ /*
+ * Search the vfssw table for the fs identifier
+ * and return the index.
+ */
+ if ((vswp = vfs_getvfssw(fsbuf)) != NULL) {
+ retval = vswp - vfssw;
+ vfs_unrefvfssw(vswp);
+ return (retval);
+ }
+
+ return (set_errno(EINVAL));
+}
+
+static int
+sysfstyp(int index, char *cbuf)
+{
+ /*
+ * Translate fstype index into an fs identifier.
+ */
+ char *src;
+ struct vfssw *vswp;
+ char *osrc;
+ int error = 0;
+
+ if (index <= 0 || index >= nfstype)
+ return (set_errno(EINVAL));
+ RLOCK_VFSSW();
+ vswp = &vfssw[index];
+
+ osrc = src = vswp->vsw_name;
+ while (*src++)
+ ;
+
+ if (copyout(osrc, cbuf, src - osrc))
+ error = set_errno(EFAULT);
+ RUNLOCK_VFSSW();
+ return (error);
+}
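
Together the three opcodes give a round trip between file-system names and vfssw indices; a hedged usage sketch following the sysfs(2) interface:

	#include <sys/fstyp.h>
	#include <sys/fsid.h>
	#include <stdio.h>

	static void
	fstype_roundtrip(void)
	{
		char name[FSTYPSZ];
		int idx;

		if ((idx = sysfs(GETFSIND, "ufs")) != -1 &&
		    sysfs(GETFSTYP, idx, name) == 0)
			(void) printf("%d -> %s\n", idx, name);
		(void) printf("%d types configured\n", sysfs(GETNFSTYP));
	}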
diff --git a/usr/src/uts/common/syscall/systeminfo.c b/usr/src/uts/common/syscall/systeminfo.c
new file mode 100644
index 0000000000..91c8e73ee4
--- /dev/null
+++ b/usr/src/uts/common/syscall/systeminfo.c
@@ -0,0 +1,329 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All rights reserved. */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/tuneable.h>
+#include <sys/errno.h>
+#include <sys/cred.h>
+#include <sys/utsname.h>
+#include <sys/systeminfo.h>
+#include <sys/unistd.h>
+#include <sys/debug.h>
+#include <sys/bootconf.h>
+#include <sys/socket.h>
+#include <sys/policy.h>
+#include <net/if.h>
+#include <sys/sunddi.h>
+#include <sys/promif.h>
+#include <sys/zone.h>
+#include <sys/model.h>
+
+static void get_netif_name(char *, char *);
+
+long
+systeminfo(int command, char *buf, long count)
+{
+ int error = 0;
+ long strcnt, getcnt;
+ char *kstr;
+
+ if (count < 0 && command != SI_SET_HOSTNAME &&
+ command != SI_SET_SRPC_DOMAIN)
+ return (set_errno(EINVAL));
+
+ /*
+ * Deal with the common "get a string" case first.
+ */
+ switch (command) {
+ case SI_SYSNAME:
+ kstr = utsname.sysname;
+ break;
+ case SI_HOSTNAME:
+ kstr = uts_nodename();
+ break;
+ case SI_RELEASE:
+ kstr = utsname.release;
+ break;
+ case SI_VERSION:
+ kstr = utsname.version;
+ break;
+ case SI_MACHINE:
+ kstr = utsname.machine;
+ break;
+#ifdef _LP64
+ case SI_ARCHITECTURE_64:
+ case SI_ARCHITECTURE_K:
+ kstr = architecture;
+ break;
+ case SI_ARCHITECTURE_32:
+ case SI_ARCHITECTURE:
+ kstr = architecture_32;
+ break;
+ case SI_ARCHITECTURE_NATIVE:
+ kstr = get_udatamodel() == DATAMODEL_NATIVE ?
+ architecture : architecture_32;
+ break;
+#else
+ case SI_ARCHITECTURE_K:
+ case SI_ARCHITECTURE_32:
+ case SI_ARCHITECTURE:
+ case SI_ARCHITECTURE_NATIVE:
+ kstr = architecture;
+ break;
+#endif
+ case SI_HW_SERIAL:
+ kstr = hw_serial;
+ break;
+ case SI_HW_PROVIDER:
+ kstr = hw_provider;
+ break;
+ case SI_SRPC_DOMAIN:
+ kstr = curproc->p_zone->zone_domain;
+ break;
+ case SI_PLATFORM:
+ kstr = platform;
+ break;
+ case SI_ISALIST:
+ kstr = isa_list;
+ break;
+ default:
+ kstr = NULL;
+ break;
+ }
+
+ if (kstr != NULL) {
+ if ((strcnt = strlen(kstr)) >= count) {
+ getcnt = count - 1;
+ if (subyte(buf + count - 1, 0) < 0)
+ return (set_errno(EFAULT));
+ } else
+ getcnt = strcnt + 1;
+ if (copyout(kstr, buf, getcnt))
+ return (set_errno(EFAULT));
+ return (strcnt + 1);
+ }
+
+ switch (command) {
+ case SI_DHCP_CACHE:
+ {
+ char *tmp;
+
+ if (dhcack == NULL) {
+ tmp = "";
+ strcnt = 0;
+ } else {
+ /*
+ * If the interface name has not yet been resolved
+ * (first IFNAMSIZ bytes of dhcack[]) and a valid
+ * netdev_path[] was stashed by loadrootmodules in
+ * swapgeneric.c, resolve the interface name now.
+ */
+ if (dhcack[0] == '\0' &&
+ netdev_path != NULL && netdev_path[0] != '\0') {
+ get_netif_name(netdev_path, dhcack);
+ }
+
+ tmp = dhcack;
+ strcnt = IFNAMSIZ + strlen(&tmp[IFNAMSIZ]);
+ }
+
+ getcnt = (strcnt >= count) ? count : strcnt + 1;
+
+ if (copyout(tmp, buf, getcnt)) {
+ error = EFAULT;
+ break;
+ }
+
+ if (strcnt >= count && subyte((buf + count - 1), 0) < 0) {
+ error = EFAULT;
+ break;
+ }
+
+ return (strcnt + 1);
+ }
+
+ case SI_SET_HOSTNAME:
+ {
+ size_t len;
+ char name[SYS_NMLN];
+ char *name_to_use;
+
+ if ((error = secpolicy_systeminfo(CRED())) != 0)
+ break;
+
+ name_to_use = uts_nodename();
+ if ((error = copyinstr(buf, name, SYS_NMLN, &len)) != 0)
+ break;
+
+ /*
+		 * The name must be a non-empty string and must be
+		 * shorter than SYS_NMLN characters.
+ */
+ if (len < 2 || (len == SYS_NMLN && name[SYS_NMLN-1] != '\0')) {
+ error = EINVAL;
+ break;
+ }
+
+ /*
+ * Copy the name into the relevant zone's nodename.
+ */
+ (void) strcpy(name_to_use, name);
+
+ /*
+ * Notify other interested parties that the nodename was set
+ */
+ if (name_to_use == utsname.nodename) /* global zone nodename */
+ nodename_set();
+
+ return (len);
+ }
+
+ case SI_SET_SRPC_DOMAIN:
+ {
+ char name[SYS_NMLN];
+ size_t len;
+
+ if ((error = secpolicy_systeminfo(CRED())) != 0)
+ break;
+ if ((error = copyinstr(buf, name, SYS_NMLN, &len)) != 0)
+ break;
+ /*
+ * If string passed in is longer than length
+ * allowed for domain name, fail.
+ */
+ if (len == SYS_NMLN && name[SYS_NMLN-1] != '\0') {
+ error = EINVAL;
+ break;
+ }
+
+ (void) strcpy(curproc->p_zone->zone_domain, name);
+ return (len);
+ }
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (set_errno(error));
+}
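
Callers can use the return value, the full string length plus one, to detect truncation and size a retry buffer; a hedged sketch:

	#include <sys/systeminfo.h>
	#include <stdio.h>

	static void
	show_hostname(void)
	{
		char buf[16];
		long need = sysinfo(SI_HOSTNAME, buf, sizeof (buf));

		/* buf always holds a NUL-terminated (possibly truncated) prefix;
		 * 'need' is the buffer size that would hold the whole string. */
		if (need > (long)sizeof (buf))
			(void) printf("truncated; need %ld bytes\n", need);
	}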
+
+/*
+ * i_path_find_node: Internal routine used by path_to_devinfo
+ * to locate a given nodeid in the device tree.
+ */
+struct i_path_findnode {
+ dnode_t nodeid;
+ dev_info_t *dip;
+};
+
+static int
+i_path_find_node(dev_info_t *dev, void *arg)
+{
+ struct i_path_findnode *f = (struct i_path_findnode *)arg;
+
+ if (ddi_get_nodeid(dev) == (int)f->nodeid) {
+ f->dip = dev;
+ return (DDI_WALK_TERMINATE);
+ }
+ return (DDI_WALK_CONTINUE);
+}
+
+/*
+ * Return the devinfo node for a boot device
+ */
+static dev_info_t *
+path_to_devinfo(char *path)
+{
+ struct i_path_findnode fn;
+ extern dev_info_t *top_devinfo;
+
+ /*
+ * Get the nodeid of the given pathname, if such a mapping exists.
+ */
+ fn.dip = NULL;
+ fn.nodeid = prom_finddevice(path);
+ if (fn.nodeid != OBP_BADNODE) {
+ /*
+ * Find the nodeid in our copy of the device tree and return
+ * whatever name we used to bind this node to a driver.
+ */
+ ddi_walk_devs(top_devinfo, i_path_find_node, (void *)(&fn));
+ }
+
+ return (fn.dip);
+}
+
+/*
+ * Determine the network interface name from the device path argument.
+ */
+static void
+get_netif_name(char *devname, char *ifname)
+{
+ dev_info_t *dip;
+ major_t ndev;
+ char *name;
+ int unit;
+
+ dip = path_to_devinfo(devname);
+ if (dip == NULL) {
+ cmn_err(CE_WARN, "get_netif_name: "
+ "can't bind driver for '%s'\n", devname);
+ return;
+ }
+
+ ndev = ddi_driver_major(dip);
+ if (ndev == -1) {
+ cmn_err(CE_WARN, "get_netif_name: "
+ "no driver bound to '%s'\n", devname);
+ return;
+ }
+
+ name = ddi_major_to_name(ndev);
+ if (name == NULL) {
+ cmn_err(CE_WARN, "get_netif_name: "
+ "no name for major number %d\n", ndev);
+ return;
+ }
+
+ unit = i_ddi_devi_get_ppa(dip);
+ if (unit < 0) {
+ cmn_err(CE_WARN, "get_netif_name: "
+ "illegal unit number %d\n", unit);
+ return;
+ }
+
+ (void) snprintf(ifname, IFNAMSIZ, "%s%d", name, unit);
+}
diff --git a/usr/src/uts/common/syscall/tasksys.c b/usr/src/uts/common/syscall/tasksys.c
new file mode 100644
index 0000000000..10b7e95c76
--- /dev/null
+++ b/usr/src/uts/common/syscall/tasksys.c
@@ -0,0 +1,266 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * System calls for creating and inquiring about tasks and projects
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/task.h>
+#include <sys/systm.h>
+#include <sys/project.h>
+#include <sys/cpuvar.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+
+/*
+ * Limit projlist to 256k projects.
+ */
+#define MAX_PROJLIST_BUFSIZE 1048576
+
+typedef struct projlist_walk {
+ projid_t *pw_buf;
+ size_t pw_bufsz;
+} projlist_walk_t;
+
+
+/*
+ * taskid_t tasksys_settaskid(projid_t projid, uint_t flags);
+ *
+ * Overview
+ * Place the calling process in a new task if sufficiently privileged. If the
+ * present task is finalized, the process may not create a new task.
+ *
+ * Return values
+ * 0 on success, errno on failure.
+ */
+static long
+tasksys_settaskid(projid_t projid, uint_t flags)
+{
+ proc_t *p = ttoproc(curthread);
+ kproject_t *oldpj;
+ kproject_t *kpj;
+ task_t *tk, *oldtk;
+ rctl_entity_p_t e;
+ zone_t *zone;
+ int rctlfail = 0;
+
+ if (secpolicy_tasksys(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ if (projid < 0 || projid > MAXPROJID)
+ return (set_errno(EINVAL));
+
+ if (flags & ~TASK_FINAL)
+ return (set_errno(EINVAL));
+
+ mutex_enter(&pidlock);
+ if (p->p_task->tk_flags & TASK_FINAL) {
+ mutex_exit(&pidlock);
+ return (set_errno(EACCES));
+ }
+ mutex_exit(&pidlock);
+
+ /*
+ * Try to stop all other lwps in the process while we're changing
+ * our project. This way, curthread doesn't need to grab its own
+ * thread_lock to find its project ID (see curprojid()). If this
+ * is the /proc agent lwp, we know that the other lwps are already
+ * held. If we failed to hold all lwps, bail out and return EINTR.
+ */
+ if (curthread != p->p_agenttp && !holdlwps(SHOLDFORK1))
+ return (set_errno(EINTR));
+ /*
+ * Put a hold on our new project and make sure that nobody is
+ * trying to bind it to a pool while we're joining.
+ */
+ kpj = project_hold_by_id(projid, getzoneid(), PROJECT_HOLD_INSERT);
+ e.rcep_p.proj = kpj;
+ e.rcep_t = RCENTITY_PROJECT;
+
+ mutex_enter(&p->p_lock);
+ oldpj = p->p_task->tk_proj;
+ zone = p->p_zone;
+
+ mutex_enter(&zone->zone_nlwps_lock);
+
+ if (kpj->kpj_nlwps + p->p_lwpcnt > kpj->kpj_nlwps_ctl)
+ if (rctl_test_entity(rc_project_nlwps, kpj->kpj_rctls, p, &e,
+ p->p_lwpcnt, 0) & RCT_DENY)
+ rctlfail = 1;
+
+ if (kpj->kpj_ntasks + 1 > kpj->kpj_ntasks_ctl)
+ if (rctl_test_entity(rc_project_ntasks, kpj->kpj_rctls, p, &e,
+ 1, 0) & RCT_DENY)
+ rctlfail = 1;
+
+ if (rctlfail) {
+ mutex_exit(&zone->zone_nlwps_lock);
+ if (curthread != p->p_agenttp)
+ continuelwps(p);
+ mutex_exit(&p->p_lock);
+ return (set_errno(EAGAIN));
+ }
+ kpj->kpj_nlwps += p->p_lwpcnt;
+ kpj->kpj_ntasks++;
+
+ oldpj->kpj_nlwps -= p->p_lwpcnt;
+
+ mutex_exit(&zone->zone_nlwps_lock);
+ mutex_exit(&p->p_lock);
+
+ mutex_enter(&kpj->kpj_poolbind);
+ tk = task_create(projid, curproc->p_zone);
+ mutex_enter(&cpu_lock);
+ /*
+ * Returns with p_lock held.
+ */
+ oldtk = task_join(tk, flags);
+ if (curthread != p->p_agenttp)
+ continuelwps(p);
+ mutex_exit(&p->p_lock);
+ mutex_exit(&cpu_lock);
+ mutex_exit(&kpj->kpj_poolbind);
+ task_rele(oldtk);
+ project_rele(kpj);
+ return (tk->tk_tkid);
+}
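+
+/*
+ * Editorial sketch, not part of the original source: from userland the
+ * function above is reached via settaskid(2), e.g.
+ *
+ *	taskid_t tid = settaskid(getprojid(), TASK_NORMAL);
+ *	if (tid == (taskid_t)-1)
+ *		perror("settaskid");
+ *
+ * Passing TASK_FINAL instead makes the new task final, so a later
+ * settaskid() from within it fails with EACCES (see the TASK_FINAL
+ * check above).
+ */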
+
+/*
+ * taskid_t tasksys_gettaskid(void);
+ *
+ * Overview
+ * Return the current task ID for this process.
+ *
+ * Return value
+ * The ID for the task to which the current process belongs.
+ */
+static long
+tasksys_gettaskid()
+{
+ long ret;
+ proc_t *p = ttoproc(curthread);
+
+ mutex_enter(&pidlock);
+ ret = p->p_task->tk_tkid;
+ mutex_exit(&pidlock);
+ return (ret);
+}
+
+/*
+ * projid_t tasksys_getprojid(void);
+ *
+ * Overview
+ * Return the current project ID for this process.
+ *
+ * Return value
+ * The ID for the project to which the current process belongs.
+ */
+static long
+tasksys_getprojid()
+{
+ long ret;
+ proc_t *p = ttoproc(curthread);
+
+ mutex_enter(&pidlock);
+ ret = p->p_task->tk_proj->kpj_id;
+ mutex_exit(&pidlock);
+ return (ret);
+}
+
+static int
+tasksys_projlist_cb(kproject_t *kp, void *buf)
+{
+ projlist_walk_t *pw = (projlist_walk_t *)buf;
+
+ if (pw && pw->pw_bufsz >= sizeof (projid_t)) {
+ *pw->pw_buf = kp->kpj_id;
+ pw->pw_buf++;
+ pw->pw_bufsz -= sizeof (projid_t);
+ }
+
+ return (0);
+}
+
+/*
+ * long tasksys_projlist(void *buf, size_t bufsz)
+ *
+ * Overview
+ * Return a buffer containing the project IDs of all currently active projects
+ * in the current zone.
+ *
+ * Return values
+ * The minimum size of a buffer sufficiently large to contain all of the
+ * active project IDs, or -1 if an error occurs during copyout.
+ */
+static long
+tasksys_projlist(void *buf, size_t bufsz)
+{
+ long ret = 0;
+ projlist_walk_t pw;
+ void *kbuf;
+
+ if (buf == NULL || bufsz == 0)
+ return (project_walk_all(getzoneid(), tasksys_projlist_cb,
+ NULL));
+
+ if (bufsz > MAX_PROJLIST_BUFSIZE)
+ return (set_errno(ENOMEM));
+
+ kbuf = pw.pw_buf = kmem_zalloc(bufsz, KM_SLEEP);
+ pw.pw_bufsz = bufsz;
+
+ ret = project_walk_all(getzoneid(), tasksys_projlist_cb, &pw);
+
+ if (copyout(kbuf, buf, bufsz) == -1)
+ ret = set_errno(EFAULT);
+
+ kmem_free(kbuf, bufsz);
+ return (ret);
+}
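+
+/*
+ * Editorial sketch, not part of the original source: the NULL-buffer
+ * behavior above supports the usual size-then-fetch pattern; assuming
+ * the SYS_tasksys entry from <sys/syscall.h> and subcode 3 (see the
+ * dispatcher at the bottom of this file):
+ *
+ *	long need = syscall(SYS_tasksys, 3, 0, 0, NULL, 0);
+ *	projid_t *ids = malloc(need);
+ *	(void) syscall(SYS_tasksys, 3, 0, 0, ids, (size_t)need);
+ */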
+
+long
+tasksys(int code, projid_t projid, uint_t flags, void *projidbuf, size_t pbufsz)
+{
+ switch (code) {
+ case 0:
+ return (tasksys_settaskid(projid, flags));
+ case 1:
+ return (tasksys_gettaskid());
+ case 2:
+ return (tasksys_getprojid());
+ case 3:
+ return (tasksys_projlist(projidbuf, pbufsz));
+ default:
+ return (set_errno(EINVAL));
+ }
+}
diff --git a/usr/src/uts/common/syscall/time.c b/usr/src/uts/common/syscall/time.c
new file mode 100644
index 0000000000..ccca2f5847
--- /dev/null
+++ b/usr/src/uts/common/syscall/time.c
@@ -0,0 +1,80 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1994-2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All rights reserved. */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/tuneable.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/time.h>
+#include <sys/debug.h>
+#include <sys/policy.h>
+
+time_t
+gtime(void)
+{
+ return (gethrestime_sec());
+}
+
+int
+stime(time_t time)
+{
+ timestruc_t ts;
+
+ if (secpolicy_settime(CRED()) != 0)
+ return (set_errno(EPERM));
+
+ if (time < 0)
+ return (set_errno(EINVAL));
+
+ ts.tv_sec = time;
+ ts.tv_nsec = 0;
+ mutex_enter(&tod_lock);
+ tod_set(ts);
+ set_hrestime(&ts);
+ mutex_exit(&tod_lock);
+
+ return (0);
+}
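+
+/*
+ * Editorial sketch, not part of the original source: the libc
+ * stime(2) wrapper takes a pointer (the kernel entry above receives
+ * the value) and requires the settime privilege checked above:
+ *
+ *	time_t t = time(NULL) + 1;
+ *	if (stime(&t) != 0)
+ *		perror("stime");
+ */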
+
+#if defined(_SYSCALL32_IMPL)
+int
+stime32(time32_t time)
+{
+ if (time < 0)
+ return (set_errno(EINVAL));
+
+ return (stime((time_t)time));
+}
+#endif
diff --git a/usr/src/uts/common/syscall/times.c b/usr/src/uts/common/syscall/times.c
new file mode 100644
index 0000000000..cefa942d57
--- /dev/null
+++ b/usr/src/uts/common/syscall/times.c
@@ -0,0 +1,103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/tuneable.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/time.h>
+#include <sys/times.h>
+#include <sys/debug.h>
+#include <sys/msacct.h>
+
+/*
+ * Return system and user times.
+ */
+
+clock_t
+times(struct tms *tp)
+{
+ proc_t *p = ttoproc(curthread);
+ struct tms p_time;
+ clock_t ret_lbolt;
+
+ mutex_enter(&p->p_lock);
+ p_time.tms_utime = (clock_t)NSEC_TO_TICK(
+ mstate_aggr_state(p, LMS_USER));
+ p_time.tms_stime = (clock_t)NSEC_TO_TICK(
+ mstate_aggr_state(p, LMS_SYSTEM));
+ p_time.tms_cutime = p->p_cutime;
+ p_time.tms_cstime = p->p_cstime;
+ mutex_exit(&p->p_lock);
+
+ if (copyout(&p_time, tp, sizeof (p_time)))
+ return (set_errno(EFAULT));
+
+ ret_lbolt = lbolt;
+
+ return (ret_lbolt == -1 ? 0 : ret_lbolt);
+}
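+
+/*
+ * Editorial sketch, not part of the original source: the classic use
+ * of times(2) is to difference two calls and scale by the tick rate:
+ *
+ *	struct tms t0, t1;
+ *	long hz = sysconf(_SC_CLK_TCK);
+ *	(void) times(&t0);
+ *	... do work ...
+ *	(void) times(&t1);
+ *	double cpu = (double)(t1.tms_utime - t0.tms_utime) / hz;
+ *
+ * Note that the return value is lbolt (ticks since boot), with -1
+ * remapped to 0 above so it cannot be confused with an error return.
+ */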
+
+#ifdef _SYSCALL32_IMPL
+
+/*
+ * We deliberately -don't- return EOVERFLOW on type overflow,
+ * since the 32-bit kernel simply wraps 'em around.
+ */
+clock32_t
+times32(struct tms32 *tp)
+{
+ proc_t *p = ttoproc(curthread);
+ struct tms32 p_time;
+ clock32_t ret_lbolt;
+
+ mutex_enter(&p->p_lock);
+ p_time.tms_utime = (clock32_t)NSEC_TO_TICK(
+ mstate_aggr_state(p, LMS_USER));
+ p_time.tms_stime = (clock32_t)NSEC_TO_TICK(
+ mstate_aggr_state(p, LMS_SYSTEM));
+ p_time.tms_cutime = (clock32_t)p->p_cutime;
+ p_time.tms_cstime = (clock32_t)p->p_cstime;
+ mutex_exit(&p->p_lock);
+
+ if (copyout(&p_time, tp, sizeof (p_time)))
+ return (set_errno(EFAULT));
+
+ ret_lbolt = (clock32_t)lbolt;
+
+ return (ret_lbolt == (clock32_t)-1 ? 0 : ret_lbolt);
+}
+
+#endif /* _SYSCALL32_IMPL */
diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c
new file mode 100644
index 0000000000..31a3ff0a10
--- /dev/null
+++ b/usr/src/uts/common/syscall/uadmin.c
@@ -0,0 +1,373 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/swap.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/var.h>
+#include <sys/uadmin.h>
+#include <sys/signal.h>
+#include <sys/time.h>
+#include <vm/seg_kmem.h>
+#include <sys/modctl.h>
+#include <sys/callb.h>
+#include <sys/dumphdr.h>
+#include <sys/debug.h>
+#include <sys/ftrace.h>
+#include <sys/cmn_err.h>
+#include <sys/panic.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+
+/*
+ * Administrivia system call. We provide this in two flavors: one for calling
+ * from the system call path (uadmin), and the other for calling from elsewhere
+ * within the kernel (kadmin). Callers must beware that certain uadmin cmd
+ * values (specifically A_SWAPCTL) are only supported by uadmin and not kadmin.
+ */
+
+extern ksema_t fsflush_sema;
+kmutex_t ualock;
+
+
+/*
+ * Kill all user processes in said zone. A special argument of ALL_ZONES is
+ * passed in when the system as a whole is shutting down. The lack of per-zone
+ * process lists is likely to make the following a performance bottleneck on a
+ * system with many zones.
+ */
+void
+killall(zoneid_t zoneid)
+{
+ proc_t *p;
+
+ ASSERT(zoneid != GLOBAL_ZONEID);
+ /*
+ * Kill all processes except kernel daemons and ourself.
+ * Make a first pass to stop all processes so they won't
+ * be trying to restart children as we kill them.
+ */
+ mutex_enter(&pidlock);
+ for (p = practive; p != NULL; p = p->p_next) {
+ if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
+ p->p_exec != NULLVP && /* kernel daemons */
+ p->p_as != &kas &&
+ p->p_stat != SZOMB) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOWAIT;
+ sigtoproc(p, NULL, SIGSTOP);
+ mutex_exit(&p->p_lock);
+ }
+ }
+ p = practive;
+ while (p != NULL) {
+ if ((zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) &&
+ p->p_exec != NULLVP && /* kernel daemons */
+ p->p_as != &kas &&
+ p->p_stat != SIDL &&
+ p->p_stat != SZOMB) {
+ mutex_enter(&p->p_lock);
+ if (sigismember(&p->p_sig, SIGKILL)) {
+ mutex_exit(&p->p_lock);
+ p = p->p_next;
+ } else {
+ sigtoproc(p, NULL, SIGKILL);
+ mutex_exit(&p->p_lock);
+ (void) cv_timedwait(&p->p_srwchan_cv,
+ &pidlock, lbolt + hz);
+ p = practive;
+ }
+ } else {
+ p = p->p_next;
+ }
+ }
+ mutex_exit(&pidlock);
+}
+
+int
+kadmin(int cmd, int fcn, void *mdep, cred_t *credp)
+{
+ int error = 0;
+ int locked = 0;
+ char *buf;
+ size_t buflen = 0;
+
+ /*
+ * We might be called directly by the kernel's fault-handling code, so
+ * we can't assert that the caller is in the global zone.
+ */
+
+ /*
+ * Make sure that cmd is one of the valid <sys/uadmin.h> command codes
+ * and that we have appropriate privileges for this action.
+ */
+ switch (cmd) {
+ case A_FTRACE:
+ case A_SHUTDOWN:
+ case A_REBOOT:
+ case A_REMOUNT:
+ case A_FREEZE:
+ case A_DUMP:
+ if (secpolicy_sys_config(credp, B_FALSE) != 0)
+ return (EPERM);
+ break;
+
+ default:
+ return (EINVAL);
+ }
+
+ /*
+ * Serialize these operations on ualock. If it is held, just return
+ * as if successful since the system will soon reset or remount.
+ */
+ if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_REMOUNT) {
+ if (!mutex_tryenter(&ualock))
+ return (0);
+ locked = 1;
+ }
+
+ switch (cmd) {
+ case A_SHUTDOWN:
+ {
+ proc_t *p = ttoproc(curthread);
+
+ /*
+ * Release (almost) all of our own resources if we are called
+		 * from a user context; if we are calling kadmin() from
+		 * a kernel context, we do not release these resources.
+ */
+ if (ttoproc(curthread) != &p0) {
+ if ((error = exitlwps(0)) != 0)
+ return (error);
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOWAIT;
+ sigfillset(&p->p_ignore);
+ curthread->t_lwp->lwp_cursig = 0;
+ curthread->t_lwp->lwp_extsig = 0;
+ if (p->p_exec) {
+ vnode_t *exec_vp = p->p_exec;
+ p->p_exec = NULLVP;
+ mutex_exit(&p->p_lock);
+ VN_RELE(exec_vp);
+ } else {
+ mutex_exit(&p->p_lock);
+ }
+
+ pollcleanup();
+ closeall(P_FINFO(curproc));
+ relvm();
+
+ } else {
+ /*
+ * Reset t_cred if not set because much of the
+ * filesystem code depends on CRED() being valid.
+ */
+ if (curthread->t_cred == NULL)
+ curthread->t_cred = kcred;
+ }
+
+ /*
+	 * Communicate that init shouldn't be restarted.
+ */
+ zone_shutdown_global();
+
+ killall(ALL_ZONES);
+ /*
+ * If we are calling kadmin() from a kernel context then we
+ * do not release these resources.
+ */
+ if (ttoproc(curthread) != &p0) {
+ VN_RELE(u.u_cdir);
+ if (u.u_rdir)
+ VN_RELE(u.u_rdir);
+ if (u.u_cwd)
+ refstr_rele(u.u_cwd);
+
+ u.u_cdir = rootdir;
+ u.u_rdir = NULL;
+ u.u_cwd = NULL;
+ }
+
+ /*
+ * Allow the reboot/halt/poweroff code a chance to do
+ * anything it needs to whilst we still have filesystems
+ * mounted, like loading any modules necessary for later
+ * performing the actual poweroff.
+ */
+ if ((mdep != NULL) && (*(char *)mdep == '/')) {
+ buf = i_convert_boot_device_name(mdep, NULL, &buflen);
+ mdpreboot(cmd, fcn, buf);
+ } else
+ mdpreboot(cmd, fcn, mdep);
+
+ /*
+ * Allow fsflush to finish running and then prevent it
+ * from ever running again so that vfs_unmountall() and
+ * vfs_syncall() can acquire the vfs locks they need.
+ */
+ sema_p(&fsflush_sema);
+ (void) callb_execute_class(CB_CL_UADMIN_PRE_VFS, NULL);
+
+ vfs_unmountall();
+ (void) VFS_MOUNTROOT(rootvfs, ROOT_UNMOUNT);
+ vfs_syncall();
+
+ (void) callb_execute_class(CB_CL_UADMIN_POST_VFS, NULL);
+ dump_ereports();
+ dump_messages();
+
+ /* FALLTHROUGH */
+ }
+
+ case A_REBOOT:
+ if ((mdep != NULL) && (*(char *)mdep == '/')) {
+ buf = i_convert_boot_device_name(mdep, NULL, &buflen);
+ mdboot(cmd, fcn, buf);
+ } else
+ mdboot(cmd, fcn, mdep);
+ /* no return expected */
+ break;
+
+ case A_REMOUNT:
+ (void) VFS_MOUNTROOT(rootvfs, ROOT_REMOUNT);
+ break;
+
+ case A_FREEZE:
+ {
+ /* XXX: declare in some header file */
+ extern int cpr(int);
+
+ if (modload("misc", "cpr") == -1)
+ return (ENOTSUP);
+ error = cpr(fcn);
+ break;
+ }
+
+ case A_FTRACE:
+ {
+ switch (fcn) {
+ case AD_FTRACE_START:
+ (void) FTRACE_START();
+ break;
+ case AD_FTRACE_STOP:
+ (void) FTRACE_STOP();
+ break;
+ default:
+ error = EINVAL;
+ }
+ break;
+ }
+
+ case A_DUMP:
+ {
+ if (fcn == AD_NOSYNC) {
+ in_sync = 1;
+ break;
+ }
+
+ panic_bootfcn = fcn;
+ panic_forced = 1;
+
+ if ((mdep != NULL) && (*(char *)mdep == '/')) {
+ panic_bootstr = i_convert_boot_device_name(mdep,
+ NULL, &buflen);
+ } else
+ panic_bootstr = mdep;
+
+ panic("forced crash dump initiated at user request");
+ /*NOTREACHED*/
+ }
+
+ default:
+ error = EINVAL;
+ }
+
+ if (locked)
+ mutex_exit(&ualock);
+
+ return (error);
+}
+
+int
+uadmin(int cmd, int fcn, uintptr_t mdep)
+{
+ int error = 0, rv = 0;
+ size_t nbytes = 0;
+ char buf[257];
+ cred_t *credp = CRED();
+
+ /*
+ * The swapctl system call doesn't have its own entry point: it uses
+ * uadmin as a wrapper so we just call it directly from here.
+ */
+ if (cmd == A_SWAPCTL) {
+ if (get_udatamodel() == DATAMODEL_NATIVE)
+ error = swapctl(fcn, (void *)mdep, &rv);
+#if defined(_SYSCALL32_IMPL)
+ else
+ error = swapctl32(fcn, (void *)mdep, &rv);
+#endif /* _SYSCALL32_IMPL */
+ return (error ? set_errno(error) : rv);
+ }
+
+ /*
+ * Handle zones.
+ */
+ if (getzoneid() != GLOBAL_ZONEID) {
+ error = zone_uadmin(cmd, fcn, credp);
+ return (error ? set_errno(error) : 0);
+ }
+
+ /*
+	 * Certain subcommands interpret a non-NULL mdep value as a pointer to
+ * a boot string. Attempt to copy it in now, or reset mdep to NULL.
+ */
+ if (cmd == A_SHUTDOWN || cmd == A_REBOOT || cmd == A_DUMP) {
+ if (mdep != NULL && copyinstr((const char *)mdep, buf,
+ sizeof (buf) - 1, &nbytes) == 0) {
+ buf[nbytes] = '\0';
+ mdep = (uintptr_t)buf;
+ } else
+ mdep = NULL;
+ }
+
+ if ((error = kadmin(cmd, fcn, (void *)mdep, credp)) != 0)
+ return (set_errno(error));
+
+ return (0);
+}
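+
+/*
+ * Editorial sketch, not part of the original source: uadmin(2) is
+ * normally invoked through halt(1M)/reboot(1M), but a privileged
+ * direct call looks like
+ *
+ *	uadmin(A_SHUTDOWN, AD_POWEROFF, 0);
+ *
+ * with the cmd/fcn constants taken from <sys/uadmin.h>.
+ */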
diff --git a/usr/src/uts/common/syscall/ucredsys.c b/usr/src/uts/common/syscall/ucredsys.c
new file mode 100644
index 0000000000..16e4ce82b8
--- /dev/null
+++ b/usr/src/uts/common/syscall/ucredsys.c
@@ -0,0 +1,208 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/ucred.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/stropts.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/cmn_err.h>
+#include <sys/socket.h>
+#include <sys/strsubr.h>
+#include <c2/audit.h>
+
+/*
+ * Getpeerucred system call implementation.
+ */
+static int
+getpeerucred(int fd, void *buf)
+{
+ file_t *fp;
+ struct ucred_s *uc;
+ vnode_t *vp;
+ k_peercred_t kpc;
+ int err;
+ int32_t rval;
+
+ kpc.pc_cr = NULL;
+ kpc.pc_cpid = -1;
+
+ if ((fp = getf(fd)) == NULL)
+ return (set_errno(EBADF));
+
+ vp = fp->f_vnode;
+
+ switch (vp->v_type) {
+ case VFIFO:
+ case VSOCK:
+ err = VOP_IOCTL(vp, _I_GETPEERCRED, (intptr_t)&kpc,
+ FKIOCTL, CRED(), &rval);
+ break;
+ case VCHR: {
+ struct strioctl strioc;
+
+ if (vp->v_stream == NULL) {
+ err = ENOTSUP;
+ break;
+ }
+ strioc.ic_cmd = _I_GETPEERCRED;
+ strioc.ic_timout = INFTIM;
+ strioc.ic_len = (int)sizeof (k_peercred_t);
+ strioc.ic_dp = (char *)&kpc;
+
+ err = strdoioctl(vp->v_stream, &strioc, FNATIVE|FKIOCTL,
+ STR_NOSIG|K_TO_K, CRED(), &rval);
+
+ /*
+ * Map all unexpected error codes to ENOTSUP.
+ */
+ switch (err) {
+ case 0:
+ case ENOTSUP:
+ case ENOTCONN:
+ case ENOMEM:
+ break;
+ default:
+ err = ENOTSUP;
+ break;
+ }
+ break;
+ }
+ default:
+ err = ENOTSUP;
+ break;
+ }
+ releasef(fd);
+
+ /*
+ * If someone gave us a credential, err will be 0.
+ */
+ if (kpc.pc_cr != NULL) {
+ ASSERT(err == 0);
+
+ uc = cred2ucred(kpc.pc_cr, kpc.pc_cpid, NULL);
+
+ crfree(kpc.pc_cr);
+
+ err = copyout(uc, buf, uc->uc_size);
+
+ kmem_free(uc, uc->uc_size);
+
+ if (err != 0)
+ return (set_errno(EFAULT));
+
+ return (0);
+ }
+ return (set_errno(err));
+}
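+
+/*
+ * Editorial sketch, not part of the original source: userland reaches
+ * this through getpeerucred(3C) on a connected socket, pipe, or
+ * loopback STREAMS endpoint:
+ *
+ *	ucred_t *uc = NULL;
+ *	if (getpeerucred(sock_fd, &uc) == 0) {
+ *		pid_t peer = ucred_getpid(uc);
+ *		uid_t euid = ucred_geteuid(uc);
+ *		ucred_free(uc);
+ *	}
+ */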
+
+static int
+ucred_get(pid_t pid, void *ubuf)
+{
+ proc_t *p;
+ cred_t *pcr;
+ int err;
+ struct ucred_s *uc;
+
+ if (pid == P_MYID || pid == curproc->p_pid) {
+ pcr = CRED();
+ crhold(pcr);
+ pid = curproc->p_pid;
+ } else {
+ cred_t *updcred = NULL;
+
+ if (pid < 0)
+ return (set_errno(EINVAL));
+
+ if (audit_active)
+ updcred = cralloc();
+
+ mutex_enter(&pidlock);
+ p = prfind(pid);
+
+ if (p == NULL) {
+ mutex_exit(&pidlock);
+ if (updcred != NULL)
+ crfree(updcred);
+ return (set_errno(ESRCH));
+ }
+
+ /*
+ * Assure that audit data in cred is up-to-date.
+ * updcred will be used or freed.
+ */
+ if (audit_active)
+ audit_update_context(p, updcred);
+
+ err = priv_proc_cred_perm(CRED(), p, &pcr, VREAD);
+ mutex_exit(&pidlock);
+
+ if (err != 0)
+ return (set_errno(err));
+ }
+
+ uc = cred2ucred(pcr, pid, NULL);
+
+ crfree(pcr);
+
+ err = copyout(uc, ubuf, uc->uc_size);
+
+ kmem_free(uc, uc->uc_size);
+
+ if (err)
+ return (set_errno(EFAULT));
+
+ return (0);
+}
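+
+/*
+ * Editorial sketch, not part of the original source: the matching libc
+ * interface is ucred_get(3C), e.g.
+ *
+ *	ucred_t *uc = ucred_get(P_MYID);
+ *	if (uc != NULL) {
+ *		zoneid_t z = ucred_getzoneid(uc);
+ *		ucred_free(uc);
+ *	}
+ */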
+
+int
+ucredsys(int code, int obj, void *buf)
+{
+ switch (code) {
+ case UCREDSYS_UCREDGET:
+ return (ucred_get((pid_t)obj, buf));
+ case UCREDSYS_GETPEERUCRED:
+ return (getpeerucred(obj, buf));
+ default:
+ return (set_errno(EINVAL));
+ }
+}
+
+#ifdef _SYSCALL32_IMPL
+int
+ucredsys32(int arg1, int arg2, caddr32_t arg3)
+{
+ return (ucredsys(arg1, arg2, (void *)(uintptr_t)arg3));
+}
+#endif
diff --git a/usr/src/uts/common/syscall/uid.c b/usr/src/uts/common/syscall/uid.c
new file mode 100644
index 0000000000..65bcabcaf0
--- /dev/null
+++ b/usr/src/uts/common/syscall/uid.c
@@ -0,0 +1,323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/tuneable.h>
+#include <sys/cred_impl.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/signal.h>
+#include <sys/debug.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+
+int
+setuid(uid_t uid)
+{
+ register proc_t *p;
+ int error;
+ int do_nocd = 0;
+ int uidchge = 0;
+ cred_t *cr, *newcr;
+ uid_t oldruid = uid;
+ zoneid_t zoneid = getzoneid();
+
+ if (uid < 0 || uid > MAXUID)
+ return (set_errno(EINVAL));
+
+ /*
+ * Need to pre-allocate the new cred structure before grabbing
+ * the p_crlock mutex.
+ */
+ newcr = cralloc();
+
+ p = ttoproc(curthread);
+
+retry:
+ mutex_enter(&p->p_crlock);
+ cr = p->p_cred;
+
+ if ((uid == cr->cr_ruid || uid == cr->cr_suid) &&
+ secpolicy_allow_setid(cr, uid, B_TRUE) != 0) {
+ error = 0;
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+ newcr->cr_uid = uid;
+ } else if ((error = secpolicy_allow_setid(cr, uid, B_FALSE)) == 0) {
+ if (!uidchge && uid != cr->cr_ruid) {
+ /*
+ * The ruid of the process is going to change. In order
+ * to avoid a race condition involving the
+ * process-count associated with the newly given ruid,
+ * we increment the count before assigning the
+ * credential to the process.
+ * To do that, we'll have to take pidlock, so we first
+ * release p_crlock.
+ */
+ mutex_exit(&p->p_crlock);
+ uidchge = 1;
+ mutex_enter(&pidlock);
+ upcount_inc(uid, zoneid);
+ mutex_exit(&pidlock);
+ /*
+ * As we released p_crlock we can't rely on the cr
+ * we read. So retry the whole thing.
+ */
+ goto retry;
+ }
+ /*
+ * A privileged process that gives up its privilege
+ * must be marked to produce no core dump.
+ */
+ if (cr->cr_uid != uid ||
+ cr->cr_ruid != uid ||
+ cr->cr_suid != uid)
+ do_nocd = 1;
+ oldruid = cr->cr_ruid;
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+ newcr->cr_ruid = uid;
+ newcr->cr_suid = uid;
+ newcr->cr_uid = uid;
+ ASSERT(uid != oldruid ? uidchge : 1);
+ } else
+ crfree(newcr);
+
+ mutex_exit(&p->p_crlock);
+
+ /*
+ * We decrement the number of processes associated with the oldruid
+ * to match the increment above, even if the ruid of the process
+ * did not change or an error occurred (oldruid == uid).
+ */
+ if (uidchge) {
+ mutex_enter(&pidlock);
+ upcount_dec(oldruid, zoneid);
+ mutex_exit(&pidlock);
+ }
+
+ if (error == 0) {
+ if (do_nocd) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ }
+ crset(p, newcr); /* broadcast to process threads */
+ return (0);
+ }
+ return (set_errno(error));
+}
+
+int64_t
+getuid(void)
+{
+ rval_t r;
+ cred_t *cr;
+
+ cr = curthread->t_cred;
+ r.r_val1 = cr->cr_ruid;
+ r.r_val2 = cr->cr_uid;
+ return (r.r_vals);
+}
+
+int
+seteuid(uid_t uid)
+{
+ register proc_t *p;
+ int error = EPERM;
+ int do_nocd = 0;
+ cred_t *cr, *newcr;
+
+ if (uid < 0 || uid > MAXUID)
+ return (set_errno(EINVAL));
+
+ /*
+ * Need to pre-allocate the new cred structure before grabbing
+ * the p_crlock mutex.
+ */
+ newcr = cralloc();
+ p = ttoproc(curthread);
+ mutex_enter(&p->p_crlock);
+ cr = p->p_cred;
+
+ if (uid == cr->cr_ruid || uid == cr->cr_uid || uid == cr->cr_suid ||
+ (error = secpolicy_allow_setid(cr, uid, B_FALSE)) == 0) {
+ /*
+ * A privileged process that makes itself look like a
+ * set-uid process must be marked to produce no core dump,
+		 * if the effective uid actually changed.
+ */
+ if (cr->cr_uid != uid && error == 0)
+ do_nocd = 1;
+ error = 0;
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+ newcr->cr_uid = uid;
+ } else
+ crfree(newcr);
+
+ mutex_exit(&p->p_crlock);
+
+ if (error == 0) {
+ if (do_nocd) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ }
+ crset(p, newcr); /* broadcast to process threads */
+ return (0);
+ }
+ return (set_errno(error));
+}
+
+/*
+ * Buy-back from SunOS 4.x
+ *
+ * Like setuid() and seteuid() combined -except- that non-root users
+ * can change cr_ruid to cr_uid, and the semantics of cr_suid are
+ * subtly different.
+ */
+int
+setreuid(uid_t ruid, uid_t euid)
+{
+ proc_t *p;
+ int error = 0;
+ int do_nocd = 0;
+ int uidchge = 0;
+ uid_t oldruid = ruid;
+ cred_t *cr, *newcr;
+ zoneid_t zoneid = getzoneid();
+
+ if ((ruid != -1 && (ruid < 0 || ruid > MAXUID)) ||
+ (euid != -1 && (euid < 0 || euid > MAXUID)))
+ return (set_errno(EINVAL));
+
+ /*
+ * Need to pre-allocate the new cred structure before grabbing
+ * the p_crlock mutex.
+ */
+ newcr = cralloc();
+
+ p = ttoproc(curthread);
+
+retry:
+ mutex_enter(&p->p_crlock);
+ cr = p->p_cred;
+
+ if (ruid != -1 && ruid != cr->cr_ruid && ruid != cr->cr_uid &&
+ secpolicy_allow_setid(cr, ruid, B_FALSE) != 0) {
+ error = EPERM;
+ } else if (euid != -1 &&
+ euid != cr->cr_ruid && euid != cr->cr_uid &&
+ euid != cr->cr_suid && secpolicy_allow_setid(cr, euid, B_FALSE)) {
+ error = EPERM;
+ } else {
+ if (!uidchge && ruid != -1 && cr->cr_ruid != ruid) {
+ /*
+ * The ruid of the process is going to change. In order
+ * to avoid a race condition involving the
+ * process-count associated with the newly given ruid,
+ * we increment the count before assigning the
+ * credential to the process.
+ * To do that, we'll have to take pidlock, so we first
+ * release p_crlock.
+ */
+ mutex_exit(&p->p_crlock);
+ uidchge = 1;
+ mutex_enter(&pidlock);
+ upcount_inc(ruid, zoneid);
+ mutex_exit(&pidlock);
+ /*
+ * As we released p_crlock we can't rely on the cr
+ * we read. So retry the whole thing.
+ */
+ goto retry;
+ }
+ crhold(cr);
+ crcopy_to(cr, newcr);
+ p->p_cred = newcr;
+
+ if (euid != -1)
+ newcr->cr_uid = euid;
+ if (ruid != -1) {
+ oldruid = newcr->cr_ruid;
+ newcr->cr_ruid = ruid;
+ ASSERT(ruid != oldruid ? uidchge : 1);
+ }
+ /*
+ * "If the real uid is being changed, or the effective uid is
+ * being changed to a value not equal to the real uid, the
+ * saved uid is set to the new effective uid."
+ */
+ if (ruid != -1 ||
+ (euid != -1 && newcr->cr_uid != newcr->cr_ruid))
+ newcr->cr_suid = newcr->cr_uid;
+ /*
+ * A process that gives up its privilege
+ * must be marked to produce no core dump.
+ */
+ if ((cr->cr_uid != newcr->cr_uid ||
+ cr->cr_ruid != newcr->cr_ruid ||
+ cr->cr_suid != newcr->cr_suid))
+ do_nocd = 1;
+
+ crfree(cr);
+ }
+ mutex_exit(&p->p_crlock);
+
+ /*
+ * We decrement the number of processes associated with the oldruid
+ * to match the increment above, even if the ruid of the process
+ * did not change or an error occurred (oldruid == ruid).
+ */
+ if (uidchge) {
+ ASSERT(oldruid != -1 && ruid != -1);
+ mutex_enter(&pidlock);
+ upcount_dec(oldruid, zoneid);
+ mutex_exit(&pidlock);
+ }
+
+ if (error == 0) {
+ if (do_nocd) {
+ mutex_enter(&p->p_lock);
+ p->p_flag |= SNOCD;
+ mutex_exit(&p->p_lock);
+ }
+ crset(p, newcr); /* broadcast to process threads */
+ return (0);
+ }
+ crfree(newcr);
+ return (set_errno(error));
+}
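+
+/*
+ * Editorial sketch, not part of the original source: the saved-uid
+ * rule quoted above is what lets a set-uid program bracket its
+ * privileges with seteuid():
+ *
+ *	uid_t priv = geteuid();
+ *	(void) seteuid(getuid());	drop; cr_suid still holds priv
+ *	... do unprivileged work ...
+ *	(void) seteuid(priv);		regain via the saved uid
+ */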
diff --git a/usr/src/uts/common/syscall/umask.c b/usr/src/uts/common/syscall/umask.c
new file mode 100644
index 0000000000..e80d1de9a6
--- /dev/null
+++ b/usr/src/uts/common/syscall/umask.c
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/debug.h>
+
+/*
+ * Mode mask for creation of files.
+ */
+
+int
+umask(int mask)
+{
+ register mode_t t;
+
+ t = u.u_cmask;
+ u.u_cmask = (mode_t)(mask & PERMMASK);
+ return ((int)t);
+}
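+
+/*
+ * Editorial sketch, not part of the original source: umask(2) cannot
+ * fail and always returns the previous mask, so querying it takes two
+ * calls:
+ *
+ *	mode_t old = umask(022);	set a new mask, get the old one
+ *	(void) umask(old);		put the old one back
+ */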
diff --git a/usr/src/uts/common/syscall/umount.c b/usr/src/uts/common/syscall/umount.c
new file mode 100644
index 0000000000..f5fb881f5d
--- /dev/null
+++ b/usr/src/uts/common/syscall/umount.c
@@ -0,0 +1,188 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/fstyp.h>
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/vfs.h>
+#include <sys/cred.h>
+#include <sys/vnode.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/pathname.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+
+
+/*
+ * New umount() system call (for force unmount flag and perhaps others later).
+ */
+int
+umount2(char *pathp, int flag)
+{
+ struct pathname pn;
+ struct vfs *vfsp;
+ int error;
+
+ /*
+ * Some flags are disallowed through the system call interface.
+ */
+ flag &= MS_UMOUNT_MASK;
+
+ /*
+ * Lookup user-supplied name by trying to match it against the
+ * mount points recorded at mount time. If no match is found
+ * (which can happen if the path to the mount point is specified
+ * differently between mount & umount, or if a block device were
+ * passed to umount) then we fall back to calling lookupname()
+ * to find the vfs. Doing it this way prevents calling lookupname()
+ * in most cases and that allows forcible umount to work even if
+ * lookupname() would hang (i.e. because an NFS server is dead).
+ */
+
+ if (error = pn_get(pathp, UIO_USERSPACE, &pn))
+ return (set_errno(error));
+
+ /*
+ * Only a privileged user is allowed to bypass the security
+ * checks done by lookupname() and use the results from
+ * vfs_mntpoint2vfsp() instead. It could be argued that the
+ * proper check is FILE_DAC_SEARCH but we put it all
+ * under the mount privilege. Also, make sure the caller
+ * isn't in an environment with an alternate root (to the zone's root)
+ * directory, i.e. chroot(2).
+ */
+ if (secpolicy_fs_unmount(CRED(), NULL) != 0 ||
+ (PTOU(curproc)->u_rdir != NULL &&
+ PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp) ||
+ (vfsp = vfs_mntpoint2vfsp(pn.pn_path)) == NULL) {
+ vnode_t *fsrootvp;
+
+ /* fall back to lookupname() on path given to us */
+ if (error = lookupname(pn.pn_path, UIO_SYSSPACE, FOLLOW,
+ NULLVPP, &fsrootvp)) {
+ pn_free(&pn);
+ return (set_errno(error));
+ }
+ /*
+ * Find the vfs to be unmounted. The caller may have specified
+ * either the directory mount point (preferred) or else (for a
+ * disk-based file system) the block device which was mounted.
+ * Check to see which it is; if it's the device, search the VFS
+ * list to find the associated vfs entry.
+ */
+ if (fsrootvp->v_flag & VROOT) {
+ vfsp = fsrootvp->v_vfsp;
+ VFS_HOLD(vfsp);
+ } else if (fsrootvp->v_type == VBLK)
+ vfsp = vfs_dev2vfsp(fsrootvp->v_rdev);
+ else
+ vfsp = NULL;
+
+ VN_RELE(fsrootvp);
+
+ if (vfsp == NULL) {
+ pn_free(&pn);
+ return (set_errno(EINVAL));
+ }
+ }
+ pn_free(&pn);
+
+ /*
+ * Protect the call to vn_vfswlock() with the vfs reflock. This
+ * ensures vfs_vnodecovered will either be NULL (because someone
+ * beat us to the umount) or valid (because vfs_lock() prevents
+ * another umount from getting through here until we've called
+ * vn_vfswlock() on the covered vnode).
+ *
+ * At one point, we did the non-blocking version (vfs_lock()),
+ * and if it failed, bailed out with EBUSY. However, dounmount()
+ * calls vfs_lock_wait() and we drop the vfs lock before calling
+ * dounmount(), so there's no difference between waiting here
+ * for the lock or waiting there, because it would be grabbed as soon
+ * as we drop it below. Not returning with EBUSY at this point
+ * reduces the number of spurious unmount failures that happen
+ * as a side-effect of fsflush() and other mount and unmount
+ * operations that might be going on simultaneously.
+ */
+ vfs_lock_wait(vfsp);
+
+ /*
+ * Call vn_vfswlock() on the covered vnode so that dounmount()
+ * can do its thing. It will call the corresponding vn_vfsunlock().
+ * Note that vfsp->vfs_vnodecovered can be NULL here, either because
+ * someone did umount on "/" or because someone beat us to the umount
+ * before we did the vfs_lock() above. In these cases, vn_vfswlock()
+ * returns EBUSY and we just pass that up. Also note that we're
+ * looking at a vnode without doing a VN_HOLD() on it. This is
+ * safe because it can't go away while something is mounted on it
+ * and we're locking out other umounts at this point.
+ */
+ if (vn_vfswlock(vfsp->vfs_vnodecovered)) {
+ vfs_unlock(vfsp);
+ VFS_RELE(vfsp);
+ return (set_errno(EBUSY));
+ }
+
+ /*
+ * Now that the VVFSLOCK in the covered vnode is protecting this
+ * path, we don't need the vfs reflock or the hold on the vfs anymore.
+ */
+ vfs_unlock(vfsp);
+ VFS_RELE(vfsp);
+
+ /*
+ * Perform the unmount.
+ */
+ if ((error = dounmount(vfsp, flag, CRED())) != 0)
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Old umount() system call for compatibility.
+ * Changes due to support for forced unmount.
+ */
+int
+umount(char *pathp)
+{
+ return (umount2(pathp, 0));
+}
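+
+/*
+ * Editorial sketch, not part of the original source: the flag argument
+ * is what distinguishes a forced unmount, e.g.
+ *
+ *	if (umount2("/mnt", MS_FORCE) != 0)
+ *		perror("umount2");
+ *
+ * MS_FORCE is among the bits preserved by MS_UMOUNT_MASK above.
+ */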
diff --git a/usr/src/uts/common/syscall/uname.c b/usr/src/uts/common/syscall/uname.c
new file mode 100644
index 0000000000..9a5a2608f7
--- /dev/null
+++ b/usr/src/uts/common/syscall/uname.c
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/utsname.h>
+#include <sys/debug.h>
+
+int
+uname(struct utsname *buf)
+{
+ char *name_to_use = uts_nodename();
+
+ if (copyout(utsname.sysname, buf->sysname, strlen(utsname.sysname)+1)) {
+ return (set_errno(EFAULT));
+ }
+ if (copyout(name_to_use, buf->nodename, strlen(name_to_use)+1)) {
+ return (set_errno(EFAULT));
+ }
+ if (copyout(utsname.release, buf->release, strlen(utsname.release)+1)) {
+ return (set_errno(EFAULT));
+ }
+ if (copyout(utsname.version, buf->version, strlen(utsname.version)+1)) {
+ return (set_errno(EFAULT));
+ }
+ if (copyout(utsname.machine, buf->machine, strlen(utsname.machine)+1)) {
+ return (set_errno(EFAULT));
+ }
+ return (1); /* XXX why 1 and not 0? 1003.1 says "non-negative" */
+}
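+
+/*
+ * Editorial sketch, not part of the original source: typical use of
+ * uname(2), remembering the non-negative (here 1) success return noted
+ * in the XXX comment above:
+ *
+ *	struct utsname un;
+ *	if (uname(&un) >= 0)
+ *		(void) printf("%s %s (%s)\n", un.sysname,
+ *		    un.release, un.machine);
+ */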
diff --git a/usr/src/uts/common/syscall/unlink.c b/usr/src/uts/common/syscall/unlink.c
new file mode 100644
index 0000000000..d4b84c0272
--- /dev/null
+++ b/usr/src/uts/common/syscall/unlink.c
@@ -0,0 +1,111 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/uio.h>
+#include <sys/debug.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <c2/audit.h>
+
+/*
+ * Unlink (i.e. delete) a file.
+ */
+int
+unlink(char *fname)
+{
+ int error;
+
+ if (error = vn_remove(fname, UIO_USERSPACE, RMFILE))
+ return (set_errno(error));
+ return (0);
+}
+
+/*
+ * Unlink a file from a directory
+ */
+int
+unlinkat(int fd, char *name, int flags)
+{
+ file_t *dirfp;
+ vnode_t *dirvp;
+ int error;
+ char startchar;
+
+ if (fd == AT_FDCWD && name == NULL)
+ return (set_errno(EFAULT));
+
+ if (name != NULL) {
+ if (copyin(name, &startchar, sizeof (char)))
+ return (set_errno(EFAULT));
+ } else
+ startchar = '\0';
+
+ if (fd == AT_FDCWD) {
+ dirvp = NULL;
+ } else {
+ if (startchar != '/') {
+ if ((dirfp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ dirvp = dirfp->f_vnode;
+ VN_HOLD(dirvp);
+ releasef(fd);
+ } else {
+ dirvp = NULL;
+ }
+ }
+
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_setfsat_path(1);
+#endif /* C2_AUDIT */
+
+ error = vn_removeat(dirvp, name,
+ UIO_USERSPACE, (flags == AT_REMOVEDIR) ? RMDIRECTORY : RMFILE);
+ if (dirvp != NULL)
+ VN_RELE(dirvp);
+
+	if (error != 0)
+ return (set_errno(error));
+ return (0);
+}
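+
+/*
+ * Editorial sketch, not part of the original source: unlinkat(2)
+ * resolves the name relative to an open directory fd, and the
+ * AT_REMOVEDIR flag selects directory removal:
+ *
+ *	int dfd = open("/tmp", O_RDONLY);
+ *	(void) unlinkat(dfd, "scratch", 0);		like unlink(2)
+ *	(void) unlinkat(dfd, "junkdir", AT_REMOVEDIR);	like rmdir(2)
+ */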
diff --git a/usr/src/uts/common/syscall/utime.c b/usr/src/uts/common/syscall/utime.c
new file mode 100644
index 0000000000..b37681fe4c
--- /dev/null
+++ b/usr/src/uts/common/syscall/utime.c
@@ -0,0 +1,230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/time.h>
+#include <sys/debug.h>
+#include <sys/model.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/pathname.h>
+#include <c2/audit.h>
+
+extern int namesetattr(char *, enum symfollow, vattr_t *, int);
+extern int fdsetattr(int, vattr_t *);
+
+static int
+cfutimesat(int fd, char *fname, int nmflag, vattr_t *vap, int flags)
+{
+
+ file_t *fp;
+ vnode_t *startvp, *vp;
+ int error;
+ char startchar;
+
+ if (fd == AT_FDCWD && fname == NULL)
+ return (set_errno(EFAULT));
+
+ if (nmflag == 1 || (nmflag == 2 && fname != NULL)) {
+ if (copyin(fname, &startchar, sizeof (char)))
+ return (set_errno(EFAULT));
+ } else
+ startchar = '\0';
+
+ if (fd == AT_FDCWD)
+ startvp = NULL;
+ else {
+
+ /*
+		 * Is this an absolute path?
+ */
+ if (startchar != '/') {
+ if ((fp = getf(fd)) == NULL) {
+ return (set_errno(EBADF));
+ }
+ startvp = fp->f_vnode;
+ VN_HOLD(startvp);
+ releasef(fd);
+ } else {
+ startvp = NULL;
+ }
+ }
+
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_setfsat_path(1);
+#endif /* C2_AUDIT */
+
+ if ((nmflag == 1) || ((nmflag == 2) && (fname != NULL))) {
+ if (error = lookupnameat(fname, UIO_USERSPACE, FOLLOW,
+ NULLVPP, &vp, startvp)) {
+ if (startvp != NULL)
+ VN_RELE(startvp);
+ return (set_errno(error));
+ }
+ } else {
+ vp = startvp;
+ VN_HOLD(vp);
+ }
+
+ if (startvp != NULL) {
+ VN_RELE(startvp);
+ }
+
+ if (vn_is_readonly(vp)) {
+ error = EROFS;
+ } else {
+ error = VOP_SETATTR(vp, vap, flags, CRED(), NULL);
+ }
+
+ VN_RELE(vp);
+ if (error != 0)
+ return (set_errno(error));
+ else
+ return (0);
+}
+
+static int
+get_utimesvattr(struct timeval *tvptr, struct vattr *vattr, int *flags)
+{
+ struct timeval tv[2];
+
+ *flags = 0;
+
+ if (tvptr != NULL) {
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(tvptr, tv, sizeof (tv)))
+ return (EFAULT);
+ } else {
+ struct timeval32 tv32[2];
+
+ if (copyin(tvptr, tv32, sizeof (tv32)))
+ return (EFAULT);
+
+ TIMEVAL32_TO_TIMEVAL(&tv[0], &tv32[0]);
+ TIMEVAL32_TO_TIMEVAL(&tv[1], &tv32[1]);
+ }
+
+ if (tv[0].tv_usec < 0 || tv[0].tv_usec >= 1000000 ||
+ tv[1].tv_usec < 0 || tv[1].tv_usec >= 1000000)
+ return (EINVAL);
+
+ vattr->va_atime.tv_sec = tv[0].tv_sec;
+ vattr->va_atime.tv_nsec = tv[0].tv_usec * 1000;
+ vattr->va_mtime.tv_sec = tv[1].tv_sec;
+ vattr->va_mtime.tv_nsec = tv[1].tv_usec * 1000;
+ *flags |= ATTR_UTIME;
+ } else {
+ gethrestime(&vattr->va_atime);
+ vattr->va_mtime = vattr->va_atime;
+ }
+ vattr->va_mask = AT_ATIME | AT_MTIME;
+
+ return (0);
+}
+
+int
+futimesat(int fd, char *fname, struct timeval *tvptr)
+{
+ struct vattr vattr;
+ int flags = 0;
+ int error;
+
+ if ((error = get_utimesvattr(tvptr, &vattr, &flags)) != 0)
+ return (set_errno(error));
+
+ return (cfutimesat(fd, fname, 2, &vattr, flags));
+}
+
+/*
+ * Set access/modify times on named file.
+ */
+int
+utime(char *fname, time_t *tptr)
+{
+ time_t tv[2];
+ struct vattr vattr;
+ int flags = 0;
+
+ if (tptr != NULL) {
+ if (get_udatamodel() == DATAMODEL_NATIVE) {
+ if (copyin(tptr, tv, sizeof (tv)))
+ return (set_errno(EFAULT));
+ } else {
+ time32_t tv32[2];
+
+ if (copyin(tptr, &tv32, sizeof (tv32)))
+ return (set_errno(EFAULT));
+
+ tv[0] = (time_t)tv32[0];
+ tv[1] = (time_t)tv32[1];
+ }
+
+ vattr.va_atime.tv_sec = tv[0];
+ vattr.va_atime.tv_nsec = 0;
+ vattr.va_mtime.tv_sec = tv[1];
+ vattr.va_mtime.tv_nsec = 0;
+ flags |= ATTR_UTIME;
+ } else {
+ gethrestime(&vattr.va_atime);
+ vattr.va_mtime = vattr.va_atime;
+ }
+
+ vattr.va_mask = AT_ATIME|AT_MTIME;
+ return (cfutimesat(AT_FDCWD, fname, 1, &vattr, flags));
+}
+
+/*
+ * SunOS4.1 Buyback:
+ * Set access/modify time on named file, with hi res timer
+ */
+int
+utimes(char *fname, struct timeval *tvptr)
+{
+ struct vattr vattr;
+ int flags = 0;
+ int error;
+
+ if ((error = get_utimesvattr(tvptr, &vattr, &flags)) != 0)
+ return (set_errno(error));
+
+ return (cfutimesat(AT_FDCWD, fname, 1, &vattr, flags));
+}
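+
+/*
+ * Editorial sketch, not part of the original source: both entry points
+ * above funnel into cfutimesat(); a NULL timeval pointer means "now"
+ * and skips the ATTR_UTIME flag:
+ *
+ *	struct timeval tv[2];		[0] = atime, [1] = mtime
+ *	tv[0].tv_sec = tv[1].tv_sec = 0;
+ *	tv[0].tv_usec = tv[1].tv_usec = 0;
+ *	(void) utimes("/tmp/f", tv);	explicit times (the epoch here)
+ *	(void) utimes("/tmp/f", NULL);	current time
+ */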
diff --git a/usr/src/uts/common/syscall/utssys.c b/usr/src/uts/common/syscall/utssys.c
new file mode 100644
index 0000000000..380df8e8fc
--- /dev/null
+++ b/usr/src/uts/common/syscall/utssys.c
@@ -0,0 +1,954 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/errno.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/session.h>
+#include <sys/var.h>
+#include <sys/utsname.h>
+#include <sys/utssys.h>
+#include <sys/ustat.h>
+#include <sys/statvfs.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/pathname.h>
+#include <sys/modctl.h>
+#include <sys/fs/snode.h>
+#include <sys/sunldi_impl.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cmn_err.h>
+#include <sys/ddipropdefs.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/modctl.h>
+#include <sys/flock.h>
+#include <sys/share.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <util/qsort.h>
+#include <sys/zone.h>
+
+/*
+ * utssys()
+ */
+static int uts_fusers(char *, int, intptr_t);
+static int _statvfs64_by_dev(dev_t, struct statvfs64 *);
+
+#if defined(_ILP32) || defined(_SYSCALL32_IMPL)
+
+static int utssys_uname32(caddr_t, rval_t *);
+static int utssys_ustat32(dev_t, struct ustat32 *);
+
+int64_t
+utssys32(void *buf, int arg, int type, void *outbp)
+{
+ int error;
+ rval_t rv;
+
+ rv.r_vals = 0;
+
+ switch (type) {
+ case UTS_UNAME:
+ /*
+ * This is an obsolete way to get the utsname structure
+ * (it only gives you the first 8 characters of each field!)
+ * uname(2) is the preferred and better interface.
+ */
+ error = utssys_uname32(buf, &rv);
+ break;
+ case UTS_USTAT:
+ error = utssys_ustat32(expldev((dev32_t)arg), buf);
+ break;
+ case UTS_FUSERS:
+ error = uts_fusers(buf, arg, (intptr_t)outbp);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error == 0 ? rv.r_vals : (int64_t)set_errno(error));
+}
+
+static int
+utssys_uname32(caddr_t buf, rval_t *rvp)
+{
+ if (copyout(utsname.sysname, buf, 8))
+ return (EFAULT);
+ buf += 8;
+ if (subyte(buf, 0) < 0)
+ return (EFAULT);
+ buf++;
+ if (copyout(uts_nodename(), buf, 8))
+ return (EFAULT);
+ buf += 8;
+ if (subyte(buf, 0) < 0)
+ return (EFAULT);
+ buf++;
+ if (copyout(utsname.release, buf, 8))
+ return (EFAULT);
+ buf += 8;
+ if (subyte(buf, 0) < 0)
+ return (EFAULT);
+ buf++;
+ if (copyout(utsname.version, buf, 8))
+ return (EFAULT);
+ buf += 8;
+ if (subyte(buf, 0) < 0)
+ return (EFAULT);
+ buf++;
+ if (copyout(utsname.machine, buf, 8))
+ return (EFAULT);
+ buf += 8;
+ if (subyte(buf, 0) < 0)
+ return (EFAULT);
+ rvp->r_val1 = 1;
+ return (0);
+}
+
+static int
+utssys_ustat32(dev_t dev, struct ustat32 *cbuf)
+{
+ struct ustat32 ust32;
+ struct statvfs64 stvfs;
+ fsblkcnt64_t fsbc64;
+ char *cp, *cp2;
+ int i, error;
+
+ if ((error = _statvfs64_by_dev(dev, &stvfs)) != 0)
+ return (error);
+
+ fsbc64 = stvfs.f_bfree * (stvfs.f_frsize / 512);
+ /*
+ * Check to see if the number of free blocks can be expressed
+ * in 31 bits or whether the number of free files is more than
+ * can be expressed in 32 bits and is not -1 (UINT64_MAX). NFS
+ * Version 2 does not support the number of free files and
+ * hence will return -1. -1, when translated from a 32 bit
+ * quantity to an unsigned 64 bit quantity, turns into UINT64_MAX.
+ */
+ if (fsbc64 > INT32_MAX ||
+ (stvfs.f_ffree > UINT32_MAX && stvfs.f_ffree != UINT64_MAX))
+ return (EOVERFLOW);
+
+ ust32.f_tfree = (daddr32_t)fsbc64;
+ ust32.f_tinode = (ino32_t)stvfs.f_ffree;
+
+ cp = stvfs.f_fstr;
+ cp2 = ust32.f_fname;
+ i = 0;
+ while (i++ < sizeof (ust32.f_fname))
+ if (*cp != '\0')
+ *cp2++ = *cp++;
+ else
+ *cp2++ = '\0';
+ while (*cp != '\0' &&
+ (i++ < sizeof (stvfs.f_fstr) - sizeof (ust32.f_fpack)))
+ cp++;
+ (void) strncpy(ust32.f_fpack, cp + 1, sizeof (ust32.f_fpack));
+
+ if (copyout(&ust32, cbuf, sizeof (ust32)))
+ return (EFAULT);
+ return (0);
+}
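+
+/*
+ * Editorial note, not part of the original source: the free-block
+ * count above is rescaled to 512-byte units before the overflow
+ * check; with f_frsize == 8192, each free block contributes 16
+ * units, so f_bfree == 200000 yields f_tfree == 3200000, well
+ * under INT32_MAX.
+ */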
+
+#endif /* _ILP32 || _SYSCALL32_IMPL */
+
+#ifdef _LP64
+
+static int uts_ustat64(dev_t, struct ustat *);
+
+int64_t
+utssys64(void *buf, long arg, int type, void *outbp)
+{
+ int error;
+ rval_t rv;
+
+ rv.r_vals = 0;
+
+ switch (type) {
+ case UTS_USTAT:
+ error = uts_ustat64((dev_t)arg, buf);
+ break;
+ case UTS_FUSERS:
+ error = uts_fusers(buf, (int)arg, (intptr_t)outbp);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error == 0 ? rv.r_vals : (int64_t)set_errno(error));
+}
+
+static int
+uts_ustat64(dev_t dev, struct ustat *cbuf)
+{
+ struct ustat ust;
+ struct statvfs64 stvfs;
+ fsblkcnt64_t fsbc64;
+ char *cp, *cp2;
+ int i, error;
+
+ if ((error = _statvfs64_by_dev(dev, &stvfs)) != 0)
+ return (error);
+
+ fsbc64 = stvfs.f_bfree * (stvfs.f_frsize / 512);
+ ust.f_tfree = (daddr_t)fsbc64;
+ ust.f_tinode = (ino_t)stvfs.f_ffree;
+
+ cp = stvfs.f_fstr;
+ cp2 = ust.f_fname;
+ i = 0;
+ while (i++ < sizeof (ust.f_fname))
+ if (*cp != '\0')
+ *cp2++ = *cp++;
+ else
+ *cp2++ = '\0';
+ while (*cp != '\0' &&
+ (i++ < sizeof (stvfs.f_fstr) - sizeof (ust.f_fpack)))
+ cp++;
+ (void) strncpy(ust.f_fpack, cp + 1, sizeof (ust.f_fpack));
+
+ if (copyout(&ust, cbuf, sizeof (ust)))
+ return (EFAULT);
+ return (0);
+}
+
+#endif /* _LP64 */
+
+/*
+ * Utility routine for the ustat implementations.
+ * (If it weren't for the 'find-by-dev_t' semantic of ustat(2), we could push
+ * this all out into userland, sigh.)
+ */
+static int
+_statvfs64_by_dev(dev_t dev, struct statvfs64 *svp)
+{
+ vfs_t *vfsp;
+ int error;
+
+ if ((vfsp = vfs_dev2vfsp(dev)) == NULL) {
+ /*
+ * See if it's the root of our zone.
+ */
+ vfsp = curproc->p_zone->zone_rootvp->v_vfsp;
+ if (vfsp->vfs_dev == dev) {
+ VFS_HOLD(vfsp);
+ } else {
+ vfsp = NULL;
+ }
+ }
+ if (vfsp == NULL)
+ return (EINVAL);
+ error = VFS_STATVFS(vfsp, svp);
+ VFS_RELE(vfsp);
+ return (error);
+}
+
+/*
+ * Check if this pid has an NBMAND lock or share reservation
+ * on this vp.  llp is a snapshot of all NBMAND locks set by
+ * this pid.  Return 1 if there is an NBMAND lock, else
+ * return 0.
+ */
+static int
+proc_has_nbmand_on_vp(vnode_t *vp, pid_t pid, locklist_t *llp)
+{
+ /*
+ * Any NBMAND lock held by the process on this vp?
+ */
+ while (llp) {
+ if (llp->ll_vp == vp) {
+ return (1);
+ }
+ llp = llp->ll_next;
+ }
+ /*
+ * Any NBMAND share reservation on the vp for this process?
+ */
+ return (proc_has_nbmand_share_on_vp(vp, pid));
+}
+
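+/*
+ * dofusers() scans every process visible to the caller (all processes
+ * when called from the global zone, otherwise just the caller's zone)
+ * and records each one using fvp via an open file descriptor, its
+ * controlling tty, its current or root directory, its program text,
+ * or a memory mapping.  Note the lock ordering below: sprlock() pins
+ * each process, p_lock is dropped before taking fi_lock or the
+ * address space lock, and is reacquired only for the tty/cwd/rdir/
+ * text checks and the final sprunlock().
+ */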
+static fu_data_t *
+dofusers(vnode_t *fvp, int flags)
+{
+ fu_data_t *fu_data;
+ proc_t *prp;
+ vfs_t *cvfsp;
+ pid_t npids, pidx, *pidlist;
+ int v_proc = v.v_proc; /* max # of procs */
+ int pcnt = 0;
+ int contained = (flags & F_CONTAINED);
+ int nbmandonly = (flags & F_NBMANDLIST);
+ int dip_usage = (flags & F_DEVINFO);
+ int fvp_isdev = vn_matchops(fvp, spec_getvnodeops());
+ zone_t *zone = curproc->p_zone;
+ int inglobal = INGLOBALZONE(curproc);
+
+ /* get a pointer to the file system containing this vnode */
+ cvfsp = fvp->v_vfsp;
+ ASSERT(cvfsp);
+
+ /* allocate the data structure to return our results in */
+ fu_data = kmem_alloc(fu_data_size(v_proc), KM_SLEEP);
+ fu_data->fud_user_max = v_proc;
+ fu_data->fud_user_count = 0;
+
+ /* get a snapshot of all the pids we're going to check out */
+ pidlist = kmem_alloc(v_proc * sizeof (pid_t), KM_SLEEP);
+ mutex_enter(&pidlock);
+ for (npids = 0, prp = practive; prp != NULL; prp = prp->p_next) {
+ if (inglobal || prp->p_zone == zone)
+ pidlist[npids++] = prp->p_pid;
+ }
+ mutex_exit(&pidlock);
+
+ /* grab each process and check its file usage */
+ for (pidx = 0; pidx < npids; pidx++) {
+ locklist_t *llp = NULL;
+ uf_info_t *fip;
+ vnode_t *vp;
+ user_t *up;
+ sess_t *sp;
+ uid_t uid;
+ pid_t pid = pidlist[pidx];
+ int i, use_flag = 0;
+
+ /*
+		 * grab prp->p_lock using sprlock();
+		 * if sprlock() fails the process no longer exists
+ */
+ prp = sprlock(pid);
+ if (prp == NULL)
+ continue;
+
+		/* get the process's credential info in case we need it */
+ mutex_enter(&prp->p_crlock);
+ uid = crgetruid(prp->p_cred);
+ mutex_exit(&prp->p_crlock);
+
+ /*
+ * it's safe to drop p_lock here because we
+ * called sprlock() before and it set the SPRLOCK
+ * flag for the process so it won't go away.
+ */
+ mutex_exit(&prp->p_lock);
+
+ /*
+		 * now we want to walk the process's open file descriptors.
+ * to do this we need to grab the fip->fi_lock. (you
+ * can't hold p_lock when grabbing the fip->fi_lock.)
+ */
+ fip = P_FINFO(prp);
+ mutex_enter(&fip->fi_lock);
+
+ /*
+ * Snapshot nbmand locks for pid
+ */
+ llp = flk_active_nbmand_locks(prp->p_pid);
+ for (i = 0; i < fip->fi_nfiles; i++) {
+ uf_entry_t *ufp;
+ file_t *fp;
+
+ UF_ENTER(ufp, fip, i);
+ if (((fp = ufp->uf_file) == NULL) ||
+ ((vp = fp->f_vnode) == NULL)) {
+ UF_EXIT(ufp);
+ continue;
+ }
+
+ /*
+ * if the target file (fvp) is not a device
+			 * and corresponds to the root of a filesystem
+			 * (cvfsp), then check if it contains the file
+			 * in use by this process (vp).
+ */
+ if (contained && (vp->v_vfsp == cvfsp))
+ use_flag |= F_OPEN;
+
+ /*
+ * if the target file (fvp) is not a device,
+ * then check if it matches the file in use
+ * by this process (vp).
+ */
+ if (!fvp_isdev && VN_CMP(fvp, vp))
+ use_flag |= F_OPEN;
+
+ /*
+ * if the target file (fvp) is a device,
+ * then check if the current file in use
+ * by this process (vp) maps to the same device
+ * minor node.
+ */
+ if (fvp_isdev &&
+ vn_matchops(vp, spec_getvnodeops()) &&
+ (fvp->v_rdev == vp->v_rdev))
+ use_flag |= F_OPEN;
+
+ /*
+ * if the target file (fvp) is a device,
+ * and we're checking for device instance
+ * usage, then check if the current file in use
+ * by this process (vp) maps to the same device
+ * instance.
+ */
+ if (dip_usage &&
+ vn_matchops(vp, spec_getvnodeops()) &&
+ (VTOCS(fvp)->s_dip == VTOCS(vp)->s_dip))
+ use_flag |= F_OPEN;
+
+ /*
+ * if the current file in use by this process (vp)
+ * doesn't match what we're looking for, move on
+ * to the next file in the process.
+ */
+ if ((use_flag & F_OPEN) == 0) {
+ UF_EXIT(ufp);
+ continue;
+ }
+
+ if (proc_has_nbmand_on_vp(vp, prp->p_pid, llp)) {
+				/* an NBMAND lock was found, so we're done */
+ use_flag |= F_NBM;
+ UF_EXIT(ufp);
+ break;
+ }
+ UF_EXIT(ufp);
+ }
+ if (llp)
+ flk_free_locklist(llp);
+
+ mutex_exit(&fip->fi_lock);
+
+ /*
+		 * If only NBMAND usage tracking is desired and no NBMAND
+		 * lock was found for this process, there is no need to do
+		 * any further usage tracking for it.
+ */
+ if (nbmandonly && (!(use_flag & F_NBM))) {
+ /*
+ * grab the process lock again, clear the SPRLOCK
+ * flag, release the process, and continue.
+ */
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+ continue;
+ }
+
+ /*
+ * All other types of usage.
+ * For the next few checks we need to hold p_lock.
+ */
+ mutex_enter(&prp->p_lock);
+ up = PTOU(prp);
+ if (fvp_isdev) {
+ /*
+ * if the target file (fvp) is a device
+			 * then check if it matches the process's tty.
+ *
+ * we grab s_lock to protect ourselves against
+ * freectty() freeing the vnode out from under us.
+ */
+ sp = prp->p_sessp;
+ mutex_enter(&sp->s_lock);
+ vp = prp->p_sessp->s_vp;
+ if (vp != NULL) {
+ if (fvp->v_rdev == vp->v_rdev)
+ use_flag |= F_TTY;
+
+ if (dip_usage &&
+ (VTOCS(fvp)->s_dip == VTOCS(vp)->s_dip))
+ use_flag |= F_TTY;
+ }
+ mutex_exit(&sp->s_lock);
+ } else {
+			/* check the process's current working directory */
+ if (up->u_cdir &&
+ (VN_CMP(fvp, up->u_cdir) ||
+ (contained && (up->u_cdir->v_vfsp == cvfsp))))
+ use_flag |= F_CDIR;
+
+			/* check the process's root directory */
+ if (up->u_rdir &&
+ (VN_CMP(fvp, up->u_rdir) ||
+ (contained && (up->u_rdir->v_vfsp == cvfsp))))
+ use_flag |= F_RDIR;
+
+ /* check the program text vnode */
+ if (prp->p_exec &&
+ (VN_CMP(fvp, prp->p_exec) ||
+ (contained && (prp->p_exec->v_vfsp == cvfsp))))
+ use_flag |= F_TEXT;
+ }
+
+ /* Now we can drop p_lock again */
+ mutex_exit(&prp->p_lock);
+
+ /*
+		 * now we want to walk the process's memory mappings.
+ * to do this we need to grab the prp->p_as lock. (you
+ * can't hold p_lock when grabbing the prp->p_as lock.)
+ */
+ if (prp->p_as != &kas) {
+ struct seg *seg;
+ struct as *as = prp->p_as;
+
+ AS_LOCK_ENTER(as, &as->a_lock, RW_READER);
+ for (seg = AS_SEGFIRST(as); seg;
+ seg = AS_SEGNEXT(as, seg)) {
+ /*
+ * if we can't get a backing vnode for this
+ * segment then skip it
+ */
+ vp = NULL;
+ if ((SEGOP_GETVP(seg, seg->s_base, &vp)) ||
+ (vp == NULL))
+ continue;
+
+ /*
+ * if the target file (fvp) is not a device
+				 * and corresponds to the root of a filesystem
+ * (cvfsp), then check if it contains the
+ * vnode backing this segment (vp).
+ */
+ if (contained && (vp->v_vfsp == cvfsp)) {
+ use_flag |= F_MAP;
+ break;
+ }
+
+ /*
+ * if the target file (fvp) is not a device,
+				 * check if it matches the vnode backing
+ * this segment (vp).
+ */
+ if (!fvp_isdev && VN_CMP(fvp, vp)) {
+ use_flag |= F_MAP;
+ break;
+ }
+
+ /*
+ * if the target file (fvp) isn't a device,
+				 * or the vnode backing this segment (vp)
+ * isn't a device then continue.
+ */
+ if (!fvp_isdev ||
+ !vn_matchops(vp, spec_getvnodeops()))
+ continue;
+
+ /*
+ * check if the vnode backing this segment
+ * (vp) maps to the same device minor node
+ * as the target device (fvp)
+ */
+ if (fvp->v_rdev == vp->v_rdev) {
+ use_flag |= F_MAP;
+ break;
+ }
+
+ /*
+ * if we're checking for device instance
+ * usage, then check if the vnode backing
+ * this segment (vp) maps to the same device
+ * instance as the target device (fvp).
+ */
+ if (dip_usage &&
+ (VTOCS(fvp)->s_dip == VTOCS(vp)->s_dip)) {
+ use_flag |= F_MAP;
+ break;
+ }
+ }
+ AS_LOCK_EXIT(as, &as->a_lock);
+ }
+
+ if (use_flag) {
+ ASSERT(pcnt < fu_data->fud_user_max);
+ fu_data->fud_user[pcnt].fu_flags = use_flag;
+ fu_data->fud_user[pcnt].fu_pid = pid;
+ fu_data->fud_user[pcnt].fu_uid = uid;
+ pcnt++;
+ }
+
+ /*
+ * grab the process lock again, clear the SPRLOCK
+ * flag, release the process, and continue.
+ */
+ mutex_enter(&prp->p_lock);
+ sprunlock(prp);
+ }
+
+ kmem_free(pidlist, v_proc * sizeof (pid_t));
+
+ fu_data->fud_user_count = pcnt;
+ return (fu_data);
+}
+
+typedef struct dofkusers_arg {
+ vnode_t *fvp;
+ int flags;
+ int *error;
+ fu_data_t *fu_data;
+} dofkusers_arg_t;
+
+static int
+dofkusers_walker(const ldi_usage_t *ldi_usage, void *arg)
+{
+ dofkusers_arg_t *dofkusers_arg = (dofkusers_arg_t *)arg;
+
+ vnode_t *fvp = dofkusers_arg->fvp;
+ int flags = dofkusers_arg->flags;
+ int *error = dofkusers_arg->error;
+ fu_data_t *fu_data = dofkusers_arg->fu_data;
+
+ modid_t modid;
+ minor_t minor;
+ int instance;
+ int dip_usage = (flags & F_DEVINFO);
+
+ ASSERT(*error == 0);
+ ASSERT(vn_matchops(fvp, spec_getvnodeops()));
+
+ /*
+ * check if the dev_t of the target device matches the dev_t
+ * of the device we're trying to find usage info for.
+ */
+ if (fvp->v_rdev != ldi_usage->tgt_devt) {
+
+ /*
+ * if the dev_ts don't match and we're not trying
+ * to find usage information for device instances
+ * then return
+ */
+ if (!dip_usage)
+ return (LDI_USAGE_CONTINUE);
+
+		/*
+		 * we're trying to find usage information for a
+ * device instance instead of just a minor node.
+ *
+ * check if the dip for the target device matches the
+ * dip of the device we're trying to find usage info for.
+ */
+ if (VTOCS(fvp)->s_dip != ldi_usage->tgt_dip)
+ return (LDI_USAGE_CONTINUE);
+ }
+
+ if (fu_data->fud_user_count >= fu_data->fud_user_max) {
+ *error = E2BIG;
+ return (LDI_USAGE_TERMINATE);
+ }
+
+ /* get the device vnode user information */
+ modid = ldi_usage->src_modid;
+ ASSERT(modid != -1);
+
+ minor = instance = -1;
+ if (ldi_usage->src_dip != NULL) {
+ instance = DEVI(ldi_usage->src_dip)->devi_instance;
+ }
+ if (ldi_usage->src_devt != DDI_DEV_T_NONE) {
+ minor = getminor(ldi_usage->src_devt);
+ }
+
+ /* set the device vnode user information */
+ fu_data->fud_user[fu_data->fud_user_count].fu_flags = F_KERNEL;
+ fu_data->fud_user[fu_data->fud_user_count].fu_modid = modid;
+ fu_data->fud_user[fu_data->fud_user_count].fu_instance = instance;
+ fu_data->fud_user[fu_data->fud_user_count].fu_minor = minor;
+
+ fu_data->fud_user_count++;
+
+ return (LDI_USAGE_CONTINUE);
+}
+
+int
+f_user_cmp(const void *arg1, const void *arg2)
+{
+ f_user_t *f_user1 = (f_user_t *)arg1;
+ f_user_t *f_user2 = (f_user_t *)arg2;
+
+ /*
+	 * we should only be called for f_user_t entries that represent
+ * a kernel file consumer
+ */
+ ASSERT(f_user1->fu_flags & F_KERNEL);
+ ASSERT(f_user2->fu_flags & F_KERNEL);
+
+ if (f_user1->fu_modid != f_user2->fu_modid)
+ return ((f_user1->fu_modid < f_user2->fu_modid) ? -1 : 1);
+
+ if (f_user1->fu_instance != f_user2->fu_instance)
+ return ((f_user1->fu_instance < f_user2->fu_instance) ? -1 : 1);
+
+ if (f_user1->fu_minor != f_user2->fu_minor)
+ return ((f_user1->fu_minor < f_user2->fu_minor) ? -1 : 1);
+
+ return (0);
+}
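+
+/*
+ * Note that f_user_cmp() imposes a total order on (modid, instance,
+ * minor) tuples, e.g. (2, 0, 5) sorts before (2, 1, 0); this is what
+ * lets dofkusers() below find duplicates by comparing adjacent
+ * entries after qsort().
+ */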
+
+static fu_data_t *
+dofkusers(vnode_t *fvp, int flags, int *error)
+{
+ dofkusers_arg_t dofkusers_arg;
+ fu_data_t *fu_data;
+ int user_max, i;
+
+ /*
+ * we only keep track of kernel device consumers, so if the
+ * target vnode isn't a device then there's nothing to do here
+ */
+ if (!vn_matchops(fvp, spec_getvnodeops()))
+ return (NULL);
+
+ /* allocate the data structure to return our results in */
+ user_max = ldi_usage_count();
+ fu_data = kmem_alloc(fu_data_size(user_max), KM_SLEEP);
+ fu_data->fud_user_max = user_max;
+ fu_data->fud_user_count = 0;
+
+ /* invoke the callback to collect device usage information */
+ dofkusers_arg.fvp = fvp;
+ dofkusers_arg.flags = flags;
+ dofkusers_arg.error = error;
+ dofkusers_arg.fu_data = fu_data;
+ ldi_usage_walker(&dofkusers_arg, dofkusers_walker);
+
+ /* check for errors */
+ if (*error != 0)
+ return (fu_data);
+
+ /* if there aren't any file consumers then return */
+ if (fu_data->fud_user_count == 0)
+ return (fu_data);
+
+ /*
+ * since we ignore the spec_type of the target we're trying to
+	 * access, it's possible to have duplicate entries in the
+	 * list of consumers.
+	 *
+	 * we don't want to check for duplicates in the callback because
+	 * we're holding locks in the LDI when the callback is invoked.
+ *
+ * so here we need to go through the array of file consumers
+ * and remove duplicate entries.
+ */
+
+ /* first sort the array of file consumers */
+ qsort((caddr_t)fu_data->fud_user, fu_data->fud_user_count,
+ sizeof (f_user_t), f_user_cmp);
+
+	/* then remove any duplicate entries */
+ i = 1;
+ while (i < fu_data->fud_user_count) {
+
+ if (f_user_cmp(&fu_data->fud_user[i],
+ &fu_data->fud_user[i - 1]) != 0) {
+ /*
+			 * the current element is unique, move on to
+ * the next one
+ */
+ i++;
+ continue;
+ }
+
+ /*
+		 * this entry is a duplicate; drop it and, unless it's
+		 * the last entry in the array, shift the tail down.
+ */
+ fu_data->fud_user_count--;
+ if (i == fu_data->fud_user_count)
+ break;
+
+ bcopy(&fu_data->fud_user[i + 1], &fu_data->fud_user[i],
+ sizeof (f_user_t) * (fu_data->fud_user_count - i));
+ }
+
+ return (fu_data);
+}
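+
+/*
+ * To illustrate the duplicate-removal pass above with made-up values,
+ * a sorted array of (modid, instance, minor) tuples
+ *
+ *	{ (1,0,0), (1,0,0), (1,0,2), (3,1,7), (3,1,7) }
+ *
+ * collapses to { (1,0,0), (1,0,2), (3,1,7) }: each duplicate is
+ * overwritten by a bcopy() of the tail and fud_user_count drops
+ * from 5 to 3.
+ */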
+
+/*
+ * Determine the ways in which processes and the kernel are using a named
+ * file or mounted file system (path).  Return 0 on success; on error
+ * an appropriate errno is returned.
+ *
+ * Upon success, uts_fusers also copies out the file usage information
+ * as an array of f_user_t's contained within an fu_data_t pointed to
+ * by userbp.
+ */
+static int
+uts_fusers(char *path, int flags, intptr_t userbp)
+{
+ fu_data_t *fu_data = NULL, *fuk_data = NULL;
+ fu_data_t fu_header;
+ vnode_t *fvp = NULL;
+ size_t bcount;
+ int error = 0;
+ int total_max, total_out;
+ int contained = (flags & F_CONTAINED);
+ int dip_usage = (flags & F_DEVINFO);
+ int fvp_isdev;
+
+	/* figure out how many f_user_t's we can safely copy out */
+ if (copyin((const void *)userbp, &total_max, sizeof (total_max)))
+ return (EFAULT);
+
+ /*
+ * check if we only want a count of how many kernel device
+ * consumers exist
+ */
+ if (flags & F_KINFO_COUNT) {
+ fu_header.fud_user_max = total_max;
+ fu_header.fud_user_count = ldi_usage_count();
+ bcount = fu_data_size(0);
+ if (copyout(&fu_header, (void *)userbp, bcount))
+ return (EFAULT);
+ return (0);
+ }
+
+ /* get the vnode for the file we want to look up usage for */
+ error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &fvp);
+ if (error != 0)
+ return (error);
+ ASSERT(fvp);
+ fvp_isdev = vn_matchops(fvp, spec_getvnodeops());
+
+ /*
+ * if we want to report usage for all files contained within a
+ * file system then the target file better correspond to the
+ * root node of a mounted file system, or the root of a zone.
+ */
+ if (contained && !(fvp->v_flag & VROOT) &&
+ fvp != curproc->p_zone->zone_rootvp) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * if we want to report usage for all files contained within a
+ * file system then the target file better not be a device.
+ */
+ if (contained && fvp_isdev) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * if we want to report usage for a device instance then the
+	 * target file better correspond to a device
+ */
+ if (dip_usage && !fvp_isdev) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * if the target vnode isn't a device and it has a reference count
+ * of one then no one else is going to have it open so we don't
+ * have any work to do.
+ */
+ if (!fvp_isdev && (fvp->v_count == 1)) {
+ goto out;
+ }
+
+ /* look up usage information for this vnode */
+ fu_data = dofusers(fvp, flags);
+ fuk_data = dofkusers(fvp, flags, &error);
+ if (error != 0)
+ goto out;
+
+ /* get a count of the number of f_user_t's we need to copy out */
+ total_out = 0;
+ if (fu_data)
+ total_out += fu_data->fud_user_count;
+ if (fuk_data)
+ total_out += fuk_data->fud_user_count;
+
+ /* check if there is enough space to copyout all results */
+ if (total_out > total_max) {
+ error = E2BIG;
+ goto out;
+ }
+
+ /* copyout file usage info counts */
+ fu_header.fud_user_max = total_max;
+ fu_header.fud_user_count = total_out;
+ bcount = fu_data_size(0);
+ if (copyout(&fu_header, (void *)userbp, bcount)) {
+ error = EFAULT;
+ goto out;
+ }
+
+ /* copyout userland process file usage info */
+ if ((fu_data != NULL) && (fu_data->fud_user_count > 0)) {
+ userbp += bcount;
+ bcount = fu_data->fud_user_count * sizeof (f_user_t);
+ if (copyout(fu_data->fud_user, (void *)userbp, bcount)) {
+ error = EFAULT;
+ goto out;
+ }
+ }
+
+ /* copyout kernel file usage info */
+ if ((fuk_data != NULL) && (fuk_data->fud_user_count > 0)) {
+ userbp += bcount;
+ bcount = fuk_data->fud_user_count * sizeof (f_user_t);
+ if (copyout(fuk_data->fud_user, (void *)userbp, bcount)) {
+ error = EFAULT;
+ goto out;
+ }
+ }
+
+out:
+ /* release the vnode that we were looking up usage for */
+ VN_RELE(fvp);
+
+ /* release any allocated memory */
+ if (fu_data)
+ kmem_free(fu_data, fu_data_size(fu_data->fud_user_max));
+ if (fuk_data)
+ kmem_free(fuk_data, fu_data_size(fuk_data->fud_user_max));
+
+ return (error);
+}
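+
+/*
+ * A userland usage sketch for the UTS_FUSERS path (illustrative only:
+ * the interface is private, there is no public prototype, and
+ * fu_data_size() here assumes a userland copy of the kernel macro):
+ *
+ *	fu_data_t *fudp = malloc(fu_data_size(nmax));
+ *
+ *	fudp->fud_user_max = nmax;
+ *	if (utssys(path, flags, UTS_FUSERS, fudp) == 0) {
+ *		for (i = 0; i < fudp->fud_user_count; i++)
+ *			... report fudp->fud_user[i] ...
+ *	}
+ *
+ * On entry only fud_user_max is read (the copyin at the top of
+ * uts_fusers()); on success the header and the f_user_t array are
+ * copied back out, and E2BIG means nmax entries were not enough.
+ */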
diff --git a/usr/src/uts/common/syscall/yield.c b/usr/src/uts/common/syscall/yield.c
new file mode 100644
index 0000000000..45133df129
--- /dev/null
+++ b/usr/src/uts/common/syscall/yield.c
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 1996-2002 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/thread.h>
+#include <sys/disp.h>
+#include <sys/debug.h>
+#include <sys/cpuvar.h>
+
+
+/*
+ * The calling LWP is preempted in favor of some other LWP.
+ */
+int
+yield()
+{
+ kthread_t *t = curthread;
+ klwp_t *lwp = ttolwp(t);
+
+ thread_lock(t);
+ lwp->lwp_ru.nvcsw++;
+ THREAD_TRANSITION(t);
+ CL_YIELD(t); /* does setbackdq */
+ thread_unlock_nopreempt(t);
+ swtch(); /* clears cpu_runrun and cpu_kprunrun */
+
+ return (0);
+}
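+
+/*
+ * A userland usage sketch (illustrative): yield(2) is a natural
+ * backoff for a busy-wait loop, surrendering the CPU to any other
+ * runnable LWP rather than spinning out the rest of this one's
+ * time slice.  flag_is_set() is a stand-in for whatever condition
+ * the loop polls:
+ *
+ *	while (!flag_is_set())
+ *		(void) yield();
+ */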