1 files changed, 725 insertions, 0 deletions
diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c
new file mode 100644
index 0000000000..00ad0a49e7
--- /dev/null
+++ b/usr/src/uts/common/os/pid.c
@@ -0,0 +1,725 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
+/*	  All Rights Reserved  	*/
+
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/proc.h>
+#include <sys/kmem.h>
+#include <sys/tuneable.h>
+#include <sys/var.h>
+#include <sys/cred.h>
+#include <sys/systm.h>
+#include <sys/prsystm.h>
+#include <sys/vnode.h>
+#include <sys/session.h>
+#include <sys/cpuvar.h>
+#include <sys/cmn_err.h>
+#include <sys/bitmap.h>
+#include <sys/debug.h>
+#include <c2/audit.h>
+#include <sys/zone.h>
+
+/* /proc directory entry; procdir[] holds one of these per process slot */
+union procent {
+	proc_t *pe_proc;	/* slot in use: the process that owns it */
+	union procent *pe_next;	/* slot free: next entry on procentfree */
+};
+
+struct pid pid0 = {	/* static pid 0; pid_init() enters it in pidhash */
+	0,		/* pid_prinactive */
+	1,		/* pid_pgorphaned */
+	0,		/* pid_padding	*/
+	0,		/* pid_prslot	*/
+	0,		/* pid_id	*/
+	NULL,		/* pid_pglink	*/
+	NULL,		/* pid_link	*/
+	3		/* pid_ref	*/
+};
+
+static int pid_hashlen = 4;	/* desired average hash chain length */
+static int pid_hashsz;		/* bucket count; pid_init() makes it 2^n */
+/* Hash chain head for 'pid'; the mask relies on pid_hashsz being 2^n. */
+#define	HASHPID(pid)	(pidhash[((pid)&(pid_hashsz-1))])
+
+extern uint_t nproc;		/* set by pid_init(), dropped by pid_exit() */
+extern struct kmem_cache *process_cache;	/* pid_exit() frees to it */
+static void	upcount_init(void);	/* defined below; run by pid_init() */
+
+kmutex_t	pidlock;	/* global process lock */
+kmutex_t	pr_pidlock;	/* /proc global process lock */
+kcondvar_t	*pr_pid_cv;	/* for /proc, one per process slot */
+struct plock	*proc_lock;	/* persistent array of p_lock's */
+
+/*
+ * See the comment above pid_getlockslot() for a detailed explanation of this
+ * constant.  By the formula given there, a PLOCK_SHIFT of 3 corresponds to
+ * 64-byte coherence granularity with an 8-byte kmutex_t; if either of those
+ * ever changes, this constant should be modified to match so as to minimize
+ * proc_lock false sharing (correctness, however, is guaranteed regardless
+ * of the coherence granularity).
+ */
+#define	PLOCK_SHIFT	3
+
+static kmutex_t	pidlinklock;	/* held to walk/edit pidhash, procentfree */
+static struct pid **pidhash;	/* pid hash table, pid_hashsz buckets */
+static pid_t minpid;		/* pid_assign() wraps mpid back to this */
+static pid_t mpid;		/* last pid tried by pid_assign() */
+static union procent *procdir;	/* /proc directory, v.v_proc entries */
+static union procent *procentfree;	/* head of free procdir entries */
+
+static struct pid *
+pid_lookup(pid_t pid)
+{
+	struct pid *cur;
+
+	ASSERT(MUTEX_HELD(&pidlinklock));
+	/* Walk the hash chain for 'pid'; callers must hold pidlinklock. */
+	cur = HASHPID(pid);
+	while (cur != NULL && cur->pid_id != pid)
+		cur = cur->pid_link;
+
+	/* Anything found on a hash chain must hold at least one reference. */
+	ASSERT(cur == NULL || cur->pid_ref > 0);
+	return (cur);	/* NULL when no pid structure exists for 'pid' */
+}
+
+void
+pid_setmin(void)
+{
+	/* minpid is the pid that pid_assign() wraps back to at maxpid. */
+	int use_jump = (jump_pid != 0 && mpid < jump_pid);
+
+	minpid = use_jump ? (mpid = jump_pid) : mpid + 1;
+}
+
+/*
+ * If a prslot were used directly as the index of a process' p_lock,
+ * adjacent prslots would share adjacent p_locks.  On machines where a
+ * mutex is smaller than a cache line (which, as of this writing, is
+ * true for all machines on which Solaris runs), that can induce false
+ * sharing.  The standard remedy for false sharing is to pad out the data
+ * structure (here, struct plock), but given the size and (generally)
+ * sparse use of the proc_lock array that is suboptimal.  Instead we
+ * stride through the proc_lock array, with consecutive prslots landing
+ * (1 << PLOCK_SHIFT) lockslots apart.  PLOCK_SHIFT should be defined as:
+ *
+ *   log_2 (coherence_granularity / sizeof (kmutex_t))
+ *
+ * Under this scheme, false sharing is still possible -- but only when
+ * the number of active processes is very large.  Note that the one-to-one
+ * mapping between prslots and lockslots is maintained.
+ */
+static int
+pid_getlockslot(int prslot)
+{
+	int perlap = v.v_proc >> PLOCK_SHIFT;
+	int strided = perlap << PLOCK_SHIFT;
+
+	/* Slots past the last whole multiple of the stride map to themselves. */
+	if (prslot < strided)
+		return ((prslot / perlap) + ((prslot % perlap) << PLOCK_SHIFT));
+	return (prslot);
+}
+
+/*
+ * Assign a pid on behalf of a fork request.  This allocates a pid
+ * structure, claims a free /proc directory entry (a slot in the proc
+ * table) for prp, and picks a process id that is not currently in use.
+ *
+ * Returns the new pid on success, or -1 if no slot or no pid is free.
+ */
+pid_t
+pid_assign(proc_t *prp)
+{
+	struct pid *newpidp;
+	union procent *slot;
+	pid_t candidate, firstpid;
+
+	/* The pid structure is allocated before pidlinklock is taken. */
+	newpidp = kmem_zalloc(sizeof (*newpidp), KM_SLEEP);
+
+	mutex_enter(&pidlinklock);
+	slot = procentfree;
+	if (slot == NULL)
+		goto failed;	/* ran out of /proc directory entries */
+
+	/*
+	 * Allocate a pid: step mpid forward, wrapping from maxpid back to
+	 * minpid, until an unused value turns up or we return to the start.
+	 */
+	firstpid = mpid;
+	do {
+		if (++mpid == maxpid)
+			mpid = minpid;
+		candidate = mpid;
+	} while (pid_lookup(candidate) != NULL && candidate != firstpid);
+
+	/* Back at the start with that pid still in use: none are free. */
+	if (candidate == firstpid && pid_lookup(candidate) != NULL)
+		goto failed;
+
+	/* Take the directory entry off the free list and give it to prp. */
+	procentfree = slot->pe_next;
+	slot->pe_proc = prp;
+	prp->p_pidp = newpidp;
+
+	/* Fill in the pid and push it onto the front of its hash chain. */
+	newpidp->pid_id = candidate;
+	newpidp->pid_ref = 1;
+	newpidp->pid_prslot = slot - procdir;
+	newpidp->pid_link = HASHPID(candidate);
+	HASHPID(candidate) = newpidp;
+	/* The slot number also selects which proc_lock[] entry prp uses. */
+	prp->p_lockp = &proc_lock[pid_getlockslot(newpidp->pid_prslot)];
+	mutex_exit(&pidlinklock);
+
+	return (candidate);
+
+failed:
+	mutex_exit(&pidlinklock);
+	kmem_free(newpidp, sizeof (*newpidp));
+	return (-1);
+}
+
+/*
+ * Unlink pidp from its pid hash chain and free it; always returns 0.
+ */
+int
+pid_rele(struct pid *pidp)
+{
+	struct pid **linkp;
+
+	mutex_enter(&pidlinklock);
+	ASSERT(pidp != &pid0);
+
+	/* Find the link that points at pidp; it has to be on its chain. */
+	linkp = &HASHPID(pidp->pid_id);
+	while (*linkp != pidp) {
+		ASSERT(*linkp != NULL);
+		linkp = &(*linkp)->pid_link;
+	}
+
+	*linkp = pidp->pid_link;	/* splice pidp out of the chain */
+	mutex_exit(&pidlinklock);
+
+	/* The memory is handed back only after the lock is dropped. */
+	kmem_free(pidp, sizeof (struct pid));
+	return (0);
+}
+
+void
+proc_entry_free(struct pid *pidp)
+{
+	mutex_enter(&pidlinklock);
+	pidp->pid_prinactive = 1;	/* pid has no live proc slot now */
+	(procdir + pidp->pid_prslot)->pe_next = procentfree;
+	procentfree = procdir + pidp->pid_prslot;	/* new list head */
+	mutex_exit(&pidlinklock);
+}
+
+void
+pid_exit(proc_t *prp)
+{
+	struct pid *pidp;	/* prp's pid structure */
+
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	/*
+	 * Tear prp down for good.  First leave the process group; it is
+	 * NULL only if fork failed before calling pgjoin().
+	 */
+	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
+	if (prp->p_pgidp != NULL)
+		pgexit(prp);
+
+	SESS_RELE(prp->p_sessp);
+
+	pidp = prp->p_pidp;
+	/* Give prp's /proc directory entry back to the free list. */
+	proc_entry_free(pidp);
+
+#ifdef C2_AUDIT
+	if (audit_active)
+		audit_pfree(prp);
+#endif
+	/* Unlink prp from the p_next/p_prev list that practive heads. */
+	if (practive == prp) {
+		practive = prp->p_next;
+	}
+
+	if (prp->p_next) {
+		prp->p_next->p_prev = prp->p_prev;
+	}
+	if (prp->p_prev) {
+		prp->p_prev->p_next = prp->p_next;
+	}
+	/* PID_RELE() is defined elsewhere; presumably it drops prp's hold. */
+	PID_RELE(pidp);
+	/* Finally destroy p_crlock, free the proc_t and adjust nproc. */
+	mutex_destroy(&prp->p_crlock);
+	kmem_cache_free(process_cache, prp);
+	nproc--;
+}
+
+/*
+ * Look up the process with ID 'pid' if visible from 'zoneid'; else NULL.
+ */
+proc_t *
+prfind_zone(pid_t pid, zoneid_t zoneid)
+{
+	struct pid *pidp;
+	proc_t *found;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	mutex_enter(&pidlinklock);
+	pidp = pid_lookup(pid);
+	mutex_exit(&pidlinklock);
+	if (pidp == NULL || pidp->pid_prinactive != 0)
+		return (NULL);	/* no such pid, or its proc slot was freed */
+	found = procdir[pidp->pid_prslot].pe_proc;
+	if (zoneid != ALL_ZONES && found->p_zone->zone_id != zoneid)
+		return (NULL);	/* not visible from the given zone */
+	return (found);
+}
+
+/*
+ * Find a process given its process ID, honoring zone restrictions: a
+ * caller in a non-global zone will not find processes associated with
+ * other zones.  Use prfind_zone(pid, ALL_ZONES) to bypass the
+ * restriction.  As with prfind_zone(), pidlock must be held.
+ */
+proc_t *
+prfind(pid_t pid)
+{
+	zoneid_t zoneid = ALL_ZONES;
+
+	/* Only callers outside the global zone are confined to their zone. */
+	if (!INGLOBALZONE(curproc))
+		zoneid = getzoneid();
+
+	return (prfind_zone(pid, zoneid));
+}
+
+proc_t *
+pgfind_zone(pid_t pgid, zoneid_t zoneid)
+{
+	struct pid *pgidp;
+	proc_t *head;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+	/* Find the head of pgid's member list, subject to zone visibility. */
+	mutex_enter(&pidlinklock);
+	pgidp = pid_lookup(pgid);
+	mutex_exit(&pidlinklock);
+	if (pgidp == NULL)
+		return (NULL);	/* no pid structure, hence no such group */
+	head = pgidp->pid_pglink;
+	if (head != NULL && pgid != 0 && zoneid != ALL_ZONES &&
+	    head->p_zone->zone_id != zoneid)
+		return (NULL);	/* group is not visible from the given zone */
+	return (head);
+}
+
+/*
+ * Return the head of the list of processes whose process group ID is
+ * 'pgid', or NULL if there is no such group visible to the caller.
+ */
+proc_t *
+pgfind(pid_t pgid)
+{
+	/*
+	 * A caller in the global zone may see groups in every zone; any
+	 * other caller is limited to groups in its own zone.
+	 */
+	zoneid_t zoneid = INGLOBALZONE(curproc) ? ALL_ZONES : getzoneid();
+
+	return (pgfind_zone(pgid, zoneid));
+}
+
+/*
+ * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
+ * Returns the proc pointer (with p_lock held) on success, NULL on failure.
+ * sprlock() is really just a stripped-down version of pr_p_lock() to allow
+ * practive walkers like dofusers() and dumpsys() to synchronize with /proc.
+ */
+proc_t *
+sprlock_zone(pid_t pid, zoneid_t zoneid)
+{
+	proc_t *p;
+	kmutex_t *mp;
+
+	for (;;) {
+		mutex_enter(&pidlock);
+		if ((p = prfind_zone(pid, zoneid)) == NULL) {
+			mutex_exit(&pidlock);
+			return (NULL);
+		}
+		/*
+		 * p_lock is persistent, but p itself is not -- it could
+		 * vanish during cv_wait().  Load p->p_lock now so we can
+		 * drop it after cv_wait() without referencing p.
+		 */
+		mp = &p->p_lock;
+		mutex_enter(mp);
+		mutex_exit(&pidlock);
+		/*
+		 * If the process is in some half-baked state, fail.
+		 */
+		if (p->p_stat == SZOMB || p->p_stat == SIDL ||
+		    p->p_tlist == NULL || (p->p_flag & SEXITLWPS)) {
+			mutex_exit(mp);
+			return (NULL);
+		}
+		if (panicstr)
+			return (p);	/* panicking: P_PR_LOCK is not taken */
+		if (!(p->p_proc_flag & P_PR_LOCK))
+			break;
+		cv_wait(&pr_pid_cv[p->p_slot], mp);
+		mutex_exit(mp);	/* p may be gone; redo the lookup by pid */
+	}
+	p->p_proc_flag |= P_PR_LOCK;
+	THREAD_KPRI_REQUEST();
+	return (p);
+}
+
+proc_t *
+sprlock(pid_t pid)
+{
+	zoneid_t zoneid = ALL_ZONES;
+
+	/* sprlock_zone() for curproc; non-global callers stay in their zone. */
+	if (!INGLOBALZONE(curproc))
+		zoneid = getzoneid();
+
+	return (sprlock_zone(pid, zoneid));
+}
+
+void
+sprlock_proc(proc_t *p)
+{
+	ASSERT(MUTEX_HELD(&p->p_lock));
+
+	/* Sleep until no one else holds P_PR_LOCK, then claim it ourselves. */
+	while ((p->p_proc_flag & P_PR_LOCK) != 0)
+		cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
+
+	p->p_proc_flag |= P_PR_LOCK;
+	THREAD_KPRI_REQUEST();
+}
+
+void
+sprunlock(proc_t *p)
+{
+	if (!panicstr) {
+		ASSERT(p->p_proc_flag & P_PR_LOCK);
+		ASSERT(MUTEX_HELD(&p->p_lock));
+
+		cv_signal(&pr_pid_cv[p->p_slot]);
+		p->p_proc_flag &= ~P_PR_LOCK;
+		mutex_exit(&p->p_lock);
+		THREAD_KPRI_RELEASE();
+		return;
+	}
+	/* Panicking: sprlock_zone() set no P_PR_LOCK, so just drop p_lock. */
+	mutex_exit(&p->p_lock);
+}
+
+void
+pid_init(void)
+{
+	int slot;
+
+	/* Size the hash table for chains of roughly pid_hashlen entries. */
+	pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);
+	pidhash = kmem_zalloc(pid_hashsz * sizeof (*pidhash), KM_SLEEP);
+	procdir = kmem_alloc(v.v_proc * sizeof (*procdir), KM_SLEEP);
+	pr_pid_cv = kmem_zalloc(v.v_proc * sizeof (*pr_pid_cv), KM_SLEEP);
+	proc_lock = kmem_zalloc(v.v_proc * sizeof (*proc_lock), KM_SLEEP);
+
+	nproc = 1;	/* proc_sched is the only process so far */
+	proc_sched->p_next = NULL;
+	practive = proc_sched;
+	procdir[0].pe_proc = proc_sched;
+
+	/* Chain every remaining slot onto the free list, in slot order. */
+	procentfree = procdir + 1;
+	for (slot = 1; slot + 1 < v.v_proc; slot++)
+		procdir[slot].pe_next = procdir + slot + 1;
+	procdir[slot].pe_next = NULL;
+
+	HASHPID(0) = &pid0;	/* make pid 0 visible to pid_lookup() */
+	upcount_init();
+}
+
+proc_t *
+pid_entry(int slot)
+{
+	union procent *next;
+	proc_t *prp;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+	ASSERT(slot >= 0 && slot < v.v_proc);
+
+	/* A value that points back into procdir[] is a free-list link. */
+	next = procdir[slot].pe_next;
+	if (next >= procdir && next < procdir + v.v_proc)
+		return (NULL);
+	/* Otherwise the slot holds a proc (or NULL); hide SIDL processes. */
+	prp = procdir[slot].pe_proc;
+	return ((prp != NULL && prp->p_stat == SIDL) ? NULL : prp);
+}
+
+/*
+ * Send the specified signal to every process whose process group ID
+ * is equal to 'pgid'.  Nothing is sent when pgid is 0 or no pid
+ * structure exists for it.
+ */
+void
+signal(pid_t pgid, int sig)
+{
+	struct pid *pgidp = NULL;
+	proc_t *member;
+
+	mutex_enter(&pidlock);
+	mutex_enter(&pidlinklock);
+	if (pgid != 0)
+		pgidp = pid_lookup(pgid);
+	mutex_exit(&pidlinklock);
+	/* With no pid structure to walk there is nobody to signal. */
+	member = (pgidp == NULL) ? NULL : pgidp->pid_pglink;
+	while (member != NULL) {
+		mutex_enter(&member->p_lock);
+		sigtoproc(member, NULL, sig);
+		mutex_exit(&member->p_lock);
+		member = member->p_pglink;
+	}
+	mutex_exit(&pidlock);
+}
+
+/*
+ * Send 'sig' to the process behind pidp, unless pidp is marked inactive.
+ */
+void
+prsignal(struct pid *pidp, int sig)
+{
+	if (pidp->pid_prinactive)
+		return;
+	psignal(procdir[pidp->pid_prslot].pe_proc, sig);
+}
+
+#include <sys/sunddi.h>
+
+/*
+ * DDI/DKI interfaces for drivers to send signals to processes
+ */
+
+/*
+ * Take a hold on the caller's pid and return it as an opaque reference.
+ */
+void *
+proc_ref(void)
+{
+	struct pid *held;
+
+	/* The hold is taken under pidlock, as is the release in proc_unref(). */
+	mutex_enter(&pidlock);
+	held = curproc->p_pidp;
+	PID_HOLD(held);
+	mutex_exit(&pidlock);
+	return (held);
+}
+
+/*
+ * Release one proc_ref() reference; the process may have exited already.
+ */
+void
+proc_unref(void *pref)
+{
+	struct pid *pidp = pref;	/* the opaque value from proc_ref() */
+
+	mutex_enter(&pidlock);
+	PID_RELE(pidp);
+	mutex_exit(&pidlock);
+}
+
+/*
+ * Send signal 'sig' to the process referenced by 'pref' (from proc_ref()).
+ * Returns -1 if the process has gone away, 0 if it is still there.  The
+ * check is made after the attempt, so -1 does not say whether it was sent.
+ */
+int
+proc_signal(void *pref, int sig)
+{
+	struct pid *pidp = pref;
+
+	prsignal(pidp, sig);
+	if (pidp->pid_prinactive)
+		return (-1);
+	return (0);
+}
+
+
+static struct upcount	**upc_hash;	/* a boot time allocated array */
+static ulong_t		upc_hashmask;	/* bucket count - 1; upcount_init() */
+#define	UPC_HASH(x, y)	((ulong_t)((x) ^ (y)) & upc_hashmask)
+
+/*
+ * Get us off the ground.  Called once at boot, from pid_init().
+ */
+static void
+upcount_init(void)
+{
+	ulong_t	upc_hashsize;
+
+	/*
+	 * An entry per MB of memory is our current guess.
+	 *
+	 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT converts
+	 * pages to megs (without overflowing a u_int if you have more
+	 * than 4G of memory, like ptob(physmem)/1M would).  The shift
+	 * is done on an unsigned long constant so that the table size
+	 * itself cannot overflow a plain int either.
+	 */
+	upc_hashsize = 1UL << highbit(physmem >> (20 - PAGESHIFT));
+	upc_hashmask = upc_hashsize - 1;
+	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
+	    KM_SLEEP);
+}
+
+/*
+ * Count one more process for <uid,zoneid>.  pidlock may be dropped briefly.
+ */
+void
+upcount_inc(uid_t uid, zoneid_t zoneid)
+{
+	struct upcount	**bucket;
+	struct upcount	*up;
+	struct upcount	*spare = NULL;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+	bucket = &upc_hash[UPC_HASH(uid, zoneid)];
+
+	for (;;) {
+		/* An existing <uid,zoneid> entry just gets its count bumped. */
+		for (up = *bucket; up != NULL; up = up->up_next) {
+			if (up->up_uid != uid || up->up_zoneid != zoneid)
+				continue;
+			up->up_count++;
+			if (spare != NULL) {
+				/*
+				 * An entry appeared while pidlock was
+				 * dropped; `spare' is not needed after all.
+				 */
+				kmem_free(spare, sizeof (*spare));
+			}
+			return;
+		}
+
+		/*
+		 * Not found.  With a spare in hand this was the re-scan
+		 * after a sleeping allocation, so go and insert it.
+		 */
+		if (spare != NULL)
+			break;
+		spare = kmem_alloc(sizeof (*spare), KM_NOSLEEP);
+		if (spare != NULL)
+			break;
+
+		/*
+		 * The non-sleeping attempt failed.  Sleep for the memory
+		 * with pidlock dropped, then check the chain again.
+		 */
+		mutex_exit(&pidlock);
+		spare = kmem_alloc(sizeof (*spare), KM_SLEEP);
+		mutex_enter(&pidlock);
+	}
+
+	/*
+	 * On the assumption that a new user is going to do some
+	 * more forks, put the new upcount structure on the front.
+	 */
+	spare->up_uid = uid;
+	spare->up_zoneid = zoneid;
+	spare->up_count = 1;
+	spare->up_next = *bucket;
+	*bucket = spare;
+}
+
+/*
+ * Count one process fewer for <uid,zoneid>, freeing the entry at zero.
+ */
+void
+upcount_dec(uid_t uid, zoneid_t zoneid)
+{
+	struct upcount	**linkp;
+	struct upcount	*up;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	linkp = &upc_hash[UPC_HASH(uid, zoneid)];
+	for (; (up = *linkp) != NULL; linkp = &up->up_next) {
+		if (up->up_uid != uid || up->up_zoneid != zoneid)
+			continue;
+		/* Found it; the last process to leave retires the entry. */
+		if (--up->up_count == 0) {
+			*linkp = up->up_next;
+			kmem_free(up, sizeof (*up));
+		}
+		return;
+	}
+
+	/* No entry at all: this decrement matches no upcount_inc(). */
+	cmn_err(CE_PANIC, "decr_upcount-off the end");
+}
+
+/*
+ * Returns the number of processes a uid has within the given zone.
+ * Non-existent <uid,zoneid> pairs are assumed to have no processes.
+ */
+int
+upcount_get(uid_t uid, zoneid_t zoneid)
+{
+	struct upcount *up;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	/* Scan the <uid,zoneid> hash chain for an exact match. */
+	for (up = upc_hash[UPC_HASH(uid, zoneid)]; up != NULL;
+	    up = up->up_next) {
+		if (up->up_uid == uid && up->up_zoneid == zoneid)
+			return (up->up_count);
+	}
+
+	return (0);	/* no entry: the pair has no processes */
+}