diff options
author | nn35248 <none@none> | 2006-09-11 22:51:59 -0700 |
---|---|---|
committer | nn35248 <none@none> | 2006-09-11 22:51:59 -0700 |
commit | 9acbbeaf2a1ffe5c14b244867d427714fab43c5c (patch) | |
tree | d1ecd54896325c19a463220e9cbc50864874fc82 /usr/src/uts/common | |
parent | da51466dc253d7c98dda4956059042bd0c476328 (diff) | |
download | illumos-gate-9acbbeaf2a1ffe5c14b244867d427714fab43c5c.tar.gz |
PSARC/2005/471 BrandZ: Support for non-native zones
6374606 ::nm -D without an object may not work on processes in zones
6409350 BrandZ project integration into Solaris
6455289 pthread_setschedparam() should return EPERM rather than panic libc
6455591 setpriority(3C) gets errno wrong for deficient privileges failure
6458178 fifofs doesn't support lofs mounts of fifos
6460380 Attempted open() of a symlink with the O_NOFOLLOW flag set returns EINVAL, not ELOOP
6463857 renice(1) errors erroneously
--HG--
rename : usr/src/lib/libzonecfg/zones/SUNWblank.xml => usr/src/lib/brand/native/zone/SUNWblank.xml
rename : usr/src/lib/libzonecfg/zones/SUNWdefault.xml => usr/src/lib/brand/native/zone/SUNWdefault.xml
Diffstat (limited to 'usr/src/uts/common')
101 files changed, 17861 insertions, 610 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index b022fcd0c9..f0203dfeb9 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -41,6 +41,7 @@ sparc_CORE_OBJS += COMMON_CORE_OBJS += \ atomic.o \ bp_map.o \ + brand.o \ chip.o \ cpu.o \ cpu_intr.o \ @@ -78,6 +79,7 @@ GENUNIX_OBJS += \ bio.o \ bitmap.o \ blabel.o \ + brandsys.o \ callb.o \ callout.o \ chdir.o \ @@ -318,6 +320,7 @@ GENUNIX_OBJS += \ urw.o \ utime.o \ utssys.o \ + uucopy.o \ vfs.o \ vfs_conf.o \ vmem.o \ @@ -360,6 +363,8 @@ PROFILE_OBJS += profile.o SYSTRACE_OBJS += systrace.o +LX_SYSTRACE_OBJS += lx_systrace.o + LOCKSTAT_OBJS += lockstat.o FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o @@ -397,6 +402,10 @@ PTSL_OBJS += tty_pts.o PTM_OBJS += ptm.o +LX_PTM_OBJS += lx_ptm.o + +LX_AUDIO_OBJS += lx_audio.o + PTS_OBJS += pts.o PTY_OBJS += ptms_conf.o @@ -937,6 +946,8 @@ DEDUMP_OBJS += dedump.o DRCOMPAT_OBJS += drcompat.o +LDLINUX_OBJS += ldlinux.o + LDTERM_OBJS += ldterm.o uwidth.o PCKT_OBJS += pckt.o diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 69e32b7ee5..27b347c937 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -70,6 +70,10 @@ $(OBJS_DIR)/%.o: $(COMMONBASE)/avl/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/sn1/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/c2/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -911,6 +915,9 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/acl/%.c $(LINTS_DIR)/%.ln: $(COMMONBASE)/avl/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/brand/sn1/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/c2/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/brand/lx/autofs/lx_autofs.c b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c new file mode 100644 index 0000000000..ecd4e8e44d --- /dev/null +++ 
b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c @@ -0,0 +1,1558 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <fs/fs_subr.h> +#include <sys/atomic.h> +#include <sys/cmn_err.h> +#include <sys/dirent.h> +#include <sys/fs/fifonode.h> +#include <sys/modctl.h> +#include <sys/mount.h> +#include <sys/policy.h> +#include <sys/sunddi.h> + +#include <sys/sysmacros.h> +#include <sys/vfs.h> + +#include <sys/lx_autofs_impl.h> + +/* + * External functions + */ +extern uintptr_t space_fetch(char *key); +extern int space_store(char *key, uintptr_t ptr); + +/* + * Globals + */ +static vfsops_t *lx_autofs_vfsops; +static vnodeops_t *lx_autofs_vn_ops = NULL; +static int lx_autofs_fstype; +static major_t lx_autofs_major; +static minor_t lx_autofs_minor = 0; + +/* + * Support functions + */ +static void +i_strfree(char *str) +{ + kmem_free(str, strlen(str) + 1); +} + +static char * +i_strdup(char *str) +{ + int n = strlen(str); + char *ptr = kmem_alloc(n + 1, KM_SLEEP); + bcopy(str, ptr, n + 1); + return (ptr); +} + +static int +i_str_to_int(char *str, int *val) +{ + long res; + + if (str == NULL) + return (-1); + + if ((ddi_strtol(str, NULL, 10, &res) != 0) || + (res < INT_MIN) || (res > INT_MAX)) + return (-1); + + *val = res; + return (0); +} + +static void +i_stack_init(list_t *lp) +{ + list_create(lp, + sizeof (stack_elem_t), offsetof(stack_elem_t, se_list)); +} + +static void +i_stack_fini(list_t *lp) +{ + ASSERT(list_head(lp) == NULL); + list_destroy(lp); +} + +static void +i_stack_push(list_t *lp, caddr_t ptr1, caddr_t ptr2, caddr_t ptr3) +{ + stack_elem_t *se; + + se = kmem_alloc(sizeof (*se), KM_SLEEP); + se->se_ptr1 = ptr1; + se->se_ptr2 = ptr2; + se->se_ptr3 = ptr3; + list_insert_head(lp, se); +} + +static int +i_stack_pop(list_t *lp, caddr_t *ptr1, caddr_t *ptr2, caddr_t *ptr3) +{ + stack_elem_t *se; + + if ((se = list_head(lp)) == NULL) + return (-1); + list_remove(lp, se); + if (ptr1 != NULL) + *ptr1 = se->se_ptr1; + if (ptr2 != NULL) + *ptr2 = se->se_ptr2; + if (ptr3 != NULL) + *ptr3 = se->se_ptr3; + kmem_free(se, sizeof 
(*se)); + return (0); +} + +static vnode_t * +fifo_peer_vp(vnode_t *vp) +{ + fifonode_t *fnp = VTOF(vp); + fifonode_t *fn_dest = fnp->fn_dest; + return (FTOV(fn_dest)); +} + +static vnode_t * +i_vn_alloc(vfs_t *vfsp, vnode_t *uvp) +{ + lx_autofs_vfs_t *data = vfsp->vfs_data; + vnode_t *vp, *vp_old; + + /* Allocate a new vnode structure in case we need it. */ + vp = vn_alloc(KM_SLEEP); + vn_setops(vp, lx_autofs_vn_ops); + VN_SET_VFS_TYPE_DEV(vp, vfsp, uvp->v_type, uvp->v_rdev); + vp->v_data = uvp; + ASSERT(vp->v_count == 1); + + /* + * Take a hold on the vfs structure. This is how unmount will + * determine if there are any active vnodes in the file system. + */ + VFS_HOLD(vfsp); + + /* + * Check if we already have a vnode allocated for this underlying + * vnode_t. + */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t *)&vp_old) != 0) { + + /* + * Didn't find an existing node. + * Add this node to the hash and return. + */ + VERIFY(mod_hash_insert(data->lav_vn_hash, + (mod_hash_key_t)uvp, + (mod_hash_val_t)vp) == 0); + mutex_exit(&data->lav_lock); + return (vp); + } + + /* Get a hold on the existing vnode and free up the one we allocated. */ + VN_HOLD(vp_old); + mutex_exit(&data->lav_lock); + + /* Free up the new vnode we allocated. */ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); + + return (vp_old); +} + +static void +i_vn_free(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + lx_autofs_vfs_t *data = vfsp->vfs_data; + vnode_t *uvp = vp->v_data; + vnode_t *vp_tmp; + + ASSERT(MUTEX_HELD((&data->lav_lock))); + ASSERT(MUTEX_HELD((&vp->v_lock))); + + ASSERT(vp->v_count == 0); + + /* We're about to free this vnode so take it out of the hash. */ + (void) mod_hash_remove(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t)&vp_tmp); + + /* + * No one else can lookup this vnode any more so there's no need + * to hold locks. 
+ */ + mutex_exit(&data->lav_lock); + mutex_exit(&vp->v_lock); + + /* Release the underlying vnode. */ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); +} + +static lx_autofs_lookup_req_t * +i_lalr_alloc(lx_autofs_vfs_t *data, int *dup_request, char *nm) +{ + lx_autofs_lookup_req_t *lalr, *lalr_dup; + + /* Pre-allocate a new automounter request before grabbing locks. */ + lalr = kmem_zalloc(sizeof (*lalr), KM_SLEEP); + mutex_init(&lalr->lalr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&lalr->lalr_cv, NULL, CV_DEFAULT, NULL); + lalr->lalr_ref = 1; + lalr->lalr_pkt.lap_protover = LX_AUTOFS_PROTO_VERSION; + + /* Assign a unique id for this request. */ + lalr->lalr_pkt.lap_id = id_alloc(data->lav_ids); + + /* + * The token expected by the linux automount is the name of + * the directory entry to look up. (And not the entire + * path that is being accessed.) + */ + lalr->lalr_pkt.lap_name_len = strlen(nm); + if (lalr->lalr_pkt.lap_name_len > + (sizeof (lalr->lalr_pkt.lap_name) - 1)) { + zcmn_err(getzoneid(), CE_NOTE, + "invalid autofs lookup: \"%s\"", nm); + id_free(data->lav_ids, lalr->lalr_pkt.lap_id); + kmem_free(lalr, sizeof (*lalr)); + return (NULL); + } + (void) strlcpy(lalr->lalr_pkt.lap_name, nm, + sizeof (lalr->lalr_pkt.lap_name)); + + /* Check for an outstanding request for this path. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_path_hash, + (mod_hash_key_t)nm, (mod_hash_val_t *)&lalr_dup) == 0) { + /* + * There's already an outstanding request for this + * path so we don't need a new one. + */ + id_free(data->lav_ids, lalr->lalr_pkt.lap_id); + kmem_free(lalr, sizeof (*lalr)); + lalr = lalr_dup; + + /* Bump the ref count on the old request. */ + atomic_add_int(&lalr->lalr_ref, 1); + + *dup_request = 1; + } else { + /* Add it to the hashes. 
*/ + VERIFY(mod_hash_insert(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)lalr->lalr_pkt.lap_id, + (mod_hash_val_t)lalr) == 0); + VERIFY(mod_hash_insert(data->lav_path_hash, + (mod_hash_key_t)i_strdup(nm), + (mod_hash_val_t)lalr) == 0); + + *dup_request = 0; + } + mutex_exit(&data->lav_lock); + + return (lalr); +} + +static lx_autofs_lookup_req_t * +i_lalr_find(lx_autofs_vfs_t *data, int id) +{ + lx_autofs_lookup_req_t *lalr; + + /* Check for an outstanding request for this id. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_id_hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&lalr) != 0) { + mutex_exit(&data->lav_lock); + return (NULL); + } + atomic_add_int(&lalr->lalr_ref, 1); + mutex_exit(&data->lav_lock); + return (lalr); +} + +static void +i_lalr_complete(lx_autofs_vfs_t *data, lx_autofs_lookup_req_t *lalr) +{ + lx_autofs_lookup_req_t *lalr_tmp; + + /* Remove this request from the hashes so no one can look it up. */ + mutex_enter(&data->lav_lock); + (void) mod_hash_remove(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)lalr->lalr_pkt.lap_id, + (mod_hash_val_t)&lalr_tmp); + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)lalr->lalr_pkt.lap_name, + (mod_hash_val_t)&lalr_tmp); + mutex_exit(&data->lav_lock); + + /* Mark this requst as complete and wakeup anyone waiting on it. */ + mutex_enter(&lalr->lalr_lock); + lalr->lalr_complete = 1; + cv_broadcast(&lalr->lalr_cv); + mutex_exit(&lalr->lalr_lock); +} + +static void +i_lalr_release(lx_autofs_vfs_t *data, lx_autofs_lookup_req_t *lalr) +{ + ASSERT(!MUTEX_HELD(&lalr->lalr_lock)); + if (atomic_add_int_nv(&lalr->lalr_ref, -1) > 0) + return; + ASSERT(lalr->lalr_ref == 0); + id_free(data->lav_ids, lalr->lalr_pkt.lap_id); + kmem_free(lalr, sizeof (*lalr)); +} + +static void +i_lalr_abort(lx_autofs_vfs_t *data, lx_autofs_lookup_req_t *lalr) +{ + lx_autofs_lookup_req_t *lalr_tmp; + + /* + * This is a little tricky. We're aborting the wait for this + * request. 
So if anyone else is waiting for this request we + * can't free it, but if no one else is waiting for the request + * we should free it. + */ + mutex_enter(&data->lav_lock); + if (atomic_add_int_nv(&lalr->lalr_ref, -1) > 0) { + mutex_exit(&data->lav_lock); + return; + } + ASSERT(lalr->lalr_ref == 0); + + /* Remove this request from the hashes so no one can look it up. */ + (void) mod_hash_remove(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)lalr->lalr_pkt.lap_id, + (mod_hash_val_t)&lalr_tmp); + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)lalr->lalr_pkt.lap_name, + (mod_hash_val_t)&lalr_tmp); + mutex_exit(&data->lav_lock); + + /* It's ok to free this now because the ref count was zero. */ + id_free(data->lav_ids, lalr->lalr_pkt.lap_id); + kmem_free(lalr, sizeof (*lalr)); +} + +static int +i_fifo_lookup(pid_t pgrp, int fd, file_t **fpp_wr, file_t **fpp_rd) +{ + proc_t *prp; + uf_info_t *fip; + uf_entry_t *ufp_wr, *ufp_rd; + file_t *fp_wr, *fp_rd; + vnode_t *vp_wr, *vp_rd; + int i; + + /* + * sprlock() is zone aware, so assuming this mount call was + * initiated by a process in a zone, if it tries to specify + * a pgrp outside of it's zone this call will fail. + * + * Also, we want to grab hold of the main automounter process + * and its going to be the group leader for pgrp, so its + * pid will be equal to pgrp. + */ + prp = sprlock(pgrp); + if (prp == NULL) + return (-1); + mutex_exit(&prp->p_lock); + + /* Now we want to access the processes open file descriptors. */ + fip = P_FINFO(prp); + mutex_enter(&fip->fi_lock); + + /* Sanity check fifo write fd. */ + if (fd >= fip->fi_nfiles) { + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* Get a pointer to the write fifo. */ + UF_ENTER(ufp_wr, fip, fd); + if (((fp_wr = ufp_wr->uf_file) == NULL) || + ((vp_wr = fp_wr->f_vnode) == NULL) || (vp_wr->v_type != VFIFO)) { + /* Invalid fifo fd. 
*/ + UF_EXIT(ufp_wr); + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * Now we need to find the read end of the fifo (for reasons + * explained below.) We assume that the read end of the fifo + * is in the same process as the write end. + */ + vp_rd = fifo_peer_vp(fp_wr->f_vnode); + for (i = 0; i < fip->fi_nfiles; i++) { + UF_ENTER(ufp_rd, fip, i); + if (((fp_rd = ufp_rd->uf_file) != NULL) && + (fp_rd->f_vnode == vp_rd)) + break; + UF_EXIT(ufp_rd); + } + if (i == fip->fi_nfiles) { + /* Didn't find it. */ + UF_EXIT(ufp_wr); + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * We need to drop fi_lock before we can try to aquire f_tlock + * the good news is that the file pointers are protected because + * we're still holding uf_lock. + */ + mutex_exit(&fip->fi_lock); + + /* + * Here we bump the open counts on the fifos. The reason + * that we do this is because when we go to write to the + * fifo we want to ensure that they are actually open (and + * not in the process of being closed) without having to + * stop the automounter. (If the write end of the fifo + * were closed and we tried to write to it we would panic. + * If the read end of the fifo was closed and we tried to + * write to the other end, the process that invoked the + * lookup operation would get an unexpected SIGPIPE.) + */ + mutex_enter(&fp_wr->f_tlock); + fp_wr->f_count++; + ASSERT(fp_wr->f_count >= 2); + mutex_exit(&fp_wr->f_tlock); + + mutex_enter(&fp_rd->f_tlock); + fp_rd->f_count++; + ASSERT(fp_rd->f_count >= 2); + mutex_exit(&fp_rd->f_tlock); + + /* Release all our locks. */ + UF_EXIT(ufp_wr); + UF_EXIT(ufp_rd); + mutex_enter(&prp->p_lock); + sprunlock(prp); + + /* Return the file pointers. 
*/ + *fpp_rd = fp_rd; + *fpp_wr = fp_wr; + return (0); +} + +static uint_t +/*ARGSUSED*/ +i_fifo_close_cb(mod_hash_key_t key, mod_hash_val_t *val, void *arg) +{ + int *id = (int *)arg; + /* Return the key and terminate the walk. */ + *id = (uintptr_t)key; + return (MH_WALK_TERMINATE); +} + +static void +i_fifo_close(lx_autofs_vfs_t *data) +{ + /* + * Close the fifo to prevent any future requests from + * getting sent to the automounter. + */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr != NULL) { + (void) closef(data->lav_fifo_wr); + data->lav_fifo_wr = NULL; + } + if (data->lav_fifo_rd != NULL) { + (void) closef(data->lav_fifo_rd); + data->lav_fifo_rd = NULL; + } + mutex_exit(&data->lav_lock); + + /* + * Wakeup any threads currently waiting for the automounter + * note that it's possible for multiple threads to have entered + * this function and to be doing the work below simultaneously. + */ + for (;;) { + lx_autofs_lookup_req_t *lalr; + int id; + + /* Lookup the first entry in the hash. */ + id = -1; + mod_hash_walk(data->lav_id_hash, + i_fifo_close_cb, &id); + if (id == -1) { + /* No more id's in the hash. */ + break; + } + if ((lalr = i_lalr_find(data, id)) == NULL) { + /* Someone else beat us to it. */ + continue; + } + + /* Mark the request as compleate and release it. */ + i_lalr_complete(data, lalr); + i_lalr_release(data, lalr); + } +} + +static int +i_fifo_verify_rd(lx_autofs_vfs_t *data) +{ + proc_t *prp; + uf_info_t *fip; + uf_entry_t *ufp_rd; + file_t *fp_rd; + vnode_t *vp_rd; + int i; + + ASSERT(MUTEX_HELD((&data->lav_lock))); + + /* Check if we've already been shut down. */ + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + return (-1); + } + vp_rd = fifo_peer_vp(data->lav_fifo_wr->f_vnode); + + /* + * sprlock() is zone aware, so assuming this mount call was + * initiated by a process in a zone, if it tries to specify + * a pgrp outside of it's zone this call will fail. 
+ * + * Also, we want to grab hold of the main automounter process + * and its going to be the group leader for pgrp, so its + * pid will be equal to pgrp. + */ + prp = sprlock(data->lav_pgrp); + if (prp == NULL) + return (-1); + mutex_exit(&prp->p_lock); + + /* Now we want to access the processes open file descriptors. */ + fip = P_FINFO(prp); + mutex_enter(&fip->fi_lock); + + /* + * Now we need to find the read end of the fifo (for reasons + * explained below.) We assume that the read end of the fifo + * is in the same process as the write end. + */ + for (i = 0; i < fip->fi_nfiles; i++) { + UF_ENTER(ufp_rd, fip, i); + if (((fp_rd = ufp_rd->uf_file) != NULL) && + (fp_rd->f_vnode == vp_rd)) + break; + UF_EXIT(ufp_rd); + } + if (i == fip->fi_nfiles) { + /* Didn't find it. */ + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * Seems the automounter still has the read end of the fifo + * open, we're done here. Release all our locks and exit. + */ + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp_rd); + mutex_enter(&prp->p_lock); + sprunlock(prp); + + return (0); +} + +static int +i_fifo_write(lx_autofs_vfs_t *data, lx_autofs_pkt_t *lap) +{ + struct uio uio; + struct iovec iov; + file_t *fp_wr, *fp_rd; + int error; + + /* + * The catch here is we need to make sure _we_ don't close + * the the fifo while writing to it. (Another thread could come + * along and realize the automounter process is gone and close + * the fifo. To do this we bump the open count before we + * write to the fifo. + */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + return (ENOENT); + } + fp_wr = data->lav_fifo_wr; + fp_rd = data->lav_fifo_rd; + + /* Bump the open count on the write fifo. */ + mutex_enter(&fp_wr->f_tlock); + fp_wr->f_count++; + mutex_exit(&fp_wr->f_tlock); + + /* Bump the open count on the read fifo. 
*/ + mutex_enter(&fp_rd->f_tlock); + fp_rd->f_count++; + mutex_exit(&fp_rd->f_tlock); + + mutex_exit(&data->lav_lock); + + iov.iov_base = (caddr_t)lap; + iov.iov_len = sizeof (*lap); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_loffset = 0; + uio.uio_segflg = (short)UIO_SYSSPACE; + uio.uio_resid = sizeof (*lap); + uio.uio_llimit = 0; + uio.uio_fmode = FWRITE | FNDELAY | FNONBLOCK; + + error = VOP_WRITE(fp_wr->f_vnode, &uio, 0, kcred, NULL); + (void) closef(fp_wr); + (void) closef(fp_rd); + + /* + * After every write we verify that the automounter still has + * these files open. + */ + mutex_enter(&data->lav_lock); + if (i_fifo_verify_rd(data) != 0) { + /* + * Something happened to the automounter. + * Close down the communication pipe we setup. + */ + mutex_exit(&data->lav_lock); + i_fifo_close(data); + if (error != 0) + return (error); + return (ENOENT); + } + mutex_exit(&data->lav_lock); + + return (error); +} + +static int +i_bs_readdir(vnode_t *dvp, list_t *dir_stack, list_t *file_stack) +{ + struct iovec iov; + struct uio uio; + dirent64_t *dp, *dbuf; + vnode_t *vp; + size_t dlen, dbuflen; + int eof, error, ndirents = 64; + char *nm; + + dlen = ndirents * (sizeof (*dbuf)); + dbuf = kmem_alloc(dlen, KM_SLEEP); + + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = UIO_COPY_CACHED; + uio.uio_loffset = 0; + uio.uio_llimit = MAXOFFSET_T; + + eof = 0; + error = 0; + while (!error && !eof) { + uio.uio_resid = dlen; + iov.iov_base = (char *)dbuf; + iov.iov_len = dlen; + + (void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL); + if (VOP_READDIR(dvp, &uio, kcred, &eof) != 0) { + VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); + kmem_free(dbuf, dlen); + return (-1); + } + VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); + + if ((dbuflen = dlen - uio.uio_resid) == 0) { + /* We're done. 
*/ + break; + } + + for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen); + dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) { + + nm = dp->d_name; + + if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0) + continue; + + if (VOP_LOOKUP(dvp, + nm, &vp, NULL, 0, NULL, kcred) != 0) { + kmem_free(dbuf, dlen); + return (-1); + } + if (vp->v_type == VDIR) { + if (dir_stack != NULL) { + i_stack_push(dir_stack, (caddr_t)dvp, + (caddr_t)vp, i_strdup(nm)); + } else { + VN_RELE(vp); + } + } else { + if (file_stack != NULL) { + i_stack_push(file_stack, (caddr_t)dvp, + (caddr_t)vp, i_strdup(nm)); + } else { + VN_RELE(vp); + } + } + } + } + kmem_free(dbuf, dlen); + return (0); +} + +static void +i_bs_destroy(vnode_t *dvp, char *path) +{ + list_t search_stack; + list_t dir_stack; + list_t file_stack; + vnode_t *pdvp, *vp; + char *dpath, *fpath; + int ret; + + if (VOP_LOOKUP(dvp, path, &vp, NULL, 0, NULL, kcred) != 0) { + /* A directory entry with this name doesn't actually exist. */ + return; + } + + if ((vp->v_type & VDIR) == 0) { + /* Easy, the directory entry is a file so delete it. */ + VN_RELE(vp); + (void) VOP_REMOVE(dvp, path, kcred); + return; + } + + /* + * The directory entry is a subdirectory, now we have a bit more + * work to do. (We'll have to recurse into the sub directory.) + * It would have been much easier to do this recursively but kernel + * stacks are notoriously small. + */ + i_stack_init(&search_stack); + i_stack_init(&dir_stack); + i_stack_init(&file_stack); + + /* Save our newfound subdirectory into a list. */ + i_stack_push(&search_stack, (caddr_t)dvp, (caddr_t)vp, i_strdup(path)); + + /* Do a recursive depth first search into the subdirectories. */ + while (i_stack_pop(&search_stack, + (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) { + + /* Get a list of the subdirectories in this directory. */ + if (i_bs_readdir(dvp, &search_stack, NULL) != 0) + goto exit; + + /* Save the current directory a seperate stack. 
*/ + i_stack_push(&dir_stack, (caddr_t)pdvp, (caddr_t)dvp, dpath); + } + + /* + * Now dir_stack contains a list of directories, the deepest paths + * are at the top of the list. So let's go through and process them. + */ + while (i_stack_pop(&dir_stack, + (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) { + + /* Get a list of the files in this directory. */ + if (i_bs_readdir(dvp, NULL, &file_stack) != 0) { + VN_RELE(dvp); + i_strfree(dpath); + goto exit; + } + + /* Delete all the files in this directory. */ + while (i_stack_pop(&file_stack, + NULL, (caddr_t *)&vp, &fpath) == 0) { + VN_RELE(vp) + ret = VOP_REMOVE(dvp, fpath, kcred); + i_strfree(fpath); + if (ret != 0) { + i_strfree(dpath); + goto exit; + } + } + + /* Delete this directory. */ + VN_RELE(dvp); + ret = VOP_RMDIR(pdvp, dpath, pdvp, kcred); + i_strfree(dpath); + if (ret != 0) + goto exit; + } + +exit: + while ( + (i_stack_pop(&search_stack, NULL, (caddr_t *)&vp, &path) == 0) || + (i_stack_pop(&dir_stack, NULL, (caddr_t *)&vp, &path) == 0) || + (i_stack_pop(&file_stack, NULL, (caddr_t *)&vp, &path) == 0)) { + VN_RELE(vp); + i_strfree(path); + } + i_stack_fini(&search_stack); + i_stack_fini(&dir_stack); + i_stack_fini(&file_stack); +} + +static vnode_t * +i_bs_create(vnode_t *dvp, char *bs_name) +{ + vnode_t *vp; + vattr_t vattr; + + /* + * After looking at the mkdir syscall path it seems we don't need + * to initialize all of the vattr_t structure. + */ + bzero(&vattr, sizeof (vattr)); + vattr.va_type = VDIR; + vattr.va_mode = 0755; /* u+rwx,og=rx */ + vattr.va_mask = AT_TYPE|AT_MODE; + + if (VOP_MKDIR(dvp, bs_name, &vattr, &vp, kcred) != 0) + return (NULL); + return (vp); +} + +static int +i_automounter_call(vnode_t *dvp, char *nm) +{ + lx_autofs_lookup_req_t *lalr; + lx_autofs_vfs_t *data; + int error, dup_request; + + /* Get a pointer to the vfs mount data. */ + data = dvp->v_vfsp->vfs_data; + + /* The automounter only support queries in the root directory. 
*/ + if (dvp != data->lav_root) + return (ENOENT); + + /* + * Check if the current process is in the automounters process + * group. (If it is, the current process is either the autmounter + * itself or one of it's forked child processes.) If so, don't + * redirect this lookup back into the automounter because we'll + * hang. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp == curproc->p_pgrp) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + /* Verify that the automount process pipe still exists. */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + return (ENOENT); + } + mutex_exit(&data->lav_lock); + + /* Allocate an automounter request structure. */ + if ((lalr = i_lalr_alloc(data, &dup_request, nm)) == NULL) + return (ENOENT); + + /* + * If we were the first one to allocate this request then we + * need to send it to the automounter. + */ + if ((!dup_request) && + ((error = i_fifo_write(data, &lalr->lalr_pkt)) != 0)) { + /* + * Unable to send the request to the automounter. + * Unblock any other threads waiting on the request + * and release the request. + */ + i_lalr_complete(data, lalr); + i_lalr_release(data, lalr); + return (error); + } + + /* Wait for someone to signal us that this request has compleated. */ + mutex_enter(&lalr->lalr_lock); + while (!lalr->lalr_complete) { + if (cv_wait_sig(&lalr->lalr_cv, &lalr->lalr_lock) == 0) { + /* We got a signal, abort this lookup. */ + mutex_exit(&lalr->lalr_lock); + i_lalr_abort(data, lalr); + return (EINTR); + } + } + mutex_exit(&lalr->lalr_lock); + i_lalr_release(data, lalr); + + return (0); +} + +static int +i_automounter_ioctl(vnode_t *vp, int cmd, intptr_t arg) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + + /* + * Be strict. + * We only accept ioctls from the automounter process group. 
+ */ + mutex_enter(&pidlock); + if (data->lav_pgrp != curproc->p_pgrp) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + if ((cmd == LX_AUTOFS_IOC_READY) || (cmd == LX_AUTOFS_IOC_FAIL)) { + lx_autofs_lookup_req_t *lalr; + int id = arg; + + /* + * We don't actually care if the request failed or succeeded. + * We do the same thing either way. + */ + if ((lalr = i_lalr_find(data, id)) == NULL) + return (ENXIO); + + /* Mark the request as compleate and release it. */ + i_lalr_complete(data, lalr); + i_lalr_release(data, lalr); + return (0); + } + if (cmd == LX_AUTOFS_IOC_CATATONIC) { + /* The automounter is shutting down. */ + i_fifo_close(data); + return (0); + } + return (ENOTSUP); +} + +static int +i_parse_mntopt(vfs_t *vfsp, lx_autofs_vfs_t *data) +{ + char *fd_str, *pgrp_str, *minproto_str, *maxproto_str; + int fd, pgrp, minproto, maxproto; + file_t *fp_wr, *fp_rd; + + /* Require all options to be present. */ + if ((vfs_optionisset(vfsp, LX_MNTOPT_FD, &fd_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_PGRP, &pgrp_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_MINPROTO, &minproto_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_MAXPROTO, &maxproto_str) != 1)) + return (EINVAL); + + /* Get the values for each parameter. */ + if ((i_str_to_int(fd_str, &fd) != 0) || + (i_str_to_int(pgrp_str, &pgrp) != 0) || + (i_str_to_int(minproto_str, &minproto) != 0) || + (i_str_to_int(maxproto_str, &maxproto) != 0)) + return (EINVAL); + + /* + * We support v2 of the linux kernel automounter protocol. + * Make sure the mount request we got indicates support + * for this version of the protocol. + */ + if ((minproto > 2) || (maxproto < 2)) + return (EINVAL); + + /* + * Now we need to lookup the fifos we'll be using + * to talk to the userland automounter process. + */ + if (i_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) + return (EINVAL); + + /* Save the mount options and fifo pointers. 
*/ + data->lav_fd = fd; + data->lav_pgrp = pgrp; + data->lav_fifo_rd = fp_rd; + data->lav_fifo_wr = fp_wr; + return (0); +} + +/* + * VFS entry points + */ +static int +lx_autofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + lx_autofs_vfs_t *data; + dev_t dev; + char name[40]; + int error; + + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) + return (EBUSY); + + /* We don't support mountes in the global zone. */ + if (getzoneid() == GLOBAL_ZONEID) + return (EPERM); + + /* We don't support mounting on top of ourselves. */ + if (vn_matchops(mvp, lx_autofs_vn_ops)) + return (EPERM); + + /* Allocate a vfs struct. */ + data = kmem_zalloc(sizeof (lx_autofs_vfs_t), KM_SLEEP); + + /* Parse mount options. */ + if ((error = i_parse_mntopt(vfsp, data)) != 0) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (error); + } + + /* Initialize the backing store. */ + i_bs_destroy(mvp, LX_AUTOFS_BS_DIR); + if ((data->lav_bs_vp = i_bs_create(mvp, LX_AUTOFS_BS_DIR)) == NULL) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (EBUSY); + } + data->lav_bs_name = LX_AUTOFS_BS_DIR; + + /* We have to hold the underlying vnode we're mounted on. */ + data->lav_mvp = mvp; + VN_HOLD(mvp); + + /* Initialize vfs fields */ + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lx_autofs_fstype; + vfsp->vfs_data = data; + + /* Invent a dev_t (sigh) */ + do { + dev = makedevice(lx_autofs_major, + atomic_add_32_nv(&lx_autofs_minor, 1) & L_MAXMIN32); + } while (vfs_devismounted(dev)); + vfsp->vfs_dev = dev; + vfs_make_fsid(&vfsp->vfs_fsid, dev, lx_autofs_fstype); + + /* Create an id space arena for automounter requests. */ + (void) snprintf(name, sizeof (name), "lx_autofs_id_%d", + getminor(vfsp->vfs_dev)); + data->lav_ids = id_space_create(name, 1, INT_MAX); + + /* Create hashes to keep track of automounter requests. 
*/ + mutex_init(&data->lav_lock, NULL, MUTEX_DEFAULT, NULL); + (void) snprintf(name, sizeof (name), "lx_autofs_path_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_path_hash = mod_hash_create_strhash(name, + LX_AUTOFS_VFS_PATH_HASH_SIZE, mod_hash_null_valdtor); + (void) snprintf(name, sizeof (name), "lx_autofs_id_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_id_hash = mod_hash_create_idhash(name, + LX_AUTOFS_VFS_ID_HASH_SIZE, mod_hash_null_valdtor); + + /* Create a hash to keep track of vnodes. */ + (void) snprintf(name, sizeof (name), "lx_autofs_vn_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_vn_hash = mod_hash_create_ptrhash(name, + LX_AUTOFS_VFS_VN_HASH_SIZE, mod_hash_null_valdtor, + sizeof (vnode_t)); + + /* Create root vnode */ + data->lav_root = i_vn_alloc(vfsp, data->lav_bs_vp); + data->lav_root->v_flag |= + VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT; + + return (0); +} + +static int +lx_autofs_unmount(vfs_t *vfsp, int flag, struct cred *cr) +{ + lx_autofs_vfs_t *data; + + if (secpolicy_fs_unmount(cr, vfsp) != 0) + return (EPERM); + + /* We do not currently support forced unmounts. */ + if (flag & MS_FORCE) + return (ENOTSUP); + + /* + * We should never have a reference count of less than 2: one for the + * caller, one for the root vnode. + */ + ASSERT(vfsp->vfs_count >= 2); + + /* If there are any outstanding vnodes, we can't unmount. */ + if (vfsp->vfs_count > 2) + return (EBUSY); + + /* Check for any remaining holds on the root vnode. */ + data = vfsp->vfs_data; + ASSERT(data->lav_root->v_vfsp == vfsp); + if (data->lav_root->v_count > 1) + return (EBUSY); + + /* Close the fifo to the automount process. */ + if (data->lav_fifo_wr != NULL) + (void) closef(data->lav_fifo_wr); + if (data->lav_fifo_rd != NULL) + (void) closef(data->lav_fifo_rd); + + /* + * We have to release our hold on our root vnode before we can + * delete the backing store. (Since the root vnode is linked + * to the backing store.) 
+ */ + VN_RELE(data->lav_root); + + /* Cleanup the backing store. */ + i_bs_destroy(data->lav_mvp, data->lav_bs_name); + VN_RELE(data->lav_mvp); + + /* Cleanup out remaining data structures. */ + mod_hash_destroy_strhash(data->lav_path_hash); + mod_hash_destroy_idhash(data->lav_id_hash); + mod_hash_destroy_ptrhash(data->lav_vn_hash); + id_space_destroy(data->lav_ids); + kmem_free(data, sizeof (lx_autofs_vfs_t)); + + return (0); +} + +static int +lx_autofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + lx_autofs_vfs_t *data = vfsp->vfs_data; + + *vpp = data->lav_root; + VN_HOLD(*vpp); + + return (0); +} + +static int +lx_autofs_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + lx_autofs_vfs_t *data = vfsp->vfs_data; + vnode_t *urvp = data->lav_root->v_data; + dev32_t d32; + int error; + + if ((error = VFS_STATVFS(urvp->v_vfsp, sp)) != 0) + return (error); + + /* Update some of values before returning. */ + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + (void) strlcpy(sp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name, + sizeof (sp->f_basetype)); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + return (0); +} + +static const fs_operation_def_t lx_autofs_vfstops[] = { + { VFSNAME_MOUNT, lx_autofs_mount }, + { VFSNAME_UNMOUNT, lx_autofs_unmount }, + { VFSNAME_ROOT, lx_autofs_root }, + { VFSNAME_STATVFS, lx_autofs_statvfs }, + { NULL, NULL } +}; + +/* + * VOP entry points - simple passthrough + * + * For most VOP entry points we can simply pass the request on to + * the underlying filesystem we're mounted on. 
+ */ +static int +lx_autofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) +{ + vnode_t *uvp = vp->v_data; + return (VOP_CLOSE(uvp, flag, count, offset, cr)); +} + +static int +lx_autofs_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_READDIR(uvp, uiop, cr, eofp)); +} + +static int +lx_autofs_access(vnode_t *vp, int mode, int flags, cred_t *cr) +{ + vnode_t *uvp = vp->v_data; + return (VOP_ACCESS(uvp, mode, flags, cr)); +} + +static int +lx_autofs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_RWLOCK(uvp, write_lock, ctp)); +} + +static void +lx_autofs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + VOP_RWUNLOCK(uvp, write_lock, ctp); +} + +/*ARGSUSED*/ +static int +lx_autofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) +{ + vnode_t *udvp = dvp->v_data; + + /* + * cdir is the calling processes current directory. + * If cdir is lx_autofs vnode then get its real underlying + * vnode ptr. (It seems like the only thing cdir is + * ever used for is to make sure the user doesn't delete + * their current directory.) + */ + if (vn_matchops(cdir, lx_autofs_vn_ops)) { + vnode_t *ucdir = cdir->v_data; + return (VOP_RMDIR(udvp, nm, ucdir, cr)); + } + + return (VOP_RMDIR(udvp, nm, cdir, cr)); +} + +/* + * VOP entry points - special passthrough + * + * For some VOP entry points we will first pass the request on to + * the underlying filesystem we're mounted on. If there's an error + * then we immediatly return the error, but if the request succeedes + * we have to do some extra work before returning. + */ +static int +lx_autofs_open(vnode_t **vpp, int flag, cred_t *cr) +{ + vnode_t *ovp = *vpp; + vnode_t *uvp = ovp->v_data; + int error; + + if ((error = VOP_OPEN(&uvp, flag, cr)) != 0) + return (error); + + /* Check for clone opens. 
*/ + if (uvp == ovp->v_data) + return (0); + + /* Deal with clone opens by returning a new vnode. */ + *vpp = i_vn_alloc(ovp->v_vfsp, uvp); + VN_RELE(ovp); + return (0); +} + +static int +lx_autofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +{ + vnode_t *uvp = vp->v_data; + int error; + + if ((error = VOP_GETATTR(uvp, vap, flags, cr)) != 0) + return (error); + + /* Update the attributes with our filesystem id. */ + vap->va_fsid = vp->v_vfsp->vfs_dev; + return (0); +} + +static int +lx_autofs_mkdir(vnode_t *dvp, char *nm, struct vattr *vap, vnode_t **vpp, + cred_t *cr) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *uvp = NULL; + int error; + + if ((error = VOP_MKDIR(udvp, nm, vap, &uvp, cr)) != 0) + return (error); + + /* Update the attributes with our filesystem id. */ + vap->va_fsid = dvp->v_vfsp->vfs_dev; + + /* Allocate a new vnode. */ + *vpp = i_vn_alloc(dvp->v_vfsp, uvp); + return (0); +} + +/* + * VOP entry points - custom + */ +/*ARGSUSED*/ +static void +lx_autofs_inactive(struct vnode *vp, struct cred *cr) +{ + lx_autofs_vfs_t *data = vp->v_vfsp->vfs_data; + + /* + * We need to hold the vfs lock because if we're going to free + * this vnode we have to prevent anyone from looking it up + * in the vnode hash. + */ + mutex_enter(&data->lav_lock); + mutex_enter(&vp->v_lock); + + if (vp->v_count < 1) { + panic("lx_autofs_inactive: bad v_count"); + /*NOTREACHED*/ + } + + /* Drop the temporary hold by vn_rele now. */ + if (--vp->v_count > 0) { + mutex_exit(&vp->v_lock); + mutex_exit(&data->lav_lock); + return; + } + + /* + * No one should have been blocked on this lock because we're + * about to free this vnode. + */ + i_vn_free(vp); +} + +static int +lx_autofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *uvp = NULL; + int error; + + /* First try to lookup if this path component already exitst. 
*/ + if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr)) == 0) { + *vpp = i_vn_alloc(dvp->v_vfsp, uvp); + return (0); + } + + /* Only query the automounter if the path does not exist. */ + if (error != ENOENT) + return (error); + + /* Refer the lookup to the automounter. */ + if ((error = i_automounter_call(dvp, nm)) != 0) + return (error); + + /* Retry the lookup operation. */ + if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr)) == 0) { + *vpp = i_vn_alloc(dvp->v_vfsp, uvp); + return (0); + } + return (error); +} + +/*ARGSUSED*/ +static int +lx_autofs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr, + int *rvalp) +{ + vnode_t *uvp = vp->v_data; + + /* Intercept certain ioctls. */ + switch ((uint_t)cmd) { + case LX_AUTOFS_IOC_READY: + case LX_AUTOFS_IOC_FAIL: + case LX_AUTOFS_IOC_CATATONIC: + case LX_AUTOFS_IOC_EXPIRE: + case LX_AUTOFS_IOC_PROTOVER: + case LX_AUTOFS_IOC_SETTIMEOUT: + return (i_automounter_ioctl(vp, cmd, arg)); + } + + /* Pass any remaining ioctl on. */ + return (VOP_IOCTL(uvp, cmd, arg, mode, cr, rvalp)); +} + +/* + * VOP entry points definitions + */ +static const fs_operation_def_t lx_autofs_tops_root[] = { + { VOPNAME_OPEN, lx_autofs_open }, + { VOPNAME_CLOSE, lx_autofs_close }, + { VOPNAME_IOCTL, lx_autofs_ioctl }, + { VOPNAME_RWLOCK, lx_autofs_rwlock }, + { VOPNAME_RWUNLOCK, (fs_generic_func_p)lx_autofs_rwunlock }, + { VOPNAME_GETATTR, lx_autofs_getattr }, + { VOPNAME_ACCESS, lx_autofs_access }, + { VOPNAME_READDIR, lx_autofs_readdir }, + { VOPNAME_LOOKUP, lx_autofs_lookup }, + { VOPNAME_INACTIVE, (fs_generic_func_p)lx_autofs_inactive }, + { VOPNAME_MKDIR, lx_autofs_mkdir }, + { VOPNAME_RMDIR, lx_autofs_rmdir }, + { NULL } +}; + +/* + * lx_autofs_init() gets invoked via the mod_install() call in + * this modules _init() routine. Therefor, the code that cleans + * up the structures we allocate below is actually found in + * our _fini() routine. 
+ */ +/* ARGSUSED */ +static int +lx_autofs_init(int fstype, char *name) +{ + int error; + + if ((lx_autofs_major = + (major_t)space_fetch(LX_AUTOFS_SPACE_KEY_UDEV)) == 0) { + + if ((lx_autofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lx_autofs_init: " + "can't get unique device number"); + return (EAGAIN); + } + + if (space_store(LX_AUTOFS_SPACE_KEY_UDEV, + (uintptr_t)lx_autofs_major) != 0) { + cmn_err(CE_WARN, "lx_autofs_init: " + "can't save unique device number"); + return (EAGAIN); + } + } + + lx_autofs_fstype = fstype; + if ((error = vfs_setfsops( + fstype, lx_autofs_vfstops, &lx_autofs_vfsops)) != 0) { + cmn_err(CE_WARN, "lx_autofs_init: bad vfs ops template"); + return (error); + } + + if ((error = vn_make_ops("lx_autofs vnode ops", + lx_autofs_tops_root, &lx_autofs_vn_ops)) != 0) { + VERIFY(vfs_freevfsops_by_type(fstype) == 0); + lx_autofs_vn_ops = NULL; + return (error); + } + + return (0); +} + + +/* + * Module linkage + */ +static mntopt_t lx_autofs_mntopt[] = { + { LX_MNTOPT_FD, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_PGRP, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MINPROTO, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MAXPROTO, NULL, 0, MO_HASVALUE } +}; + +static mntopts_t lx_autofs_mntopts = { + sizeof (lx_autofs_mntopt) / sizeof (mntopt_t), + lx_autofs_mntopt +}; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + LX_AUTOFS_NAME, + lx_autofs_init, + VSW_HASPROTO | VSW_VOLATILEDEV, + &lx_autofs_mntopts +}; + +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "linux autofs filesystem", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int error; + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + if (lx_autofs_vn_ops != NULL) { + vn_freevnodeops(lx_autofs_vn_ops); + lx_autofs_vn_ops = NULL; 
+ } + + /* + * In our init routine, if we get an error after calling + * vfs_setfsops() we cleanup by calling vfs_freevfsops_by_type(). + * But we don't need to call vfs_freevfsops_by_type() here + * because the fs framework did this for us as part of the + * mod_remove() call above. + */ + return (0); +} diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c new file mode 100644 index 0000000000..ae049e2792 --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c @@ -0,0 +1,395 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/modctl.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stat.h> +#include <sys/conf.h> +#include <sys/frame.h> +#include <sys/dtrace.h> +#include <sys/dtrace_impl.h> + +#include <sys/lx_impl.h> + +#define LX_SYSTRACE_SHIFT 16 +#define LX_SYSTRACE_ISENTRY(x) ((int)(x) >> LX_SYSTRACE_SHIFT) +#define LX_SYSTRACE_SYSNUM(x) ((int)(x) & ((1 << LX_SYSTRACE_SHIFT) - 1)) +#define LX_SYSTRACE_ENTRY(id) ((1 << LX_SYSTRACE_SHIFT) | (id)) +#define LX_SYSTRACE_RETURN(id) (id) + +#define LX_SYSTRACE_ENTRY_AFRAMES 2 +#define LX_SYSTRACE_RETURN_AFRAMES 4 + +typedef struct lx_systrace_sysent { + const char *lss_name; + dtrace_id_t lss_entry; + dtrace_id_t lss_return; +} lx_systrace_sysent_t; + +static dev_info_t *lx_systrace_devi; +static dtrace_provider_id_t lx_systrace_id; +static kmutex_t lx_systrace_lock; +static uint_t lx_systrace_nenabled; + +static int lx_systrace_nsysent; +static lx_systrace_sysent_t *lx_systrace_sysent; + +/*ARGSUSED*/ +static void +lx_systrace_entry(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + + if (sysnum >= lx_systrace_nsysent) + return; + + if ((id = lx_systrace_sysent[sysnum].lss_entry) == DTRACE_IDNONE) + return; + + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_return(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + + if (sysnum >= lx_systrace_nsysent) + return; + + if ((id = lx_systrace_sysent[sysnum].lss_return) == DTRACE_IDNONE) + return; + + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_provide(void *arg, const dtrace_probedesc_t *desc) +{ + int i; + + if (desc != NULL) + return; + + for (i = 0; i < lx_systrace_nsysent; i++) { + if (dtrace_probe_lookup(lx_systrace_id, NULL, + lx_systrace_sysent[i].lss_name, "entry") != 0) + 
continue; + + (void) dtrace_probe_create(lx_systrace_id, NULL, + lx_systrace_sysent[i].lss_name, "entry", + LX_SYSTRACE_ENTRY_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE_ENTRY(i))); + + (void) dtrace_probe_create(lx_systrace_id, NULL, + lx_systrace_sysent[i].lss_name, "return", + LX_SYSTRACE_RETURN_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE_RETURN(i))); + + lx_systrace_sysent[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent[i].lss_return = DTRACE_IDNONE; + } +} + +/*ARGSUSED*/ +static void +lx_systrace_enable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + + ASSERT(sysnum < lx_systrace_nsysent); + + mutex_enter(&lx_systrace_lock); + if (lx_systrace_nenabled++ == 0) + lx_brand_systrace_enable(); + mutex_exit(&lx_systrace_lock); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent[sysnum].lss_entry = id; + } else { + lx_systrace_sysent[sysnum].lss_return = id; + } +} + +/*ARGSUSED*/ +static void +lx_systrace_disable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + + ASSERT(sysnum < lx_systrace_nsysent); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent[sysnum].lss_entry = DTRACE_IDNONE; + } else { + lx_systrace_sysent[sysnum].lss_return = DTRACE_IDNONE; + } + + mutex_enter(&lx_systrace_lock); + if (--lx_systrace_nenabled == 0) + lx_brand_systrace_disable(); + mutex_exit(&lx_systrace_lock); +} + +/*ARGSUSED*/ +static void +lx_systrace_destroy(void *arg, dtrace_id_t id, void *parg) +{ +} + +/*ARGSUSED*/ +static uint64_t +lx_systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, + int aframes) +{ + struct frame *fp = (struct frame *)dtrace_getfp(); + uintptr_t *stack; + uint64_t val = 0; + int i; + + if (argno >= 6) + return (0); + + /* + * Walk the four frames down the stack to the entry or return callback. 
+ * Our callback calls dtrace_probe() which calls dtrace_dif_variable() + * which invokes this function to get the extended arguments. We get + * the frame pointer in via call to dtrace_getfp() above which makes for + * four frames. + */ + for (i = 0; i < 4; i++) { + fp = (struct frame *)fp->fr_savfp; + } + + stack = (uintptr_t *)&fp[1]; + + /* + * Skip the first argument to the callback -- the system call number. + */ + argno++; + +#ifdef __amd64 + /* + * On amd64, the first 6 arguments are passed in registers while + * subsequent arguments are on the stack. + */ + argno -= 6; +#endif + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + val = stack[argno]; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + return (val); +} + + +static const dtrace_pattr_t lx_systrace_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +}; + +static dtrace_pops_t lx_systrace_pops = { + lx_systrace_provide, + NULL, + lx_systrace_enable, + lx_systrace_disable, + NULL, + NULL, + NULL, + lx_systrace_getarg, + NULL, + lx_systrace_destroy +}; + +static int +lx_systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + int i; + + switch (cmd) { + case DDI_ATTACH: + break; + case DDI_RESUME: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "lx_systrace", S_IFCHR, + 0, DDI_PSEUDO, NULL) == DDI_FAILURE || + dtrace_register("lx-syscall", &lx_systrace_attr, + DTRACE_PRIV_KERNEL, 0, &lx_systrace_pops, NULL, + &lx_systrace_id) != 0) { + ddi_remove_minor_node(devi, NULL); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + lx_systrace_devi = devi; + + /* + * Count up the lx_brand system calls. 
+ */ + for (i = 0; lx_sysent[i].sy_callc != NULL; i++) + continue; + + /* + * Initialize our corresponding table. + */ + lx_systrace_sysent = kmem_zalloc(i * sizeof (lx_systrace_sysent_t), + KM_SLEEP); + lx_systrace_nsysent = i; + + for (i = 0; i < lx_systrace_nsysent; i++) { + lx_systrace_sysent[i].lss_name = lx_sysent[i].sy_name; + lx_systrace_sysent[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent[i].lss_return = DTRACE_IDNONE; + } + + /* + * Install probe triggers. + */ + lx_systrace_entry_ptr = lx_systrace_entry; + lx_systrace_return_ptr = lx_systrace_return; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + case DDI_SUSPEND: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (dtrace_unregister(lx_systrace_id) != 0) + return (DDI_FAILURE); + + /* + * Free table. + */ + kmem_free(lx_systrace_sysent, lx_systrace_nsysent * + sizeof (lx_systrace_sysent_t)); + lx_systrace_sysent = NULL; + lx_systrace_nsysent = 0; + + /* + * Reset probe triggers. 
+ */ + lx_systrace_entry_ptr = NULL; + lx_systrace_return_ptr = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_systrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + return (0); +} + +static struct cb_ops lx_systrace_cb_ops = { + lx_systrace_open, /* open */ + nodev, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + nodev, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops lx_systrace_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + ddi_getinfo_1to1, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + lx_systrace_attach, /* attach */ + lx_systrace_detach, /* detach */ + nodev, /* reset */ + &lx_systrace_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev /* dev power */ +}; + +/* + * Module linkage information for the kernel. + */ +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "Linux Brand System Call Tracing", /* name of module */ + &lx_systrace_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf new file mode 100644 index 0000000000..e4499c8a5b --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_systrace" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/io/ldlinux.c b/usr/src/uts/common/brand/lx/io/ldlinux.c new file mode 100644 index 0000000000..76c5e1d255 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/ldlinux.c @@ -0,0 +1,297 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/ddi.h> +#include <sys/cmn_err.h> +#include <sys/modctl.h> +#include <sys/ptms.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> + +#include <sys/ldlinux.h> + + +/* + * ldlinuxopen - open routine gets called when the module gets pushed onto the + * stream. + */ +/* ARGSUSED */ +static int +ldlinuxopen( + queue_t *q, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + struct ldlinux *tp; /* ldlinux entry for this module */ + mblk_t *mop; + struct stroptions *sop; + struct termios *termiosp; + int len; + + if (sflag != MODOPEN) + return (EINVAL); + + if (q->q_ptr != NULL) { + /* It's already attached. */ + return (0); + } + + mop = allocb(sizeof (struct stroptions), BPRI_MED); + if (mop == NULL) + return (ENOSR); + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)mop->b_rptr; + sop->so_flags = SO_ISTTY; + + /* + * Allocate state structure. + */ + tp = kmem_alloc(sizeof (*tp), KM_SLEEP); + + /* Stash a pointer to our private data in q_ptr. */ + q->q_ptr = WR(q)->q_ptr = tp; + + /* + * Get termios defaults. These are stored as + * a property in the "options" node. + */ + if (ddi_getlongprop(DDI_DEV_T_ANY, ddi_root_node(), 0, "ttymodes", + (caddr_t)&termiosp, &len) == DDI_PROP_SUCCESS && + len == sizeof (struct termios)) { + if (termiosp->c_lflag & ICANON) { + tp->veof = termiosp->c_cc[VEOF]; + tp->veol = termiosp->c_cc[VEOL]; + tp->vmin = 1; + tp->vtime = 0; + } else { + tp->veof = 0; + tp->veol = 0; + tp->vmin = termiosp->c_cc[VMIN]; + tp->vtime = termiosp->c_cc[VTIME]; + } + kmem_free(termiosp, len); + } else { + /* + * winge winge winge... 
+ */ + cmn_err(CE_WARN, + "ldlinuxopen: Couldn't get ttymodes property!"); + bzero(tp, sizeof (*tp)); + } + + tp->state = 0; + + /* + * Commit to the open and send the M_SETOPTS off to the stream head. + */ + qprocson(q); + putnext(q, mop); + + return (0); +} + + +/* + * ldlinuxclose - This routine gets called when the module gets + * popped off of the stream. + */ +/* ARGSUSED */ +static int +ldlinuxclose(queue_t *q, int flag, cred_t *credp) +{ + struct ldlinux *tp; + + qprocsoff(q); + tp = q->q_ptr; + kmem_free(tp, sizeof (*tp)); + q->q_ptr = WR(q)->q_ptr = NULL; + return (0); +} + + +static void +do_ioctl(queue_t *q, mblk_t *mp) +{ + struct ldlinux *tp = q->q_ptr; + struct iocblk *iocp = (struct iocblk *)mp->b_rptr; + struct lx_cc *cb; + mblk_t *tmp; + int error; + + switch (iocp->ioc_cmd) { + case TIOCSETLD: + /* prepare caller supplied data for access */ + error = miocpullup(mp, sizeof (struct lx_cc)); + if (error != 0) { + miocnak(q, mp, 0, error); + return; + } + + /* get a pointer to the caller supplied data */ + cb = (struct lx_cc *)mp->b_cont->b_rptr; + + /* save caller supplied data in our per-stream cache */ + tp->veof = cb->veof; + tp->veol = cb->veol; + tp->vmin = cb->vmin; + tp->vtime = cb->vtime; + + /* initialize and send a reply indicating that we're done */ + miocack(q, mp, 0, 0); + return; + + case TIOCGETLD: + /* allocate a reply message */ + if ((tmp = allocb(sizeof (struct lx_cc), BPRI_MED)) == NULL) { + miocnak(q, mp, 0, ENOSR); + return; + } + + /* initialize the reply message */ + mioc2ack(mp, tmp, sizeof (struct lx_cc), 0); + + /* get a pointer to the reply data */ + cb = (struct lx_cc *)mp->b_cont->b_rptr; + + /* copy data from our per-stream cache into the reply data */ + cb->veof = tp->veof; + cb->veol = tp->veol; + cb->vmin = tp->vmin; + cb->vtime = tp->vtime; + + /* send the reply indicating that we're done */ + qreply(q, mp); + return; + + case PTSSTTY: + tp->state |= ISPTSTTY; + break; + + default: + break; + } + + putnext(q, mp); 
+} + + +/* + * ldlinuxput - Module read and write queue put procedure. + */ +static void +ldlinuxput(queue_t *q, mblk_t *mp) +{ + struct ldlinux *tp = q->q_ptr; + + switch (DB_TYPE(mp)) { + default: + break; + case M_IOCTL: + if ((q->q_flag & QREADR) == 0) { + do_ioctl(q, mp); + return; + } + break; + + case M_FLUSH: + /* + * Handle read and write flushes. + */ + if ((((q->q_flag & QREADR) != 0) && (*mp->b_rptr & FLUSHR)) || + (((q->q_flag & QREADR) == 0) && (*mp->b_rptr & FLUSHW))) { + if ((tp->state & ISPTSTTY) && (*mp->b_rptr & FLUSHBAND)) + flushband(q, *(mp->b_rptr + 1), FLUSHDATA); + else + flushq(q, FLUSHDATA); + } + break; + } + putnext(q, mp); +} + + +static struct module_info ldlinux_info = { + LDLINUX_MODID, + LDLINUX_MOD, + 0, + INFPSZ, + 0, + 0 +}; + +static struct qinit ldlinuxinit = { + (int (*)()) ldlinuxput, + NULL, + ldlinuxopen, + ldlinuxclose, + NULL, + &ldlinux_info +}; + +static struct streamtab ldlinuxinfo = { + &ldlinuxinit, + &ldlinuxinit +}; + +/* + * Module linkage information for the kernel. + */ +static struct fmodsw fsw = { + LDLINUX_MOD, + &ldlinuxinfo, + D_MTQPAIR | D_MP +}; + +static struct modlstrmod modlstrmod = { + &mod_strmodops, "termios extensions for lx brand", &fsw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlstrmod, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_audio.c b/usr/src/uts/common/brand/lx/io/lx_audio.c new file mode 100644 index 0000000000..07c3bd0949 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_audio.c @@ -0,0 +1,2026 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/audio.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/disp.h> +#include <sys/ddi.h> +#include <sys/file.h> +#include <sys/id_space.h> +#include <sys/kmem.h> +#include <sys/lx_audio.h> +#include <sys/mixer.h> +#include <sys/modhash.h> +#include <sys/stat.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/sysmacros.h> +#include <sys/stropts.h> +#include <sys/types.h> +#include <sys/zone.h> + +/* Properties used by the lx_audio driver */ +#define LXA_PROP_INPUTDEV "inputdev" +#define LXA_PROP_OUTPUTDEV "outputdev" + +/* default device paths used by this driver */ +#define LXA_DEV_DEFAULT "/dev/audio" +#define LXA_DEV_CUSTOM_DIR "/dev/sound/" + +/* maximum possible number of concurrent opens of this driver */ +#define LX_AUDIO_MAX_OPENS 1024 + +/* + * these are default fragment size and fragment count values. + * these values were chosen to make quake work well on my + * laptop: 2Ghz Pentium M + NVIDIA GeForce Go 6400. 
+ * + * for reference: + * - 1 sec of stereo output at 44Khz is about 171 Kb of data + * - 1 sec of mono output at 8Khz is about 8Kb of data + */ +#define LXA_OSS_FRAG_SIZE (1024) /* 1/8 sec at 8Khz mono */ +#define LXA_OSS_FRAG_CNT (1024 * 2) + +/* maximum ammount of fragment memory we'll allow a process to mmap */ +#define LXA_OSS_FRAG_MEM (1024 * 1024 * 2) /* 2Mb */ + +/* forward declarations */ +typedef struct lxa_state lxa_state_t; +typedef struct lxa_zstate lxa_zstate_t; + +/* + * Structure and enum declarations + */ +typedef enum { + LXA_TYPE_INVALID = 0, + LXA_TYPE_AUDIO = 1, /* audio device */ + LXA_TYPE_AUDIOCTL = 2 /* audio control/mixer device */ +} lxa_dev_type_t; + +struct lxa_zstate { + char *lxa_zs_zonename; + + /* + * we could store the input/output audio device setting here, + * but instead we're keeing them as device node properties + * so that a user can easily see the audio configuration for + * a zone via prtconf. + */ + + /* + * OSS doesn't support multiple opens of the audio device. + * (multiple opens of the mixer device are supported.) + * so here we'll keep a pointer to any open input/output + * streams. (OSS does support two opens if one is for input + * and the other is for output.) + */ + lxa_state_t *lxa_zs_istate; + lxa_state_t *lxa_zs_ostate; + + /* + * we need to cache channel gain and balance. channel gain and + * balance map to PCM volume in OSS, which are supposedly a property + * of the underlying hardware. but in solaris, channels are + * implemented in software and only exist when an audio device + * is actually open. (each open returns a unique channel.) OSS + * apps will expect consistent PCM volume set/get operations to + * work even if no audio device is open. hence, if no underlying + * device is open we need to cache the gain and balance setting. 
+ */ + lxa_mixer_levels_t lxa_zs_pcm_levels; +}; + +struct lxa_state { + lxa_zstate_t *lxas_zs; /* zone state pointer */ + + dev_t lxas_dev_old; /* dev_t used to open the device */ + dev_t lxas_dev_new; /* new dev_t assigned to an open */ + int lxas_flags; /* original flags passed to open */ + lxa_dev_type_t lxas_type; /* type of device that was opened */ + + int lxas_devs_same; /* input and output device the same? */ + + /* input device variables */ + ldi_handle_t lxas_idev_lh; /* ldi handle for access */ + int lxas_idev_flags; /* flags used for open */ + + /* output device variables */ + ldi_handle_t lxas_odev_lh; /* ldi handle for access */ + int lxas_odev_flags; /* flags used for open */ + + /* + * since we support multiplexing of devices we need to remember + * certain parameters about the devices + */ + uint_t lxas_hw_features; + uint_t lxas_sw_features; + + uint_t lxas_frag_size; + uint_t lxas_frag_cnt; + + /* + * members needed to support mmap device access. note that to + * simplifly things we only support one mmap access per open. 
+ */ + ddi_umem_cookie_t lxas_umem_cookie; + char *lxas_umem_ptr; + size_t lxas_umem_len; + kthread_t *lxas_mmap_thread; + int lxas_mmap_thread_running; + int lxas_mmap_thread_exit; + int lxas_mmap_thread_frag; +}; + +/* + * Global variables + */ +dev_info_t *lxa_dip = NULL; +kmutex_t lxa_lock; +id_space_t *lxa_minor_id = NULL; +mod_hash_t *lxa_state_hash = NULL; +mod_hash_t *lxa_zstate_hash = NULL; +size_t lxa_state_hash_size = 15; +size_t lxa_zstate_hash_size = 15; +size_t lxa_registered_zones = 0; + +/* + * function declarations + * + * lxa_mmap_output_disable() is defined later in the file but is + * called from lxa_state_close() below. + */ +static void lxa_mmap_output_disable(lxa_state_t *); + +/* + * functions + */ +static void +lxa_state_close(lxa_state_t *lxa_state) +{ + lxa_zstate_t *lxa_zs = lxa_state->lxas_zs; + minor_t minor = getminor(lxa_state->lxas_dev_new); + + /* stop any mmap output first; this waits for the output thread to exit */ + lxa_mmap_output_disable(lxa_state); + + /* + * if this was the active input/output device, unlink it from + * the global zone state so that other opens of the audio device + * can now succeed.
+	 */
+	mutex_enter(&lxa_lock);
+	if (lxa_zs->lxa_zs_istate == lxa_state)
+		lxa_zs->lxa_zs_istate = NULL;
+	if (lxa_zs->lxa_zs_ostate == lxa_state) {
+		lxa_zs->lxa_zs_ostate = NULL;
+	}
+	mutex_exit(&lxa_lock);
+
+	/* remove this state structure from the hash (if it's there) */
+	(void) mod_hash_remove(lxa_state_hash,
+	    (mod_hash_key_t)(uintptr_t)minor, (mod_hash_val_t *)&lxa_state);
+
+	/* close any audio device that we have open */
+	if (lxa_state->lxas_idev_lh != NULL)
+		(void) ldi_close(lxa_state->lxas_idev_lh,
+		    lxa_state->lxas_idev_flags, kcred);
+	if (lxa_state->lxas_odev_lh != NULL)
+		(void) ldi_close(lxa_state->lxas_odev_lh,
+		    lxa_state->lxas_odev_flags, kcred);
+
+	/* free up any memory allocated by mmaps */
+	if (lxa_state->lxas_umem_cookie != NULL)
+		ddi_umem_free(lxa_state->lxas_umem_cookie);
+
+	/* release the id associated with this state structure */
+	id_free(lxa_minor_id, minor);
+
+	kmem_free(lxa_state, sizeof (*lxa_state));
+}
+
+/* return the name of the zone that the current process belongs to */
+static char *
+getzonename(void)
+{
+	return (curproc->p_zone->zone_name);
+}
+
+/*
+ * free a string whose allocation size was strlen(str) + 1, which is
+ * how strdup() and lxa_devprop_name() below allocate theirs.
+ */
+static void
+strfree(char *str)
+{
+	kmem_free(str, strlen(str) + 1);
+}
+
+/* return a kmem_alloc()ed copy of str; release it with strfree() */
+static char *
+strdup(char *str)
+{
+	int n = strlen(str);
+	char *ptr = kmem_alloc(n + 1, KM_SLEEP);
+	bcopy(str, ptr, n + 1);
+	return (ptr);
+}
+
+/*
+ * build the per-zone property name "<zname>_<pname>".  the result is
+ * allocated here and must be released by the caller with strfree().
+ */
+static char *
+lxa_devprop_name(char *zname, char *pname)
+{
+	char *zpname;
+	int n;
+
+	ASSERT((pname != NULL) && (zname != NULL));
+
+	/* prepend the zone name to the property name */
+	n = snprintf(NULL, 0, "%s_%s", zname, pname) + 1;
+	zpname = kmem_alloc(n, KM_SLEEP);
+	(void) snprintf(zpname, n, "%s_%s", zname, pname);
+
+	return (zpname);
+}
+
+/*
+ * check that a device property value is one we know how to interpret:
+ * either one of the keywords "default" or "none", or a string that is
+ * made up only of decimal digits.
+ *
+ * returns 0 if the value is acceptable and -1 if it is not.
+ */
+static int
+lxa_devprop_verify(char *pval)
+{
+	int n;
+
+	ASSERT(pval != NULL);
+
+	/* these keywords are given a meaning by lxa_devprop_lookup() */
+	if ((strcmp(pval, "default") == 0) || (strcmp(pval, "none") == 0))
+		return (0);
+
+	/* make sure the value is an integer */
+	for (n = 0; pval[n] != '\0'; n++) {
+		/*
+		 * a character is a non-digit if it falls on either side
+		 * of the '0'..'9' range.  (testing both conditions with
+		 * && can never be true, which let every value through.)
+		 */
+		if ((pval[n] < '0') || (pval[n] > '9')) {
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * look up the audio device configured for a zone and return the path
+ * of the device to open, or NULL if there is none.  the returned path
+ * must be released by the caller with strfree().
+ */
+static char *
+lxa_devprop_lookup(char *zname, char *pname,
lxa_dev_type_t lxa_type)
+{
+	char *zprop_name, *pval;
+	char *dev_path;
+	int n, rv;
+
+	ASSERT((pname != NULL) && (zname != NULL));
+	ASSERT((lxa_type == LXA_TYPE_AUDIO) || (lxa_type == LXA_TYPE_AUDIOCTL));
+
+	zprop_name = lxa_devprop_name(zname, pname);
+
+	/* attempt to lookup the property */
+	rv = ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip,
+	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, zprop_name, &pval);
+	strfree(zprop_name);
+
+	if (rv != DDI_PROP_SUCCESS)
+		return (NULL);
+
+	/* from here on pval must be released on every path */
+	if (lxa_devprop_verify(pval) != 0) {
+		ddi_prop_free(pval);
+		return (NULL);
+	}
+
+	if (strcmp(pval, "none") == 0) {
+		/*
+		 * there is no audio device specified.  the property value
+		 * still has to be released before we return.
+		 */
+		ddi_prop_free(pval);
+		return (NULL);
+	} else if (strcmp(pval, "default") == 0) {
+		/* use the default audio device on the system */
+		dev_path = strdup(LXA_DEV_DEFAULT);
+	} else {
+		/* a custom audio device was specified, generate a path */
+		n = snprintf(NULL, 0, "%s%s", LXA_DEV_CUSTOM_DIR, pval) + 1;
+		dev_path = kmem_alloc(n, KM_SLEEP);
+		(void) snprintf(dev_path, n, "%s%s", LXA_DEV_CUSTOM_DIR, pval);
+	}
+	ddi_prop_free(pval);
+
+	/*
+	 * if this is an audio control device then we need to append
+	 * "ctl" to the path
+	 */
+	if (lxa_type == LXA_TYPE_AUDIOCTL) {
+		char *tmp;
+		n = snprintf(NULL, 0, "%s%s", dev_path, "ctl") + 1;
+		tmp = kmem_alloc(n, KM_SLEEP);
+		(void) snprintf(tmp, n, "%s%s", dev_path, "ctl");
+		strfree(dev_path);
+		dev_path = tmp;
+	}
+
+	return (dev_path);
+}
+
+/*
+ * query the currently open input/output devices and record the hardware
+ * and software feature sets we will report in lxa_state.  this also
+ * resets the fragment size and count to their defaults.
+ *
+ * returns 0 on success or an errno value on failure.
+ */
+static int
+lxa_dev_getfeatures(lxa_state_t *lxa_state)
+{
+	audio_info_t ai_idev, ai_odev;
+	int n, rv;
+
+	/* set a default fragment size */
+	lxa_state->lxas_frag_size = LXA_OSS_FRAG_SIZE;
+	lxa_state->lxas_frag_cnt = LXA_OSS_FRAG_CNT;
+
+	/* get info for the currently open audio devices */
+	if ((lxa_state->lxas_idev_lh != NULL) &&
+	    ((rv = ldi_ioctl(lxa_state->lxas_idev_lh,
+	    AUDIO_GETINFO, (intptr_t)&ai_idev, FKIOCTL, kcred, &n)) != 0))
+		return (rv);
+	if ((lxa_state->lxas_odev_lh != NULL) &&
+	    ((rv = ldi_ioctl(lxa_state->lxas_odev_lh,
+	    AUDIO_GETINFO,
(intptr_t)&ai_odev, FKIOCTL, kcred, &n)) != 0)) + return (rv); + + /* if only one of the two devices is open, just report that device's features */ + if (lxa_state->lxas_idev_lh == NULL) { + lxa_state->lxas_sw_features = ai_odev.sw_features; + lxa_state->lxas_hw_features = ai_odev.hw_features; + return (0); + } else if (lxa_state->lxas_odev_lh == NULL) { + lxa_state->lxas_sw_features = ai_idev.sw_features; + lxa_state->lxas_hw_features = ai_idev.hw_features; + return (0); + } + + /* + * well if we're open for reading and writing but the underlying + * device is the same then it's also pretty easy: both queries + * should agree, so anything else is treated as an error. + */ + if (lxa_state->lxas_devs_same) { + if ((ai_odev.sw_features != ai_idev.sw_features) || + (ai_odev.hw_features != ai_idev.hw_features)) { + zcmn_err(getzoneid(), CE_WARN, "lx_audio error: " + "audio device reported inconsistent features"); + return (EIO); + } + lxa_state->lxas_sw_features = ai_odev.sw_features; + lxa_state->lxas_hw_features = ai_odev.hw_features; + return (0); + } + + /* + * figure out which software features we're going to support. + * we will report a feature as supported if both the input + * and output device support it. + */ + lxa_state->lxas_sw_features = 0; + n = ai_idev.sw_features & ai_odev.sw_features; + if (n & AUDIO_SWFEATURE_MIXER) + lxa_state->lxas_sw_features |= AUDIO_SWFEATURE_MIXER; + + /* + * figure out which hardware features we're going to support. + * for a first pass we will report a feature as supported if + * both the input and output device support it. + */ + lxa_state->lxas_hw_features = 0; + n = ai_idev.hw_features & ai_odev.hw_features; + if (n & AUDIO_HWFEATURE_MSCODEC) + lxa_state->lxas_hw_features |= AUDIO_HWFEATURE_MSCODEC; + + /* + * if we made it here then we have different audio input and output + * devices. this will allow us to report support for additional + * hardware features that may not be supported by just the input or + * output device alone.
+	 */
+
+	/*
+	 * always report that we support both playback and recording.
+	 * these bits are added to the features computed so far rather
+	 * than replacing them.
+	 */
+	lxa_state->lxas_hw_features |=
+	    AUDIO_HWFEATURE_PLAY | AUDIO_HWFEATURE_RECORD;
+
+	/* always report full duplex support (again, in addition) */
+	lxa_state->lxas_hw_features |= AUDIO_HWFEATURE_DUPLEX;
+
+	/* never report that we have input to output loopback support */
+	ASSERT((lxa_state->lxas_hw_features & AUDIO_HWFEATURE_IN2OUT) == 0);
+	return (0);
+}
+
+/*
+ * open the underlying input and/or output device configured for the
+ * current zone and store the resulting ldi handles and open flags in
+ * lxa_state.
+ *
+ * returns 0 on success or an errno value on failure.
+ */
+static int
+lxa_dev_open(lxa_state_t *lxa_state)
+{
+	char *idev, *odev;
+	int flags, rv;
+	ldi_handle_t lh;
+	ldi_ident_t li = NULL;
+
+	ASSERT((lxa_state->lxas_type == LXA_TYPE_AUDIO) ||
+	    (lxa_state->lxas_type == LXA_TYPE_AUDIOCTL));
+
+	/*
+	 * check if we have configuration properties for this zone.
+	 * if we don't then audio isn't supported in this zone.
+	 */
+	idev = lxa_devprop_lookup(getzonename(), LXA_PROP_INPUTDEV,
+	    lxa_state->lxas_type);
+	odev = lxa_devprop_lookup(getzonename(), LXA_PROP_OUTPUTDEV,
+	    lxa_state->lxas_type);
+
+	/* make sure there is at least one device to read from or write to */
+	if ((idev == NULL) && (odev == NULL))
+		return (ENODEV);
+
+	/* see if the input and output devices are actually the same device */
+	if (((idev != NULL) && (odev != NULL)) &&
+	    (strcmp(idev, odev) == 0))
+		lxa_state->lxas_devs_same = 1;
+
+	/* we don't respect FEXCL */
+	flags = lxa_state->lxas_flags & ~FEXCL;
+	if (lxa_state->lxas_type == LXA_TYPE_AUDIO) {
+		/*
+		 * if we're opening audio devices then we need to muck
+		 * with the FREAD/FWRITE flags.
+		 *
+		 * certain audio device may only support input or output
+		 * (but not both.)  so if we're multiplexing input/output
+		 * to different devices we need to make sure we don't try
+		 * and open the output device for reading and the input
+		 * device for writing.
+		 *
+		 * if we're using the same device for input/output we still
+		 * need to do this because some audio devices won't let
+		 * themselves be opened multiple times for read access.
+		 */
+		lxa_state->lxas_idev_flags = flags & ~FWRITE;
+		lxa_state->lxas_odev_flags = flags & ~FREAD;
+
+		/* make sure we have devices to read from and write to */
+		if (((flags & FREAD) && (idev == NULL)) ||
+		    ((flags & FWRITE) && (odev == NULL))) {
+			rv = ENODEV;
+			goto out;
+		}
+	} else {
+		lxa_state->lxas_idev_flags = lxa_state->lxas_odev_flags = flags;
+	}
+
+	/* get an ident to open the devices */
+	if (ldi_ident_from_dev(lxa_state->lxas_dev_new, &li) != 0) {
+		rv = ENODEV;
+		goto out;
+	}
+
+	/*
+	 * start from success so that rv is well defined when we reach
+	 * "out" even if neither of the opens below ends up being attempted.
+	 */
+	rv = 0;
+
+	/* open the input device */
+	lxa_state->lxas_idev_lh = NULL;
+	if (((lxa_state->lxas_type == LXA_TYPE_AUDIOCTL) ||
+	    (lxa_state->lxas_idev_flags & FREAD)) &&
+	    (idev != NULL)) {
+		rv = ldi_open_by_name(idev, lxa_state->lxas_idev_flags,
+		    kcred, &lh, li);
+		if (rv != 0) {
+			zcmn_err(getzoneid(), CE_WARN, "lxa_open_dev: "
+			    "unable to open audio device: %s", idev);
+			zcmn_err(getzoneid(), CE_WARN, "lxa_open_dev: "
+			    "possible zone audio configuration error");
+			goto out;
+		}
+		lxa_state->lxas_idev_lh = lh;
+	}
+
+	/* open the output device */
+	lxa_state->lxas_odev_lh = NULL;
+	if (((lxa_state->lxas_type == LXA_TYPE_AUDIOCTL) ||
+	    (lxa_state->lxas_odev_flags & FWRITE)) &&
+	    (odev != NULL)) {
+		rv = ldi_open_by_name(odev, lxa_state->lxas_odev_flags,
+		    kcred, &lh, li);
+		if (rv != 0) {
+			/*
+			 * if we opened an input device, close it now and
+			 * forget the handle so that nobody can use or
+			 * close it a second time.
+			 */
+			if (lxa_state->lxas_idev_lh != NULL) {
+				(void) ldi_close(lxa_state->lxas_idev_lh,
+				    lxa_state->lxas_idev_flags, kcred);
+				lxa_state->lxas_idev_lh = NULL;
+			}
+
+			zcmn_err(getzoneid(), CE_WARN, "lxa_open_dev: "
+			    "unable to open audio device: %s", odev);
+			zcmn_err(getzoneid(), CE_WARN, "lxa_open_dev: "
+			    "possible zone audio configuration error");
+			goto out;
+		}
+		lxa_state->lxas_odev_lh = lh;
+	}
+
+	/* free up stuff */
+out:
+	if (li != NULL)
+		ldi_ident_release(li);
+	if (idev != NULL)
+		strfree(idev);
+	if (odev != NULL)
+		strfree(odev);
+
+	return (rv);
+}
+
+/*
+ * reset the mmap output thread bookkeeping in lxa_state (under
+ * lxa_lock) and terminate the calling thread.  does not return.
+ */
+void
+lxa_mmap_thread_exit(lxa_state_t *lxa_state)
+{
+	mutex_enter(&lxa_lock);
+	lxa_state->lxas_mmap_thread
= NULL; + lxa_state->lxas_mmap_thread_frag = 0; + lxa_state->lxas_mmap_thread_running = 0; + lxa_state->lxas_mmap_thread_exit = 0; + mutex_exit(&lxa_lock); + thread_exit(); + /*NOTREACHED*/ +} + +void +lxa_mmap_thread(lxa_state_t *lxa_state) +{ + struct uio uio, uio_null; + iovec_t iovec, iovec_null; + uint_t bytes_per_sec, usec_per_frag, ticks_per_frag; + int rv, junk, eof, retry; + audio_info_t ai; + + /* we better be setup for writing to the output device */ + ASSERT((lxa_state->lxas_flags & FWRITE) != 0); + ASSERT(lxa_state->lxas_odev_lh != NULL); + + /* setup a uio to output one fragment (length and base are filled in per fragment below) */ + uio.uio_iov = &iovec; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = 0; + uio.uio_llimit = MAXOFFSET_T; + + /* setup a uio to output an eof (a fragment with a length of 0) */ + uio_null.uio_iov = &iovec_null; + uio_null.uio_iov->iov_len = 0; + uio_null.uio_iov->iov_base = NULL; + uio_null.uio_iovcnt = 1; + uio_null.uio_offset = 0; + uio_null.uio_segflg = UIO_SYSSPACE; + uio_null.uio_fmode = 0; + uio_null.uio_extflg = 0; + uio_null.uio_llimit = MAXOFFSET_T; + uio_null.uio_resid = 0; + +lxa_mmap_thread_top: + ASSERT(!MUTEX_HELD(&lxa_lock)); + + /* first drain any pending audio output; if that fails give up on output entirely */ + if ((rv = ldi_ioctl(lxa_state->lxas_odev_lh, + AUDIO_DRAIN, NULL, FKIOCTL, kcred, &junk)) != 0) { + cmn_err(CE_WARN, "lxa_mmap_thread: " + "AUDIO_DRAIN failed, aborting audio output"); + lxa_mmap_thread_exit(lxa_state); + /*NOTREACHED*/ + } + + /* + * we depend on the ai.play.eof value to keep track of + * audio output progress so reset it here.
+	 */
+	AUDIO_INITINFO(&ai);
+	ai.play.eof = 0;
+	if ((rv = ldi_ioctl(lxa_state->lxas_odev_lh,
+	    AUDIO_SETINFO, (intptr_t)&ai, FKIOCTL, kcred, &junk)) != 0) {
+		cmn_err(CE_WARN, "lxa_mmap_thread: "
+		    "AUDIO_SETINFO failed, aborting audio output");
+		lxa_mmap_thread_exit(lxa_state);
+		/*NOTREACHED*/
+	}
+
+	/*
+	 * we're going to need to know the sampling rate and number
+	 * of output channels to estimate how long we can sleep between
+	 * requests.
+	 */
+	if ((rv = ldi_ioctl(lxa_state->lxas_odev_lh, AUDIO_GETINFO,
+	    (intptr_t)&ai, FKIOCTL, kcred, &junk)) != 0) {
+		cmn_err(CE_WARN, "lxa_mmap_thread: "
+		    "AUDIO_GETINFO failed, aborting audio output");
+		lxa_mmap_thread_exit(lxa_state);
+		/*NOTREACHED*/
+	}
+
+	/*
+	 * estimate how many ticks it takes to output a fragment of data.
+	 *
+	 * a data rate of zero would make the division below trap, so
+	 * treat it like any other unusable device state.  the
+	 * multiplication is done in 64 bits because MICROSEC times a
+	 * fragment size of more than about 4Kb doesn't fit in 32 bits.
+	 */
+	bytes_per_sec = (ai.play.sample_rate * ai.play.channels *
+	    ai.play.precision) / 8;
+	if (bytes_per_sec == 0) {
+		cmn_err(CE_WARN, "lxa_mmap_thread: "
+		    "invalid audio data rate, aborting audio output");
+		lxa_mmap_thread_exit(lxa_state);
+		/*NOTREACHED*/
+	}
+	usec_per_frag = (uint_t)(((uint64_t)MICROSEC *
+	    lxa_state->lxas_frag_size) / bytes_per_sec);
+	ticks_per_frag = drv_usectohz(usec_per_frag);
+
+	/* queue up three fragments of data into the output stream */
+	eof = 3;
+
+	/* sanity check the eof value */
+	ASSERT(ai.play.eof == 0);
+	ai.play.eof = 0;
+
+	/* we always start audio output at fragment 0 */
+	mutex_enter(&lxa_lock);
+	lxa_state->lxas_mmap_thread_frag = 0;
+
+	/*
+	 * we shouldn't have allowed the mapping if it isn't a multiple
+	 * of the fragment size
+	 */
+	ASSERT((lxa_state->lxas_umem_len % lxa_state->lxas_frag_size) == 0);
+
+	while (!lxa_state->lxas_mmap_thread_exit) {
+		size_t start, end;
+
+		/*
+		 * calculate the start and ending offsets of the next
+		 * fragment to output
+		 */
+		start = lxa_state->lxas_mmap_thread_frag *
+		    lxa_state->lxas_frag_size;
+		end = start + lxa_state->lxas_frag_size;
+
+		ASSERT(start < lxa_state->lxas_umem_len);
+		ASSERT(end <= lxa_state->lxas_umem_len);
+
+		/* setup the uio to output one fragment of audio */
+		uio.uio_resid = end - start;
+		uio.uio_iov->iov_len = end - start;
+		uio.uio_iov->iov_base = &lxa_state->lxas_umem_ptr[start];
+
+		
/* increment the current fragment index, wrapping back to 0 at the end of the mapping */ + lxa_state->lxas_mmap_thread_frag = + (lxa_state->lxas_mmap_thread_frag + 1) % + (lxa_state->lxas_umem_len / lxa_state->lxas_frag_size); + + /* drop the audio lock before actually outputting data */ + mutex_exit(&lxa_lock); + + /* + * write the fragment of audio data to the device stream + * then write an eof to the stream to tell the device to + * increment ai.play.eof when it's done processing the + * fragment we just wrote. if either write fails we start + * over from the top of the thread. + */ + if ((rv = ldi_write(lxa_state->lxas_odev_lh, + &uio, kcred)) != 0) { + cmn_err(CE_WARN, "lxa_mmap_thread: " + "ldi_write() failed (%d), " + "resetting audio output", rv); + goto lxa_mmap_thread_top; + } + if ((rv = ldi_write(lxa_state->lxas_odev_lh, + &uio_null, kcred)) != 0) { + cmn_err(CE_WARN, "lxa_mmap_thread: " + "ldi_write(eof) failed (%d), " + "resetting audio output", rv); + goto lxa_mmap_thread_top; + } + + /* + * we want to avoid buffer underrun so ensure that + * there is always at least one fragment of data in the + * output stream. (eof holds the number of fragments we + * may still queue before we have to wait.) + */ + mutex_enter(&lxa_lock); + if (--eof > 0) { + continue; + } + + /* + * now we wait until the audio device has finished outputting + * at least one fragment of data.
+		 */
+		retry = 0;
+		while (!lxa_state->lxas_mmap_thread_exit && (eof == 0)) {
+			uint_t ai_eof_old = ai.play.eof;
+
+			mutex_exit(&lxa_lock);
+
+			/*
+			 * delay for the number of ticks it takes
+			 * to output one fragment of data
+			 */
+			if (ticks_per_frag > 0)
+				delay(ticks_per_frag);
+
+			/* check if we've managed to output any fragments */
+			if ((rv = ldi_ioctl(lxa_state->lxas_odev_lh,
+			    AUDIO_GETINFO, (intptr_t)&ai,
+			    FKIOCTL, kcred, &junk)) != 0) {
+				cmn_err(CE_WARN, "lxa_mmap_thread: "
+				    "AUDIO_GETINFO failed (%d), "
+				    "resetting audio output", rv);
+				/* re-start mmap audio output */
+				goto lxa_mmap_thread_top;
+			}
+
+			if (ai_eof_old == ai.play.eof) {
+				/* institute a random retry limit */
+				if (retry++ < 100) {
+					mutex_enter(&lxa_lock);
+					continue;
+				}
+				cmn_err(CE_WARN, "lxa_mmap_thread: "
+				    "output stalled, "
+				    "resetting audio output");
+				/* re-start mmap audio output */
+				goto lxa_mmap_thread_top;
+			}
+
+			/*
+			 * work out how many fragments finished since we last
+			 * looked.  both values are unsigned, so the
+			 * subtraction is done modulo the counter width and
+			 * gives the right answer even when the device's eof
+			 * counter has wrapped around.
+			 */
+			eof = (int)((uint_t)ai.play.eof - ai_eof_old);
+
+			/* we're done with this loop so re-acquire the lock */
+			ASSERT(eof != 0);
+			mutex_enter(&lxa_lock);
+		}
+	}
+	mutex_exit(&lxa_lock);
+	lxa_mmap_thread_exit(lxa_state);
+	/*NOTREACHED*/
+}
+
+/*
+ * ask the mmap output thread (if there is one) to exit and wait
+ * until it has done so.
+ */
+static void
+lxa_mmap_output_disable(lxa_state_t *lxa_state)
+{
+	kt_did_t tid;
+
+	mutex_enter(&lxa_lock);
+
+	/* if the output thread isn't running there's nothing to do */
+	if (lxa_state->lxas_mmap_thread_running == 0) {
+		mutex_exit(&lxa_lock);
+		return;
+	}
+
+	/* tell the pcm mmap output thread to exit */
+	lxa_state->lxas_mmap_thread_exit = 1;
+
+	/*
+	 * wait for the mmap output thread to exit.  the thread id is
+	 * fetched before the lock is dropped because the thread clears
+	 * lxas_mmap_thread on its way out.
+	 */
+	tid = lxa_state->lxas_mmap_thread->t_did;
+	mutex_exit(&lxa_lock);
+	thread_join(tid);
+}
+
+/* start the mmap output thread unless it is already running */
+static void
+lxa_mmap_output_enable(lxa_state_t *lxa_state)
+{
+	mutex_enter(&lxa_lock);
+
+	/* if the output thread is already running there's nothing to do */
+	if
(lxa_state->lxas_mmap_thread_running != 0) { + mutex_exit(&lxa_lock); + return; + } + + /* setup output state */ + lxa_state->lxas_mmap_thread_running = 1; + lxa_state->lxas_mmap_thread_exit = 0; + lxa_state->lxas_mmap_thread_frag = 0; + + /* kick off a thread to do the mmap pcm output */ + lxa_state->lxas_mmap_thread = thread_create(NULL, 0, + (void (*)())lxa_mmap_thread, lxa_state, + 0, &p0, TS_RUN, minclsyspri); + ASSERT(lxa_state->lxas_mmap_thread != NULL); + + mutex_exit(&lxa_lock); +} + +static int +lxa_ioc_mmap_output(lxa_state_t *lxa_state, intptr_t arg, int mode) +{ + uint_t trigger; + + /* we only support output via mmap */ + if ((lxa_state->lxas_flags & FWRITE) == 0) + return (EINVAL); + + /* if the user hasn't mmaped the device then there's nothing to do */ + if (lxa_state->lxas_umem_cookie == NULL) + return (EINVAL); + + /* copy in the request */ + if (ddi_copyin((void *)arg, &trigger, sizeof (trigger), mode) != 0) + return (EFAULT); + + /* a zero value disables output */ + if (trigger == 0) { + lxa_mmap_output_disable(lxa_state); + return (0); + } + + /* a non-zero value enables output */ + lxa_mmap_output_enable(lxa_state); + return (0); +} + +static int +lxa_ioc_mmap_ptr(lxa_state_t *lxa_state, intptr_t arg, int mode) +{ + int ptr; + + /* we only support output via mmap */ + if ((lxa_state->lxas_flags & FWRITE) == 0) + return (EINVAL); + + /* if the user hasn't mmaped the device then there's nothing to do */ + if (lxa_state->lxas_umem_cookie == NULL) + return (EINVAL); + + /* if the output thread isn't running then there's nothing to do (NOTE(review): this flag is read without lxa_lock held) */ + if (lxa_state->lxas_mmap_thread_running == 0) + return (EINVAL); + + mutex_enter(&lxa_lock); + ptr = lxa_state->lxas_mmap_thread_frag * lxa_state->lxas_frag_size; + mutex_exit(&lxa_lock); + + if (ddi_copyout(&ptr, (void *)arg, sizeof (ptr), mode) != 0) + return (EFAULT); + + return (0); +} + +static int +lxa_ioc_get_frag_info(lxa_state_t *lxa_state, intptr_t arg, int mode) +{ + lxa_frag_info_t fi; + + fi.lxa_fi_size
= lxa_state->lxas_frag_size; + fi.lxa_fi_cnt = lxa_state->lxas_frag_cnt; + + if (ddi_copyout(&fi, (void *)arg, sizeof (fi), mode) != 0) + return (EFAULT); + + return (0); +} + +static int +lxa_ioc_set_frag_info(lxa_state_t *lxa_state, intptr_t arg, int mode) +{ + lxa_frag_info_t fi; + + /* if the device is mmaped we can't change the fragment settings */ + if (lxa_state->lxas_umem_cookie != NULL) + return (EINVAL); + + /* copy in the request */ + if (ddi_copyin((void *)arg, &fi, sizeof (fi), mode) != 0) + return (EFAULT); + + /* do basic bounds checking: reject a zero count and size values less than 16 */ + if ((fi.lxa_fi_cnt == 0) || (fi.lxa_fi_size < 16)) + return (EINVAL); + + /* the request passed the checks above, so record the new settings */ + + lxa_state->lxas_frag_size = fi.lxa_fi_size; + lxa_state->lxas_frag_cnt = fi.lxa_fi_cnt; + + return (0); +} + +static int +lxa_audio_drain(lxa_state_t *lxa_state) +{ + int junk; + + /* only applies to output buffers */ + if (lxa_state->lxas_odev_lh == NULL) + return (EINVAL); + + /* the result of the drain is deliberately ignored; we always report success */ + (void) ldi_ioctl(lxa_state->lxas_odev_lh, AUDIO_DRAIN, NULL, + FKIOCTL, kcred, &junk); + return (0); +} + +/* + * lxa_audio_info_merge() usage notes: + * + * - it's important to make sure NOT to get the ai_idev and ai_odev + * parameters mixed up when calling lxa_audio_info_merge(). + * + * - it's important for the caller to make sure that AUDIO_GETINFO + * was called for the input device BEFORE the output device. (see + * the comments for merging the monitor_gain setting to see why.)
+ */ +static void +lxa_audio_info_merge(lxa_state_t *lxa_state, + audio_info_t *ai_idev, audio_info_t *ai_odev, audio_info_t *ai_merged) +{ + /* if we're not setup for output return the input device info */ + if (lxa_state->lxas_odev_lh == NULL) { + *ai_merged = *ai_idev; + return; + } + + /* if we're not setup for input return the output device info */ + if (lxa_state->lxas_idev_lh == NULL) { + *ai_merged = *ai_odev; + return; + } + + /* get record values from the input device */ + ai_merged->record = ai_idev->record; + + /* get play values from the output device */ + ai_merged->play = ai_odev->play; + + /* muting status only matters for the output device */ + ai_merged->output_muted = ai_odev->output_muted; + + /* we don't support device reference counts, always return 1 */ + ai_merged->ref_cnt = 1; + + /* + * for supported hw/sw features report the combined feature + * set we calculated earlier and saved in lxa_state. + */ + ai_merged->hw_features = lxa_state->lxas_hw_features; + ai_merged->sw_features = lxa_state->lxas_sw_features; + + if (!lxa_state->lxas_devs_same) { + /* + * if the input and output devices are different + * physical devices then we don't support input to + * output loopback so we always report the input + * to output loopback gain to be zero. + */ + ai_merged->monitor_gain = 0; + } else { + /* + * the input and output devices are actually the + * same physical device. hence it probably supports + * input to output loopback. regardless we should + * pass back the input to output gain reported by + * the device. when we pick a value to passback we + * use the output device value since that was + * the most recently queried. (we base this + * decision on the assumption that io gain is + * actually hardware setting in the device and + * hence if it is changed on one open instance of + * the device the change will be visible to all + * other instances of the device.)
+ */ + ai_merged->monitor_gain = ai_odev->monitor_gain; + } + + /* + * for currently enabled software features always return the + * merger of the two. (of course the enabled software features + * for the input and output devices should always be the same, + * so if they aren't, complain.) + */ + if (ai_idev->sw_features_enabled != ai_odev->sw_features_enabled) + zcmn_err(getzoneid(), CE_WARN, "lx_audio: " + "unexpected sofware feature state"); + ai_merged->sw_features_enabled = + ai_idev->sw_features_enabled & ai_odev->sw_features_enabled; +} + +static int +lxa_audio_setinfo(lxa_state_t *lxa_state, int cmd, intptr_t arg, + int mode) +{ + audio_info_t ai, ai_null, ai_idev, ai_odev; + int rv, junk; + + /* copy in the request */ + if (ddi_copyin((void *)arg, &ai, sizeof (ai), mode) != 0) + return (EFAULT); + + /* + * if the caller is attempting to enable a software feature that + * we didn't report as supported then return an error. (a value + * of -1 is skipped by this check.) + */ + if ((ai.sw_features_enabled != -1) && + (ai.sw_features_enabled & ~lxa_state->lxas_sw_features)) + return (EINVAL); + + /* + * if a process has mmaped this device then we don't allow + * changes to the play.eof field (since mmap output depends + * on this field.)
+ */ + if ((lxa_state->lxas_umem_cookie != NULL) && + (ai.play.eof != -1)) + return (EIO); + + /* initialize the new requests: start both from the caller's request */ + AUDIO_INITINFO(&ai_null); + ai_idev = ai_odev = ai; + + /* remove audio input settings from the output device request */ + ai_odev.record = ai_null.record; + + /* remove audio output settings from the input device request */ + ai_idev.play = ai_null.play; + ai_idev.output_muted = ai_null.output_muted; + + /* apply settings to the input device */ + if ((lxa_state->lxas_idev_lh != NULL) && + ((rv = ldi_ioctl(lxa_state->lxas_idev_lh, cmd, + (intptr_t)&ai_idev, FKIOCTL, kcred, &junk)) != 0)) + return (rv); + + /* apply settings to the output device */ + if ((lxa_state->lxas_odev_lh != NULL) && + ((rv = ldi_ioctl(lxa_state->lxas_odev_lh, cmd, + (intptr_t)&ai_odev, FKIOCTL, kcred, &junk)) != 0)) + return (rv); + + /* + * an AUDIO_SETINFO call performs an implicit AUDIO_GETINFO to + * return values (see the comments in audioio.h.) so we need + * to combine the values returned from the input and output + * device back into the user's buffer. + */ + lxa_audio_info_merge(lxa_state, &ai_idev, &ai_odev, &ai); + + /* copyout the results */ + if (ddi_copyout(&ai, (void *)arg, sizeof (ai), mode) != 0) { + return (EFAULT); + } + + return (0); +} + +static int +lxa_audio_getinfo(lxa_state_t *lxa_state, intptr_t arg, int mode) +{ + audio_info_t ai, ai_idev, ai_odev; + int rv, junk; + + /* get the settings from the input device (this must happen before the output device query) */ + if ((lxa_state->lxas_idev_lh != NULL) && + ((rv = ldi_ioctl(lxa_state->lxas_idev_lh, AUDIO_GETINFO, + (intptr_t)&ai_idev, FKIOCTL, kcred, &junk)) != 0)) + return (rv); + + /* get the settings from the output device */ + if ((lxa_state->lxas_odev_lh != NULL) && + ((rv = ldi_ioctl(lxa_state->lxas_odev_lh, AUDIO_GETINFO, + (intptr_t)&ai_odev, FKIOCTL, kcred, &junk)) != 0)) + return (rv); + + /* + * we need to combine the values returned from the input + * and output device back into a single user buffer.
+ */ + lxa_audio_info_merge(lxa_state, &ai_idev, &ai_odev, &ai); + + /* copyout the results */ + if (ddi_copyout(&ai, (void *)arg, sizeof (ai), mode) != 0) + return (EFAULT); + + return (0); +} + +static int +lxa_mixer_ai_from_lh(ldi_handle_t lh, audio_info_t *ai) +{ + am_control_t *actl; + int rv, ch_count, junk; + + ASSERT((lh != NULL) && (ai != NULL)); + + /* get the number of channels for the underlying device */ + if ((rv = ldi_ioctl(lh, AUDIO_GET_NUM_CHS, + (intptr_t)&ch_count, FKIOCTL, kcred, &junk)) != 0) + return (rv); + + /* allocate the am_control_t structure, sized for ch_count channels */ + actl = kmem_alloc(AUDIO_MIXER_CTL_STRUCT_SIZE(ch_count), KM_SLEEP); + + /* get the device state and channel state */ + if ((rv = ldi_ioctl(lh, AUDIO_MIXERCTL_GETINFO, + (intptr_t)actl, FKIOCTL, kcred, &junk)) != 0) { + kmem_free(actl, AUDIO_MIXER_CTL_STRUCT_SIZE(ch_count)); + return (rv); + } + + /* return only the audio_info structure; the rest is discarded */ + *ai = actl->dev_info; + kmem_free(actl, AUDIO_MIXER_CTL_STRUCT_SIZE(ch_count)); + return (0); +} + +static int +lxa_mixer_get_ai(lxa_state_t *lxa_state, audio_info_t *ai) +{ + audio_info_t ai_idev, ai_odev; + int rv; + + /* if there is no input device, query the output device */ + if (lxa_state->lxas_idev_lh == NULL) + return (lxa_mixer_ai_from_lh(lxa_state->lxas_odev_lh, ai)); + + /* if there is no output device, query the input device */ + if (lxa_state->lxas_odev_lh == NULL) + return (lxa_mixer_ai_from_lh(lxa_state->lxas_idev_lh, ai)); + + /* + * now get the audio_info and channel information for the + * underlying input and output devices (input first).
+	 */
+	if ((rv = lxa_mixer_ai_from_lh(lxa_state->lxas_idev_lh,
+	    &ai_idev)) != 0)
+		return (rv);
+	if ((rv = lxa_mixer_ai_from_lh(lxa_state->lxas_odev_lh,
+	    &ai_odev)) != 0)
+		return (rv);
+
+	/* now merge the audio_info structures */
+	lxa_audio_info_merge(lxa_state, &ai_idev, &ai_odev, ai);
+	return (0);
+}
+
+/*
+ * common code for reading the playback volume or the microphone level:
+ * fetch the current audio_info for the mixer device and copy the gain
+ * and balance selected by cmd out to the caller.
+ *
+ * returns 0 on success, EINVAL for a cmd we don't handle, EFAULT if
+ * the copyout fails, or the error from querying the device.
+ */
+static int
+lxa_mixer_get_common(lxa_state_t *lxa_state, int cmd, intptr_t arg, int mode)
+{
+	lxa_mixer_levels_t lxa_ml;
+	audio_info_t ai;
+	int rv;
+
+	ASSERT(lxa_state->lxas_type == LXA_TYPE_AUDIOCTL);
+
+	if ((rv = lxa_mixer_get_ai(lxa_state, &ai)) != 0)
+		return (rv);
+
+	switch (cmd) {
+	case LXA_IOC_MIXER_GET_VOL:
+		lxa_ml.lxa_ml_gain = ai.play.gain;
+		lxa_ml.lxa_ml_balance = ai.play.balance;
+		break;
+	case LXA_IOC_MIXER_GET_MIC:
+		lxa_ml.lxa_ml_gain = ai.record.gain;
+		lxa_ml.lxa_ml_balance = ai.record.balance;
+		break;
+	default:
+		/* never copy an uninitialized lxa_ml out to the caller */
+		return (EINVAL);
+	}
+
+	if (ddi_copyout(&lxa_ml, (void *)arg, sizeof (lxa_ml), mode) != 0)
+		return (EFAULT);
+	return (0);
+}
+
+/*
+ * common code for setting the playback volume or the microphone level:
+ * copy in and check the new levels, then push them to the device(s).
+ *
+ * returns 0 on success, EFAULT if the copyin fails, EINVAL for bad
+ * levels or a cmd we don't handle, or the error from the device.
+ */
+static int
+lxa_mixer_set_common(lxa_state_t *lxa_state, int cmd, intptr_t arg, int mode)
+{
+	lxa_mixer_levels_t lxa_ml;
+	audio_info_t ai;
+
+	ASSERT(lxa_state->lxas_type == LXA_TYPE_AUDIOCTL);
+
+	/* get the new mixer settings */
+	if (ddi_copyin((void *)arg, &lxa_ml, sizeof (lxa_ml), mode) != 0)
+		return (EFAULT);
+
+	/* sanity check the mixer settings */
+	if (!LXA_MIXER_LEVELS_OK(&lxa_ml))
+		return (EINVAL);
+
+	/* initialize an audio_info struct with the new settings */
+	AUDIO_INITINFO(&ai);
+	switch (cmd) {
+	case LXA_IOC_MIXER_SET_VOL:
+		ai.play.gain = lxa_ml.lxa_ml_gain;
+		ai.play.balance = lxa_ml.lxa_ml_balance;
+		break;
+	case LXA_IOC_MIXER_SET_MIC:
+		ai.record.gain = lxa_ml.lxa_ml_gain;
+		ai.record.balance = lxa_ml.lxa_ml_balance;
+		break;
+	default:
+		/* nothing was selected, so there is nothing to apply */
+		return (EINVAL);
+	}
+
+	/*
+	 * we're going to cheat here.  normally the
+	 * MIXERCTL_SETINFO ioctl takes an am_control_t and the
+	 * AUDIO_SETINFO takes an audio_info_t.  as it turns
+	 * out the first element in a am_control_t is an
+	 * audio_info_t.
also, the rest of the am_control_t
+	 * structure is normally ignored for a MIXERCTL_SETINFO
+	 * ioctl.  so here we'll try to fall back to the code
+	 * that handles AUDIO_SETINFO ioctls.
+	 */
+	return (lxa_audio_setinfo(lxa_state, AUDIO_MIXERCTL_SETINFO,
+	    (intptr_t)&ai, FKIOCTL));
+}
+
+/*
+ * copy the cached pcm gain and balance for this zone out to the caller.
+ * returns 0 on success or EFAULT if the copyout fails.
+ */
+static int
+lxa_mixer_get_pcm(lxa_state_t *lxa_state, intptr_t arg, int mode)
+{
+	lxa_mixer_levels_t lxa_ml;
+
+	ASSERT(lxa_state->lxas_type == LXA_TYPE_AUDIOCTL);
+
+	/*
+	 * take a snapshot of the cached pcm mixer settings under the
+	 * lock and copy it out after the lock has been dropped, so that
+	 * a fault on the caller's buffer can't block with lxa_lock held.
+	 */
+	mutex_enter(&lxa_lock);
+	lxa_ml = lxa_state->lxas_zs->lxa_zs_pcm_levels;
+	mutex_exit(&lxa_lock);
+
+	if (ddi_copyout(&lxa_ml, (void *)arg, sizeof (lxa_ml), mode) != 0)
+		return (EFAULT);
+	return (0);
+}
+
+/*
+ * set the pcm gain and balance for this zone.  the new levels are
+ * applied to the zone's active output stream (if there is one) and
+ * are always saved in the per-zone cache.
+ *
+ * returns 0 on success, EFAULT if the copyin fails, EINVAL for bad
+ * levels, or the error from updating the output stream.
+ */
+static int
+lxa_mixer_set_pcm(lxa_state_t *lxa_state, intptr_t arg, int mode)
+{
+	lxa_mixer_levels_t lxa_ml;
+	int rv;
+
+	ASSERT(lxa_state->lxas_type == LXA_TYPE_AUDIOCTL);
+
+	/* get the new mixer settings */
+	if (ddi_copyin((void *)arg, &lxa_ml, sizeof (lxa_ml), mode) != 0)
+		return (EFAULT);
+
+	/* sanity check the mixer settings */
+	if (!LXA_MIXER_LEVELS_OK(&lxa_ml))
+		return (EINVAL);
+
+	mutex_enter(&lxa_lock);
+
+	/* if there is an active output channel, update it */
+	if (lxa_state->lxas_zs->lxa_zs_ostate != NULL) {
+		audio_info_t ai;
+
+		/* initialize an audio_info struct with the new settings */
+		AUDIO_INITINFO(&ai);
+		ai.play.gain = lxa_ml.lxa_ml_gain;
+		ai.play.balance = lxa_ml.lxa_ml_balance;
+
+		if ((rv = lxa_audio_setinfo(lxa_state->lxas_zs->lxa_zs_ostate,
+		    AUDIO_SETINFO, (intptr_t)&ai, FKIOCTL)) != 0) {
+			mutex_exit(&lxa_lock);
+			return (rv);
+		}
+	}
+
+	/* update the cached mixer settings */
+	lxa_state->lxas_zs->lxa_zs_pcm_levels = lxa_ml;
+
+	mutex_exit(&lxa_lock);
+	return (0);
+}
+
+/*
+ * register a zone with the audio driver: check the request, create the
+ * zone's input/output device properties and insert a zone state
+ * structure into the zone state hash.
+ */
+static int
+lxa_zone_reg(intptr_t arg, int mode)
+{
+	lxa_zone_reg_t lxa_zr;
+	lxa_zstate_t *lxa_zs = NULL;
+	char *idev_name = NULL, *odev_name = NULL, *pval = NULL;
+	int i;
+	/*
+	 * scratch destination for a mod_hash_find() result we don't
+	 * care about.  it has to be a full mod_hash_val_t because the
+	 * lookup stores a pointer sized value through the address it
+	 * is given.
+	 */
+	mod_hash_val_t junk;
+
+	if (ddi_copyin((void *)arg, &lxa_zr, sizeof (lxa_zr),
mode) != 0) + return (EFAULT); + + /* make sure that zone_name is a valid string (NUL terminated within its buffer) */ + for (i = 0; i < sizeof (lxa_zr.lxa_zr_zone_name); i++) + if (lxa_zr.lxa_zr_zone_name[i] == '\0') + break; + if (i == sizeof (lxa_zr.lxa_zr_zone_name)) + return (EINVAL); + + /* make sure that inputdev is a valid string (NUL terminated within its buffer) */ + for (i = 0; i < sizeof (lxa_zr.lxa_zr_inputdev); i++) + if (lxa_zr.lxa_zr_inputdev[i] == '\0') + break; + if (i == sizeof (lxa_zr.lxa_zr_inputdev)) + return (EINVAL); + + /* make sure it's a valid inputdev property value */ + if (lxa_devprop_verify(lxa_zr.lxa_zr_inputdev) != 0) + return (EINVAL); + + /* make sure that outputdev is a valid string (NUL terminated within its buffer) */ + for (i = 0; i < sizeof (lxa_zr.lxa_zr_outputdev); i++) + if (lxa_zr.lxa_zr_outputdev[i] == '\0') + break; + if (i == sizeof (lxa_zr.lxa_zr_outputdev)) + return (EINVAL); + + /* make sure it's a valid outputdev property value */ + if (lxa_devprop_verify(lxa_zr.lxa_zr_outputdev) != 0) + return (EINVAL); + + /* get the property names (allocated here, freed before we return) */ + idev_name = lxa_devprop_name(lxa_zr.lxa_zr_zone_name, + LXA_PROP_INPUTDEV); + odev_name = lxa_devprop_name(lxa_zr.lxa_zr_zone_name, + LXA_PROP_OUTPUTDEV); + + /* + * allocate and initialize a zone state structure + * since the audio device can't possibly be opened yet + * (since we're setting it up now and the zone isn't booted + * yet) assign some reasonable default pcm channel settings. + * also, default to one mixer channel.
+ */ + lxa_zs = kmem_zalloc(sizeof (*lxa_zs), KM_SLEEP); + lxa_zs->lxa_zs_zonename = strdup(lxa_zr.lxa_zr_zone_name); + lxa_zs->lxa_zs_pcm_levels.lxa_ml_gain = AUDIO_MID_GAIN; + lxa_zs->lxa_zs_pcm_levels.lxa_ml_balance = AUDIO_MID_BALANCE; + + mutex_enter(&lxa_lock); + + /* + * make sure this zone isn't already registered + * a zone is registered with properties for that zone exist + * or there is a zone state structure for that zone + */ + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip, + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, + idev_name, &pval) == DDI_PROP_SUCCESS) { + goto err_unlock; + } + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip, + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, + odev_name, &pval) == DDI_PROP_SUCCESS) { + goto err_unlock; + } + if (mod_hash_find(lxa_zstate_hash, + (mod_hash_key_t)lxa_zs->lxa_zs_zonename, + (mod_hash_val_t *)&junk) == 0) + goto err_unlock; + + /* + * create the new properties and insert the zone state structure + * into the global hash + */ + if (ddi_prop_update_string(DDI_DEV_T_NONE, lxa_dip, + idev_name, lxa_zr.lxa_zr_inputdev) != DDI_PROP_SUCCESS) + goto err_prop_remove; + if (ddi_prop_update_string(DDI_DEV_T_NONE, lxa_dip, + odev_name, lxa_zr.lxa_zr_outputdev) != DDI_PROP_SUCCESS) + goto err_prop_remove; + if (mod_hash_insert(lxa_zstate_hash, + (mod_hash_key_t)lxa_zs->lxa_zs_zonename, + (mod_hash_val_t)lxa_zs) != 0) + goto err_prop_remove; + + /* success! 
*/ + lxa_registered_zones++; + mutex_exit(&lxa_lock); + + /* cleanup */ + strfree(idev_name); + strfree(odev_name); + return (0); + +err_prop_remove: + (void) ddi_prop_remove(DDI_DEV_T_NONE, lxa_dip, idev_name); + (void) ddi_prop_remove(DDI_DEV_T_NONE, lxa_dip, odev_name); + +err_unlock: + mutex_exit(&lxa_lock); + +err: + if (lxa_zs != NULL) { + strfree(lxa_zs->lxa_zs_zonename); + kmem_free(lxa_zs, sizeof (*lxa_zs)); + } + if (pval != NULL) + ddi_prop_free(pval); + if (idev_name != NULL) + strfree(idev_name); + if (odev_name != NULL) + strfree(odev_name); + return (EIO); +} + +static int +lxa_zone_unreg(intptr_t arg, int mode) +{ + lxa_zone_reg_t lxa_zr; + lxa_zstate_t *lxa_zs = NULL; + char *idev_name = NULL, *odev_name = NULL, *pval = NULL; + int rv, i; + + if (ddi_copyin((void *)arg, &lxa_zr, sizeof (lxa_zr), mode) != 0) + return (EFAULT); + + /* make sure that zone_name is a valid string */ + for (i = 0; i < sizeof (lxa_zr.lxa_zr_zone_name); i++) + if (lxa_zr.lxa_zr_zone_name[i] == '\0') + break; + if (i == sizeof (lxa_zr.lxa_zr_zone_name)) + return (EINVAL); + + /* get the property names */ + idev_name = lxa_devprop_name(lxa_zr.lxa_zr_zone_name, + LXA_PROP_INPUTDEV); + odev_name = lxa_devprop_name(lxa_zr.lxa_zr_zone_name, + LXA_PROP_OUTPUTDEV); + + mutex_enter(&lxa_lock); + + if (lxa_registered_zones <= 0) { + rv = ENOENT; + goto err_unlock; + } + + /* make sure this zone is actually registered */ + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip, + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, + idev_name, &pval) != DDI_PROP_SUCCESS) { + rv = ENOENT; + goto err_unlock; + } + ddi_prop_free(pval); + pval = NULL; + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip, + DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, + odev_name, &pval) != DDI_PROP_SUCCESS) { + rv = ENOENT; + goto err_unlock; + } + ddi_prop_free(pval); + pval = NULL; + if (mod_hash_find(lxa_zstate_hash, + (mod_hash_key_t)lxa_zr.lxa_zr_zone_name, + (mod_hash_val_t *)&lxa_zs) != 0) { + rv = ENOENT; + goto 
err_unlock; + } + ASSERT(strcmp(lxa_zr.lxa_zr_zone_name, lxa_zs->lxa_zs_zonename) == 0); + + /* + * if the audio device is currently in use then refuse to + * unregister the zone + */ + if ((lxa_zs->lxa_zs_ostate != NULL) || + (lxa_zs->lxa_zs_ostate != NULL)) { + rv = EBUSY; + goto err_unlock; + } + + /* success! cleanup zone config state */ + (void) ddi_prop_remove(DDI_DEV_T_NONE, lxa_dip, idev_name); + (void) ddi_prop_remove(DDI_DEV_T_NONE, lxa_dip, odev_name); + + /* + * note, the action of removing the zone state structure from the + * hash will automatically free lxa_zs->lxa_zs_zonename. + * + * the reason for this is that we used lxa_zs->lxa_zs_zonename + * as the hash key and by default mod_hash_create_strhash() uses + * mod_hash_strkey_dtor() as a the hash key destructor. (which + * free's the key for us. + */ + (void) mod_hash_remove(lxa_zstate_hash, + (mod_hash_key_t)lxa_zr.lxa_zr_zone_name, + (mod_hash_val_t *)&lxa_zs); + lxa_registered_zones--; + mutex_exit(&lxa_lock); + + /* cleanup */ + kmem_free(lxa_zs, sizeof (*lxa_zs)); + strfree(idev_name); + strfree(odev_name); + return (0); + +err_unlock: + mutex_exit(&lxa_lock); + +err: + if (pval != NULL) + ddi_prop_free(pval); + if (idev_name != NULL) + strfree(idev_name); + if (odev_name != NULL) + strfree(odev_name); + return (rv); +} + +static int +lxa_ioctl_devctl(int cmd, intptr_t arg, int mode) +{ + /* devctl ioctls are only allowed from the global zone */ + ASSERT(getzoneid() == 0); + if (getzoneid() != 0) + return (EINVAL); + + switch (cmd) { + case LXA_IOC_ZONE_REG: + return (lxa_zone_reg(arg, mode)); + case LXA_IOC_ZONE_UNREG: + return (lxa_zone_unreg(arg, mode)); + } + + return (EINVAL); +} + +static int +/*ARGSUSED*/ +lxa_open(dev_t *devp, int flags, int otyp, cred_t *credp) +{ + lxa_dev_type_t open_type = LXA_TYPE_INVALID; + lxa_zstate_t *lxa_zs; + lxa_state_t *lxa_state; + minor_t minor; + int rv; + + if (getminor(*devp) == LXA_MINORNUM_DEVCTL) { + /* + * this is a devctl node, it exists to 
administer this + * pseudo driver so it doesn't actually need access to + * any underlying audio devices. hence there is nothing + * really to do here. course, this driver should + * only be administered from the global zone. + */ + ASSERT(getzoneid() == 0); + if (getzoneid() != 0) + return (EINVAL); + return (0); + } + + /* lookup the zone state structure */ + if (mod_hash_find(lxa_zstate_hash, (mod_hash_key_t)getzonename(), + (mod_hash_val_t *)&lxa_zs) != 0) { + return (EIO); + } + + /* determine what type of device was opened */ + switch (getminor(*devp)) { + case LXA_MINORNUM_DSP: + open_type = LXA_TYPE_AUDIO; + break; + case LXA_MINORNUM_MIXER: + open_type = LXA_TYPE_AUDIOCTL; + break; + default: + return (EINVAL); + } + ASSERT(open_type != LXA_TYPE_INVALID); + + /* all other opens are clone opens so get a new minor node */ + minor = id_alloc(lxa_minor_id); + + /* allocate and initialize the new lxa_state structure */ + lxa_state = kmem_zalloc(sizeof (*lxa_state), KM_SLEEP); + lxa_state->lxas_zs = lxa_zs; + lxa_state->lxas_dev_old = *devp; + lxa_state->lxas_dev_new = makedevice(getmajor(*devp), minor); + lxa_state->lxas_flags = flags; + lxa_state->lxas_type = open_type; + + /* initialize the input and output device */ + if (((rv = lxa_dev_open(lxa_state)) != 0) || + ((rv = lxa_dev_getfeatures(lxa_state)) != 0)) { + lxa_state_close(lxa_state); + return (rv); + } + + /* + * save this audio statue structure into a hash indexed + * by it's minor device number. (this will provide a convient + * way to lookup the state structure on future operations.) 
+ */ + if (mod_hash_insert(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t)lxa_state) != 0) { + lxa_state_close(lxa_state); + return (EIO); + } + + mutex_enter(&lxa_lock); + + /* apply the currently cached zone PCM mixer levels */ + if ((lxa_state->lxas_type == LXA_TYPE_AUDIO) && + (lxa_state->lxas_odev_lh != NULL)) { + audio_info_t ai; + + AUDIO_INITINFO(&ai); + ai.play.gain = lxa_zs->lxa_zs_pcm_levels.lxa_ml_gain; + ai.play.balance = lxa_zs->lxa_zs_pcm_levels.lxa_ml_balance; + + if ((rv = lxa_audio_setinfo(lxa_state, + AUDIO_SETINFO, (intptr_t)&ai, FKIOCTL)) != 0) { + mutex_exit(&lxa_lock); + lxa_state_close(lxa_state); + return (rv); + } + } + + /* + * we only allow one active open of the input or output device. + * check here for duplicate opens + */ + if (lxa_state->lxas_type == LXA_TYPE_AUDIO) { + if ((lxa_state->lxas_idev_lh != NULL) && + (lxa_zs->lxa_zs_istate != NULL)) { + mutex_exit(&lxa_lock); + lxa_state_close(lxa_state); + return (EBUSY); + } + if ((lxa_state->lxas_odev_lh != NULL) && + (lxa_zs->lxa_zs_ostate != NULL)) { + mutex_exit(&lxa_lock); + lxa_state_close(lxa_state); + return (EBUSY); + } + + /* not a duplicate open, update the global zone state */ + if (lxa_state->lxas_idev_lh != NULL) + lxa_zs->lxa_zs_istate = lxa_state; + if (lxa_state->lxas_odev_lh != NULL) + lxa_zs->lxa_zs_ostate = lxa_state; + } + mutex_exit(&lxa_lock); + + /* make sure to return our newly allocated dev_t */ + *devp = lxa_state->lxas_dev_new; + return (0); +} + +static int +/*ARGSUSED*/ +lxa_close(dev_t dev, int flags, int otyp, cred_t *credp) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + + /* handle devctl minor nodes (these nodes don't have a handle */ + if (getminor(dev) == LXA_MINORNUM_DEVCTL) + return (0); + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) return + (EINVAL); + + lxa_state_close(lxa_state); + return (0); +} + +static 
int +/*ARGSUSED*/ +lxa_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) + return (EINVAL); + + /* + * if a process has mmaped this device then we don't allow + * any more reads or writes to the device + */ + if (lxa_state->lxas_umem_cookie != NULL) + return (EIO); + + /* we can't do a read if there is no input device */ + if (lxa_state->lxas_idev_lh == NULL) + return (EBADF); + + /* pass the request on */ + return (ldi_read(lxa_state->lxas_idev_lh, uiop, kcred)); +} + +static int +/*ARGSUSED*/ +lxa_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) + return (EINVAL); + + /* + * if a process has mmaped this device then we don't allow + * any more reads or writes to the device + */ + if (lxa_state->lxas_umem_cookie != NULL) + return (EIO); + + /* we can't do a write if there is no output device */ + if (lxa_state->lxas_odev_lh == NULL) + return (EBADF); + + /* pass the request on */ + return (ldi_write(lxa_state->lxas_odev_lh, uiop, kcred)); +} + +static int +/*ARGSUSED*/ +lxa_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + + /* handle devctl minor nodes (these nodes don't have a handle */ + if (getminor(dev) == LXA_MINORNUM_DEVCTL) + return (lxa_ioctl_devctl(cmd, arg, mode)); + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) + return (EINVAL); + + ASSERT((lxa_state->lxas_type == LXA_TYPE_AUDIO) || + (lxa_state->lxas_type == LXA_TYPE_AUDIOCTL)); + + switch (cmd) { 
+ case LXA_IOC_GETMINORNUM: + { + int minornum = getminor(lxa_state->lxas_dev_old); + if (ddi_copyout(&minornum, (void *)arg, + sizeof (minornum), mode) != 0) + return (EFAULT); + } + return (0); + } + + if (lxa_state->lxas_type == LXA_TYPE_AUDIO) { + /* deal with native ioctl */ + switch (cmd) { + case LXA_IOC_MMAP_OUTPUT: + return (lxa_ioc_mmap_output(lxa_state, arg, mode)); + case LXA_IOC_MMAP_PTR: + return (lxa_ioc_mmap_ptr(lxa_state, arg, mode)); + case LXA_IOC_GET_FRAG_INFO: + return (lxa_ioc_get_frag_info(lxa_state, arg, mode)); + case LXA_IOC_SET_FRAG_INFO: + return (lxa_ioc_set_frag_info(lxa_state, arg, mode)); + } + + /* deal with layered ioctls */ + switch (cmd) { + case AUDIO_DRAIN: + return (lxa_audio_drain(lxa_state)); + case AUDIO_SETINFO: + return (lxa_audio_setinfo(lxa_state, + AUDIO_SETINFO, arg, mode)); + case AUDIO_GETINFO: + return (lxa_audio_getinfo(lxa_state, arg, mode)); + } + } + + if (lxa_state->lxas_type == LXA_TYPE_AUDIOCTL) { + /* deal with native ioctl */ + switch (cmd) { + case LXA_IOC_MIXER_GET_VOL: + return (lxa_mixer_get_common(lxa_state, + cmd, arg, mode)); + case LXA_IOC_MIXER_SET_VOL: + return (lxa_mixer_set_common(lxa_state, + cmd, arg, mode)); + case LXA_IOC_MIXER_GET_MIC: + return (lxa_mixer_get_common(lxa_state, + cmd, arg, mode)); + case LXA_IOC_MIXER_SET_MIC: + return (lxa_mixer_set_common(lxa_state, + cmd, arg, mode)); + case LXA_IOC_MIXER_GET_PCM: + return (lxa_mixer_get_pcm(lxa_state, arg, mode)); + case LXA_IOC_MIXER_SET_PCM: + return (lxa_mixer_set_pcm(lxa_state, arg, mode)); + } + + } + + return (EINVAL); +} + +static int +/*ARGSUSED*/ +lxa_devmap(dev_t dev, devmap_cookie_t dhp, + offset_t off, size_t len, size_t *maplen, uint_t model) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + ddi_umem_cookie_t umem_cookie; + void *umem_ptr; + int rv; + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) + return 
(EINVAL); + + /* we only support mmaping of audio devices */ + if (lxa_state->lxas_type != LXA_TYPE_AUDIO) + return (EINVAL); + + /* we only support output via mmap */ + if ((lxa_state->lxas_flags & FWRITE) == 0) + return (EINVAL); + + /* sanity check the amount of memory the user is allocating */ + if ((len == 0) || + (len > LXA_OSS_FRAG_MEM) || + ((len % lxa_state->lxas_frag_size) != 0)) + return (EINVAL); + + /* allocate and clear memory to mmap */ + umem_ptr = ddi_umem_alloc(len, DDI_UMEM_NOSLEEP, &umem_cookie); + if (umem_ptr == NULL) + return (ENOMEM); + bzero(umem_ptr, len); + + /* setup the memory mappings */ + rv = devmap_umem_setup(dhp, lxa_dip, NULL, umem_cookie, 0, len, + PROT_USER | PROT_READ | PROT_WRITE, 0, NULL); + if (rv != 0) { + ddi_umem_free(umem_cookie); + return (EIO); + } + + mutex_enter(&lxa_lock); + + /* we only support one mmap per open */ + if (lxa_state->lxas_umem_cookie != NULL) { + ASSERT(lxa_state->lxas_umem_ptr != NULL); + mutex_exit(&lxa_lock); + ddi_umem_free(umem_cookie); + return (EBUSY); + } + ASSERT(lxa_state->lxas_umem_ptr == NULL); + + *maplen = len; + lxa_state->lxas_umem_len = len; + lxa_state->lxas_umem_ptr = umem_ptr; + lxa_state->lxas_umem_cookie = umem_cookie; + mutex_exit(&lxa_lock); + return (0); +} + +static int +/*ARGSUSED*/ +lxa_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance = ddi_get_instance(dip); + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + ASSERT(instance == 0); + if (instance != 0) + return (DDI_FAILURE); + + lxa_dip = dip; + mutex_init(&lxa_lock, NULL, MUTEX_DEFAULT, NULL); + + /* create our minor nodes */ + if (ddi_create_minor_node(dip, LXA_MINORNAME_DEVCTL, S_IFCHR, + LXA_MINORNUM_DEVCTL, DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, LXA_MINORNAME_DSP, S_IFCHR, + LXA_MINORNUM_DSP, DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, LXA_MINORNAME_MIXER, S_IFCHR, + LXA_MINORNUM_MIXER, DDI_PSEUDO, 0) 
!= DDI_SUCCESS) + return (DDI_FAILURE); + + /* allocate our data structures */ + lxa_minor_id = id_space_create("lxa_minor_id", + LXA_MINORNUM_COUNT, LX_AUDIO_MAX_OPENS); + lxa_state_hash = mod_hash_create_idhash("lxa_state_hash", + lxa_state_hash_size, mod_hash_null_valdtor); + lxa_zstate_hash = mod_hash_create_strhash("lxa_zstate_hash", + lxa_zstate_hash_size, mod_hash_null_valdtor); + + return (DDI_SUCCESS); +} + +static int +/*ARGSUSED*/ +lxa_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ASSERT(!MUTEX_HELD(&lxa_lock)); + if (lxa_registered_zones > 0) + return (DDI_FAILURE); + + mod_hash_destroy_idhash(lxa_state_hash); + mod_hash_destroy_idhash(lxa_zstate_hash); + id_space_destroy(lxa_minor_id); + lxa_state_hash = NULL; + lxa_dip = NULL; + + return (DDI_SUCCESS); +} + +static int +/*ARGSUSED*/ +lxa_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp) +{ + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *resultp = lxa_dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)0; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Driver flags + */ +static struct cb_ops lxa_cb_ops = { + lxa_open, /* open */ + lxa_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + lxa_read, /* read */ + lxa_write, /* write */ + lxa_ioctl, /* ioctl */ + lxa_devmap, /* devmap */ + nodev, /* mmap */ + ddi_devmap_segmap, /* segmap */ + nochpoll, /* chpoll */ + ddi_prop_op, /* prop_op */ + NULL, /* cb_str */ + D_NEW | D_MP | D_DEVMAP, + CB_REV, + NULL, + NULL +}; + +static struct dev_ops lxa_ops = { + DEVO_REV, + 0, + lxa_getinfo, + nulldev, + nulldev, + lxa_attach, + lxa_detach, + nodev, + &lxa_cb_ops, + NULL, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +static struct modldrv modldrv = { + &mod_driverops, /* type of module */ + "linux audio driver 'lx_audio' %I%", + &lxa_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +/* + * standard module entry points: _init() installs the module with + * mod_install(), _fini() removes it with mod_remove() and _info() + * reports on it with mod_info(). each returns the result of that call. + */ +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_audio.conf b/usr/src/uts/common/brand/lx/io/lx_audio.conf new file mode 100644 index 0000000000..2eeb5eb7ee --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_audio.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_audio" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.c b/usr/src/uts/common/brand/lx/io/lx_ptm.c new file mode 100644 index 0000000000..e4079df133 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_ptm.c @@ -0,0 +1,1137 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * This driver attempts to emulate some of the behaviors of + * Linux terminal devices (/dev/ptmx and /dev/pts/[0-9][0-9]*) on Solaris + * + * It does this by layering over the /dev/ptmx device and intercepting + * opens to it. + * + * This driver makes the following assumptions about the way the ptm/pts + * drivers on Solaris work: + * + * - all opens of the /dev/ptmx device node return a unique dev_t. + * + * - the dev_t minor node value for each open ptm instance corresponds + * to its associated slave terminal device number. ie. the path to + * the slave terminal device associated with an open ptm instance + * whose dev_t minor node value is 5, is /dev/pts/5. 
+ * + * - the ptm driver always allocates the lowest numbered slave terminal + * device possible. + */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/devops.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/kstr.h> +#include <sys/ldlinux.h> +#include <sys/lx_ptm.h> +#include <sys/modctl.h> +#include <sys/pathname.h> +#include <sys/ptms.h> +#include <sys/ptyvar.h> +#include <sys/stat.h> +#include <sys/stropts.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/sysmacros.h> +#include <sys/types.h> + +#define LP_PTM_PATH "/dev/ptmx" +#define LP_PTS_PATH "/dev/pts/" +#define LP_PTS_DRV_NAME "pts" +#define LP_PTS_USEC_DELAY (5 * 1000) /* 5 ms */ +#define LP_PTS_USEC_DELAY_MAX (5 * MILLISEC) /* 5 ms */ + +/* + * this driver is layered on top of the ptm driver. we'd like to + * make this driver's minor name space a mirror of the ptm driver's + * namespace, but we can't actually do this. the reason is that the + * ptm driver is opened via the clone driver. therefore no minor nodes + * of the ptm driver are actually accessible via the filesystem. + * since we're not a streams device we can't be opened by the clone + * driver. therefore we need to have at least one minor node accessible + * via the filesystem so that consumers can open it. we use the device + * node with a minor number of 0 for this purpose. what this means is + * that minor node 0 can't be used to map ptm minor node 0. since this + * minor node is now reserved we need to shift our ptm minor node + * mappings by one. ie. a ptm minor node with a value of 0 will + * correspond to our minor node with a value of 1. these mappings are + * managed with the following macros. 
+ */ +#define DEVT_TO_INDEX(x) LX_PTM_DEV_TO_PTS(x) +#define INDEX_TO_MINOR(x) ((x) + 1) + +/* + * grow our layered handle array by the same size increment that the ptm + * driver uses to grow the pty device space - PTY_MAXDELTA + */ +#define LP_PTY_INC 128 + +/* + * lx_ptm_ops contains state information about outstanding operations on the + * underlying master terminal device. Currently we only track information + * for read operations. + * + * Note that this data has not been rolled directly into the lx_ptm_handle + * structure because we can't put mutex's of condition variables into + * lx_ptm_handle structure. The reason is that the array of lx_ptm_handle + * structures linked to from the global lx_ptm state can be resized + * dynamically, and when it's resized, the new array is at a different + * memory location and the old array memory is discarded. Mutexs and cvs + * are accessed based off their address, so if this array was re-sized while + * there were outstanding operations on any mutexs or cvs in the array + * then the system would tip over. In the future the lx_ptm_handle structure + * array should probably be replaced with either an array of pointers to + * lx_ptm_handle structures or some other kind of data structure containing + * pointers to lx_ptm_handle structures. Then the lx_ptm_ops structure + * could be folded directly into the lx_ptm_handle structures. (This will + * also require the definition of a new locking mechanism to protect the + * contents of lx_ptm_handle structures.) + */ +typedef struct lx_ptm_ops { + int lpo_rops; + kcondvar_t lpo_rops_cv; + kmutex_t lpo_rops_lock; +} lx_ptm_ops_t; + +/* + * Every open of the master terminal device in a zone results in a new + * lx_ptm_handle handle allocation. These handles are stored in an array + * hanging off the lx_ptm_state structure. + */ +typedef struct lx_ptm_handle { + /* Device handle to the underlying real /dev/ptmx master terminal. 
*/ + ldi_handle_t lph_handle; + + /* Flag to indicate if TIOCPKT mode has been enabled. */ + int lph_pktio; + + /* Number of times the slave device has been opened/closed. */ + int lph_eofed; + + /* Callback handler in the ptm driver to check if slave is open. */ + ptmptsopencb_t lph_ppocb; + + /* Pointer to state for operations on underlying device. */ + lx_ptm_ops_t *lph_lpo; +} lx_ptm_handle_t; + +/* + * Global state for the lx_ptm driver. + */ +typedef struct lx_ptm_state { + /* lx_ptm device devinfo pointer */ + dev_info_t *lps_dip; + + /* LDI ident used to open underlying real /dev/ptmx master terminals. */ + ldi_ident_t lps_li; + + /* pts drivers major number */ + major_t lps_pts_major; + + /* rw lock used to manage access and growth of lps_lh_array */ + krwlock_t lps_lh_rwlock; + + /* number of elements in lps_lh_array */ + uint_t lps_lh_count; + + /* Array of handles to underlying real /dev/ptmx master terminals. */ + lx_ptm_handle_t *lps_lh_array; +} lx_ptm_state_t; + +/* Pointer to the lx_ptm global state structure. */ +static lx_ptm_state_t lps; + +/* + * List of modules to be autopushed onto slave terminal devices when they + * are opened in an lx branded zone. + */ +static char *lx_pts_mods[] = { + "ptem", + "ldterm", + "ttcompat", + LDLINUX_MOD, + NULL +}; + +static void +lx_ptm_lh_grow(uint_t index) +{ + uint_t new_lh_count, old_lh_count; + lx_ptm_handle_t *new_lh_array, *old_lh_array; + + /* + * allocate a new array. we drop the rw lock on the array so that + * readers can still access devices in case our memory allocation + * blocks. 
+ */ + new_lh_count = MAX(lps.lps_lh_count + LP_PTY_INC, index + 1); + new_lh_array = + kmem_zalloc(sizeof (lx_ptm_handle_t) * new_lh_count, KM_SLEEP); + + /* + * double check that we still actually need to increase the size + * of the array + */ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + if (index < lps.lps_lh_count) { + /* someone beat us to it so there's nothing more to do */ + rw_exit(&lps.lps_lh_rwlock); + kmem_free(new_lh_array, + sizeof (lx_ptm_handle_t) * new_lh_count); + return; + } + + /* copy the existing data into the new array */ + ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL)); + ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL)); + if (lps.lps_lh_count != 0) { + bcopy(lps.lps_lh_array, new_lh_array, + sizeof (lx_ptm_handle_t) * lps.lps_lh_count); + } + + /* save info on the old array */ + old_lh_array = lps.lps_lh_array; + old_lh_count = lps.lps_lh_count; + + /* install the new array */ + lps.lps_lh_array = new_lh_array; + lps.lps_lh_count = new_lh_count; + + rw_exit(&lps.lps_lh_rwlock); + + /* free the old array */ + if (old_lh_array != NULL) { + kmem_free(old_lh_array, + sizeof (lx_ptm_handle_t) * old_lh_count); + } +} + +static void +lx_ptm_lh_insert(uint_t index, ldi_handle_t lh) +{ + lx_ptm_ops_t *lpo; + + ASSERT(lh != NULL); + + /* Allocate and initialize the ops structure */ + lpo = kmem_zalloc(sizeof (lx_ptm_ops_t), KM_SLEEP); + mutex_init(&lpo->lpo_rops_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&lpo->lpo_rops_cv, NULL, CV_DEFAULT, NULL); + + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + /* check if we need to grow the size of the layered handle array */ + if (index >= lps.lps_lh_count) { + rw_exit(&lps.lps_lh_rwlock); + lx_ptm_lh_grow(index); + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + } + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle == NULL); + ASSERT(lps.lps_lh_array[index].lph_pktio == 0); + ASSERT(lps.lps_lh_array[index].lph_eofed == 0); + 
ASSERT(lps.lps_lh_array[index].lph_lpo == NULL); + + /* insert the new handle and return */ + lps.lps_lh_array[index].lph_handle = lh; + lps.lps_lh_array[index].lph_pktio = 0; + lps.lps_lh_array[index].lph_eofed = 0; + lps.lps_lh_array[index].lph_lpo = lpo; + + rw_exit(&lps.lps_lh_rwlock); +} + +static ldi_handle_t +lx_ptm_lh_remove(uint_t index) +{ + ldi_handle_t lh; + + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + ASSERT(lps.lps_lh_array[index].lph_lpo->lpo_rops == 0); + ASSERT(!MUTEX_HELD(&lps.lps_lh_array[index].lph_lpo->lpo_rops_lock)); + + /* free the write handle */ + kmem_free(lps.lps_lh_array[index].lph_lpo, sizeof (lx_ptm_ops_t)); + lps.lps_lh_array[index].lph_lpo = NULL; + + /* remove the handle and return it */ + lh = lps.lps_lh_array[index].lph_handle; + lps.lps_lh_array[index].lph_handle = NULL; + lps.lps_lh_array[index].lph_pktio = 0; + lps.lps_lh_array[index].lph_eofed = 0; + rw_exit(&lps.lps_lh_rwlock); + return (lh); +} + +static void +lx_ptm_lh_get_ppocb(uint_t index, ptmptsopencb_t *ppocb) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + *ppocb = lps.lps_lh_array[index].lph_ppocb; + rw_exit(&lps.lps_lh_rwlock); +} + +static void +lx_ptm_lh_set_ppocb(uint_t index, ptmptsopencb_t *ppocb) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + lps.lps_lh_array[index].lph_ppocb = *ppocb; + rw_exit(&lps.lps_lh_rwlock); +} + +static ldi_handle_t +lx_ptm_lh_lookup(uint_t index) +{ + ldi_handle_t lh; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the handle */ + lh = lps.lps_lh_array[index].lph_handle; + rw_exit(&lps.lps_lh_rwlock); + return (lh); +} + +static lx_ptm_ops_t * 
+lx_ptm_lpo_lookup(uint_t index) +{ + lx_ptm_ops_t *lpo; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_lpo != NULL); + + /* return the handle */ + lpo = lps.lps_lh_array[index].lph_lpo; + rw_exit(&lps.lps_lh_rwlock); + return (lpo); +} + +static int +lx_ptm_lh_pktio_get(uint_t index) +{ + int pktio; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the pktio state */ + pktio = lps.lps_lh_array[index].lph_pktio; + rw_exit(&lps.lps_lh_rwlock); + return (pktio); +} + +static void +lx_ptm_lh_pktio_set(uint_t index, int pktio) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* set the pktio state */ + lps.lps_lh_array[index].lph_pktio = pktio; + rw_exit(&lps.lps_lh_rwlock); +} + +static int +lx_ptm_lh_eofed_get(uint_t index) +{ + int eofed; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the eofed state */ + eofed = lps.lps_lh_array[index].lph_eofed; + rw_exit(&lps.lps_lh_rwlock); + return (eofed); +} + +static void +lx_ptm_lh_eofed_set(uint_t index) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* set the eofed state */ + lps.lps_lh_array[index].lph_eofed++; + rw_exit(&lps.lps_lh_rwlock); +} + +static int +lx_ptm_read_start(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* Wait for other read operations to finish */ + while (lpo->lpo_rops != 0) { + if (cv_wait_sig(&lpo->lpo_rops_cv, &lpo->lpo_rops_lock) == 0) { + mutex_exit(&lpo->lpo_rops_lock); + return (-1); + } + } + + /* Start a read operation */ + 
VERIFY(++lpo->lpo_rops == 1); + mutex_exit(&lpo->lpo_rops_lock); + return (0); +} + +static void +lx_ptm_read_end(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* End a read operation */ + VERIFY(--lpo->lpo_rops == 0); + cv_signal(&lpo->lpo_rops_cv); + + mutex_exit(&lpo->lpo_rops_lock); +} + +static int +lx_ptm_pts_isopen(dev_t dev) +{ + ptmptsopencb_t ppocb; + + lx_ptm_lh_get_ppocb(DEVT_TO_INDEX(dev), &ppocb); + return (ppocb.ppocb_func(ppocb.ppocb_arg)); +} + +static void +lx_ptm_eof_read(ldi_handle_t lh) +{ + struct uio uio; + iovec_t iov; + char junk[1]; + + /* + * We can remove any EOF message from the head of the stream by + * doing a zero byte read from the stream. + */ + iov.iov_len = 0; + iov.iov_base = junk; + uio.uio_iovcnt = 1; + uio.uio_iov = &iov; + uio.uio_resid = iov.iov_len; + uio.uio_offset = 0; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = 0; + uio.uio_llimit = MAXOFFSET_T; + (void) ldi_read(lh, &uio, kcred); +} + +static int +lx_ptm_eof_drop_1(dev_t dev, int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, msg_size, msg_count; + + *rvalp = 0; + + /* + * Check if there is an EOF message (represented by a zero length + * data message) at the head of the stream. Note that the + * I_NREAD ioctl is a streams framework ioctl so it will succeed + * even if there have been previous write errors on this stream. + */ + if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size, + FKIOCTL, kcred, &msg_count)) != 0) + return (err); + + if ((msg_count == 0) || (msg_size != 0)) { + /* No EOF message found */ + return (0); + } + + /* Record the fact that the slave device has been closed. 
*/ + lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev)); + + /* drop the EOF */ + lx_ptm_eof_read(lh); + *rvalp = 1; + return (0); +} + +static int +lx_ptm_eof_drop(dev_t dev, int *rvalp) +{ + int rval, err; + + if (rvalp != NULL) + *rvalp = 0; + for (;;) { + if ((err = lx_ptm_eof_drop_1(dev, &rval)) != 0) + return (err); + if (rval == 0) + return (0); + if (rvalp != NULL) + *rvalp = 1; + } +} + +static int +lx_ptm_data_check(dev_t dev, int ignore_eof, int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + *rvalp = 0; + if (ignore_eof) { + int size, rval; + + if ((err = ldi_ioctl(lh, FIONREAD, (intptr_t)&size, + FKIOCTL, kcred, &rval)) != 0) + return (err); + if (size != 0) + *rvalp = 1; + } else { + int msg_size, msg_count; + + if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size, + FKIOCTL, kcred, &msg_count)) != 0) + return (err); + if (msg_count != 0) + *rvalp = 1; + } + return (0); +} + +static int +lx_ptm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int err; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, LX_PTM_MINOR_NODE, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + err = ldi_ident_from_dip(dip, &lps.lps_li); + if (err != 0) { + /* undo the create above, using the name it was created with */ + ddi_remove_minor_node(dip, LX_PTM_MINOR_NODE); + return (DDI_FAILURE); + } + + lps.lps_dip = dip; + lps.lps_pts_major = ddi_name_to_major(LP_PTS_DRV_NAME); + + rw_init(&lps.lps_lh_rwlock, NULL, RW_DRIVER, NULL); + lps.lps_lh_count = 0; + lps.lps_lh_array = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_ptm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ldi_ident_release(lps.lps_li); + lps.lps_dip = NULL; + + ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL)); + ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL)); + if (lps.lps_lh_array != NULL) { + kmem_free(lps.lps_lh_array, + sizeof (lx_ptm_handle_t) * lps.lps_lh_count); +
lps.lps_lh_array = NULL; + lps.lps_lh_count = 0; + } + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_ptm_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + struct strioctl iocb; + ptmptsopencb_t ppocb = { NULL, NULL }; + ldi_handle_t lh; + major_t maj, our_major = getmajor(*devp); + minor_t min, lastmin; + uint_t index, anchor = 1; + dev_t ptm_dev; + int err, rval = 0; + + /* + * Don't support the FNDELAY flag and FNONBLOCK until we either + * find a Linux app that opens /dev/ptmx with the O_NDELAY + * or O_NONBLOCK flags explicitly, or until we create test cases + * to determine how reads of master terminal devices opened with + * these flags behave in different situations on Linux. Supporting + * these flags will involve enhancing our read implementation + * and changing the way it deals with EOF notifications. + */ + if (flag & (FNDELAY | FNONBLOCK)) + return (ENOTSUP); + + /* + * we're layered on top of the ptm driver so open that driver + * first. (note that we're opening /dev/ptmx in the global + * zone, not ourselves in the Linux zone.) + */ + err = ldi_open_by_name(LP_PTM_PATH, flag, credp, &lh, lps.lps_li); + if (err != 0) + return (err); + + /* get the devt returned by the ptmx open */ + err = ldi_get_dev(lh, &ptm_dev); + if (err != 0) { + (void) ldi_close(lh, flag, credp); + return (err); + } + + /* + * we're a cloning driver so here we'll change the devt that we + * return. the ptmx is also a cloning driver so we'll just use + * its minor number as our minor number (it already manages its + * minor name space so no reason to duplicate the effort.) + */ + index = getminor(ptm_dev); + *devp = makedevice(our_major, INDEX_TO_MINOR(index)); + + /* Get a callback function to query if the pts device is open.
*/ + iocb.ic_cmd = PTMPTSOPENCB; + iocb.ic_timout = 0; + iocb.ic_len = sizeof (ppocb); + iocb.ic_dp = (char *)&ppocb; + + err = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, kcred, &rval); + if ((err != 0) || (rval != 0)) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + ASSERT(ppocb.ppocb_func != NULL); + + /* + * now setup autopush for the terminal slave device. this is + * necessary so that when a Linux program opens the device we + * can push required strmod modules onto the stream. in Solaris + * this is normally done by the application that actually + * allocates the terminal. + */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + err = kstr_autopush(SET_AUTOPUSH, &maj, &min, &lastmin, + &anchor, lx_pts_mods); + if (err != 0) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + + /* save off this layered handle for future accesses */ + lx_ptm_lh_insert(index, lh); + lx_ptm_lh_set_ppocb(index, &ppocb); + return (0); +} + +/*ARGSUSED*/ +static int +lx_ptm_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + ldi_handle_t lh; + major_t maj; + minor_t min, lastmin; + uint_t index; + int err; + + index = DEVT_TO_INDEX(dev); + + /* + * we must cleanup all the state associated with this major/minor + * terminal pair before actually closing the ptm master device. + * this is required because once the close of the ptm device is + * complete major/minor terminal pair is immediatly available for + * re-use in any zone. 
+ */ + + /* free up our saved reference for this layered handle */ + lh = lx_ptm_lh_remove(index); + + /* unconfigure autopush for the associated terminal slave device */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + do { + /* + * we loop here because we don't want to release this ptm + * node if autopush can't be disabled on the associated + * slave device because then bad things could happen if + * another brand were to get this terminal allocated + * to them. + * + * XXX should we ever give up? + */ + err = kstr_autopush(CLR_AUTOPUSH, &maj, &min, &lastmin, + 0, NULL); + } while (err != 0); + + err = ldi_close(lh, flag, credp); + + /* + * note that we don't have to bother with changing the permissions + * on the associated slave device here. the reason is that no one + * can actually open the device until its associated master + * device is re-opened, which will result in the permissions on + * it being reset. + */ + return (err); +} + +static int +lx_ptm_read_loop(dev_t dev, struct uio *uiop, cred_t *credp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, rval; + struct uio uio = *uiop; + + *loop = 0; + + /* + * Here's another way that Linux master terminals behave differently + * from Solaris master terminals. If you do a read on a Linux + * master terminal (that was opened without NDELAY and NONBLOCK) + * whose corresponding slave terminal is currently closed and + * has been opened and closed at least once, Linux returns -1 and + * sets errno to EIO whereas Solaris blocks. + */ + if (lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev))) { + /* Slave has been opened and closed at least once. */ + if (lx_ptm_pts_isopen(dev) == 0) { + /* + * Slave is closed. Make sure that data is available + * before attempting a read. + */ + if ((err = lx_ptm_data_check(dev, 0, &rval)) != 0) + return (err); + + /* If there is no data available then return. */ + if (rval == 0) + return (EIO); + } + } + + /* Actually do the read operation.
*/ + if ((err = ldi_read(lh, uiop, credp)) != 0) + return (err); + + /* If read returned actual data then return. */ + if (uio.uio_resid != uiop->uio_resid) + return (0); + + /* + * This was a zero byte read (ie, an EOF). This indicates + * that the slave terminal device has been closed. Record + * the fact that the slave device has been closed and retry + * the read operation. + */ + lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev)); + *loop = 1; + return (0); +} + +static int +lx_ptm_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int pktio = lx_ptm_lh_pktio_get(DEVT_TO_INDEX(dev)); + int err, loop; + struct uio uio; + struct iovec iovp; + + ASSERT(uiop->uio_iovcnt > 0); + + /* + * If packet mode has been enabled (via TIOCPKT) we need to pad + * all read requests with a leading byte that indicates any + * relevant control status information. + */ + if (pktio != 0) { + /* + * We'd like to write the control information into + * the current buffer but we can't yet. We don't + * want to modify userspace memory here only to have + * the read operation fail later. So instead + * what we'll do here is read one character from the + * beginning of the memory pointed to by the uio + * structure. This will advance the output pointer + * by one. Then when the read completes successfully + * we can update the byte that we passed over. Before + * we do the read make a copy of the current uiop and + * iovec structs so we can write to them later. + */ + uio = *uiop; + iovp = *uiop->uio_iov; + uio.uio_iov = &iovp; + + if (uwritec(uiop) == -1) + return (EFAULT); + } + + do { + /* + * Serialize all reads. We need to do this so that we can + * properly emulate the behavior of master terminals on Linux. + * In reality this serialization should not pose any kind of + * performance problem since it would be very strange to have + * multiple threads trying to read from the same master + * terminal device concurrently.
+ */ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_read_loop(dev, uiop, credp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + + if (pktio != 0) { + uint8_t pktio_data = TIOCPKT_DATA; + + /* + * Note that the control status information we + * pass back is faked up in the sense that we + * don't actually report any events, we always + * report a status of 0. + */ + if (uiomove(&pktio_data, 1, UIO_READ, &uio) != 0) + return (EFAULT); + } + + return (0); +} + +static int +lx_ptm_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + err = ldi_write(lh, uiop, credp); + + return (err); +} + +static int +lx_ptm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + /* + * here we need to make sure that we never allow the + * I_SETSIG and I_ESETSIG ioctls to pass through. we + * do this because we can't support them. + * + * the native Solaris ptm device supports these ioctls because + * they are streams framework ioctls and all streams devices + * support them by default. these ioctls cause the current + * process to be registered with a stream and receive signals + * when certain stream events occur. + * + * a problem arises with cleanup of these registrations + * for layered drivers. + * + * normally the streams framework is notified whenever a + * process closes any reference to a stream and it goes ahead + * and cleans up these registrations. but actual device drivers + * are not notified when a process performs a close operation + * unless the process is closing the last opened reference to + * the device on the entire system. 
+ * + * so while we could pass these ioctls on and allow processes + * to register for signal delivery, we would never receive + * any notification when those processes exit (or close a + * stream) and we wouldn't be able to unregister them. + * + * luckily these operations are streams specific and Linux + * doesn't support streams devices. so it doesn't actually + * seem like we need to support these ioctls. if it turns + * out that we do need to support them for some reason in + * the future, the current driver model will have to be + * enhanced to better support streams device layering. + */ + if ((cmd == I_SETSIG) || (cmd == I_ESETSIG)) + return (EINVAL); + + /* + * here we fake up support for TIOCPKT. Linux applications expect + * /dev/ptmx to support this ioctl, but on Solaris it doesn't. + * (it is supported on older bsd style ptys.) so we'll fake + * up support for it here. + * + * the reason that this ioctl is emulated here instead of in + * userland is that this ioctl affects the results returned + * from read() operations. if this ioctl was emulated in + * userland the brand library would need to intercept all + * read operations and check to see if pktio was enabled + * for the fd being read from. since this ioctl only needs + * to be supported on the ptmx device it makes more sense + * to support it here where we can easily update the results + * returned for read() operations performed on ourselves.
+ */ + if (cmd == TIOCPKT) { + int pktio; + + if (ddi_copyin((void *)arg, &pktio, sizeof (pktio), + mode) != DDI_SUCCESS) + return (EFAULT); + + if (pktio == 0) + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 0); + else + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 1); + + return (0); + } + + err = ldi_ioctl(lh, cmd, arg, mode, credp, rvalp); + + return (err); +} + +static int +lx_ptm_poll_loop(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + short reventsp2; + int err, rval; + + *loop = 0; + + /* + * If the slave device has been opened and closed at least + * once and the slave device is currently closed, then poll + * always needs to return immediately. + */ + if ((lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev)) != 0) && + (lx_ptm_pts_isopen(dev) == 0)) { + /* In this case always return POLLHUP */ + *reventsp = POLLHUP; + + /* + * Check if there really is data on the stream. + * If so set the correct return flags. + */ + if ((err = lx_ptm_data_check(dev, 1, &rval)) != 0) { + /* Something went wrong. */ + return (err); + } + if (rval != 0) + *reventsp |= (events & (POLLIN | POLLRDNORM)); + + /* + * Is the user checking for writability? Note that for ptm + * devices Linux seems to ignore the POLLWRBAND write flag. + */ + if ((events & POLLWRNORM) == 0) + return (0); + + /* + * To check if the stream is writable we have to actually + * call poll, but make sure to set anyyet to 1 to prevent + * the streams framework from setting up callbacks. + */ + if ((err = ldi_poll(lh, POLLWRNORM, 1, &reventsp2, NULL)) != 0) + return (err); + + *reventsp |= (reventsp2 & POLLWRNORM); + } else { + int lockstate; + + /* The slave device is open, do the poll */ + if ((err = ldi_poll(lh, events, anyyet, reventsp, phpp)) != 0) + return (err); + + /* + * Drop any leading EOFs on the stream. + * + * Note that we have to use pollunlock() here to avoid + * recursive mutex enters in the poll framework.
The + * reason is that if there is an EOF message on the stream + * then the act of reading from the queue to remove the + * message can cause the ptm drivers event service + * routine to be invoked, and if there is no open + * slave device then the ptm driver may generate + * error messages and put them on the stream. This + * in turn will generate a poll event and the poll + * framework will try to invoke any poll callbacks + * associated with the stream. In the process of + * doing that the poll framework will try to aquire + * locks that we are already holding. So we need to + * drop those locks here before we do our read. + */ + lockstate = pollunlock(); + err = lx_ptm_eof_drop(dev, &rval); + pollrelock(lockstate); + if (err) + return (err); + + /* If no EOF was dropped then return */ + if (rval == 0) + return (0); + + /* + * An EOF was removed from the stream. Retry the entire + * poll operation from the top because polls on the ptm + * device should behave differently now. + */ + *loop = 1; + } + return (0); +} + +static int +lx_ptm_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int loop, err; + + do { + /* Serialize ourself wrt read operations. 
*/ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_poll_loop(dev, + events, anyyet, reventsp, phpp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + return (0); +} + +static struct cb_ops lx_ptm_cb_ops = { + lx_ptm_open, /* open */ + lx_ptm_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + lx_ptm_read, /* read */ + lx_ptm_write, /* write */ + lx_ptm_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + lx_ptm_poll, /* chpoll */ + ddi_prop_op, /* prop_op */ + NULL, /* cb_str */ + D_NEW | D_MP, + CB_REV, + NULL, + NULL +}; + +static struct dev_ops lx_ptm_ops = { + DEVO_REV, + 0, + ddi_getinfo_1to1, + nulldev, + nulldev, + lx_ptm_attach, + lx_ptm_detach, + nodev, + &lx_ptm_cb_ops, + NULL, + NULL +}; + +static struct modldrv modldrv = { + &mod_driverops, + "Linux master terminal driver 'lx_ptm' %I%", + &lx_ptm_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.conf b/usr/src/uts/common/brand/lx/io/lx_ptm.conf new file mode 100644 index 0000000000..481b4e3c74 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_ptm.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_ptm" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c new file mode 100644 index 0000000000..d993c1eefc --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_brand.c @@ -0,0 +1,836 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/exec.h> +#include <sys/lx_impl.h> +#include <sys/machbrand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_pid.h> +#include <sys/lx_futex.h> +#include <sys/lx_brand.h> +#include <sys/termios.h> +#include <sys/sunddi.h> +#include <sys/ddi.h> +#include <sys/exec.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/machelf.h> +#include <sys/auxv.h> +#include <sys/priv.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/archsystm.h> +#include <sys/zone.h> +#include <sys/brand.h> + +int lx_debug = 0; + +void lx_setbrand(proc_t *); +int lx_getattr(zone_t *, int, void *, size_t *); +int lx_setattr(zone_t *, int, void *, size_t); +int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, + uintptr_t, uintptr_t, uintptr_t); +void lx_copy_procdata(proc_t *, proc_t *); + +extern void lx_setrval(klwp_t *, int, int); +extern void lx_proc_exit(proc_t *, klwp_t *); +extern void lx_exec(); +extern int lx_initlwp(klwp_t *); +extern void lx_forklwp(klwp_t *, klwp_t *); +extern void lx_exitlwp(klwp_t *); +extern void lx_freelwp(klwp_t *); +extern greg_t lx_fixsegreg(greg_t, model_t); +extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); + +int lx_systrace_brand_enabled; + +lx_systrace_f *lx_systrace_entry_ptr; +lx_systrace_f *lx_systrace_return_ptr; + +static int lx_systrace_enabled; + +static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, + struct intpdata *idata, int level, long *execsz, int setid, + caddr_t exec_file, struct cred *cred, int brand_action); + +/* lx brand */ +struct brand_ops lx_brops = { + lx_brandsys, + lx_setbrand, + lx_getattr, + lx_setattr, + lx_copy_procdata, + lx_proc_exit, 
+ lx_exec, + lx_setrval, + lx_initlwp, + lx_forklwp, + lx_freelwp, + lx_exitlwp, + lx_elfexec +}; + +struct brand_mach_ops lx_mops = { + NULL, + lx_brand_int80_callback, + NULL, + NULL, + NULL, + lx_fixsegreg, +}; + +struct brand lx_brand = { + BRAND_VER_1, + "lx", + &lx_brops, + &lx_mops +}; + +static struct modlbrand modlbrand = { + &mod_brandops, "lx brand %I%", &lx_brand +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlbrand, NULL +}; + +void +lx_proc_exit(proc_t *p, klwp_t *lwp) +{ + zone_t *z = p->p_zone; + + ASSERT(p->p_brand != NULL); + ASSERT(p->p_brand_data != NULL); + + /* + * If init is dying and we aren't explicitly shutting down the zone + * or the system, then Solaris is about to restart init. The Linux + * init is not designed to handle a restart, which it interprets as + * a reboot. To give it a sane environment in which to run, we + * reboot the zone. + */ + if (p->p_pid == z->zone_proc_initpid) { + if (z->zone_boot_err == 0 && + z->zone_restart_init && + zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && + zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) + (void) zone_kadmin(A_REBOOT, 0, NULL, CRED()); + } else { + lx_exitlwp(lwp); + kmem_free(p->p_brand_data, sizeof (struct lx_proc_data)); + p->p_brand_data = NULL; + p->p_brand = &native_brand; + } +} + +void +lx_setbrand(proc_t *p) +{ + kthread_t *t = p->p_tlist; + int err; + + ASSERT(p->p_brand_data == NULL); + ASSERT(ttolxlwp(curthread) == NULL); + + p->p_brand_data = kmem_zalloc(sizeof (struct lx_proc_data), KM_SLEEP); + + /* + * This routine can only be called for single-threaded processes. + * Since lx_initlwp() can only fail if we run out of PIDs for + * multithreaded processes, we know that this can never fail. 
+ */ + err = lx_initlwp(t->t_lwp); + ASSERT(err == 0); +} + +/* ARGSUSED */ +int +lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) +{ + boolean_t val; + + if (attr == LX_ATTR_RESTART_INIT) { + if (bufsize > sizeof (boolean_t)) + return (ERANGE); + if (copyin(buf, &val, sizeof (val)) != 0) + return (EFAULT); + if (val != B_TRUE && val != B_FALSE) + return (EINVAL); + zone->zone_restart_init = val; + return (0); + } + return (EINVAL); +} + +/* ARGSUSED */ +int +lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) +{ + if (attr == LX_ATTR_RESTART_INIT) { + if (*bufsize < sizeof (boolean_t)) + return (ERANGE); + if (copyout(&zone->zone_restart_init, buf, + sizeof (boolean_t)) != 0) + return (EFAULT); + *bufsize = sizeof (boolean_t); + return (0); + } + /* unknown attribute: positive errno, matching lx_setattr() */ + return (EINVAL); +} + +/* + * Enable ptrace system call tracing for the given LWP. This is done by + * both setting the flag in that LWP's brand data (in the kernel) and setting + * the process-wide trace flag (in the brand library of the traced process). + */ +static int +lx_ptrace_syscall_set(pid_t pid, id_t lwpid, int set) +{ + proc_t *p; + kthread_t *t; + klwp_t *lwp; + lx_proc_data_t *lpdp; + lx_lwp_data_t *lldp; + uintptr_t addr; + int ret, flag = 1; + + if ((p = sprlock(pid)) == NULL) + return (ESRCH); + + if (priv_proc_cred_perm(curproc->p_cred, p, NULL, VWRITE) != 0) { + sprunlock(p); + return (EPERM); + } + + if ((t = idtot(p, lwpid)) == NULL || (lwp = ttolwp(t)) == NULL) { + sprunlock(p); + return (ESRCH); + } + + if ((lpdp = p->p_brand_data) == NULL || + (lldp = lwp->lwp_brand) == NULL) { + sprunlock(p); + return (ESRCH); + } + + if (set) { + /* + * Enable the ptrace flag for this LWP and this process. Note + * that we will turn off the LWP's ptrace flag, but we don't + * turn off the process's ptrace flag.
+ */ + lldp->br_ptrace = 1; + lpdp->l_ptrace = 1; + + addr = lpdp->l_traceflag; + + mutex_exit(&p->p_lock); + + /* + * This can fail only in some rare corner cases where the + * process is exiting or we're completely out of memory. In + * these cases, it's sufficient to return an error to the ptrace + * consumer and leave the process-wide flag set. + */ + ret = uwrite(p, &flag, sizeof (flag), addr); + + mutex_enter(&p->p_lock); + + /* + * If we couldn't set the trace flag, unset the LWP's ptrace + * flag as there ptrace consumer won't expect this LWP to stop. + */ + if (ret != 0) + lldp->br_ptrace = 0; + } else { + lldp->br_ptrace = 0; + ret = 0; + } + + sprunlock(p); + + if (ret != 0) + ret = EIO; + + return (ret); +} + +static void +lx_ptrace_fire(void) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + lx_lwp_data_t *lldp = lwp->lwp_brand; + + /* + * The ptrace flag only applies until the next event is encountered + * for the given LWP. If it's set, turn off the flag and poke the + * controlling process by raising a signal. + */ + if (lldp->br_ptrace) { + lldp->br_ptrace = 0; + tsignal(t, SIGTRAP); + } +} + +void +lx_brand_systrace_enable(void) +{ + extern void lx_brand_int80_enable(void); + + ASSERT(!lx_systrace_enabled); + + lx_brand_int80_enable(); + + lx_systrace_enabled = 1; +} + +void +lx_brand_systrace_disable(void) +{ + extern void lx_brand_int80_disable(void); + + ASSERT(lx_systrace_enabled); + + lx_brand_int80_disable(); + + lx_systrace_enabled = 0; +} + +/* + * Get the addresses of the user-space system call handler and attach it to + * the proc structure. Returning 0 indicates success; the value returned + * by the system call is the value stored in rval. Returning a non-zero + * value indicates a failure; the value returned is used to set errno, -1 + * is returned from the syscall and the contents of rval are ignored. To + * set errno and have the syscall return a value other than -1 we can + * manually set errno and rval and return 0. 
+ */ +int +lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) +{ + kthread_t *t = curthread; + proc_t *p = ttoproc(t); + lx_proc_data_t *pd; + int linux_call; + struct termios *termios; + uint_t termios_len; + int error; + lx_brand_registration_t reg; + + /* + * There is one operation that is suppored for non-branded + * process. B_EXEC_BRAND. This is the equilivant of an + * exec call, but the new process that is created will be + * a branded process. + */ + if (cmd == B_EXEC_BRAND) { + ASSERT(p->p_zone != NULL); + ASSERT(p->p_zone->zone_brand == &lx_brand); + return (exec_common( + (char *)arg1, (const char **)arg2, (const char **)arg3, + EBA_BRAND)); + } + + /* For all other operations this must be a branded process. */ + if (p->p_brand == NULL) + return (set_errno(ENOSYS)); + + ASSERT(p->p_brand == &lx_brand); + ASSERT(p->p_brand_data != NULL); + + switch (cmd) { + case B_REGISTER: + if (p->p_model == DATAMODEL_NATIVE) { + if (copyin((void *)arg1, ®, sizeof (reg)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } +#ifdef _LP64 + } else { + lx_brand_registration32_t reg32; + + if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + + reg.lxbr_version = (uint_t)reg32.lxbr_version; + reg.lxbr_handler = + (void *)(uintptr_t)reg32.lxbr_handler; + reg.lxbr_tracehandler = + (void *)(uintptr_t)reg32.lxbr_tracehandler; + reg.lxbr_traceflag = + (void *)(uintptr_t)reg32.lxbr_traceflag; +#endif + } + + if (reg.lxbr_version != LX_VERSION_1) { + lx_print("Invalid brand library version (%u)\n", + reg.lxbr_version); + return (EINVAL); + } + + lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", + (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); + pd = p->p_brand_data; + pd->l_handler = (uintptr_t)reg.lxbr_handler; 
+ pd->l_tracehandler = (uintptr_t)reg.lxbr_tracehandler; + pd->l_traceflag = (uintptr_t)reg.lxbr_traceflag; + *rval = 0; + return (0); + case B_TTYMODES: + /* This is necessary for emulating TCGETS ioctls. */ + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, + &termios_len) != DDI_SUCCESS) + return (EIO); + + ASSERT(termios_len == sizeof (*termios)); + + if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) { + ddi_prop_free(termios); + return (EFAULT); + } + + ddi_prop_free(termios); + *rval = 0; + return (0); + + case B_ELFDATA: + pd = curproc->p_brand_data; + if (copyout(&pd->l_elf_data, (void *)arg1, + sizeof (lx_elf_data_t)) != 0) { + (void) set_errno(EFAULT); + return (*rval = -1); + } + *rval = 0; + return (0); + + case B_EXEC_NATIVE: + error = exec_common( + (char *)arg1, (const char **)arg2, (const char **)arg3, + EBA_NATIVE); + if (error) { + (void) set_errno(error); + return (*rval = -1); + } + return (*rval = 0); + + case B_LPID_TO_SPAIR: + /* + * Given a Linux pid as arg1, return the Solaris pid in arg2 and + * the Solaris LWP in arg3. We also translate pid 1 (which is + * hardcoded in many applications) to the zone's init process. 
+ */ + { + pid_t s_pid; + id_t s_tid; + + if ((pid_t)arg1 == 1) { + s_pid = p->p_zone->zone_proc_initpid; + /* handle the dead/missing init(1M) case */ + if (s_pid == -1) + s_pid = 1; + s_tid = 1; + } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, + &s_tid) < 0) + return (ESRCH); + + if (copyout(&s_pid, (void *)arg2, + sizeof (s_pid)) != 0 || + copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) + return (EFAULT); + + *rval = 0; + return (0); + } + + case B_PTRACE_SYSCALL: + *rval = lx_ptrace_syscall_set((pid_t)arg1, (id_t)arg2, + (int)arg3); + return (0); + + case B_SYSENTRY: + if (lx_systrace_enabled) { + uint32_t args[6]; + + ASSERT(lx_systrace_entry_ptr != NULL); + + if (copyin((void *)arg2, args, sizeof (args)) != 0) + return (EFAULT); + + (*lx_systrace_entry_ptr)(arg1, args[0], args[1], + args[2], args[3], args[4], args[5]); + } + + lx_ptrace_fire(); + + pd = p->p_brand_data; + + /* + * If neither DTrace nor ptrace are interested in tracing + * this process any more, turn off the trace flag. + */ + if (!lx_systrace_enabled && !pd->l_ptrace) + (void) suword32((void *)pd->l_traceflag, 0); + + *rval = 0; + return (0); + + case B_SYSRETURN: + if (lx_systrace_enabled) { + ASSERT(lx_systrace_return_ptr != NULL); + + (*lx_systrace_return_ptr)(arg1, arg2, arg2, 0, 0, 0, 0); + } + + lx_ptrace_fire(); + + pd = p->p_brand_data; + + /* + * If neither DTrace nor ptrace are interested in tracing + * this process any more, turn off the trace flag. + */ + if (!lx_systrace_enabled && !pd->l_ptrace) + (void) suword32((void *)pd->l_traceflag, 0); + + *rval = 0; + return (0); + + case B_SET_AFFINITY_MASK: + case B_GET_AFFINITY_MASK: + /* + * Retrieve or store the CPU affinity mask for the + * requested linux pid. + * + * arg1 is a linux PID (0 means curthread). + * arg2 is the size of the given mask. + * arg3 is the address of the affinity mask.
+ */ + return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval)); + + default: + linux_call = cmd - B_EMULATE_SYSCALL; + if (linux_call >= 0 && linux_call < LX_NSYSCALLS) { + *rval = lx_emulate_syscall(linux_call, arg1, arg2, + arg3, arg4, arg5, arg6); + return (0); + } + } + + return (EINVAL); +} + +/* + * Copy the per-process brand data from a parent proc to a child. + */ +void +lx_copy_procdata(proc_t *child, proc_t *parent) +{ + lx_proc_data_t *cpd, *ppd; + + ppd = parent->p_brand_data; + + ASSERT(ppd != NULL); + + cpd = kmem_alloc(sizeof (lx_proc_data_t), KM_SLEEP); + *cpd = *ppd; + + child->p_brand_data = cpd; +} + +#if defined(_ELF32_COMPAT) +/* + * Currently, only 32-bit branded ELF executables are supported. + */ +#define elfexec elf32exec +#define mapexec_brand mapexec32_brand +#endif /* __amd64 */ + +extern int elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, + intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, + cred_t *cred, int brand_action); +extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *ehdr, Elf32_Addr *, + intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *); + +/* + * Exec routine called by elfexec() to load 32-bit Linux binaries. + */ +static int +lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, + struct intpdata *idata, int level, long *execsz, int setid, + caddr_t exec_file, struct cred *cred, int brand_action) +{ + int error; + vnode_t *nvp; + auxv32_t phdr_auxv = { AT_SUN_BRAND_PHDR, 0 }; + Ehdr ehdr; + Elf32_Addr uphdr_vaddr; + intptr_t voffset; + int interp; + int i; + struct execenv env; + struct user *up = PTOU(ttoproc(curthread)); + lx_elf_data_t *edp = + &((lx_proc_data_t *)ttoproc(curthread)->p_brand_data)->l_elf_data; + + ASSERT(ttoproc(curthread)->p_brand == &lx_brand); + ASSERT(ttoproc(curthread)->p_brand_data != NULL); + + /* + * Set the brandname and library name for the new process so that + * elfexec() puts them onto the stack. 
+ */ + args->brandname = LX_BRANDNAME; + args->emulator = LX_LIB_PATH; + + /* + * We will exec the brand library, and map in the linux linker and the + * linux executable. + */ + if (error = lookupname(LX_LIB_PATH, UIO_SYSSPACE, FOLLOW, NULLVPP, + &nvp)) { + uprintf("%s: not found.", LX_LIB); + return (error); + } + + if (error = elfexec(nvp, uap, args, idata, level + 1, execsz, setid, + exec_file, cred, brand_action)) { + VN_RELE(nvp); + return (error); + } + VN_RELE(nvp); + + bzero(&env, sizeof (env)); + + if (error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, + exec_file, &interp, &env.ex_bssbase, &env.ex_brkbase, + &env.ex_brksize)) + return (error); + + /* + * Save off the important properties of the lx executable. The brand + * library will ask us for this data later, when it is ready to set + * things up for the lx executable. + */ + edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff : + voffset + uphdr_vaddr; + edp->ed_entry = voffset + ehdr.e_entry; + edp->ed_phent = ehdr.e_phentsize; + edp->ed_phnum = ehdr.e_phnum; + + if (interp) { + if (ehdr.e_type == ET_DYN) { + /* + * This is a shared object executable, so we need to + * pick a reasonable place to put the heap. Just don't + * use the first page. + */ + env.ex_brkbase = (caddr_t)PAGESIZE; + env.ex_bssbase = (caddr_t)PAGESIZE; + } + + /* + * If the program needs an interpreter (most do), map it in and + * store relevant information about it in the aux vector, where + * the brand library can find it. + */ + if (error = lookupname(LX_LINKER, UIO_SYSSPACE, FOLLOW, NULLVPP, + &nvp)) { + uprintf("%s: not found.", LX_LINKER); + return (error); + } + if (error = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, + &voffset, exec_file, &interp, NULL, NULL, NULL)) { + VN_RELE(nvp); + return (error); + } + VN_RELE(nvp); + + /* + * Now that we know the base address of the brand's linker, + * place it in the aux vector. 
+ */ + edp->ed_base = voffset; + edp->ed_ldentry = voffset + ehdr.e_entry; + } else { + /* + * This program has no interpreter. The lx brand library will + * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector, + * so in this case, put the entry point of the main executable + * there. + */ + if (ehdr.e_type == ET_EXEC) { + /* + * An executable with no interpreter, this must be a + * statically linked executable, which means we loaded + * it at the address specified in the elf header, in + * which case the e_entry field of the elf header is an + * absolute address. + */ + edp->ed_ldentry = ehdr.e_entry; + edp->ed_entry = ehdr.e_entry; + } else { + /* + * A shared object with no interpreter, we use the + * calculated address from above. + */ + edp->ed_ldentry = edp->ed_entry; + } + + /* + * Delay setting the brkbase until the first call to brk(); + * see elfexec() for details. + */ + env.ex_bssbase = (caddr_t)0; + env.ex_brkbase = (caddr_t)0; + env.ex_brksize = 0; + } + + env.ex_vp = vp; + setexecenv(&env); + + /* + * We don't need to copy this stuff out. It is only used by our + * tools to locate the lx linker's debug section. But we should at + * least try to keep /proc's view of the aux vector consistent with + * what's on the process stack. + */ + phdr_auxv.a_un.a_val = edp->ed_phdr; + if (copyout(&phdr_auxv, args->brand_auxp, sizeof (phdr_auxv)) == -1) + return (EFAULT); + + /* + * /proc uses the AT_ENTRY aux vector entry to deduce + * the location of the executable in the address space. The user + * structure contains a copy of the aux vector that needs to have those + * entries patched with the values of the real lx executable (they + * currently contain the values from the lx brand library that was + * elfexec'd, above). + * + * For live processes, AT_BASE is used to locate the linker segment, + * which /proc and friends will later use to find Solaris symbols + * (such as rtld_db_preinit). 
However, for core files, /proc uses + * AT_ENTRY to find the right segment to label as the executable. + * So we set AT_ENTRY to be the entry point of the linux executable, + * but leave AT_BASE to be the address of the Solaris linker. + */ + for (i = 0; i < __KERN_NAUXV_IMPL; i++) { + if (up->u_auxv[i].a_type == AT_ENTRY) + up->u_auxv[i].a_un.a_val = edp->ed_entry; + if (up->u_auxv[i].a_type == AT_SUN_BRAND_PHDR) + up->u_auxv[i].a_un.a_val = edp->ed_phdr; + } + + return (0); +} + +int +_init(void) +{ + int err = 0; + + /* pid/tid conversion hash tables */ + lx_pid_init(); + + /* for lx_futex() */ + lx_futex_init(); + + err = mod_install(&modlinkage); + if (err != 0) { + cmn_err(CE_WARN, "Couldn't install lx brand module"); + + /* + * This looks drastic, but it should never happen. These + * two data structures should be completely free-able until + * they are used by Linux processes. Since the brand + * wasn't loaded there should be no Linux processes, and + * thus no way for these data structures to be modified. + * + * Release the pid hash tables as well as the futex state, so + * that a failed load does not leak what lx_pid_init() + * allocated above. + */ + lx_pid_fini(); + if (lx_futex_fini()) + panic("lx brand module cannot be loaded or unloaded."); + } + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + int futex_done = 0; + + /* + * If there are any zones using this brand, we can't allow it to be + * unloaded. + */ + if (brand_zone_count(&lx_brand)) + return (EBUSY); + + lx_pid_fini(); + + if ((err = lx_futex_fini()) != 0) + goto done; + futex_done = 1; + + err = mod_remove(&modlinkage); + +done: + if (err) { + /* + * If we can't unload the module, then we have to get it + * back into a sane state. 
+ */ + lx_pid_init(); + + if (futex_done) + lx_futex_init(); + + } + + return (err); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c new file mode 100644 index 0000000000..375b99fa46 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_misc.c @@ -0,0 +1,383 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/archsystm.h> +#include <sys/privregs.h> +#include <sys/exec.h> +#include <sys/lwp.h> +#include <sys/sem.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_pid.h> +#include <sys/lx_futex.h> + +/* Linux specific functions and definitions */ +void lx_setrval(klwp_t *, int, int); +void lx_exec(); +int lx_initlwp(klwp_t *); +void lx_forklwp(klwp_t *, klwp_t *); +void lx_exitlwp(klwp_t *); +void lx_freelwp(klwp_t *); +static void lx_save(klwp_t *); +static void lx_restore(klwp_t *); +extern void lx_ptrace_free(proc_t *); + +/* + * Set the return code for the forked child, always zero + */ +/*ARGSUSED*/ +void +lx_setrval(klwp_t *lwp, int v1, int v2) +{ + lwptoregs(lwp)->r_r0 = 0; +} + +/* + * Reset process state on exec(2) + */ +void +lx_exec() +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + int err; + + /* + * There are two mutually exclusive special cases we need to + * address. First, if this was a native process prior to this + * exec(), then this lwp won't have its brand-specific data + * initialized and it won't be assigned a Linux PID yet. Second, + * if this was a multi-threaded Linux process and this lwp wasn't + * the main lwp, then we need to make its Solaris and Linux PIDS + * match. + */ + if (lwpd == NULL) { + err = lx_initlwp(lwp); + /* + * Only possible failure from this routine should be an + * inability to allocate a new PID. Since single-threaded + * processes don't need a new PID, we should never hit this + * error. 
+ */ + ASSERT(err == 0); + lwpd = lwptolxlwp(lwp); + } else if (curthread->t_tid != 1) { + lx_pid_reassign(curthread); + } + + installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save, + NULL); + + /* + * clear out the tls array + */ + bzero(lwpd->br_tls, sizeof (lwpd->br_tls)); + + /* + * reset the tls entries in the gdt + */ + kpreempt_disable(); + lx_restore(lwp); + kpreempt_enable(); +} + +void +lx_exitlwp(klwp_t *lwp) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p; + kthread_t *t; + sigqueue_t *sqp = NULL; + pid_t ppid; + id_t ptid; + + if (lwpd == NULL) + return; /* second time thru' */ + + if (lwpd->br_clear_ctidp != NULL) { + (void) suword32(lwpd->br_clear_ctidp, 0); + (void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1, + NULL, NULL, 0); + } + + if (lwpd->br_signal != 0) { + /* + * The first thread in a process doesn't cause a signal to + * be sent when it exits. It was created by a fork(), not + * a clone(), so the parent should get signalled when the + * process exits. + */ + if (lwpd->br_ptid == -1) + goto free; + + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + /* + * If br_ppid is 0, it means this is a CLONE_PARENT thread, + * so the signal goes to the parent process - not to a + * specific thread in this process. + */ + p = lwptoproc(lwp); + if (lwpd->br_ppid == 0) { + mutex_enter(&p->p_lock); + ppid = p->p_ppid; + t = NULL; + } else { + /* + * If we have been reparented to init or if our + * parent thread is gone, then nobody gets + * signaled. 
+ */ + if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) || + (ptid == -1)) + goto free; + + mutex_enter(&pidlock); + if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + goto free; + } + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if ((t = idtot(p, ptid)) == NULL) { + mutex_exit(&p->p_lock); + goto free; + } + } + + sqp->sq_info.si_signo = lwpd->br_signal; + sqp->sq_info.si_code = lwpd->br_exitwhy; + sqp->sq_info.si_status = lwpd->br_exitwhat; + sqp->sq_info.si_pid = lwpd->br_pid; + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(p, t, sqp); + mutex_exit(&p->p_lock); + sqp = NULL; + } + +free: + if (sqp) + kmem_free(sqp, sizeof (sigqueue_t)); + + lx_freelwp(lwp); +} + +void +lx_freelwp(klwp_t *lwp) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + + if (lwpd != NULL) { + (void) removectx(lwptot(lwp), lwp, lx_save, lx_restore, + NULL, NULL, lx_save, NULL); + if (lwpd->br_pid != 0) + lx_pid_rele(lwptoproc(lwp)->p_pid, + lwptot(lwp)->t_tid); + + lwp->lwp_brand = NULL; + kmem_free(lwpd, sizeof (struct lx_lwp_data)); + } +} + +int +lx_initlwp(klwp_t *lwp) +{ + struct lx_lwp_data *lwpd; + struct lx_lwp_data *plwpd; + kthread_t *tp = lwptot(lwp); + + lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP); + lwpd->br_exitwhy = CLD_EXITED; + lwpd->br_lwp = lwp; + lwpd->br_clear_ctidp = NULL; + lwpd->br_set_ctidp = NULL; + lwpd->br_signal = 0; + lwpd->br_affinitymask = 0; + + /* + * The first thread in a process has ppid set to the parent + * process's pid, and ptid set to -1. Subsequent threads in the + * process have their ppid set to the pid of the thread that + * created them, and their ptid to that thread's tid. 
+ */ + if (tp->t_next == tp) { + lwpd->br_ppid = tp->t_procp->p_ppid; + lwpd->br_ptid = -1; + } else if (ttolxlwp(curthread) != NULL) { + plwpd = ttolxlwp(curthread); + bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls)); + lwpd->br_ppid = plwpd->br_pid; + lwpd->br_ptid = curthread->t_tid; + } else { + /* + * Oddball case: the parent thread isn't a Linux process. + */ + lwpd->br_ppid = 0; + lwpd->br_ptid = -1; + } + lwp->lwp_brand = lwpd; + + if (lx_pid_assign(tp)) { + kmem_free(lwpd, sizeof (struct lx_lwp_data)); + lwp->lwp_brand = NULL; + return (-1); + } + lwpd->br_tgid = lwpd->br_pid; + + installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, + lx_save, NULL); + + return (0); +} + +/* + * There is no need to have any locking for either the source or + * destination struct lx_lwp_data structs. This is always run in the + * thread context of the source thread, and the destination thread is + * always newly created and not referred to from anywhere else. + */ +void +lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp) +{ + struct lx_lwp_data *src = srclwp->lwp_brand; + struct lx_lwp_data *dst = dstlwp->lwp_brand; + + dst->br_ppid = src->br_pid; + dst->br_ptid = lwptot(srclwp)->t_tid; + bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls)); + + /* + * copy only these flags + */ + dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND; + dst->br_clone_args = NULL; +} + +/* + * When switching a Linux process off the CPU, clear its GDT entries. + */ +/* ARGSUSED */ +static void +lx_save(klwp_t *t) +{ + static user_desc_t null_desc; + static int inited; + user_desc_t *gdt; + int i; + + if (inited == 0) { + bzero(&null_desc, sizeof (null_desc)); + inited = 1; + } + + gdt = cpu_get_gdt(); + for (i = 0; i < LX_TLSNUM; i++) + gdt[GDT_TLSMIN + i] = null_desc; +} + +/* + * When switching a Linux process on the CPU, set its GDT entries. 
+ */ +static void +lx_restore(klwp_t *t) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(t); + user_desc_t *gdt; + user_desc_t *tls; + int i; + + ASSERT(lwpd); + + gdt = cpu_get_gdt(); + tls = lwpd->br_tls; + for (i = 0; i < LX_TLSNUM; i++) + gdt[GDT_TLSMIN + i] = tls[i]; +} + +void +lx_set_gdt(int entry, user_desc_t *descrp) +{ + user_desc_t *gdt = cpu_get_gdt(); + + gdt[entry] = *descrp; +} + +void +lx_clear_gdt(int entry) +{ + user_desc_t *gdt = cpu_get_gdt(); + + bzero(&gdt[entry], sizeof (user_desc_t)); +} + +longlong_t +lx_nosys() +{ + return (set_errno(ENOSYS)); +} + +longlong_t +lx_opnotsupp() +{ + return (set_errno(EOPNOTSUPP)); +} + +/* + * Brand-specific routine to check if given non-Solaris standard segment + * register values should be used as-is or if they should be modified to other + * values. + */ +/*ARGSUSED*/ +greg_t +lx_fixsegreg(greg_t sr, model_t datamodel) +{ + struct lx_lwp_data *lxlwp = ttolxlwp(curthread); + + /* + * If the segreg is the same as the %gs the brand callback was last + * entered with, allow it to be used unmodified. + */ + ASSERT(sr == (sr & 0xffff)); + + if (sr == (lxlwp->br_ugs & 0xffff)) + return (sr); + + /* + * Force the SR into the LDT in ring 3 for 32-bit processes. + * + * 64-bit processes get the null GDT selector since they are not + * allowed to have a private LDT. + */ +#if defined(__amd64) + return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0); +#elif defined(__i386) + datamodel = datamodel; /* datamodel currently unused for 32-bit */ + return (sr | SEL_TI_LDT | SEL_UPL); +#endif /* __amd64 */ +} diff --git a/usr/src/uts/common/brand/lx/os/lx_pid.c b/usr/src/uts/common/brand/lx/os/lx_pid.c new file mode 100644 index 0000000000..4f22efd1ee --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_pid.c @@ -0,0 +1,348 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/var.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/zone.h> +#include <sys/lx_brand.h> +#include <sys/lx_pid.h> + +#define LINUX_PROC_FACTOR 8 /* factor down the hash table by this */ +static int hash_len = 4; /* desired average hash chain length */ +static int hash_size; /* no of buckets in the hash table */ + +static struct lx_pid **stol_pid_hash; +static struct lx_pid **ltos_pid_hash; + +#define LTOS_HASH(pid) ((pid) & (hash_size - 1)) +#define STOL_HASH(pid, tid) (((pid) + (tid)) & (hash_size - 1)) + +static kmutex_t hash_lock; + +static void +lx_pid_insert_hash(struct lx_pid *lpidp) +{ + int shash = STOL_HASH(lpidp->s_pid, lpidp->s_tid); + int lhash = LTOS_HASH(lpidp->l_pid); + + ASSERT(MUTEX_HELD(&hash_lock)); + + lpidp->stol_next = stol_pid_hash[shash]; + stol_pid_hash[shash] = lpidp; + + lpidp->ltos_next = ltos_pid_hash[lhash]; + ltos_pid_hash[lhash] = lpidp; +} + +static struct lx_pid * +lx_pid_remove_hash(pid_t pid, id_t tid) +{ + struct lx_pid **hpp; + struct lx_pid *lpidp; + + 
ASSERT(MUTEX_HELD(&hash_lock)); + + hpp = &stol_pid_hash[STOL_HASH(pid, tid)]; + while (*hpp) { + if ((*hpp)->s_pid == pid && (*hpp)->s_tid == tid) { + lpidp = *hpp; + *hpp = (*hpp)->stol_next; + break; + } + hpp = &(*hpp)->stol_next; + } + + /* + * when called during error recovery the pid may already + * be released + */ + if (lpidp == NULL) + return (NULL); + + hpp = <os_pid_hash[LTOS_HASH(lpidp->l_pid)]; + while (*hpp) { + if (*hpp == lpidp) { + *hpp = lpidp->ltos_next; + break; + } + hpp = &(*hpp)->ltos_next; + } + + return (lpidp); +} + +/* + * given a solaris pid/tid pair, create a linux pid + */ +int +lx_pid_assign(kthread_t *t) +{ + proc_t *p = ttoproc(t); + pid_t s_pid = p->p_pid; + id_t s_tid = t->t_tid; + struct pid *pidp; + struct lx_pid *lpidp; + lx_lwp_data_t *lwpd = ttolxlwp(t); + pid_t newpid; + + if (p->p_lwpcnt > 0) { + /* + * Allocate a pid for any thread other than the first + */ + if ((newpid = pid_allocate(p, 0)) < 0) + return (-1); + + pidp = pid_find(newpid); + } else { + pidp = NULL; + newpid = s_pid; + } + + lpidp = kmem_alloc(sizeof (struct lx_pid), KM_SLEEP); + lpidp->l_pid = newpid; + lpidp->s_pid = s_pid; + lpidp->s_tid = s_tid; + lpidp->l_pidp = pidp; + lpidp->l_start = t->t_start; + + /* + * now put the pid into the linux-solaris and solaris-linux + * conversion hash tables + */ + mutex_enter(&hash_lock); + lx_pid_insert_hash(lpidp); + mutex_exit(&hash_lock); + + lwpd->br_pid = newpid; + + return (0); +} + +/* + * If we are exec()ing the process, this thread's tid is about to be reset + * to 1. Make sure the Linux PID bookkeeping reflects that change. + */ +void +lx_pid_reassign(kthread_t *t) +{ + proc_t *p = ttoproc(t); + struct pid *old_pidp; + struct lx_pid *lpidp; + + ASSERT(p->p_lwpcnt == 1); + + mutex_enter(&hash_lock); + + /* + * Clean up all the traces of this thread's 'fake' Linux PID. 
+ */ + lpidp = lx_pid_remove_hash(p->p_pid, t->t_tid); + ASSERT(lpidp != NULL); + old_pidp = lpidp->l_pidp; + lpidp->l_pidp = NULL; + + /* + * Now register this thread as (pid, 1). + */ + lpidp->l_pid = p->p_pid; + lpidp->s_pid = p->p_pid; + lpidp->s_tid = 1; + lx_pid_insert_hash(lpidp); + + mutex_exit(&hash_lock); + + if (old_pidp) + (void) pid_rele(old_pidp); +} + +/* + * release a solaris pid/tid pair + */ +void +lx_pid_rele(pid_t pid, id_t tid) +{ + struct lx_pid *lpidp; + + mutex_enter(&hash_lock); + lpidp = lx_pid_remove_hash(pid, tid); + mutex_exit(&hash_lock); + + if (lpidp) { + if (lpidp->l_pidp) + (void) pid_rele(lpidp->l_pidp); + + kmem_free(lpidp, sizeof (*lpidp)); + } +} + +/* + * given a linux pid, return the solaris pid/tid pair + */ +int +lx_lpid_to_spair(pid_t l_pid, pid_t *s_pid, id_t *s_tid) +{ + struct lx_pid *hp; + + mutex_enter(&hash_lock); + for (hp = ltos_pid_hash[LTOS_HASH(l_pid)]; hp; hp = hp->ltos_next) + if (l_pid == hp->l_pid) { + if (s_pid) + *s_pid = hp->s_pid; + if (s_tid) + *s_tid = hp->s_tid; + break; + } + mutex_exit(&hash_lock); + + return (hp ? 0 : -1); +} + +/* + * Given an lwp, return the Linux pid of its parent. If the caller + * wants them, we return the Solaris (pid, tid) as well. + */ +pid_t +lx_lwp_ppid(klwp_t *lwp, pid_t *ppidp, id_t *ptidp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + struct lx_pid *hp; + pid_t zoneinit = curproc->p_zone->zone_proc_initpid; + pid_t lppid, ppid; + + /* + * Be sure not to return a parent pid that should be invisible + * within this zone. + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * If the parent process's pid is the zone's init process, force it + * to the Linux init pid value of 1. 
+ */ + if (ppid == zoneinit) + ppid = 1; + + /* + * There are two cases in which the Linux definition of a 'parent' + * matches that of Solaris: + * + * - if our tgid is the same as our PID, then we are either the + * first thread in the process or a CLONE_THREAD thread. + * + * - if the brand lwp value for ppid is 0, then we are either the + * child of a differently-branded process or a CLONE_PARENT thread. + */ + if (p->p_pid == lwpd->br_tgid || lwpd->br_ppid == 0) { + if (ppidp != NULL) + *ppidp = ppid; + if (ptidp != NULL) + *ptidp = -1; + return (ppid); + } + + /* + * Set the default Linux parent pid to be the pid of the zone's init + * process; this will get converted back to the Linux default of 1 + * later. + */ + lppid = zoneinit; + + /* + * If the process's parent isn't init, try and look up the Linux "pid" + * corresponding to the process's parent. + */ + if (ppid != 1) { + /* + * In all other cases, we are looking for the parent of this + * specific thread, which in Linux refers to the thread that + * clone()d it. We stashed that thread's PID away when this + * thread was created. + */ + mutex_enter(&hash_lock); + for (hp = ltos_pid_hash[LTOS_HASH(lwpd->br_ppid)]; hp; + hp = hp->ltos_next) { + if (lwpd->br_ppid == hp->l_pid) { + /* + * We found the PID we were looking for, but + * since we cached its value in this LWP's brand + * structure, it has exited and been reused by + * another process. 
+ */ + if (hp->l_start > lwptot(lwp)->t_start) + break; + + lppid = lwpd->br_ppid; + if (ppidp != NULL) + *ppidp = hp->s_pid; + if (ptidp != NULL) + *ptidp = hp->s_tid; + + break; + } + } + mutex_exit(&hash_lock); + } + + if (lppid == zoneinit) { + lppid = 1; + + if (ppidp != NULL) + *ppidp = lppid; + if (ptidp != NULL) + *ptidp = -1; + } + + return (lppid); +} + +void +lx_pid_init(void) +{ + hash_size = 1 << highbit(v.v_proc / (hash_len * LINUX_PROC_FACTOR)); + + stol_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size, + KM_SLEEP); + ltos_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size, + KM_SLEEP); + + mutex_init(&hash_lock, NULL, MUTEX_DEFAULT, NULL); +} + +/* + * Undo lx_pid_init(): free both conversion hash tables and destroy the + * lock that protects them. + */ +void +lx_pid_fini(void) +{ + kmem_free(stol_pid_hash, sizeof (struct lx_pid *) * hash_size); + kmem_free(ltos_pid_hash, sizeof (struct lx_pid *) * hash_size); + + /* lx_pid_init() runs mutex_init() on hash_lock each time it is called */ + mutex_destroy(&hash_lock); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c new file mode 100644 index 0000000000..686afea458 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c @@ -0,0 +1,409 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/brand.h> +#include <sys/machbrand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> + +/* + * Some system calls return either a 32-bit or a 64-bit value, depending + * on the datamodel. + */ +#ifdef _LP64 +#define V_RVAL SE_64RVAL +#else +#define V_RVAL SE_32RVAL1 +#endif + +/* + * Define system calls that return a native 'long' quantity i.e. a 32-bit + * or 64-bit integer - depending on how the kernel is itself compiled + * e.g. read(2) returns 'ssize_t' in the kernel and in userland. + */ +#define LX_CL(name, call, narg) \ + { V_RVAL, (name), (llfcn_t)(call), (narg) } + +/* + * Returns a 32 bit quantity regardless of datamodel + */ +#define LX_CI(name, call, narg) \ + { SE_32RVAL1, (name), (llfcn_t)(call), (narg) } + +extern longlong_t lx_nosys(void); +#define LX_NOSYS(name) \ + {SE_64RVAL, (name), (llfcn_t)lx_nosys, 0} + +lx_sysent_t lx_sysent[] = +{ + LX_NOSYS("lx_nosys"), /* 0 */ + LX_NOSYS("exit"), /* 0 */ + LX_NOSYS("lx_fork"), + LX_NOSYS("read"), + LX_NOSYS("write"), + LX_NOSYS("open"), + LX_NOSYS("close"), + LX_NOSYS("waitpid"), + LX_NOSYS("creat"), + LX_NOSYS("link"), + LX_NOSYS("unlink"), /* 10 */ + LX_NOSYS("exec"), + LX_NOSYS("chdir"), + LX_NOSYS("gtime"), + LX_NOSYS("mknod"), + LX_NOSYS("chmod"), + LX_NOSYS("lchown16"), + LX_NOSYS("break"), + LX_NOSYS("stat"), + LX_NOSYS("lseek"), + LX_CL("getpid", lx_getpid, 0), /* 20 */ + LX_NOSYS("mount"), + LX_NOSYS("umount"), + LX_NOSYS("setuid16"), + LX_NOSYS("getuid16"), + LX_NOSYS("stime"), + LX_NOSYS("ptrace"), + LX_NOSYS("alarm"), + LX_NOSYS("fstat"), + LX_NOSYS("pause"), + LX_NOSYS("utime"), /* 30 */ + LX_NOSYS("stty"), + LX_NOSYS("gtty"), + LX_NOSYS("access"), 
+ LX_NOSYS("nice"), + LX_NOSYS("ftime"), + LX_NOSYS("sync"), + LX_CL("kill", lx_kill, 2), + LX_NOSYS("rename"), + LX_NOSYS("mkdir"), + LX_NOSYS("rmdir"), /* 40 */ + LX_NOSYS("dup"), + LX_NOSYS("pipe"), + LX_NOSYS("times"), + LX_NOSYS("prof"), + LX_CL("brk", lx_brk, 1), + LX_NOSYS("setgid16"), + LX_NOSYS("getgid16"), + LX_NOSYS("signal"), + LX_NOSYS("geteuid16"), + LX_NOSYS("getegid16"), /* 50 */ + LX_NOSYS("sysacct"), + LX_NOSYS("umount2"), + LX_NOSYS("lock"), + LX_NOSYS("ioctl"), + LX_NOSYS("fcntl"), + LX_NOSYS("mpx"), + LX_NOSYS("setpgid"), + LX_NOSYS("ulimit"), + LX_NOSYS("olduname"), + LX_NOSYS("umask"), /* 60 */ + LX_NOSYS("chroot"), + LX_NOSYS("ustat"), + LX_NOSYS("dup2"), + LX_CL("getppid", lx_getppid, 0), + LX_NOSYS("pgrp"), + LX_NOSYS("setsid"), + LX_NOSYS("sigaction"), + LX_NOSYS("sgetmask"), + LX_NOSYS("ssetmask"), + LX_NOSYS("setreuid16"), /* 70 */ + LX_NOSYS("setregid16"), + LX_NOSYS("sigsuspend"), + LX_NOSYS("sigpending"), + LX_NOSYS("sethostname"), + LX_NOSYS("setrlimit"), + LX_NOSYS("old_getrlimit"), + LX_NOSYS("getrusage"), + LX_NOSYS("gettimeofday"), + LX_NOSYS("settimeofday"), + LX_NOSYS("getgroups16"), /* 80 */ + LX_NOSYS("setgroups16"), + LX_NOSYS("old_select"), + LX_NOSYS("symlink"), + LX_NOSYS("oldlstat"), + LX_NOSYS("readlink"), + LX_NOSYS("uselib"), + LX_NOSYS("swapon"), + LX_NOSYS("reboot"), + LX_NOSYS("old_readdir"), + LX_NOSYS("old_mmap"), /* 90 */ + LX_NOSYS("munmap"), + LX_NOSYS("truncate"), + LX_NOSYS("ftruncate"), + LX_NOSYS("fchmod"), + LX_NOSYS("fchown16"), + LX_NOSYS("getpriority"), + LX_NOSYS("setpriority"), + LX_NOSYS("profil"), + LX_NOSYS("statfs"), + LX_NOSYS("fstatfs"), /* 100 */ + LX_NOSYS("ioperm"), + LX_NOSYS("socketcall"), + LX_NOSYS("syslog"), + LX_NOSYS("setitimer"), + LX_NOSYS("getitimer"), + LX_NOSYS("newstat"), + LX_NOSYS("newsltat"), + LX_NOSYS("newsftat"), + LX_NOSYS("uname"), + LX_NOSYS("oldiopl"), /* 110 */ + LX_NOSYS("oldvhangup"), + LX_NOSYS("idle"), + LX_NOSYS("vm86old"), + LX_NOSYS("wait4"), + 
LX_NOSYS("swapoff"), + LX_CL("sysinfo", lx_sysinfo, 1), + LX_NOSYS("ipc"), + LX_NOSYS("fsync"), + LX_NOSYS("sigreturn"), + LX_CL("clone", lx_clone, 5), /* 120 */ + LX_NOSYS("setdomainname"), + LX_NOSYS("newuname"), + LX_CL("modify_ldt", lx_modify_ldt, 3), + LX_NOSYS("adjtimex"), + LX_NOSYS("mprotect"), + LX_NOSYS("sigprocmask"), + LX_NOSYS("create_module"), + LX_NOSYS("init_module"), + LX_NOSYS("delete_module"), + LX_NOSYS("get_kernel_syms"), /* 130 */ + LX_NOSYS("quotactl"), + LX_NOSYS("getpgid"), + LX_NOSYS("fchdir"), + LX_NOSYS("bdflush"), + LX_NOSYS("sysfs"), + LX_NOSYS("personality"), + LX_NOSYS("afs_syscall"), + LX_NOSYS("setfsuid16"), + LX_NOSYS("setfsgid16"), + LX_NOSYS("llseek"), /* 140 */ + LX_NOSYS("getdents"), + LX_NOSYS("select"), + LX_NOSYS("flock"), + LX_NOSYS("msync"), + LX_NOSYS("readv"), + LX_NOSYS("writev"), + LX_NOSYS("getsid"), + LX_NOSYS("fdatasync"), + LX_NOSYS("sysctl"), + LX_NOSYS("mlock"), /* 150 */ + LX_NOSYS("munlock"), + LX_NOSYS("mlockall"), + LX_NOSYS("munlockall"), + LX_CL("sched_setparam", lx_sched_setparam, 2), + LX_CL("sched_getparam", lx_sched_getparam, 2), + LX_NOSYS("sched_setscheduler"), + LX_NOSYS("sched_getscheduler"), + LX_NOSYS("yield"), + LX_NOSYS("sched_get_priority_max"), + LX_NOSYS("sched_get_priority_min"), /* 160 */ + LX_CL("sched_rr_get_interval", lx_sched_rr_get_interval, 2), + LX_NOSYS("nanosleep"), + LX_NOSYS("mremap"), + LX_CL("setresuid16", lx_setresuid16, 3), + LX_NOSYS("getresuid16"), + LX_NOSYS("vm86"), + LX_NOSYS("query_module"), + LX_NOSYS("poll"), + LX_NOSYS("nfsserctl"), + LX_CL("setresgid16", lx_setresgid16, 3), /* 170 */ + LX_NOSYS("getresgid16"), + LX_NOSYS("prctl"), + LX_NOSYS("rt_sigreturn"), + LX_NOSYS("rt_sigaction"), + LX_NOSYS("rt_sigprocmask"), + LX_NOSYS("rt_sigpending"), + LX_NOSYS("rt_sigtimedwait"), + LX_NOSYS("rt_sigqueueinfo"), + LX_NOSYS("rt_sigsuspend"), + LX_NOSYS("pread64"), /* 180 */ + LX_NOSYS("pwrite64"), + LX_NOSYS("chown16"), + LX_NOSYS("getcwd"), + LX_NOSYS("capget"), + 
LX_NOSYS("capset"), + LX_NOSYS("sigaltstack"), + LX_NOSYS("sendfile"), + LX_NOSYS("getpmsg"), + LX_NOSYS("putpmsg"), + LX_NOSYS("vfork"), /* 190 */ + LX_NOSYS("getrlimit"), + LX_NOSYS("mmap2"), + LX_NOSYS("truncate64"), + LX_NOSYS("ftruncate64"), + LX_NOSYS("stat64"), + LX_NOSYS("lstat64"), + LX_NOSYS("fstat64"), + LX_NOSYS("lchown"), + LX_NOSYS("getuid"), + LX_NOSYS("getgid"), /* 200 */ + LX_NOSYS("geteuid"), + LX_NOSYS("getegid"), + LX_NOSYS("setreuid"), + LX_NOSYS("setregid"), + LX_NOSYS("getgroups"), + LX_CL("setgroups", lx_setgroups, 2), + LX_NOSYS("fchown"), + LX_CL("setresuid", lx_setresuid, 3), + LX_NOSYS("getresuid"), + LX_CL("setresgid", lx_setresgid, 3), /* 210 */ + LX_NOSYS("getresgid"), + LX_NOSYS("chown"), + LX_NOSYS("setuid"), + LX_NOSYS("setgid"), + LX_NOSYS("setfsuid"), + LX_NOSYS("setfsgid"), + LX_NOSYS("pivot_root"), + LX_NOSYS("mincore"), + LX_NOSYS("madvise"), + LX_NOSYS("getdents64"), /* 220 */ + LX_NOSYS("fcntl64"), + LX_NOSYS("lx_nosys"), + LX_NOSYS("security"), + LX_CL("gettid", lx_gettid, 0), + LX_NOSYS("readahead"), + LX_NOSYS("setxattr"), + LX_NOSYS("lsetxattr"), + LX_NOSYS("fsetxattr"), + LX_NOSYS("getxattr"), + LX_NOSYS("lgetxattr"), /* 230 */ + LX_NOSYS("fgetxattr"), + LX_NOSYS("listxattr"), + LX_NOSYS("llistxattr"), + LX_NOSYS("flistxattr"), + LX_NOSYS("removexattr"), + LX_NOSYS("lremovexattr"), + LX_NOSYS("fremovexattr"), + LX_CL("tkill", lx_tkill, 2), + LX_NOSYS("sendfile64"), + LX_CL("futex", lx_futex, 6), /* 240 */ + LX_NOSYS("sched_setaffinity"), + LX_NOSYS("sched_getaffinity"), + LX_CL("set_thread_area", lx_set_thread_area, 1), + LX_CL("get_thread_area", lx_get_thread_area, 1), + LX_NOSYS("io_setup"), + LX_NOSYS("io_destroy"), + LX_NOSYS("io_getevents"), + LX_NOSYS("io_submit"), + LX_NOSYS("io_cancel"), + LX_NOSYS("fadvise64"), /* 250 */ + LX_NOSYS("lx_nosys"), + LX_NOSYS("exit_group"), + LX_NOSYS("lookup_dcookie"), + LX_NOSYS("epoll_create"), + LX_NOSYS("epoll_ctl"), + LX_NOSYS("epoll_wait"), + LX_NOSYS("remap_file_pages"), + 
LX_CL("set_tid_address", lx_set_tid_address, 1), + LX_NOSYS("timer_create"), + LX_NOSYS("timer_settime"), /* 260 */ + LX_NOSYS("timer_gettime"), + LX_NOSYS("timer_getoverrun"), + LX_NOSYS("timer_delete"), + LX_NOSYS("clock_settime"), + LX_NOSYS("clock_gettime"), + LX_NOSYS("clock_getres"), + LX_NOSYS("clock_nanosleep"), + LX_NOSYS("statfs64"), + LX_NOSYS("fstatfs64"), + LX_NOSYS("tgkill"), /* 270 */ + LX_NOSYS("utimes"), + LX_NOSYS("fadvise64_64"), + LX_NOSYS("vserver"), + NULL /* NULL-termination is required for lx_systrace */ +}; + +int64_t +lx_emulate_syscall(int num, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) +{ + struct lx_sysent *jsp; + int64_t rval; + + rval = (int64_t)0; + + jsp = &(lx_sysent[num]); + + switch (jsp->sy_narg) { + case 0: { + lx_print("--> %s()\n", jsp->sy_name); + rval = (int64_t)jsp->sy_callc(); + break; + } + case 1: { + lx_print("--> %s(0x%lx)\n", jsp->sy_name, arg1); + rval = (int64_t)jsp->sy_callc(arg1); + break; + } + case 2: { + lx_print("--> %s(0x%lx, 0x%lx)\n", jsp->sy_name, arg1, arg2); + rval = (int64_t)jsp->sy_callc(arg1, arg2); + break; + } + case 3: { + lx_print("--> %s(0x%lx, 0x%lx, 0x%lx)\n", + jsp->sy_name, arg1, arg2, arg3); + rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3); + break; + } + case 4: { + lx_print("--> %s(0x%lx, 0x%lx, 0x%lx, 0x%lx)\n", + jsp->sy_name, arg1, arg2, arg3, arg4); + rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3, arg4); + break; + } + case 5: { + lx_print("--> %s(0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx)\n", + jsp->sy_name, arg1, arg2, arg3, arg4, arg5); + rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3, arg4, arg5); + break; + } + case 6: { + lx_print("--> %s(0x%lx, 0x%lx, 0x%lx, 0x%lx," + " 0x%lx, 0x%lx)\n", + jsp->sy_name, arg1, arg2, arg3, arg4, arg5, arg6); + rval = (int64_t)jsp->sy_callc(arg1, arg2, arg3, arg4, arg5, + arg6); + break; + } + default: + panic("Invalid syscall entry: #%d at 0x%p\n", num, jsp); + } + lx_print("----------> return 
(0x%llx)\n", (long long)rval); + return (rval); +} diff --git a/usr/src/uts/common/brand/lx/procfs/lx_proc.h b/usr/src/uts/common/brand/lx/procfs/lx_proc.h new file mode 100644 index 0000000000..c79e3fa590 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_proc.h @@ -0,0 +1,233 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _LXPROC_H +#define _LXPROC_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred_impl.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + 
LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ + ldi_handle_t lxpr_cons_ldih; /* ldi handle for console device */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t 
*lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c new file mode 100644 index 0000000000..5d252207fb --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c @@ -0,0 +1,494 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * lxprsubr.c: Various functions for the /lxproc vnodeops. 
+ */ + +#include <sys/varargs.h> + +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lx_proc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void*, void*, int); +static void lxpr_node_destructor(void*, void*); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +#define BUFSIZE 4000 + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + BUFSIZE, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = BUFSIZE; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + + size_t size = uiobuf->pos - uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg+size > off && off >= 0) + uiobuf->error = + uiomove(uaddr+(off-beg), size-(off-beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid 
!= 0) { + uint_t remain + = uiobuf->buffsize-(uiobuf->pos-uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len+1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. + */ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process + */ + p = prfind((pid == 1) ? + curproc->p_zone->zone_proc_initpid : pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. 
+ */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = + kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, + NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(KM_SLEEP); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = (caddr_t)lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is 
the mounted on inode so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_rdir != NULL ? 
up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). + */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. 
+ */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c new file mode 100644 index 0000000000..44891dc612 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c @@ -0,0 +1,373 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * lxprvfsops.c: vfs operations for /lxprocfs. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/lx_impl.h> + +#include "lx_proc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_proc", + lxpr_init, + 0, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, lxpr_mount, + VFSNAME_UNMOUNT, lxpr_unmount, + VFSNAME_ROOT, lxpr_root, + VFSNAME_STATVFS, lxpr_statvfs, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialise cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc"); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = 
(caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. + */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, 
vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + + /* We know f_fstr is 32 chars */ + (void) strcpy(sp->f_fstr, "/proc"); + (void) strcpy(&sp->f_fstr[6], "/proc"); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c new file mode 100644 index 0000000000..45bff38e16 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c @@ -0,0 +1,2951 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * lxpr_vnops.c: Vnode operations for the lx /proc file system + * + * Assumptions and Gotchas: + * + * In order to preserve Solaris' security policy. This file system's + * functionality does not override Solaris' security policies even if + * that means breaking Linux compatability. + * + * Linux has no concept of lwps so we only implement procs here as in the + * old /proc interface. 
+ */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <lx_signum.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> + +/* Dependent on the Solaris procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lx_proc.h" + +extern pgcnt_t swapfs_minfree; +extern volatile clock_t lbolt; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *); +static int lxpr_readlink(vnode_t *, uio_t *); +static int lxpr_cmp(vnode_t *, vnode_t *); +static int lxpr_realvp(vnode_t *, vnode_t **); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); 
+static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, 
lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lx /proc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, lxpr_open, + VOPNAME_CLOSE, lxpr_close, + VOPNAME_READ, lxpr_read, + VOPNAME_GETATTR, lxpr_getattr, + VOPNAME_ACCESS, lxpr_access, + VOPNAME_LOOKUP, lxpr_lookup, + VOPNAME_READDIR, lxpr_readdir, + VOPNAME_READLINK, lxpr_readlink, + VOPNAME_FSYNC, lxpr_sync, + VOPNAME_SEEK, lxpr_sync, + VOPNAME_INACTIVE, (fs_generic_func_p) lxpr_inactive, + VOPNAME_CMP, lxpr_cmp, + VOPNAME_REALVP, lxpr_realvp, + NULL, NULL +}; + + +/* + * file contents of an lx /proc directory. + */ +static lxpr_dirent_t lx_procdir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lx_procdir) / sizeof (lx_procdir[0])) + +/* + * Contents of an lx /proc/<pid> directory. 
+ */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of lx /proc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* + * We only allow reading in this file systrem + */ + if (flag & FWRITE) + return (EROFS); + + /* + * If we are opening an underlying file only allow regular files + * reject the open for anything but a regular file. + * Just do it if we are opening the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG) + error = EACCES; + else { + /* + * Need to hold rvp since VOP_OPEN() may release it. 
+ */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = ldi_open_by_name("/dev/log", FREAD, cr, + &lxpnp->lxpr_cons_ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. + */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(lxpnp->lxpr_cons_ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + int err; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + if (type == LXPR_KMSG) { + if ((err = ldi_close(lxpr->lxpr_cons_ldih, 0, cr)) != 0) + return (err); + } + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* 
/proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_kmsg, /* /proc/kmsg */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by lx /proc file type. 
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* 
/proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. + */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + 
lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in the lx procfs is human + * readable and not binary structures there do not have to be different + * read variants depending on whether the reading process model is 32 or 64 bits + * (at least in general, and certainly the difference is unlikely to be enough + * to justify have different routines for 32 and 64 bit reads + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type >= 0 && type < LXPR_NFILES); + + lxpr_read_function[type](lxpnp, uiobuf); + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used 
/*
 * lxpr_read_isdir(), lxpr_read_invalid(), lxpr_read_empty()
 *
 * Special-case readers installed in lxpr_read_function[]:
 *	- lxpr_read_isdir():	the node is a directory; the read fails
 *				with EISDIR.
 *	- lxpr_read_invalid():	invalid file (used to mean a file that
 *				should be implemented, but isn't yet); the
 *				read fails with EINVAL.
 *	- lxpr_read_empty():	empty file; nothing is written to the
 *				uiobuf and no error is set.
 */

/* Flag the read as an attempt to read a directory. */
/* ARGSUSED */
static void
lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
{
	lxpr_uiobuf_seterr(uiobuf, EISDIR);
}

/* Flag the read as invalid for this node. */
/* ARGSUSED */
static void
lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
{
	lxpr_uiobuf_seterr(uiobuf, EINVAL);
}

/* Produce no output at all; the uiobuf is left untouched. */
/* ARGSUSED */
static void
lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
{
}
+ */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + if (PTOU(p)->u_argv != 0) { + char *buff = PTOU(p)->u_psargs; + int len = strlen(buff); + lxpr_unlock(p); + lxpr_uiobuf_write(uiobuf, buff, len+1); + } else { + lxpr_unlock(p); + } +} + + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) 
pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + int inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED()) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %d %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %d\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = btopr(rm_assize(as)); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + 
lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%d\t%d\t%d\t%d\n" + "Gid:\t%d\t%d\t%d\t%d\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid until we map lwps to procs */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%d ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = rm_assize(as); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < MAXSIG; i++) { + lx_sig = stol_signo[i]; + + if ((lx_sig > 0) && (lx_sig < MAXSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* 
signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = -1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? 
curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + /* XXBRAND psdev = DEV_TO_LXDEV(p->p_sessp->s_dev, VCHR); */ + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = v.v_maxsyspri - t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_seqid; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = rm_assize(as); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d " + "0 " + "%ld %lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, + PTOU(p)->u_comm, + stat, + ppid, pgpid, + spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, + 0l, PTOU(p)->u_ticks, /* ticks till next SIGALARM, start time */ + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* 
signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal */ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * XXX: data about each interface should go here, but we'll wait to + * see if anybody wants to use it. + */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t 
*uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this lx zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf) +{ + ldi_handle_t lh = lxpnp->lxpr_cons_ldih; + mblk_t *mp; + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lx procfs doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. + * + * Just enough for uptime to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + + /* + * We'll report the total number of lwps in the zone for the + * "nproc" parameter of /proc/loadavg; good enough for lx. + */ + nlwps = curproc->p_zone->zone_nlwps; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = &avenrun[0]; + + /* + * This will report kernel threads as well as user lwps, but it + * should be good enough for lx consumers. + */ + nlwps = nthread; + } + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. 
+ */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + long total_mem = physmem * PAGESIZE; + long free_mem = freemem * PAGESIZE; + long total_swap = k_anoninfo.ani_max * PAGESIZE; + long used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. 
+ */ + if (vfslist == NULL || + strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? 
+ vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. 
+ */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + LX_UNAME_SYSNAME, LX_UNAME_RELEASE, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + LX_UNAME_VERSION, + __TIME__ " " __DATE__); +} + + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, + "cpu %ld %ld %ld %ld\n", + user_cum, 0, sys_cum, idle_cum); + + /* Do per processor stats */ + do { + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %ld %ld %ld %ld\n", + cp->cpu_id, + user_ticks, 0, sys_ticks, idle_ticks); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum); +} + + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. 
+ * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Capture lbolt in case it changes */ + up_cs = lbolt; + up_s = up_cs / hz; + up_cs %= hz; + up_cs *= 100; + up_cs /= hz; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." 
+ */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. 
+ */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (x86_feature & X86_HTT) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", chip_plat_get_chipid(cp), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + + + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return 
attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. 
+ */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr)); + } + + /* + * Access check is based on only + * one of owner, group, public. + * If not owner, then check group. + * If not a member of the group, then + * check public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + + + + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type >= 0 && type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr)) != 0) { + return (error); + } + + /* + * Just return the parent vnode + * if thats where we are trying to go + */ + if (strcmp(comp, 
"..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches + * Note: null component name is synonym for + * current directory being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + + +/* + * Lookup one of the process's open files. 
+ */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + /* + * got the fd data so now done with this proc + */ + lxpr_unlock(p); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will + * be able to find the underlying vnode. + * The vnode is held on the realvp. 
+ */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our + * file system structure except those that are pid names. + * These change as pids are created/deleted etc. + * So just look for a number as the first char to see if we + * are we doing pid lookups? + * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10*pid + c - '0'; + + /* + * Can't continue if the process is still loading + * or it doesn't really exist yet (or maybe it just died!) 
+ */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lx /proc node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lx_procdir, PROCDIRFILES)); +} + + + + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type >= 0 && type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! 
*/ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + ASSERT(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. 
+ */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and + * uiop->uio_offset by the same amount. But we want + * uiop->uio_offset to change in increments + * of LXPR_SDSIZE, which is different from the number of bytes + * being returned to the user. + * So we set uiop->uio_offset separately, ignoring what + * uiomove() does. + */ + if (error = uiomove((caddr_t)dirent, reclen, UIO_READ, uiop)) { + return (error); + } + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: + * "." and ".." then the unique lx procfs files, then the + * directories corresponding to the running processes. + * + * This is a good order because it allows us to more easily + * keep track of where we are betwen calls to getdents(). + * If the number of processes changes between calls then we + * can't lose track of where we are in the lx procfs files. 
+ */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lx_procdir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, the zched process, + * a PID of 0, and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == curproc->p_zone->zone_zsched->p_pid || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. + */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. 
+ */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and + * uiop->uio_offset by the same amount. But we want + * uiop->uio_offset to change in increments + * of LXPR_SDSIZE, which is different from the number of bytes + * being returned to the user. + * So we set uiop->uio_offset separately, in the + * increment of this for loop, ignoring what uiomove() does. + */ + if (error = uiomove((caddr_t)dirent, reclen, UIO_READ, uiop)) + return (error); + +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp) + *eofp = + (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + + return (0); +} + + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? 
+ curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize; + uf_info_t *fip; + + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) + fddirsize = 0; + else + fddirsize = fip->fi_nfiles; + + mutex_enter(&fip->fi_lock); + lxpr_unlock(p); + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if (error = uiomove((caddr_t)dirent, reclen, UIO_READ, uiop)) + goto out; + } + + if (eofp) + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + +out: + mutex_exit(&fip->fi_lock); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* + * Try to produce a symlink name for anything that's really a regular + * file or directory (but not for anything else) + */ + if (lxpnp->lxpr_realvp != NULL && (lxpnp->lxpr_realvp->v_type == VDIR || + lxpnp->lxpr_realvp->v_type == VREG)) { + if ((error = lxpr_access(vp, VREAD, 0, CRED())) != 0) + return (error); + error = vnodetopath(NULL, lxpnp->lxpr_realvp, bp, buflen, + CRED()); + if (error != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes + */ + + /* + * Convert pid to the Linux default of 1 if we're the + * 
zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? curproc->p_pid : 1); + + (void) snprintf(bp, buflen, "%d", pid); + break; + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. + */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr) +{ + lxpr_freenode(VTOLXP(vp)); +} + + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * nothing to sync but this + * function must never fail + */ + return (0); +} + + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + * Each argument is first unwrapped: while it is an lxproc vnode with a + * non-NULL lxpr_realvp, it is replaced by that real vnode. If either + * result is still an lxproc vnode the two are compared by pointer; + * otherwise the comparison is handed to VOP_CMP() on the real vnodes. + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) + vp1 = rvp; + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) + vp2 = rvp; + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + return (VOP_CMP(vp1, vp2)); +} + + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + * Stores in *vpp the vnode behind vp and always returns 0. When the node + * has an lxpr_realvp, that vnode is used, further resolved through + * VOP_REALVP() if that call succeeds; otherwise vp itself is stored. 
+ */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/brand/lx/sys/ldlinux.h b/usr/src/uts/common/brand/lx/sys/ldlinux.h new file mode 100644 index 0000000000..b259c05d97 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/ldlinux.h @@ -0,0 +1,117 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LDLINUX_H +#define _SYS_LDLINUX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * The ldlinux streams module is only intended for use in lx branded zones. + * This streams module implements the following ioctls: + * TIOCSETLD and TIOCGETLD + * + * These ioctls are special ioctls supported only by the ldlinux streams + * module and invoked only by the lx brand emulation library. These ioctls + * do not exist on native Linux systems. + * + * The TIOCSETLD ioctl is used when emulating the following Linux ioctls: + * TCSETS/TCSETSW/TCSETSF + * TCSETA/TCSETAW/TCSETAF + * + * The TIOCGETLD ioctl is used when emulating the following Linux ioctls: + * TCGETS/TCGETA + * + * This module is needed to emulate these ioctls because the following arrays: + * termio.c_cc + * termios.c_cc + * which are parameters for the following ioctls: + * TCSETS/TCSETSW/TCSETSF + * TCSETA/TCSETAW/TCSETAF + * TCGETS/TCGETA + * + * are defined differently on Solaris and Linux. + * + * According to the termio(7I) man page on Solaris the following is true of + * the members of the c_cc array: + * The VMIN element is the same element as the VEOF element. 
 + * The VTIME element is the same element as the VEOL element. + * + * But on Linux the termios(3) man page states: + * These symbolic subscript values are all different, except that + * VTIME, VMIN may have the same value as VEOL, VEOF, respectively. + * + * While the man page indicates that these values may be the same, empirical + * tests show them to be different. Since these values are different on + * Linux systems it's possible that applications could set the members of + * the c_cc array to different values and then later expect to be able to + * read back those same separate values. The ldlinux module exists to provide + * a per-stream storage area where the lx_brand emulation library can save + * these values. The values are set and retrieved via the TIOCSETLD and + * TIOCGETLD ioctls respectively. + */ + +#include <sys/termios.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define LDLINUX_MOD "ldlinux" + +#ifdef _KERNEL + +/* + * LDLINUX_MODID - This should be a unique number associated with + * this particular module. Unfortunately, there is no authority responsible + * for administering this name space, hence there's no real guarantee that + * whatever number we choose will be unique. Luckily, this constant + * is not really used anywhere by the system. It is used by some + * kernel subsystems to check for the presence of certain streams + * modules with known id values. Since no other kernel subsystem + * checks for the presence of this module we'll just set the id to 0. 
+ */ +#define LDLINUX_MODID 0 + +struct ldlinux { + int state; /* state information */ + /* Linux expects the next four c_cc values */ + /* to be distinct, whereas solaris (legally) */ + /* overlaps their storage */ + unsigned char veof; /* veof value */ + unsigned char veol; /* veol value */ + unsigned char vmin; /* vmin value */ + unsigned char vtime; /* vtime value */ +}; + +#define ISPTSTTY 0x01 + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LDLINUX_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_audio.h b/usr/src/uts/common/brand/lx/sys/lx_audio.h new file mode 100644 index 0000000000..cbb3431c4b --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_audio.h @@ -0,0 +1,130 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
 + */ + +#ifndef _LX_AUDIO_H +#define _LX_AUDIO_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zone.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * name for this driver + */ +#define LX_AUDIO_DRV "lx_audio" + +/* + * names for the minor nodes this driver exports + */ +#define LXA_MINORNAME_DEVCTL "lx_devctl" +#define LXA_MINORNAME_DSP "lx_dsp" +#define LXA_MINORNAME_MIXER "lx_mixer" + +/* + * minor numbers for the minor nodes this driver exports + */ +#define LXA_MINORNUM_DEVCTL 0 +#define LXA_MINORNUM_DSP 1 +#define LXA_MINORNUM_MIXER 2 +#define LXA_MINORNUM_COUNT 3 + +/* + * driver ioctls + * + * note that we're layering on top of solaris audio devices so we want + * to make sure that our ioctls namespace doesn't conflict with theirs. + * looking in sys/audioio.h and sys/mixer.h we see that they seem to + * use an _IO key of 'A' and 'M', so we'll choose an _IO key of 'a'. + */ + +/* + * administrative ioctls. + * these ioctls are only supported on the DEVCTL minor node + */ +#define LXA_IOC_ZONE_REG (_IOR('a', 0, lxa_zone_reg_t)) +#define LXA_IOC_ZONE_UNREG (_IOR('a', 1, lxa_zone_reg_t)) + + +/* + * audio and mixer device ioctls + * these ioctls are supported on DSP and MIXER minor nodes. + */ +#define LXA_IOC_GETMINORNUM (_IOR('a', 20, int)) + +/* + * audio device ioctls. + * these ioctls are supported on DSP minor nodes. + */ +#define LXA_IOC_MMAP_OUTPUT (_IOR('a', 41, int)) +#define LXA_IOC_MMAP_PTR (_IOR('a', 42, int)) +#define LXA_IOC_GET_FRAG_INFO (_IOR('a', 43, lxa_frag_info_t)) +#define LXA_IOC_SET_FRAG_INFO (_IOR('a', 44, lxa_frag_info_t)) + +/* + * mixer device ioctls. + * these ioctls are supported on MIXER minor nodes. 
+ */ +#define LXA_IOC_MIXER_GET_VOL (_IOR('a', 60, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_SET_VOL (_IOR('a', 61, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_GET_MIC (_IOR('a', 62, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_SET_MIC (_IOR('a', 63, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_GET_PCM (_IOR('a', 64, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_SET_PCM (_IOR('a', 65, lxa_mixer_levels_t)) + +/* command structure for LXA_IOC_ZONE_REG */ +#define LXA_INTSTRLEN 32 +typedef struct lxa_zone_reg { + char lxa_zr_zone_name[ZONENAME_MAX]; + char lxa_zr_inputdev[LXA_INTSTRLEN]; + char lxa_zr_outputdev[LXA_INTSTRLEN]; +} lxa_zone_reg_t; + +/* command structure for LXA_IOC_GET_FRAG_INFO and LXA_IOC_SET_FRAG_INFO */ +typedef struct lxa_frag_info { + int lxa_fi_size; + int lxa_fi_cnt; +} lxa_frag_info_t; + +/* command structure for LXA_IOC_MIXER_GET_* and LXA_IOC_MIXER_SET_* */ +typedef struct lxa_mixer_levels { + int lxa_ml_gain; + int lxa_ml_balance; +} lxa_mixer_levels_t; + +/* verify that a solaris mixer level structure has valid values */ +#define LXA_MIXER_LEVELS_OK(x) (((x)->lxa_ml_gain >= AUDIO_MIN_GAIN) && \ + ((x)->lxa_ml_gain <= AUDIO_MAX_GAIN) && \ + ((x)->lxa_ml_balance >= AUDIO_LEFT_BALANCE) && \ + ((x)->lxa_ml_balance <= AUDIO_RIGHT_BALANCE)) + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUDIO_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs.h b/usr/src/uts/common/brand/lx/sys/lx_autofs.h new file mode 100644 index 0000000000..4436226deb --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs.h @@ -0,0 +1,334 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
 + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LX_AUTOFS_H +#define _LX_AUTOFS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * The lx_autofs filesystem exists to emulate the Linux autofs filesystem + * and provide support for the Linux "automount" automounter. + * + * + * + * +++ Linux automounter background. + * + * Linux has two automounters: "amd" and "automount" + * + * 1) "amd" is a userland NFS server. It basically mounts an NFS filesystem + * at an automount point, and it acts as the NFS server for the mount. When + * an access is done to that NFS filesystem, the access is redirected by the + * kernel to the "amd" process via rpc. "amd" then looks up any information + * required to resolve the requests, mounts real NFS filesystems if + * necessary, and returns. "amd" has its own strange configuration + * mechanism that doesn't seem to be very compatible with Solaris's network + * based automounter map support. + * + * 2) "automount" is the other Linux automounter. It utilizes a kernel + * filesystem (autofs) to provide its functionality. Basically, it mounts + * the autofs filesystem at any automounter controlled mount point. This + * filesystem then intercepts and redirects lookup operations (and only + * lookup ops) to the userland automounter process via a pipe. (The + * pipe to the automounter is established via mount options when the autofs + * filesystem is mounted.) 
When the automounter receives a request via this + * pipe, it does lookups to whatever backing store it's configured to use, + * does mkdir operations on the autofs filesystem, mounts remote NFS + * filesystems on any leaf directories it just created, and signals the + * autofs filesystem via an ioctl to let it know that the lookup can + * continue. + * + * + * + * +++ Linux autofs (and automount daemon) notes + * + * Since we're mimicking the behavior of the Linux autofs filesystem it's + * important to document some of its observed behavior here since there's + * no doubt that in the future this behavior will change. These comments + * apply to the behavior of the automounter as observed on a system + * running Linux v2.4.21 (autofs is bundled with the Linux kernel). + * + * A) Autofs allows root owned, non-automounter processes to create + * directories in the autofs filesystem. The autofs filesystem treats the + * automounter's process group as special, but it doesn't prevent root + * processes outside of the automounter's process group from creating new + * directories in the autofs filesystem. + * + * B) Autofs doesn't allow creation of any non-directory entries in the + * autofs filesystem. No entity can create files (e.g. /bin/touch or + * VOP_CREATE/VOP_SYMLINK/etc.) The only entries that can exist within + * the autofs filesystem are directories. + * + * C) Autofs only intercepts vop lookup operations. Notably, it does _not_ + * intercept and re-direct vop readdir operations. This means that the + * observed behavior of the Linux automounter can be considerably different + * from that of the Solaris automounter. 
Specifically, on Solaris if an autofs + * mount point is mounted _without_ the -nobrowse option then if a user does + * an ls operation (which translates into a vop readdir operation) then the + * automounter will intercept that operation and list all the possible + * directories and mount points without actually mounting any filesystems. 
 + * Essentially, all automounter managed mount points on Linux will behave + * like "-nobrowse" mount points on Solaris. Here's an example to + * illustrate this. If /ws was mounted on Solaris without the -nobrowse + * option and an auto_ws yp map was set up as the backing store for this + * mount point, then an "ls /ws" would list all the keys in the map as + * valid directories, but an "ls /ws" on Linux would list an empty + * directory. + * + * D) NFS mounts are performed by the automount process. When the automount + * process gets a redirected lookup request, it determines _all_ the + * possible remote mount points for that request, creates directory paths + * via mkdir, and mounts the remote filesystems on the newly created paths. + * So for example, if a machine called mcescher exported /var/crash and + * /var/core, an "ls /net/mcescher" would result in the following actions + * being done by the automounter: + * mkdir /net/mcescher + * mkdir /net/mcescher/var + * mkdir /net/mcescher/var/crash + * mkdir /net/mcescher/var/core + * mount mcescher:/var/crash /net/mcescher/var/crash + * mount mcescher:/var/core /net/mcescher/var/core + * Once the automounter completed the work above it would signal the autofs + * filesystem (via an ioctl) that the lookup could continue. + * + * E.1) Autofs only redirects vop lookup operations for path entries that + * don't already exist in the autofs filesystem. So for the example above, + * an initial (after the start of the automounter) "ls /net/mcescher" would + * result in a request to the automounter. A subsequent "ls /net/mcescher" + * would not result in a request to the automounter. Even if + * /net/mcescher/var/crash and /net/mcescher/var/core were manually unmounted + * after the initial "ls /net/mcescher", a subsequent "ls /net/mcescher" + * would not result in a new request to the automounter. 
+ * + * E.2) Autofs lookup requests that are sent to the automounter only include + * the root directory path component. 
So for example, after starting up + * the automounter if a user were to do a "ls /net/mcescher/var/crash", the + * lookup request actually sent to the automounter would just be for + * "mcescher". (The same request as if the user had done "ls /net/mcescher".) + * + * E.3) The two statements above aren't entirely true. The Linux + * autofs filesystem will also redirect lookup operations for leaf + * directories that don't have a filesystem mounted on them. Using the + * example above, if a user did a "ls /net/mcescher", then manually + * unmounted /net/mcescher/var/crash, and then did an "ls + * /net/mcescher/var/crash", this would result in a request for + * "mcescher/var/crash" being sent to the automounter. The strange thing + * (a Linux bug perhaps) is that the automounter won't do anything with this + * request and the lookup will fail. + * + * F) The autofs filesystem communication protocol (what ioctls it supports + * and what data it passes to the automount process) is versioned. The + * source for the userland automount daemon (I looked at version v3.1.7) + * seemed to support two versions of the Linux kernel autofs implementation. + * Both versions supported communication with a pipe and the format of the + * structure passed via this pipe was the same. The difference between the + * two versions was in the functionality supported. (The v3 version has + * additional ioctls to support automount timeouts.) + * + * + * + * +++ lx_autofs notes + * + * 1) In general, the lx_autofs filesystem tries to mimic the behavior of the + * Linux autofs filesystem with the following exceptions: + * + * 1.1) We don't bother to implement the E.3 functionality listed above + * since it doesn't appear to be of any use. + * + * 1.2) We only implement v2 of the automounter protocol since + * implementing v3 would take a _lot_ more work. If this proves to be a + * problem we can re-visit this decision later. 
(More details about v3 + * support are included in comments below.) + * + * 2) In general, the approach taken for lx_autofs is to keep it as simple + * as possible and to minimize its memory usage. To do this all information + * about the contents of the lx_autofs filesystem is mirrored in the + * underlying filesystem that lx_autofs is mounted on and most vop operations + * are simply passed onto this underlying filesystem. This means we don't + * have to implement most of the complex operations that a full filesystem + * normally has to implement. It also means that most of our filesystem state + * (wrt the contents of the filesystem) doesn't actually have to be stored + * in memory, we can simply go to the underlying filesystem to get it when + * it's requested. For the purposes of discussion, we'll call the underlying + * filesystem the "backing store." + * + * The backing store is actually a directory called ".lx_afs" which is created + * in the directory where the lx_autofs filesystem is mounted. When the + * lx_autofs filesystem is unmounted this backing store directory is deleted. + * If this directory exists at mount time (perhaps the system crashed while a + * previous lx_autofs instance was mounted at the same location) it will be + * deleted. There are a few implications of using a backing store worth + * mentioning. + * + * 2.1) lx_autofs can't be mounted on a read only filesystem. If this + * proves to be a problem we can probably move the location of the + * backing store. + * + * 2.2) If the backing store filesystem runs out of space then the + * automounter process won't be able to create more directories and mount + * new filesystems. Of course, strange failures usually happen when + * filesystems run out of space. + * + * 3) Why aren't we using gfs? gfs has two different usage models. + * + * 3.1) I'm my own filesystem but I'm using gfs to help with managing + * readdir operations. 
 + * + * 3.2) I'm a gfs filesystem and gfs is managing all my vnodes + * + * We're not using the 3.1 interfaces because we don't implement readdir + * ourselves. We pass all readdir operations onto the backing store + * filesystem and utilize its readdir implementation. + * + * We're not using the 3.2 interfaces because they are really designed for + * in memory filesystems where all of the filesystem state is stored in + * memory. They don't lend themselves to filesystems where part of the + * state is in memory and part of the state is on disk. + * + * For more information on gfs take a look at the block comments in the + * top of gfs.c + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that the name of the actual Solaris filesystem is lx_afs and not + * lx_autofs. This is because filesystem names are stupidly limited to 8 + * characters. + */ +#define LX_AUTOFS_NAME "lx_afs" + +/* + * Mount options supported. + */ +#define LX_MNTOPT_FD "fd" +#define LX_MNTOPT_PGRP "pgrp" +#define LX_MNTOPT_MINPROTO "minproto" +#define LX_MNTOPT_MAXPROTO "maxproto" + +/* Version of the Linux kernel automount protocol we support. */ +#define LX_AUTOFS_PROTO_VERSION 2 + +/* + * Command structure sent to automount process from lx_autofs via a pipe. + * This structure is the same for v2 and v3 of the automount protocol + * (the communication pipe is established at mount time). + */ +typedef struct lx_autofs_pkt { + int lap_protover; /* protocol version number */ + int lap_constant; /* always set to 0 */ + int lap_id; /* every pkt must have a unique id */ + int lap_name_len; /* don't include newline or NULL */ + char lap_name[256]; /* path component to lookup */ +} lx_autofs_pkt_t; + +/* + * Ioctls supported (v2 protocol). 
+ */ +#define LX_AUTOFS_IOC_READY 0x00009360 /* arg: int */ +#define LX_AUTOFS_IOC_FAIL 0x00009361 /* arg: int */ +#define LX_AUTOFS_IOC_CATATONIC 0x00009362 /* arg: <none> */ + +/* + * Ioctls not supported (v3 protocol). 
 + * + * Initially we're only going to support v2 of the Linux kernel automount + * protocol. This means that we don't support the following ioctls. + * + * 1) The protocol version ioctl (by not supporting it the automounter + * will assume version 2). + * + * 2) Automounter timeout ioctls. For v3 and later the automounter can + * be started with a timeout option. It will notify the filesystem of + * this timeout and, if any automounter filesystem root directory entry + * is not in use, the filesystem will notify the automounter via the + * LX_AUTOFS_IOC_EXPIRE ioctl. For example, if the timeout is 60 seconds, + * the Linux automounter will use the LX_AUTOFS_IOC_EXPIRE ioctl to query + * for timeouts more often than that. (v3.1.7 of the automount daemon would + * perform this ioctl every <timeout>/4 seconds.) The autofs filesystem + * will then report top level directories that aren't in use to the + * automounter via this ioctl. If /net was managed by the automounter and + * there were the following mount points: + * /net/jurassic/var/crash + * /net/mcescher/var/crash + * and no one was looking at any crash dumps on mcescher but someone + * was analyzing a crash dump on jurassic, then after <timeout> seconds + * had passed the autofs filesystem would let the automounter know that + * "mcescher" could be unmounted. (Note the granularity of notification + * is directories in the root of the autofs filesystem.) Here are two + * ideas for how this functionality could be implemented on Solaris: + * + * 2.1) The easy incomplete way. Don't do any in-use detection. Simply + * tell the automounter it can try to unmount the filesystem every time + * the specified timeout passes. If the filesystem is in use then the + * unmount will fail. This would break down for remote hosts with multiple + * mounts. 
For example, if the automounter had mounted the following + * filesystems: + * /net/jurassic/var/crash + * /net/jurassic/var/core + * and the user was looking at a core file, and the timeout expired, the + * automounter would receive notification to unmount "jurassic". Then + * it would unmount crash (which would succeed) and then try to unmount + * core (which would fail). After that (since the automounter only + * performs mounts for failed lookups in the root autofs directory) + * future access to /net/jurassic/var/crash would result in access + * to an empty autofs directory. We might be able to work around + * this by caching which root autofs directories we've timed out, + * then any access to paths that contain those directories could be + * stalled and we could resend another request to the automounter. + * This could work if the automounter ignores mount failures. + * + * 2.2) The hard correct way. The real difficulty here is detecting + * files in use on other filesystems (say NFS) that have been mounted + * on top of autofs. (Detecting in use autofs vnodes should be easy.) + * To do this we would probably have to create a new brand op to intercept + * mount/umount filesystem operations. Then using this entry point we + * could detect mounts of other filesystems on top of lx_autofs. When + * a successful mount finishes we would use the FEM (file event + * monitoring) framework to push a module onto that filesystem and + * intercept VOP operations that allocate/free vnodes in that filesystem. + * (We would also then have to track mount operations on top of that + * filesystem, etc.) This would allow us to properly detect any + * usage of subdirectories of an autofs directory. 
 + */ +#define LX_AUTOFS_IOC_PROTOVER 0x80049363 /* arg: int */ +#define LX_AUTOFS_IOC_EXPIRE 0x81109365 /* arg: lx_autofs_expire * */ +#define LX_AUTOFS_IOC_SETTIMEOUT 0xc0049364 /* arg: ulong_t */ + +typedef struct lx_autofs_expire { + int lap_protover; /* protocol version number */ + int lap_constant; /* always set to 1 */ + int lap_name_len; /* don't include newline or NULL */ + char lap_name[256]; /* path component that has timed out */ +} lx_autofs_expire_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h new file mode 100644 index 0000000000..9c5517b8d5 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
 + */ + +#ifndef _LX_AUTOFS_IMPL_H +#define _LX_AUTOFS_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/file.h> +#include <sys/id_space.h> +#include <sys/modhash.h> +#include <sys/vnode.h> + +#include <sys/lx_autofs.h> + +/* + * Space key. + * Used to persist data across lx_autofs filesystem module unloads. + */ +#define LX_AUTOFS_SPACE_KEY_UDEV LX_AUTOFS_NAME "_udev" + +/* + * Name of the backing store directory. + */ +#define LX_AUTOFS_BS_DIR "." LX_AUTOFS_NAME + +#define LX_AUTOFS_VFS_ID_HASH_SIZE 15 +#define LX_AUTOFS_VFS_PATH_HASH_SIZE 15 +#define LX_AUTOFS_VFS_VN_HASH_SIZE 15 + +/* + * VFS data object. + */ +typedef struct lx_autofs_vfs { + /* Info about the underlying filesystem and backing store. */ + vnode_t *lav_mvp; + char *lav_bs_name; + vnode_t *lav_bs_vp; + + /* Info about the automounter process managing this filesystem. */ + int lav_fd; + pid_t lav_pgrp; + file_t *lav_fifo_wr; + file_t *lav_fifo_rd; + + /* Each automount request needs a unique id. */ + id_space_t *lav_ids; + + /* All remaining structure members are protected by lav_lock. */ + kmutex_t lav_lock; + + /* Hashes to keep track of outstanding automounter requests. */ + mod_hash_t *lav_path_hash; + mod_hash_t *lav_id_hash; + + /* We need to keep track of all our vnodes. */ + vnode_t *lav_root; + mod_hash_t *lav_vn_hash; +} lx_autofs_vfs_t; + +/* + * Structure to keep track of requests sent to the automounter. + */ +typedef struct lx_autofs_lookup_req { + /* Packet that gets sent to the automounter. */ + lx_autofs_pkt_t lalr_pkt; + + /* Reference count. Always updated atomically. */ + uint_t lalr_ref; + + /* + * Fields to keep track of and sync threads waiting on a lookup. + * Fields are protected by lalr_lock. + */ + kmutex_t lalr_lock; + kcondvar_t lalr_cv; + int lalr_complete; +} lx_autofs_lookup_req_t; + +/* + * Generic stack structure. 
+ */ +typedef struct stack_elem { + list_node_t se_list; + caddr_t se_ptr1; + caddr_t se_ptr2; + caddr_t se_ptr3; +} stack_elem_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_IMPL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h new file mode 100644 index 0000000000..4cbcda48bf --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h @@ -0,0 +1,210 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#ifndef _LX_BRAND_H
+#define	_LX_BRAND_H
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#ifndef _ASM
+#include <sys/types.h>
+#include <sys/cpuvar.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	LX_BRANDNAME	"lx"
+
+/*
+ * Brand uname info
+ */
+#define	LX_UNAME_SYSNAME	"Linux"
+#define	LX_UNAME_RELEASE	"2.4.21"
+#define	LX_UNAME_VERSION	"BrandZ fake linux"
+#define	LX_UNAME_MACHINE	"i686"
+
+#define	LX_LINKER_NAME "ld-linux.so.2"
+#define	LX_LINKER	"/lib/" LX_LINKER_NAME
+#define	LX_LIBC_NAME	"libc.so.6"
+#define	LIB_PATH	"/native/usr/lib/"
+#define	LX_LIB		"lx_brand.so.1"
+#define	LX_LIB_PATH	LIB_PATH LX_LIB	/* LIB_PATH and LX_LIB concatenated */
+
+#define	LX_NSYSCALLS	270
+
+/*
+ * brand(2) subcommands
+ *
+ * Everything >= 128 is a brand-specific subcommand.
+ * 192 to 462 are reserved for system calls, although most of that space is
+ * unused.
+ * (192 is B_EMULATE_SYSCALL, and 192 + LX_NSYSCALLS = 462.)
+ */
+#define	B_LPID_TO_SPAIR		128
+#define	B_SYSENTRY		129
+#define	B_SYSRETURN		130
+#define	B_PTRACE_SYSCALL	131
+#define	B_SET_AFFINITY_MASK	132
+#define	B_GET_AFFINITY_MASK	133
+
+#define	B_EMULATE_SYSCALL	192
+
+#define	LX_VERSION_1		1
+#define	LX_VERSION		LX_VERSION_1
+
+#define	LX_ATTR_RESTART_INIT	ZONE_ATTR_BRAND_ATTRS
+
+#ifndef _ASM
+
+typedef struct lx_brand_registration {
+	uint_t lxbr_version;		/* version number */
+	void *lxbr_handler;		/* base address of handler */
+	void *lxbr_tracehandler;	/* base address of trace handler */
+	void *lxbr_traceflag;		/* address of trace flag */
+} lx_brand_registration_t;
+
+#ifdef _SYSCALL32
+typedef struct lx_brand_registration32 {	/* same fields, 32-bit widths */
+	uint32_t lxbr_version;		/* version number */
+	caddr32_t lxbr_handler;		/* base address of handler */
+	caddr32_t lxbr_tracehandler;	/* base address of trace handler */
+	caddr32_t lxbr_traceflag;	/* address of trace flag */
+} lx_brand_registration32_t;
+#endif
+
+typedef struct lx_regs {	/* NOTE(review): field order presumably matters to consumers; confirm */
+	long lxr_gs;
+	long lxr_edi;
+	long lxr_esi;
+	long lxr_ebp;
+	long lxr_esp;
+	long lxr_ebx;
+	long lxr_edx;
+	long lxr_ecx;
+	long lxr_eax;
+	long lxr_eip;
+
+	long
lxr_orig_eax; +} lx_regs_t; + +#endif /* _ASM */ + +/* + * GDT usage + */ +#define GDT_TLSMIN (GDT_BRANDMIN) +#define GDT_TLSMAX (GDT_TLSMIN + 2) +#define LX_TLSNUM (GDT_TLSMAX - GDT_TLSMIN) + +#ifndef _ASM + +/* + * Stores information needed by the lx linker to launch the main + * lx executable. + */ +typedef struct lx_elf_data { + int ed_phdr; + int ed_phent; + int ed_phnum; + int ed_entry; + int ed_base; + int ed_ldentry; +} lx_elf_data_t; + +#ifdef _KERNEL + +typedef struct lx_proc_data { + uintptr_t l_handler; /* address of user-space handler */ + uintptr_t l_tracehandler; /* address of user-space traced handler */ + uintptr_t l_traceflag; /* address of 32-bit tracing flag */ + void (*l_sigrestorer[MAXSIG])(void); /* array of sigrestorer fns */ + pid_t l_ppid; /* pid of originating parent proc */ + uint64_t l_ptrace; /* process being observed with ptrace */ + lx_elf_data_t l_elf_data; /* ELF data for linux executable */ +} lx_proc_data_t; + +#ifdef __amd64 +typedef uint64_t lx_affmask_t; /* Tolerates NCPU up to 64 */ +#else +typedef uint32_t lx_affmask_t; /* Tolerates NCPU up to 32 */ +#endif /* __amd64 */ + +/* + * lx-specific data in the klwp_t + */ +typedef struct lx_lwp_data { + uint_t br_lwp_flags; /* misc. 
flags */
+	klwp_t	*br_lwp;		/* back pointer to container lwp */
+	int	br_signal;		/* signal to send to parent when */
+					/* clone()'ed child terminates */
+	int	br_exitwhy;		/* reason for thread (process) exit */
+	int	br_exitwhat;		/* exit code / killing signal */
+	lx_affmask_t br_affinitymask;	/* bitmask of CPU sched affinities */
+	struct user_desc br_tls[LX_TLSNUM];
+			/* descriptors used by libc for TLS; lx_clone() */
+			/* indexes this by (GDT entry - GDT_TLSMIN) */
+	pid_t	br_pid;			/* converted pid for this thread */
+	pid_t	br_tgid;		/* thread group ID for this thread */
+	pid_t	br_ppid;		/* parent pid for this thread */
+	id_t	br_ptid;		/* parent tid for this thread */
+	void	*br_clear_ctidp;	/* clone thread id ptr; set by */
+					/* lx_clone() for CHILD_CLEARTID and */
+					/* by lx_set_tid_address() */
+	void	*br_set_ctidp;		/* clone thread id ptr */
+
+	/*
+	 * The following struct is used by lx_clone()
+	 * to pass info into fork()
+	 */
+	void	*br_clone_args;
+
+	/*
+	 * Space to save off userland Linux %gs pointer so we can restore it
+	 * before calling signal handlers.
+	 */
+	greg_t	br_ugs;
+
+	uint_t	br_ptrace;		/* ptrace is active for this LWP */
+} lx_lwp_data_t;
+
+#define	BR_CPU_BOUND	0x0001	/* presumably a br_lwp_flags bit; confirm */
+
+#define	ttolxlwp(t)	((struct lx_lwp_data *)ttolwpbrand(t))
+#define	lwptolxlwp(l)	((struct lx_lwp_data *)lwptolwpbrand(l))
+#define	ttolxproc(t)	((struct lx_proc_data *)(t)->t_procp->p_brand_data)
+
+void	lx_brand_int80_callback(void);
+int64_t	lx_emulate_syscall(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t,
+	uintptr_t, uintptr_t);
+
+extern int lx_debug;
+#define	lx_print	if (lx_debug) printf	/* NOTE(review): expands to a bare 'if'; an 'else' written after a call site would bind to it */
+
+#endif	/* _KERNEL */
+#endif /* _ASM */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LX_BRAND_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_futex.h b/usr/src/uts/common/brand/lx/sys/lx_futex.h
new file mode 100644
index 0000000000..ac963b015b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_futex.h
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LX_FUTEX_H
+#define	_SYS_LX_FUTEX_H
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	FUTEX_WAIT		0	/* sleep while the futex holds val */
+#define	FUTEX_WAKE		1	/* wake up to val waiters */
+#define	FUTEX_FD		2	/* fd for a futex; not supported */
+#define	FUTEX_REQUEUE		3	/* wake some waiters, requeue others */
+#define	FUTEX_CMP_REQUEUE	4	/* as REQUEUE, but checked against val2 */
+#define	FUTEX_MAX_CMD		FUTEX_CMP_REQUEUE	/* highest cmd lx_futex() accepts */
+
+extern long lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout,
+	uintptr_t addr2, int val2);
+extern void lx_futex_init(void);
+extern int lx_futex_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LX_FUTEX_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_impl.h b/usr/src/uts/common/brand/lx/sys/lx_impl.h
new file mode 100644
index 0000000000..12f1aab2b3
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_impl.h
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LX_IMPL_H
+#define	_LX_IMPL_H
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int64_t (*llfcn_t)();	/* old-style declarator: argument list unspecified */
+
+typedef struct lx_sysent {
+	int	sy_flags;
+	char	*sy_name;	/* presumably the syscall's name; confirm */
+	llfcn_t	sy_callc;	/* function pointer of type llfcn_t */
+	char	sy_narg;	/* presumably the argument count; confirm */
+} lx_sysent_t;
+
+typedef void (lx_systrace_f)(ulong_t, ulong_t, ulong_t, ulong_t, ulong_t,
+    ulong_t, ulong_t);	/* function type (not pointer) taking seven ulong_t */
+
+
+extern lx_sysent_t	lx_sysent[];	/* defined elsewhere */
+
+extern lx_systrace_f	*lx_systrace_entry_ptr;
+extern lx_systrace_f	*lx_systrace_return_ptr;
+
+extern void lx_brand_systrace_enable(void);
+extern void lx_brand_systrace_disable(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LX_IMPL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_ldt.h b/usr/src/uts/common/brand/lx/sys/lx_ldt.h
new file mode 100644
index 0000000000..5080c3adae
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_ldt.h
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LINUX_LDT_H
+#define	_SYS_LINUX_LDT_H
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#include <sys/segments.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ldt_info {	/* user-supplied descriptor; lx_clone() copies one in */
+	uint_t	entry_number;	/* GDT entry; lx_clone() range-checks it */
+	uint_t	base_addr;
+	uint_t	limit;
+	uint_t	seg_32bit:1,
+		contents:2,
+		read_exec_only:1,
+		limit_in_pages:1,
+		seg_not_present:1,
+		useable:1;
+};
+
+#define	LDT_INFO_EMPTY(info) /* true for zero base/limit with only read_exec_only and seg_not_present set */ \
+	((info)->base_addr == 0 && (info)->limit == 0 &&		\
+	(info)->contents == 0 && (info)->read_exec_only == 1 &&		\
+	(info)->seg_32bit == 0 && (info)->limit_in_pages == 0 &&	\
+	(info)->seg_not_present == 1 && (info)->useable == 0)
+
+#if defined(__amd64)
+#define	SETMODE(desc)	(desc)->usd_long = SDP_SHORT;	/* NOTE(review): expansion already ends in ';' */
+#else
+#define	SETMODE(desc)	/* expands to nothing when __amd64 is not defined */
+#endif
+
+#define	LDT_INFO_TO_DESC(info, desc)	{ /* NOTE(review): bare-brace macro; take care under an unbraced if/else */ \
+	USEGD_SETBASE(desc, (info)->base_addr);				\
+	USEGD_SETLIMIT(desc, (info)->limit);				\
+	(desc)->usd_type = ((info)->contents << 2) | /* 0x10 | contents << 2 | (read_exec_only ^ 1) << 1 */ \
+	    ((info)->read_exec_only ^ 1) << 1 | 0x10;			\
+	(desc)->usd_dpl = SEL_UPL;					\
+	(desc)->usd_p = (info)->seg_not_present ^ 1;			\
+	(desc)->usd_def32 = (info)->seg_32bit;				\
+	(desc)->usd_gran = (info)->limit_in_pages;			\
+	(desc)->usd_avl = (info)->useable;				\
+	SETMODE(desc);							\
+}
+
+#define	DESC_TO_LDT_INFO(desc, info)	{ /* inverse mapping; zeroes *info first */ \
+	bzero((info), sizeof (*(info)));				\
+	(info)->base_addr = USEGD_GETBASE(desc);			\
+	(info)->limit = USEGD_GETLIMIT(desc);				\
+	(info)->seg_not_present = (desc)->usd_p ^ 1;			\
+	(info)->contents = ((desc)->usd_type >> 2) & 3;			\
+	(info)->read_exec_only = (((desc)->usd_type
>> 1) & 1) ^ 1;	\
+	(info)->seg_32bit = (desc)->usd_def32;				\
+	(info)->limit_in_pages = (desc)->usd_gran;			\
+	(info)->useable = (desc)->usd_avl;				\
+}
+
+extern void lx_set_gdt(int, user_desc_t *);	/* lx_clone() passes (GDT entry, descriptor) */
+extern void lx_clear_gdt(int);			/* lx_clone() passes the GDT entry to undo */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LINUX_LDT_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_pid.h b/usr/src/uts/common/brand/lx/sys/lx_pid.h
new file mode 100644
index 0000000000..80c8079f0b
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_pid.h
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LX_PID_H
+#define	_SYS_LX_PID_H
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#include <sys/note.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef	_KERNEL
+struct lx_pid {	/* ties one solaris (pid, tid) pair to one linux pid */
+	pid_t	s_pid;			/* the solaris pid and ... */
+	id_t	s_tid;			/* ...
tid pair */
+	pid_t	l_pid;			/* the corresponding linux pid */
+	time_t	l_start;		/* birthday of this pid */
+	struct pid *l_pidp;
+	struct lx_pid *stol_next;	/* link in stol hash table */
+					/* (presumably solaris-to-linux) */
+	struct lx_pid *ltos_next;	/* link in ltos hash table */
+					/* (presumably linux-to-solaris) */
+};
+
+extern int lx_pid_assign(kthread_t *);
+extern void lx_pid_reassign(kthread_t *);
+extern void lx_pid_rele(pid_t, id_t);
+extern pid_t lx_lpid_to_spair(pid_t, pid_t *, id_t *);
+extern pid_t lx_lwp_ppid(klwp_t *, pid_t *, id_t *);
+extern void lx_pid_init(void);
+extern void lx_pid_fini(void);
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_LX_PID_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_ptm.h b/usr/src/uts/common/brand/lx/sys/lx_ptm.h
new file mode 100644
index 0000000000..74bbc939a3
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_ptm.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_PTM_LINUX_H
+#define	_SYS_PTM_LINUX_H
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	LX_PTM_DRV		"lx_ptm"
+#define	LX_PTM_MINOR_NODE	"lx_ptmajor"
+
+#define	LX_PTM_DEV_TO_PTS(dev)	(getminor(dev) - 1)	/* minor number of dev, minus one */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_PTM_LINUX_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_sched.h b/usr/src/uts/common/brand/lx/sys/lx_sched.h
new file mode 100644
index 0000000000..b0ae748f3c
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_sched.h
@@ -0,0 +1,60 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LINUX_SCHED_H
+#define	_SYS_LINUX_SCHED_H
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#include <sys/procset.h>
+#include <sys/priocntl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Linux scheduler policies.
+ */
+#define	LX_SCHED_OTHER	0
+#define	LX_SCHED_FIFO	1
+#define	LX_SCHED_RR	2
+
+#define	LX_PRI_MAX	99	/* presumably the highest Linux priority accepted; confirm */
+
+typedef	int l_pid_t;	/* pid as seen by the Linux side */
+
+struct lx_sched_param {
+	int	lx_sched_prio;
+};
+
+extern int sched_setprocset(procset_t *, l_pid_t);
+extern long do_priocntlsys(int, procset_t *, void *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LINUX_SCHED_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
new file mode 100644
index 0000000000..b4d41d5241
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h
@@ -0,0 +1,68 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_LINUX_SYSCALLS_H
+#define	_SYS_LINUX_SYSCALLS_H
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL	/* in-kernel handlers; declared old-style, so argument types are not checked here */
+
+extern long lx_brk();
+extern long lx_getpid();
+extern long lx_getppid();
+extern long lx_clone();
+extern long lx_kill();
+extern long lx_tkill();
+extern long lx_modify_ldt();
+extern long lx_gettid();
+extern long lx_futex();
+extern long lx_get_thread_area();
+extern long lx_sched_getparam();
+extern long lx_sched_getscheduler();
+extern long lx_sched_rr_get_interval();
+extern long lx_sched_setparam();
+extern long lx_sched_setscheduler();
+extern long lx_set_thread_area();
+extern long lx_set_tid_address();
+extern long lx_setresgid();
+extern long lx_setresgid16();
+extern long lx_setresuid();
+extern long lx_setresuid16();
+extern long lx_sysinfo();
+extern long lx_setgroups();
+
+#endif	/* _KERNEL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LINUX_SYSCALLS_H */
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_brk.c b/usr/src/uts/common/brand/lx/syscall/lx_brk.c
new file mode 100644
index 0000000000..25a719986e
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_brk.c
@@ -0,0 +1,59 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> + +/* + * The brk() system call needs to be in-kernel because Linux expects a call to + * brk(0) to return the current breakpoint. In Solaris, the process breakpoint + * is setup and managed by libc. Due to the way we link our libraries and the + * need for Linux to manage its own breakpoint, this has to remain in the + * kernel. + */ +extern int brk(caddr_t); + +long +lx_brk(caddr_t nva) +{ + proc_t *p = curproc; + klwp_t *lwp = ttolwp(curthread); + + if (nva != 0) { + (void) brk(nva); + + /* + * Despite claims to the contrary in the manpage, when Linux + * brk() fails, errno is left unchanged. + */ + lwp->lwp_errno = 0; + } + +out: + return ((long)(p->p_brkbase + p->p_brksize)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c new file mode 100644 index 0000000000..2af3c00bae --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c @@ -0,0 +1,135 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_ldt.h>
+
+#define	LX_CSIGNAL		0x000000ff	/* mask: signal number kept in br_signal */
+#define	LX_CLONE_VM		0x00000100
+#define	LX_CLONE_FS		0x00000200
+#define	LX_CLONE_FILES		0x00000400
+#define	LX_CLONE_SIGHAND	0x00000800
+#define	LX_CLONE_PID		0x00001000
+#define	LX_CLONE_PTRACE		0x00002000
+#define	LX_CLONE_PARENT		0x00008000	/* lx_clone() zeroes br_ppid */
+#define	LX_CLONE_THREAD		0x00010000	/* br_tgid becomes the process pid */
+#define	LX_CLONE_SYSVSEM	0x00040000
+#define	LX_CLONE_SETTLS		0x00080000	/* install the descriptor at ldtinfo */
+#define	LX_CLONE_PARENT_SETTID	0x00100000	/* write br_pid to ptidp */
+#define	LX_CLONE_CHILD_CLEARTID	0x00200000	/* remember ctidp in br_clear_ctidp */
+#define	LX_CLONE_DETACH		0x00400000	/* br_signal is forced to 0 */
+#define	LX_CLONE_CHILD_SETTID	0x01000000	/* write br_pid to ctidp */
+
+/*
+ * Our lwp has already been created at this point, so this routine is
+ * responsible for setting up all the state needed to track this as a
+ * linux cloned thread.
+ */ +/* ARGSUSED */ +long +lx_clone(int flags, void *stkp, void *ptidp, void *ldtinfo, void *ctidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + struct ldt_info info; + struct user_desc descr; + int tls_index; + int entry = -1; + int signo; + + signo = flags & LX_CSIGNAL; + if (signo < 0 || signo > MAXSIG) + return (set_errno(EINVAL)); + + if (flags & LX_CLONE_SETTLS) { + if (copyin((caddr_t)ldtinfo, &info, sizeof (info))) + return (set_errno(EFAULT)); + + if (LDT_INFO_EMPTY(&info)) + return (set_errno(EINVAL)); + + entry = info.entry_number; + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + tls_index = entry - GDT_TLSMIN; + + /* + * Convert the user-space structure into a real x86 + * descriptor and copy it into this LWP's TLS array. We + * also load it into the GDT. + */ + LDT_INFO_TO_DESC(&info, &descr); + bcopy(&descr, &lwpd->br_tls[tls_index], sizeof (descr)); + lx_set_gdt(entry, &lwpd->br_tls[tls_index]); + } else { + tls_index = -1; + bzero(&descr, sizeof (descr)); + } + + lwpd->br_clear_ctidp = + (flags & LX_CLONE_CHILD_CLEARTID) ? ctidp : NULL; + + if (signo && ! 
(flags & LX_CLONE_DETACH)) + lwpd->br_signal = signo; + else + lwpd->br_signal = 0; + + if (flags & LX_CLONE_THREAD) + lwpd->br_tgid = curthread->t_procp->p_pid; + + if (flags & LX_CLONE_PARENT) + lwpd->br_ppid = 0; + + if ((flags & LX_CLONE_CHILD_SETTID) && (ctidp != NULL) && + (suword32(ctidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + if ((flags & LX_CLONE_PARENT_SETTID) && (ptidp != NULL) && + (suword32(ptidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + + return (lwpd->br_pid); +} + +long +lx_set_tid_address(int *tidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + + lwpd->br_clear_ctidp = tidp; + + return (lwpd->br_pid); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c new file mode 100644 index 0000000000..ceb6f330aa --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c @@ -0,0 +1,471 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#pragma ident	"%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/debug.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <vm/page.h>
+#include <sys/mman.h>
+#include <sys/timer.h>
+#include <sys/condvar.h>
+#include <sys/inttypes.h>
+#include <sys/lx_futex.h>
+
+/*
+ * Futexes are a Linux-specific implementation of inter-process mutexes.
+ * They are designed to use shared memory for simple, uncontested
+ * operations, and rely on the kernel to resolve any contention issues.
+ *
+ * Most of the information in this section comes from the paper "Futexes
+ * Are Tricky", by Ulrich Drepper. This paper is currently available at:
+ * http://people.redhat.com/~drepper/futex.pdf.
+ *
+ * A futex is itself a 4-byte integer, which must be 4-byte aligned. The
+ * value of this integer is expected to be modified using user-level atomic
+ * operations. The futex(4) design itself does not impose any semantic
+ * constraints on the value stored in the futex; it is up to the
+ * application to define its own protocol.
+ *
+ * When the application decides that kernel intervention is required, it
+ * will use the futex(2) system call. There are 5 different operations
+ * that can be performed on a futex, using this system call. Since this
+ * interface has evolved over time, there are several different prototypes
+ * available to the user. Fortunately, there is only a single kernel-level
+ * interface:
+ *
+ * long sys_futex(void *futex1, int cmd, int val1,
+ *	struct timespec *timeout, void *futex2, int val2)
+ *
+ * The kernel-level operations that may be performed on a futex are:
+ *
+ * FUTEX_WAIT
+ *
+ *	Atomically verify that futex1 contains the value val1. If it
+ *	doesn't, return EWOULDBLOCK. If it does contain the expected
+ *	value, the thread will sleep until somebody performs a FUTEX_WAKE
+ *	on the futex.
The caller may also specify a timeout, indicating
+ *	the maximum time the thread should sleep. If the timer expires,
+ *	the call returns ETIMEDOUT. If the thread is awoken with a signal,
+ *	the call returns EINTR. Otherwise, the call returns 0.
+ *
+ * FUTEX_WAKE
+ *
+ *	Wake up val1 processes that are waiting on futex1. The call
+ *	returns the number of blocked threads that were woken up.
+ *
+ * FUTEX_CMP_REQUEUE
+ *
+ *	If the value stored in futex1 matches that passed in val2, wake
+ *	up val1 processes that are waiting on futex1. Otherwise, return
+ *	EAGAIN.
+ *
+ *	If there are more than val1 threads waiting on the futex, remove
+ *	the remaining threads from this futex, and requeue them on futex2.
+ *	The caller can limit the number of threads being requeued by
+ *	encoding an integral numerical value in the position usually used
+ *	for the timeout pointer.
+ *
+ *	The call returns the number of blocked threads that were woken up
+ *	or requeued.
+ *
+ * FUTEX_REQUEUE
+ *
+ *	Identical to FUTEX_CMP_REQUEUE except that it does not use val2.
+ *	This command has been declared broken and obsolete, but we still
+ *	need to support it.
+ *
+ * FUTEX_FD
+ *
+ *	Return a file descriptor, which can be used to refer to the futex.
+ *	We don't support this operation.
+ */
+
+/*
+ * This structure is used to track all the threads currently waiting on a
+ * futex. There is one fwaiter_t for each blocked thread. We store all
+ * fwaiter_t's in a hash structure, indexed by the memid_t of the integer
+ * containing the futex's value.
+ *
+ * At the moment, all fwaiter_t's for a single futex are simply dumped into
+ * the hash bucket. If futex contention ever becomes a hot path, we can
+ * chain a single futex's waiters together.
+ */ +typedef struct fwaiter { + memid_t fw_memid; /* memid of the user-space futex */ + kcondvar_t fw_cv; /* cond var */ + struct fwaiter *fw_next; /* hash queue */ + struct fwaiter *fw_prev; /* hash queue */ + volatile int fw_woken; +} fwaiter_t; + +#define MEMID_COPY(s, d) \ + { (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; } +#define MEMID_EQUAL(s, d) \ + ((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1]) + +/* Borrowed from the page freelist hash code. */ +#define HASH_SHIFT_SZ 7 +#define HASH_SIZE (1 << HASH_SHIFT_SZ) +#define HASH_FUNC(id) \ + ((((uintptr_t)((id)->val[1]) >> PAGESHIFT) + \ + ((uintptr_t)((id)->val[1]) >> (PAGESHIFT + HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[0]) >> 3) + \ + ((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) & \ + (HASH_SIZE - 1)) + +static fwaiter_t *futex_hash[HASH_SIZE]; +static kmutex_t futex_hash_lock[HASH_SIZE]; + +static void +futex_hashin(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash_lock[index])); + + fwp->fw_prev = NULL; + fwp->fw_next = futex_hash[index]; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp; + futex_hash[index] = fwp; +} + +static void +futex_hashout(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash_lock[index])); + + if (fwp->fw_prev) + fwp->fw_prev->fw_next = fwp->fw_next; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp->fw_prev; + if (futex_hash[index] == fwp) + futex_hash[index] = fwp->fw_next; + + fwp->fw_prev = NULL; + fwp->fw_next = NULL; +} + +/* + * Go to sleep until somebody does a WAKE operation on this futex, we get a + * signal, or the timeout expires. 
+ */ +static int +futex_wait(memid_t *memid, caddr_t addr, int val, timespec_t *timeout) +{ + int err, ret; + int32_t curval; + fwaiter_t fw; + int index; + + fw.fw_woken = 0; + MEMID_COPY(memid, &fw.fw_memid); + cv_init(&fw.fw_cv, NULL, CV_DEFAULT, NULL); + + index = HASH_FUNC(&fw.fw_memid); + mutex_enter(&futex_hash_lock[index]); + + if (fuword32(addr, (uint32_t *)&curval)) { + err = set_errno(EFAULT); + goto out; + } + if (curval != val) { + err = set_errno(EWOULDBLOCK); + goto out; + } + + futex_hashin(&fw); + + err = 0; + while ((fw.fw_woken == 0) && (err == 0)) { + ret = cv_waituntil_sig(&fw.fw_cv, &futex_hash_lock[index], + timeout, timechanged); + if (ret < 0) + err = set_errno(ETIMEDOUT); + else if (ret == 0) + err = set_errno(EINTR); + } + + /* + * The futex is normally hashed out in wakeup. If we timed out or + * got a signal, we need to hash it out here instead. + */ + if (fw.fw_woken == 0) + futex_hashout(&fw); + +out: + mutex_exit(&futex_hash_lock[index]); + + return (err); +} + +/* + * Wake up to wake_threads threads that are blocked on the futex at memid. + */ +static int +futex_wake(memid_t *memid, int wake_threads) +{ + fwaiter_t *fwp, *next; + int index; + int ret = 0; + + index = HASH_FUNC(memid); + + mutex_enter(&futex_hash_lock[index]); + + for (fwp = futex_hash[index]; fwp && ret < wake_threads; fwp = next) { + next = fwp->fw_next; + if (MEMID_EQUAL(&fwp->fw_memid, memid)) { + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + ret++; + } + } + + mutex_exit(&futex_hash_lock[index]); + + return (ret); +} + +/* + * Wake up to wake_threads waiting on the futex at memid. If there are + * more than that many threads waiting, requeue the remaining threads on + * the futex at requeue_memid. 
+ */
+static int
+futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads,
+	ulong_t requeue_threads, caddr_t addr, int *cmpval)
+{
+	fwaiter_t *fwp, *next;
+	int index1, index2;
+	int woken = 0;
+	ulong_t requeued = 0;
+	int error = 0;
+	int32_t curval;
+	kmutex_t *l1, *l2;
+
+	/*
+	 * To ensure that we don't miss a wakeup if the value of cmpval
+	 * changes, we need to grab locks on both the original and new hash
+	 * buckets.  To avoid deadlock, we always grab the lower-indexed
+	 * lock first.
+	 */
+	index1 = HASH_FUNC(memid);
+	index2 = HASH_FUNC(requeue_memid);
+
+	if (index1 == index2) {
+		l1 = &futex_hash_lock[index1];
+		l2 = NULL;
+	} else if (index1 < index2) {
+		l1 = &futex_hash_lock[index1];
+		l2 = &futex_hash_lock[index2];
+	} else {
+		l1 = &futex_hash_lock[index2];
+		l2 = &futex_hash_lock[index1];
+	}
+
+	mutex_enter(l1);
+	if (l2 != NULL)
+		mutex_enter(l2);
+
+	/*
+	 * When cmpval is supplied, the operation only proceeds if the
+	 * word at addr still holds *cmpval.
+	 */
+	if (cmpval != NULL) {
+		if (fuword32(addr, (uint32_t *)&curval)) {
+			error = EFAULT;
+			goto out;
+		}
+		if (curval != *cmpval) {
+			error = EAGAIN;
+			goto out;
+		}
+	}
+
+	for (fwp = futex_hash[index1]; fwp; fwp = next) {
+		/* save the link first: futex_hashout() clears fw_next */
+		next = fwp->fw_next;
+		if (!MEMID_EQUAL(&fwp->fw_memid, memid))
+			continue;
+
+		if (woken < wake_threads) {
+			futex_hashout(fwp);
+			fwp->fw_woken = 1;
+			cv_signal(&fwp->fw_cv);
+			woken++;
+		} else if (requeued < requeue_threads) {
+			/*
+			 * Check the limit before moving the waiter, so a
+			 * requeue_threads of 0 requeues nobody.
+			 */
+			futex_hashout(fwp);
+			MEMID_COPY(requeue_memid, &fwp->fw_memid);
+			futex_hashin(fwp);
+			requeued++;
+		} else {
+			break;
+		}
+	}
+
+out:
+	if (l2 != NULL)
+		mutex_exit(l2);
+	mutex_exit(l1);
+
+	if (error != 0)
+		return (set_errno(error));
+
+	/* total number of waiters woken plus requeued */
+	return (woken + (int)requeued);
+}
+
+/*
+ * Copy in the relative timeout provided by the application and convert it
+ * to an absolute timeout.
+ */ +static int +get_timeout(void *lx_timeout, timestruc_t *timeout) +{ + timestruc_t now; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(lx_timeout, timeout, sizeof (timestruc_t))) + return (EFAULT); + } +#ifdef _SYSCALL32_IMPL + else { + timestruc32_t timeout32; + if (copyin(lx_timeout, &timeout32, sizeof (timestruc32_t))) + return (EFAULT); + timeout->tv_sec = (time_t)timeout32.tv_sec; + timeout->tv_nsec = timeout32.tv_nsec; + } +#endif + gethrestime(&now); + + if (itimerspecfix(timeout)) + return (EINVAL); + + timespecadd(timeout, &now); + return (0); +} + +long +lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, + uintptr_t addr2, int val2) +{ + struct as *as = curproc->p_as; + memid_t memid, requeue_memid; + timestruc_t timeout; + timestruc_t *tptr = NULL; + int requeue_threads; + int *requeue_cmp = NULL; + int rval = 0; + + /* must be aligned on int boundary */ + if (addr & 0x3) + return (set_errno(EINVAL)); + + /* Sanity check the futex command */ + if (cmd < 0 || cmd > FUTEX_MAX_CMD) + return (set_errno(EINVAL)); + + /* Copy in the timeout structure from userspace. */ + if (cmd == FUTEX_WAIT && lx_timeout != NULL) { + rval = get_timeout((timespec_t *)lx_timeout, &timeout); + if (rval != 0) + return (set_errno(rval)); + tptr = &timeout; + } + + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) { + if (cmd == FUTEX_CMP_REQUEUE) + requeue_cmp = &val2; + + /* + * lx_timeout is nominally a pointer to a userspace + * address. For these two commands, it actually contains + * an integer which indicates the maximum number of threads + * to requeue. This is horrible, and I'm sorry. + */ + requeue_threads = (int)lx_timeout; + } + + /* + * Translate the process-specific, user-space futex virtual + * address(es) to universal memid. 
+ */ + rval = as_getmemid(as, (void *)addr, &memid); + if (rval != 0) + return (set_errno(rval)); + + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) { + rval = as_getmemid(as, (void *)addr2, &requeue_memid); + if (rval) + return (set_errno(rval)); + } + + switch (cmd) { + case FUTEX_WAIT: + rval = futex_wait(&memid, (void *)addr, val, tptr); + break; + + case FUTEX_WAKE: + rval = futex_wake(&memid, val); + break; + + case FUTEX_CMP_REQUEUE: + case FUTEX_REQUEUE: + rval = futex_requeue(&memid, &requeue_memid, val, + requeue_threads, (void *)addr2, requeue_cmp); + + break; + } + + return (rval); +} + +void +lx_futex_init(void) +{ + int i; + + for (i = 0; i < HASH_SIZE; i++) + mutex_init(&futex_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); + bzero(futex_hash, sizeof (futex_hash)); +} + +int +lx_futex_fini(void) +{ + int i, err; + + err = 0; + for (i = 0; (err == 0) && (i < HASH_SIZE); i++) { + mutex_enter(&futex_hash_lock[i]); + if (futex_hash[i] != NULL) + err = EBUSY; + mutex_exit(&futex_hash_lock[i]); + } + return (err); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c new file mode 100644 index 0000000000..91dc24c6d6 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_pid.h> + +/* + * return the pid + */ +long +lx_getpid() +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + long rv; + + if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) + rv = 1; + else + rv = lwpd->br_tgid; + + return (rv); +} + +/* + * return the parent pid + */ +long +lx_getppid(void) +{ + return (lx_lwp_ppid(ttolwp(curthread), NULL, NULL)); +} + +/* + * return the thread id + */ +long +lx_gettid(void) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + return (lwpd->br_pid); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_id.c b/usr/src/uts/common/brand/lx/syscall/lx_id.c new file mode 100644 index 0000000000..077194ee25 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_id.c @@ -0,0 +1,297 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/zone.h>
+#include <sys/cred_impl.h>
+#include <sys/policy.h>
+
+/* Linux id types: the 16-bit legacy forms and the 32-bit forms. */
+typedef ushort_t l_uid16_t;
+typedef ushort_t l_gid16_t;
+typedef uint_t l_uid_t;
+typedef uint_t l_gid_t;
+
+/*
+ * Widen a 16-bit id to 32 bits, mapping the 16-bit -1 ("leave unchanged")
+ * to the 32-bit -1 instead of to 0xffff.
+ */
+#define LINUX_UID16_TO_UID32(uid16) \
+	(((uid16) == (l_uid16_t)-1) ? ((l_uid_t)-1) : (l_uid_t)(uid16))
+
+#define LINUX_GID16_TO_GID32(gid16) \
+	(((gid16) == (l_gid16_t)-1) ? ((l_gid_t)-1) : (l_gid_t)(gid16))
+
+#define LX_NGROUPS_MAX 32
+extern int setgroups(int, gid_t *);
+
+/*
+ * This function is based on setreuid in common/syscall/uid.c and exists
+ * because Solaris does not have a way to explicitly set the saved uid (suid)
+ * from any other system call.
+ *
+ * An argument of -1 leaves the corresponding id unchanged.
+ */
+long
+lx_setresuid(l_uid_t ruid, l_uid_t euid, l_uid_t suid)
+{
+	proc_t *p;
+	int error = 0;
+	int do_nocd = 0;
+	int uidchge = 0;
+	uid_t oldruid = ruid;
+	cred_t *cr, *newcr;
+	zoneid_t zoneid = getzoneid();
+
+	/*
+	 * NOTE(review): l_uid_t is typedef'd from uint_t above, so the
+	 * "< 0" comparisons look like they can never be true -- confirm.
+	 */
+	if ((ruid != -1 && (ruid < 0 || ruid > MAXUID)) ||
+	    (euid != -1 && (euid < 0 || euid > MAXUID)) ||
+	    (suid != -1 && (suid < 0 || suid > MAXUID))) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * Need to pre-allocate the new cred structure before grabbing
+	 * the p_crlock mutex.
+	 */
+	newcr = cralloc();
+
+	p = ttoproc(curthread);
+
+retry:
+	mutex_enter(&p->p_crlock);
+	cr = p->p_cred;
+
+	/*
+	 * Each requested id must match one of the current real, effective
+	 * or saved uids, unless secpolicy_allow_setid() permits it.
+	 */
+	if (ruid != -1 &&
+	    ruid != cr->cr_ruid && ruid != cr->cr_uid &&
+	    ruid != cr->cr_suid && secpolicy_allow_setid(cr, ruid, B_FALSE)) {
+		error = EPERM;
+	} else if (euid != -1 &&
+	    euid != cr->cr_ruid && euid != cr->cr_uid &&
+	    euid != cr->cr_suid && secpolicy_allow_setid(cr, euid, B_FALSE)) {
+		error = EPERM;
+	} else if (suid != -1 &&
+	    suid != cr->cr_ruid && suid != cr->cr_uid &&
+	    suid != cr->cr_suid && secpolicy_allow_setid(cr, suid, B_FALSE)) {
+		error = EPERM;
+	} else {
+		if (!uidchge && ruid != -1 && cr->cr_ruid != ruid) {
+			/*
+			 * The ruid of the process is going to change. In order
+			 * to avoid a race condition involving the
+			 * process count associated with the newly given ruid,
+			 * we increment the count before assigning the
+			 * credential to the process.
+			 * To do that, we'll have to take pidlock, so we first
+			 * release p_crlock.
+			 */
+			mutex_exit(&p->p_crlock);
+			uidchge = 1;
+			mutex_enter(&pidlock);
+			upcount_inc(ruid, zoneid);
+			mutex_exit(&pidlock);
+			/*
+			 * As we released p_crlock we can't rely on the cr
+			 * we read. So retry the whole thing.
+			 */
+			goto retry;
+		}
+		/* keep the old cred alive while it is copied and compared */
+		crhold(cr);
+		crcopy_to(cr, newcr);
+		p->p_cred = newcr;
+
+		if (euid != -1)
+			newcr->cr_uid = euid;
+		if (suid != -1)
+			newcr->cr_suid = suid;
+		if (ruid != -1) {
+			oldruid = newcr->cr_ruid;
+			newcr->cr_ruid = ruid;
+			ASSERT(ruid != oldruid ? uidchge : 1);
+		}
+
+		/*
+		 * A process that gives up its privilege
+		 * must be marked to produce no core dump.
+		 */
+		if ((cr->cr_uid != newcr->cr_uid ||
+		    cr->cr_ruid != newcr->cr_ruid ||
+		    cr->cr_suid != newcr->cr_suid))
+			do_nocd = 1;
+
+		crfree(cr);
+	}
+	mutex_exit(&p->p_crlock);
+
+	/*
+	 * We decrement the number of processes associated with the oldruid
+	 * to match the increment above, even if the ruid of the process
+	 * did not change or an error occurred (oldruid == uid).
+	 */
+	if (uidchge) {
+		ASSERT(oldruid != -1 && ruid != -1);
+		mutex_enter(&pidlock);
+		upcount_dec(oldruid, zoneid);
+		mutex_exit(&pidlock);
+	}
+
+	if (error == 0) {
+		if (do_nocd) {
+			mutex_enter(&p->p_lock);
+			p->p_flag |= SNOCD;
+			mutex_exit(&p->p_lock);
+		}
+		crset(p, newcr);	/* broadcast to process threads */
+		goto done;
+	}
+	/* failure: the pre-allocated cred was never installed */
+	crfree(newcr);
+done:
+	if (error)
+		return (set_errno(error));
+	else
+		return (0);
+}
+
+/*
+ * 16-bit variant: widen each id (preserving -1) and call lx_setresuid().
+ */
+long
+lx_setresuid16(l_uid16_t ruid16, l_uid16_t euid16, l_uid16_t suid16)
+{
+	long rval;
+
+	rval = lx_setresuid(
+	    LINUX_UID16_TO_UID32(ruid16),
+	    LINUX_UID16_TO_UID32(euid16),
+	    LINUX_UID16_TO_UID32(suid16));
+
+	return (rval);
+}
+
+/*
+ * This function is based on setregid in common/syscall/gid.c
+ *
+ * An argument of -1 leaves the corresponding id unchanged.
+ */
+long
+lx_setresgid(l_gid_t rgid, l_gid_t egid, l_gid_t sgid)
+{
+	proc_t *p;
+	int error = 0;
+	int do_nocd = 0;
+	cred_t *cr, *newcr;
+
+	/*
+	 * NOTE(review): l_gid_t is typedef'd from uint_t above, so the
+	 * "< 0" comparisons look like they can never be true -- confirm.
+	 */
+	if ((rgid != -1 && (rgid < 0 || rgid > MAXUID)) ||
+	    (egid != -1 && (egid < 0 || egid > MAXUID)) ||
+	    (sgid != -1 && (sgid < 0 || sgid > MAXUID))) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * Need to pre-allocate the new cred structure before grabbing
+	 * the p_crlock mutex.
+	 */
+	newcr = cralloc();
+
+	p = ttoproc(curthread);
+	mutex_enter(&p->p_crlock);
+	cr = p->p_cred;
+
+	/*
+	 * Each requested id must match one of the current real, effective
+	 * or saved gids, unless secpolicy_allow_setid() permits it.
+	 */
+	if (rgid != -1 &&
+	    rgid != cr->cr_rgid && rgid != cr->cr_gid &&
+	    rgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+		error = EPERM;
+	} else if (egid != -1 &&
+	    egid != cr->cr_rgid && egid != cr->cr_gid &&
+	    egid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+		error = EPERM;
+	} else if (sgid != -1 &&
+	    sgid != cr->cr_rgid && sgid != cr->cr_gid &&
+	    sgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+		error = EPERM;
+	} else {
+		/* keep the old cred alive while it is copied and compared */
+		crhold(cr);
+		crcopy_to(cr, newcr);
+		p->p_cred = newcr;
+
+		if (egid != -1)
+			newcr->cr_gid = egid;
+		if (sgid != -1)
+			newcr->cr_sgid = sgid;
+		if (rgid != -1)
+			newcr->cr_rgid = rgid;
+
+		/*
+		 * A process that gives up its privilege
+		 * must be marked to produce no core dump.
+		 */
+		if ((cr->cr_gid != newcr->cr_gid ||
+		    cr->cr_rgid != newcr->cr_rgid ||
+		    cr->cr_sgid != newcr->cr_sgid))
+			do_nocd = 1;
+
+		crfree(cr);
+	}
+	mutex_exit(&p->p_crlock);
+
+	if (error == 0) {
+		if (do_nocd) {
+			mutex_enter(&p->p_lock);
+			p->p_flag |= SNOCD;
+			mutex_exit(&p->p_lock);
+		}
+		crset(p, newcr);	/* broadcast to process threads */
+		goto done;
+	}
+	/* failure: the pre-allocated cred was never installed */
+	crfree(newcr);
+done:
+	if (error)
+		return (set_errno(error));
+	else
+		return (0);
+}
+
+/*
+ * 16-bit variant: widen each id (preserving -1) and call lx_setresgid().
+ */
+long
+lx_setresgid16(l_gid16_t rgid16, l_gid16_t egid16, l_gid16_t sgid16)
+{
+	long rval;
+
+	rval = lx_setresgid(
+	    LINUX_GID16_TO_GID32(rgid16),
+	    LINUX_GID16_TO_GID32(egid16),
+	    LINUX_GID16_TO_GID32(sgid16));
+
+	return (rval);
+}
+
+/*
+ * Linux defines NGROUPS_MAX to be 32, but on Solaris it is only 16. We employ
+ * the terrible hack below so that tests may proceed, if only on DEBUG kernels.
+ */ +long +lx_setgroups(int ngroups, gid_t *grouplist) +{ +#ifdef DEBUG + if (ngroups > ngroups_max && ngroups <= LX_NGROUPS_MAX) + ngroups = ngroups_max; +#endif /* DEBUG */ + + return (setgroups(ngroups, grouplist)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_kill.c b/usr/src/uts/common/brand/lx/syscall/lx_kill.c new file mode 100644 index 0000000000..d86d50f4e6 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_kill.c @@ -0,0 +1,249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/thread.h> +#include <sys/signal.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_pid.h> +#include <lx_signum.h> + +extern int kill(pid_t, int); + +/* + * Check if it is legal to send this signal to the init process. Linux + * kill(2) semantics dictate that no _unhandled_ signal may be sent to pid + * 1. 
+ */ +static int +init_sig_check(int sig, pid_t pid) +{ + proc_t *p; + int rv = 0; + + mutex_enter(&pidlock); + + if (((p = prfind(pid)) == NULL) || (p->p_stat == SIDL)) + rv = ESRCH; + else if (sig && (sigismember(&cantmask, sig) || + (PTOU(p)->u_signal[sig-1] == SIG_DFL) || + (PTOU(p)->u_signal[sig-1] == SIG_IGN))) + rv = EPERM; + + mutex_exit(&pidlock); + + return (rv); +} + +long +lx_tkill(pid_t pid, int lx_sig) +{ + kthread_t *t; + proc_t *pp; + pid_t initpid; + sigqueue_t *sqp; + struct lx_lwp_data *br = ttolxlwp(curthread); + int tid = 1; /* default tid */ + int sig, rv; + + /* + * Unlike kill(2), Linux tkill(2) doesn't allow signals to + * be sent to process IDs <= 0 as it doesn't overlay any special + * semantics on the pid. + */ + if ((pid <= 0) || ((lx_sig < 0) || (lx_sig >= LX_NSIG)) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + /* + * If the Linux pid is 1, translate the pid to the actual init + * pid for the zone. Note that Linux dictates that no unhandled + * signals may be sent to init, so check for that, too. + * + * Otherwise, extract the tid and real pid from the Linux pid. + */ + initpid = curproc->p_zone->zone_proc_initpid; + if (pid == 1) + pid = initpid; + if ((pid == initpid) && ((rv = init_sig_check(sig, pid)) != 0)) + return (set_errno(rv)); + else if (lx_lpid_to_spair(pid, &pid, &tid) < 0) + return (set_errno(ESRCH)); + + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + + /* + * Find the process for the passed pid... 
+ */ + mutex_enter(&pidlock); + if (((pp = prfind(pid)) == NULL) || (pp->p_stat == SIDL)) { + mutex_exit(&pidlock); + rv = set_errno(ESRCH); + goto free_and_exit; + } + mutex_enter(&pp->p_lock); + mutex_exit(&pidlock); + + /* + * Deny permission to send the signal if either of the following + * is true: + * + * + The signal is SIGCONT and the target pid is not in the same + * session as the sender + * + * + prochasprocperm() shows the user lacks sufficient permission + * to send the signal to the target pid + */ + if (((sig == SIGCONT) && (pp->p_sessp != curproc->p_sessp)) || + (!prochasprocperm(pp, curproc, CRED()))) { + mutex_exit(&pp->p_lock); + rv = set_errno(EPERM); + goto free_and_exit; + } + + /* check for the tid */ + if ((t = idtot(pp, tid)) == NULL) { + mutex_exit(&pp->p_lock); + rv = set_errno(ESRCH); + goto free_and_exit; + } + + /* a signal of 0 means just check for the existence of the thread */ + if (lx_sig == 0) { + mutex_exit(&pp->p_lock); + rv = 0; + goto free_and_exit; + } + + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = SI_LWP; + sqp->sq_info.si_pid = br->br_pid; + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(pp, t, sqp); + + mutex_exit(&pp->p_lock); + + return (0); + +free_and_exit: + kmem_free(sqp, sizeof (sigqueue_t)); + return (rv); +} + +long +lx_kill(pid_t lx_pid, int lx_sig) +{ + pid_t s_pid, initpid; + sigsend_t v; + zone_t *zone = curproc->p_zone; + struct proc *p; + int err, sig, nfound; + + if ((lx_sig < 0) || (lx_sig >= LX_NSIG) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + /* + * Since some linux apps rely on init(1M) having PID 1, we + * transparently translate 1 to the real init(1M)'s pid. We then + * check to be sure that it is legal for this process to send this + * signal to init(1M). 
+ */ + initpid = zone->zone_proc_initpid; + if (lx_pid == 1 || lx_pid == -1) { + s_pid = initpid; + } else if (lx_pid == 0) { + s_pid = 0; + } else { + if (lx_pid < 0) + err = lx_lpid_to_spair(-lx_pid, &s_pid, NULL); + else + err = lx_lpid_to_spair(lx_pid, &s_pid, NULL); + + /* + * If we didn't find this pid in our hash table, it either + * means that the process doesn't exist, that it exists but + * isn't a Linux process, or that it is a zombie process. + * In each of these cases, assuming that the Linux pid is + * the same as the Solaris pid will get us the correct + * behavior. + */ + if (err < 0) + s_pid = lx_pid; + } + + if ((s_pid == initpid) && ((err = init_sig_check(sig, s_pid)) != 0)) + return (set_errno(err)); + + /* + * For individual processes, kill() semantics are the same between + * Solaris and Linux. + */ + if (lx_pid >= 0) + return (kill(s_pid, sig)); + + /* + * In Solaris, sending a signal to -pid means "send a signal to + * everyone in process group pid." In Linux it means "send a + * signal to everyone in the group other than init." Sending a + * signal to -1 means "send a signal to every process except init + * and myself." + */ + + bzero(&v, sizeof (v)); + v.sig = sig; + v.checkperm = 1; + v.sicode = SI_USER; + err = 0; + + mutex_enter(&pidlock); + + p = (lx_pid == -1) ? practive : pgfind(s_pid); + nfound = 0; + while (err == 0 && p != NULL) { + if ((p->p_zone == zone) && (p->p_stat != SIDL) && + (p->p_pid != initpid) && (lx_pid < -1 || p != curproc)) { + nfound++; + err = sigsendproc(p, &v); + } + + p = (lx_pid == -1) ? p->p_next : p->p_pglink; + } + mutex_exit(&pidlock); + if (nfound == 0) + err = ESRCH; + else if (err == 0 && v.perm == 0) + err = EPERM; + return (err ? 
set_errno(err) : 0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c new file mode 100644 index 0000000000..aa6e12a7d8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/segments.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/sysi86.h> +#include <sys/cmn_err.h> +#include <sys/lx_ldt.h> + +/* + * Read the ldt_info structure in from the Linux app, convert it to an ssd + * structure, and then call setdscr() to do all the heavy lifting. 
+ */
+static int
+write_ldt(void *data, ulong_t count)
+{
+	user_desc_t usd;
+	struct ssd ssd;
+	struct ldt_info ldt_inf;
+	proc_t *pp = curthread->t_procp;
+	int err;
+
+	/* the caller must pass exactly one ldt_info structure */
+	if (count != sizeof (ldt_inf))
+		return (set_errno(EINVAL));
+
+	if (copyin(data, &ldt_inf, sizeof (ldt_inf)))
+		return (set_errno(EFAULT));
+
+	if (ldt_inf.entry_number >= MAXNLDT)
+		return (set_errno(EINVAL));
+
+	LDT_INFO_TO_DESC(&ldt_inf, &usd);
+	usd_to_ssd(&usd, &ssd, SEL_LDT(ldt_inf.entry_number));
+
+	/*
+	 * Get everyone into a safe state before changing the LDT.
+	 */
+	if (!holdlwps(SHOLDFORK1))
+		return (set_errno(EINTR));
+
+	err = setdscr(&ssd);
+
+	/*
+	 * Release the hounds!
+	 */
+	mutex_enter(&pp->p_lock);
+	continuelwps(pp);
+	mutex_exit(&pp->p_lock);
+
+	return (err ? set_errno(err) : 0);
+}
+
+/*
+ * Copy up to count bytes of the process's LDT out to uptr.  Returns the
+ * number of bytes copied, or 0 if the process has no LDT.
+ */
+static int
+read_ldt(void *uptr, ulong_t count)
+{
+	proc_t *pp = curproc;
+	int bytes;
+
+	if (pp->p_ldt == NULL)
+		return (0);
+
+	/* size of the whole table, truncated to the caller's buffer */
+	bytes = (pp->p_ldtlimit + 1) * sizeof (user_desc_t);
+	if (bytes > count)
+		bytes = count;
+
+	if (copyout(pp->p_ldt, uptr, bytes))
+		return (set_errno(EFAULT));
+
+	return (bytes);
+}
+
+/*
+ * modify_ldt(2) emulation: op 0 reads the LDT, op 1 writes one entry,
+ * and any other op fails with ENOSYS.
+ */
+long
+lx_modify_ldt(int op, void *data, ulong_t count)
+{
+	int rval;
+
+	switch (op) {
+	case 0:
+		rval = read_ldt(data, count);
+		break;
+
+	case 1:
+		rval = write_ldt(data, count);
+		break;
+
+	default:
+		rval = set_errno(ENOSYS);
+		break;
+	}
+
+	return (rval);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sched.c b/usr/src/uts/common/brand/lx/syscall/lx_sched.c
new file mode 100644
index 0000000000..7b1cd49f37
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_sched.c
@@ -0,0 +1,513 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/cpu.h> +#include <sys/rtpriocntl.h> +#include <sys/tspriocntl.h> +#include <sys/processor.h> +#include <sys/brand.h> +#include <sys/lx_pid.h> +#include <sys/lx_sched.h> +#include <sys/lx_brand.h> + +extern long priocntl_common(int, procset_t *, int, caddr_t, caddr_t, uio_seg_t); + +int +lx_sched_affinity(int cmd, uintptr_t pid, int len, uintptr_t maskp, + int64_t *rval) +{ + pid_t s_pid; + id_t s_tid; + kthread_t *t = curthread; + lx_lwp_data_t *lx_lwp; + + if (cmd != B_GET_AFFINITY_MASK && cmd != B_SET_AFFINITY_MASK) + return (set_errno(EINVAL)); + + /* + * The caller wants to know how large the mask should be. + */ + if (cmd == B_GET_AFFINITY_MASK && len == 0) { + *rval = sizeof (lx_affmask_t); + return (0); + } + + /* + * Otherwise, ensure they have a large enough mask. + */ + if (cmd == B_GET_AFFINITY_MASK && len < sizeof (lx_affmask_t)) { + *rval = -1; + return (set_errno(EINVAL)); + } + + if (pid == 0) { + s_pid = curproc->p_pid; + s_tid = curthread->t_tid; + } else if (lx_lpid_to_spair((pid_t)pid, &s_pid, &s_tid) == -1) { + return (set_errno(ESRCH)); + } + + /* + * For now, we only support manipulating threads in the + * same process. 
+ */ + if (curproc->p_pid != s_pid) + return (set_errno(EPERM)); + + /* + * We must hold the process lock so that the thread list + * doesn't change while we're looking at it. We'll hold + * the lock until we no longer reference the + * corresponding lwp. + */ + + mutex_enter(&curproc->p_lock); + + do { + if (t->t_tid == s_tid) + break; + t = t->t_forw; + } while (t != curthread); + + /* + * If the given PID is in the current thread's process, + * then we _must_ find it in the process's thread list. + */ + ASSERT(t->t_tid == s_tid); + + lx_lwp = t->t_lwp->lwp_brand; + + if (cmd == B_SET_AFFINITY_MASK) { + if (copyin_nowatch((void *)maskp, &lx_lwp->br_affinitymask, + sizeof (lx_affmask_t)) != 0) { + mutex_exit(&curproc->p_lock); + return (set_errno(EFAULT)); + } + + *rval = 0; + } else { + if (copyout_nowatch(&lx_lwp->br_affinitymask, (void *)maskp, + sizeof (lx_affmask_t)) != 0) { + mutex_exit(&curproc->p_lock); + return (set_errno(EFAULT)); + } + + *rval = sizeof (lx_affmask_t); + } + + mutex_exit(&curproc->p_lock); + return (0); +} + +long +lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int prio, maxupri; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if (rv = sched_setprocset(&procset, pid)) + return (rv); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + if (policy < 0) { + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the current policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if 
(strcmp(pcinfo.pc_clname, "TS") == 0) + policy = LX_SCHED_OTHER; + else if (strcmp(pcinfo.pc_clname, "RT") == 0) + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + else + return (set_errno(EINVAL)); + } + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getscheduler(l_pid_t pid) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int policy; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if (rv = sched_setprocset(&procset, pid)) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify 
the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) + policy = LX_SCHED_OTHER; + else if (strcmp(pcinfo.pc_clname, "RT") == 0) + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + else + policy = set_errno(EINVAL); + + return (policy); +} + +long +lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int policy; + int prio, maxupri; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if (rv = sched_setprocset(&procset, pid)) + return (rv); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the current policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) + policy = LX_SCHED_OTHER; + else if (strcmp(pcinfo.pc_clname, "RT") == 0) + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? 
LX_SCHED_FIFO : LX_SCHED_RR; + else + return (set_errno(EINVAL)); + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_sched_param local_param; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + tsinfo_t *tsi; + int prio, scale; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if (rv = sched_setprocset(&procset, pid)) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + 
(void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + bzero(&local_param, sizeof (local_param)); + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + /* + * I don't know if we need to do this, coz it can't be + * changed from zero anyway..... + */ + tsi = (tsinfo_t *)pcinfo.pc_clinfo; + prio = ((tsparms_t *)pcparm.pc_clparms)->ts_upri; + scale = tsi->ts_maxupri; + if (scale == 0) + local_param.lx_sched_prio = 0; + else + local_param.lx_sched_prio = -(prio * 20) / scale; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) + local_param.lx_sched_prio = + ((rtparms_t *)pcparm.pc_clparms)->rt_pri; + else + rv = set_errno(EINVAL); + + if (rv == 0) + if (copyout(&local_param, param, sizeof (local_param))) + return (set_errno(EFAULT)); + + return (rv); +} + +long +lx_sched_rr_get_interval(l_pid_t pid, struct timespec *ival) +{ + klwp_t *lwp = ttolwp(curthread); + struct timespec interval; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if (rv = sched_setprocset(&procset, pid)) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + setprocset(&procset, POP_AND, P_PID, 0, P_ALL, 0); + bzero(&pcinfo, sizeof (pcinfo)); + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) do_priocntlsys(PC_GETCID, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (pcparm.pc_cid == pcinfo.pc_cid && + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs != RT_TQINF) { + interval.tv_sec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqsecs; + interval.tv_nsec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs; + + if (copyout(&interval, ival, sizeof (interval))) + return (set_errno(EFAULT)); + + return (0); + } + + return (set_errno(EINVAL)); +} + +int 
+sched_setprocset(procset_t *procset, l_pid_t pid) +{ + id_t lid, rid; + idtype_t lidtype, ridtype; + + /* + * define the target lwp + */ + if (pid == 0) { + ridtype = P_ALL; + lidtype = P_PID; + rid = 0; + lid = P_MYID; + } else { + if (lx_lpid_to_spair(pid, &pid, &lid) < 0) + return (set_errno(ESRCH)); + if (pid != curproc->p_pid) + return (set_errno(ESRCH)); + rid = 0; + ridtype = P_ALL; + lidtype = P_LWPID; + } + setprocset(procset, POP_AND, lidtype, lid, ridtype, rid); + + return (0); +} + +long +do_priocntlsys(int cmd, procset_t *procset, void *arg) +{ + return (priocntl_common(PC_VERSION, procset, cmd, (caddr_t)arg, 0, + UIO_SYSSPACE)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c new file mode 100644 index 0000000000..9fdb734805 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c @@ -0,0 +1,118 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <vm/anon.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/zone.h> +#include <sys/time.h> + +struct lx_sysinfo { + int32_t si_uptime; /* Seconds since boot */ + uint32_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ + uint32_t si_totalram; /* Total memory size */ + uint32_t si_freeram; /* Available memory */ + uint32_t si_sharedram; /* Shared memory */ + uint32_t si_bufferram; /* Buffer memory */ + uint32_t si_totalswap; /* Total swap space */ + uint32_t si_freeswap; /* Avail swap space */ + uint16_t si_procs; /* Process count */ + uint32_t si_totalhigh; /* High memory size */ + uint32_t si_freehigh; /* Avail high memory */ + uint32_t si_mem_unit; /* Unit size of memory fields */ +}; + +long +lx_sysinfo(struct lx_sysinfo *sip) +{ + struct lx_sysinfo si; + hrtime_t birthtime; + zone_t *zone = curthread->t_procp->p_zone; + proc_t *init_proc; + + /* + * We don't record the time a zone was booted, so we use the + * birthtime of that zone's init process instead. + */ + mutex_enter(&pidlock); + init_proc = prfind(zone->zone_proc_initpid); + if (init_proc != NULL) + birthtime = init_proc->p_mstart; + else + birthtime = p0.p_mstart; + mutex_exit(&pidlock); + si.si_uptime = (gethrtime() - birthtime) / NANOSEC; + + /* + * We scale down the load in avenrun to allow larger load averages + * to fit in 32 bits. Linux doesn't, so we remove the scaling + * here. + */ + si.si_loads[0] = avenrun[0] << FSHIFT; + si.si_loads[1] = avenrun[1] << FSHIFT; + si.si_loads[2] = avenrun[2] << FSHIFT; + + /* + * In linux each thread looks like a process, so we conflate the + * two in this stat as well. + */ + si.si_procs = (int32_t)zone->zone_nlwps; + + /* + * If the maximum memory stat is less than 2^20 pages (i.e. 4GB), + * then we report the result in bytes. Otherwise we use pages. + * Once we start supporting >1TB x86 systems, we'll need a third + * option. 
+ */ + if (MAX(physmem, k_anoninfo.ani_max) < 1024 * 1024) { + si.si_totalram = physmem * PAGESIZE; + si.si_freeram = freemem * PAGESIZE; + si.si_totalswap = k_anoninfo.ani_max * PAGESIZE; + si.si_freeswap = k_anoninfo.ani_free * PAGESIZE; + si.si_mem_unit = 1; + } else { + si.si_totalram = physmem; + si.si_freeram = freemem; + si.si_totalswap = k_anoninfo.ani_max; + si.si_freeswap = k_anoninfo.ani_free; + si.si_mem_unit = PAGESIZE; + } + si.si_bufferram = 0; + si.si_sharedram = 0; + + /* + * These two stats refer to high physical memory. If an + * application running in a Linux zone cares about this, then + * either it or we are broken. + */ + si.si_totalhigh = 0; + si.si_freehigh = 0; + + if (copyout(&si, sip, sizeof (si)) != 0) + return (set_errno(EFAULT)); + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c new file mode 100644 index 0000000000..f9751819f9 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/cpuvar.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_ldt.h> + +long +lx_get_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + + if (fuword32(&inf->entry_number, (uint32_t *)&entry)) + return (set_errno(EFAULT)); + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + /* + * convert the solaris ldt to the linux format expected by the + * caller + */ + DESC_TO_LDT_INFO(dscrp, &ldt_inf); + ldt_inf.entry_number = entry; + + if (copyout(&ldt_inf, inf, sizeof (struct ldt_info))) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_set_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + int i; + + if (copyin(inf, &ldt_inf, sizeof (ldt_inf))) + return (set_errno(EFAULT)); + + entry = ldt_inf.entry_number; + if (entry == -1) { + /* + * find an empty entry in the tls for this thread + */ + for (i = 0, dscrp = jlwp->br_tls; + i < LX_TLSNUM; i++, dscrp++) + if (((unsigned long *)dscrp)[0] == 0 && + ((unsigned long *)dscrp)[1] == 0) + break; + + if (i < LX_TLSNUM) { + /* + * found one + */ + entry = i + GDT_TLSMIN; + if (suword32(&inf->entry_number, entry)) + return (set_errno(EFAULT)); + } else { + return (set_errno(ESRCH)); + } + } + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + /* + * convert the linux ldt info to standard intel descriptor + */ + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + if (LDT_INFO_EMPTY(&ldt_inf)) { + ((unsigned long *)dscrp)[0] = 0; + ((unsigned long *)dscrp)[1] = 0; + } else { + LDT_INFO_TO_DESC(&ldt_inf, dscrp); + } + + /* + * 
update the gdt with the new descriptor + */ + kpreempt_disable(); + + for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) + lx_set_gdt(GDT_TLSMIN + i, dscrp); + + kpreempt_enable(); + + return (0); +} diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c new file mode 100644 index 0000000000..a46ea3c979 --- /dev/null +++ b/usr/src/uts/common/brand/sn1/sn1_brand.c @@ -0,0 +1,288 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/errno.h> +#include <sys/exec.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/model.h> +#include <sys/proc.h> +#include <sys/syscall.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cmn_err.h> +#include <sys/archsystm.h> + +#include <sys/machbrand.h> +#include <sys/brand.h> +#include "sn1_brand.h" + +char *sn1_emulation_table = NULL; + +void sn1_setbrand(proc_t *); +int sn1_getattr(zone_t *, int, void *, size_t *); +int sn1_setattr(zone_t *, int, void *, size_t); +int sn1_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, + uintptr_t, uintptr_t, uintptr_t); +void sn1_copy_procdata(proc_t *, proc_t *); +void sn1_proc_exit(struct proc *, klwp_t *); +void sn1_exec(); +int sn1_initlwp(klwp_t *); +void sn1_forklwp(klwp_t *, klwp_t *); +void sn1_freelwp(klwp_t *); +void sn1_lwpexit(klwp_t *); +int sn1_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, + long *, int, caddr_t, cred_t *, int); + +/* sn1 brand */ +struct brand_ops sn1_brops = { + sn1_brandsys, + sn1_setbrand, + sn1_getattr, + sn1_setattr, + sn1_copy_procdata, + sn1_proc_exit, + sn1_exec, + lwp_setrval, + sn1_initlwp, + sn1_forklwp, + sn1_freelwp, + sn1_lwpexit, + sn1_elfexec +}; + +#ifdef sparc + +struct brand_mach_ops sn1_mops = { + sn1_brand_syscall_callback, + sn1_brand_syscall_callback +}; + +#else /* sparc */ + +#ifdef __amd64 + +struct brand_mach_ops sn1_mops = { + sn1_brand_sysenter_callback, + NULL, + sn1_brand_int91_callback, + sn1_brand_syscall_callback, + sn1_brand_syscall32_callback, + NULL +}; + +#else /* ! 
__amd64 */ + +struct brand_mach_ops sn1_mops = { + sn1_brand_sysenter_callback, + NULL, + NULL, + sn1_brand_syscall_callback, + NULL, + NULL +}; +#endif /* __amd64 */ + +#endif /* sparc */ + +struct brand sn1_brand = { + BRAND_VER_1, + "sn1", + &sn1_brops, + &sn1_mops +}; + +static struct modlbrand modlbrand = { + &mod_brandops, "Solaris N-1 Brand %I%", &sn1_brand +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlbrand, NULL +}; + +void +sn1_setbrand(proc_t *p) +{ + p->p_brand_data = NULL; + p->p_brand = &sn1_brand; +} + +/* ARGSUSED */ +int +sn1_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) +{ + return (EINVAL); +} + +/* ARGSUSED */ +int +sn1_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) +{ + return (EINVAL); +} + +/* + * Get the address of the user-space system call handler from the user + * process and attach it to the proc structure. + */ +/*ARGSUSED*/ +int +sn1_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) +{ + proc_t *p = curproc; + *rval = 0; + + if (cmd == B_REGISTER) { + p->p_brand = &sn1_brand; + p->p_brand_data = (void *) arg1; + return (0); + } + + ASSERT(p->p_brand == &sn1_brand); + + return (EINVAL); +} + +/* + * Copy the per-process brand data from a parent proc to a child. In the + * sn1 brand, the only per-process state is the address of the user-space + * handler. 
+ */ +void +sn1_copy_procdata(proc_t *child, proc_t *parent) +{ + child->p_brand_data = parent->p_brand_data; +} + +/*ARGSUSED*/ +void +sn1_proc_exit(struct proc *p, klwp_t *l) +{ + p->p_brand_data = NULL; + p->p_brand = &native_brand; +} + +void +sn1_exec() +{ + curproc->p_brand_data = NULL; +} + +/*ARGSUSED*/ +int +sn1_initlwp(klwp_t *l) +{ + return (0); +} + +/*ARGSUSED*/ +void +sn1_forklwp(klwp_t *p, klwp_t *c) +{ +} + +/*ARGSUSED*/ +void +sn1_freelwp(klwp_t *l) +{ +} + +/*ARGSUSED*/ +void +sn1_lwpexit(klwp_t *l) +{ +} + +int +sn1_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, + int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, + int brand_action) +{ + args->brandname = "sn1"; + return ((args->execswp->exec_func)(vp, uap, args, idatap, level + 1, + execsz, setid, exec_file, cred, brand_action)); +} + + +int +_init(void) +{ + int err; + +#if defined(sparc) && !defined(DEBUG) + cmn_err(CE_WARN, "The sn1 brand is only supported on DEBUG kernels."); + return (ENOTSUP); +#else + + /* + * Set up the table indicating which system calls we want to + * interpose on. We should probably build this automatically from + * a list of system calls that is shared with the user-space + * library. + */ + sn1_emulation_table = kmem_zalloc(NSYSCALL, KM_SLEEP); + sn1_emulation_table[SYS_uname] = 1; + sn1_emulation_table[SYS_fork1] = 1; + + err = mod_install(&modlinkage); + if (err) { + cmn_err(CE_WARN, "Couldn't install brand module"); + kmem_free(sn1_emulation_table, NSYSCALL); + } + + return (err); +#endif +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + /* + * If there are any zones using this brand, we can't allow it to be + * unloaded. 
+ */ + if (brand_zone_count(&sn1_brand)) + return (EBUSY); + + kmem_free(sn1_emulation_table, NSYSCALL); + sn1_emulation_table = NULL; + + err = mod_remove(&modlinkage); + if (err) + cmn_err(CE_WARN, "Couldn't unload sn1 brand module"); + + return (err); +} diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.h b/usr/src/uts/common/brand/sn1/sn1_brand.h new file mode 100644 index 0000000000..a4efca189b --- /dev/null +++ b/usr/src/uts/common/brand/sn1/sn1_brand.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SN1_BRAND_H +#define _SN1_BRAND_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +void sn1_brand_syscall_callback(void); +void sn1_brand_sysenter_callback(void); +void sn1_brand_int91_callback(void); +#ifdef __amd64 +void sn1_brand_syscall32_callback(void); +#endif + +extern struct brand *sbrand; + +#ifdef __cplusplus +} +#endif + +#endif /* _SN1_BRAND_H */ diff --git a/usr/src/uts/common/c2/audit_event.c b/usr/src/uts/common/c2/audit_event.c index 4ee95e1728..b45da7bf26 100644 --- a/usr/src/uts/common/c2/audit_event.c +++ b/usr/src/uts/common/c2/audit_event.c @@ -158,6 +158,7 @@ static void aus_sigqueue(struct t_audit_data *); static void aus_p_online(struct t_audit_data *); static void aus_processor_bind(struct t_audit_data *); static void aus_inst_sync(struct t_audit_data *); +static void aus_brandsys(struct t_audit_data *); static void auf_accept(struct t_audit_data *, int, rval_t *); @@ -270,7 +271,7 @@ aui_null, AUE_FSTATFS, aus_fstatfs, /* 38 fstatfs */ auf_null, S2E_PUB, aui_null, AUE_SETPGRP, aus_null, /* 39 setpgrp */ auf_null, 0, -aui_null, AUE_NULL, aus_null, /* 40 (loadable) was cxenix */ +aui_null, AUE_NULL, aus_null, /* 40 uucopystr */ auf_null, 0, aui_null, AUE_NULL, aus_null, /* 41 dup */ auf_null, 0, @@ -564,7 +565,7 @@ aui_null, AUE_NULL, aus_null, /* 175 llseek */ aui_null, AUE_INST_SYNC, aus_inst_sync, /* 176 (loadable) */ /* aus_inst_sync */ auf_null, 0, -aui_null, AUE_NULL, aus_null, /* 177 (loadable) */ +aui_null, AUE_BRANDSYS, aus_brandsys, /* 177 brandsys */ auf_null, 0, aui_null, AUE_NULL, aus_null, /* 178 (loadable) */ auf_null, 0, @@ -718,8 +719,7 @@ aui_null, AUE_NULL, aus_null, /* 252 lwp_mutex_init */ auf_null, 0, aui_null, AUE_NULL, aus_null, /* 253 cladm */ auf_null, 0, -aui_null, AUE_NULL, aus_null, /* 254 (loadable) */ - /* was lwp_sigtimedwait */ +aui_null, AUE_NULL, aus_null, /* 254 uucopy */ auf_null, 0, aui_null, AUE_UMOUNT2, aus_umount2, /* 255 umount2 */ auf_null, 0 @@ 
-4706,6 +4706,40 @@ aus_inst_sync(struct t_audit_data *tad) /*ARGSUSED*/ static void +aus_brandsys(struct t_audit_data *tad) +{ + klwp_t *clwp = ttolwp(curthread); + + struct a { + long cmd; + long arg1; + long arg2; + long arg3; + long arg4; + long arg5; + long arg6; + } *uap = (struct a *)clwp->lwp_ap; + + au_uwrite(au_to_arg32(1, "cmd", (uint_t)uap->cmd)); +#ifdef _LP64 + au_uwrite(au_to_arg64(2, "arg1", (uint64_t)uap->arg1)); + au_uwrite(au_to_arg64(3, "arg2", (uint64_t)uap->arg2)); + au_uwrite(au_to_arg64(4, "arg3", (uint64_t)uap->arg3)); + au_uwrite(au_to_arg64(5, "arg4", (uint64_t)uap->arg4)); + au_uwrite(au_to_arg64(6, "arg5", (uint64_t)uap->arg5)); + au_uwrite(au_to_arg64(7, "arg6", (uint64_t)uap->arg6)); +#else + au_uwrite(au_to_arg32(2, "arg1", (uint32_t)uap->arg1)); + au_uwrite(au_to_arg32(3, "arg2", (uint32_t)uap->arg2)); + au_uwrite(au_to_arg32(4, "arg3", (uint32_t)uap->arg3)); + au_uwrite(au_to_arg32(5, "arg4", (uint32_t)uap->arg4)); + au_uwrite(au_to_arg32(6, "arg5", (uint32_t)uap->arg5)); + au_uwrite(au_to_arg32(7, "arg6", (uint32_t)uap->arg6)); +#endif +} + +/*ARGSUSED*/ +static void aus_p_online(struct t_audit_data *tad) { struct a { diff --git a/usr/src/uts/common/c2/audit_kevents.h b/usr/src/uts/common/c2/audit_kevents.h index 942887ae72..4a2e5b27db 100644 --- a/usr/src/uts/common/c2/audit_kevents.h +++ b/usr/src/uts/common/c2/audit_kevents.h @@ -330,9 +330,10 @@ extern "C" { #define AUE_MODADDPRIV 291 /* =ad modctl(2) */ #define AUE_CRYPTOADM 292 /* =as kernel cryptographic framework */ #define AUE_CONFIGKSSL 293 /* =as kernel SSL */ +#define AUE_BRANDSYS 294 /* =ot */ /* NOTE: update MAX_KEVENTS below if events are added. 
*/ -#define MAX_KEVENTS 293 +#define MAX_KEVENTS 294 #ifdef __cplusplus diff --git a/usr/src/uts/common/disp/class.c b/usr/src/uts/common/disp/class.c index b5b2674d89..8e83a839ee 100644 --- a/usr/src/uts/common/disp/class.c +++ b/usr/src/uts/common/disp/class.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -325,7 +324,8 @@ parmsset(pcparms_t *parmsp, kthread_id_t targtp) * The parameters are specified by a key. */ int -vaparmsout(char *classp, pcparms_t *prmsp, pc_vaparms_t *vaparmsp) +vaparmsout(char *classp, pcparms_t *prmsp, pc_vaparms_t *vaparmsp, + uio_seg_t seg) { char *clname; @@ -348,7 +348,8 @@ vaparmsout(char *classp, pcparms_t *prmsp, pc_vaparms_t *vaparmsp) return (EINVAL); clname = sclass[prmsp->pc_cid].cl_name; - if (copyout(clname, (void *)(uintptr_t)vaparmsp->pc_parms[0].pc_parm, + if ((seg == UIO_USERSPACE ? 
copyout : kcopy)(clname, + (void *)(uintptr_t)vaparmsp->pc_parms[0].pc_parm, MIN(strlen(clname) + 1, PC_CLNMSZ))) return (EFAULT); diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c index 3c1a271155..3bb90cf1fa 100644 --- a/usr/src/uts/common/disp/priocntl.c +++ b/usr/src/uts/common/disp/priocntl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -81,7 +80,7 @@ struct stprmargs { * between the 64-bit kernel ABI and the 32-bit user ABI. */ static int -copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap) +copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap, uio_seg_t seg) { pc_vaparms32_t vaparms32; pc_vaparm32_t *src; @@ -90,7 +89,8 @@ copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap) ASSERT(get_udatamodel() == DATAMODEL_ILP32); - if (copyin(arg, &vaparms32, sizeof (vaparms32))) + if ((seg == UIO_USERSPACE ? copyin : kcopy)(arg, &vaparms32, + sizeof (vaparms32))) return (EFAULT); vap->pc_vaparmscnt = vaparms32.pc_vaparmscnt; @@ -104,13 +104,13 @@ copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap) return (0); } -#define COPYIN_VAPARMS(arg, vap, size) \ +#define COPYIN_VAPARMS(arg, vap, size, seg) \ (get_udatamodel() == DATAMODEL_NATIVE ? 
\ - copyin(arg, vap, size) : copyin_vaparms32(arg, vap)) + (*copyinfn)(arg, vap, size) : copyin_vaparms32(arg, vap, seg)) #else -#define COPYIN_VAPARMS(arg, vap, size) copyin(arg, vap, size) +#define COPYIN_VAPARMS(arg, vap, size, seg) (*copyinfn)(arg, vap, size) #endif @@ -123,7 +123,8 @@ extern int threadcmp(struct pcmpargs *, kthread_id_t); * The priocntl system call. */ long -priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) +priocntl_common(int pc_version, procset_t *psp, int cmd, caddr_t arg, + caddr_t arg2, uio_seg_t seg) { pcinfo_t pcinfo; pcparms_t pcparms; @@ -144,6 +145,8 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) int rv = 0; pid_t saved_pid; id_t classid; + int (*copyinfn)(const void *, void *, size_t); + int (*copyoutfn)(const void *, void *, size_t); /* * First just check the version number. Right now there is only @@ -157,6 +160,14 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) if (pc_version != PC_VERSION) return (set_errno(EINVAL)); + if (seg == UIO_USERSPACE) { + copyinfn = copyin; + copyoutfn = copyout; + } else { + copyinfn = kcopy; + copyoutfn = kcopy; + } + switch (cmd) { case PC_GETCID: /* @@ -171,7 +182,7 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) rv = loaded_classes; break; } else { - if (copyin(arg, &pcinfo, sizeof (pcinfo))) + if ((*copyinfn)(arg, &pcinfo, sizeof (pcinfo))) return (set_errno(EFAULT)); } @@ -204,7 +215,7 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) if (error) return (set_errno(error)); - if (copyout(&pcinfo, arg, sizeof (pcinfo))) + if ((*copyoutfn)(&pcinfo, arg, sizeof (pcinfo))) return (set_errno(EFAULT)); rv = loaded_classes; @@ -221,7 +232,7 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) rv = loaded_classes; break; } else { - if (copyin(arg, &pcinfo, sizeof (pcinfo))) + if ((*copyinfn)(arg, &pcinfo, 
sizeof (pcinfo))) return (set_errno(EFAULT)); } @@ -245,7 +256,7 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) if (error) return (set_errno(error)); - if (copyout(&pcinfo, arg, sizeof (pcinfo))) + if ((*copyoutfn)(&pcinfo, arg, sizeof (pcinfo))) return (set_errno(EFAULT)); rv = loaded_classes; @@ -259,13 +270,14 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) * because it's done on a per thread basis by parmsset(). */ if (cmd == PC_SETPARMS) { - if (copyin(arg, &pcparms, sizeof (pcparms))) + if ((*copyinfn)(arg, &pcparms, sizeof (pcparms))) return (set_errno(EFAULT)); error = parmsin(&pcparms, NULL); } else { - if (copyin(arg, clname, PC_CLNMSZ) || - COPYIN_VAPARMS(arg2, &vaparms, sizeof (vaparms))) + if ((*copyinfn)(arg, clname, PC_CLNMSZ) || + COPYIN_VAPARMS(arg2, &vaparms, sizeof (vaparms), + seg)) return (set_errno(EFAULT)); clname[PC_CLNMSZ-1] = '\0'; @@ -281,7 +293,7 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) /* * Get the procset from the user. 
*/ - if (copyin(psp, &procset, sizeof (procset))) + if ((*copyinfn)(psp, &procset, sizeof (procset))) return (set_errno(EFAULT)); /* @@ -372,11 +384,11 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) case PC_GETPARMS: case PC_GETXPARMS: if (cmd == PC_GETPARMS) { - if (copyin(arg, &pcparms, sizeof (pcparms))) + if ((*copyinfn)(arg, &pcparms, sizeof (pcparms))) return (set_errno(EFAULT)); } else { if (arg != NULL) { - if (copyin(arg, clname, PC_CLNMSZ)) + if ((*copyinfn)(arg, clname, PC_CLNMSZ)) return (set_errno(EFAULT)); clname[PC_CLNMSZ-1] = '\0'; @@ -385,7 +397,9 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) return (set_errno(EINVAL)); } else pcparms.pc_cid = PC_CLNULL; - if (COPYIN_VAPARMS(arg2, &vaparms, sizeof (vaparms))) + + if (COPYIN_VAPARMS(arg2, &vaparms, sizeof (vaparms), + seg)) return (set_errno(EFAULT)); } @@ -393,7 +407,7 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) (pcparms.pc_cid < 1 && pcparms.pc_cid != PC_CLNULL)) return (set_errno(EINVAL)); - if (copyin(psp, &procset, sizeof (procset))) + if ((*copyinfn)(psp, &procset, sizeof (procset))) return (set_errno(EFAULT)); /* @@ -590,9 +604,10 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) return (set_errno(error)); if (cmd == PC_GETPARMS) { - if (copyout(&pcparms, arg, sizeof (pcparms))) + if ((*copyoutfn)(&pcparms, arg, sizeof (pcparms))) return (set_errno(EFAULT)); - } else if ((error = vaparmsout(arg, &pcparms, &vaparms)) != 0) + } else if ((error = vaparmsout(arg, &pcparms, &vaparms, + seg)) != 0) return (set_errno(error)); /* @@ -603,14 +618,14 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) case PC_ADMIN: if (get_udatamodel() == DATAMODEL_NATIVE) { - if (copyin(arg, &pcadmin, sizeof (pcadmin_t))) + if ((*copyinfn)(arg, &pcadmin, sizeof (pcadmin_t))) return (set_errno(EFAULT)); #ifdef _SYSCALL32_IMPL } else { /* pcadmin 
struct from ILP32 callers */ pcadmin32_t pcadmin32; - if (copyin(arg, &pcadmin32, sizeof (pcadmin32_t))) + if ((*copyinfn)(arg, &pcadmin32, sizeof (pcadmin32_t))) return (set_errno(EFAULT)); pcadmin.pc_cid = pcadmin32.pc_cid; pcadmin.pc_cladmin = (caddr_t)(uintptr_t) @@ -632,7 +647,7 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) break; case PC_GETPRIRANGE: - if (copyin(arg, &pcpri, sizeof (pcpri_t))) + if ((*copyinfn)(arg, &pcpri, sizeof (pcpri_t))) return (set_errno(EFAULT)); if (pcpri.pc_cid >= loaded_classes || pcpri.pc_cid < 0) @@ -640,7 +655,7 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) error = CL_GETCLPRI(&sclass[pcpri.pc_cid], &pcpri); if (!error) { - if (copyout(&pcpri, arg, sizeof (pcpri))) + if ((*copyoutfn)(&pcpri, arg, sizeof (pcpri))) return (set_errno(EFAULT)); } break; @@ -649,14 +664,14 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) /* * Get pcnice and procset structures from the user. */ - if (copyin(arg, &pcnice, sizeof (pcnice)) || - copyin(psp, &procset, sizeof (procset))) + if ((*copyinfn)(arg, &pcnice, sizeof (pcnice)) || + (*copyinfn)(psp, &procset, sizeof (procset))) return (set_errno(EFAULT)); error = donice(&procset, &pcnice); if (!error && (pcnice.pc_op == PC_GETNICE)) { - if (copyout(&pcnice, arg, sizeof (pcnice))) + if ((*copyoutfn)(&pcnice, arg, sizeof (pcnice))) return (set_errno(EFAULT)); } break; @@ -684,6 +699,12 @@ priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) return (error ? 
(set_errno(error)) : rv); } +long +priocntlsys(int pc_version, procset_t *psp, int cmd, caddr_t arg, caddr_t arg2) +{ + return (priocntl_common(pc_version, psp, cmd, arg, arg2, + UIO_USERSPACE)); +} /* * The proccmp() function is part of the implementation of the @@ -844,7 +865,7 @@ setparms(proc_t *targpp, struct stprmargs *stprmp) return (0); } -static int +int setthreadnice(pcnice_t *pcnice, kthread_t *tp) { int error = 0; @@ -889,7 +910,7 @@ setthreadnice(pcnice_t *pcnice, kthread_t *tp) return (error); } -static int +int setprocnice(proc_t *pp, pcnice_t *pcnice) { kthread_t *tp; diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index 91b4db8103..5f352b2203 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -64,6 +64,7 @@ #include <sys/spl.h> #include <sys/copyops.h> #include <sys/rctl.h> +#include <sys/brand.h> #include <sys/pool.h> #include <sys/zone.h> #include <sys/tsol/label.h> @@ -186,6 +187,7 @@ thread_init(void) rctl_init(); project_init(); + brand_init(); zone_init(); task_init(); tcache_init(); diff --git a/usr/src/uts/common/disp/ts.c b/usr/src/uts/common/disp/ts.c index a190297100..738a2e47b4 100644 --- a/usr/src/uts/common/disp/ts.c +++ b/usr/src/uts/common/disp/ts.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,7 +20,7 @@ */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -1269,14 +1268,14 @@ ia_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp) tspp->ts_flags |= TSIASET; thread_unlock(tx); } - TTY_HOLD(p->p_sessp); + mutex_enter(&p->p_sessp->s_lock); sess_held = 1; if ((pid == sid) && (p->p_sessp->s_vp != NULL) && ((stp = p->p_sessp->s_vp->v_stream) != NULL)) { if ((stp->sd_pgidp != NULL) && (stp->sd_sidp != NULL)) { pgid = stp->sd_pgidp->pid_id; sess_held = 0; - TTY_RELE(p->p_sessp); + mutex_exit(&p->p_sessp->s_lock); if (iaparmsp->ia_mode == IA_SET_INTERACTIVE) { off = 0; @@ -1292,7 +1291,7 @@ ia_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp) } } if (sess_held) - TTY_RELE(p->p_sessp); + mutex_exit(&p->p_sessp->s_lock); thread_lock(tx); @@ -2130,14 +2129,14 @@ ia_set_process_group(pid_t sid, pid_t bg_pgid, pid_t fg_pgid) * that do not have focus and are changing the process group * attatched to the tty, e.g. a process that is exiting */ - TTY_HOLD(leader->p_sessp); + mutex_enter(&leader->p_sessp->s_lock); if (!(tspp->ts_flags & TSIASET) || (leader->p_sessp->s_vp == NULL) || (leader->p_sessp->s_vp->v_stream == NULL)) { - TTY_RELE(leader->p_sessp); + mutex_exit(&leader->p_sessp->s_lock); return; } - TTY_RELE(leader->p_sessp); + mutex_exit(&leader->p_sessp->s_lock); /* * If we're already holding the leader's p_lock, we should use diff --git a/usr/src/uts/common/exec/aout/aout.c b/usr/src/uts/common/exec/aout/aout.c index 5c7b6b1773..4e814b339b 100644 --- a/usr/src/uts/common/exec/aout/aout.c +++ b/usr/src/uts/common/exec/aout/aout.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -56,19 +55,19 @@ static int aoutexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, - caddr_t exec_file, cred_t *cred); + caddr_t exec_file, cred_t *cred, int brand_action); static int get_aout_head(struct vnode **vpp, struct exdata *edp, long *execsz, int *isdyn); static int aoutcore(vnode_t *vp, proc_t *pp, cred_t *credp, rlim64_t rlimit, int sig, core_content_t content); #ifdef _LP64 extern int elf32exec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *); + long *, int, caddr_t, cred_t *, int); extern int elf32core(vnode_t *, proc_t *, cred_t *, rlim64_t, int, core_content_t); #else /* _LP64 */ extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *); + long *, int, caddr_t, cred_t *, int); extern int elfcore(vnode_t *, proc_t *, cred_t *, rlim64_t, int, core_content_t); #endif /* _LP64 */ @@ -141,7 +140,7 @@ _info(struct modinfo *modinfop) static int aoutexec(vnode_t *vp, struct execa *uap, struct uarg *args, struct intpdata *idatap, int level, long *execsz, int setid, - caddr_t exec_file, cred_t *cred) + caddr_t exec_file, cred_t *cred, int brand_action) { int error; struct exdata edp, edpout; @@ -201,10 +200,10 @@ aoutexec(vnode_t *vp, struct execa *uap, struct uarg *args, } #ifdef _LP64 if (error = elf32exec(nvp, uap, args, idatap, level, execsz, - setid, exec_file, cred)) + setid, exec_file, cred, brand_action)) #else /* _LP64 */ if (error = elfexec(nvp, uap, args, idatap, level, execsz, - setid, exec_file, cred)) + setid, exec_file, cred, brand_action)) #endif /* _LP64 */ { VN_RELE(nvp); diff --git 
a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c index 33e3cc9b8e..6508cdae85 100644 --- a/usr/src/uts/common/exec/elf/elf.c +++ b/usr/src/uts/common/exec/elf/elf.c @@ -62,8 +62,11 @@ #include <sys/shm_impl.h> #include <sys/archsystm.h> #include <sys/fasttrap.h> +#include <sys/brand.h> #include "elf_impl.h" +#include <sys/sdt.h> + extern int at_flags; #define ORIGIN_STR "ORIGIN" @@ -77,7 +80,7 @@ static int getelfshdr(vnode_t *, cred_t *, const Ehdr *, int, int, caddr_t *, static size_t elfsize(Ehdr *, int, caddr_t, uintptr_t *); static int mapelfexec(vnode_t *, Ehdr *, int, caddr_t, Phdr **, Phdr **, Phdr **, Phdr **, Phdr *, - caddr_t *, caddr_t *, intptr_t *, size_t, long *, size_t *); + caddr_t *, caddr_t *, intptr_t *, intptr_t *, size_t, long *, size_t *); typedef enum { STR_CTF, @@ -160,10 +163,83 @@ dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base) return (0); } +/* + * Map in the executable pointed to by vp. Returns 0 on success. + */ +int +mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Elf32_Addr *uphdr_vaddr, + intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase, + caddr_t *brkbase, size_t *brksize) +{ + size_t len; + struct vattr vat; + caddr_t phdrbase = NULL; + ssize_t phdrsize; + int nshdrs, shstrndx, nphdrs; + int error = 0; + Phdr *uphdr = NULL; + Phdr *junk = NULL; + Phdr *dynphdr = NULL; + Phdr *dtrphdr = NULL; + uintptr_t lddata; + long execsz; + intptr_t minaddr; + + if (error = execpermissions(vp, &vat, args)) { + uprintf("%s: Cannot execute %s\n", exec_file, args->pathname); + return (error); + } + + if ((error = getelfhead(vp, CRED(), ehdr, &nshdrs, &shstrndx, + &nphdrs)) != 0 || + (error = getelfphdr(vp, CRED(), ehdr, nphdrs, &phdrbase, + &phdrsize)) != 0) { + uprintf("%s: Cannot read %s\n", exec_file, args->pathname); + return (error); + } + + if ((len = elfsize(ehdr, nphdrs, phdrbase, &lddata)) == 0) { + uprintf("%s: Nothing to load in %s", exec_file, args->pathname); + 
kmem_free(phdrbase, phdrsize); + return (ENOEXEC); + } + + if (error = mapelfexec(vp, ehdr, nphdrs, phdrbase, &uphdr, &dynphdr, + &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr, + len, &execsz, brksize)) { + uprintf("%s: Cannot map %s\n", exec_file, args->pathname); + kmem_free(phdrbase, phdrsize); + return (error); + } + + /* + * Inform our caller if the executable needs an interpreter. + */ + *interp = (dynphdr == NULL) ? 0 : 1; + + /* + * If this is a statically linked executable, voffset should indicate + * the address of the executable itself (it normally holds the address + * of the interpreter). + */ + if (ehdr->e_type == ET_EXEC && *interp == 0) + *voffset = minaddr; + + if (uphdr != NULL) { + *uphdr_vaddr = uphdr->p_vaddr; + } else { + *uphdr_vaddr = (Elf32_Addr)-1; + } + + kmem_free(phdrbase, phdrsize); + return (error); +} + /*ARGSUSED*/ int elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, - int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred) + int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, + int brand_action) { caddr_t phdrbase = NULL; caddr_t bssbase = 0; @@ -175,10 +251,10 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ssize_t resid; int fd = -1; intptr_t voffset; - Phdr *dyphdr = NULL; - Phdr *stphdr = NULL; - Phdr *uphdr = NULL; - Phdr *junk = NULL; + Phdr *dyphdr = NULL; + Phdr *stphdr = NULL; + Phdr *uphdr = NULL; + Phdr *junk = NULL; size_t len; ssize_t phdrsize; int postfixsize = 0; @@ -189,6 +265,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int hasu = 0; int hasauxv = 0; int hasdy = 0; + int branded = 0; struct proc *p = ttoproc(curthread); struct user *up = PTOU(p); @@ -209,6 +286,13 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ASSERT(p->p_model == DATAMODEL_ILP32 || p->p_model == DATAMODEL_LP64); + if ((level < 2) && + (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + return 
(BROP(p)->b_elfexec(vp, uap, args, + idatap, level + 1, execsz, setid, exec_file, cred, + brand_action)); + } + bigwad = kmem_alloc(sizeof (struct bigwad), KM_SLEEP); ehdrp = &bigwad->ehdr; dlnp = bigwad->dl_name; @@ -353,6 +437,22 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, } else args->auxsize = 0; + /* + * If this binary is using an emulator, we need to add an + * AT_SUN_EMULATOR aux entry. + */ + if (args->emulator != NULL) + args->auxsize += sizeof (aux_entry_t); + + if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + branded = 1; + /* + * We will be adding 2 entries to the aux vector. One for + * the branded binary's phdr and one for the brandname. + */ + args->auxsize += 2 * sizeof (aux_entry_t); + } + aux = bigwad->elfargs; /* * Move args to the user's stack. @@ -364,6 +464,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, } goto out; } + /* we're single threaded after this point */ /* * If this is an ET_DYN executable (shared object), @@ -377,8 +478,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, dtrphdr = NULL; if ((error = mapelfexec(vp, ehdrp, nphdrs, phdrbase, &uphdr, &dyphdr, - &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, len, - execsz, &brksize)) != 0) + &stphdr, &dtrphdr, dataphdrp, &bssbase, &brkbase, &voffset, NULL, + len, execsz, &brksize)) != 0) goto bad; if (uphdr != NULL && dyphdr == NULL) @@ -542,8 +643,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, dtrphdr = NULL; error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk, - &junk, &dtrphdr, NULL, NULL, NULL, &voffset, len, execsz, - NULL); + &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len, + execsz, NULL); if (error || junk != NULL) { VN_RELE(nvp); uprintf("%s: Cannot map %s\n", exec_file, dlnp); @@ -601,6 +702,16 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #else ADDAUX(aux, AT_SUN_HWCAP, auxv_hwcap) #endif + if (branded) { + 
/* + * Reserve space for the brand-private aux vector entry, + * and record the user addr of that space. + */ + args->brand_auxp = (auxv32_t *)((char *)args->stackend + + ((char *)&aux->a_type - (char *)bigwad->elfargs)); + ADDAUX(aux, AT_SUN_BRAND_PHDR, 0) + } + ADDAUX(aux, AT_NULL, 0) postfixsize = (char *)aux - (char *)bigwad->elfargs; ASSERT(postfixsize == args->auxsize); @@ -639,6 +750,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, /* * Copy auxv to the process's user structure for use by /proc. + * If this is a branded process, the brand's exec routine will + * copy it's private entries to the user structure later. It + * relies on the fact that the blank entries are at the end. */ num_auxv = postfixsize / sizeof (aux_entry_t); ASSERT(num_auxv <= sizeof (up->u_auxv) / sizeof (auxv_t)); @@ -968,6 +1082,7 @@ mapelfexec( caddr_t *bssbase, caddr_t *brkbase, intptr_t *voffset, + intptr_t *minaddr, size_t len, long *execsz, size_t *brksize) @@ -980,6 +1095,7 @@ mapelfexec( int page; off_t offset; int hsize = ehdr->e_phentsize; + caddr_t mintmp = (caddr_t)-1; if (ehdr->e_type == ET_DYN) { /* @@ -1010,6 +1126,14 @@ mapelfexec( prot |= PROT_EXEC; addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset); + + /* + * Keep track of the segment with the lowest starting + * address. 
+ */ + if (addr < mintmp) + mintmp = addr; + zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz; offset = phdr->p_offset; @@ -1110,6 +1234,12 @@ mapelfexec( } phdr = (Phdr *)((caddr_t)phdr + hsize); } + + if (minaddr != NULL) { + ASSERT(mintmp != (caddr_t)-1); + *minaddr = (intptr_t)mintmp; + } + return (0); bad: if (error == 0) @@ -1850,13 +1980,14 @@ static struct execsw esw = { }; static struct modlexec modlexec = { - &mod_execops, "exec module for elf", &esw + &mod_execops, "exec module for elf %I%", &esw }; #ifdef _LP64 extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, - int setid, caddr_t exec_file, cred_t *cred); + int setid, caddr_t exec_file, cred_t *cred, + int brand_action); extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig, core_content_t content); diff --git a/usr/src/uts/common/exec/elf/elf_impl.h b/usr/src/uts/common/exec/elf/elf_impl.h index 52094e3794..010d5e6256 100644 --- a/usr/src/uts/common/exec/elf/elf_impl.h +++ b/usr/src/uts/common/exec/elf/elf_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -80,6 +79,7 @@ typedef struct { #define elfexec elf32exec #define elfnote elf32note #define elfcore elf32core +#define mapexec_brand mapexec32_brand #define setup_note_header setup_note_header32 #define write_elfnotes write_elfnotes32 #define setup_old_note_header setup_old_note_header32 diff --git a/usr/src/uts/common/exec/intp/intp.c b/usr/src/uts/common/exec/intp/intp.c index 6c6c98246d..4d5c04dfd4 100644 --- a/usr/src/uts/common/exec/intp/intp.c +++ b/usr/src/uts/common/exec/intp/intp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -45,6 +44,7 @@ #include <sys/disp.h> #include <sys/exec.h> #include <sys/kmem.h> +#include <sys/note.h> /* * This is the loadable module wrapper. 
@@ -166,8 +166,10 @@ intpexec( long *execsz, int setid, caddr_t exec_file, - struct cred *cred) + struct cred *cred, + int brand_action) { + _NOTE(ARGUNUSED(brand_action)) vnode_t *nvp; int error = 0; struct intpdata idata; @@ -223,8 +225,8 @@ intpexec( args->fname = devfd; } - error = gexec(&nvp, uap, args, &idata, ++level, - execsz, exec_file, cred); + error = gexec(&nvp, uap, args, &idata, ++level, execsz, exec_file, cred, + EBA_NONE); done: VN_RELE(nvp); args->pathname = opath; diff --git a/usr/src/uts/common/exec/java/java.c b/usr/src/uts/common/exec/java/java.c index 0e8c3996e7..bcf61453c9 100644 --- a/usr/src/uts/common/exec/java/java.c +++ b/usr/src/uts/common/exec/java/java.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -88,7 +87,7 @@ char *jexec_arg = "-jar"; static int javaexec(vnode_t *vp, struct execa *uap, struct uarg *args, struct intpdata *idatap, int level, long *execsz, int setid, - caddr_t execfile, cred_t *cred) + caddr_t execfile, cred_t *cred, int brand_action) { struct intpdata idata; int error; @@ -162,8 +161,8 @@ javaexec(vnode_t *vp, struct execa *uap, struct uarg *args, args->pathname = resolvepn.pn_path; /* don't free resolvepn until we are done with args */ pn_free(&lookpn); - error = gexec(&nvp, - uap, args, &idata, level + 1, execsz, execfile, cred); + error = gexec(&nvp, uap, args, &idata, level + 1, execsz, execfile, + cred, EBA_NONE); VN_RELE(nvp); args->pathname = opath; pn_free(&resolvepn); diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c index 3ee72c9124..8767999322 100644 --- a/usr/src/uts/common/fs/fifofs/fifosubr.c +++ b/usr/src/uts/common/fs/fifofs/fifosubr.c @@ -304,7 +304,8 @@ static void fifo_reinit_vp(vnode_t *vp) { vn_reinit(vp); vp->v_type = VFIFO; - vp->v_flag = VNOMAP | VNOSWAP; + vp->v_flag &= VROOT; + vp->v_flag |= VNOMAP | VNOSWAP; } /* @@ -470,6 +471,7 @@ fifovp(vnode_t *vp, cred_t *crp) fifo_reinit_vp(newvp); newvp->v_vfsp = vp->v_vfsp; newvp->v_rdev = vp->v_rdev; + newvp->v_flag |= (vp->v_flag & VROOT); fifoinsert(fnp); mutex_exit(&ftable_lock); diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index cab88019ff..34f731af1e 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -77,6 +77,8 @@ static int fifo_setattr(vnode_t *, vattr_t *, int, cred_t *, caller_context_t *); static int fifo_realvp(vnode_t *, vnode_t **); static int fifo_access(vnode_t *, int, int, cred_t *); +static int fifo_create(struct vnode *, char *, vattr_t *, enum vcexcl, + int, struct vnode **, struct cred *, int); static int fifo_fid(vnode_t *, fid_t *); static int fifo_fsync(vnode_t *, int, cred_t *); static int 
fifo_seek(vnode_t *, offset_t, offset_t *); @@ -116,6 +118,7 @@ const fs_operation_def_t fifo_vnodeops_template[] = { VOPNAME_GETATTR, fifo_getattr, VOPNAME_SETATTR, fifo_setattr, VOPNAME_ACCESS, fifo_access, + VOPNAME_CREATE, fifo_create, VOPNAME_FSYNC, fifo_fsync, VOPNAME_INACTIVE, (fs_generic_func_p) fifo_inactive, VOPNAME_FID, fifo_fid, @@ -1542,6 +1545,27 @@ fifo_access(vnode_t *vp, int mode, int flags, cred_t *crp) } /* + * This can be called if creat or an open with O_CREAT is done on the root + * of a lofs mount where the mounted entity is a fifo. + */ +/*ARGSUSED*/ +static int +fifo_create(struct vnode *dvp, char *name, vattr_t *vap, enum vcexcl excl, + int mode, struct vnode **vpp, struct cred *cr, int flag) +{ + int error; + + ASSERT(dvp && (dvp->v_flag & VROOT) && *name == '\0'); + if (excl == NONEXCL) { + if (mode && (error = fifo_access(dvp, mode, 0, cr))) + return (error); + VN_HOLD(dvp); + return (0); + } + return (EEXIST); +} + +/* * If shadowing a vnode, apply the VOP_FSYNC to it. * Otherwise, return 0. 
*/ diff --git a/usr/src/uts/common/fs/nfs/nfs4_subr.c b/usr/src/uts/common/fs/nfs/nfs4_subr.c index 9278fe03da..2a6505ccf9 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_subr.c +++ b/usr/src/uts/common/fs/nfs/nfs4_subr.c @@ -1451,7 +1451,7 @@ nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, } else mutex_exit(&mi->mi_lock); - if (*doqueue && curproc->p_sessp->s_vp != NULL) { + if (*doqueue && nfs_has_ctty()) { *doqueue = 0; if (!(mi->mi_flags & MI4_NOPRINT)) nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi, @@ -1481,7 +1481,7 @@ nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, bufp = clnt_sperror(client, svp->sv_hostname); zprintf(zoneid, "NFS%d %s failed for %s\n", mi->mi_vers, mi->mi_rfsnames[which], bufp); - if (curproc->p_sessp->s_vp != NULL) { + if (nfs_has_ctty()) { if (!(mi->mi_flags & MI4_NOPRINT)) { uprintf("NFS%d %s failed for %s\n", mi->mi_vers, mi->mi_rfsnames[which], @@ -1494,7 +1494,7 @@ nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp, "NFS %s failed for server %s: error %d (%s)\n", mi->mi_rfsnames[which], svp->sv_hostname, status, clnt_sperrno(status)); - if (curproc->p_sessp->s_vp != NULL) { + if (nfs_has_ctty()) { if (!(mi->mi_flags & MI4_NOPRINT)) { uprintf( "NFS %s failed for server %s: error %d (%s)\n", diff --git a/usr/src/uts/common/fs/nfs/nfs_subr.c b/usr/src/uts/common/fs/nfs/nfs_subr.c index 05e70935be..bf1beb1275 100644 --- a/usr/src/uts/common/fs/nfs/nfs_subr.c +++ b/usr/src/uts/common/fs/nfs/nfs_subr.c @@ -1235,7 +1235,7 @@ failoverretry: #endif } else mutex_exit(&mi->mi_lock); - if (*douprintf && curproc->p_sessp->s_vp != NULL) { + if (*douprintf && nfs_has_ctty()) { *douprintf = 0; if (!(mi->mi_flags & MI_NOPRINT)) #ifdef DEBUG @@ -1292,7 +1292,7 @@ failoverretry: bufp = clnt_sperror(client, svp->sv_hostname); zprintf(zoneid, "NFS%d %s failed for %s\n", mi->mi_vers, mi->mi_rfsnames[which], bufp); - if (curproc->p_sessp->s_vp != NULL) { + if (nfs_has_ctty()) 
{ if (!(mi->mi_flags & MI_NOPRINT)) { uprintf("NFS%d %s failed for %s\n", mi->mi_vers, mi->mi_rfsnames[which], @@ -1305,7 +1305,7 @@ failoverretry: "NFS %s failed for server %s: error %d (%s)\n", mi->mi_rfsnames[which], svp->sv_hostname, status, clnt_sperrno(status)); - if (curproc->p_sessp->s_vp != NULL) { + if (nfs_has_ctty()) { if (!(mi->mi_flags & MI_NOPRINT)) { uprintf( "NFS %s failed for server %s: error %d (%s)\n", @@ -1821,7 +1821,7 @@ failoverretry: #endif } else mutex_exit(&mi->mi_lock); - if (*douprintf && curproc->p_sessp->s_vp != NULL) { + if (*douprintf && nfs_has_ctty()) { *douprintf = 0; if (!(mi->mi_flags & MI_NOPRINT)) #ifdef DEBUG @@ -1886,7 +1886,7 @@ failoverretry: bufp = clnt_sperror(client, svp->sv_hostname); zprintf(zoneid, "NFS_ACL%d %s failed for %s\n", mi->mi_vers, mi->mi_aclnames[which], bufp); - if (curproc->p_sessp->s_vp != NULL) { + if (nfs_has_ctty()) { if (!(mi->mi_flags & MI_NOPRINT)) { uprintf("NFS_ACL%d %s failed for %s\n", mi->mi_vers, mi->mi_aclnames[which], @@ -1899,7 +1899,7 @@ failoverretry: "NFS %s failed for server %s: error %d (%s)\n", mi->mi_aclnames[which], svp->sv_hostname, status, clnt_sperrno(status)); - if (curproc->p_sessp->s_vp != NULL) { + if (nfs_has_ctty()) { if (!(mi->mi_flags & MI_NOPRINT)) uprintf( "NFS %s failed for server %s: error %d (%s)\n", @@ -5117,3 +5117,13 @@ out: label_rele(zlabel); return (retv); } + +boolean_t +nfs_has_ctty(void) +{ + boolean_t rv; + mutex_enter(&curproc->p_splock); + rv = (curproc->p_sessp->s_vp != NULL); + mutex_exit(&curproc->p_splock); + return (rv); +} diff --git a/usr/src/uts/common/fs/specfs/specvnops.c b/usr/src/uts/common/fs/specfs/specvnops.c index 6a2d6f73d0..24c7ffedab 100644 --- a/usr/src/uts/common/fs/specfs/specvnops.c +++ b/usr/src/uts/common/fs/specfs/specvnops.c @@ -680,13 +680,16 @@ streams_open: /* STREAMS devices don't have a size */ sp->s_size = csp->s_size = 0; - /* - * try to allocate it as a controlling terminal - */ - if ((stp->sd_flag & STRISTTY) && 
!(flag & FNOCTTY)) - stralloctty(stp); + if (!(stp->sd_flag & STRISTTY) || (flag & FNOCTTY)) + return (0); - return (0); + /* try to allocate it as a controlling terminal */ + if (strctty(stp) != EINTR) + return (0); + + /* strctty() was interrupted by a signal */ + (void) spec_close(vp, flag, 1, 0, cr); + return (EINTR); } /* diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 7c64462314..49bde7abeb 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -943,7 +943,7 @@ top: * Do remaining checks for FNOFOLLOW and FNOLINKS. */ if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) { - error = EINVAL; + error = ELOOP; goto out; } if (filemode & FNOLINKS) { diff --git a/usr/src/uts/common/io/gentty.c b/usr/src/uts/common/io/gentty.c index 9cb3e23b87..431e80245d 100644 --- a/usr/src/uts/common/io/gentty.c +++ b/usr/src/uts/common/io/gentty.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 1990-1992,1996,1998-2003 Sun Microsystems, Inc. + * Copyright 2006 Sun Microsystems, Inc. * All rights reserved. * Use is subject to license terms. 
*/ @@ -198,17 +197,20 @@ syopen(dev_t *devp, int flag, int otyp, struct cred *cr) { dev_t ttyd; vnode_t *ttyvp; - sess_t *sp = curproc->p_sessp; + sess_t *sp; int error; - if ((ttyd = sp->s_dev) == NODEV) + if ((sp = tty_hold()) == NULL) + return (EINTR); + + if (sp->s_dev == NODEV) { + tty_rele(sp); return (ENXIO); - TTY_HOLD(sp); - if ((ttyvp = sp->s_vp) == NULL) { - TTY_RELE(sp); - return (EIO); } + ttyd = sp->s_dev; + ttyvp = sp->s_vp; + /* * Open the control terminal. The control terminal may be * opened multiple times and it is closed in freectty(). @@ -237,10 +239,12 @@ syopen(dev_t *devp, int flag, int otyp, struct cred *cr) ASSERT(vn_matchops(ttyvp, spec_getvnodeops())); csp = VTOS(VTOS(ttyvp)->s_commonvp); mutex_enter(&csp->s_lock); + ASSERT(csp->s_count > 1); csp->s_count--; mutex_exit(&csp->s_lock); } - TTY_RELE(sp); + + tty_rele(sp); return (error); } @@ -255,41 +259,41 @@ syclose(dev_t dev, int flag, int otyp, struct cred *cr) int syread(dev_t dev, struct uio *uiop, struct cred *cr) { - vnode_t *ttyvp; - sess_t *sp = curproc->p_sessp; + sess_t *sp; int error; - if (sp->s_dev == NODEV) + if ((sp = tty_hold()) == NULL) + return (EINTR); + + if (sp->s_dev == NODEV) { + tty_rele(sp); return (ENXIO); - TTY_HOLD(sp); - if ((ttyvp = sp->s_vp) == NULL) { - TTY_RELE(sp); - return (EIO); } - error = VOP_READ(ttyvp, uiop, 0, cr, NULL); - TTY_RELE(sp); - return (error); + error = VOP_READ(sp->s_vp, uiop, 0, cr, NULL); + + tty_rele(sp); + return (error); } /* ARGSUSED */ int sywrite(dev_t dev, struct uio *uiop, struct cred *cr) { - vnode_t *ttyvp; - sess_t *sp = curproc->p_sessp; + sess_t *sp; int error; - if (sp->s_dev == NODEV) + if ((sp = tty_hold()) == NULL) + return (EINTR); + + if (sp->s_dev == NODEV) { + tty_rele(sp); return (ENXIO); - TTY_HOLD(sp); - if ((ttyvp = sp->s_vp) == NULL) { - TTY_RELE(sp); - return (EIO); } - error = VOP_WRITE(ttyvp, uiop, 0, cr, NULL); - TTY_RELE(sp); + error = VOP_WRITE(sp->s_vp, uiop, 0, cr, NULL); + + tty_rele(sp); return 
(error); } @@ -299,19 +303,32 @@ int syioctl(dev_t dev, int cmd, intptr_t arg, int mode, struct cred *cr, int *rvalp) { - vnode_t *ttyvp; - sess_t *sp = curproc->p_sessp; + sess_t *sp; int error; - if (sp->s_dev == NODEV) + if (cmd == TIOCNOTTY) { + /* + * we can't allow this ioctl. the reason is that it + * attempts to remove the ctty for a session. to do + * this the ctty can't be in use but we grab a hold on + * the current ctty (via tty_hold) to perform this ioctl. + * if we were to allow this ioctl to pass through we + * would deadlock with ourselves. + */ + return (EINVAL); + } + + if ((sp = tty_hold()) == NULL) + return (EINTR); + + if (sp->s_dev == NODEV) { + tty_rele(sp); return (ENXIO); - TTY_HOLD(sp); - if ((ttyvp = sp->s_vp) == NULL) { - TTY_RELE(sp); - return (EIO); } - error = VOP_IOCTL(ttyvp, cmd, arg, mode, cr, rvalp); - TTY_RELE(sp); + + error = VOP_IOCTL(sp->s_vp, cmd, arg, mode, cr, rvalp); + + tty_rele(sp); return (error); } @@ -322,18 +339,19 @@ int sypoll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { - vnode_t *ttyvp; - sess_t *sp = curproc->p_sessp; + sess_t *sp; int error; - if (sp->s_dev == NODEV) + if ((sp = tty_hold()) == NULL) + return (EINTR); + + if (sp->s_dev == NODEV) { + tty_rele(sp); return (ENXIO); - TTY_HOLD(sp); - if ((ttyvp = sp->s_vp) == NULL) { - TTY_RELE(sp); - return (EIO); } - error = VOP_POLL(ttyvp, events, anyyet, reventsp, phpp); - TTY_RELE(sp); + + error = VOP_POLL(sp->s_vp, events, anyyet, reventsp, phpp); + + tty_rele(sp); return (error); } diff --git a/usr/src/uts/common/io/l_strplumb.c b/usr/src/uts/common/io/l_strplumb.c index 287ad1f08f..3997874684 100644 --- a/usr/src/uts/common/io/l_strplumb.c +++ b/usr/src/uts/common/io/l_strplumb.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -228,7 +227,7 @@ kstr_autopush(int op, major_t *maj, minor_t *min, minor_t *lastmin, li = ldi_ident_from_anon(); if (op == SET_AUTOPUSH || op == CLR_AUTOPUSH) { error = ldi_open_by_name(SAD_ADM, FREAD|FWRITE, - CRED(), &lh, li); + kcred, &lh, li); if (error) { printf("kstr_autopush: open failed error %d\n", error); ldi_ident_release(li); @@ -236,7 +235,7 @@ kstr_autopush(int op, major_t *maj, minor_t *min, minor_t *lastmin, } } else { error = ldi_open_by_name(SAD_USR, FREAD|FWRITE, - CRED(), &lh, li); + kcred, &lh, li); if (error) { printf("kstr_autopush: open failed error %d\n", error); ldi_ident_release(li); @@ -253,11 +252,11 @@ kstr_autopush(int op, major_t *maj, minor_t *min, minor_t *lastmin, push.sap_minor = *min; error = ldi_ioctl(lh, SAD_GAP, (intptr_t)&push, - FKIOCTL, CRED(), &rval); + FKIOCTL, kcred, &rval); if (error) { printf("kstr_autopush: ioctl failed, error %d\n", error); - (void) ldi_close(lh, FREAD|FWRITE, CRED()); + (void) ldi_close(lh, FREAD|FWRITE, kcred); return (error); } switch (push.sap_cmd) { @@ -288,7 +287,7 @@ kstr_autopush(int op, major_t *maj, minor_t *min, minor_t *lastmin, (void) strcpy(mods[i], push.sap_list[i]); mods[i] = NULL; } - (void) ldi_close(lh, FREAD|FWRITE, CRED()); + (void) ldi_close(lh, FREAD|FWRITE, kcred); return (0); case CLR_AUTOPUSH: @@ -299,12 +298,12 @@ kstr_autopush(int op, major_t *maj, minor_t *min, minor_t *lastmin, push.sap_major = *maj; error = ldi_ioctl(lh, SAD_SAP, (intptr_t)&push, - 
FKIOCTL, CRED(), &rval); + FKIOCTL, kcred, &rval); if (error) { printf("kstr_autopush: ioctl failed, error %d\n", error); } - (void) ldi_close(lh, FREAD|FWRITE, CRED()); + (void) ldi_close(lh, FREAD|FWRITE, kcred); return (error); case SET_AUTOPUSH: @@ -338,16 +337,16 @@ kstr_autopush(int op, major_t *maj, minor_t *min, minor_t *lastmin, push.sap_list[i][0] = '\0'; error = ldi_ioctl(lh, SAD_SAP, (intptr_t)&push, - FKIOCTL, CRED(), &rval); + FKIOCTL, kcred, &rval); if (error) { printf("kstr_autopush: ioctl failed, error %d\n", error); } - (void) ldi_close(lh, FREAD|FWRITE, CRED()); + (void) ldi_close(lh, FREAD|FWRITE, kcred); return (error); default: - (void) ldi_close(lh, FREAD|FWRITE, CRED()); + (void) ldi_close(lh, FREAD|FWRITE, kcred); return (EINVAL); } } diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c index bd4dc10511..7910b58cc8 100644 --- a/usr/src/uts/common/io/ptm.c +++ b/usr/src/uts/common/io/ptm.c @@ -449,6 +449,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp) return (0); } +static boolean_t +ptmptsopencb(ptmptsopencb_arg_t arg) +{ + struct pt_ttys *ptmp = (struct pt_ttys *)arg; + boolean_t rval; + + PT_ENTER_READ(ptmp); + rval = (ptmp->pt_nullmsg != NULL); + PT_EXIT_READ(ptmp); + return (rval); +} + /* * The wput procedure will only handle ioctl and flush messages. 
*/ @@ -572,6 +584,41 @@ ptmwput(queue_t *qp, mblk_t *mp) miocack(qp, mp, 0, 0); break; } + case PTMPTSOPENCB: + { + mblk_t *dp; /* ioctl reply data */ + ptmptsopencb_t *ppocb; + + /* only allow the kernel to invoke this ioctl */ + if (iocp->ioc_cr != kcred) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* we don't support transparent ioctls */ + ASSERT(iocp->ioc_count != TRANSPARENT); + if (iocp->ioc_count == TRANSPARENT) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* allocate a response message */ + dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED); + if (dp == NULL) { + miocnak(qp, mp, 0, EAGAIN); + break; + } + + /* initialize the ioctl results */ + ppocb = (ptmptsopencb_t *)dp->b_rptr; + ppocb->ppocb_func = ptmptsopencb; + ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp; + + /* send the reply data */ + mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0); + qreply(qp, mp); + break; + } } break; @@ -643,6 +690,13 @@ ptmwsrv(queue_t *qp) ASSERT(qp->q_ptr); ptmp = (struct pt_ttys *)qp->q_ptr; + + if ((mp = getq(qp)) == NULL) { + /* If there are no messages there's nothing to do. */ + DBG(("leaving ptmwsrv (no messages)\n")); + return; + } + PT_ENTER_READ(ptmp); if ((ptmp->pt_state & PTLOCK) || (ptmp->pts_rdq == NULL)) { DBG(("in master write srv proc but no slave\n")); @@ -652,12 +706,12 @@ ptmwsrv(queue_t *qp) * the user process waiting for ACK/NAK from * the ioctl invocation */ - while ((mp = getq(qp)) != NULL) { + do { if (mp->b_datap->db_type == M_IOCTL) miocnak(qp, mp, 0, EINVAL); else freemsg(mp); - } + } while ((mp = getq(qp)) != NULL); flushq(qp, FLUSHALL); mp = mexchange(NULL, NULL, 2, M_ERROR, -1); @@ -672,7 +726,7 @@ ptmwsrv(queue_t *qp) /* * while there are messages on this write queue... */ - while ((mp = getq(qp)) != NULL) { + do { /* * if don't have control message and cannot put * msg. 
on slave's read queue, put it back on @@ -689,7 +743,7 @@ ptmwsrv(queue_t *qp) */ DBG(("send message to slave\n")); putnext(ptmp->pts_rdq, mp); - } + } while ((mp = getq(qp)) != NULL); DBG(("leaving ptmwsrv\n")); PT_EXIT_READ(ptmp); } diff --git a/usr/src/uts/common/nfs/nfs.h b/usr/src/uts/common/nfs/nfs.h index eda293574e..03c32254b7 100644 --- a/usr/src/uts/common/nfs/nfs.h +++ b/usr/src/uts/common/nfs/nfs.h @@ -931,6 +931,7 @@ extern void nfsauth_fini(); extern int nfs_setopts(vnode_t *vp, model_t model, struct nfs_args *args); extern int nfs_mount_label_policy(vfs_t *vfsp, struct netbuf *addr, struct knetconfig *knconf, cred_t *cr); +extern boolean_t nfs_has_ctty(void); extern void nfs_srv_stop_all(void); extern void nfs_srv_quiesce_all(void); extern void (*nfs_srv_quiesce_func)(void); diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c new file mode 100644 index 0000000000..15d82871bf --- /dev/null +++ b/usr/src/uts/common/os/brand.c @@ -0,0 +1,323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/cmn_err.h> +#include <sys/brand.h> +#include <sys/machbrand.h> +#include <sys/modctl.h> +#include <sys/rwlock.h> +#include <sys/zone.h> + +#define SUPPORTED_BRAND_VERSION BRAND_VER_1 + +#if defined(__sparcv9) +struct brand_mach_ops native_mach_ops = { + NULL, NULL +}; +#else +struct brand_mach_ops native_mach_ops = { + NULL, NULL, NULL, NULL, NULL, NULL +}; +#endif + +brand_t native_brand = { + BRAND_VER_1, + "native", + NULL, + &native_mach_ops +}; + +/* + * Used to maintain a list of all the brands currently loaded into the + * kernel. + */ +struct brand_list { + int bl_refcnt; + struct brand_list *bl_next; + brand_t *bl_brand; +}; + +static struct brand_list *brand_list = NULL; + +/* + * This lock protects the integrity of the brand list. + */ +static kmutex_t brand_list_lock; + +void +brand_init() +{ + mutex_init(&brand_list_lock, NULL, MUTEX_DEFAULT, NULL); + p0.p_brand = &native_brand; +} + +int +brand_register(brand_t *brand) +{ + struct brand_list *list, *scan; + + if (brand == NULL) + return (EINVAL); + + if (is_system_labeled()) { + cmn_err(CE_WARN, + "Branded zones are not allowed on labeled systems."); + return (EINVAL); + } + + if (brand->b_version != SUPPORTED_BRAND_VERSION) { + if (brand->b_version < SUPPORTED_BRAND_VERSION) { + cmn_err(CE_WARN, + "brand '%s' was built to run on older versions " + "of Solaris.", + brand->b_name); + } else { + cmn_err(CE_WARN, + "brand '%s' was built to run on a newer version " + "of Solaris.", + brand->b_name); + } + return (EINVAL); + } + + /* Sanity checks */ + if (brand->b_name == NULL || brand->b_ops == NULL || + brand->b_ops->b_brandsys == NULL) { + cmn_err(CE_WARN, "Malformed brand"); + return (EINVAL); + } + + list = kmem_alloc(sizeof (struct brand_list), KM_SLEEP); + + /* Add the brand to the list of loaded brands. 
*/ + mutex_enter(&brand_list_lock); + + /* + * Check to be sure we haven't already registered this brand. + */ + for (scan = brand_list; scan != NULL; scan = scan->bl_next) { + if (strcmp(brand->b_name, scan->bl_brand->b_name) == 0) { + cmn_err(CE_WARN, + "Invalid attempt to load a second instance of " + "brand %s", brand->b_name); + mutex_exit(&brand_list_lock); + kmem_free(list, sizeof (struct brand_list)); + return (EINVAL); + } + } + + list->bl_brand = brand; + list->bl_refcnt = 0; + list->bl_next = brand_list; + brand_list = list; + mutex_exit(&brand_list_lock); + + return (0); +} + +/* + * The kernel module implementing this brand is being unloaded, so remove + * it from the list of active brands. + */ +int +brand_unregister(brand_t *brand) +{ + struct brand_list *list, *prev; + + /* Sanity checks */ + if (brand == NULL || brand->b_name == NULL) { + cmn_err(CE_WARN, "Malformed brand"); + return (EINVAL); + } + + prev = NULL; + mutex_enter(&brand_list_lock); + + for (list = brand_list; list != NULL; list = list->bl_next) { + if (list->bl_brand == brand) + break; + prev = list; + } + + if (list == NULL) { + cmn_err(CE_WARN, "Brand %s wasn't registered", brand->b_name); + mutex_exit(&brand_list_lock); + return (EINVAL); + } + + if (list->bl_refcnt > 0) { + cmn_err(CE_WARN, "Unregistering brand %s which is still in use", + brand->b_name); + mutex_exit(&brand_list_lock); + return (EBUSY); + } + + /* Remove brand from the list */ + if (prev != NULL) + prev->bl_next = list->bl_next; + else + brand_list = list->bl_next; + + mutex_exit(&brand_list_lock); + + kmem_free(list, sizeof (struct brand_list)); + + return (0); +} + +/* + * Record that a zone of this brand has been instantiated. If the kernel + * module implementing this brand's functionality is not present, this + * routine attempts to load the module as a side effect. 
+ */ +brand_t * +brand_register_zone(struct brand_attr *attr) +{ + struct brand_list *l = NULL; + ddi_modhandle_t hdl = NULL; + char *modname; + int err = 0; + + if (is_system_labeled()) { + cmn_err(CE_WARN, + "Branded zones are not allowed on labeled systems."); + return (NULL); + } + + /* + * We make at most two passes through this loop. The first time + * through, we're looking to see if this is a new user of an + * already loaded brand. If the brand hasn't been loaded, we + * call ddi_modopen() to force it to be loaded and then make a + * second pass through the list of brands. If we don't find the + * brand the second time through it means that the modname + * specified in the brand_attr structure doesn't provide the brand + * specified in the brandname field. This would suggest a bug in + * the brand's config.xml file. We close the module and return + * 'NULL' to the caller. + */ + for (;;) { + /* + * Search list of loaded brands + */ + mutex_enter(&brand_list_lock); + for (l = brand_list; l != NULL; l = l->bl_next) + if (strcmp(attr->ba_brandname, + l->bl_brand->b_name) == 0) + break; + if ((l != NULL) || (hdl != NULL)) + break; + mutex_exit(&brand_list_lock); + + /* + * We didn't find that the requested brand has been loaded + * yet, so we trigger the load of the appropriate kernel + * module and search the list again. + */ + modname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + (void) strcpy(modname, "brand/"); + (void) strcat(modname, attr->ba_modname); + hdl = ddi_modopen(modname, KRTLD_MODE_FIRST, &err); + kmem_free(modname, MAXPATHLEN); + + if (err != 0) + return (NULL); + } + + /* + * If we found the matching brand, bump its reference count. + */ + if (l != NULL) + l->bl_refcnt++; + + mutex_exit(&brand_list_lock); + + if (hdl != NULL) + (void) ddi_modclose(hdl); + + return ((l != NULL) ? l->bl_brand : NULL); +} + +/* + * Return the number of zones currently using this brand. 
+ */ +int +brand_zone_count(struct brand *bp) +{ + struct brand_list *l; + int cnt = 0; + + mutex_enter(&brand_list_lock); + for (l = brand_list; l != NULL; l = l->bl_next) + if (l->bl_brand == bp) { + cnt = l->bl_refcnt; + break; + } + mutex_exit(&brand_list_lock); + + return (cnt); +} + +void +brand_unregister_zone(struct brand *bp) +{ + struct brand_list *list; + + mutex_enter(&brand_list_lock); + for (list = brand_list; list != NULL; list = list->bl_next) { + if (list->bl_brand == bp) { + ASSERT(list->bl_refcnt > 0); + list->bl_refcnt--; + break; + } + } + mutex_exit(&brand_list_lock); +} + +void +brand_setbrand(proc_t *p) +{ + brand_t *bp = p->p_zone->zone_brand; + + ASSERT(bp != NULL); + ASSERT(p->p_brand == &native_brand); + + /* + * We should only be called from exec(), when we know the process + * is single-threaded. + */ + ASSERT(p->p_tlist == p->p_tlist->t_forw); + + p->p_brand = bp; + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_setbrand(p); + lwp_attach_brand_hdlrs(p->p_tlist->t_lwp); + } +} diff --git a/usr/src/uts/common/os/ddi.c b/usr/src/uts/common/os/ddi.c index ec12f51f37..6a0b6ace80 100644 --- a/usr/src/uts/common/os/ddi.c +++ b/usr/src/uts/common/os/ddi.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -24,7 +23,7 @@ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -293,14 +292,15 @@ WR(queue_t *q) int drv_getparm(unsigned int parm, void *valuep) { - time_t now; + proc_t *p = curproc; + time_t now; switch (parm) { case UPROCP: - *(proc_t **)valuep = ttoproc(curthread); + *(proc_t **)valuep = p; break; case PPGRP: - *(pid_t *)valuep = ttoproc(curthread)->p_pgrp; + *(pid_t *)valuep = p->p_pgrp; break; case LBOLT: *(clock_t *)valuep = lbolt; @@ -317,10 +317,12 @@ drv_getparm(unsigned int parm, void *valuep) } break; case PPID: - *(pid_t *)valuep = ttoproc(curthread)->p_pid; + *(pid_t *)valuep = p->p_pid; break; case PSID: - *(pid_t *)valuep = ttoproc(curthread)->p_sessp->s_sid; + mutex_enter(&p->p_splock); + *(pid_t *)valuep = p->p_sessp->s_sid; + mutex_exit(&p->p_splock); break; case UCRED: *(cred_t **)valuep = CRED(); diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index a3cd19e423..3b01993465 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -65,6 +65,7 @@ #include <sys/lwpchan_impl.h> #include <sys/pool.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <c2/audit.h> @@ -89,7 +90,6 @@ uint_t auxv_hwcap32 = 0; /* 32-bit version of auxv_hwcap */ #endif int exec_lpg_disable = 0; - #define PSUIDFLAGS (SNOCD|SUGID) /* @@ -109,12 +109,13 @@ exece(const char *fname, const char **argp, const char **envp) { int error; - error = exec_common(fname, argp, envp); + error = exec_common(fname, argp, envp, EBA_NONE); return (error ? (set_errno(error)) : 0); } int -exec_common(const char *fname, const char **argp, const char **envp) +exec_common(const char *fname, const char **argp, const char **envp, + int brand_action) { vnode_t *vp = NULL, *dir = NULL, *tmpvp = NULL; proc_t *p = ttoproc(curthread); @@ -136,6 +137,7 @@ exec_common(const char *fname, const char **argp, const char **envp) lwpdir_t **old_tidhash; uint_t old_tidhash_sz; lwpent_t *lep; + int brandme = 0; /* * exec() is not supported for the /proc agent lwp. 
@@ -146,6 +148,35 @@ exec_common(const char *fname, const char **argp, const char **envp) if ((error = secpolicy_basic_exec(CRED())) != 0) return (error); + if (brand_action != EBA_NONE) { + /* + * Brand actions are not supported for processes that are not + * running in a branded zone. + */ + if (!ZONE_IS_BRANDED(p->p_zone)) + return (ENOTSUP); + + if (brand_action == EBA_NATIVE) { + /* Only branded processes can be unbranded */ + if (!PROC_IS_BRANDED(p)) + return (ENOTSUP); + } else { + /* Only unbranded processes can be branded */ + if (PROC_IS_BRANDED(p)) + return (ENOTSUP); + brandme = 1; + } + } else { + /* + * If this is a native zone, or if the process is already + * branded, then we don't need to do anything. If this is + * a native process in a branded zone, we need to brand the + * process as it exec()s the new binary. + */ + if (ZONE_IS_BRANDED(p->p_zone) && !PROC_IS_BRANDED(p)) + brandme = 1; + } + /* * Inform /proc that an exec() has started. * Hold signals that are ignored by default so that we will @@ -237,8 +268,14 @@ exec_common(const char *fname, const char **argp, const char **envp) ua.argp = argp; ua.envp = envp; + /* If necessary, brand this process before we start the exec. */ + if (brandme != 0) + brand_setbrand(p); + if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred)) != 0) { + exec_file, p->p_cred, brand_action)) != 0) { + if (brandme != 0) + BROP(p)->b_proc_exit(p, lwp); VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -351,6 +388,12 @@ exec_common(const char *fname, const char **argp, const char **envp) */ close_exec(P_FINFO(p)); TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); + + /* Unbrand ourself if requested. 
*/ + if (brand_action == EBA_NATIVE) + BROP(p)->b_proc_exit(p, lwp); + ASSERT((brand_action != EBA_NATIVE) || !PROC_IS_BRANDED(p)); + setregs(&args); /* Mark this as an executable vnode */ @@ -376,6 +419,9 @@ exec_common(const char *fname, const char **argp, const char **envp) lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); } + if (PROC_IS_BRANDED(p)) + BROP(p)->b_exec(); + mutex_enter(&p->p_lock); prbarrier(p); @@ -411,6 +457,7 @@ exec_common(const char *fname, const char **argp, const char **envp) lep->le_start = curthread->t_start; lwp_hash_in(p, lep); } + /* * Restore the saved signal mask and * inform /proc that the exec() has finished. @@ -422,6 +469,7 @@ exec_common(const char *fname, const char **argp, const char **envp) kmem_free(old_lwpdir, old_lwpdir_sz * sizeof (lwpdir_t)); kmem_free(old_tidhash, old_tidhash_sz * sizeof (lwpdir_t *)); } + ASSERT(error == 0); DTRACE_PROC(exec__success); return (0); @@ -451,7 +499,8 @@ gexec( int level, long *execsz, caddr_t exec_file, - struct cred *cred) + struct cred *cred, + int brand_action) { struct vnode *vp; proc_t *pp = ttoproc(curthread); @@ -593,7 +642,7 @@ gexec( setidfl |= EXECSETID_PRIVS; error = (*eswp->exec_func)(vp, uap, args, idatap, level, execsz, - setidfl, exec_file, cred); + setidfl, exec_file, cred, brand_action); rw_exit(eswp->exec_lock); if (error != 0) { if (newcred != NULL) @@ -1016,17 +1065,44 @@ execmap(struct vnode *vp, caddr_t addr, size_t len, size_t zfodlen, } if (zfodlen) { + struct as *as = curproc->p_as; + struct seg *seg; + uint_t zprot = 0; + end = (size_t)addr + len; zfodbase = (caddr_t)roundup(end, PAGESIZE); zfoddiff = (uintptr_t)zfodbase - end; if (zfoddiff) { + /* + * Before we go to zero the remaining space on the last + * page, make sure we have write permission. 
+ */ + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + seg = as_segat(curproc->p_as, (caddr_t)end); + if (seg != NULL) + SEGOP_GETPROT(seg, (caddr_t)end, zfoddiff - 1, + &zprot); + AS_LOCK_EXIT(as, &as->a_lock); + + if (seg != NULL && (zprot & PROT_WRITE) == 0) { + (void) as_setprot(as, (caddr_t)end, + zfoddiff - 1, zprot | PROT_WRITE); + } + if (on_fault(&ljb)) { no_fault(); + if (seg != NULL && (zprot & PROT_WRITE) == 0) + (void) as_setprot(as, (caddr_t)end, + zfoddiff - 1, zprot); error = EFAULT; goto bad; } uzero((void *)end, zfoddiff); no_fault(); + if (seg != NULL && (zprot & PROT_WRITE) == 0) + (void) as_setprot(as, (caddr_t)end, + zfoddiff - 1, zprot); } if (zfodlen > zfoddiff) { struct segvn_crargs crargs = @@ -1326,13 +1402,22 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM and AT_SUN_EXECNAME strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and + * AT_SUN_EMULATOR strings to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) return (error); if ((error = stk_add(args, args->pathname, UIO_SYSSPACE)) != 0) return (error); + if (args->brandname != NULL && + (error = stk_add(args, args->brandname, + UIO_SYSSPACE)) != 0) + return (error); + if (args->emulator != NULL && + (error = stk_add(args, args->emulator, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1438,19 +1523,32 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses - * for the AT_SUN_PLATFORM and AT_SUN_EXECNAME strings. + * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and + * AT_SUN_EMULATOR strings. 
*/ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { auxv_t **a = (auxv_t **)auxvpp; ADDAUX(*a, AT_SUN_PLATFORM, (long)&ustrp[*--offp]) ADDAUX(*a, AT_SUN_EXECNAME, (long)&ustrp[*--offp]) + if (args->brandname != NULL) + ADDAUX(*a, + AT_SUN_BRANDNAME, (long)&ustrp[*--offp]) + if (args->emulator != NULL) + ADDAUX(*a, + AT_SUN_EMULATOR, (long)&ustrp[*--offp]) } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, AT_SUN_PLATFORM, (int)(uintptr_t)&ustrp[*--offp]) ADDAUX(*a, - AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp]); + AT_SUN_EXECNAME, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brandname != NULL) + ADDAUX(*a, AT_SUN_BRANDNAME, + (int)(uintptr_t)&ustrp[*--offp]) + if (args->emulator != NULL) + ADDAUX(*a, AT_SUN_EMULATOR, + (int)(uintptr_t)&ustrp[*--offp]) } } diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 70061a7d3e..3063e5717f 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -73,6 +73,7 @@ #include <sys/pool.h> #include <sys/sdt.h> #include <sys/corectl.h> +#include <sys/brand.h> /* * convert code/data pair into old style wait status @@ -158,7 +159,6 @@ restart_init(int what, int why) user_t *up = PTOU(p); vnode_t *oldcd, *oldrd; - sess_t *sp; int i, err; char reason_buf[64]; @@ -257,17 +257,9 @@ restart_init(int what, int why) if (oldcd != NULL) VN_RELE(oldcd); - /* - * Free the controlling tty. - */ - mutex_enter(&pidlock); - sp = p->p_sessp; - if (sp->s_sidp == p->p_pidp && sp->s_vp != NULL) { - mutex_exit(&pidlock); - freectty(sp); - } else { - mutex_exit(&pidlock); - } + /* Free the controlling tty. (freectty() always assumes curproc.) */ + ASSERT(p == curproc); + (void) freectty(B_TRUE); /* * Now exec() the new init(1M) on top of the current process. 
If we @@ -343,7 +335,6 @@ proc_exit(int why, int what) timeout_id_t tmp_id; int rv; proc_t *q; - sess_t *sp; task_t *tk; vnode_t *exec_vp, *execdir_vp, *cdir, *rdir; sigqueue_t *sqp; @@ -367,6 +358,14 @@ proc_exit(int why, int what) DTRACE_PROC1(exit, int, why); /* + * Will perform any brand specific proc exit processing, since this + * is always the last lwp, will also perform lwp_exit and free brand + * data + */ + if (PROC_IS_BRANDED(p)) + BROP(p)->b_proc_exit(p, lwp); + + /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. * @@ -377,6 +376,7 @@ proc_exit(int why, int what) if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN && + z->zone_restart_init == B_TRUE && restart_init(what, why) == 0) return (0); /* @@ -523,13 +523,9 @@ proc_exit(int why, int what) closeall(P_FINFO(p)); - mutex_enter(&pidlock); - sp = p->p_sessp; - if (sp->s_sidp == p->p_pidp && sp->s_vp != NULL) { - mutex_exit(&pidlock); - freectty(sp); - } else - mutex_exit(&pidlock); + /* Free the controlling tty. (freectty() always assumes curproc.) 
*/ + ASSERT(p == curproc); + (void) freectty(B_TRUE); #if defined(__sparc) if (p->p_utraps != NULL) diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index c7c400246d..fbda5b8c4a 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -80,6 +80,7 @@ #include <sys/sdt.h> #include <sys/class.h> #include <sys/corectl.h> +#include <sys/brand.h> static int64_t cfork(int, int); static int getproc(proc_t **, int); @@ -461,8 +462,10 @@ cfork(int isvfork, int isfork1) mutex_exit(&p->p_lock); } - /* set return values for child */ - lwp_setrval(clone, p->p_pid, 1); + if (PROC_IS_BRANDED(p)) + BROP(p)->b_lwp_setrval(clone, p->p_pid, 1); + else + lwp_setrval(clone, p->p_pid, 1); /* set return values for parent */ r.r_val1 = (int)cp->p_pid; @@ -873,6 +876,7 @@ getproc(proc_t **cpp, int kernel) /* * Make proc entry for child process */ + mutex_init(&cp->p_splock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL); #if defined(__x86) @@ -882,7 +886,7 @@ getproc(proc_t **cpp, int kernel) cp->p_stat = SIDL; cp->p_mstart = gethrtime(); - if ((newpid = pid_assign(cp)) == -1) { + if ((newpid = pid_allocate(cp, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); cmn_err(CE_WARN, "out of processes"); @@ -926,10 +930,13 @@ getproc(proc_t **cpp, int kernel) cp->p_siginfo = pp->p_siginfo; cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; - SESS_HOLD(pp->p_sessp); + sess_hold(pp); cp->p_exec = pp->p_exec; cp->p_execdir = pp->p_execdir; cp->p_zone = pp->p_zone; + cp->p_brand = pp->p_brand; + if (PROC_IS_BRANDED(pp)) + BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; @@ -1198,6 +1205,7 @@ try_again: if (p->p_segacct) shmexit(p); + /* * We grab p_lock for the benefit of /proc */ diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index 
dbccf77b9e..26a12c805e 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -59,6 +58,7 @@ #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> +#include <sys/brand.h> void *segkp_lwp; /* cookie for pool of segkp resources */ @@ -87,6 +87,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, uint_t old_hashsz = 0; int i; int rctlfail = 0; + boolean_t branded = 0; mutex_enter(&p->p_lock); mutex_enter(&p->p_zone->zone_nlwps_lock); @@ -448,6 +449,19 @@ grow: break; } while (lwp_hash_lookup(p, t->t_tid) != NULL); } + + /* + * If this is a branded process, let the brand do any necessary lwp + * initialization. 
+ */ + if (PROC_IS_BRANDED(p)) { + if (BROP(p)->b_initlwp(lwp)) { + err = 1; + goto error; + } + branded = 1; + } + p->p_lwpcnt++; t->t_waitfor = -1; @@ -540,6 +554,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); + if (branded) + BROP(p)->b_freelwp(lwp); + mutex_exit(&p->p_lock); t->t_state = TS_FREE; thread_rele(t); @@ -673,6 +690,13 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); + /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) + BROP(p)->b_lwpexit(lwp); + mutex_enter(&p->p_lock); lwp_cleanup(); @@ -1565,6 +1589,7 @@ forklwp(klwp_t *lwp, proc_t *cp, id_t lwpid) proc_t *p = lwptoproc(lwp); int cid; void *bufp; + void *brand_data; int val; ASSERT(p == curproc); @@ -1578,6 +1603,7 @@ forklwp(klwp_t *lwp, proc_t *cp, id_t lwpid) if (t == curthread) /* copy args out of registers first */ (void) save_syscall_args(); + clwp = lwp_create(cp->p_lwpcnt == 0 ? lwp_rtt_initial : lwp_rtt, NULL, 0, cp, TS_STOPPED, t->t_pri, &t->t_hold, NOCLASS, lwpid); if (clwp == NULL) @@ -1591,14 +1617,16 @@ forklwp(klwp_t *lwp, proc_t *cp, id_t lwpid) ct = clwp->lwp_thread; tregs = clwp->lwp_regs; tfpu = clwp->lwp_fpu; + brand_data = clwp->lwp_brand; /* copy parent lwp to child lwp */ *clwp = *lwp; /* fix up child's lwp */ - clwp->lwp_pcb.pcb_flags = 0; -#if defined(__sparc) +#if defined(__i386) || defined(__amd64) + clwp->lwp_pcb.pcb_flags = clwp->lwp_pcb.pcb_flags & RUPDATE_PENDING; +#elif defined(__sparc) clwp->lwp_pcb.pcb_step = STEP_NONE; #endif clwp->lwp_cursig = 0; @@ -1608,6 +1636,7 @@ forklwp(klwp_t *lwp, proc_t *cp, id_t lwpid) ct->t_sysnum = t->t_sysnum; clwp->lwp_regs = tregs; clwp->lwp_fpu = tfpu; + clwp->lwp_brand = brand_data; clwp->lwp_ap = clwp->lwp_arg; clwp->lwp_procp = cp; bzero(clwp->lwp_timer, sizeof (clwp->lwp_timer)); @@ -1640,6 +1669,10 @@ forklwp(klwp_t *lwp, proc_t *cp, id_t lwpid) ct->t_proc_flag |= TP_MSACCT; mutex_exit(&cp->p_lock); + 
/* Allow brand to propagate brand-specific state */ + if (PROC_IS_BRANDED(p)) + BROP(p)->b_forklwp(lwp, clwp); + retry: cid = t->t_cid; diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 958bbf96c8..ec9fc6c3e3 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -70,6 +70,7 @@ #include <sys/errorq.h> #include <sys/class.h> #include <sys/stack.h> +#include <sys/brand.h> #include <vm/as.h> #include <vm/seg_kmem.h> @@ -124,6 +125,7 @@ cluster_wrapper(void) char initname[INITNAME_SZ] = "/sbin/init"; /* also referenced by zone0 */ char initargs[BOOTARGS_MAX] = ""; /* also referenced by zone0 */ +extern int64_t lwp_sigmask(int, uint_t, uint_t); /* * Construct a stack for init containing the arguments to it, then @@ -144,6 +146,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); + int brand_action; if (args == NULL) args = ""; @@ -247,9 +250,17 @@ exec_init(const char *initpath, const char *args) curthread->t_post_sys = 1; curthread->t_sysnum = SYS_execve; + /* + * If we are executing init from zsched, we may have inherited its + * parent process's signal mask. Clear it now so that we behave in + * the same way as when started from the global zone. + */ + (void) lwp_sigmask(SIG_UNBLOCK, 0xffffffff, 0xffffffff); + + brand_action = ZONE_IS_BRANDED(p->p_zone) ? 
EBA_BRAND : EBA_NONE; again: error = exec_common((const char *)(uintptr_t)exec_fnamep, - (const char **)(uintptr_t)uap, NULL); + (const char **)(uintptr_t)uap, NULL, brand_action); /* * Normally we would just set lwp_argsaved and t_post_sys and diff --git a/usr/src/uts/common/os/modconf.c b/usr/src/uts/common/os/modconf.c index 2992567207..3e662fac7d 100644 --- a/usr/src/uts/common/os/modconf.c +++ b/usr/src/uts/common/os/modconf.c @@ -55,6 +55,7 @@ #include <ipp/ipp.h> #include <sys/strsubr.h> #include <sys/kcpc.h> +#include <sys/brand.h> #include <sys/cpc_pcbe.h> #include <sys/kstat.h> #include <sys/fs/sdev_node.h> @@ -237,6 +238,16 @@ struct mod_ops mod_pcbeops = { mod_installpcbe, mod_removepcbe, mod_infonull }; +/* + * Brand modules. + */ +static int mod_installbrand(struct modlbrand *, struct modlinkage *); +static int mod_removebrand(struct modlbrand *, struct modlinkage *); + +struct mod_ops mod_brandops = { + mod_installbrand, mod_removebrand, mod_infonull +}; + static struct sysent *mod_getsysent(struct modlinkage *, struct sysent *); static char uninstall_err[] = "Cannot uninstall %s; not installed"; @@ -496,6 +507,23 @@ mod_removepcbe(struct modlpcbe *modl, struct modlinkage *modlp) } /* + * Manage BrandZ modules. + */ +/*ARGSUSED*/ +static int +mod_installbrand(struct modlbrand *modl, struct modlinkage *modlp) +{ + return (brand_register(modl->brand_branddef)); +} + +/*ARGSUSED*/ +static int +mod_removebrand(struct modlbrand *modl, struct modlinkage *modlp) +{ + return (brand_unregister(modl->brand_branddef)); +} + +/* * manage /dev fs modules */ /*ARGSUSED*/ @@ -1075,8 +1103,10 @@ mod_removefs(struct modlfs *modl, struct modlinkage *modlp) return (EBUSY); } - /* XXX - Shouldn't the refcount be sufficient? 
*/ - + /* + * A mounted filesystem could still have vsw_count = 0 + * so we must check whether anyone is actually using our ops + */ if (vfs_opsinuse(&vswp->vsw_vfsops)) { vfs_unrefvfssw(vswp); WUNLOCK_VFSSW(); diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index 66cfed74b4..88b0258afe 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -21,7 +20,7 @@ */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -115,6 +114,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { @@ -154,14 +165,13 @@ pid_getlockslot(int prslot) } /* - * This function assigns a pid for use in a fork request. It allocates - * a pid structure, tries to find an empty slot in the proc table, - * and selects the process id. + * This function allocates a pid structure, a free pid, and optionally a + * slot in the proc table for it. * - * pid_assign() returns the new pid on success, -1 on failure. + * pid_allocate() returns the new pid on success, -1 on failure. 
*/ pid_t -pid_assign(proc_t *prp) +pid_allocate(proc_t *prp, int flags) { struct pid *pidp; union procent *pep; @@ -170,7 +180,7 @@ pid_assign(proc_t *prp) pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP); mutex_enter(&pidlinklock); - if ((pep = procentfree) == NULL) { + if ((flags & PID_ALLOC_PROC) && (pep = procentfree) == NULL) { /* * ran out of /proc directory entries */ @@ -190,10 +200,6 @@ pid_assign(proc_t *prp) goto failed; } - procentfree = pep->pe_next; - pep->pe_proc = prp; - prp->p_pidp = pidp; - /* * Put pid into the pid hash table. */ @@ -201,8 +207,17 @@ pid_assign(proc_t *prp) HASHPID(newpid) = pidp; pidp->pid_ref = 1; pidp->pid_id = newpid; - pidp->pid_prslot = pep - procdir; - prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)]; + + if (flags & PID_ALLOC_PROC) { + procentfree = pep->pe_next; + pidp->pid_prslot = pep - procdir; + pep->pe_proc = prp; + prp->p_pidp = pidp; + prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)]; + } else { + pidp->pid_prslot = 0; + } + mutex_exit(&pidlinklock); return (newpid); @@ -264,7 +279,7 @@ pid_exit(proc_t *prp) if (prp->p_pgidp != NULL) pgexit(prp); - SESS_RELE(prp->p_sessp); + sess_rele(prp->p_sessp, B_TRUE); pidp = prp->p_pidp; diff --git a/usr/src/uts/common/os/printf.c b/usr/src/uts/common/os/printf.c index 603da31b62..a50bfa0db9 100644 --- a/usr/src/uts/common/os/printf.c +++ b/usr/src/uts/common/os/printf.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -142,21 +141,15 @@ retry: if (sl & SL_USER) { ssize_t resid; - sess_t *sessp; - - mutex_enter(&pidlock); - sessp = curproc->p_sessp; - SESS_HOLD(sessp); - TTY_HOLD(sessp); - mutex_exit(&pidlock); - if (sessp->s_vp) - (void) vn_rdwr(UIO_WRITE, sessp->s_vp, - body, len, 0LL, UIO_SYSSPACE, - FAPPEND, (rlim64_t)LOG_HIWAT, kcred, &resid); - mutex_enter(&pidlock); - TTY_RELE(sessp); - SESS_RELE(sessp); - mutex_exit(&pidlock); + sess_t *sp; + + if ((sp = tty_hold()) != NULL) { + if (sp->s_vp != NULL) + (void) vn_rdwr(UIO_WRITE, sp->s_vp, body, + len, 0LL, UIO_SYSSPACE, FAPPEND, + (rlim64_t)LOG_HIWAT, kcred, &resid); + tty_rele(sp); + } } if (on_intr && !panicstr) { diff --git a/usr/src/uts/common/os/procset.c b/usr/src/uts/common/os/procset.c index 7a675c604e..ae5473847e 100644 --- a/usr/src/uts/common/os/procset.c +++ b/usr/src/uts/common/os/procset.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -290,8 +289,10 @@ procinset(proc_t *pp, procset_t *psp) break; case P_SID: + mutex_enter(&pp->p_splock); if (pp->p_sessp->s_sid == psp->p_lid) loperand++; + mutex_exit(&pp->p_splock); break; case P_CID: @@ -380,8 +381,10 @@ procinset(proc_t *pp, procset_t *psp) break; case P_SID: + mutex_enter(&pp->p_splock); if (pp->p_sessp->s_sid == psp->p_rid) roperand++; + mutex_exit(&pp->p_splock); break; case P_TASKID: @@ -533,8 +536,10 @@ lwpinset(proc_t *pp, procset_t *psp, kthread_t *tp, int *done) break; case P_SID: + mutex_enter(&pp->p_splock); if (pp->p_sessp->s_sid == psp->p_lid) loperand++; + mutex_exit(&pp->p_splock); break; case P_TASKID: @@ -617,8 +622,10 @@ lwpinset(proc_t *pp, procset_t *psp, kthread_t *tp, int *done) break; case P_SID: + mutex_enter(&pp->p_splock); if (pp->p_sessp->s_sid == psp->p_rid) roperand++; + mutex_exit(&pp->p_splock); break; case P_TASKID: @@ -756,6 +763,7 @@ getmyid(idtype_t idtype) proc_t *pp; uid_t uid; gid_t gid; + pid_t sid; pp = ttoproc(curthread); @@ -773,7 +781,10 @@ getmyid(idtype_t idtype) return (pp->p_pgrp); case P_SID: - return (pp->p_sessp->s_sid); + mutex_enter(&pp->p_splock); + sid = pp->p_sessp->s_sid; + mutex_exit(&pp->p_splock); + return (sid); case P_TASKID: return (pp->p_task->tk_tkid); diff --git a/usr/src/uts/common/os/session.c b/usr/src/uts/common/os/session.c index 972677f7dc..7790a09094 100644 --- a/usr/src/uts/common/os/session.c +++ b/usr/src/uts/common/os/session.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -47,102 +46,614 @@ #include <sys/kmem.h> #include <sys/cmn_err.h> #include <sys/strsubr.h> +#include <sys/fs/snode.h> sess_t session0 = { - 1, /* s_ref */ - NODEV, /* s_dev */ - NULL, /* s_vp */ - &pid0, /* s_sidp */ - NULL /* s_cred */ + &pid0, /* s_sidp */ + {0}, /* s_lock */ + 1, /* s_ref */ + B_FALSE, /* s_sighuped */ + B_FALSE, /* s_exit */ + 0, /* s_exit_cv */ + 0, /* s_cnt */ + 0, /* s_cnt_cv */ + NODEV, /* s_dev */ + NULL, /* s_vp */ + NULL /* s_cred */ }; void -sess_rele(sess_t *sp) +sess_hold(proc_t *p) { - ASSERT(MUTEX_HELD(&pidlock)); + ASSERT(MUTEX_HELD(&pidlock) || MUTEX_HELD(&p->p_splock)); + mutex_enter(&p->p_sessp->s_lock); + p->p_sessp->s_ref++; + mutex_exit(&p->p_sessp->s_lock); +} + +void +sess_rele(sess_t *sp, boolean_t pidlock_held) +{ + ASSERT(MUTEX_HELD(&pidlock) || !pidlock_held); + + mutex_enter(&sp->s_lock); ASSERT(sp->s_ref != 0); - if (--sp->s_ref == 0) { - if (sp == &session0) - panic("sp == &session0"); - PID_RELE(sp->s_sidp); - mutex_destroy(&sp->s_lock); - cv_destroy(&sp->s_wait_cv); - kmem_free(sp, sizeof (sess_t)); + if (--sp->s_ref > 0) { + mutex_exit(&sp->s_lock); + return; } + ASSERT(sp->s_ref == 0); + + /* + * It's ok to free this session structure now because we know + * that no one else can have a pointer to it. We know this + * to be true because the only time that s_ref can possibly + * be incremented is when pidlock or p_splock is held AND there + * is a proc_t that points to that session structure. In that + * case we are guaranteed that the s_ref is at least 1 since there + * is a proc_t that points to it. 
So when s_ref finally drops to + * zero then no one else has a reference (and hence pointer) to + * this session structure and there is no valid proc_t pointing + * to this session structure anymore so, no one can acquire a + * reference (and pointer) to this session structure so it's + * ok to free it here. + */ + + if (sp == &session0) + panic("sp == &session0"); + + /* make sure there are no outstanding holds */ + ASSERT(sp->s_cnt == 0); + + /* make sure there is no exit in progress */ + ASSERT(!sp->s_exit); + + /* make sure someone already freed any ctty */ + ASSERT(sp->s_vp == NULL); + ASSERT(sp->s_dev == NODEV); + + if (!pidlock_held) + mutex_enter(&pidlock); + PID_RELE(sp->s_sidp); + if (!pidlock_held) + mutex_exit(&pidlock); + + mutex_destroy(&sp->s_lock); + cv_destroy(&sp->s_cnt_cv); + kmem_free(sp, sizeof (sess_t)); +} + +sess_t * +tty_hold(void) +{ + proc_t *p = curproc; + sess_t *sp; + boolean_t got_sig = B_FALSE; + + /* make sure the caller isn't holding locks they shouldn't */ + ASSERT(MUTEX_NOT_HELD(&pidlock)); + + for (;;) { + mutex_enter(&p->p_splock); /* protect p->p_sessp */ + sp = p->p_sessp; + mutex_enter(&sp->s_lock); /* protect sp->* */ + + /* make sure the caller isn't holding locks they shouldn't */ + ASSERT((sp->s_vp == NULL) || + MUTEX_NOT_HELD(&sp->s_vp->v_stream->sd_lock)); + + /* + * If the session leader process is not exiting (and hence + * not trying to release the session's ctty) then we can + * safely grab a hold on the current session structure + * and return it. If on the other hand the session leader + * process is exiting and clearing the ctty then we'll + * wait till it's done before we loop around and grab a + * hold on the session structure. 
+ */ + if (!sp->s_exit) + break; + + /* need to hold the session so it can't be freed */ + sp->s_ref++; + mutex_exit(&p->p_splock); + + /* Wait till the session leader is done */ + if (!cv_wait_sig(&sp->s_exit_cv, &sp->s_lock)) + got_sig = B_TRUE; + + /* + * Now we need to drop our hold on the session structure, + * but we can't hold any locks when we do this because + * sess_rele() may need to aquire pidlock. + */ + mutex_exit(&sp->s_lock); + sess_rele(sp, B_FALSE); + + if (got_sig) + return (NULL); + } + + /* whew, we finally got a hold */ + sp->s_cnt++; + sp->s_ref++; + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); + return (sp); } void -sess_create(void) +tty_rele(sess_t *sp) { - proc_t *pp; - sess_t *sp; + /* make sure the caller isn't holding locks they shouldn't */ + ASSERT(MUTEX_NOT_HELD(&pidlock)); - pp = ttoproc(curthread); + mutex_enter(&sp->s_lock); + if ((--sp->s_cnt) == 0) + cv_broadcast(&sp->s_cnt_cv); + mutex_exit(&sp->s_lock); + + sess_rele(sp, B_FALSE); +} + +void +sess_create(void) +{ + proc_t *p = curproc; + sess_t *sp, *old_sp; sp = kmem_zalloc(sizeof (sess_t), KM_SLEEP); mutex_init(&sp->s_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&sp->s_wait_cv, NULL, CV_DEFAULT, NULL); + cv_init(&sp->s_cnt_cv, NULL, CV_DEFAULT, NULL); + /* + * we need to grap p_lock to protect p_pgidp because + * /proc looks at p_pgidp while holding only p_lock. + * + * we don't need to hold p->p_sessp->s_lock or get a hold on the + * session structure since we're not actually updating any of + * the contents of the old session structure. 
+ */ mutex_enter(&pidlock); + mutex_enter(&p->p_lock); + mutex_enter(&p->p_splock); + + pgexit(p); + + sp->s_sidp = p->p_pidp; + sp->s_ref = 1; + sp->s_dev = NODEV; + + old_sp = p->p_sessp; + p->p_sessp = sp; + + pgjoin(p, p->p_pidp); + PID_HOLD(p->p_pidp); + + mutex_exit(&p->p_splock); + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + sess_rele(old_sp, B_FALSE); +} + +/* + * Note that sess_ctty_clear() resets all the fields in the session + * structure but doesn't release any holds or free any objects + * that the session structure might currently point to. it is the + * callers responsibility to do this. + */ +static void +sess_ctty_clear(sess_t *sp, stdata_t *stp) +{ /* - * We need to protect p_pgidp with p_lock because - * /proc looks at it while holding only p_lock. + * Assert that we hold all the necessary locks. We also need + * to be holding proc_t->p_splock for the process associated + * with this session, but since we don't have a proc pointer + * passed in we can't assert this here. */ - mutex_enter(&pp->p_lock); - pgexit(pp); - SESS_RELE(pp->p_sessp); + ASSERT(MUTEX_HELD(&stp->sd_lock) && MUTEX_HELD(&pidlock) && + MUTEX_HELD(&sp->s_lock)); - sp->s_sidp = pp->p_pidp; - sp->s_ref = 1; + /* reset the session structure members to defaults */ + sp->s_sighuped = B_FALSE; sp->s_dev = NODEV; + sp->s_vp = NULL; + sp->s_cred = NULL; + + /* reset the stream session and group pointers */ + stp->sd_pgidp = NULL; + stp->sd_sidp = NULL; +} + +static void +sess_ctty_set(proc_t *p, sess_t *sp, stdata_t *stp) +{ + cred_t *crp; + + /* Assert that we hold all the necessary locks. 
*/ + ASSERT(MUTEX_HELD(&stp->sd_lock) && MUTEX_HELD(&pidlock) && + MUTEX_HELD(&p->p_splock) && MUTEX_HELD(&sp->s_lock)); + + /* get holds on structures */ + mutex_enter(&p->p_crlock); + crhold(crp = p->p_cred); + mutex_exit(&p->p_crlock); + PID_HOLD(sp->s_sidp); /* requires pidlock */ + PID_HOLD(sp->s_sidp); /* requires pidlock */ + + /* update the session structure members */ + sp->s_vp = makectty(stp->sd_vnode); + sp->s_dev = sp->s_vp->v_rdev; + sp->s_cred = crp; + + /* update the stream emebers */ + stp->sd_flag |= STRISTTY; /* just to be sure */ + stp->sd_sidp = sp->s_sidp; + stp->sd_pgidp = sp->s_sidp; +} + +int +strctty(stdata_t *stp) +{ + sess_t *sp; + proc_t *p = curproc; + boolean_t got_sig = B_FALSE; + + /* + * We are going to try to make stp the default ctty for the session + * associated with curproc. Not only does this require holding a + * bunch of locks but it also requires waiting for any outstanding + * holds on the session structure (aquired via tty_hold()) to be + * released. Hence, we have the following for(;;) loop that will + * aquire our locks, do some sanity checks, and wait for the hold + * count on the session structure to hit zero. If we get a signal + * while waiting for outstanding holds to be released then we abort + * the operation and return. + */ + for (;;) { + mutex_enter(&stp->sd_lock); /* protects sd_pgidp/sd_sidp */ + mutex_enter(&pidlock); /* protects p_pidp */ + mutex_enter(&p->p_splock); /* protects p_sessp */ + sp = p->p_sessp; + mutex_enter(&sp->s_lock); /* protects sp->* */ + + if (((stp->sd_flag & (STRHUP|STRDERR|STWRERR|STPLEX)) != 0) || + (stp->sd_sidp != NULL) || /* stp already ctty? */ + (p->p_pidp != sp->s_sidp) || /* we're not leader? */ + (sp->s_vp != NULL)) { /* session has ctty? */ + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); + mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); + return (ENOTTY); + } + + /* sanity check. 
we can't be exiting right now */ + ASSERT(!sp->s_exit); + + /* + * If no one else has a hold on this session structure + * then we now have exclusive access to it, so break out + * of this loop and update the session structure. + */ + if (sp->s_cnt == 0) + break; + + /* need to hold the session so it can't be freed */ + sp->s_ref++; - pp->p_sessp = sp; + /* ain't locking order fun? */ + mutex_exit(&p->p_splock); + mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); - pgjoin(pp, pp->p_pidp); - mutex_exit(&pp->p_lock); + if (!cv_wait_sig(&sp->s_cnt_cv, &sp->s_lock)) + got_sig = B_TRUE; + mutex_exit(&sp->s_lock); + sess_rele(sp, B_FALSE); - PID_HOLD(sp->s_sidp); + if (got_sig) + return (EINTR); + } + + /* set the session ctty bindings */ + sess_ctty_set(p, sp, stp); + + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); + return (0); } -void -freectty(sess_t *sp) +/* + * freectty_lock() attempts to aquire the army of locks required to free + * the ctty associated with a given session leader process. If it returns + * successfully the following locks will be held: + * sd_lock, pidlock, p_splock, s_lock + * + * as a secondary bit of convience, freectty_lock() will also return + * pointers to the session, ctty, and ctty stream associated with the + * specified session leader process. + */ +static boolean_t +freectty_lock(proc_t *p, sess_t **spp, vnode_t **vpp, stdata_t **stpp, + boolean_t at_exit) { - vnode_t *vp = sp->s_vp; - cred_t *cred = sp->s_cred; + sess_t *sp; + vnode_t *vp; + stdata_t *stp; - strfreectty(vp->v_stream); + mutex_enter(&pidlock); /* protect p_pidp */ + mutex_enter(&p->p_splock); /* protect p->p_sessp */ + sp = p->p_sessp; + mutex_enter(&sp->s_lock); /* protect sp->* */ - mutex_enter(&sp->s_lock); - while (sp->s_cnt > 0) { - cv_wait(&sp->s_wait_cv, &sp->s_lock); + if ((sp->s_sidp != p->p_pidp) || /* we're not leader? */ + (sp->s_vp == NULL)) { /* no ctty? 
*/ + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); + mutex_exit(&pidlock); + return (B_FALSE); + } + + vp = sp->s_vp; + stp = sp->s_vp->v_stream; + + if (at_exit) { + /* stop anyone else calling tty_hold() */ + sp->s_exit = B_TRUE; + } else { + /* + * due to locking order we have to grab stp->sd_lock before + * grabbing all the other proc/session locks. but after we + * drop all our current locks it's possible that someone + * could come in and change our current session or close + * the current ctty (vp) there by making sp or stp invalid. + * (a VN_HOLD on vp won't protect stp because that only + * prevents the vnode from being freed not closed.) so + * to prevent this we bump s_ref and s_cnt here. + * + * course this doesn't matter if we're the last thread in + * an exiting process that is the session leader, since no + * one else can change our session or free our ctty. + */ + sp->s_ref++; /* hold the session structure */ + sp->s_cnt++; /* protect vp and stp */ + } + + /* drop our session locks */ + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); + mutex_exit(&pidlock); + + /* grab locks in the right order */ + mutex_enter(&stp->sd_lock); /* protects sd_pgidp/sd_sidp */ + mutex_enter(&pidlock); /* protect p_pidp */ + mutex_enter(&p->p_splock); /* protects p->p_sessp */ + mutex_enter(&sp->s_lock); /* protects sp->* */ + + /* if the session has changed, abort mission */ + if (sp != p->p_sessp) { + /* + * this can't happen during process exit since we're the + * only thread in the process and we sure didn't change + * our own session at this point. + */ + ASSERT(!at_exit); + + /* release our locks and holds */ + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); + mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); + tty_rele(sp); + return (B_FALSE); } - ASSERT(sp->s_cnt == 0); - ASSERT(vp->v_count >= 1); - sp->s_vp = NULL; - sp->s_cred = NULL; /* - * It is possible for the VOP_CLOSE below to call stralloctty() - * and reallocate a new tty vnode. 
To prevent that the - * session is marked as closing here. + * sanity checks. none of this should have changed since we had + * holds on the current ctty. */ + ASSERT(sp->s_sidp == p->p_pidp); /* we're the leader */ + ASSERT(sp->s_vp != NULL); /* a ctty exists */ + ASSERT(vp == sp->s_vp); + ASSERT(stp == sp->s_vp->v_stream); + + /* release our holds */ + if (!at_exit) { + if ((--(sp)->s_cnt) == 0) + cv_broadcast(&sp->s_cnt_cv); + sp->s_ref--; + ASSERT(sp->s_ref > 0); + } + + /* return our pointers */ + *spp = sp; + *vpp = vp; + *stpp = stp; - sp->s_flag = SESS_CLOSE; + return (B_TRUE); +} + +/* + * Returns B_FALSE if no signal is sent to the process group associated with + * this ctty. Returns B_TRUE if a signal is sent to the process group. + * If it return B_TRUE it also means that all the locks we were holding + * were dropped so that we could send the signal. + */ +static boolean_t +freectty_signal(proc_t *p, sess_t *sp, stdata_t *stp, boolean_t at_exit) +{ + /* Assert that we hold all the necessary locks. */ + ASSERT(MUTEX_HELD(&stp->sd_lock) && MUTEX_HELD(&pidlock) && + MUTEX_HELD(&p->p_splock) && MUTEX_HELD(&sp->s_lock)); + + /* check if we already signaled this group */ + if (sp->s_sighuped) + return (B_FALSE); + + sp->s_sighuped = B_TRUE; + + if (!at_exit) { + /* + * once again, we're about to drop our army of locks and we + * don't want sp or stp to be freed. 
(see the comment in + * freectty_lock()) + */ + sp->s_ref++; /* hold the session structure */ + sp->s_cnt++; /* protect vp and stp */ + } + + /* can't hold these locks while calling pgsignal() */ mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); + mutex_exit(&pidlock); + + /* signal anyone in the foreground process group */ + pgsignal(stp->sd_pgidp, SIGHUP); + + /* signal anyone blocked in poll on this stream */ + if (!(stp->sd_flag & STRHUP)) + strhup(stp); + + mutex_exit(&stp->sd_lock); + + /* release our holds */ + if (!at_exit) + tty_rele(sp); + + return (B_TRUE); +} + +int +freectty(boolean_t at_exit) +{ + proc_t *p = curproc; + stdata_t *stp; + vnode_t *vp; + cred_t *cred; + sess_t *sp; + struct pid *pgidp, *sidp; + boolean_t got_sig = B_FALSE; /* - * This will be the only thread with access to - * this vnode, from this point on. + * If the current process is a session leader we are going to + * try to release the ctty associated our current session. To + * do this we need to aquire a bunch of locks, signal any + * processes in the forground that are associated with the ctty, + * and make sure no one has any outstanding holds on the current + * session * structure (aquired via tty_hold()). Hence, we have + * the following for(;;) loop that will do all this work for + * us and break out when the hold count on the session structure + * hits zero. */ + for (;;) { + if (!freectty_lock(p, &sp, &vp, &stp, at_exit)) + return (EIO); + + if (freectty_signal(p, sp, stp, at_exit)) { + /* loop around to re-aquire locks */ + continue; + } + + /* + * Only a session leader process can free a ctty. So if + * we've made it here we know we're a session leader and + * if we're not actively exiting it impossible for another + * thread in this process to be exiting. (Because that + * thread would have already stopped all other threads + * in the current process.) 
+ */ + ASSERT(at_exit || !sp->s_exit); + + /* + * If no one else has a hold on this session structure + * then we now have exclusive access to it, so break out + * of this loop and update the session structure. + */ + if (sp->s_cnt == 0) + break; + + if (!at_exit) { + /* need to hold the session so it can't be freed */ + sp->s_ref++; + } + + /* ain't locking order fun? */ + mutex_exit(&p->p_splock); + mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); + + if (at_exit) { + /* + * if we're exiting then we can't allow this operation + * to fail so we do a cw_wait() instead of a + * cv_wait_sig(). if there are threads with active + * holds on this ctty that are blocked, then + * they should only be blocked in a cv_wait_sig() + * and hopefully they were in the foreground process + * group and recieved the SIGHUP we sent above. of + * course it's possible that they weren't in the + * foreground process group and didn't get our + * signal (or they could be stopped by job control + * in which case our signal wouldn't matter until + * they are restarted). in this case we won't + * exit until someone else sends them a signal. 
+ */ + cv_wait(&sp->s_cnt_cv, &sp->s_lock); + mutex_exit(&sp->s_lock); + continue; + } + + if (!cv_wait_sig(&sp->s_cnt_cv, &sp->s_lock)) { + got_sig = B_TRUE; + } + + mutex_exit(&sp->s_lock); + sess_rele(sp, B_FALSE); + + if (got_sig) + return (EINTR); + } + ASSERT(sp->s_cnt == 0); + /* save some pointers for later */ + cred = sp->s_cred; + pgidp = stp->sd_pgidp; + sidp = stp->sd_sidp; + + /* clear the session ctty bindings */ + sess_ctty_clear(sp, stp); + + /* wake up anyone blocked in tty_hold() */ + if (at_exit) { + ASSERT(sp->s_exit); + sp->s_exit = B_FALSE; + cv_broadcast(&sp->s_exit_cv); + } + + /* we can drop these locks now */ + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); + mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); + + /* This is the only remaining thread with access to this vnode */ (void) VOP_CLOSE(vp, 0, 1, (offset_t)0, cred); VN_RELE(vp); - crfree(cred); + + /* release our holds on assorted structures and return */ + mutex_enter(&pidlock); + PID_RELE(pgidp); + PID_RELE(sidp); + mutex_exit(&pidlock); + + return (1); } /* @@ -169,23 +680,29 @@ vhangup(void) dev_t cttydev(proc_t *pp) { - sess_t *sp = pp->p_sessp; + sess_t *sp; + dev_t dev; + + mutex_enter(&pp->p_splock); /* protects p->p_sessp */ + sp = pp->p_sessp; + +#ifdef DEBUG + mutex_enter(&sp->s_lock); /* protects sp->* */ if (sp->s_vp == NULL) - return (NODEV); - return (sp->s_dev); + ASSERT(sp->s_dev == NODEV); + else + ASSERT(sp->s_dev != NODEV); + mutex_exit(&sp->s_lock); +#endif /* DEBUG */ + + dev = sp->s_dev; + mutex_exit(&pp->p_splock); + return (dev); } void -alloctty(proc_t *pp, vnode_t *vp) +ctty_clear_sighuped(void) { - sess_t *sp = pp->p_sessp; - cred_t *crp; - - sp->s_vp = vp; - sp->s_dev = vp->v_rdev; - - mutex_enter(&pp->p_crlock); - crhold(crp = pp->p_cred); - mutex_exit(&pp->p_crlock); - sp->s_cred = crp; + ASSERT(MUTEX_HELD(&pidlock) || MUTEX_HELD(&curproc->p_splock)); + curproc->p_sessp->s_sighuped = B_FALSE; } diff --git a/usr/src/uts/common/os/streamio.c 
b/usr/src/uts/common/os/streamio.c index ffa676604f..e189a1627d 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -77,6 +77,19 @@ #include <sys/autoconf.h> #include <sys/policy.h> + +/* + * This define helps improve the readability of streams code while + * still maintaining a very old streams performance enhancement. The + * performance enhancement basically involved having all callers + * of straccess() perform the first check that straccess() will do + * locally before actually calling straccess(). (There by reducing + * the number of unnecessary calls to straccess().) + */ +#define i_straccess(x, y) ((stp->sd_sidp == NULL) ? 0 : \ + (stp->sd_vnode->v_type == VFIFO) ? 0 : \ + straccess((x), (y))) + /* * what is mblk_pull_len? * @@ -1095,11 +1108,13 @@ strread(struct vnode *vp, struct uio *uiop, cred_t *crp) ASSERT(vp->v_stream); stp = vp->v_stream; - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) - if (error = straccess(stp, JCREAD)) - return (error); - mutex_enter(&stp->sd_lock); + + if ((error = i_straccess(stp, JCREAD)) != 0) { + mutex_exit(&stp->sd_lock); + return (error); + } + if (stp->sd_flag & (STRDERR|STPLEX)) { error = strgeterr(stp, STRDERR|STPLEX, 0); if (error != 0) { @@ -1161,12 +1176,8 @@ strread(struct vnode *vp, struct uio *uiop, cred_t *crp) } TRACE_3(TR_FAC_STREAMS_FR, TR_STRREAD_AWAKE, "strread awakes:%p, %p, %p", vp, uiop, crp); - if (stp->sd_sidp != NULL && - stp->sd_vnode->v_type != VFIFO) { - mutex_exit(&stp->sd_lock); - if (error = straccess(stp, JCREAD)) - goto oops1; - mutex_enter(&stp->sd_lock); + if ((error = i_straccess(stp, JCREAD)) != 0) { + goto oops; } first = 0; } @@ -2026,8 +2037,8 @@ strrput_nondata(queue_t *q, mblk_t *bp) cv_broadcast(&q->q_wait); /* the readers */ cv_broadcast(&_WR(q)->q_wait); /* the writers */ cv_broadcast(&stp->sd_monitor); /* the ioctllers */ - mutex_exit(&stp->sd_lock); strhup(stp); + mutex_exit(&stp->sd_lock); return (0); case M_UNHANGUP: @@ -2665,18 
+2676,23 @@ strwrite_common(struct vnode *vp, struct uio *uiop, cred_t *crp, int wflag) ASSERT(vp->v_stream); stp = vp->v_stream; - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) - if ((error = straccess(stp, JCWRITE)) != 0) - return (error); + mutex_enter(&stp->sd_lock); + + if ((error = i_straccess(stp, JCWRITE)) != 0) { + mutex_exit(&stp->sd_lock); + return (error); + } if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { - mutex_enter(&stp->sd_lock); error = strwriteable(stp, B_TRUE, B_TRUE); - mutex_exit(&stp->sd_lock); - if (error != 0) + if (error != 0) { + mutex_exit(&stp->sd_lock); return (error); + } } + mutex_exit(&stp->sd_lock); + wqp = stp->sd_wrq; /* get these values from them cached in the stream head */ @@ -2778,11 +2794,11 @@ strwrite_common(struct vnode *vp, struct uio *uiop, cred_t *crp, int wflag) } TRACE_1(TR_FAC_STREAMS_FR, TR_STRWRITE_WAKE, "strwrite wake:q %p awakes", wqp); + if ((error = i_straccess(stp, JCWRITE)) != 0) { + mutex_exit(&stp->sd_lock); + goto out; + } mutex_exit(&stp->sd_lock); - if (stp->sd_sidp != NULL && - stp->sd_vnode->v_type != VFIFO) - if (error = straccess(stp, JCWRITE)) - goto out; } waitflag |= NOINTR; TRACE_2(TR_FAC_STREAMS_FR, TR_STRWRITE_RESID, @@ -3101,6 +3117,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } @@ -3162,10 +3179,12 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, if (cmd == SRIOCSREDIR || cmd == SRIOCISREDIR) return (EINVAL); - if (access != -1 && stp->sd_sidp != NULL && - stp->sd_vnode->v_type != VFIFO) - if (error = straccess(stp, access)) - return (error); + mutex_enter(&stp->sd_lock); + if ((access != -1) && ((error = i_straccess(stp, access)) != 0)) { + mutex_exit(&stp->sd_lock); + return (error); + } + mutex_exit(&stp->sd_lock); /* * Check for sgttyb-related ioctls first, and complain as @@ -3307,11 +3326,16 @@ strioctl(struct vnode *vp, int cmd, intptr_t 
arg, int flag, int copyflag, secpolicy_sti(crp) != 0) { return (EPERM); } - if (stp->sd_sidp != - ttoproc(curthread)->p_sessp->s_sidp && + mutex_enter(&stp->sd_lock); + mutex_enter(&curproc->p_splock); + if (stp->sd_sidp != curproc->p_sessp->s_sidp && secpolicy_sti(crp) != 0) { + mutex_exit(&curproc->p_splock); + mutex_exit(&stp->sd_lock); return (EACCES); } + mutex_exit(&curproc->p_splock); + mutex_exit(&stp->sd_lock); strioc.ic_len = sizeof (char); strioc.ic_dp = (char *)arg; @@ -3445,10 +3469,13 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, return (EINVAL); access = job_control_type(strioc.ic_cmd); - if (access != -1 && stp->sd_sidp != NULL && - stp->sd_vnode->v_type != VFIFO && - (error = straccess(stp, access)) != 0) + mutex_enter(&stp->sd_lock); + if ((access != -1) && + ((error = i_straccess(stp, access)) != 0)) { + mutex_exit(&stp->sd_lock); return (error); + } + mutex_exit(&stp->sd_lock); /* * The I_STR facility provides a trap door for malicious @@ -3699,7 +3726,7 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, /* * try to allocate it as a controlling terminal */ - stralloctty(stp); + (void) strctty(stp); } } @@ -5053,15 +5080,11 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, releasef(STRUCT_FGET(strfdinsert, fildes)); return (error); } - if (stp->sd_sidp != NULL && - stp->sd_vnode->v_type != VFIFO) { + if ((error = i_straccess(stp, access)) != 0) { mutex_exit(&stp->sd_lock); - if (error = straccess(stp, access)) { - releasef( - STRUCT_FGET(strfdinsert, fildes)); - return (error); - } - mutex_enter(&stp->sd_lock); + releasef( + STRUCT_FGET(strfdinsert, fildes)); + return (error); } } mutex_exit(&stp->sd_lock); @@ -5144,12 +5167,9 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, mutex_exit(&stp->sd_lock); return (error); } - if (stp->sd_sidp != NULL && - stp->sd_vnode->v_type != VFIFO) { + if ((error = i_straccess(stp, access)) != 0) { 
mutex_exit(&stp->sd_lock); - if (error = straccess(stp, access)) - return (error); - mutex_enter(&stp->sd_lock); + return (error); } } if (mp->b_datap->db_type != M_PASSFP) { @@ -5446,13 +5466,13 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, { pid_t sid; - mutex_enter(&pidlock); + mutex_enter(&stp->sd_lock); if (stp->sd_sidp == NULL) { - mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); return (ENOTTY); } sid = stp->sd_sidp->pid_id; - mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); return (strcopyout(&sid, (void *)arg, sizeof (pid_t), copyflag)); } @@ -5494,6 +5514,7 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, bg_pgid = stp->sd_pgidp->pid_id; CL_SET_PROCESS_GROUP(curthread, sid, bg_pgid, fg_pgid); PID_RELE(stp->sd_pgidp); + ctty_clear_sighuped(); stp->sd_pgidp = q->p_pgidp; PID_HOLD(stp->sd_pgidp); mutex_exit(&pidlock); @@ -5505,17 +5526,30 @@ strioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, int copyflag, { pid_t pgrp; - mutex_enter(&pidlock); + mutex_enter(&stp->sd_lock); if (stp->sd_sidp == NULL) { - mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); return (ENOTTY); } pgrp = stp->sd_pgidp->pid_id; - mutex_exit(&pidlock); + mutex_exit(&stp->sd_lock); return (strcopyout(&pgrp, (void *)arg, sizeof (pid_t), copyflag)); } + case TIOCSCTTY: + { + return (strctty(stp)); + } + + case TIOCNOTTY: + { + /* freectty() always assumes curproc. 
*/ + if (freectty(B_FALSE) != 0) + return (0); + return (ENOTTY); + } + case FIONBIO: case FIOASYNC: return (0); /* handled by the upper layer */ @@ -6233,18 +6267,21 @@ strgetmsg( stp = vp->v_stream; rvp->r_val1 = 0; - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) - if (error = straccess(stp, JCREAD)) - return (error); + mutex_enter(&stp->sd_lock); + + if ((error = i_straccess(stp, JCREAD)) != 0) { + mutex_exit(&stp->sd_lock); + return (error); + } - /* Fast check of flags before acquiring the lock */ if (stp->sd_flag & (STRDERR|STPLEX)) { - mutex_enter(&stp->sd_lock); error = strgeterr(stp, STRDERR|STPLEX, 0); - mutex_exit(&stp->sd_lock); - if (error != 0) + if (error != 0) { + mutex_exit(&stp->sd_lock); return (error); + } } + mutex_exit(&stp->sd_lock); switch (*flagsp) { case MSG_HIPRI: @@ -6381,11 +6418,9 @@ strgetmsg( } TRACE_2(TR_FAC_STREAMS_FR, TR_STRGETMSG_AWAKE, "strgetmsg awakes:%p, %p", vp, uiop); - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) { + if ((error = i_straccess(stp, JCREAD)) != 0) { mutex_exit(&stp->sd_lock); - if (error = straccess(stp, JCREAD)) - return (error); - mutex_enter(&stp->sd_lock); + return (error); } first = 0; } @@ -6797,23 +6832,26 @@ kstrgetmsg( stp = vp->v_stream; rvp->r_val1 = 0; - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) - if (error = straccess(stp, JCREAD)) - return (error); + mutex_enter(&stp->sd_lock); + + if ((error = i_straccess(stp, JCREAD)) != 0) { + mutex_exit(&stp->sd_lock); + return (error); + } flags = *flagsp; - /* Fast check of flags before acquiring the lock */ if (stp->sd_flag & (STRDERR|STPLEX)) { if ((stp->sd_flag & STPLEX) || (flags & (MSG_IGNERROR|MSG_DELAYERROR)) == 0) { - mutex_enter(&stp->sd_lock); error = strgeterr(stp, STRDERR|STPLEX, (flags & MSG_IPEEK)); - mutex_exit(&stp->sd_lock); - if (error != 0) + if (error != 0) { + mutex_exit(&stp->sd_lock); return (error); + } } } + mutex_exit(&stp->sd_lock); switch (flags & (MSG_HIPRI|MSG_ANY|MSG_BAND)) { 
case MSG_HIPRI: @@ -6955,11 +6993,9 @@ retry: } TRACE_2(TR_FAC_STREAMS_FR, TR_KSTRGETMSG_AWAKE, "kstrgetmsg awakes:%p, %p", vp, uiop); - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) { + if ((error = i_straccess(stp, JCREAD)) != 0) { mutex_exit(&stp->sd_lock); - if (error = straccess(stp, JCREAD)) - return (error); - mutex_enter(&stp->sd_lock); + return (error); } first = 0; } @@ -7430,18 +7466,23 @@ strputmsg( audit_strputmsg(vp, mctl, mdata, pri, flag, fmode); #endif - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) - if (error = straccess(stp, JCWRITE)) - return (error); + mutex_enter(&stp->sd_lock); + + if ((error = i_straccess(stp, JCWRITE)) != 0) { + mutex_exit(&stp->sd_lock); + return (error); + } if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { - mutex_enter(&stp->sd_lock); error = strwriteable(stp, B_FALSE, xpg4); - mutex_exit(&stp->sd_lock); - if (error != 0) + if (error != 0) { + mutex_exit(&stp->sd_lock); return (error); + } } + mutex_exit(&stp->sd_lock); + /* * Check for legal flag value. 
*/ @@ -7561,10 +7602,11 @@ strputmsg( } TRACE_1(TR_FAC_STREAMS_FR, TR_STRPUTMSG_WAKE, "strputmsg wake:stp %p wakes", stp); + if ((error = i_straccess(stp, JCWRITE)) != 0) { + mutex_exit(&stp->sd_lock); + return (error); + } mutex_exit(&stp->sd_lock); - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) - if (error = straccess(stp, JCWRITE)) - return (error); } out: /* @@ -7617,25 +7659,27 @@ kstrputmsg( if (mctl == NULL) return (EINVAL); - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) { - if (error = straccess(stp, JCWRITE)) { - freemsg(mctl); - return (error); - } + mutex_enter(&stp->sd_lock); + + if ((error = i_straccess(stp, JCWRITE)) != 0) { + mutex_exit(&stp->sd_lock); + freemsg(mctl); + return (error); } if ((stp->sd_flag & STPLEX) || !(flag & MSG_IGNERROR)) { if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { - mutex_enter(&stp->sd_lock); error = strwriteable(stp, B_FALSE, B_TRUE); - mutex_exit(&stp->sd_lock); if (error != 0) { + mutex_exit(&stp->sd_lock); freemsg(mctl); return (error); } } } + mutex_exit(&stp->sd_lock); + /* * Check for legal flag value. 
*/ @@ -7804,13 +7848,12 @@ kstrputmsg( } TRACE_1(TR_FAC_STREAMS_FR, TR_KSTRPUTMSG_WAKE, "kstrputmsg wake:stp %p wakes", stp); - mutex_exit(&stp->sd_lock); - if (stp->sd_sidp != NULL && stp->sd_vnode->v_type != VFIFO) { - if (error = straccess(stp, JCWRITE)) { - freemsg(mctl); - return (error); - } + if ((error = i_straccess(stp, JCWRITE)) != 0) { + mutex_exit(&stp->sd_lock); + freemsg(mctl); + return (error); } + mutex_exit(&stp->sd_lock); } out: freemsg(mctl); diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 57a918a3f0..ae99e5198a 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -3107,13 +3107,18 @@ straccess(struct stdata *stp, enum jcaccess mode) proc_t *p = ttoproc(t); sess_t *sp; + ASSERT(mutex_owned(&stp->sd_lock)); + if (stp->sd_sidp == NULL || stp->sd_vnode->v_type == VFIFO) return (0); - mutex_enter(&p->p_lock); - sp = p->p_sessp; + mutex_enter(&p->p_lock); /* protects p_pgidp */ for (;;) { + mutex_enter(&p->p_splock); /* protects p->p_sessp */ + sp = p->p_sessp; + mutex_enter(&sp->s_lock); /* protects sp->* */ + /* * If this is not the calling process's controlling terminal * or if the calling process is already in the foreground @@ -3121,6 +3126,8 @@ straccess(struct stdata *stp, enum jcaccess mode) */ if (sp->s_dev != stp->sd_vnode->v_rdev || p->p_pgidp == stp->sd_pgidp) { + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); mutex_exit(&p->p_lock); return (0); } @@ -3131,10 +3138,15 @@ straccess(struct stdata *stp, enum jcaccess mode) if (sp->s_vp == NULL) { if (!cantsend(p, t, SIGHUP)) sigtoproc(p, t, SIGHUP); + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); mutex_exit(&p->p_lock); return (EIO); } + mutex_exit(&sp->s_lock); + mutex_exit(&p->p_splock); + if (mode == JCGETP) { mutex_exit(&p->p_lock); return (0); @@ -3146,7 +3158,9 @@ straccess(struct stdata *stp, enum jcaccess mode) return (EIO); } mutex_exit(&p->p_lock); + mutex_exit(&stp->sd_lock); pgsignal(p->p_pgidp, 
SIGTTIN); + mutex_enter(&stp->sd_lock); mutex_enter(&p->p_lock); } else { /* mode == JCWRITE or JCSETP */ if ((mode == JCWRITE && !(stp->sd_flag & STRTOSTOP)) || @@ -3159,7 +3173,9 @@ straccess(struct stdata *stp, enum jcaccess mode) return (EIO); } mutex_exit(&p->p_lock); + mutex_exit(&stp->sd_lock); pgsignal(p->p_pgidp, SIGTTOU); + mutex_enter(&stp->sd_lock); mutex_enter(&p->p_lock); } @@ -3174,10 +3190,15 @@ straccess(struct stdata *stp, enum jcaccess mode) * We can't get here if the signal is ignored or * if the current thread is blocking the signal. */ + mutex_exit(&stp->sd_lock); if (!cv_wait_sig_swap(&lbolt_cv, &p->p_lock)) { mutex_exit(&p->p_lock); + mutex_enter(&stp->sd_lock); return (EINTR); } + mutex_exit(&p->p_lock); + mutex_enter(&stp->sd_lock); + mutex_enter(&p->p_lock); } } @@ -4001,59 +4022,12 @@ strsignal(stdata_t *stp, int sig, int32_t band) void strhup(stdata_t *stp) { + ASSERT(mutex_owned(&stp->sd_lock)); pollwakeup(&stp->sd_pollist, POLLHUP); - mutex_enter(&stp->sd_lock); if (stp->sd_sigflags & S_HANGUP) strsendsig(stp->sd_siglist, S_HANGUP, 0, 0); - mutex_exit(&stp->sd_lock); -} - -void -stralloctty(stdata_t *stp) -{ - proc_t *p = curproc; - sess_t *sp = p->p_sessp; - - mutex_enter(&stp->sd_lock); - /* - * No need to hold the session lock or do a TTY_HOLD() because - * this is the only thread that can be the session leader and not - * have a controlling tty. 
- */ - if ((stp->sd_flag & - (STRHUP|STRDERR|STWRERR|STPLEX|STRISTTY)) == STRISTTY && - stp->sd_sidp == NULL && /* not allocated as ctty */ - sp->s_sidp == p->p_pidp && /* session leader */ - sp->s_flag != SESS_CLOSE && /* session is not closing */ - sp->s_vp == NULL) { /* without ctty */ - ASSERT(stp->sd_pgidp == NULL); - alloctty(p, makectty(stp->sd_vnode)); - - mutex_enter(&pidlock); - stp->sd_sidp = sp->s_sidp; - stp->sd_pgidp = sp->s_sidp; - PID_HOLD(stp->sd_pgidp); - PID_HOLD(stp->sd_sidp); - mutex_exit(&pidlock); - } - mutex_exit(&stp->sd_lock); } -void -strfreectty(stdata_t *stp) -{ - mutex_enter(&stp->sd_lock); - pgsignal(stp->sd_pgidp, SIGHUP); - mutex_enter(&pidlock); - PID_RELE(stp->sd_pgidp); - PID_RELE(stp->sd_sidp); - stp->sd_pgidp = NULL; - stp->sd_sidp = NULL; - mutex_exit(&pidlock); - mutex_exit(&stp->sd_lock); - if (!(stp->sd_flag & STRHUP)) - strhup(stp); -} /* * Backenable the first queue upstream from `q' with a service procedure. */ diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 80761e102c..8211e23d01 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -51,6 +51,7 @@ int access(); int alarm(); int auditsys(); +int64_t brandsys(); int brk(); int chdir(); int chmod(); @@ -131,6 +132,8 @@ int unlink(); int utime(); int64_t utssys32(); int64_t utssys64(); +int uucopy(); +ssize_t uucopystr(); int64_t wait(); ssize_t write(); ssize_t readv(); @@ -473,7 +476,7 @@ struct sysent sysent[NSYSCALL] = SYSENT_NOSYS(), SYSENT_CI("fstatfs", fstatfs32, 4)), /* 39 */ SYSENT_CI("setpgrp", setpgrp, 3), - /* 40 */ SYSENT_LOADABLE(), /* (was cxenix) */ + /* 40 */ SYSENT_CI("uucopystr", uucopystr, 3), /* 41 */ SYSENT_CI("dup", dup, 1), /* 42 */ SYSENT_LOADABLE(), /* (was pipe ) */ /* 43 */ SYSENT_CL("times", times, 1), @@ -658,7 +661,7 @@ struct sysent sysent[NSYSCALL] = SYSENT_NOSYS(), SYSENT_C("llseek", llseek32, 4)), /* 176 */ SYSENT_LOADABLE(), /* inst_sync */ - /* 177 */ SYSENT_LOADABLE(), /* 
(was srmlimitsys) */ + /* 177 */ SYSENT_CI("brandsys", brandsys, 6), /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -770,7 +773,7 @@ struct sysent sysent[NSYSCALL] = /* 251 */ SYSENT_CI("lwp_mutex_trylock", lwp_mutex_trylock, 1), /* 252 */ SYSENT_CI("lwp_mutex_init", lwp_mutex_init, 2), /* 253 */ SYSENT_CI("cladm", cladm, 3), - /* 254 */ SYSENT_LOADABLE(), /* (was lwp_sigtimedwait) */ + /* 254 */ SYSENT_CI("uucopy", uucopy, 3), /* 255 */ SYSENT_CI("umount2", umount2, 2) /* ONC_PLUS EXTRACT START */ }; @@ -876,7 +879,7 @@ struct sysent sysent32[NSYSCALL] = /* 37 */ SYSENT_CI("kill", kill, 2), /* 38 */ SYSENT_CI("fstatfs", fstatfs32, 4), /* 39 */ SYSENT_CI("setpgrp", setpgrp, 3), - /* 40 */ SYSENT_LOADABLE32(), /* (was cxenix) */ + /* 40 */ SYSENT_CI("uucopystr", uucopystr, 3), /* 41 */ SYSENT_CI("dup", dup, 1), /* 42 */ SYSENT_LOADABLE32(), /* (was pipe ) */ /* 43 */ SYSENT_CI("times", times32, 1), @@ -1036,7 +1039,7 @@ struct sysent sysent32[NSYSCALL] = /* 174 */ SYSENT_CI("pwrite", pwrite32, 4), /* 175 */ SYSENT_C("llseek", llseek32, 4), /* 176 */ SYSENT_LOADABLE32(), /* inst_sync */ - /* 177 */ SYSENT_LOADABLE32(), /* srmlimitsys */ + /* 177 */ SYSENT_CI("brandsys", brandsys, 6), /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1116,7 +1119,7 @@ struct sysent sysent32[NSYSCALL] = /* 251 */ SYSENT_CI("lwp_mutex_trylock", lwp_mutex_trylock, 1), /* 252 */ SYSENT_CI("lwp_mutex_init", lwp_mutex_init, 2), /* 253 */ SYSENT_CI("cladm", cladm, 3), - /* 254 */ SYSENT_LOADABLE32(), /* (was lwp_sigtimedwait) */ + /* 254 */ SYSENT_CI("uucopy", uucopy, 3), /* 255 */ SYSENT_CI("umount2", umount2, 2) /* ONC_PLUS EXTRACT START */ }; diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 6a5c9243b3..9fd6b423bd 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -228,6 
+228,7 @@ #include <sys/nvpair.h> #include <sys/rctl.h> #include <sys/fss.h> +#include <sys/brand.h> #include <sys/zone.h> #include <sys/tsol/label.h> @@ -330,7 +331,6 @@ static kmutex_t mount_lock; const char * const zone_default_initname = "/sbin/init"; static char * const zone_prefix = "/zone/"; - static int zone_shutdown(zoneid_t zoneid); /* @@ -1223,6 +1223,8 @@ zone_init(void) zone0.zone_nlwps = p0.p_lwpcnt; zone0.zone_ntasks = 1; mutex_exit(&p0.p_lock); + zone0.zone_restart_init = B_TRUE; + zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* * pool_default hasn't been initialized yet, so we let pool_init() take @@ -2330,33 +2332,40 @@ void zone_start_init(void) { proc_t *p = ttoproc(curthread); + zone_t *z = p->p_zone; ASSERT(!INGLOBALZONE(curproc)); /* + * For all purposes (ZONE_ATTR_INITPID and restart_init), + * storing just the pid of init is sufficient. + */ + z->zone_proc_initpid = p->p_pid; + + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. */ p->p_zone->zone_boot_err = start_init_common(); mutex_enter(&zone_status_lock); - if (p->p_zone->zone_boot_err != 0) { + if (z->zone_boot_err != 0) { /* * Make sure we are still in the booting state-- we could have * raced and already be shutting down, or even further along. 
*/ - if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) - zone_status_set(p->p_zone, ZONE_IS_SHUTTING_DOWN); + if (zone_status_get(z) == ZONE_IS_BOOTING) + zone_status_set(z, ZONE_IS_SHUTTING_DOWN); mutex_exit(&zone_status_lock); /* It's gone bad, dispose of the process */ - if (proc_exit(CLD_EXITED, p->p_zone->zone_boot_err) != 0) { + if (proc_exit(CLD_EXITED, z->zone_boot_err) != 0) { mutex_enter(&p->p_lock); ASSERT(p->p_flag & SEXITLWPS); lwp_exit(); } } else { - if (zone_status_get(p->p_zone) == ZONE_IS_BOOTING) - zone_status_set(p->p_zone, ZONE_IS_RUNNING); + if (zone_status_get(z) == ZONE_IS_BOOTING) + zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); /* cause the process to return to userland. */ lwp_rtt(); @@ -2939,6 +2948,9 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_psetid = ZONE_PS_INVAL; zone->zone_ncpus = 0; zone->zone_ncpus_online = 0; + zone->zone_restart_init = B_TRUE; + zone->zone_brand = &native_brand; + zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); @@ -3464,6 +3476,9 @@ zone_shutdown(zoneid_t zoneid) zone_rele(zone); return (set_errno(EINTR)); } + + brand_unregister_zone(zone->zone_brand); + zone_rele(zone); return (0); } @@ -3771,6 +3786,18 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) copyout(&initpid, buf, bufsize) != 0) error = EFAULT; break; + case ZONE_ATTR_BRAND: + size = strlen(zone->zone_brand->b_name) + 1; + + if (bufsize > size) + bufsize = size; + if (buf != NULL) { + err = copyoutstr(zone->zone_brand->b_name, buf, + bufsize, NULL); + if (err != 0 && err != ENAMETOOLONG) + error = EFAULT; + } + break; case ZONE_ATTR_INITNAME: size = strlen(zone->zone_initname) + 1; if (bufsize > size) @@ -3797,7 +3824,12 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } break; default: - error = EINVAL; + if 
((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { + size = bufsize; + error = ZBROP(zone)->b_getattr(zone, attr, buf, &size); + } else { + error = EINVAL; + } } zone_rele(zone); @@ -3815,6 +3847,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) { zone_t *zone; zone_status_t zone_status; + struct brand_attr *attrp; int err; if (secpolicy_zone_config(CRED()) != 0) @@ -3847,8 +3880,33 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_BOOTARGS: err = zone_set_bootargs(zone, (const char *)buf); break; + case ZONE_ATTR_BRAND: + ASSERT(!ZONE_IS_BRANDED(zone)); + err = 0; + attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP); + if ((buf == NULL) || + (copyin(buf, attrp, sizeof (struct brand_attr)) != 0)) { + kmem_free(attrp, sizeof (struct brand_attr)); + err = EFAULT; + break; + } + + if (is_system_labeled() && strncmp(attrp->ba_brandname, + NATIVE_BRAND_NAME, MAXNAMELEN) != 0) { + err = EPERM; + break; + } + + zone->zone_brand = brand_register_zone(attrp); + kmem_free(attrp, sizeof (struct brand_attr)); + if (zone->zone_brand == NULL) + err = EINVAL; + break; default: - err = EINVAL; + if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) + err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); + else + err = EINVAL; } done: @@ -4145,10 +4203,10 @@ zone_enter(zoneid_t zoneid) */ mutex_enter(&pidlock); sp = zone->zone_zsched->p_sessp; - SESS_HOLD(sp); + sess_hold(zone->zone_zsched); mutex_enter(&pp->p_lock); pgexit(pp); - SESS_RELE(pp->p_sessp); + sess_rele(pp->p_sessp, B_TRUE); pp->p_sessp = sp; pgjoin(pp, zone->zone_zsched->p_pidp); mutex_exit(&pp->p_lock); diff --git a/usr/src/uts/common/rpc/clnt_gen.c b/usr/src/uts/common/rpc/clnt_gen.c index 0093210bd5..4c557b563f 100644 --- a/usr/src/uts/common/rpc/clnt_gen.c +++ b/usr/src/uts/common/rpc/clnt_gen.c @@ -346,6 +346,11 @@ bindresvport_again: } if (!error && bound_addr) { + if (bound_addr->maxlen < ret->addr.len) { + 
kmem_free(bound_addr->buf, bound_addr->maxlen); + bound_addr->buf = kmem_zalloc(ret->addr.len, KM_SLEEP); + bound_addr->maxlen = ret->addr.len; + } bcopy(ret->addr.buf, bound_addr->buf, ret->addr.len); bound_addr->len = ret->addr.len; } diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index f6bcef9c5c..2754405b01 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -102,6 +102,7 @@ CHKHDRS= \ bofi_impl.h \ bpp_io.h \ bootstat.h \ + brand.h \ buf.h \ bufmod.h \ bustypes.h \ diff --git a/usr/src/uts/common/sys/audioio.h b/usr/src/uts/common/sys/audioio.h index 5b8152cfc5..2814eb7040 100644 --- a/usr/src/uts/common/sys/audioio.h +++ b/usr/src/uts/common/sys/audioio.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright (c) 1995-2001 by Sun Microsystems, Inc. - * All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. */ #ifndef _SYS_AUDIOIO_H @@ -209,10 +208,10 @@ typedef struct audio_info audio_info_t; * a signed int. 
*/ #define AUDIO_INITINFO(i) { \ - uint_t *__x__; \ - for (__x__ = (uint_t *)(i); \ + uint_t *__x__; \ + for (__x__ = (uint_t *)(i); \ (char *)__x__ < (((char *)(i)) + sizeof (audio_info_t)); \ - *__x__++ = ~0); \ + *__x__++ = (uint_t)~0); \ } diff --git a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h index 025d7a18e9..b9cf07f269 100644 --- a/usr/src/uts/common/sys/auxv.h +++ b/usr/src/uts/common/sys/auxv.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -24,7 +23,7 @@ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -171,6 +170,15 @@ extern uint_t getisax(uint32_t *, uint_t); #define AT_SUN_AUXFLAGS 2017 /* AF_SUN_ flags passed from the kernel */ /* + * Used to indicate to the runtime linker the name of the emulation binary, + * if one is being used. For brands, this is the name of the brand library. + */ +#define AT_SUN_EMULATOR 2018 + +#define AT_SUN_BRANDNAME 2019 +#define AT_SUN_BRAND_PHDR 2020 /* Brand executable's phdr */ + +/* * The kernel is in a better position to determine whether a process needs to * ignore dangerous LD environment variables. If set, this flags tells * ld.so.1 to run "secure" and ignore the the environment. 
@@ -183,7 +191,6 @@ extern uint_t getisax(uint32_t *, uint_t); */ #define AF_SUN_HWCAPVERIFY 0x00000002 - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/bitmap.h b/usr/src/uts/common/sys/bitmap.h index 8476ba9563..d0dd12b683 100644 --- a/usr/src/uts/common/sys/bitmap.h +++ b/usr/src/uts/common/sys/bitmap.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -124,6 +124,14 @@ extern "C" { #endif /* _LP64 */ +/* + * BIT_ONLYONESET is a private macro not designed for bitmaps of + * arbitrary size. u must be an unsigned integer/long. It returns + * true if one and only one bit is set in u. + */ +#define BIT_ONLYONESET(u) \ + ((((u) == 0) ? 0 : ((u) & ((u) - 1)) == 0)) + #if defined(_KERNEL) && !defined(_ASM) #include <sys/atomic.h> diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h new file mode 100644 index 0000000000..c4595e9641 --- /dev/null +++ b/usr/src/uts/common/sys/brand.h @@ -0,0 +1,134 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#ifndef _SYS_BRAND_H +#define _SYS_BRAND_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/proc.h> +#include <sys/exec.h> + +/* + * All Brands supported by this kernel must use BRAND_VER_1. + */ +#define BRAND_VER_1 1 + +/* + * sub-commands to brandsys. + * 1 - 128 are for common commands + * 128+ are available for brand-specific commands. + */ +#define B_REGISTER 1 +#define B_TTYMODES 2 +#define B_ELFDATA 3 +#define B_EXEC_NATIVE 4 +#define B_EXEC_BRAND 5 + +/* + * Structure used by zoneadmd to communicate the name of a brand and the + * supporting brand module into the kernel. + */ +struct brand_attr { + char ba_brandname[MAXNAMELEN]; + char ba_modname[MAXPATHLEN]; +}; + +/* What we call the native brand. */ +#define NATIVE_BRAND_NAME "native" + +#ifdef _KERNEL + +/* Root for branded zone's native binaries */ +#define NATIVE_ROOT "/native/" + +struct proc; +struct uarg; +struct brand_mach_ops; +struct intpdata; +struct execa; + +struct brand_ops { + int (*b_brandsys)(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, + uintptr_t, uintptr_t, uintptr_t); + void (*b_setbrand)(struct proc *); + int (*b_getattr)(zone_t *, int, void *, size_t *); + int (*b_setattr)(zone_t *, int, void *, size_t); + void (*b_copy_procdata)(struct proc *, struct proc *); + void (*b_proc_exit)(struct proc *, klwp_t *); + void (*b_exec)(); + void (*b_lwp_setrval)(klwp_t *, int, int); + int (*b_initlwp)(klwp_t *); + void (*b_forklwp)(klwp_t *, klwp_t *); + void (*b_freelwp)(klwp_t *); + void (*b_lwpexit)(klwp_t *); + int (*b_elfexec)(struct vnode *vp, struct execa *uap, + struct uarg *args, struct intpdata *idata, int level, + long *execsz, int setid, caddr_t exec_file, + struct cred *cred, int brand_action); +}; + +/* + * The b_version field must always be the first entry in this struct. 
+ */ +typedef struct brand { + int b_version; + char *b_name; + struct brand_ops *b_ops; + struct brand_mach_ops *b_machops; +} brand_t; + +extern brand_t native_brand; + +/* + * Convenience macros + */ +#define lwptolwpbrand(l) ((l)->lwp_brand) +#define ttolwpbrand(t) (lwptolwpbrand(ttolwp(t))) +#define PROC_IS_BRANDED(p) ((p)->p_brand != &native_brand) +#define ZONE_IS_BRANDED(z) ((z)->zone_brand != &native_brand) +#define BROP(p) ((p)->p_brand->b_ops) +#define ZBROP(z) ((z)->zone_brand->b_ops) +#define BRMOP(p) ((p)->p_brand->b_machops) + +extern void brand_init(); +extern int brand_register(brand_t *); +extern int brand_unregister(brand_t *); +extern brand_t *brand_register_zone(struct brand_attr *); +extern brand_t *brand_find_name(char *); +extern void brand_unregister_zone(brand_t *); +extern int brand_zone_count(brand_t *); +extern void brand_setbrand(proc_t *); +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BRAND_H */ diff --git a/usr/src/uts/common/sys/class.h b/usr/src/uts/common/sys/class.h index fbfbcc6080..9988ca3190 100644 --- a/usr/src/uts/common/sys/class.h +++ b/usr/src/uts/common/sys/class.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,6 +37,7 @@ #include <sys/thread.h> #include <sys/priocntl.h> #include <sys/mutex.h> +#include <sys/uio.h> #ifdef __cplusplus extern "C" { @@ -128,15 +128,15 @@ extern pri_t minclsyspri; extern id_t syscid; /* system scheduling class ID */ extern id_t defaultcid; /* "default" class id; see dispadmin(1M) */ -extern int alloc_cid(char *, id_t *); -extern int scheduler_load(char *, sclass_t *); -extern int getcid(char *, id_t *); -extern int getcidbyname(char *, id_t *); -extern int parmsin(pcparms_t *, pc_vaparms_t *); -extern int parmsout(pcparms_t *, pc_vaparms_t *); -extern int parmsset(pcparms_t *, kthread_id_t); -extern void parmsget(kthread_id_t, pcparms_t *); -extern int vaparmsout(char *, pcparms_t *, pc_vaparms_t *); +extern int alloc_cid(char *, id_t *); +extern int scheduler_load(char *, sclass_t *); +extern int getcid(char *, id_t *); +extern int getcidbyname(char *, id_t *); +extern int parmsin(pcparms_t *, pc_vaparms_t *); +extern int parmsout(pcparms_t *, pc_vaparms_t *); +extern int parmsset(pcparms_t *, kthread_id_t); +extern void parmsget(kthread_id_t, pcparms_t *); +extern int vaparmsout(char *, pcparms_t *, pc_vaparms_t *, uio_seg_t); #endif diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h index e9a34eacfe..a5eaf18edd 100644 --- a/usr/src/uts/common/sys/exec.h +++ b/usr/src/uts/common/sys/exec.h @@ -105,9 +105,19 @@ typedef struct uarg { uint_t brkpageszc; uintptr_t entry; uintptr_t thrptr; + char *emulator; + char *brandname; + auxv32_t *brand_auxp; /* starting user addr of brand auxvs on stack */ } uarg_t; /* + * Possible brand actions for exec. + */ +#define EBA_NONE 0 +#define EBA_NATIVE 1 +#define EBA_BRAND 2 + +/* * The following macro is a machine dependent encapsulation of * postfix processing to hide the stack direction from elf.c * thereby making the elf.c code machine independent. 
@@ -166,7 +176,7 @@ struct execsw { int (*exec_func)(struct vnode *vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, long *execsz, int setid, caddr_t exec_file, - struct cred *cred); + struct cred *cred, int brand_action); int (*exec_core)(struct vnode *vp, struct proc *p, struct cred *cred, rlim64_t rlimit, int sig, core_content_t content); @@ -198,10 +208,10 @@ extern int exec_args(execa_t *, uarg_t *, intpdata_t *, void **); extern int exec(const char *fname, const char **argp); extern int exece(const char *fname, const char **argp, const char **envp); extern int exec_common(const char *fname, const char **argp, - const char **envp); + const char **envp, int brand_action); extern int gexec(vnode_t **vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, long *execsz, caddr_t exec_file, - struct cred *cred); + struct cred *cred, int brand_action); extern struct execsw *allocate_execsw(char *name, char *magic, size_t magic_size); extern struct execsw *findexecsw(char *magic); diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h index ade26b4f82..7dea5b4941 100644 --- a/usr/src/uts/common/sys/klwp.h +++ b/usr/src/uts/common/sys/klwp.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -211,6 +210,8 @@ typedef struct _klwp { */ struct ct_template *lwp_ct_active[CTT_MAXTYPE]; /* active templates */ struct contract *lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */ + + void *lwp_brand; /* per-lwp brand data */ } klwp_t; /* lwp states */ diff --git a/usr/src/uts/common/sys/modctl.h b/usr/src/uts/common/sys/modctl.h index 1093eddef6..5e9450dde5 100644 --- a/usr/src/uts/common/sys/modctl.h +++ b/usr/src/uts/common/sys/modctl.h @@ -61,6 +61,7 @@ struct mod_ops { * The defined set of mod_ops structures for each loadable module type * Defined in modctl.c */ +extern struct mod_ops mod_brandops; #if defined(__i386) || defined(__amd64) extern struct mod_ops mod_cpuops; #endif @@ -175,6 +176,13 @@ struct modlpcbe { struct __pcbe_ops *pcbe_ops; }; +/* For Brand modules */ +struct modlbrand { + struct mod_ops *brand_modops; + char *brand_linkinfo; + struct brand *brand_branddef; +}; + /* for devname fs */ struct modldev { struct mod_ops *dev_modops; diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index fadcbf4a6d..13a3605e66 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -115,6 +115,7 @@ typedef struct lwpdir { struct pool; struct task; struct zone; +struct brand; struct corectl_path; struct corectl_content; @@ -336,6 +337,11 @@ typedef struct proc { uintptr_t p_portcnt; /* event ports counter */ struct zone *p_zone; /* zone in which process lives */ struct vnode *p_execdir; /* directory that p_exec came from */ + struct brand *p_brand; /* process's brand */ + void *p_brand_data; /* per-process brand state */ + + /* additional lock to protect p_sessp (but not its contents) */ + kmutex_t p_splock; } proc_t; #define PROC_T /* headers relying on proc_t are OK */ @@ -408,6 +414,10 @@ struct plock { extern proc_t p0; /* process 0 */ extern struct plock p0lock; /* p0's plock */ extern struct pid pid0; /* p0's pid */ + +/* pid_allocate() flags */ +#define PID_ALLOC_PROC 0x0001 /* assign a /proc 
slot as well */ + #endif /* _KERNEL */ /* stat codes */ @@ -588,7 +598,8 @@ extern int sigcheck(proc_t *, kthread_t *); extern void sigdefault(proc_t *); extern void pid_setmin(void); -extern pid_t pid_assign(proc_t *); +extern pid_t pid_allocate(proc_t *, int); +extern struct pid *pid_find(pid_t); extern int pid_rele(struct pid *); extern void pid_exit(proc_t *); extern void proc_entry_free(struct pid *); @@ -724,6 +735,7 @@ extern void lwp_rtt(void); extern void lwp_rtt_initial(void); extern int lwp_setprivate(klwp_t *, int, uintptr_t); extern void lwp_stat_update(lwp_stat_id_t, long); +extern void lwp_attach_brand_hdlrs(klwp_t *); /* * Signal queue function prototypes. Must be here due to header ordering diff --git a/usr/src/uts/common/sys/ptms.h b/usr/src/uts/common/sys/ptms.h index 9aa6493956..120503539b 100644 --- a/usr/src/uts/common/sys/ptms.h +++ b/usr/src/uts/common/sys/ptms.h @@ -35,6 +35,8 @@ extern "C" { #endif +#ifdef _KERNEL + /* * Structures and definitions supporting the pseudo terminal * drivers. This structure is private and should not be used by any @@ -63,8 +65,6 @@ struct pt_ttys { #define PTSOPEN 0x04 /* slave side is open */ #define PTSTTY 0x08 /* slave side is tty */ -#ifdef _KERNEL - /* * Multi-threading primitives. * Values of pt_refcnt: -1 if a writer is accessing the struct @@ -129,18 +129,29 @@ extern void ptms_logp(char *, uintptr_t); #define DDBGP(a, b) #endif +typedef struct __ptmptsopencb_arg *ptmptsopencb_arg_t; +typedef struct ptmptsopencb { + boolean_t (*ppocb_func)(ptmptsopencb_arg_t); + ptmptsopencb_arg_t ppocb_arg; +} ptmptsopencb_t; + #endif /* _KERNEL */ +typedef struct pt_own { + uid_t pto_ruid; + gid_t pto_rgid; +} pt_own_t; + /* * ioctl commands * - * ISPTM: Determines whether the file descriptor is that of an open master - * device. Return code of zero indicates that the file descriptor - * represents master device. + * ISPTM: Determines whether the file descriptor is that of an open master + * device. 
Return code of zero indicates that the file descriptor + * represents master device. * - * UNLKPT: Unlocks the master and slave devices. It returns 0 on success. On - * failure, the errno is set to EINVAL indicating that the master - * device is not open. + * UNLKPT: Unlocks the master and slave devices. It returns 0 on success. On + * failure, the errno is set to EINVAL indicating that the master + * device is not open. * * ZONEPT: Sets the zoneid of the pair of master and slave devices. It * returns 0 upon success. Used to force a pty 'into' a zone upon @@ -149,16 +160,24 @@ extern void ptms_logp(char *, uintptr_t); * PT_OWNER: Sets uid and gid for slave device. It returns 0 on success. * */ -#define ISPTM (('P'<<8)|1) /* query for master */ -#define UNLKPT (('P'<<8)|2) /* unlock master/slave pair */ -#define PTSSTTY (('P'<<8)|3) /* set tty flag */ -#define ZONEPT (('P'<<8)|4) /* set zone of master/slave pair */ -#define PT_OWNER (('P'<<8)|5) /* set owner and group for slave device */ +#define ISPTM (('P'<<8)|1) /* query for master */ +#define UNLKPT (('P'<<8)|2) /* unlock master/slave pair */ +#define PTSSTTY (('P'<<8)|3) /* set tty flag */ +#define ZONEPT (('P'<<8)|4) /* set zone of master/slave pair */ +#define PT_OWNER (('P'<<8)|5) /* set owner/group for slave device */ -typedef struct pt_own { - uid_t pto_ruid; - gid_t pto_rgid; -} pt_own_t; +#ifdef _KERNEL +/* + * kernel ioctl commands + * + * PTMPTSOPENCB: Returns a callback function pointer and opaque argument. + * The return value of the callback function when it's invoked + * with the opaque argument passed to it will indicate if the + * pts slave device is currently open. 
+ */ +#define PTMPTSOPENCB (('P'<<8)|6) /* check if the slave is open */ + +#endif /* _KERNEL */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/session.h b/usr/src/uts/common/sys/session.h index 639d6bf69d..8db8a8a5bb 100644 --- a/usr/src/uts/common/sys/session.h +++ b/usr/src/uts/common/sys/session.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,54 +36,96 @@ extern "C" { #endif +/* + * Session structure overview. + * + * Currently, the only structure in the kernel which has a pointer to a + * session structure is the proc_t via the p_sessp pointer. To + * access a session proc_t->p_sessp pointer a caller must hold either + * pidlock or p_splock. These locks only protect the p_sessp pointer + * itself and do not protect any of the contents of the session structure. + * To prevent the contents of the session structure from changing the + * caller must grab s_lock. + * + * No callers should ever update the contents of the session structure + * directly. Only the session management code should ever modify the + * contents of the session structure. When the session code attempts + * to modify the contents of a session structure it must hold multiple + * locks.
The locking order for all the locks that may need to be + * acquired is: + * sd_lock -> pidlock -> p_splock -> s_lock + * + * If a caller requires access to a session structure for long + * periods of time or across operations that may block it should + * use the tty_hold() and sess_hold() interfaces. + * + * sess_hold() places a hold on the session structure associated + * with the proc_t that was passed in. It increments the reference + * count associated with that session structure to ensure that it + * can't be freed until after the caller is done with it and calls + * sess_rele(). This hold doesn't actually protect any of the + * contents of the session structure. + * + * tty_hold() returns a pointer to a session structure associated + * with the curproc. It also "locks" the contents of the session + * structure. This hold should be used when the caller will be + * doing operations on a controlling tty associated with the session. + * This operation does an implicit sess_hold() so that the session + * structure can't be free'd until after the caller is done with it + * and invokes tty_rele(). + * + * NOTE: Neither of these functions (sess_hold() or tty_hold()) + * prevents a process from changing its session. Once these functions + * return, the session that was held may no longer be + * associated with the current process. If a caller wants to prevent + * a process from changing its session then it must hold pidlock or + * p_splock.
+ */ + typedef struct sess { - uint_t s_ref; /* reference count */ - dev_t s_dev; /* tty's device number */ - struct vnode *s_vp; /* tty's vnode */ - struct pid *s_sidp; /* session ID info */ - struct cred *s_cred; /* allocation credentials */ - kmutex_t s_lock; /* sync s_vp use with freectty */ - kcondvar_t s_wait_cv; /* Condvar for sleeping */ - int s_cnt; /* # of active users of this session */ - int s_flag; /* session state flag see below */ -} sess_t; + struct pid *s_sidp; /* session ID info, never changes */ -#define SESS_CLOSE 1 /* session about to close */ -#define s_sid s_sidp->pid_id + kmutex_t s_lock; /* protects everything below */ + uint_t s_ref; /* reference count */ + boolean_t s_sighuped; /* ctty had sighup sent to it */ -#if defined(_KERNEL) + boolean_t s_exit; /* session leader is exiting */ + kcondvar_t s_exit_cv; /* Condvar for s_exit */ -extern sess_t session0; + int s_cnt; /* active users of this ctty */ + kcondvar_t s_cnt_cv; /* Condvar for s_cnt */ -#define SESS_HOLD(sp) (++(sp)->s_ref) -#define SESS_RELE(sp) sess_rele(sp) + /* + * The following fields can only be updated while s_lock is held + * and s_cnt is 0. (i.e., no one has a tty_hold() on this session.)
+ */ + dev_t s_dev; /* tty's device number */ + struct vnode *s_vp; /* tty's vnode */ + struct cred *s_cred; /* allocation credentials */ +} sess_t; -/* - * Used to synchronize session vnode users with freectty() - */ +#define s_sid s_sidp->pid_id -#define TTY_HOLD(sp) { \ - mutex_enter(&(sp)->s_lock); \ - (++(sp)->s_cnt); \ - mutex_exit(&(sp)->s_lock); \ -} +#if defined(_KERNEL) -#define TTY_RELE(sp) { \ - mutex_enter(&(sp)->s_lock); \ - if ((--(sp)->s_cnt) == 0) \ - cv_signal(&(sp)->s_wait_cv); \ - mutex_exit(&(sp)->s_lock); \ -} +extern sess_t session0; /* forward referenced structure tags */ struct vnode; struct proc; +struct stdata; + +extern void sess_hold(proc_t *p); +extern void sess_rele(sess_t *, boolean_t); +extern sess_t *tty_hold(void); +extern void tty_rele(sess_t *sp); + -extern void sess_rele(sess_t *); extern void sess_create(void); -extern void freectty(sess_t *); -extern void alloctty(struct proc *, struct vnode *); +extern int strctty(struct stdata *); +extern int freectty(boolean_t); extern dev_t cttydev(struct proc *); +extern void ctty_clear_sighuped(void); #endif /* defined(_KERNEL) */ diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index d00220f2a9..39112e6c97 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -544,11 +544,21 @@ struct sonodeops { (((len) + _CMSG_HDR_ALIGNMENT - 1) & ~(_CMSG_HDR_ALIGNMENT - 1)) /* - * Used in parsing msg_control + * Macros that operate on struct cmsghdr. + * Used in parsing msg_control. + * The CMSG_VALID macro does not assume that the last option buffer is padded. 
*/ #define CMSG_NEXT(cmsg) \ (struct cmsghdr *)((uintptr_t)(cmsg) + \ ROUNDUP_cmsglen((cmsg)->cmsg_len)) +#define CMSG_CONTENT(cmsg) (&((cmsg)[1])) +#define CMSG_CONTENTLEN(cmsg) ((cmsg)->cmsg_len - sizeof (struct cmsghdr)) +#define CMSG_VALID(cmsg, start, end) \ + (ISALIGNED_cmsghdr(cmsg) && \ + ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ + ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ + ((ssize_t)(cmsg)->cmsg_len >= sizeof (struct cmsghdr)) && \ + ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end))) /* * Maximum size of any argument that is copied in (addresses, options, diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index 27403d72cc..4f424e96e1 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -24,7 +23,7 @@ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -1113,8 +1112,6 @@ extern mblk_t *strrput_misc(vnode_t *, mblk_t *, extern int getiocseqno(void); extern int strwaitbuf(size_t, int); extern int strwaitq(stdata_t *, int, ssize_t, int, clock_t, int *); -extern void stralloctty(struct stdata *); -extern void strfreectty(struct stdata *); extern struct stdata *shalloc(queue_t *); extern void shfree(struct stdata *s); extern queue_t *allocq(void); diff --git a/usr/src/uts/common/sys/syscall.h b/usr/src/uts/common/sys/syscall.h index 43dee30f0b..1a6412b70b 100644 --- a/usr/src/uts/common/sys/syscall.h +++ b/usr/src/uts/common/sys/syscall.h @@ -99,7 +99,7 @@ extern "C" { * getpgid(pid) :: syscall(39,4,pid) * setpgid(pid,pgid) :: syscall(39,5,pid,pgid) */ -#define SYS_reserved_40 40 /* 40 not used, was xenix */ +#define SYS_uucopystr 40 #define SYS_dup 41 #define SYS_pipe 42 #define SYS_times 43 @@ -355,7 +355,7 @@ extern "C" { #define SYS_pwrite 174 #define SYS_llseek 175 #define SYS_inst_sync 176 -#define SYS_reserved_177 177 /* 177 reserved */ +#define SYS_brand 177 #define SYS_kaio 178 /* * subcodes: @@ -464,6 +464,8 @@ extern "C" { * zone_list(...) :: zone(ZONE_LIST, ...) * zone_shutdown(...) :: zone(ZONE_SHUTDOWN, ...) * zone_lookup(...) :: zone(ZONE_LOOKUP, ...) + * zone_setattr(...) :: zone(ZONE_SETATTR, ...) + * zone_getattr(...) :: zone(ZONE_GETATTR, ...) 
*/ #define SYS_autofssys 228 #define SYS_getcwd 229 @@ -494,7 +496,7 @@ extern "C" { #define SYS_lwp_mutex_trylock 251 #define SYS_lwp_mutex_init 252 #define SYS_cladm 253 -#define SYS_reserved_254 254 /* 254 reserved */ +#define SYS_uucopy 254 #define SYS_umount2 255 diff --git a/usr/src/uts/common/sys/systm.h b/usr/src/uts/common/sys/systm.h index c96ea5b4ac..ac465ad49f 100644 --- a/usr/src/uts/common/sys/systm.h +++ b/usr/src/uts/common/sys/systm.h @@ -246,6 +246,7 @@ int copyoutstr_noerr(const char *, char *, size_t, size_t *); int copystr(const char *, char *, size_t, size_t *); void bcopy(const void *, void *, size_t); void ucopy(const void *, void *, size_t); +void ucopystr(const char *, char *, size_t, size_t *); void pgcopy(const void *, void *, size_t); void ovbcopy(const void *, void *, size_t); void bzero(void *, size_t); diff --git a/usr/src/uts/common/sys/termios.h b/usr/src/uts/common/sys/termios.h index 8bd020e5c1..2d99f70bc2 100644 --- a/usr/src/uts/common/sys/termios.h +++ b/usr/src/uts/common/sys/termios.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -24,7 +23,7 @@ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -382,6 +381,24 @@ extern pid_t tcgetsid(); #define TCSETSF (_TIOC|16) /* + * linux terminal ioctls we need to be aware of + */ +#define TIOCSETLD (_TIOC|123) /* set line discipline parms */ +#define TIOCGETLD (_TIOC|124) /* get line discipline parms */ + +/* + * On Solaris, VMIN and VTIME overlap with VEOF and VEOL. This is + * perfectly legal, except that Linux expects them to be separate, so we + * keep them separately. + */ +struct lx_cc { + unsigned char veof; /* veof value */ + unsigned char veol; /* veol value */ + unsigned char vmin; /* vmin value */ + unsigned char vtime; /* vtime value */ +}; + +/* * NTP PPS ioctls */ #define TIOCGPPS (_TIOC|125) @@ -457,6 +474,7 @@ struct ppsclockev32 { #define TIOCGLTC (tIOC|116) /* get local special chars */ #define TIOCOUTQ (tIOC|115) /* driver output queue size */ #define TIOCNOTTY (tIOC|113) /* void tty association */ +#define TIOCSCTTY (tIOC|132) /* get a ctty */ #define TIOCSTOP (tIOC|111) /* stop output, like ^S */ #define TIOCSTART (tIOC|110) /* start output, like ^Q */ #define TIOCSILOOP (tIOC|109) /* private to Sun; do not use */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 56c23d00ad..636b8acc0f 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -87,6 +87,10 @@ extern "C" { #define ZONE_ATTR_SLBL 8 #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 +#define ZONE_ATTR_BRAND 11 + +/* Start of the brand-specific attribute namespace */ +#define ZONE_ATTR_BRAND_ATTRS 32768 #define ZONE_EVENT_CHANNEL "com.sun:zones:status" #define ZONE_EVENT_STATUS_CLASS "status" @@ -103,6 +107,49 @@ extern "C" { #define ZONE_CB_TIMESTAMP "when" #define ZONE_CB_ZONEID "zoneid" +/* + * Exit values that may be returned by scripts or programs invoked by various + * zone commands. + * + * These are defined as: + * + * ZONE_SUBPROC_OK + * =============== + * The subprocess completed successfully.
+ * + * ZONE_SUBPROC_USAGE + * ================== + * The subprocess failed with a usage message, or a usage message should + * be output on its behalf. + * + * ZONE_SUBPROC_NOTCOMPLETE + * ======================== + * The subprocess did not complete, but the actions performed by the + * subprocess require no recovery actions by the user. + * + * For example, if the subprocess were called by "zoneadm install," the + * installation of the zone did not succeed but the user need not perform + * a "zoneadm uninstall" before attempting another install. + * + * ZONE_SUBPROC_FATAL + * ================== + * The subprocess failed in a fatal manner, usually one that will require + * some type of recovery action by the user. + * + * For example, if the subprocess were called by "zoneadm install," the + * installation of the zone did not succeed and the user will need to + * perform a "zoneadm uninstall" before another install attempt is + * possible. + * + * The non-success exit values are large to avoid accidental collision + * with values used internally by some commands (e.g. "Z_ERR" and + * "Z_USAGE" as used by zoneadm.) + */ +#define ZONE_SUBPROC_OK 0 +#define ZONE_SUBPROC_USAGE 253 +#define ZONE_SUBPROC_NOTCOMPLETE 254 +#define ZONE_SUBPROC_FATAL 255 + #ifdef _SYSCALL32 typedef struct { caddr32_t zone_name; @@ -159,8 +206,8 @@ typedef enum { * communicates with zoneadmd, but only uses Z_REBOOT and Z_HALT. */ typedef enum zone_cmd { - Z_READY, Z_BOOT, Z_REBOOT, Z_HALT, Z_NOTE_UNINSTALLING, - Z_MOUNT, Z_UNMOUNT + Z_READY, Z_BOOT, Z_FORCEBOOT, Z_REBOOT, Z_HALT, Z_NOTE_UNINSTALLING, + Z_MOUNT, Z_FORCEMOUNT, Z_UNMOUNT } zone_cmd_t; /* @@ -223,6 +270,7 @@ typedef struct zone_cmd_rval { #define ZF_IS_SCRATCH 0x4 /* scratch zone */ struct pool; +struct brand; /* * Structure to record list of ZFS datasets exported to a zone.
@@ -318,6 +366,8 @@ typedef struct zone { int zone_match; /* require label match for packets */ tsol_mlp_list_t zone_mlps; /* MLPs on zone-private addresses */ + boolean_t zone_restart_init; /* Restart init if it dies? */ + struct brand *zone_brand; /* zone's brand */ } zone_t; /* @@ -330,8 +380,6 @@ extern zone_t *global_zone; extern uint_t maxzones; extern rctl_hndl_t rc_zone_nlwps; -extern const char * const zone_initname; - extern long zone(int, void *, void *, void *, void *); extern void zone_zsd_init(void); extern void zone_init(void); diff --git a/usr/src/uts/common/syscall/brandsys.c b/usr/src/uts/common/syscall/brandsys.c new file mode 100644 index 0000000000..9b4bd38baa --- /dev/null +++ b/usr/src/uts/common/syscall/brandsys.c @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/brand.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/zone.h> + +/* + * brand(2) system call. 
+ */ +int64_t +brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, + uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) +{ + struct proc *p = curthread->t_procp; + int64_t rval = 0; + int err; + + /* + * The brandsys system call can only be executed from inside a + * branded zone. + */ + if (INGLOBALZONE(p) || !ZONE_IS_BRANDED(p->p_zone)) + return (set_errno(ENOSYS)); + + if ((err = ZBROP(p->p_zone)->b_brandsys(cmd, &rval, arg1, arg2, arg3, + arg4, arg5, arg6)) != 0) + return (set_errno(err)); + + return (rval); +} diff --git a/usr/src/uts/common/syscall/pgrpsys.c b/usr/src/uts/common/syscall/pgrpsys.c index e8be876537..8f60747663 100644 --- a/usr/src/uts/common/syscall/pgrpsys.c +++ b/usr/src/uts/common/syscall/pgrpsys.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,11 +18,16 @@ * * CDDL HEADER END */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ -#ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */ +#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.78 */ #include <sys/param.h> #include <sys/types.h> @@ -39,8 +43,9 @@ int setpgrp(int flag, int pid, int pgid) { - register proc_t *p = ttoproc(curthread); - register int retval = 0; + proc_t *p = curproc; + int retval = 0; + int sid; switch (flag) { @@ -51,7 +56,10 @@ setpgrp(int flag, int pid, int pgid) sess_create(); } else mutex_exit(&pidlock); - return (p->p_sessp->s_sid); + mutex_enter(&p->p_splock); + sid = p->p_sessp->s_sid; + mutex_exit(&p->p_splock); + return (sid); case 3: /* setsid() */ mutex_enter(&pidlock); @@ -61,7 +69,10 @@ setpgrp(int flag, int pid, int pgid) } mutex_exit(&pidlock); sess_create(); - return (p->p_sessp->s_sid); + mutex_enter(&p->p_splock); + sid = p->p_sessp->s_sid; + mutex_exit(&p->p_splock); + return (sid); case 5: /* setpgid() */ { diff --git a/usr/src/uts/common/syscall/uucopy.c b/usr/src/uts/common/syscall/uucopy.c new file mode 100644 index 0000000000..c301599e2f --- /dev/null +++ b/usr/src/uts/common/syscall/uucopy.c @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/systm.h> + +int +uucopy(const void *from, void *to, size_t size) +{ + label_t ljb; + + if (on_fault(&ljb)) + return (set_errno(EFAULT)); + + ucopy(from, to, size); + + no_fault(); + + return (0); +} + +ssize_t +uucopystr(const char *from, char *to, size_t size) +{ + label_t ljb; + size_t len; + + if (on_fault(&ljb)) + return (set_errno(EFAULT)); + + ucopystr(from, to, size, &len); + + no_fault(); + + return ((ssize_t)len); +} |