diff options
Diffstat (limited to 'usr/src/uts')
576 files changed, 113850 insertions, 2710 deletions
diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ index 0807ad70dc..c7d771d6e5 100644 --- a/usr/src/uts/Makefile.targ +++ b/usr/src/uts/Makefile.targ @@ -20,6 +20,7 @@ # # # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. +# Copyright 2019 Joyent, Inc. # Copyright 2014 Garrett D'Amore <garrett@damore.org> # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright (c) 2017 by Delphix. All rights reserved. diff --git a/usr/src/uts/Makefile.uts b/usr/src/uts/Makefile.uts index db66dd229d..3c4a307c4c 100644 --- a/usr/src/uts/Makefile.uts +++ b/usr/src/uts/Makefile.uts @@ -184,6 +184,15 @@ CERRWARN += -_smatch=-p=illumos_kernel include $(SRC)/Makefile.smatch # +# Add specific compiler options that are required based on the +# architecture in question. +# +CFLAGS_uts_i386 += -_gcc7=-mindirect-branch=thunk-extern +CFLAGS_uts_i386 += -_gcc7=-mindirect-branch-register +CFLAGS_uts_i386 += -_gcc8=-mindirect-branch=thunk-extern +CFLAGS_uts_i386 += -_gcc8=-mindirect-branch-register + +# # Ensure that the standard function prologue remains at the very start # of a function, so DTrace fbt will instrument the right place. # diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 51392a6b4f..0c60127800 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -25,7 +25,7 @@ # Copyright (c) 2013 by Saso Kiselkov. All rights reserved. # Copyright 2018 Nexenta Systems, Inc. # Copyright 2022 Garrett D'Amore -# Copyright 2020 Joyent, Inc. +# Copyright 2021 Joyent, Inc. # Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved. # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright 2022 RackTop Systems, Inc. 
@@ -124,6 +124,7 @@ GENUNIX_OBJS += \ bz2huffman.o \ callb.o \ callout.o \ + chacha.o \ chdir.o \ chmod.o \ chown.o \ @@ -445,6 +446,8 @@ PROFILE_OBJS += profile.o SYSTRACE_OBJS += systrace.o +LX_SYSTRACE_OBJS += lx_systrace.o + LOCKSTAT_OBJS += lockstat.o FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o @@ -515,6 +518,10 @@ PTSL_OBJS += tty_pts.o PTM_OBJS += ptm.o +LX_PTM_OBJS += lx_ptm.o + +LX_NETLINK_OBJS += lx_netlink.o + MII_OBJS += mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \ mii_marvell.o mii_realtek.o mii_other.o @@ -583,6 +590,7 @@ IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ ip_helper_stream.o ip_tunables.o \ ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \ conn_opt.o ip_attr.o ip_dce.o \ + bpf_filter.o \ $(IP_ICMP_OBJS) \ $(IP_RTS_OBJS) \ $(IP_TCP_OBJS) \ @@ -607,6 +615,8 @@ IPSECESP_OBJS += ipsecespddi.o ipsecesp.o IPSECAH_OBJS += ipsecahddi.o ipsecah.o sadb.o +DATAFILT_OBJS += datafilt.o + SPPP_OBJS += sppp.o sppp_dlpi.o sppp_mod.o s_common.o SPPPTUN_OBJS += sppptun.o sppptun_mod.o @@ -658,7 +668,7 @@ TL_OBJS += tl.o DUMP_OBJS += dump.o -BPF_OBJS += bpf.o bpf_filter.o bpf_mod.o bpf_dlt.o bpf_mac.o +BPF_OBJS += bpf.o bpf_wrap.o bpf_mod.o bpf_dlt.o bpf_mac.o CLONE_OBJS += clone.o @@ -707,6 +717,10 @@ OVERLAY_OBJS += overlay.o overlay_fm.o overlay_mux.o overlay_plugin.o \ OVERLAY_VXLAN_OBJS += overlay_vxlan.o +VND_OBJS += vnd.o frameio.o + +GSQUEUE_OBJS += gsqueue.o + SIMNET_OBJS += simnet.o IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o @@ -957,6 +971,8 @@ SIGNALFD_OBJS += signalfd.o I8042_OBJS += i8042.o +INOTIFY_OBJS += inotify.o + KB8042_OBJS += \ at_keyprocess.o \ kb8042.o \ @@ -1031,6 +1047,8 @@ QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_fm.o qlge_gld.o qlge_mpi.o ZCONS_OBJS += zcons.o +ZFD_OBJS += zfd.o + NV_SATA_OBJS += nv_sata.o SI3124_OBJS += si3124.o @@ -1095,8 +1113,13 @@ PIPE_OBJS += pipe.o HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \ hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o 
+HYPRLOFS_OBJS += hyprlofs_dir.o hyprlofs_subr.o \ + hyprlofs_vnops.o hyprlofs_vfsops.o + LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o +LXPROC_OBJS += lxpr_subr.o lxpr_vfsops.o lxpr_vnops.o + NAMEFS_OBJS += namevfs.o namevno.o NFS_OBJS += nfs_client.o nfs_common.o nfs_dump.o \ @@ -1262,8 +1285,8 @@ SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \ PCFS_OBJS += pc_alloc.o pc_dir.o pc_node.o pc_subr.o \ pc_vfsops.o pc_vnops.o -PROC_OBJS += prcontrol.o prioctl.o prsubr.o prusrio.o \ - prvfsops.o prvnops.o +PROC_OBJS += prargv.o prcontrol.o prioctl.o prsubr.o \ + prusrio.o prvfsops.o prvnops.o MNTFS_OBJS += mntvfsops.o mntvnops.o @@ -1444,6 +1467,7 @@ ZFS_COMMON_OBJS += \ zfs_fuid.o \ zfs_sa.o \ zfs_znode.o \ + zfs_zone.o \ zil.o \ zio.o \ zio_checksum.o \ @@ -1805,8 +1829,8 @@ SCSA2USB_OBJS += scsa2usb.o usb_ms_bulkonly.o usb_ms_cbi.o CCID_OBJS += ccid.o atr.o -IPF_OBJS += ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o ip_nat.o \ - ip_proxy.o ip_auth.o ip_pool.o ip_htable.o ip_lookup.o \ +IPF_OBJS += cfw.o ip_fil_solaris.o fil.o solaris.o ip_state.o ip_frag.o \ + ip_nat.o ip_proxy.o ip_auth.o ip_pool.o ip_htable.o ip_lookup.o \ ip_log.o misc.o ip_compat.o ip_nat6.o drand48.o IPD_OBJS += ipd.o @@ -2209,6 +2233,11 @@ URF_OBJS = urf_usbgem.o UPF_OBJS = upf_usbgem.o # +# NFP objects +# +NFP_OBJS = hostif.o osif.o drvlist.o i21555.o i21285.o i21555d.o + +# # BNXE objects # BNXE_OBJS += bnxe_cfg.o \ diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 83d58e522b..705d794670 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -23,7 +23,7 @@ # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2022 Garrett D'Amore <garrett@damore.org> # Copyright 2013 Saso Kiselkov. All rights reserved. -# Copyright 2019 Joyent, Inc. +# Copyright 2021 Joyent, Inc. # Copyright 2018 Nexenta Systems, Inc. # Copyright (c) 2017 by Delphix. All rights reserved. 
# Copyright 2022 Oxide Computer Company @@ -280,10 +280,18 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hsfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hyprlofs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lofs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lxproc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/mntfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -986,6 +994,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/net80211/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nfp/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nge/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1161,6 +1173,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sdcard/targets/sdcard/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/gsqueue/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sfe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1173,6 +1189,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/softmac/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vnd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/uath/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1525,6 +1545,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioblk/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/idspace/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioif/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1589,6 +1613,10 @@ $(OBJS_DIR)/%.o: $(COMMONBASE)/nvpair/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/refhash/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/bootbanner.o := CPPFLAGS += \ -DBOOTBANNER1='"$(BOOTBANNER1)"' \ -DBOOTBANNER2='"$(BOOTBANNER2)"' \ @@ -1632,6 +1660,10 @@ 
$(OBJS_DIR)/%.o: $(UTSBASE)/common/rpc/sec_gss/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/crypto/chacha/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(COMMONBASE)/crypto/edonr/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) diff --git a/usr/src/uts/common/brand/lx/autofs/lx_autofs.c b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c new file mode 100644 index 0000000000..364215d026 --- /dev/null +++ b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c @@ -0,0 +1,3177 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
+ */ + +/* + * See the big theory statement in ../sys/lx_autofs.h + */ + +#include <fs/fs_subr.h> +#include <sys/stat.h> +#include <sys/atomic.h> +#include <sys/cmn_err.h> +#include <sys/dirent.h> +#include <sys/fs/fifonode.h> +#include <sys/modctl.h> +#include <sys/mount.h> +#include <sys/policy.h> +#include <sys/sunddi.h> +#include <sys/conf.h> +#include <sys/sdt.h> + +#include <sys/sysmacros.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> + +#include <sys/dnlc.h> +#include <nfs/rnode.h> +#include <nfs/rnode4.h> +#include <sys/lx_autofs_impl.h> +#include <sys/lx_types.h> + +/* + * External functions + */ +extern uintptr_t space_fetch(char *key); +extern int space_store(char *key, uintptr_t ptr); +extern int umount2_engine(vfs_t *, int, cred_t *, int); + +/* + * Globals + */ +static vfsops_t *lx_autofs_vfsops; +static vnodeops_t *lx_autofs_vn_ops = NULL; +static int lx_autofs_fstype; +static major_t lx_autofs_major; +static minor_t lx_autofs_minor = 0; +static dev_info_t *lx_autofs_dip = NULL; + +#define LX_AUTOFS_DEV_VERSION_MAJOR 1 +#define LX_AUTOFS_DEV_VERSION_MINOR 0 + +/* The Linux autofs superblock magic number */ +#define LX_AUTOFS_SB_MAGIC 0x0187 + +/* Linux autofs mount types */ +#define LX_AUTOFS_TYPE_INDIRECT 1 +#define LX_AUTOFS_TYPE_DIRECT 2 +#define LX_AUTOFS_TYPE_OFFSET 4 + +/* Structure passed for autofs dev ioctls */ +typedef struct lx_autofs_dv_ioctl { + uint32_t lad_ver_major; + uint32_t lad_ver_minor; + uint32_t lad_size; + uint32_t lad_ioctlfd; + uint32_t lad_arg1; + uint32_t lad_arg2; + char lad_path[0]; +} lx_autofs_dv_ioctl_t; + +/* + * Support functions + */ +static void +lx_autofs_strfree(char *str) +{ + kmem_free(str, strlen(str) + 1); +} + +static char * +lx_autofs_strdup(char *str) +{ + int n = strlen(str); + char *ptr = kmem_alloc(n + 1, KM_SLEEP); + bcopy(str, ptr, n + 1); + return (ptr); +} + +static int +lx_autofs_str_to_int(char *str, int *val) +{ + long res; + + if (str == NULL) + return (-1); + + if ((ddi_strtol(str, NULL, 
10, &res) != 0) || + (res < INT_MIN) || (res > INT_MAX)) + return (-1); + + *val = res; + return (0); +} + +static void +ls_autofs_stack_init(list_t *lp) +{ + list_create(lp, + sizeof (stack_elem_t), offsetof(stack_elem_t, se_list)); +} + +static void +lx_autofs_stack_fini(list_t *lp) +{ + ASSERT(list_head(lp) == NULL); + list_destroy(lp); +} + +static void +lx_autofs_stack_push(list_t *lp, caddr_t ptr1, caddr_t ptr2, caddr_t ptr3) +{ + stack_elem_t *se; + + se = kmem_alloc(sizeof (*se), KM_SLEEP); + se->se_ptr1 = ptr1; + se->se_ptr2 = ptr2; + se->se_ptr3 = ptr3; + list_insert_head(lp, se); +} + +static int +lx_autofs_stack_pop(list_t *lp, caddr_t *ptr1, caddr_t *ptr2, caddr_t *ptr3) +{ + stack_elem_t *se; + + if ((se = list_head(lp)) == NULL) + return (-1); + list_remove(lp, se); + if (ptr1 != NULL) + *ptr1 = se->se_ptr1; + if (ptr2 != NULL) + *ptr2 = se->se_ptr2; + if (ptr3 != NULL) + *ptr3 = se->se_ptr3; + kmem_free(se, sizeof (*se)); + return (0); +} + +static vnode_t * +lx_autofs_fifo_peer_vp(vnode_t *vp) +{ + fifonode_t *fnp = VTOF(vp); + fifonode_t *fn_dest = fnp->fn_dest; + return (FTOV(fn_dest)); +} + +static vnode_t * +lx_autofs_vn_alloc(vfs_t *vfsp, vnode_t *uvp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *vp, *vp_old; + + /* Allocate a new vnode structure in case we need it. */ + vp = vn_alloc(KM_SLEEP); + vn_setops(vp, lx_autofs_vn_ops); + VN_SET_VFS_TYPE_DEV(vp, vfsp, uvp->v_type, uvp->v_rdev); + vp->v_data = uvp; + ASSERT(vp->v_count == 1); + + /* + * Take a hold on the vfs structure. This is how unmount will + * determine if there are any active vnodes in the file system. + */ + VFS_HOLD(vfsp); + + /* + * Check if we already have a vnode allocated for this underlying + * vnode_t. + */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t *)&vp_old) != 0) { + + /* + * Didn't find an existing node. + * Add this node to the hash and return. 
+ */ + VERIFY(mod_hash_insert(data->lav_vn_hash, + (mod_hash_key_t)uvp, + (mod_hash_val_t)vp) == 0); + mutex_exit(&data->lav_lock); + return (vp); + } + + /* Get a hold on the existing vnode and free up the one we allocated. */ + VN_HOLD(vp_old); + mutex_exit(&data->lav_lock); + + /* Free up the new vnode we allocated. */ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); + + return (vp_old); +} + +static void +lx_autofs_vn_free(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *uvp = vp->v_data; + vnode_t *vp_tmp; + + ASSERT(MUTEX_HELD((&data->lav_lock))); + ASSERT(MUTEX_HELD((&vp->v_lock))); + + ASSERT(vp->v_count == 0); + + /* We're about to free this vnode so take it out of the hash. */ + (void) mod_hash_remove(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t)&vp_tmp); + + /* + * No one else can lookup this vnode any more so there's no need + * to hold locks. + */ + mutex_exit(&data->lav_lock); + mutex_exit(&vp->v_lock); + + /* Release the underlying vnode. */ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); +} + +static lx_autofs_automnt_req_t * +lx_autofs_la_alloc(lx_autofs_vfs_t *data, boolean_t *is_dup, boolean_t expire, + char *nm) +{ + lx_autofs_automnt_req_t *laar, *laar_dup; + + /* Pre-allocate a new automounter request before grabbing locks. 
*/ + laar = kmem_zalloc(sizeof (*laar), KM_SLEEP); + mutex_init(&laar->laar_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&laar->laar_cv, NULL, CV_DEFAULT, NULL); + laar->laar_ref = 1; + + if (data->lav_min_proto == 5) { + laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS5; + + if (data->lav_mnttype == LXAMT_INDIR) { + if (expire) { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_EXPIRE_INDIR; + } else { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_MISSING_INDIR; + } + } else { + if (expire) { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_EXPIRE_DIRECT; + } else { + laar->laar_pkt.lap_type = + LX_AUTOFS_PTYPE_MISSING_DIRECT; + } + } + laar->laar_pkt_size = sizeof (lx_autofs_v5_pkt_t); + + laar->laar_pkt.lap_v5.lap_dev = data->lav_dev; + laar->laar_pkt.lap_v5.lap_ino = data->lav_ino; + /* + * Note that we're currently not filling in the other v5 pkt + * fields (pid, uid, etc.) since they don't appear to be used + * by the automounter. We can fill those in later if it proves + * necessary. + */ + + /* + * For indirect mounts the token expected by the automounter is + * the name of the directory entry to look up (not the entire + * path that is being accessed.) For direct mounts the Linux + * kernel passes a dummy name, so this is just as good. 
+ */ + laar->laar_pkt.lap_v5.lap_name_len = strlen(nm); + if (laar->laar_pkt.lap_v5.lap_name_len > + (sizeof (laar->laar_pkt.lap_v5.lap_name) - 1)) { + zcmn_err(getzoneid(), CE_NOTE, + "invalid autofs automnt req: \"%s\"", nm); + kmem_free(laar, sizeof (*laar)); + return (NULL); + } + (void) strlcpy(laar->laar_pkt.lap_v5.lap_name, nm, + sizeof (laar->laar_pkt.lap_v5.lap_name)); + + } else if (expire) { + zcmn_err(getzoneid(), CE_WARN, + "unsupported expire protocol request: \"%s\"", nm); + kmem_free(laar, sizeof (*laar)); + return (NULL); + + } else { + ASSERT(expire == B_FALSE); + + /* Older protocol pkt (really v2) */ + laar->laar_pkt.lap_protover = LX_AUTOFS_PROTO_VERS2; + laar->laar_pkt.lap_type = LX_AUTOFS_PTYPE_MISSING; + laar->laar_pkt_size = sizeof (lx_autofs_v2_pkt_t); + + /* + * The token expected by the linux automount is the name of + * the directory entry to look up. (And not the entire + * path that is being accessed.) + */ + laar->laar_pkt.lap_v2.lap_name_len = strlen(nm); + if (laar->laar_pkt.lap_v2.lap_name_len > + (sizeof (laar->laar_pkt.lap_v2.lap_name) - 1)) { + zcmn_err(getzoneid(), CE_NOTE, + "invalid autofs lookup: \"%s\"", nm); + kmem_free(laar, sizeof (*laar)); + return (NULL); + } + (void) strlcpy(laar->laar_pkt.lap_v2.lap_name, nm, + sizeof (laar->laar_pkt.lap_v2.lap_name)); + } + + /* Assign a unique id for this request. */ + laar->laar_pkt.lap_id = id_alloc(data->lav_ids); + + /* Check for an outstanding request for this path. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_path_hash, + (mod_hash_key_t)nm, (mod_hash_val_t *)&laar_dup) == 0) { + /* + * There's already an outstanding request for this + * path so we don't need a new one. + */ + id_free(data->lav_ids, laar->laar_pkt.lap_id); + kmem_free(laar, sizeof (*laar)); + laar = laar_dup; + + /* Bump the ref count on the old request. */ + atomic_add_int(&laar->laar_ref, 1); + + *is_dup = 1; + } else { + /* Add it to the hashes. 
*/ + VERIFY(mod_hash_insert(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id, + (mod_hash_val_t)laar) == 0); + VERIFY(mod_hash_insert(data->lav_path_hash, + (mod_hash_key_t)lx_autofs_strdup(nm), + (mod_hash_val_t)laar) == 0); + + *is_dup = 0; + } + mutex_exit(&data->lav_lock); + + return (laar); +} + +/* + * Look up the outstanding automounter request registered under id in + * lav_id_hash. On a hit the request's reference count is bumped while + * lav_lock is held and the request is returned; NULL is returned on a miss. + */ +static lx_autofs_automnt_req_t * +lx_autofs_la_find(lx_autofs_vfs_t *data, int id) +{ + lx_autofs_automnt_req_t *laar; + + /* Check for an outstanding request for this id. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_id_hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&laar) != 0) { + mutex_exit(&data->lav_lock); + return (NULL); + } + atomic_add_int(&laar->laar_ref, 1); + mutex_exit(&data->lav_lock); + return (laar); +} + +/* + * Retire a request: remove it from the id hash and the path hash so that + * no new lookup can find it, then flag it complete and wake every waiter. + * The request's reference count is not touched here. + */ +static void +lx_autofs_la_complete(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar) +{ + lx_autofs_automnt_req_t *laar_tmp; + + /* Remove this request from the hashes so no one can look it up. */ + mutex_enter(&data->lav_lock); + (void) mod_hash_remove(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id, + (mod_hash_val_t)&laar_tmp); + if (data->lav_min_proto == 5) { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name, + (mod_hash_val_t)&laar_tmp); + } else { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name, + (mod_hash_val_t)&laar_tmp); + } + mutex_exit(&data->lav_lock); + + /* Mark this request as complete and wake up anyone waiting on it. 
*/ + mutex_enter(&laar->laar_lock); + laar->laar_complete = 1; + cv_broadcast(&laar->laar_cv); + mutex_exit(&laar->laar_lock); +} + +static void +lx_autofs_la_release(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar) +{ + ASSERT(!MUTEX_HELD(&laar->laar_lock)); + if (atomic_add_int_nv(&laar->laar_ref, -1) > 0) + return; + ASSERT(laar->laar_ref == 0); + id_free(data->lav_ids, laar->laar_pkt.lap_id); + kmem_free(laar, sizeof (*laar)); +} + +static void +lx_autofs_la_abort(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laar) +{ + lx_autofs_automnt_req_t *laar_tmp; + + /* + * This is a little tricky. We're aborting the wait for this + * request. So if anyone else is waiting for this request we + * can't free it, but if no one else is waiting for the request + * we should free it. + */ + mutex_enter(&data->lav_lock); + if (atomic_add_int_nv(&laar->laar_ref, -1) > 0) { + mutex_exit(&data->lav_lock); + return; + } + ASSERT(laar->laar_ref == 0); + + /* Remove this request from the hashes so no one can look it up. */ + (void) mod_hash_remove(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)laar->laar_pkt.lap_id, + (mod_hash_val_t)&laar_tmp); + if (data->lav_min_proto == 5) { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v5.lap_name, + (mod_hash_val_t)&laar_tmp); + } else { + (void) mod_hash_remove(data->lav_path_hash, + (mod_hash_key_t)laar->laar_pkt.lap_v2.lap_name, + (mod_hash_val_t)&laar_tmp); + } + mutex_exit(&data->lav_lock); + + /* It's ok to free this now because the ref count was zero. 
*/ + id_free(data->lav_ids, laar->laar_pkt.lap_id); + kmem_free(laar, sizeof (*laar)); +} + +/* + * Find the automounter's fifo. In the process whose pid is pgrp, verify + * that descriptor fd refers to a fifo and locate the descriptor that holds + * the peer (read) end of that fifo. On success the f_count of both file_t's + * is bumped and they are returned through fpp_wr and fpp_rd. + * + * Returns 0 on success and -1 on any failure. + */ +static int +lx_autofs_fifo_lookup(pid_t pgrp, int fd, file_t **fpp_wr, file_t **fpp_rd) +{ + proc_t *prp; + uf_info_t *fip; + uf_entry_t *ufp_wr, *ufp_rd = NULL; + file_t *fp_wr, *fp_rd = NULL; + vnode_t *vp_wr, *vp_rd; + int i; + + /* + * sprlock() is zone aware, so assuming this mount call was + * initiated by a process in a zone, if it tries to specify + * a pgrp outside of its zone this call will fail. + * + * Also, we want to grab hold of the main automounter process + * and it's going to be the group leader for pgrp, so its + * pid will be equal to pgrp. + */ + prp = sprlock(pgrp); + if (prp == NULL) + return (-1); + mutex_exit(&prp->p_lock); + + /* Now we want to access the process's open file descriptors. */ + fip = P_FINFO(prp); + mutex_enter(&fip->fi_lock); + + /* + * Sanity check fifo write fd. It is a signed value, so reject + * negative descriptors as well as ones past the end of the table; + * either would index outside the descriptor list below. + */ + if (fd < 0 || fd >= fip->fi_nfiles) { + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* Get a pointer to the write fifo. */ + UF_ENTER(ufp_wr, fip, fd); + if (((fp_wr = ufp_wr->uf_file) == NULL) || + ((vp_wr = fp_wr->f_vnode) == NULL) || (vp_wr->v_type != VFIFO)) { + /* Invalid fifo fd. */ + UF_EXIT(ufp_wr); + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * Now we need to find the read end of the fifo (for reasons + * explained below.) We assume that the read end of the fifo + * is in the same process as the write end. + */ + vp_rd = lx_autofs_fifo_peer_vp(fp_wr->f_vnode); + for (i = 0; i < fip->fi_nfiles; i++) { + if (i == fd) + continue; + UF_ENTER(ufp_rd, fip, i); + if (((fp_rd = ufp_rd->uf_file) != NULL) && + (fp_rd->f_vnode == vp_rd)) + break; + UF_EXIT(ufp_rd); + } + if (i == fip->fi_nfiles) { + /* Didn't find it. 
*/ + UF_EXIT(ufp_wr); + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * We need to drop fi_lock before we can try to acquire f_tlock. + * The good news is that the file pointers are protected because + * we're still holding uf_lock. + */ + mutex_exit(&fip->fi_lock); + + /* + * Here we bump the open counts on the fifos. The reason + * that we do this is because when we go to write to the + * fifo we want to ensure that they are actually open (and + * not in the process of being closed) without having to + * stop the automounter. (If the write end of the fifo + * were closed and we tried to write to it we would panic. + * If the read end of the fifo was closed and we tried to + * write to the other end, the process that invoked the + * lookup operation would get an unexpected SIGPIPE.) + */ + mutex_enter(&fp_wr->f_tlock); + fp_wr->f_count++; + ASSERT(fp_wr->f_count >= 2); + mutex_exit(&fp_wr->f_tlock); + + mutex_enter(&fp_rd->f_tlock); + fp_rd->f_count++; + ASSERT(fp_rd->f_count >= 2); + mutex_exit(&fp_rd->f_tlock); + + /* Release all our locks. */ + UF_EXIT(ufp_wr); + UF_EXIT(ufp_rd); + mutex_enter(&prp->p_lock); + sprunlock(prp); + + /* Return the file pointers. */ + *fpp_rd = fp_rd; + *fpp_wr = fp_wr; + return (0); +} + +static uint_t +/*ARGSUSED*/ +lx_autofs_fifo_close_cb(mod_hash_key_t key, mod_hash_val_t *val, void *arg) +{ + int *id = (int *)arg; + /* Return the key and terminate the walk. */ + *id = (uintptr_t)key; + return (MH_WALK_TERMINATE); +} + +static void +lx_autofs_fifo_close(lx_autofs_vfs_t *data) +{ + /* + * Close the fifo to prevent any future requests from + * getting sent to the automounter. 
+ */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr != NULL) { + (void) closef(data->lav_fifo_wr); + data->lav_fifo_wr = NULL; + } + if (data->lav_fifo_rd != NULL) { + (void) closef(data->lav_fifo_rd); + data->lav_fifo_rd = NULL; + } + mutex_exit(&data->lav_lock); + + /* + * Wake up any threads currently waiting for the automounter. + * Note that it's possible for multiple threads to have entered + * this function and to be doing the work below simultaneously. + */ + for (;;) { + lx_autofs_automnt_req_t *laar; + int id; + + /* Lookup the first entry in the hash. */ + id = -1; + mod_hash_walk(data->lav_id_hash, + lx_autofs_fifo_close_cb, &id); + if (id == -1) { + /* No more id's in the hash. */ + break; + } + if ((laar = lx_autofs_la_find(data, id)) == NULL) { + /* Someone else beat us to it. */ + continue; + } + + /* Mark the request as complete and release it. */ + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + } +} + +/* + * Check whether the process whose pid is lav_pgrp still has a descriptor + * open on the read end of our fifo. The caller must hold lav_lock. + * + * Returns 0 if such a descriptor is found. Returns -1 if the fifo has + * already been shut down, the process can't be locked, or no descriptor + * in that process refers to the read end. + */ +static int +lx_autofs_fifo_verify_rd(lx_autofs_vfs_t *data) +{ + proc_t *prp; + uf_info_t *fip; + uf_entry_t *ufp_rd = NULL; + file_t *fp_rd = NULL; + vnode_t *vp_rd; + int i; + + ASSERT(MUTEX_HELD((&data->lav_lock))); + + /* Check if we've already been shut down. */ + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + return (-1); + } + vp_rd = lx_autofs_fifo_peer_vp(data->lav_fifo_wr->f_vnode); + + /* + * sprlock() is zone aware, so assuming this mount call was + * initiated by a process in a zone, if it tries to specify + * a pgrp outside of its zone this call will fail. + * + * Also, we want to grab hold of the main automounter process + * and it's going to be the group leader for pgrp, so its + * pid will be equal to pgrp. + */ + prp = sprlock(data->lav_pgrp); + if (prp == NULL) + return (-1); + mutex_exit(&prp->p_lock); + + /* Now we want to access the process's open file descriptors. 
*/ + fip = P_FINFO(prp); + mutex_enter(&fip->fi_lock); + + /* + * Now we need to find the read end of the fifo. We assume that + * the read end of the fifo is in the same process as the write + * end. + */ + for (i = 0; i < fip->fi_nfiles; i++) { + UF_ENTER(ufp_rd, fip, i); + if (((fp_rd = ufp_rd->uf_file) != NULL) && + (fp_rd->f_vnode == vp_rd)) + break; + UF_EXIT(ufp_rd); + } + if (i == fip->fi_nfiles) { + /* Didn't find it. */ + mutex_exit(&fip->fi_lock); + mutex_enter(&prp->p_lock); + sprunlock(prp); + return (-1); + } + + /* + * Seems the automounter still has the read end of the fifo + * open, we're done here. Release all our locks and exit. + */ + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp_rd); + mutex_enter(&prp->p_lock); + sprunlock(prp); + + return (0); +} + +/* + * Write the request packet in laarp to the automounter through the write + * end of the fifo, then verify that the automounter still has the read end + * open; if it doesn't, the fifo is closed down. + * + * Returns 0 on success, ENOENT if the fifo is (or has just been) shut down, + * or the error from the write itself. + */ +static int +lx_autofs_fifo_write(lx_autofs_vfs_t *data, lx_autofs_automnt_req_t *laarp) +{ + struct uio uio; + struct iovec iov; + file_t *fp_wr, *fp_rd; + int error; + + /* + * The catch here is we need to make sure _we_ don't close + * the fifo while writing to it. (Another thread could come + * along and realize the automounter process is gone and close + * the fifo.) To do this we bump the open count before we + * write to the fifo. + */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + return (ENOENT); + } + fp_wr = data->lav_fifo_wr; + fp_rd = data->lav_fifo_rd; + + /* Bump the open count on the write fifo. */ + mutex_enter(&fp_wr->f_tlock); + fp_wr->f_count++; + mutex_exit(&fp_wr->f_tlock); + + /* Bump the open count on the read fifo. 
*/ + mutex_enter(&fp_rd->f_tlock); + fp_rd->f_count++; + mutex_exit(&fp_rd->f_tlock); + + mutex_exit(&data->lav_lock); + + iov.iov_base = (caddr_t)&laarp->laar_pkt; + iov.iov_len = laarp->laar_pkt_size; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_loffset = 0; + uio.uio_segflg = (short)UIO_SYSSPACE; + uio.uio_resid = laarp->laar_pkt_size; + uio.uio_llimit = 0; + uio.uio_fmode = FWRITE | FNDELAY | FNONBLOCK; + + error = VOP_WRITE(fp_wr->f_vnode, &uio, 0, kcred, NULL); + (void) closef(fp_wr); + (void) closef(fp_rd); + + /* + * After every write we verify that the automounter still has + * these files open. + */ + mutex_enter(&data->lav_lock); + if (lx_autofs_fifo_verify_rd(data) != 0) { + /* + * Something happened to the automounter. + * Close down the communication pipe we setup. + */ + mutex_exit(&data->lav_lock); + lx_autofs_fifo_close(data); + if (error != 0) + return (error); + return (ENOENT); + } + mutex_exit(&data->lav_lock); + + return (error); +} + +static int +lx_autofs_bs_readdir(vnode_t *dvp, list_t *dir_stack, list_t *file_stack) +{ + struct iovec iov; + struct uio uio; + dirent64_t *dp, *dbuf; + vnode_t *vp; + size_t dlen, dbuflen; + int eof, error, ndirents = 64; + char *nm; + + dlen = ndirents * (sizeof (*dbuf)); + dbuf = kmem_alloc(dlen, KM_SLEEP); + + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = UIO_COPY_CACHED; + uio.uio_loffset = 0; + uio.uio_llimit = MAXOFFSET_T; + + eof = 0; + error = 0; + while (!error && !eof) { + uio.uio_resid = dlen; + iov.iov_base = (char *)dbuf; + iov.iov_len = dlen; + + (void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL); + if (VOP_READDIR(dvp, &uio, kcred, &eof, NULL, 0) != 0) { + VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); + kmem_free(dbuf, dlen); + return (-1); + } + VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL); + + if ((dbuflen = dlen - uio.uio_resid) == 0) { + /* We're done. 
*/ + break; + } + + for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen); + dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) { + + nm = dp->d_name; + + if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0) + continue; + + if (VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, kcred, + NULL, NULL, NULL) != 0) { + kmem_free(dbuf, dlen); + return (-1); + } + if (vp->v_type == VDIR) { + if (dir_stack != NULL) { + lx_autofs_stack_push(dir_stack, + (caddr_t)dvp, + (caddr_t)vp, lx_autofs_strdup(nm)); + } else { + VN_RELE(vp); + } + } else { + if (file_stack != NULL) { + lx_autofs_stack_push(file_stack, + (caddr_t)dvp, + (caddr_t)vp, lx_autofs_strdup(nm)); + } else { + VN_RELE(vp); + } + } + } + } + kmem_free(dbuf, dlen); + return (0); +} + +static void +lx_autofs_bs_destroy(vnode_t *dvp, char *path) +{ + list_t search_stack; + list_t dir_stack; + list_t file_stack; + vnode_t *pdvp, *vp; + char *dpath, *fpath; + int ret; + + if (VOP_LOOKUP(dvp, path, &vp, NULL, 0, NULL, kcred, + NULL, NULL, NULL) != 0) { + /* A directory entry with this name doesn't actually exist. */ + return; + } + + if ((vp->v_type & VDIR) == 0) { + /* Easy, the directory entry is a file so delete it. */ + VN_RELE(vp); + (void) VOP_REMOVE(dvp, path, kcred, NULL, 0); + return; + } + + /* + * The directory entry is a subdirectory, now we have a bit more + * work to do. (We'll have to recurse into the sub directory.) + * It would have been much easier to do this recursively but kernel + * stacks are notoriously small. + */ + ls_autofs_stack_init(&search_stack); + ls_autofs_stack_init(&dir_stack); + ls_autofs_stack_init(&file_stack); + + /* Save our newfound subdirectory into a list. */ + lx_autofs_stack_push(&search_stack, (caddr_t)dvp, (caddr_t)vp, + lx_autofs_strdup(path)); + + /* Do a recursive depth first search into the subdirectories. */ + while (lx_autofs_stack_pop(&search_stack, + (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) { + + /* Get a list of the subdirectories in this directory. 
*/ + if (lx_autofs_bs_readdir(dvp, &search_stack, NULL) != 0) + goto exit; + + /* Save the current directory a separate stack. */ + lx_autofs_stack_push(&dir_stack, (caddr_t)pdvp, (caddr_t)dvp, + dpath); + } + + /* + * Now dir_stack contains a list of directories, the deepest paths + * are at the top of the list. So let's go through and process them. + */ + while (lx_autofs_stack_pop(&dir_stack, + (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) { + + /* Get a list of the files in this directory. */ + if (lx_autofs_bs_readdir(dvp, NULL, &file_stack) != 0) { + VN_RELE(dvp); + lx_autofs_strfree(dpath); + goto exit; + } + + /* Delete all the files in this directory. */ + while (lx_autofs_stack_pop(&file_stack, + NULL, (caddr_t *)&vp, &fpath) == 0) { + VN_RELE(vp) + ret = VOP_REMOVE(dvp, fpath, kcred, NULL, 0); + lx_autofs_strfree(fpath); + if (ret != 0) { + lx_autofs_strfree(dpath); + goto exit; + } + } + + /* Delete this directory. */ + VN_RELE(dvp); + ret = VOP_RMDIR(pdvp, dpath, pdvp, kcred, NULL, 0); + lx_autofs_strfree(dpath); + if (ret != 0) + goto exit; + } + +exit: + while ( + (lx_autofs_stack_pop(&search_stack, NULL, (caddr_t *)&vp, + &path) == 0) || + (lx_autofs_stack_pop(&dir_stack, NULL, (caddr_t *)&vp, + &path) == 0) || + (lx_autofs_stack_pop(&file_stack, NULL, (caddr_t *)&vp, + &path) == 0)) { + VN_RELE(vp); + lx_autofs_strfree(path); + } + lx_autofs_stack_fini(&search_stack); + lx_autofs_stack_fini(&dir_stack); + lx_autofs_stack_fini(&file_stack); +} + +static vnode_t * +lx_autofs_bs_create(vnode_t *dvp, char *bs_name) +{ + vnode_t *vp; + vattr_t vattr; + + /* + * After looking at the mkdir syscall path it seems we don't need + * to initialize all of the vattr_t structure. 
+ */ + bzero(&vattr, sizeof (vattr)); + vattr.va_type = VDIR; + vattr.va_mode = 0755; /* u+rwx,og=rx */ + vattr.va_mask = AT_TYPE|AT_MODE; + + if (VOP_MKDIR(dvp, bs_name, &vattr, &vp, kcred, NULL, 0, NULL) != 0) + return (NULL); + return (vp); +} + +static int +lx_autofs_automounter_call(vnode_t *dvp, char *nm) +{ + lx_autofs_automnt_req_t *laar; + lx_autofs_vfs_t *data; + int error; + boolean_t is_dup; + + /* Get a pointer to the vfs mount data. */ + data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data; + + /* The automounter only supports queries in the root directory. */ + if (dvp != data->lav_root) + return (ENOENT); + + /* + * Check if the current process is in the automounters process + * group. (If it is, the current process is either the autmounter + * itself or one of it's forked child processes.) If so, don't + * redirect this call back into the automounter because we'll + * hang. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp == curproc->p_pgrp) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + /* Verify that the automount process pipe still exists. */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + return (ENOENT); + } + mutex_exit(&data->lav_lock); + + /* Allocate an automounter request structure. */ + if ((laar = lx_autofs_la_alloc(data, &is_dup, B_FALSE, + nm)) == NULL) + return (ENOENT); + + /* + * If we were the first one to allocate this request then we + * need to send it to the automounter. + */ + if ((!is_dup) && + ((error = lx_autofs_fifo_write(data, laar)) != 0)) { + /* + * Unable to send the request to the automounter. + * Unblock any other threads waiting on the request + * and release the request. + */ + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + return (error); + } + + /* Wait for someone to signal us that this request has completed. 
*/ + mutex_enter(&laar->laar_lock); + while (!laar->laar_complete) { + if (cv_wait_sig(&laar->laar_cv, &laar->laar_lock) == 0) { + /* We got a signal, abort this call. */ + mutex_exit(&laar->laar_lock); + lx_autofs_la_abort(data, laar); + return (EINTR); + } + } + mutex_exit(&laar->laar_lock); + + if (laar->laar_result == LXACR_READY) { + /* + * Mount succeeded, keep track for future expire calls. + * + * See vfs lav_vn_hash. Is this something we could use for + * iterating mounts under this autofs? Used by + * lx_autofs_vn_alloc + */ + lx_autofs_mntent_t *mp; + + mp = kmem_zalloc(sizeof (lx_autofs_mntent_t), KM_SLEEP); + mp->lxafme_len = strlen(nm) + 1; + mp->lxafme_path = kmem_zalloc(mp->lxafme_len, KM_SLEEP); + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + (void) strlcpy(mp->lxafme_path, nm, mp->lxafme_len); + + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + } + + lx_autofs_la_release(data, laar); + + return (0); +} + +/* + * Same preliminary checks as in lx_autofs_unmount. + */ +static boolean_t +lx_autofs_may_unmount(vfs_t *vfsp, struct cred *cr) +{ + lx_autofs_vfs_t *data; + + if (secpolicy_fs_unmount(cr, vfsp) != 0) + return (B_FALSE); + + /* + * We should never have a reference count of less than 2: one for the + * caller, one for the root vnode. + */ + ASSERT(vfsp->vfs_count >= 2); + + /* If there are any outstanding vnodes, we can't unmount. */ + if (vfsp->vfs_count > 2) + return (B_FALSE); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + ASSERT(data->lav_root->v_vfsp == vfsp); + + /* Check for any remaining holds on the root vnode. 
*/ + if (data->lav_root->v_count > 1) + return (B_FALSE); + + return (B_TRUE); +} + +static vfs_t * +lx_autofs_get_mountvfs(char *fs_mntpt, int *cnt) +{ + struct vfs *vfsp; + struct vfs *vfslist; + vfs_t *fnd_vfs = NULL; + int fsmplen; + int acnt = 0; + + fsmplen = strlen(fs_mntpt); + + vfs_list_read_lock(); + + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + *cnt = 0; + return (NULL); + } + + do { + /* Skip mounts we shouldn't show. */ + if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) { + char *mntpt; + + mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + if (strncmp(fs_mntpt, mntpt, fsmplen) == 0 && + (mntpt[fsmplen] == '\0' || mntpt[fsmplen] == '/')) { + /* + * We'll return the first one we find but don't + * return a mount that is actually autofs (i.e. + * autofs direct or offset mount). + */ + if (vfsp->vfs_op == lx_autofs_vfsops) { + acnt++; + } else if (fnd_vfs == NULL) { + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + } + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + *cnt = acnt; + return (fnd_vfs); +} + +/* + * Unmount all autofs offset mounts below the given path. + */ +static boolean_t +lx_autofs_umount_offset(char *fs_mntpt, struct cred *cr) +{ + struct vfs *vfsp; + struct vfs *vfslist; + boolean_t busy = B_FALSE; + int fsmplen = strlen(fs_mntpt); + +restart: + vfs_list_read_lock(); + + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + return (B_FALSE); + } + + do { + char *mntpt; + lx_autofs_vfs_t *data; + + /* Skip mounts we should ignore. 
*/ + if ((vfsp->vfs_flag & VFS_NOMNTTAB)) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + if (strncmp(fs_mntpt, mntpt, fsmplen) != 0 || + (mntpt[fsmplen] != '\0' && mntpt[fsmplen] != '/')) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + if (vfsp->vfs_op != lx_autofs_vfsops) { + /* + * Something got mounted over the autofs mountpoint + * after we checked that this inidrect hierarchy was + * not busy. + */ + busy = B_TRUE; + break; + } + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + if (data->lav_mnttype != LXAMT_OFFSET) { + /* + * Something mounted a non-offset autofs fs under this + * indirect mnt! + */ + busy = B_TRUE; + break; + } + + /* + * Attempt to umount - set busy if fails. + * + * umount2_engine will call VFS_RELE, so we need to take an + * extra hold to match the behavior during the normal umount + * path. + * + * We also need to drop the list lock to prevent deadlock + * during umount. + */ + VFS_HOLD(vfsp); + vfs_list_unlock(); + if (umount2_engine(vfsp, 0, cr, 0) != 0) { + busy = B_TRUE; + goto errexit; + } + + /* Retake list lock and look for more. */ + goto restart; + } while (vfsp != vfslist); + + vfs_list_unlock(); + +errexit: + return (busy); +} + + +/* + * Note that lx_autofs_automounter_call() only supports queries in the root + * directory, so all mntent names are relative to that. + */ +static int +lx_autofs_expire(vfs_t *vfsp, struct cred *cr) +{ + lx_autofs_vfs_t *data; + lx_autofs_mntent_t *mp; + lx_autofs_automnt_req_t *laar; + boolean_t is_dup; + vfs_t *fnd_vfs; + int autofs_cnt; + boolean_t busy = B_FALSE; + char exp_path[MAXPATHLEN]; + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + /* + * We process only the first element (i.e. do not do multi). This + * works fine for the automounter. 
+ */ + mutex_enter(&data->lav_lock); + mp = (lx_autofs_mntent_t *)list_remove_head(&data->lav_mnt_list); + mutex_exit(&data->lav_lock); + if (mp == NULL) { + if (data->lav_mnttype == LXAMT_OFFSET) { + /* + * During restart the automounter will openmount each + * offset mount for management. It won't closemount the + * offset mount until we expire it, even though nothing + * is mounted over that offset. We handle this as a + * special expiration case. + */ + int cnt; + + mutex_enter(&data->lav_lock); + cnt = data->lav_openmnt_cnt; + mutex_exit(&data->lav_lock); + + if (cnt == 1 && vn_ismntpt(data->lav_root) == 0) { + char *mntpt = (char *) + refstr_value(vfsp->vfs_mntpt); + char *nm = ZONE_PATH_TRANSLATE(mntpt, curzone); + + mp = kmem_zalloc(sizeof (lx_autofs_mntent_t), + KM_SLEEP); + mp->lxafme_len = strlen(nm) + 1; + mp->lxafme_path = kmem_zalloc(mp->lxafme_len, + KM_SLEEP); + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + (void) strlcpy(mp->lxafme_path, nm, + mp->lxafme_len); + + goto exp_offset; + } + } + + return (EAGAIN); + } + + /* + * We only return an expired mount if it is inactive for the full + * timeout. This reduces overly aggressive umount/mount activity. 
+ */ + if (data->lav_timeout > 0) { + uint64_t now = TICK_TO_SEC(ddi_get_lbolt64()); + + if ((now - mp->lxafme_ts) < data->lav_timeout) { + /* put it back at the end of the line */ + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + return (EAGAIN); + } + } + + if (data->lav_mnttype == LXAMT_INDIR) { + (void) snprintf(exp_path, sizeof (exp_path), "%s/%s", + (char *)refstr_value(vfsp->vfs_mntpt), mp->lxafme_path); + } else { + (void) strlcpy(exp_path, (char *)refstr_value(vfsp->vfs_mntpt), + sizeof (exp_path)); + } + + fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt); + if (fnd_vfs != NULL) { + boolean_t skip = B_FALSE; + vfssw_t *vfssw; + + /* + * If it's an NFS file system (typical) then we check in + * advance to see if it can be unmounted, otherwise, proceed. + * The fs-specific umount attempted by the automounter will + * either succeed or fail. Both are valid outcomes but checking + * now for nfs will save a bunch of work by the automounter + * if the fs is busy. + * + * Unfortunately, for NFS the vfs_fstype is the same for all + * versions of NFS, so we need to check the vfs_op member to + * determine which version of NFS we're dealing with. + */ + if (!skip && (vfssw = vfs_getvfssw("nfs4")) != NULL) { + if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) { + (void) dnlc_purge_vfsp(fnd_vfs, 0); + if (check_rtable4(fnd_vfs)) + busy = B_TRUE; + skip = B_TRUE; + } + vfs_unrefvfssw(vfssw); + } + + if (!skip && (vfssw = vfs_getvfssw("nfs3")) != NULL) { + if (vfs_matchops(fnd_vfs, &vfssw->vsw_vfsops)) { + (void) dnlc_purge_vfsp(fnd_vfs, 0); + if (check_rtable(fnd_vfs)) + busy = B_TRUE; + } + vfs_unrefvfssw(vfssw); + } + + VFS_RELE(fnd_vfs); + + } else if (autofs_cnt > 0) { + /* + * The automounter is asking us to expire and we pulled this + * name from our vfs mountpoint list, but if + * lx_autofs_get_mountvfs returns null then that means we + * didn't find a non-autofs mount under this name. 
Thus, the + * name could be a subdirectory under an autofs toplevel + * indirect mount with one or more offset mounts below. + * autofs_cnt will indicate how many autofs mounts exist below + * this subdirectory name. + * + * The automounter will take care of unmounting any fs mounted + * over one of these offset mounts (i.e. offset is like a + * direct mount which the automounter will manage) but the + * automounter will not unmount the actual autofs offset mount + * itself, so we have to do that before we can expire the + * top-level subrectory name. + */ + busy = lx_autofs_umount_offset(exp_path, cr); + } + + if (busy) { + /* + * Can't unmount this one right now, put it at the end of the + * list and return. The caller will return EAGAIN for the + * expire ioctl and the automounter will check again later. + */ + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + return (EAGAIN); + } + + /* + * See lx_autofs_automounter_call. We want to send a msg up the pipe + * to the automounter in a similar way. + */ + +exp_offset: + /* Verify that the automount process pipe still exists. */ + mutex_enter(&data->lav_lock); + if (data->lav_fifo_wr == NULL) { + ASSERT(data->lav_fifo_rd == NULL); + mutex_exit(&data->lav_lock); + goto err_free; + } + mutex_exit(&data->lav_lock); + + /* Allocate an automounter expire structure. */ + if ((laar = lx_autofs_la_alloc(data, &is_dup, B_TRUE, + mp->lxafme_path)) == NULL) + goto err_free; + + /* + * If we were the first one to allocate this request then we + * need to send it to the automounter. + */ + if (!is_dup && lx_autofs_fifo_write(data, laar) != 0) { + /* + * Unable to send the request to the automounter. + * Unblock any other threads waiting on the request + * and release the request. 
+ */ + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + goto err_free; + } + + /* Wait for someone to signal us that this request has completed. */ + mutex_enter(&laar->laar_lock); + while (!laar->laar_complete) { + if (cv_wait_sig(&laar->laar_cv, &laar->laar_lock) == 0) { + /* We got a signal, abort this request. */ + mutex_exit(&laar->laar_lock); + lx_autofs_la_abort(data, laar); + goto err_free; + } + } + mutex_exit(&laar->laar_lock); + + /* + * If it failed or if the file system is still mounted after we get the + * response from our expire msg, then that means the automounter tried + * to unmount it but failed because the file system is busy, so we put + * this entry back on our list to try to expire it again later. + */ + fnd_vfs = NULL; + if (laar->laar_result == LXACR_FAIL || + (fnd_vfs = lx_autofs_get_mountvfs(exp_path, &autofs_cnt)) != NULL || + autofs_cnt > 0) { + if (fnd_vfs != NULL) + VFS_RELE(fnd_vfs); + mp->lxafme_ts = TICK_TO_SEC(ddi_get_lbolt64()); + mutex_enter(&data->lav_lock); + list_insert_tail(&data->lav_mnt_list, mp); + mutex_exit(&data->lav_lock); + } else { + kmem_free(mp->lxafme_path, mp->lxafme_len); + kmem_free(mp, sizeof (lx_autofs_mntent_t)); + } + + lx_autofs_la_release(data, laar); + return (0); + +err_free: + kmem_free(mp->lxafme_path, mp->lxafme_len); + kmem_free(mp, sizeof (lx_autofs_mntent_t)); + return (EAGAIN); +} + +static int +lx_autofs_ack(int reqid, vfs_t *vfsp, enum lx_autofs_callres result) +{ + lx_autofs_vfs_t *data; + lx_autofs_automnt_req_t *laar; + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + if ((laar = lx_autofs_la_find(data, reqid)) == NULL) + return (ENXIO); + + /* Mark the request as complete and release it. 
*/ + laar->laar_result = result; + lx_autofs_la_complete(data, laar); + lx_autofs_la_release(data, laar); + return (0); +} + +static int +lx_autofs_automounter_ioctl(vnode_t *vp, int cmd, intptr_t arg, cred_t *cr) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + int id = arg; + int v; + int err; + + /* + * Be strict. + * We only accept ioctls from the automounter process group. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp != curproc->p_pgrp) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + switch (cmd) { + case LX_AUTOFS_IOC_READY: + if ((err = lx_autofs_ack(id, vp->v_vfsp, LXACR_READY)) != 0) + return (err); + return (0); + + case LX_AUTOFS_IOC_FAIL: + if ((err = lx_autofs_ack(id, vp->v_vfsp, LXACR_FAIL)) != 0) + return (err); + return (0); + + case LX_AUTOFS_IOC_CATATONIC: + /* The automounter is shutting down. */ + lx_autofs_fifo_close(data); + return (0); + + case LX_AUTOFS_IOC_PROTOVER: + v = LX_AUTOFS_PROTO_VERS5; + if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_PROTOSUBVER: + v = LX_AUTOFS_PROTO_SUBVERSION; + if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_ASKUMOUNT: + /* + * This is asking if autofs can be unmounted, not asking to + * actually unmount it. We return 1 if it is busy or 0 if it + * can be unmounted. 
+ */ + v = 1; + if (lx_autofs_may_unmount(vp->v_vfsp, cr)) + v = 0; + + if (copyout(&v, (caddr_t)arg, sizeof (int)) != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_SETTIMEOUT: + if (copyin((caddr_t)arg, &data->lav_timeout, sizeof (ulong_t)) + != 0) + return (EFAULT); + return (0); + + case LX_AUTOFS_IOC_EXPIRE: + return (ENOTSUP); + + case LX_AUTOFS_IOC_EXPIRE_MULTI: + lx_autofs_expire(vp->v_vfsp, cr); + return (EAGAIN); + + default: + ASSERT(0); + return (ENOTSUP); + } +} + +static int +lx_autofs_parse_mntopt(vfs_t *vfsp, lx_autofs_vfs_t *data) +{ + char *fd_str, *pgrp_str, *minproto_str, *maxproto_str; + int fd, pgrp, minproto, maxproto; + file_t *fp_wr, *fp_rd; + + /* Require these options to be present. */ + if ((vfs_optionisset(vfsp, LX_MNTOPT_FD, &fd_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_PGRP, &pgrp_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_MINPROTO, &minproto_str) != 1) || + (vfs_optionisset(vfsp, LX_MNTOPT_MAXPROTO, &maxproto_str) != 1)) + return (EINVAL); + + /* Get the values for each parameter. */ + if ((lx_autofs_str_to_int(fd_str, &fd) != 0) || + (lx_autofs_str_to_int(pgrp_str, &pgrp) != 0) || + (lx_autofs_str_to_int(minproto_str, &minproto) != 0) || + (lx_autofs_str_to_int(maxproto_str, &maxproto) != 0)) + return (EINVAL); + + /* + * We primarily support v2 & v5 of the linux kernel automounter + * protocol. The userland daemon typically needs v5. We'll reject + * unsupported ioctls later if we get one. + */ + if ((minproto > 5) || (maxproto < 2)) + return (EINVAL); + + /* + * Now we need to lookup the fifos we'll be using + * to talk to the userland automounter process. + */ + if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) { + /* + * The automounter doesn't always have the same id as the pgrp. + * This happens when it is started via one of the various + * service managers. In this case the fifo lookup will fail + * so we retry with our own pid. 
+ */ + int pid = (int)curproc->p_pid; + + if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0) + return (EINVAL); + } + + if (vfs_optionisset(vfsp, LX_MNTOPT_INDIRECT, NULL)) { + data->lav_mnttype = LXAMT_INDIR; + } + if (vfs_optionisset(vfsp, LX_MNTOPT_DIRECT, NULL)) { + if (data->lav_mnttype != LXAMT_NONE) + return (EINVAL); + data->lav_mnttype = LXAMT_DIRECT; + } + if (vfs_optionisset(vfsp, LX_MNTOPT_OFFSET, NULL)) { + if (data->lav_mnttype != LXAMT_NONE) + return (EINVAL); + data->lav_mnttype = LXAMT_OFFSET; + } + /* The automounter does test mounts with none of the options */ + if (data->lav_mnttype == LXAMT_NONE) + data->lav_mnttype = LXAMT_DIRECT; + + /* Save the mount options and fifo pointers. */ + data->lav_fd = fd; + data->lav_min_proto = minproto; + data->lav_pgrp = pgrp; + data->lav_fifo_rd = fp_rd; + data->lav_fifo_wr = fp_wr; + return (0); +} + +static uint64_t +s2l_dev(dev_t dev) +{ + major_t maj = getmajor(dev); + minor_t min = getminor(dev); + + return (LX_MAKEDEVICE(maj, min)); +} + +/* + * VFS entry points + */ +static int +lx_autofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + lx_autofs_vfs_t *data; + dev_t dev; + char name[40]; + int error; + vattr_t va; + + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) + return (EBUSY); + + /* We don't support mounts in the global zone. */ + if (getzoneid() == GLOBAL_ZONEID) + return (EPERM); + + /* + * Offset mounts will occur below the top-level mountpoint so we + * need to allow for autofs mounts even though mvp is an autofs. + */ + + /* Allocate a vfs struct. */ + data = kmem_zalloc(sizeof (lx_autofs_vfs_t), KM_SLEEP); + + /* Parse mount options. */ + if ((error = lx_autofs_parse_mntopt(vfsp, data)) != 0) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (error); + } + + /* Initialize the backing store. 
*/ + lx_autofs_bs_destroy(mvp, LX_AUTOFS_BS_DIR); + data->lav_bs_vp = lx_autofs_bs_create(mvp, LX_AUTOFS_BS_DIR); + if (data->lav_bs_vp == NULL) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (EBUSY); + } + data->lav_bs_name = LX_AUTOFS_BS_DIR; + + /* Get the backing store inode for use in v5 protocol msgs */ + va.va_mask = AT_STAT; + if ((error = VOP_GETATTR(data->lav_bs_vp, &va, 0, cr, NULL)) != 0) { + kmem_free(data, sizeof (lx_autofs_vfs_t)); + return (error); + } + data->lav_ino = va.va_nodeid; + + /* We have to hold the underlying vnode we're mounted on. */ + data->lav_mvp = mvp; + VN_HOLD(mvp); + + /* Initialize vfs fields */ + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lx_autofs_fstype; + vfsp->vfs_data = data; + + /* Invent a dev_t (sigh) */ + do { + dev = makedevice(lx_autofs_major, + atomic_add_32_nv(&lx_autofs_minor, 1) & L_MAXMIN32); + } while (vfs_devismounted(dev)); + vfsp->vfs_dev = dev; + vfs_make_fsid(&vfsp->vfs_fsid, dev, lx_autofs_fstype); + + data->lav_dev = s2l_dev(vfsp->vfs_dev); + + /* Create an id space arena for automounter requests. */ + (void) snprintf(name, sizeof (name), "lx_autofs_id_%d", + getminor(vfsp->vfs_dev)); + data->lav_ids = id_space_create(name, 1, INT_MAX); + + /* Create hashes to keep track of automounter requests. */ + mutex_init(&data->lav_lock, NULL, MUTEX_DEFAULT, NULL); + (void) snprintf(name, sizeof (name), "lx_autofs_path_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_path_hash = mod_hash_create_strhash(name, + LX_AUTOFS_VFS_PATH_HASH_SIZE, mod_hash_null_valdtor); + (void) snprintf(name, sizeof (name), "lx_autofs_id_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_id_hash = mod_hash_create_idhash(name, + LX_AUTOFS_VFS_ID_HASH_SIZE, mod_hash_null_valdtor); + + /* Create a hash to keep track of vnodes. 
*/ + (void) snprintf(name, sizeof (name), "lx_autofs_vn_hash_%d", + getminor(vfsp->vfs_dev)); + data->lav_vn_hash = mod_hash_create_ptrhash(name, + LX_AUTOFS_VFS_VN_HASH_SIZE, mod_hash_null_valdtor, + sizeof (vnode_t)); + + list_create(&data->lav_mnt_list, sizeof (lx_autofs_mntent_t), + offsetof(lx_autofs_mntent_t, lxafme_lst)); + + /* Create root vnode */ + data->lav_root = lx_autofs_vn_alloc(vfsp, data->lav_bs_vp); + + data->lav_root->v_flag |= VROOT | VNOCACHE | VNOMAP | VNOSWAP; + + /* + * For a direct mountpoint we need to allow a filesystem to be + * mounted overtop of this autofs mount. Otherwise, disallow that. + */ + if (data->lav_mnttype == LXAMT_INDIR) + data->lav_root->v_flag |= VNOMOUNT; + + return (0); +} + +static int +lx_autofs_unmount(vfs_t *vfsp, int flag, struct cred *cr) +{ + lx_autofs_vfs_t *data; + + if (secpolicy_fs_unmount(cr, vfsp) != 0) + return (EPERM); + + /* We do not currently support forced unmounts. */ + if (flag & MS_FORCE) + return (ENOTSUP); + + /* + * We should never have a reference count of less than 2: one for the + * caller, one for the root vnode. + */ + ASSERT(vfsp->vfs_count >= 2); + + /* If there are any outstanding vnodes, we can't unmount. */ + if (vfsp->vfs_count > 2) + return (EBUSY); + + /* Check for any remaining holds on the root vnode. */ + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + ASSERT(data->lav_root->v_vfsp == vfsp); + if (data->lav_root->v_count > 1) + return (EBUSY); + + /* Close the fifo to the automount process. */ + if (data->lav_fifo_wr != NULL) + (void) closef(data->lav_fifo_wr); + if (data->lav_fifo_rd != NULL) + (void) closef(data->lav_fifo_rd); + + /* + * We have to release our hold on our root vnode before we can + * delete the backing store. (Since the root vnode is linked + * to the backing store.) + */ + VN_RELE(data->lav_root); + + /* Cleanup the backing store. */ + lx_autofs_bs_destroy(data->lav_mvp, data->lav_bs_name); + VN_RELE(data->lav_mvp); + + /* + * Delete all listed mounts. 
+ */ + for (;;) { + lx_autofs_mntent_t *mp; + + mp = list_remove_head(&data->lav_mnt_list); + if (mp == NULL) + break; + kmem_free(mp->lxafme_path, mp->lxafme_len); + kmem_free(mp, sizeof (lx_autofs_mntent_t)); + } + + /* Cleanup out remaining data structures. */ + mod_hash_destroy_strhash(data->lav_path_hash); + mod_hash_destroy_idhash(data->lav_id_hash); + mod_hash_destroy_ptrhash(data->lav_vn_hash); + id_space_destroy(data->lav_ids); + list_destroy(&data->lav_mnt_list); + kmem_free(data, sizeof (lx_autofs_vfs_t)); + + return (0); +} + +static int +lx_autofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + *vpp = data->lav_root; + VN_HOLD(*vpp); + + return (0); +} + +static int +lx_autofs_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *urvp = data->lav_root->v_data; + dev32_t d32; + int error; + + if ((error = VFS_STATVFS(urvp->v_vfsp, sp)) != 0) + return (error); + + /* Update some of values before returning. */ + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + (void) strlcpy(sp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name, + sizeof (sp->f_basetype)); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + return (0); +} + +static const fs_operation_def_t lx_autofs_vfstops[] = { + { VFSNAME_MOUNT, { .vfs_mount = lx_autofs_mount } }, + { VFSNAME_UNMOUNT, { .vfs_unmount = lx_autofs_unmount } }, + { VFSNAME_ROOT, { .vfs_root = lx_autofs_root } }, + { VFSNAME_STATVFS, { .vfs_statvfs = lx_autofs_statvfs } }, + { NULL, NULL } +}; + +/* + * VOP entry points - simple passthrough + * + * For most VOP entry points we can simply pass the request on to + * the underlying filesystem we're mounted on. 
+ */ +static int +lx_autofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_CLOSE(uvp, flag, count, offset, cr, ctp)); +} + +static int +lx_autofs_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ctp, int flags) +{ + vnode_t *uvp = vp->v_data; + return (VOP_READDIR(uvp, uiop, cr, eofp, ctp, flags)); +} + +static int +lx_autofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_ACCESS(uvp, mode, flags, cr, ctp)); +} + +static int +lx_autofs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + return (VOP_RWLOCK(uvp, write_lock, ctp)); +} + +static void +lx_autofs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + VOP_RWUNLOCK(uvp, write_lock, ctp); +} + +/* + * Check if attempting to access a 'direct' mount and if so, call the + * automounter to perform the mount. Once the mount occurs, the new filesystem + * will be mounted overtop of this autofs mountpoint and we will no longer + * come through this path. + */ +static vnode_t * +lx_autofs_do_direct(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vfsp->vfs_data; + vnode_t *nvp; + boolean_t skip_am_call = B_FALSE; + + if (data->lav_mnttype == LXAMT_INDIR) + return (NULL); + + /* + * Check if the current process is in the automounter's process group. + * If it is, the current process is either the automounter itself or + * one of it's children. If so, don't call back into the automounter. + */ + mutex_enter(&pidlock); + if (data->lav_pgrp == curproc->p_pgrp) { + skip_am_call = B_TRUE; + } + mutex_exit(&pidlock); + + /* + * It is possible there is already a new fs mounted on top of our vnode. 
+ * This can happen if the caller first did a lookup of a file name + * using our vnode as the directory vp. The lookup would trigger the + * autofs mount on top of ourself, but if the caller then uses our + * vnode to do a getattr on the directory, it will use the autofs + * vnode and not the newly mounted vnode. We need to skip re-calling + * the automounter for this case. + */ + if (!skip_am_call && vn_mountedvfs(vp) == NULL) { + char tbuf[MAXPATHLEN]; + char *nm; + + (void) strlcpy(tbuf, (char *)refstr_value(vfsp->vfs_mntpt), + sizeof (tbuf)); + nm = tbuf + strlen(tbuf); + while (*nm != '/' && nm != tbuf) + nm--; + if (*nm == '/') + nm++; + (void) lx_autofs_automounter_call(vp, nm); + } + + /* + * We need to take an extra hold on our vp (which is the autofs + * root vp) to account for the rele done in traverse. traverse will + * take a hold on the new vp so the caller is responsible for calling + * VN_RELE on the returned vp. + */ + VN_HOLD(vp); + nvp = vp; + if (traverse(&nvp) != 0) { + VN_RELE(nvp); + return (NULL); + } + + /* Confirm that we have a non-autofs fs mounted now */ + if (nvp->v_op == lx_autofs_vn_ops) { + VN_RELE(nvp); + return (NULL); + } + + return (nvp); +} + +/*ARGSUSED*/ +static int +lx_autofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ctp, int flags) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *nvp; + + /* handle direct mount here */ + if ((nvp = lx_autofs_do_direct(dvp)) != NULL) { + int error; + + error = VOP_RMDIR(nvp, nm, cdir, cr, ctp, flags); + VN_RELE(nvp); + return (error); + } + + /* + * cdir is the calling processes current directory. + * If cdir is lx_autofs vnode then get its real underlying + * vnode ptr. (It seems like the only thing cdir is + * ever used for is to make sure the user doesn't delete + * their current directory.) 
+ */ + if (vn_matchops(cdir, lx_autofs_vn_ops)) { + vnode_t *ucdir = cdir->v_data; + return (VOP_RMDIR(udvp, nm, ucdir, cr, ctp, flags)); + } + + return (VOP_RMDIR(udvp, nm, cdir, cr, ctp, flags)); +} + +/* + * VOP entry points - special passthrough + * + * For some VOP entry points we will first pass the request on to + * the underlying filesystem we're mounted on. If there's an error + * then we immediately return the error, but if the request succeeds + * we have to do some extra work before returning. + */ +static int +lx_autofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ctp) +{ + vnode_t *ovp = *vpp; + vnode_t *uvp = ovp->v_data; + int error; + + /* direct mounts were handled by the lookup to get *vpp */ + + if ((error = VOP_OPEN(&uvp, flag, cr, ctp)) != 0) + return (error); + + /* Check for clone opens. */ + if (uvp == ovp->v_data) + return (0); + + /* Deal with clone opens by returning a new vnode. */ + *vpp = lx_autofs_vn_alloc(ovp->v_vfsp, uvp); + VN_RELE(ovp); + return (0); +} + +/* + * Internally, we have already converted our autofs vfs device number into a + * Linux-format device during lx_autofs_mount and stored that device number + * in data->lav_dev. However, our lx emulation for the various stat() syscalls + * also wants to convert the fsid the same way. That obviously will be + * incorrect if we pass along an fsid that is already converted, so we always + * pass along the original vfs fsid here. Both lav_dev and lav_ino are passed + * in messages to the automounter, and these must match the values obtained by + * stat(). 
+ */ +static int +lx_autofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + vnode_t *dvp; + int error; + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + dev_t autofs_fsid = vp->v_vfsp->vfs_dev; + + if ((dvp = lx_autofs_do_direct(vp)) != NULL) { + uvp = dvp; + } + + error = VOP_GETATTR(uvp, vap, flags, cr, ctp); + + if (dvp != NULL) { + /* we operated on the direct mounted fs */ + VN_RELE(dvp); + if (error == 0) { + /* + * During automounter restart recovery, the automounter + * will fstat the fd provided in the setpipe ioctl. It + * uses the resulting inode & dev to correlate future + * autofs fifo requests to the correct entry. Thus, we + * have to update the attributes with the proper IDs. + */ + vap->va_fsid = autofs_fsid; + vap->va_nodeid = data->lav_ino; + } + } else if (error == 0) { + /* Update the attributes with our filesystem id. */ + vap->va_fsid = autofs_fsid; + } + + return (error); +} + +static int +lx_autofs_mkdir(vnode_t *dvp, char *nm, struct vattr *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *ctp, int flags, vsecattr_t *vsecp) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *nvp; + int error; + + if ((nvp = lx_autofs_do_direct(dvp)) != NULL) { + udvp = nvp; + } + + error = VOP_MKDIR(udvp, nm, vap, vpp, cr, ctp, flags, vsecp); + + if (nvp != NULL) { + /* we operated on the direct mounted fs */ + VN_RELE(nvp); + } else if (error == 0) { + vnode_t *uvp = NULL; + + /* Update the attributes with our filesystem id. */ + vap->va_fsid = dvp->v_vfsp->vfs_dev; + + /* Allocate our new vnode. 
*/ + uvp = *vpp; + *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp); + } + + return (error); +} + +/* + * VOP entry points - custom + */ +/*ARGSUSED*/ +static void +lx_autofs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ctp) +{ + lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data; + + /* + * We need to hold the vfs lock because if we're going to free + * this vnode we have to prevent anyone from looking it up + * in the vnode hash. + */ + mutex_enter(&data->lav_lock); + mutex_enter(&vp->v_lock); + + if (vp->v_count < 1) { + panic("lx_autofs_inactive: bad v_count"); + /*NOTREACHED*/ + } + + /* Drop the temporary hold by vn_rele now. */ + if (--vp->v_count > 0) { + mutex_exit(&vp->v_lock); + mutex_exit(&data->lav_lock); + return; + } + + /* + * No one should have been blocked on this lock because we're + * about to free this vnode. + */ + lx_autofs_vn_free(vp); +} + +static int +lx_autofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ctp, + int *direntflags, pathname_t *realpnp) +{ + vnode_t *udvp = dvp->v_data; + vnode_t *uvp = NULL; + lx_autofs_vfs_t *data; + int error = ENOENT; + + data = (lx_autofs_vfs_t *)dvp->v_vfsp->vfs_data; + + /* + * For an indirect mount first try to lookup if this path component + * already exists. + */ + if (data->lav_mnttype == LXAMT_INDIR) { + if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr, + ctp, direntflags, realpnp)) == 0) { + *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp); + return (0); + } + } + + /* Only query the automounter if the path does not exist. */ + if (error != ENOENT) + return (error); + + if (flags & __FLXNOAUTO) + return (ENOENT); + + if (data->lav_catatonic) + return (ENOENT); + + /* Save the uid/gid for the requestor ioctl. */ + data->lav_uid = crgetuid(cr); + data->lav_gid = crgetgid(cr); + + /* Refer the lookup to the automounter. 
*/ + if ((error = lx_autofs_automounter_call(dvp, nm)) != 0) + return (error); + + if (data->lav_mnttype == LXAMT_INDIR) { + /* + * Indirect mount. The automounter call should have mounted + * something on nm. Retry the lookup operation. + */ + if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr, + ctp, direntflags, realpnp)) == 0) { + *vpp = lx_autofs_vn_alloc(dvp->v_vfsp, uvp); + return (0); + } + } else { + /* + * Direct or offset mount. The automounter call should have + * covered our 'dvp' with a new filesystem. Traverse into the + * new mount and retry the lookup. + * + * We need to take an extra hold on our vp (which is the autofs + * root vp) to account for the rele done in traverse. Our caller + * will also do a rele on the original dvp and that would leave + * us one ref short on our autofs root vnode. + */ + vnode_t *orig_dvp = dvp; + + VN_HOLD(dvp); + if ((error = traverse(&dvp)) != 0) { + VN_RELE(dvp); + return (error); + } + + if (dvp == orig_dvp) { + /* + * For some reason the automountd did not actually + * mount the new filesystem. Return an error. + */ + VN_RELE(dvp); + return (ENOENT); + } + + error = VOP_LOOKUP(dvp, nm, vpp, pnp, flags, rdir, cr, ctp, + direntflags, realpnp); + + /* release the traverse hold */ + VN_RELE(dvp); + } + return (error); +} + +static int +lx_autofs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr, + int *rvalp, caller_context_t *ctp) +{ + vnode_t *uvp = vp->v_data; + + /* Intercept our ioctls. */ + switch ((uint_t)cmd) { + case LX_AUTOFS_IOC_READY: + case LX_AUTOFS_IOC_FAIL: + case LX_AUTOFS_IOC_CATATONIC: + case LX_AUTOFS_IOC_PROTOVER: + case LX_AUTOFS_IOC_SETTIMEOUT: + case LX_AUTOFS_IOC_EXPIRE: + case LX_AUTOFS_IOC_EXPIRE_MULTI: + case LX_AUTOFS_IOC_PROTOSUBVER: + case LX_AUTOFS_IOC_ASKUMOUNT: + return (lx_autofs_automounter_ioctl(vp, cmd, arg, cr)); + } + + /* Pass any remaining ioctl on. 
*/ + return (VOP_IOCTL(uvp, cmd, arg, mode, cr, rvalp, ctp)); +} + +/* + * VOP entry points definitions + */ +static const fs_operation_def_t lx_autofs_tops_root[] = { + { VOPNAME_OPEN, { .vop_open = lx_autofs_open } }, + { VOPNAME_CLOSE, { .vop_close = lx_autofs_close } }, + { VOPNAME_IOCTL, { .vop_ioctl = lx_autofs_ioctl } }, + { VOPNAME_RWLOCK, { .vop_rwlock = lx_autofs_rwlock } }, + { VOPNAME_RWUNLOCK, { .vop_rwunlock = lx_autofs_rwunlock } }, + { VOPNAME_GETATTR, { .vop_getattr = lx_autofs_getattr } }, + { VOPNAME_ACCESS, { .vop_access = lx_autofs_access } }, + { VOPNAME_READDIR, { .vop_readdir = lx_autofs_readdir } }, + { VOPNAME_LOOKUP, { .vop_lookup = lx_autofs_lookup } }, + { VOPNAME_INACTIVE, { .vop_inactive = lx_autofs_inactive } }, + { VOPNAME_MKDIR, { .vop_mkdir = lx_autofs_mkdir } }, + { VOPNAME_RMDIR, { .vop_rmdir = lx_autofs_rmdir } }, + { NULL } +}; + +/* + * DEV-specific entry points + */ + +/*ARGSUSED*/ +static int +lx_autofs_dev_open(dev_t *devp, int flags, int otyp, cred_t *credp) +{ + return (0); +} + +/*ARGSUSED*/ +static int +lx_autofs_dev_close(dev_t dev, int flags, int otyp, cred_t *credp) +{ + return (0); +} + +static int +lx_autofs_dev_validate_cmd(intptr_t arg, lx_autofs_dv_ioctl_t *dcmd) +{ + if (copyin((caddr_t)arg, dcmd, sizeof (lx_autofs_dv_ioctl_t)) != 0) + return (EFAULT); + + if (dcmd->lad_ver_major != LX_AUTOFS_DEV_VERSION_MAJOR || + dcmd->lad_ver_minor > LX_AUTOFS_DEV_VERSION_MINOR) + return (EINVAL); + + DTRACE_PROBE1(lx__dev__cmd, void *, dcmd); + + /* Fill in the version for return */ + dcmd->lad_ver_major = LX_AUTOFS_DEV_VERSION_MAJOR; + dcmd->lad_ver_minor = LX_AUTOFS_DEV_VERSION_MINOR; + return (0); +} + +static vfs_t * +lx_autofs_dev_getvfs_bypath(char *fs_mntpt) +{ + struct vfs *vfsp; + struct vfs *vfslist; + vfs_t *fnd_vfs = NULL; + zone_t *zone = curzone; + + vfs_list_read_lock(); + + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + return (NULL); + } + + do { + if (vfsp->vfs_op 
== lx_autofs_vfsops) { + char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + + if (strcmp(fs_mntpt, ZONE_PATH_TRANSLATE(mntpt, zone)) + == 0) { + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + break; + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + return (fnd_vfs); +} + +static int +lx_autofs_dev_fd_preamble(intptr_t arg, lx_autofs_dv_ioctl_t *dc, vfs_t **vfspp) +{ + int err; + lx_autofs_vfs_t *data; + file_t *fp; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_validate_cmd(arg, dc)) != 0) + return (err); + + if ((fp = getf(dc->lad_ioctlfd)) == NULL) + return (EBADF); + + vfsp = fp->f_vnode->v_vfsp; + if (vfsp->vfs_op != lx_autofs_vfsops) { + releasef(dc->lad_ioctlfd); + return (EBADF); + } + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + if (data->lav_root->v_count <= 1) { + releasef(dc->lad_ioctlfd); + return (EBADF); + } + + VFS_HOLD(vfsp); + *vfspp = vfsp; + + releasef(dc->lad_ioctlfd); + return (0); +} + +static int +lx_autofs_dev_vers(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} + +static int +lx_autofs_dev_protver(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + dcmd.lad_arg1 = LX_AUTOFS_PROTO_VERS5; + + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} + +static int +lx_autofs_dev_protosubver(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + + if ((err = lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + dcmd.lad_arg1 = LX_AUTOFS_PROTO_SUBVERSION; + + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} + +static int +lx_autofs_dev_get_path_cmd(intptr_t arg, lx_autofs_dv_ioctl_t **dcp) +{ + int err; + lx_autofs_dv_ioctl_t dcmd, *dc; + + if ((err = 
lx_autofs_dev_validate_cmd(arg, &dcmd)) != 0) + return (err); + + if (dcmd.lad_size <= sizeof (dcmd) || + dcmd.lad_size > (sizeof (dcmd) + MAXPATHLEN)) + return (EINVAL); + + dc = kmem_alloc(dcmd.lad_size, KM_SLEEP); + + /* re-copyin the full struct with the path */ + if (copyin((caddr_t)arg, dc, dcmd.lad_size) != 0) { + kmem_free(dc, dcmd.lad_size); + return (EFAULT); + } + dc->lad_size = dcmd.lad_size; + + if (dc->lad_path[0] != '/' || + dc->lad_path[dcmd.lad_size - sizeof (dcmd) - 1] != '\0') { + kmem_free(dc, dcmd.lad_size); + return (EINVAL); + } + + *dcp = dc; + return (0); +} + +static int +lx_autofs_dev_openmount(intptr_t arg) +{ + int err; + int fd; + lx_autofs_dv_ioctl_t *dc; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0) + return (err); + + if ((vfsp = lx_autofs_dev_getvfs_bypath(dc->lad_path)) == NULL) { + kmem_free(dc, dc->lad_size); + return (EINVAL); + } + + /* lad_arg1 is the dev number of the mnt but we don't check that */ + + /* + * Do an "open" on the root vnode. To fully simulate "open" we also add + * a hold on the root vnode itself since lx_autofs_open will only open + * (and hold) the underlying vnode. 
+ */ + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + VN_HOLD(data->lav_root); + if ((err = fassign(&data->lav_root, FWRITE|FREAD, &fd)) != 0) { + VN_RELE(data->lav_root); + VFS_RELE(vfsp); + kmem_free(dc, dc->lad_size); + return (err); + } + + mutex_enter(&data->lav_lock); + data->lav_openmnt_cnt++; + mutex_exit(&data->lav_lock); + + dc->lad_ioctlfd = fd; + + if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) { + mutex_enter(&data->lav_lock); + data->lav_openmnt_cnt--; + mutex_exit(&data->lav_lock); + (void) closeandsetf(fd, NULL); + VFS_RELE(vfsp); + kmem_free(dc, dc->lad_size); + return (EFAULT); + } + VFS_RELE(vfsp); + + kmem_free(dc, dc->lad_size); + return (0); +} + +static int +lx_autofs_dev_closemount(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + /* "close" the vnode */ + if ((err = closeandsetf(dcmd.lad_ioctlfd, NULL)) != 0) { + VFS_RELE(vfsp); + return (err); + } + + mutex_enter(&data->lav_lock); + ASSERT(data->lav_openmnt_cnt > 0); + data->lav_openmnt_cnt--; + mutex_exit(&data->lav_lock); + + VFS_RELE(vfsp); + return (0); +} + +static int +lx_autofs_dev_ready(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + if ((err = lx_autofs_ack(dcmd.lad_arg1, vfsp, LXACR_READY)) != 0) { + VFS_RELE(vfsp); + return (err); + } + + VFS_RELE(vfsp); + return (0); +} + +static int +lx_autofs_dev_fail(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + if ((err = lx_autofs_ack(dcmd.lad_arg1, vfsp, LXACR_FAIL)) != 0) { + VFS_RELE(vfsp); + return (err); + } + + VFS_RELE(vfsp); + return (0); +} + +/* + * Update the fifo pipe information we use to talk to the automounter. 
The + * ioctl is used when the automounter restarts. This logic is similar to the + * handling done in lx_autofs_parse_mntopt() when the filesystem is first + * mounted. + */ +static int +lx_autofs_dev_setpipefd(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + int fd, pgrp; + file_t *fp_wr, *fp_rd; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + mutex_enter(&pidlock); + pgrp = curproc->p_pgrp; + mutex_exit(&pidlock); + fd = dcmd.lad_arg1; + + /* Lookup the new fifos. See comment in lx_autofs_parse_mntopt. */ + if (lx_autofs_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0) { + int pid = (int)curproc->p_pid; + + if (lx_autofs_fifo_lookup(pid, fd, &fp_wr, &fp_rd) != 0) { + VFS_RELE(vfsp); + return (EINVAL); + } + } + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + + /* Close the old fifos. */ + if (data->lav_fifo_wr != NULL) + (void) closef(data->lav_fifo_wr); + if (data->lav_fifo_rd != NULL) + (void) closef(data->lav_fifo_rd); + + data->lav_fd = fd; + data->lav_pgrp = pgrp; + data->lav_fifo_rd = fp_rd; + data->lav_fifo_wr = fp_wr; + /* + * Not explicitly in the ioctl spec. 
but necessary for correct recovery + */ + data->lav_catatonic = B_FALSE; + + VFS_RELE(vfsp); + + return (0); +} + +static int +lx_autofs_dev_catatonic(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + data->lav_catatonic = B_TRUE; + VFS_RELE(vfsp); + + return (0); +} + +static int +lx_autofs_dev_expire(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + /* If it succeeds in expiring then we don't want to return EAGAIN */ + if ((err = lx_autofs_expire(vfsp, kcred)) == 0) { + VFS_RELE(vfsp); + return (0); + } + + VFS_RELE(vfsp); + return (EAGAIN); +} + +static int +lx_autofs_dev_timeout(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + lx_autofs_vfs_t *data; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + data = (lx_autofs_vfs_t *)vfsp->vfs_data; + data->lav_timeout = dcmd.lad_arg1; + VFS_RELE(vfsp); + + return (0); +} + +static int +lx_autofs_dev_requestor(intptr_t arg) +{ + int err; + lx_autofs_dv_ioctl_t *dc; + vfs_t *vfsp; + vfs_t *fnd_vfs = NULL; + struct vfs *vfslist; + zone_t *zone = curzone; + lx_autofs_vfs_t *data; + uid_t uid; + gid_t gid; + + if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0) + return (err); + + vfs_list_read_lock(); + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + kmem_free(dc, dc->lad_size); + return (EINVAL); + } + + do { + /* Skip mounts we shouldn't show. 
*/ + if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) { + char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + + if (strcmp(dc->lad_path, + ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) { + + if (vfsp->vfs_op != lx_autofs_vfsops) { + /* + * Found an indirect mount (probably + * NFS) so we need to get the vfs it's + * mounted onto. + */ + vnode_t *vn = vfsp->vfs_vnodecovered; + vfsp = vn->v_vfsp; + + if (vfsp->vfs_op != lx_autofs_vfsops) { + /* + * autofs doesn't manage this + * path. + */ + break; + } + } + + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + break; + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + vfs_list_unlock(); + + if (fnd_vfs == NULL) { + kmem_free(dc, dc->lad_size); + return (EINVAL); + } + + data = (lx_autofs_vfs_t *)fnd_vfs->vfs_data; + uid = data->lav_uid; + gid = data->lav_gid; + VFS_RELE(fnd_vfs); + + dc->lad_arg1 = uid; + dc->lad_arg2 = gid; + + if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) { + kmem_free(dc, dc->lad_size); + return (EFAULT); + } + + kmem_free(dc, dc->lad_size); + return (0); +} + +static int +lx_autofs_dev_ismntpt(intptr_t arg) +{ + int err = 0; + lx_autofs_dv_ioctl_t *dc; + struct vfs *vfslist; + vfs_t *vfsp; + vfs_t *fnd_vfs = NULL; + zone_t *zone = curzone; + + if ((err = lx_autofs_dev_get_path_cmd(arg, &dc)) != 0) + return (err); + + /* + * The automounter will always pass a path. It can also either pass an + * ioctlfd or, if it's -1, arg1 can be an LX_AUTOFS_TYPE_* value. We + * currently don't need those for our algorithm. 
+ */ + + vfs_list_read_lock(); + vfsp = vfslist = curzone->zone_vfslist; + if (vfslist == NULL) { + vfs_list_unlock(); + kmem_free(dc, dc->lad_size); + return (0); /* return 0 if not a mount point */ + } + + do { + if (!(vfsp->vfs_flag & VFS_NOMNTTAB)) { + char *mntpt = (char *)refstr_value(vfsp->vfs_mntpt); + + if (strcmp(dc->lad_path, + ZONE_PATH_TRANSLATE(mntpt, zone)) == 0) { + + /* + * To handle direct mounts (on top of an autofs + * mount), we must prefer non-autofs vfs for + * this request. + */ + if (fnd_vfs != NULL) + VFS_RELE(fnd_vfs); + + fnd_vfs = vfsp; + VFS_HOLD(fnd_vfs) + + if (fnd_vfs->vfs_op != lx_autofs_vfsops) + break; + } + } + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + vfs_list_unlock(); + + if (fnd_vfs == NULL) { + kmem_free(dc, dc->lad_size); + return (0); /* return 0 if not a mount point */ + } + + /* + * arg1 is device number, arg2 is superblock magic number + * The superblock value only matters if autofs or not. + */ + dc->lad_arg1 = fnd_vfs->vfs_dev; + if (fnd_vfs->vfs_op == lx_autofs_vfsops) { + dc->lad_arg2 = LX_AUTOFS_SB_MAGIC; + } else { + dc->lad_arg2 = ~LX_AUTOFS_SB_MAGIC; + } + + VFS_RELE(fnd_vfs); + + if (copyout(dc, (caddr_t)arg, sizeof (lx_autofs_dv_ioctl_t)) != 0) { + kmem_free(dc, dc->lad_size); + return (EFAULT); + } + + kmem_free(dc, dc->lad_size); + + /* + * We have to return 1 if it is a mount point. The lx ioctl autofs + * translator will convert a negative value back to a positive, + * non-error return value. 
+ */ + return (-1); +} + +static int +lx_autofs_dev_askumount(intptr_t arg) +{ + int err; + int v; + lx_autofs_dv_ioctl_t dcmd; + vfs_t *vfsp; + + if ((err = lx_autofs_dev_fd_preamble(arg, &dcmd, &vfsp)) != 0) + return (err); + + if (lx_autofs_may_unmount(vfsp, kcred)) { + v = 0; + } else { + v = 1; + } + VFS_RELE(vfsp); + + dcmd.lad_arg1 = v; + if (copyout(&dcmd, (caddr_t)arg, sizeof (dcmd)) != 0) + return (EFAULT); + + return (0); +} + +/*ARGSUSED*/ +static int +lx_autofs_dev_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + switch (cmd) { + case LX_AUTOFS_DEV_IOC_VERSION_CMD: + return (lx_autofs_dev_vers(arg)); + + case LX_AUTOFS_DEV_IOC_PROTOVER_CMD: + return (lx_autofs_dev_protver(arg)); + + case LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD: + return (lx_autofs_dev_protosubver(arg)); + + case LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD: + return (lx_autofs_dev_openmount(arg)); + + case LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD: + return (lx_autofs_dev_closemount(arg)); + + case LX_AUTOFS_DEV_IOC_READY_CMD: + return (lx_autofs_dev_ready(arg)); + + case LX_AUTOFS_DEV_IOC_FAIL_CMD: + return (lx_autofs_dev_fail(arg)); + + case LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD: + return (lx_autofs_dev_setpipefd(arg)); + + case LX_AUTOFS_DEV_IOC_CATATONIC_CMD: + return (lx_autofs_dev_catatonic(arg)); + + case LX_AUTOFS_DEV_IOC_TIMEOUT_CMD: + return (lx_autofs_dev_timeout(arg)); + + case LX_AUTOFS_DEV_IOC_REQUESTER_CMD: + return (lx_autofs_dev_requestor(arg)); + + case LX_AUTOFS_DEV_IOC_EXPIRE_CMD: + return (lx_autofs_dev_expire(arg)); + + case LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD: + return (lx_autofs_dev_askumount(arg)); + + case LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD: + return (lx_autofs_dev_ismntpt(arg)); + } + + return (EINVAL); +} + +/* + * lx_autofs_init() gets invoked via the mod_install() call in + * this module's _init() routine. Therefore, the code that cleans + * up the structures we allocate below is actually found in + * our _fini() routine. 
+ */ +/* ARGSUSED */ +static int +lx_autofs_init(int fstype, char *name) +{ + int error; + + lx_autofs_major = ddi_name_to_major(LX_AUTOFS_NAME); + + lx_autofs_fstype = fstype; + if ((error = vfs_setfsops(fstype, lx_autofs_vfstops, + &lx_autofs_vfsops)) != 0) { + cmn_err(CE_WARN, "lx_autofs_init: bad vfs ops template"); + return (error); + } + + if ((error = vn_make_ops(name, lx_autofs_tops_root, + &lx_autofs_vn_ops)) != 0) { + VERIFY(vfs_freevfsops_by_type(fstype) == 0); + lx_autofs_vn_ops = NULL; + return (error); + } + + return (0); +} + +/*ARGSUSED*/ +static int +lx_autofs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance = ddi_get_instance(dip); + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + ASSERT(instance == 0); + if (instance != 0) + return (DDI_FAILURE); + + /* create our minor node */ + if (ddi_create_minor_node(dip, LX_AUTOFS_MINORNAME, S_IFCHR, 0, + DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + lx_autofs_dip = dip; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_autofs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + lx_autofs_dip = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_autofs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, + void **resultp) +{ + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *resultp = lx_autofs_dip; + return (DDI_SUCCESS); + + case DDI_INFO_DEVT2INSTANCE: + *resultp = (void *)0; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Driver entry points (cb_ops); only open, close and ioctl are implemented. + */ +static struct cb_ops lx_autofs_cb_ops = { + lx_autofs_dev_open, /* open */ + lx_autofs_dev_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + lx_autofs_dev_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ 
+}; + +/* + * Module linkage + */ +static mntopt_t lx_autofs_mntopt[] = { + { LX_MNTOPT_FD, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_PGRP, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MINPROTO, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MAXPROTO, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_INDIRECT, NULL, 0, 0 }, + { LX_MNTOPT_DIRECT, NULL, 0, 0 }, + { LX_MNTOPT_OFFSET, NULL, 0, 0 } +}; + +static mntopts_t lx_autofs_mntopts = { + sizeof (lx_autofs_mntopt) / sizeof (mntopt_t), + lx_autofs_mntopt +}; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + LX_AUTOFS_NAME, + lx_autofs_init, + VSW_HASPROTO | VSW_VOLATILEDEV | VSW_ZMOUNT, + &lx_autofs_mntopts +}; + +static struct dev_ops lx_autofs_dev_ops = { + DEVO_REV, /* version */ + 0, /* refcnt */ + lx_autofs_info, /* info */ + nulldev, /* identify */ + nulldev, /* probe */ + lx_autofs_attach, /* attach */ + lx_autofs_detach, /* detach */ + nodev, /* reset */ + &lx_autofs_cb_ops, /* driver operations */ + NULL, /* no bus operations */ + NULL, /* power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx autofs filesystem", &vfw +}; + +static struct modldrv modldrv = { + &mod_driverops, "lx autofs driver", &lx_autofs_dev_ops +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modlfs, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + int error; + + if ((error = mod_install(&modlinkage)) != 0) { + return (error); + } + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int error; + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + if (lx_autofs_vn_ops != NULL) { + vn_freevnodeops(lx_autofs_vn_ops); + lx_autofs_vn_ops = NULL; + } + + /* + * In our init routine, if we get an error after calling + * vfs_setfsops() we cleanup by calling vfs_freevfsops_by_type(). 
+ * But we don't need to call vfs_freevfsops_by_type() here + * because the fs framework did this for us as part of the + * mod_remove() call above. + */ + return (0); +} diff --git a/usr/src/uts/common/brand/lx/autofs/lxautofs.conf b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf new file mode 100644 index 0000000000..36e0119e33 --- /dev/null +++ b/usr/src/uts/common/brand/lx/autofs/lxautofs.conf @@ -0,0 +1,14 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# Copyright 2016 Joyent, Inc. +# + +name="lxautofs" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps.h b/usr/src/uts/common/brand/lx/cgroups/cgrps.h new file mode 100644 index 0000000000..46e2cdd886 --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps.h @@ -0,0 +1,222 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#ifndef _LXCGRPS_H +#define _LXCGRPS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * cgrps.h: declarations, data structures and macros for lx_cgroup + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/atomic.h> +#include <vm/anon.h> + +/* + * cgrpmgr ioctl interface. + */ +#define CGRPFS_IOC ('C' << 16 | 'G' << 8) +#define CGRPFS_GETEVNT (CGRPFS_IOC | 1) + +typedef struct cgrpmgr_info { + pid_t cgmi_pid; + char *cgmi_rel_agent_path; + char *cgmi_cgroup_path; +} cgrpmgr_info_t; + +#if defined(_KERNEL) + +#include <sys/lx_brand.h> + +typedef struct cgrpmgr_info32 { + pid_t cgmi_pid; + caddr32_t cgmi_rel_agent_path; + caddr32_t cgmi_cgroup_path; +} cgrpmgr_info32_t; + +#define CG_PSNSIZE 256 /* max size of pseudo file name entries */ +#define CG_PSDSIZE 16 /* pretend that a dir entry takes 16 bytes */ + +/* + * The order of these entries must be in sync with the cg_ssde_dir array. 
+ */ +typedef enum cgrp_ssid { + CG_SSID_GENERIC = 1, + CG_SSID_NUM /* last ssid for range checking */ +} cgrp_ssid_t; + +typedef enum cgrp_nodetype { + CG_CGROUP_DIR = 1, /* cgroup directory entry */ + CG_NOTIFY, /* notify_on_release file */ + CG_PROCS, /* cgroup.procs file */ + CG_REL_AGENT, /* release_agent file */ + CG_TASKS, /* tasks file */ +} cgrp_nodetype_t; + +typedef struct cgrp_subsys_dirent { + cgrp_nodetype_t cgrp_ssd_type; + char *cgrp_ssd_name; +} cgrp_subsys_dirent_t; + +#define N_DIRENTS(m) (cgrp_num_pseudo_ents((m)->cg_ssid) + 2) + +/* + * A modern systemd-based Linux system typically has 50-60 cgroups so + * we size the hash for 2x that number. + */ +#define CGRP_HASH_SZ 128 +#define CGRP_AGENT_LEN (MAXPATHLEN + 1) + +/* + * cgroups per-mount data structure. + * + * All but the event related fields are protected by cg_contents. + * The evnt_list and counter is protected by cg_events. + * + * NOTE(review): no evnt_list, event counter or cg_events member is declared + * in cgrp_mnt below; the two sentences above look stale -- confirm. + */ +typedef struct cgrp_mnt { + struct vfs *cg_vfsp; /* filesystem's vfs struct */ + struct cgrp_node *cg_rootnode; /* root cgrp_node */ + char *cg_mntpath; /* name of cgroup mount point */ + cgrp_ssid_t cg_ssid; /* subsystem type */ + dev_t cg_dev; /* unique dev # of mounted `device' */ + uint_t cg_gen; /* node ID source for files */ + uint_t cg_grp_gen; /* ID source for cgroups */ + kmutex_t cg_contents; /* global lock for most fs activity */ + char cg_agent[CGRP_AGENT_LEN]; /* release_agent path */ + /* ptr to zone data for containing zone */ + lx_zone_data_t *cg_lxzdata; + struct cgrp_node **cg_grp_hash; /* hash list of cgroups in the fs */ +} cgrp_mnt_t; + +/* + * cgrp_node is the file system dependent node for cgroups. + * + * The node is used to represent both directories (a cgroup) and pseudo files + * within the directory. + * + * Members are tagged in the comment to note which type of node they apply to: + * A - all + * D - dir (i.e. 
a cgroup) + * F - pseudo file + */ + +typedef struct cgrp_node { + struct cgrp_node *cgn_back; /* A linked list of cgrp_nodes */ + struct cgrp_node *cgn_forw; /* A linked list of cgrp_nodes */ + struct cgrp_dirent *cgn_dir; /* D dirent list */ + struct cgrp_node *cgn_parent; /* A dir containing this node */ + struct cgrp_node *cgn_next; /* D link in per-mount cgroup */ + /* hash table */ + uint_t cgn_dirents; /* D number of dirents */ + cgrp_nodetype_t cgn_type; /* A type for this node */ + uint_t cgn_notify; /* D notify_on_release value */ + uint_t cgn_task_cnt; /* D number of threads in grp */ + struct vnode *cgn_vnode; /* A vnode for this cgrp_node */ + uint_t cgn_id; /* D ID number for the cgroup */ + struct vattr cgn_attr; /* A attributes */ +} cgrp_node_t; + +/* + * File system independent to cgroups conversion macros + */ +#define VFSTOCGM(vfsp) ((cgrp_mnt_t *)(vfsp)->vfs_data) +#define VTOCGM(vp) ((cgrp_mnt_t *)(vp)->v_vfsp->vfs_data) +#define VTOCGN(vp) ((struct cgrp_node *)(vp)->v_data) +#define CGNTOV(cn) ((cn)->cgn_vnode) +#define cgnode_hold(cn) VN_HOLD(CGNTOV(cn)) +#define cgnode_rele(cn) VN_RELE(CGNTOV(cn)) + +/* + * Attributes: shorthand names for the members of cgn_attr + */ +#define cgn_mask cgn_attr.va_mask +#define cgn_mode cgn_attr.va_mode +#define cgn_uid cgn_attr.va_uid +#define cgn_gid cgn_attr.va_gid +#define cgn_fsid cgn_attr.va_fsid +#define cgn_nodeid cgn_attr.va_nodeid +#define cgn_nlink cgn_attr.va_nlink +#define cgn_size cgn_attr.va_size +#define cgn_atime cgn_attr.va_atime +#define cgn_mtime cgn_attr.va_mtime +#define cgn_ctime cgn_attr.va_ctime +#define cgn_rdev cgn_attr.va_rdev +#define cgn_blksize cgn_attr.va_blksize +#define cgn_nblocks cgn_attr.va_nblocks +#define cgn_seq cgn_attr.va_seq + +/* + * cgroup directories are made up of a linked list of cgrp_dirent structures + * hanging off directory cgrp_nodes. File names are not fixed length, + * but are null terminated. 
+ */ +typedef struct cgrp_dirent { + struct cgrp_node *cgd_cgrp_node; /* cg node for this file */ + struct cgrp_dirent *cgd_next; /* next directory entry */ + struct cgrp_dirent *cgd_prev; /* prev directory entry */ + uint_t cgd_offset; /* "offset" of dir entry */ + uint_t cgd_hash; /* hash of cgd_parent and cgd_name */ + struct cgrp_dirent *cgd_link; /* linked via hash table */ + struct cgrp_node *cgd_parent; /* parent, dir we are in */ + char *cgd_name; /* null terminated */ +} cgrp_dirent_t; + +enum de_op { DE_CREATE, DE_MKDIR, DE_RENAME }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ + +extern struct vnodeops *cgrp_vnodeops; + +int cgrp_dirdelete(cgrp_node_t *, cgrp_node_t *, char *, enum dr_op, cred_t *); +int cgrp_direnter(cgrp_mnt_t *, cgrp_node_t *, char *, enum de_op, + cgrp_node_t *, struct vattr *, cgrp_node_t **, cred_t *); +void cgrp_dirinit(cgrp_node_t *, cgrp_node_t *, cred_t *); +int cgrp_dirlookup(cgrp_node_t *, char *, cgrp_node_t **, cred_t *); +void cgrp_dirtrunc(cgrp_node_t *); +void cgrp_node_init(cgrp_mnt_t *, cgrp_node_t *, vattr_t *, cred_t *); +int cgrp_taccess(void *, int, cred_t *); +ino_t cgrp_inode(cgrp_nodetype_t, unsigned int); +int cgrp_num_pseudo_ents(cgrp_ssid_t); +cgrp_node_t *cgrp_cg_hash_lookup(cgrp_mnt_t *, uint_t); +void cgrp_rel_agent_event(cgrp_mnt_t *, cgrp_node_t *, boolean_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LXCGRPS_H */ diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c new file mode 100644 index 0000000000..d9b7e443dd --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_node.c @@ -0,0 +1,1023 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/sdt.h> + +#include "cgrps.h" + +static int cgrp_dirmakecgnode(cgrp_node_t *, cgrp_mnt_t *, struct vattr *, + enum de_op, cgrp_node_t **, struct cred *); +static int cgrp_diraddentry(cgrp_node_t *, cgrp_node_t *, char *); + +static cgrp_subsys_dirent_t cgrp_generic_dir[] = { + { CG_PROCS, "cgroup.procs" }, + { CG_NOTIFY, "notify_on_release" }, + { CG_TASKS, "tasks" } +}; + +typedef struct cgrp_ssde { + cgrp_subsys_dirent_t *cg_ssde_files; + int cg_ssde_nfiles; +} cgrp_ssde_t; + +#define CGDIRLISTSZ(l) (sizeof (l) / sizeof ((l)[0])) + +/* + * Note, these entries must be in the same order as the cgrp_ssid_t entries. 
+ */ +static cgrp_ssde_t cg_ssde_dir[] = { + /* subsystems start at 1 */ + {NULL, 0}, + + /* CG_SSID_GENERIC */ + {cgrp_generic_dir, CGDIRLISTSZ(cgrp_generic_dir)}, +}; + + +#define CG_HASH_SIZE 8192 /* must be power of 2 */ +#define CG_MUTEX_SIZE 64 + +static cgrp_dirent_t *cg_hashtable[CG_HASH_SIZE]; +static kmutex_t cg_hashmutex[CG_MUTEX_SIZE]; + +#define CG_HASH_INDEX(a) ((a) & (CG_HASH_SIZE-1)) +#define CG_MUTEX_INDEX(a) ((a) & (CG_MUTEX_SIZE-1)) + +#define CG_HASH(cp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(cp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +#define MODESHIFT 3 + +typedef enum cgrp_nodehold { + NOHOLD, + HOLD +} cgrp_nodehold_t; + +void +cgrp_hash_init(void) +{ + int i; + + for (i = 0; i < CG_MUTEX_SIZE; i++) + mutex_init(&cg_hashmutex[i], NULL, MUTEX_DEFAULT, NULL); +} + +static void +cgrp_hash_in(cgrp_dirent_t *c) +{ + uint_t hash; + cgrp_dirent_t **prevpp; + kmutex_t *cg_hmtx; + + CG_HASH(c->cgd_parent, c->cgd_name, hash); + c->cgd_hash = hash; + prevpp = &cg_hashtable[CG_HASH_INDEX(hash)]; + cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)]; + mutex_enter(cg_hmtx); + c->cgd_link = *prevpp; + *prevpp = c; + mutex_exit(cg_hmtx); +} + +static void +cgrp_hash_out(cgrp_dirent_t *c) +{ + uint_t hash; + cgrp_dirent_t **prevpp; + kmutex_t *cg_hmtx; + + hash = c->cgd_hash; + prevpp = &cg_hashtable[CG_HASH_INDEX(hash)]; + cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)]; + mutex_enter(cg_hmtx); + while (*prevpp != c) + prevpp = &(*prevpp)->cgd_link; + *prevpp = c->cgd_link; + mutex_exit(cg_hmtx); +} + +static cgrp_dirent_t * +cgrp_hash_lookup(char *name, cgrp_node_t *parent, cgrp_nodehold_t hold, + cgrp_node_t **found) +{ + cgrp_dirent_t *l; + uint_t hash; + kmutex_t *cg_hmtx; + cgrp_node_t *cnp; + + CG_HASH(parent, name, hash); + cg_hmtx = &cg_hashmutex[CG_MUTEX_INDEX(hash)]; + mutex_enter(cg_hmtx); + l = cg_hashtable[CG_HASH_INDEX(hash)]; + while (l) { + if 
((l->cgd_hash == hash) && + (l->cgd_parent == parent) && + (strcmp(l->cgd_name, name) == 0)) { + /* + * We need to make sure that the cgrp_node that + * we put a hold on is the same one that we pass back. + * Hence, temporary variable cnp is necessary. + */ + cnp = l->cgd_cgrp_node; + if (hold == HOLD) { + ASSERT(cnp); + cgnode_hold(cnp); + } + if (found) + *found = cnp; + mutex_exit(cg_hmtx); + return (l); + } else { + l = l->cgd_link; + } + } + mutex_exit(cg_hmtx); + return (NULL); +} + +/* + * The following functions maintain the per-mount cgroup hash table. + */ +static void +cgrp_cg_hash_insert(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + uint_t cgid; + int hsh; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + cgid = cn->cgn_id; + hsh = cgid % CGRP_HASH_SZ; + + cn->cgn_next = cgm->cg_grp_hash[hsh]; + cgm->cg_grp_hash[hsh] = cn; +} + +static void +cgrp_cg_hash_remove(cgrp_mnt_t *cgm, cgrp_node_t *cn) +{ + uint_t cgid; + int hsh; + cgrp_node_t *np = NULL, *curp, *prevp = NULL; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + cgid = cn->cgn_id; + hsh = cgid % CGRP_HASH_SZ; + + for (curp = cgm->cg_grp_hash[hsh]; curp != NULL; + curp = curp->cgn_next) { + if (curp->cgn_id == cgid) { + if (prevp == NULL) { + cgm->cg_grp_hash[hsh] = curp->cgn_next; + } else { + prevp->cgn_next = curp->cgn_next; + } + np = curp; + np->cgn_next = NULL; + break; + } + + prevp = curp; + } + + ASSERT(np != NULL); + ASSERT(np->cgn_task_cnt == 0); +} + +/* + * Count up the number of threads already running in the zone and initialize the + * first cgroup's task counter. + * + * We have to look at all of the processes to find applicable ones. 
+ */ +static void +cgrp_cg_hash_init(cgrp_node_t *cn) +{ + int i; + int cnt = 0; + zoneid_t zoneid = curproc->p_zone->zone_id; + pid_t schedpid = curproc->p_zone->zone_zsched->p_pid; + + /* Scan all of the process entries */ + mutex_enter(&pidlock); + for (i = 1; i < v.v_proc; i++) { + proc_t *p; + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, system processes, + * a PID of 0, the pid for our zsched process, anything the + * security policy doesn't allow us to look at, its not an + * lx-branded process and processes that are not in the zone. + */ + if ((p = pid_entry(i)) == NULL || + p->p_stat == SIDL || + (p->p_flag & SSYS) != 0 || + p->p_pid == 0 || + p->p_pid == schedpid || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0 || + p->p_zone->zone_id != zoneid) { + continue; + } + + mutex_enter(&p->p_lock); + if (p->p_brand != &lx_brand) { + mutex_exit(&p->p_lock); + continue; + } + cnt += p->p_lwpcnt; + mutex_exit(&p->p_lock); + } + + /* + * There should be at least the init process with 1 thread in the zone + */ + ASSERT(cnt > 0); + cn->cgn_task_cnt = cnt; + + DTRACE_PROBE2(cgrp__grp__init, void *, cn, int, cnt); + + mutex_exit(&pidlock); +} + +cgrp_node_t * +cgrp_cg_hash_lookup(cgrp_mnt_t *cgm, uint_t cgid) +{ + int hsh = cgid % CGRP_HASH_SZ; + cgrp_node_t *curp; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + for (curp = cgm->cg_grp_hash[hsh]; curp != NULL; + curp = curp->cgn_next) { + if (curp->cgn_id == cgid) { + return (curp); + } + } + + return (NULL); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them to give the inode number for + * a cgrp pseudo file node. 
+ */ +ino_t +cgrp_inode(cgrp_nodetype_t type, unsigned int cgrpid) +{ + /* + * cgroup inode format: + * 00000000AABBBBBB + * + * AA - node type (from subsystem list) + * BBBBBB - id of the cgroup + */ + + return ((ino_t)(type << 24) | (cgrpid & 0xffffff)); +} + +/* + * Return the number of pseudo file entries in a cgroup directory for the + * given subsystem. + */ +int +cgrp_num_pseudo_ents(cgrp_ssid_t ssid) +{ + cgrp_ssde_t *ssdp = &cg_ssde_dir[ssid]; + + return (ssdp->cg_ssde_nfiles); +} + +int +cgrp_taccess(void *vcp, int mode, cred_t *cred) +{ + cgrp_node_t *cn = vcp; + int shift = 0; + /* + * Check access based on owner, group and public perms in cgrp_node. + */ + if (crgetuid(cred) != cn->cgn_uid) { + shift += MODESHIFT; + if (groupmember(cn->cgn_gid, cred) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cred, CGNTOV(cn), cn->cgn_uid, + cn->cgn_mode << shift, mode)); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * 0 is returned on success and *foundcp points + * to the found cgrp_node with its vnode held. + */ +int +cgrp_dirlookup(cgrp_node_t *parent, char *name, cgrp_node_t **foundcp, + cred_t *cred) +{ + int error; + + ASSERT(MUTEX_HELD(&VTOCGM(parent->cgn_vnode)->cg_contents)); + *foundcp = NULL; + if (parent->cgn_type != CG_CGROUP_DIR) + return (ENOTDIR); + + if ((error = cgrp_taccess(parent, VEXEC, cred))) + return (error); + + if (*name == '\0') { + cgnode_hold(parent); + *foundcp = parent; + return (0); + } + + /* + * Search the directory for the matching name + * We need the lock protecting the cgn_dir list + * so that it doesn't change out from underneath us. + * cgrp_hash_lookup() will pass back the cgrp_node + * with a hold on it. + */ + + if (cgrp_hash_lookup(name, parent, HOLD, foundcp) != NULL) { + ASSERT(*foundcp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry for 'name' and 'cp' into directory 'dir' + * + * Returns 0 on success. 
+ */ +int +cgrp_direnter( + cgrp_mnt_t *cgm, + cgrp_node_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + cgrp_node_t *cn, /* existing cgrp_node, if rename */ + struct vattr *va, + cgrp_node_t **cnp, /* return cgrp_node, if create/mkdir */ + cred_t *cred) +{ + cgrp_dirent_t *cdp; + cgrp_node_t *found = NULL; + int error = 0; + char *s; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(dir->cgn_type == CG_CGROUP_DIR); + + /* + * Don't allow '/' characters in pathname component, + */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("cgrp_direnter: NULL name"); + + /* + * For rename lock the source entry and check the link count + * to see if it has been removed while it was unlocked. + * Remember that we can only rename within the same directory. + */ + if (op == DE_RENAME) { + if (cn->cgn_nlink == 0) { + return (ENOENT); + } + + if (cn->cgn_nlink == MAXLINK) { + return (EMLINK); + } + cn->cgn_nlink++; + gethrestime(&cn->cgn_ctime); + } + + /* + * This might be a "dangling detached directory". + * it could have been removed, but a reference + * to it kept in u_cwd. don't bother searching + * it, and with any luck the user will get tired + * of dealing with us and cd to some absolute + * pathway. *sigh*, thus in ufs, too. + */ + if (dir->cgn_nlink == 0) { + error = ENOENT; + goto out; + } + + /* + * Search for the entry. In all cases it is an error if it exists. + */ + cdp = cgrp_hash_lookup(name, dir, HOLD, &found); + + if (cdp) { + ASSERT(found != NULL); + error = EEXIST; + mutex_exit(&cgm->cg_contents); + cgnode_rele(found); + mutex_enter(&cgm->cg_contents); + } else { + + /* + * The entry does not exist. Check write permission in + * directory to see if entry can be created. 
+ */ + if ((error = cgrp_taccess(dir, VWRITE, cred)) != 0) + goto out; + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Make new cgrp_node and directory entry as required. + */ + error = cgrp_dirmakecgnode(dir, cgm, va, op, &cn, cred); + if (error) + goto out; + + if (op == DE_MKDIR) { + /* + * inherit notify_on_release value from parent + */ + cn->cgn_notify = dir->cgn_notify; + } + } + + error = cgrp_diraddentry(dir, cn, name); + if (error != 0) { + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Unmake the inode we just made. + */ + if ((cn->cgn_type) == CG_CGROUP_DIR) { + ASSERT(cdp == NULL); + /* + * cleanup allocs made by cgrp_dirinit + */ + cgrp_dirtrunc(cn); + } + cn->cgn_nlink = 0; + gethrestime(&cn->cgn_ctime); + mutex_exit(&cgm->cg_contents); + cgnode_rele(cn); + mutex_enter(&cgm->cg_contents); + cn = NULL; + } + } else if (cnp) { + *cnp = cn; + } else if (op == DE_CREATE || op == DE_MKDIR) { + mutex_exit(&cgm->cg_contents); + cgnode_rele(cn); + mutex_enter(&cgm->cg_contents); + } + } + +out: + if (error && op == DE_RENAME) { + /* Undo bumped link count. */ + cn->cgn_nlink--; + gethrestime(&cn->cgn_ctime); + } + return (error); +} + +/* + * Delete entry cn of name "nm" from parent dir. This is used to both remove + * a cgroup directory and to remove the pseudo file nodes within the cgroup + * directory (by recursively calling itself). It frees the dir entry space + * and decrements link count on cgrp_node(s). + * + * Return 0 on success. + */ +int +cgrp_dirdelete(cgrp_node_t *dir, cgrp_node_t *cn, char *nm, enum dr_op op, + cred_t *cred) +{ + cgrp_mnt_t *cgm = VTOCGM(cn->cgn_vnode); + cgrp_dirent_t *cndp; + int error; + size_t namelen; + cgrp_node_t *cnnp; + timestruc_t now; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + if (nm[0] == '\0') + panic("cgrp_dirdelete: empty name for 0x%p", (void *)cn); + + /* + * return error when removing . and .. + */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' 
&& nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = cgrp_taccess(dir, VEXEC|VWRITE, cred)) != 0) + return (error); + + if (dir->cgn_dir == NULL) + return (ENOENT); + + if (op == DR_RMDIR) { + /* + * This is the top-level removal of a cgroup dir. Start by + * removing the fixed pseudo file entries from the dir. We do + * this by recursively calling back into this function with + * a different op code. The caller of this function has + * already verified that it is safe to remove this directory. + */ + cgrp_dirent_t *cdp; + + ASSERT(cn->cgn_type == CG_CGROUP_DIR); + + cdp = cn->cgn_dir; + while (cdp) { + cgrp_node_t *pseudo_node; + cgrp_dirent_t *nextp; + + if (strcmp(cdp->cgd_name, ".") == 0 || + strcmp(cdp->cgd_name, "..") == 0) { + cdp = cdp->cgd_next; + continue; + } + + pseudo_node = cdp->cgd_cgrp_node; + nextp = cdp->cgd_next; + + cgnode_hold(pseudo_node); + error = cgrp_dirdelete(cn, pseudo_node, + cdp->cgd_name, DR_REMOVE, cred); + mutex_exit(&cgm->cg_contents); + cgnode_rele(pseudo_node); + if (error != 0) + return (error); + mutex_enter(&cgm->cg_contents); + + cdp = nextp; + } + + cgrp_cg_hash_remove(cgm, cn); + } + + cndp = cgrp_hash_lookup(nm, dir, NOHOLD, &cnnp); + /* These used to be VERIFY(), but in racy conditions they can fail. */ + if (cndp == NULL) { + /* Can't find the directory entry at all now! */ + return (ENOENT); + } + if (cn != cnnp) { + /* Returned cnnp isn't our original, so it's also not-there. */ + return (ENOENT); + } + + cgrp_hash_out(cndp); + + /* Take cndp out of the directory list. */ + ASSERT(cndp->cgd_next != cndp); + ASSERT(cndp->cgd_prev != cndp); + if (cndp->cgd_prev) { + cndp->cgd_prev->cgd_next = cndp->cgd_next; + } + if (cndp->cgd_next) { + cndp->cgd_next->cgd_prev = cndp->cgd_prev; + } + + /* + * If the roving slot pointer happens to match cndp, + * point it at the previous dirent. 
+ */ + if (dir->cgn_dir->cgd_prev == cndp) { + dir->cgn_dir->cgd_prev = cndp->cgd_prev; + } + ASSERT(cndp->cgd_next != cndp); + ASSERT(cndp->cgd_prev != cndp); + + /* cndp points to the correct directory entry */ + namelen = strlen(cndp->cgd_name) + 1; + + kmem_free(cndp, sizeof (cgrp_dirent_t) + namelen); + dir->cgn_size -= (sizeof (cgrp_dirent_t) + namelen); + dir->cgn_dirents--; + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + cn->cgn_ctime = now; + + ASSERT(cn->cgn_nlink > 0); + cn->cgn_nlink--; + if (op == DR_RMDIR && cn->cgn_type == CG_CGROUP_DIR) { + cgrp_dirtrunc(cn); + ASSERT(cn->cgn_nlink == 0); + } + return (0); +} + +/* + * Initialize a cgrp_node and add it to file list under mount point. + */ +void +cgrp_node_init(cgrp_mnt_t *cgm, cgrp_node_t *cn, vattr_t *vap, cred_t *cred) +{ + struct vnode *vp; + timestruc_t now; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(vap != NULL); + + cn->cgn_mode = MAKEIMODE(vap->va_type, vap->va_mode); + cn->cgn_mask = 0; + cn->cgn_attr.va_type = vap->va_type; + cn->cgn_nlink = 1; + cn->cgn_size = 0; + + if (cred == NULL) { + cn->cgn_uid = vap->va_uid; + cn->cgn_gid = vap->va_gid; + } else { + cn->cgn_uid = crgetuid(cred); + cn->cgn_gid = crgetgid(cred); + } + + cn->cgn_fsid = cgm->cg_dev; + cn->cgn_rdev = vap->va_rdev; + cn->cgn_blksize = PAGESIZE; + cn->cgn_nblocks = 0; + gethrestime(&now); + cn->cgn_atime = now; + cn->cgn_mtime = now; + cn->cgn_ctime = now; + cn->cgn_seq = 0; + cn->cgn_dir = NULL; + + cn->cgn_vnode = vn_alloc(KM_SLEEP); + vp = CGNTOV(cn); + vn_setops(vp, cgrp_vnodeops); + vp->v_vfsp = cgm->cg_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)cn; + + cn->cgn_nodeid = cgm->cg_gen++; + + /* + * Add new cgrp_node to end of linked list of cgrp_nodes for this + * cgroup fs. Root directory is handled specially in cgrp_mount. 
+ */ + if (cgm->cg_rootnode != (cgrp_node_t *)NULL) { + cn->cgn_forw = NULL; + cn->cgn_back = cgm->cg_rootnode->cgn_back; + cn->cgn_back->cgn_forw = cgm->cg_rootnode->cgn_back = cn; + } + vn_exists(vp); +} + +void +cgrp_addnode(cgrp_mnt_t *cgm, cgrp_node_t *dir, char *name, + cgrp_nodetype_t type, struct vattr *nattr, cred_t *cr) +{ + cgrp_node_t *ncn; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + VERIFY0(cgrp_direnter(cgm, dir, name, DE_CREATE, (cgrp_node_t *)NULL, + nattr, &ncn, cr)); + + /* + * Fix the inode and assign the pseudo file type to be correct. + */ + ncn->cgn_nodeid = cgrp_inode(type, dir->cgn_nodeid); + ncn->cgn_type = type; + + /* + * Since we're creating these entries here and not via the + * normal VOP_CREATE code path, we need to do the rele to drop + * our hold. This will leave the vnode v_count at 0 when we + * come out of cgrp_inactive but we won't reclaim the vnode + * there since the cgn_nlink value will still be 1. + */ + mutex_exit(&cgm->cg_contents); + cgnode_rele(ncn); + mutex_enter(&cgm->cg_contents); +} + +/* + * cgrp_dirinit is used internally to initialize a directory (dir) + * with '.' and '..' entries without checking permissions and locking + * It also creates the entries for the pseudo file nodes that reside in the + * directory. + */ +void +cgrp_dirinit(cgrp_node_t *parent, cgrp_node_t *dir, cred_t *cr) +{ + cgrp_dirent_t *dot, *dotdot; + timestruc_t now; + cgrp_mnt_t *cgm = VTOCGM(dir->cgn_vnode); + cgrp_ssde_t *ssdp; + cgrp_subsys_dirent_t *pseudo_files; + struct vattr nattr; + int i; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(dir->cgn_type == CG_CGROUP_DIR); + + ASSERT(cgm->cg_ssid > 0 && cgm->cg_ssid < CG_SSID_NUM); + ssdp = &cg_ssde_dir[cgm->cg_ssid]; + + /* + * If this is the top-level cgroup created by the mount then we need to + * count up the number of procs and tasks already running in the zone. + */ + + /* + * Set the cgroup ID for this cgrp_node by using a counter on each + * mount. 
+ */ + dir->cgn_id = cgm->cg_grp_gen++; + cgrp_cg_hash_insert(cgm, dir); + /* Initialise the first cgroup if this is top-level group */ + if (parent == dir) + cgrp_cg_hash_init(dir); + + /* + * Initialize the entries + */ + dot = kmem_zalloc(sizeof (cgrp_dirent_t) + 2, KM_SLEEP); + dot->cgd_cgrp_node = dir; + dot->cgd_offset = 0; + dot->cgd_name = (char *)dot + sizeof (cgrp_dirent_t); + dot->cgd_name[0] = '.'; + dot->cgd_parent = dir; + cgrp_hash_in(dot); + + dotdot = kmem_zalloc(sizeof (cgrp_dirent_t) + 3, KM_SLEEP); + dotdot->cgd_cgrp_node = parent; + dotdot->cgd_offset = 1; + dotdot->cgd_name = (char *)dotdot + sizeof (cgrp_dirent_t); + dotdot->cgd_name[0] = '.'; + dotdot->cgd_name[1] = '.'; + dotdot->cgd_parent = dir; + cgrp_hash_in(dotdot); + + /* + * Initialize directory entry list. + */ + dot->cgd_next = dotdot; + dot->cgd_prev = dotdot; /* dot's cgd_prev holds roving slot pointer */ + dotdot->cgd_next = NULL; + dotdot->cgd_prev = dot; + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + + parent->cgn_nlink++; + parent->cgn_ctime = now; + + dir->cgn_dir = dot; + dir->cgn_size = 2 * sizeof (cgrp_dirent_t) + 5; /* dot and dotdot */ + dir->cgn_dirents = 2; + dir->cgn_nlink = 2; + + bzero(&nattr, sizeof (struct vattr)); + nattr.va_mode = (mode_t)(0644); + nattr.va_type = VREG; + nattr.va_rdev = 0; + + /* + * If this is the top-level dir in the file system then it always + * has a release_agent pseudo file. Only the top-level dir has this + * file. + */ + if (parent == dir) { + cgrp_addnode(cgm, dir, "release_agent", CG_REL_AGENT, &nattr, + cr); + } + + pseudo_files = ssdp->cg_ssde_files; + for (i = 0; i < ssdp->cg_ssde_nfiles; i++) { + cgrp_addnode(cgm, dir, pseudo_files[i].cgrp_ssd_name, + pseudo_files[i].cgrp_ssd_type, &nattr, cr); + } +} + +/* + * cgrp_dirtrunc is called to remove all directory entries under this directory. 
+ */ +void +cgrp_dirtrunc(cgrp_node_t *dir) +{ + cgrp_dirent_t *cgdp; + timestruc_t now; + + ASSERT(MUTEX_HELD(&VTOCGM(dir->cgn_vnode)->cg_contents)); + ASSERT(dir->cgn_type == CG_CGROUP_DIR); + + for (cgdp = dir->cgn_dir; cgdp; cgdp = dir->cgn_dir) { + size_t namelen; + cgrp_node_t *cn; + + ASSERT(cgdp->cgd_next != cgdp); + ASSERT(cgdp->cgd_prev != cgdp); + ASSERT(cgdp->cgd_cgrp_node); + + dir->cgn_dir = cgdp->cgd_next; + namelen = strlen(cgdp->cgd_name) + 1; + + /* + * Adjust the link counts to account for this directory entry + * removal. We do hold/rele operations to free up these nodes. + */ + cn = cgdp->cgd_cgrp_node; + ASSERT(cn->cgn_nlink > 0); + cn->cgn_nlink--; + + cgrp_hash_out(cgdp); + kmem_free(cgdp, sizeof (cgrp_dirent_t) + namelen); + dir->cgn_size -= (sizeof (cgrp_dirent_t) + namelen); + dir->cgn_dirents--; + } + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + + ASSERT(dir->cgn_dir == NULL); + ASSERT(dir->cgn_size == 0); + ASSERT(dir->cgn_dirents == 0); +} + +static int +cgrp_diraddentry(cgrp_node_t *dir, cgrp_node_t *cn, char *name) +{ + cgrp_dirent_t *cdp, *cpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent directory wasn't removed from + * underneath the caller. + */ + if (dir->cgn_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same filesystem. */ + if (cn->cgn_vnode->v_vfsp != dir->cgn_vnode->v_vfsp) + return (EXDEV); + + /* Allocate and initialize directory entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (cgrp_dirent_t); + cdp = kmem_zalloc(alloc_size, KM_NOSLEEP_LAZY); + if (cdp == NULL) + return (ENOSPC); + + cn->cgn_parent = dir; + + dir->cgn_size += alloc_size; + dir->cgn_dirents++; + cdp->cgd_cgrp_node = cn; + cdp->cgd_parent = dir; + + /* The directory entry and its name were allocated sequentially. 
*/ + cdp->cgd_name = (char *)cdp + sizeof (cgrp_dirent_t); + (void) strcpy(cdp->cgd_name, name); + + cgrp_hash_in(cdp); + + /* + * Some utilities expect the size of a directory to remain + * somewhat static. For example, a routine which removes + * subdirectories between calls to readdir(); the size of the + * directory changes from underneath it and so the real + * directory offset in bytes is invalid. To circumvent + * this problem, we initialize a directory entry with an + * phony offset, and use this offset to determine end of + * file in cgrp_readdir. + */ + cpdp = dir->cgn_dir->cgd_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (cpdp->cgd_next != NULL && (cpdp->cgd_next->cgd_offset - + cpdp->cgd_offset) <= 1) { + ASSERT(cpdp->cgd_next != cpdp); + ASSERT(cpdp->cgd_prev != cpdp); + ASSERT(cpdp->cgd_next->cgd_offset > cpdp->cgd_offset); + cpdp = cpdp->cgd_next; + } + cdp->cgd_offset = cpdp->cgd_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which + * is necessarily the largest offset in this directory) is more + * than twice the number of dirents, that means the directory is + * 50% holes. At this point we reset the slot pointer back to + * the beginning of the directory so we start using the holes. + * The idea is that if there are N dirents, there must also be + * N holes, so we can satisfy the next N creates by walking at + * most 2N entries; thus the average cost of a create is constant. + * Note that we use the first dirent's cgd_prev as the roving + * slot pointer; it's ugly, but it saves a word in every dirent. 
+ */ + if (cpdp->cgd_next == NULL && cpdp->cgd_offset > 2 * dir->cgn_dirents) + dir->cgn_dir->cgd_prev = dir->cgn_dir->cgd_next; + else + dir->cgn_dir->cgd_prev = cdp; + + ASSERT(cpdp->cgd_next != cpdp); + ASSERT(cpdp->cgd_prev != cpdp); + + cdp->cgd_next = cpdp->cgd_next; + if (cdp->cgd_next) { + cdp->cgd_next->cgd_prev = cdp; + } + cdp->cgd_prev = cpdp; + cpdp->cgd_next = cdp; + + ASSERT(cdp->cgd_next != cdp); + ASSERT(cdp->cgd_prev != cdp); + ASSERT(cpdp->cgd_next != cpdp); + ASSERT(cpdp->cgd_prev != cpdp); + + gethrestime(&now); + dir->cgn_mtime = now; + dir->cgn_ctime = now; + + return (0); +} + +static int +cgrp_dirmakecgnode(cgrp_node_t *dir, cgrp_mnt_t *cgm, struct vattr *va, + enum de_op op, cgrp_node_t **newnode, struct cred *cred) +{ + cgrp_node_t *cn; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + ASSERT(va != NULL); + + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + + cn = kmem_zalloc(sizeof (cgrp_node_t), KM_SLEEP); + cgrp_node_init(cgm, cn, va, cred); + + cn->cgn_vnode->v_rdev = cn->cgn_rdev = NODEV; + cn->cgn_vnode->v_type = va->va_type; + cn->cgn_uid = crgetuid(cred); + cn->cgn_gid = crgetgid(cred); + + if (va->va_mask & AT_ATIME) + cn->cgn_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + cn->cgn_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + cn->cgn_type = CG_CGROUP_DIR; + cgrp_dirinit(dir, cn, cred); + } + + *newnode = cn; + return (0); +} diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c new file mode 100644 index 0000000000..7805c3f2bd --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vfsops.c @@ -0,0 +1,1071 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * The cgroup file system implements a subset of the Linux cgroup functionality + * for use by lx-branded zones. On Linux, cgroups are a generic process grouping + * mechanism which is used to apply various behaviors to the processes within + * the group, although its primary purpose is for resource management. + * + * In Linux, the cgroup file system provides two pieces of functionality: + * 1) A per-mount set of cgroups arranged in a tree, such that every task in + * the system is in one, and only one, of the cgroups in the tree. + * 2) A set of subsystems; each subsystem has subsystem-specific state and + * behavior and is associated with a cgroup mount. This provides a way to + * apply arbitrary functionality (but generally resource management related) + * to the processes associated with the nodes in the tree at that mount + * point. + * + * For example, it is common to see cgroup trees (each is its own mount with a + * different subsystem controller) for blkio, cpuset, memory, systemd (has no + * controller), etc. Within each tree there is a top-level directory with at + * least a cgroup.procs, notify_on_release, release_agent, and tasks file. + * The cgroup.procs file lists the processes within that group and the tasks + * file lists the threads in the group. There could be subdirectories, which + * define new cgroups, that then contain a subset of the processes. Each + * subdirectory also has, at a minimum, a cgroup.procs, notify_on_release, and + * tasks file. + * + * Since we're using lx to run user-level code within zones, the majority (all?) + * of the cgroup resource management functionality simply doesn't apply to us. + * The primary need for cgroups is to support the init program 'systemd' as the + * consumer.
systemd only requires the process grouping hierarchy of cgroups, + * although it can also use the resource management features if they are + * available. Given this, our cgroup file system only implements the process + * hierarchy and does not report that any resource management controllers are + * available for separate mounts. + * + * In addition to the hierarchy, the other important component of cgroups that + * is used by systemd is the 'release_agent'. This provides a mechanism to + * run a command when a cgroup becomes empty (the last task in the group + * leaves, either by exit or move, and there are no more sub-cgroups). The + * 'release_agent' file only exists in the top-level cgroup of the mounted + * file system and holds the path to a command to run. The 'notify_on_release' + * file exists in each cgroup dir. If that file contains a '1' then the agent + * is run when that group becomes empty. The agent is passed a path string of + * the cgroup, relative to the file system mount point (e.g. a mount on + * /sys/fs/cgroups/systemd with a sub-cgroup of /sys/fs/cgroups/systemd/foo/bar + * gets the arg /foo/bar). + * + * Cgroup membership is implemented via hooks into the lx brand code. When + * the cgroup file system loads it installs callbacks for: + * lx_cgrp_initlwp + * lx_cgrp_freelwp + * and when it unloads it clears those hooks. The lx brand code calls those + * hooks when a lwp starts and when it exits. Internally we use a + * simple reference counter (cgn_task_cnt) on the cgroup node to track how many + * threads are in the group, so we can tell when a group becomes empty. + * To make this quick, a hash table (cg_grp_hash) is maintained on the + * cgrp_mnt_t struct to allow quick lookups by cgroup ID. The hash table is + * sized so that there should typically only be 0 or 1 cgroups per bucket. 
+ * We also keep a reference to the file system in the zone-specific brand data + * (lxzd_cgroup) so that the lx brand code can pass in the correct vfs_t + * when it runs the hook. + * + * Once a cgroup is about to become empty, the final process exiting the cgroup + * will launch a new user-level process which execs the release agent. The new + * process is created as a child of zsched (indicated by the -1 pid argument + * to newproc) and is not associated with the exiting process in any way. + * + * This file system is similar to tmpfs in that directories only exist in + * memory. Each subdirectory represents a different cgroup. Within the cgroup + * there are pseudo files (see cg_ssde_dir) with well-defined names which + * control the configuration and behavior of the cgroup (see cgrp_nodetype_t). + * The primary files within every cgroup are named 'cgroup.procs', + * 'notify_on_release', and 'tasks' (as well as 'release_agent' in the + * top-level cgroup). The cgroup.procs and tasks files are used to control and + * list which processes/threads belong to the cgroup. In the general case there + * could be additional files in the cgroup, which define additional behavior + * (i.e. subsystem specific pseudo files), although none exist at this time. + * + * Each cgroup node has a unique ID (cgn_id) within the mount. This ID is + * used to correlate with the threads to determine cgroup membership. When + * assigning a PID to a cgroup (via write) the code updates the br_cgroupid + * member in the brand-specific lx_lwp_data structure to control which cgroup + * the thread belongs to. Note that because the br_cgroupid lives in + * lx_lwp_data, native processes will not appear in the cgroup hierarchy.
+ * + * An overview of the behavior for the various vnode operations is: + * - no hardlinks or symlinks + * - no file create (the subsystem-specific files are a fixed list of + * pseudo-files accessible within the directory) + * - no file remove + * - no file rename, but a directory (i.e. a cgroup) can be renamed within the + * containing directory, but not into a different directory + * - can mkdir and rmdir to create/destroy cgroups + * - cannot rmdir while it contains tasks or a subdir (i.e. a sub-cgroup) + * - open, read/write, close on the subsystem-specific pseudo files is + * allowed, as this is the interface to configure and report on the cgroup. + * The pseudo file's mode controls write access and cannot be changed. + * + * The locking in this file system is simple since the file system is not + * subjected to heavy I/O activity and all data is in-memory. There is a single + * global mutex for each mount (cg_contents). This mutex is held for the life + * of most vnode operations. The most active path is probably the LWP start and + * exit hooks which increment/decrement the reference counter on the cgroup + * node. The lock is important for this case since we don't want concurrent + * activity (such as moving the process into another cgroup) while we're trying + * to look up the cgroup from the mount's hash table. We must be careful to + * avoid a deadlock while reading or writing since that code can take pidlock + * and p_lock, but the cgrp_lwp_fork_helper can also be called while one of + * those is held. To prevent deadlock we always take cg_contents after pidlock + * and p_lock. + * + * EXTENDING THE FILE SYSTEM + * + * When adding support for a new subsystem, be sure to also update the + * lxpr_read_cgroups function in lx_procfs so that the subsystem is reported + * by proc. + * + * Although we don't currently support any subsystem controllers, the design + * allows for the file system to be extended to add controller emulation + * if needed.
New controller IDs (i.e. different subsystems) for a mount can + * be defined in the cgrp_ssid_t enum (e.g. CG_SSID_CPUSET or CG_SSID_MEMORY) + * and new node types for additional pseudo files in the tree can be defined in + * the cgrp_nodetype_t enum (e.g. CG_CPUSET_CPUS or CG_MEMORY_USAGE_IN_BYTES). + * The cg_ssde_dir array would need a new entry for the new subsystem to + * control which nodes are visible in a directory for the new subsystem. + * + * New emulation would then need to be written to manage the behavior on the + * new pseudo file(s) associated with new cgrp_nodetype_t types. + * + * Within lx procfs the lxpr_read_pid_cgroup() function would need to be + * updated so that it reported the various subsystems used by the different + * mounts. + * + * In addition, in order to support more than one cgroup mount we would need a + * list of cgroup IDs associated with every thread, instead of just one ID + * (br_cgroupid). The thread data would need to become a struct which held + * both an ID and an indication as to which mounted cgroup file system instance + * the ID was associated with. We would also need a list of cgroup mounts per + * zone, instead of the current single zone reference.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/time.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/stat.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/statvfs.h>
+#include <sys/mount.h>
+#include <sys/systm.h>
+#include <sys/mntent.h>
+#include <sys/policy.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/vmparam.h>
+#include <sys/corectl.h>
+#include <sys/contract_impl.h>
+#include <sys/pool.h>
+#include <sys/stack.h>
+#include <sys/rt.h>
+#include <sys/fx.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+#include "cgrps.h"
+
+/*
+ * Module level parameters. Both are assigned once, in cgrp_init():
+ * cgrp_fstype is the fstype index handed to us by the VFS framework and
+ * cgrp_dev is the pseudo device built from getudev().
+ */
+static int	cgrp_fstype;
+static dev_t	cgrp_dev;
+
+#define	MAX_AGENT_EVENTS	32		/* max num queued events */
+
+/*
+ * cgrp_unmount() retries while the root vnode is still held by someone else.
+ * Each retry sleeps UMNT_DELAY_TIME (50000 usec, i.e. 50ms) and we give up
+ * after UMNT_RETRY_MAX attempts, so the worst-case wait is about 5 seconds.
+ */
+#define	UMNT_DELAY_TIME	drv_usectohz(50000)	/* 20th of a second */
+#define	UMNT_RETRY_MAX	100			/* 100 times - 5 secs */
+
+/*
+ * cgrp_mountcount is used to prevent module unloads while there is still
+ * state from a former mount hanging around. The filesystem module must not be
+ * allowed to go away before the last VFS_FREEVFS() call has been made. Since
+ * this is just an atomic counter, there's no need for locking.
+ */
+static uint32_t cgrp_mountcount;
+
+/*
+ * cgrp_minfree is the minimum amount of swap space that cgroups leaves for
+ * the rest of the zone. In other words, if the amount of free swap space
+ * in the zone drops below cgrp_minfree, cgroup anon allocations will fail.
+ * This number is only likely to become a factor when DRAM and swap have both
+ * been capped low to allow for maximum tenancy.
+ */ +size_t cgrp_minfree = 0; + +/* + * CGMINFREE -- the value from which cgrp_minfree is derived -- should be + * configured to a value that is roughly the smallest practical value for + * memory + swap minus the largest reasonable size for cgroups in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow cgroups to consume + * no more than half of this, yielding a CGMINFREE of 64MB. + */ +#define CGMINFREE 64 * 1024 * 1024 /* 64 Megabytes */ + +extern pgcnt_t swapfs_minfree; + +/* + * cgroup vfs operations. + */ +static int cgrp_init(int, char *); +static int cgrp_mount(struct vfs *, struct vnode *, + struct mounta *, struct cred *); +static int cgrp_unmount(struct vfs *, int, struct cred *); +static int cgrp_root(struct vfs *, struct vnode **); +static int cgrp_statvfs(struct vfs *, struct statvfs64 *); +static void cgrp_freevfs(vfs_t *vfsp); + +/* Forward declarations for hooks */ +static void cgrp_lwp_fork_helper(vfs_t *, uint_t, id_t, pid_t); +static void cgrp_lwp_exit_helper(vfs_t *, uint_t, id_t, pid_t); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_cgroup", + cgrp_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "lx brand cgroups", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + if (cgrp_mountcount) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + /* Disable hooks used by the lx brand module. 
*/ + lx_cgrp_initlwp = NULL; + lx_cgrp_freelwp = NULL; + + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(cgrp_fstype); + vn_freevnodeops(cgrp_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * Initialize global locks, etc. Called when loading cgroup module. + */ +static int +cgrp_init(int fstype, char *name) +{ + static const fs_operation_def_t cgrp_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = cgrp_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = cgrp_unmount }, + VFSNAME_ROOT, { .vfs_root = cgrp_root }, + VFSNAME_STATVFS, { .vfs_statvfs = cgrp_statvfs }, + VFSNAME_FREEVFS, { .vfs_freevfs = cgrp_freevfs }, + NULL, NULL + }; + extern const struct fs_operation_def cgrp_vnodeops_template[]; + int error; + extern void cgrp_hash_init(); + major_t dev; + + cgrp_hash_init(); + cgrp_fstype = fstype; + ASSERT(cgrp_fstype != 0); + + error = vfs_setfsops(fstype, cgrp_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "cgrp_init: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, cgrp_vnodeops_template, &cgrp_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "cgrp_init: bad vnode ops template"); + return (error); + } + + /* + * cgrp_minfree doesn't need to be some function of configured + * swap space since it really is an absolute limit of swap space + * which still allows other processes to execute. + */ + if (cgrp_minfree == 0) { + /* Set if not patched */ + cgrp_minfree = btopr(CGMINFREE); + } + + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "cgrp_init: Can't get unique device number."); + dev = 0; + } + + /* + * Make the pseudo device + */ + cgrp_dev = makedevice(dev, 0); + + /* Install the hooks used by the lx brand module. 
*/ + lx_cgrp_initlwp = cgrp_lwp_fork_helper; + lx_cgrp_freelwp = cgrp_lwp_exit_helper; + + return (0); +} + +static int +cgrp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + cgrp_mnt_t *cgm = NULL; + struct cgrp_node *cp; + struct pathname dpn; + int error; + struct vattr rattr; + cgrp_ssid_t ssid = CG_SSID_GENERIC; + lx_zone_data_t *lxzdata; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + /* + * Since we depend on per-thread lx brand data, only allow mounting + * within lx zones. + */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (EINVAL); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * Having the resource be anything but "swap" doesn't make sense. + */ + vfs_setresource(vfsp, "swap", 0); + + /* cgroups don't support read-only mounts */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + error = EINVAL; + goto out; + } + + /* + * Here is where we could support subsystem-specific controller + * mounting. For example, if mounting a cgroup fs with the 'cpuset' + * option to specify that particular controller. + * + * char *argstr; + * if (vfs_optionisset(vfsp, "cpuset", &argstr)) { + * if (ssid != CG_SSID_GENERIC) { + * error = EINVAL; + * goto out; + * } + * ssid = CG_SSID_CPUSET; + * } + */ + + error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (error != 0) + goto out; + + /* + * We currently only support one mount per zone. 
+ */ + lxzdata = ztolxzd(curproc->p_zone); + mutex_enter(&lxzdata->lxzd_lock); + if (lxzdata->lxzd_cgroup != NULL) { + mutex_exit(&lxzdata->lxzd_lock); + return (EINVAL); + } + + cgm = kmem_zalloc(sizeof (*cgm), KM_SLEEP); + + /* Set but don't bother entering the mutex (not on mount list yet) */ + mutex_init(&cgm->cg_contents, NULL, MUTEX_DEFAULT, NULL); + + cgm->cg_vfsp = lxzdata->lxzd_cgroup = vfsp; + mutex_exit(&lxzdata->lxzd_lock); + + cgm->cg_lxzdata = lxzdata; + cgm->cg_ssid = ssid; + + vfsp->vfs_data = (caddr_t)cgm; + vfsp->vfs_fstype = cgrp_fstype; + vfsp->vfs_dev = cgrp_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, cgrp_dev, cgrp_fstype); + cgm->cg_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(cgm->cg_mntpath, dpn.pn_path); + + cgm->cg_grp_hash = kmem_zalloc(sizeof (cgrp_node_t *) * CGRP_HASH_SZ, + KM_SLEEP); + + /* allocate and initialize root cgrp_node structure */ + bzero(&rattr, sizeof (struct vattr)); + rattr.va_mode = (mode_t)(S_IFDIR | 0755); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + cp = kmem_zalloc(sizeof (struct cgrp_node), KM_SLEEP); + + mutex_enter(&cgm->cg_contents); + cgrp_node_init(cgm, cp, &rattr, cr); + + CGNTOV(cp)->v_flag |= VROOT; + + /* + * initialize linked list of cgrp_nodes so that the back pointer of + * the root cgrp_node always points to the last one on the list + * and the forward pointer of the last node is null + */ + cp->cgn_back = cp; + cp->cgn_forw = NULL; + cp->cgn_nlink = 0; + cgm->cg_rootnode = cp; + + cp->cgn_type = CG_CGROUP_DIR; + cp->cgn_nodeid = cgrp_inode(CG_CGROUP_DIR, cgm->cg_gen); + + /* + * This initial cgrp_node will have an ID of 0. All existing processes + * inside the zone will have been started with, or inherited, a + * br_cgroupid of 0. The cgrp_cg_hash_init function will initialize the + * cgn_task_cnt for cgroup 0 to reflect the number of tasks already in + * the group. 
+ * + * Because we must hold cg_contents in cgrp_lwp_fork_helper and + * cgrp_lwp_exit_helper, no process can be creating or exiting another + * thread (although that is unlikely anyway since the cgroup filesystem + * is normally mounted at the start of zone bootup, before anything + * else is started). + */ + cgrp_dirinit(cp, cp, cr); + + mutex_exit(&cgm->cg_contents); + + pn_free(&dpn); + error = 0; + atomic_inc_32(&cgrp_mountcount); + +out: + if (error == 0) + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + + return (error); +} + +static int +cgrp_unmount(struct vfs *vfsp, int flag, struct cred *cr) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cgnp, *cancel; + struct vnode *vp; + int error; + uint_t cnt; + int retry_cnt = 0; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + +retry: + mutex_enter(&cgm->cg_contents); + + /* + * In the normal unmount case, if there were no open files, only the + * root node would have a reference count. However, the user-level + * agent manager should have the root vnode open and be waiting in + * ioctl. We need to wake the manager and it may take some retries + * before it closes its file descriptor. + * + * With cg_contents held, nothing can be added or removed. + * There may be some dirty pages. To prevent fsflush from + * disrupting the unmount, put a hold on each node while scanning. + * If we find a previously referenced node, undo the holds we have + * placed and fail EBUSY. 
+ */ + cgnp = cgm->cg_rootnode; + + ASSERT(cgm->cg_lxzdata->lxzd_cgroup != NULL); + + vp = CGNTOV(cgnp); + mutex_enter(&vp->v_lock); + + if (flag & MS_FORCE) { + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + return (EINVAL); + } + + + cnt = vp->v_count; + if (cnt > 1) { + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + /* Likely because the user-level manager hasn't exited yet */ + if (retry_cnt++ < UMNT_RETRY_MAX) { + delay(UMNT_DELAY_TIME); + goto retry; + } + return (EBUSY); + } + + mutex_exit(&vp->v_lock); + + /* + * Check for open files. An open file causes everything to unwind. + */ + for (cgnp = cgnp->cgn_forw; cgnp; cgnp = cgnp->cgn_forw) { + vp = CGNTOV(cgnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); + cancel = cgm->cg_rootnode->cgn_forw; + while (cancel != cgnp) { + vp = CGNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->cgn_forw; + } + mutex_exit(&cgm->cg_contents); + return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); + } + } + + mutex_enter(&cgm->cg_lxzdata->lxzd_lock); + cgm->cg_lxzdata->lxzd_cgroup = NULL; + mutex_exit(&cgm->cg_lxzdata->lxzd_lock); + kmem_free(cgm->cg_grp_hash, sizeof (cgrp_node_t *) * CGRP_HASH_SZ); + + /* + * We can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&cgm->cg_contents); + + return (0); +} + +/* + * Implementation of VFS_FREEVFS(). This is called by the vfs framework after + * umount and the last VFS_RELE, to trigger the release of any resources still + * associated with the given vfs_t. This is normally called immediately after + * cgrp_umount. 
+ */ +void +cgrp_freevfs(vfs_t *vfsp) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + struct vnode *vp; + + /* + * Free all kmemalloc'd and anonalloc'd memory associated with + * this filesystem. To do this, we go through the file list twice, + * once to remove all the directory entries, and then to remove + * all the pseudo files. + */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + + /* + * Remove all directory entries + */ + for (cn = cgm->cg_rootnode; cn; cn = cn->cgn_forw) { + mutex_enter(&cgm->cg_contents); + if (cn->cgn_type == CG_CGROUP_DIR) + cgrp_dirtrunc(cn); + mutex_exit(&cgm->cg_contents); + } + + ASSERT(cgm->cg_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. + * VN_RELE should make the node disappear, unless somebody + * is holding pages against it. Nap and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on + * a cgrp_node via its pages or anon slots from blowing it away + * (in cgrp_inactive) while we're trying to get to it here. Once + * we have a HOLD on it we know it'll stick around. + * + */ + mutex_enter(&cgm->cg_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((cn = cgm->cg_rootnode->cgn_back) != cgm->cg_rootnode) { + mutex_exit(&cgm->cg_contents); + /* + * All nodes will be released here. Note we handled the link + * count above. + */ + vp = CGNTOV(cn); + VN_RELE(vp); + mutex_enter(&cgm->cg_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again - we know + * they'll give it up soon. 
+ */ + if (cn == cgm->cg_rootnode->cgn_back) { + VN_HOLD(vp); + mutex_exit(&cgm->cg_contents); + delay(hz / 4); + mutex_enter(&cgm->cg_contents); + } + } + mutex_exit(&cgm->cg_contents); + + VN_RELE(CGNTOV(cgm->cg_rootnode)); + + ASSERT(cgm->cg_mntpath); + + kmem_free(cgm->cg_mntpath, strlen(cgm->cg_mntpath) + 1); + + mutex_destroy(&cgm->cg_contents); + kmem_free(cgm, sizeof (cgrp_mnt_t)); + + /* Allow _fini() to succeed now */ + atomic_dec_32(&cgrp_mountcount); +} + +/* + * return root cgnode for given vnode + */ +static int +cgrp_root(struct vfs *vfsp, struct vnode **vpp) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cp = cgm->cg_rootnode; + struct vnode *vp; + + ASSERT(cp); + + vp = CGNTOV(cp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +cgrp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + zp = cgm->cg_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > cgrp_minfree) + sbp->f_bfree = blocks - cgrp_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is just what's available + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a zone with a swap cap, + * then report the capped size. 
+ */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * The maximum number of files available is approximately the number + * of cgrp_nodes we can allocate from the remaining kernel memory + * available to cgroups. This is fairly inaccurate since it doesn't + * take into account the names stored in the directory entries. + */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (cgrp_node_t) + sizeof (cgrp_dirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[cgrp_fstype].vsw_name); + (void) strncpy(sbp->f_fstr, cgm->cg_mntpath, sizeof (sbp->f_fstr)); + /* ensure null termination */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +cgrp_get_dirname(cgrp_node_t *cn, char *buf, int blen) +{ + cgrp_node_t *parent; + cgrp_dirent_t *dp; + + buf[0] = '\0'; + + parent = cn->cgn_parent; + if (parent == NULL || parent == cn) { + (void) strlcpy(buf, ".", blen); + return (0); + } + + /* + * Search the parent dir list to find this cn's name. 
+ */ + for (dp = parent->cgn_dir; dp != NULL; dp = dp->cgd_next) { + if (dp->cgd_cgrp_node->cgn_id == cn->cgn_id) { + (void) strlcpy(buf, dp->cgd_name, blen); + return (0); + } + } + + return (-1); +} + +typedef struct cgrp_rra_arg { + char *crraa_agent_path; + char *crraa_event_path; +} cgrp_rra_arg_t; + +static void +cgrp_run_rel_agent(void *a) +{ + cgrp_rra_arg_t *rarg = a; + proc_t *p = ttoproc(curthread); + zone_t *z = p->p_zone; + struct core_globals *cg; + int res; + + ASSERT(!INGLOBALZONE(curproc)); + + /* The following block is derived from start_init_common */ + ASSERT_STACK_ALIGNED(); + + p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; + p->p_usrstack = (caddr_t)USRSTACK32; + p->p_model = DATAMODEL_ILP32; + p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; + p->p_datprot = PROT_ZFOD & ~PROT_EXEC; + p->p_stk_ctl = INT32_MAX; + + p->p_as = as_alloc(); + p->p_as->a_proc = p; + p->p_as->a_userlimit = (caddr_t)USERLIMIT32; + (void) hat_setup(p->p_as->a_hat, HAT_INIT); + + VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL); + + corectl_path_hold(cg->core_default_path); + corectl_content_hold(cg->core_default_content); + + curproc->p_corefile = cg->core_default_path; + curproc->p_content = cg->core_default_content; + + init_mstate(curthread, LMS_SYSTEM); + res = exec_init(rarg->crraa_agent_path, rarg->crraa_event_path); + + /* End of code derived from start_init_common */ + + kmem_free(rarg->crraa_event_path, MAXPATHLEN); + kmem_free(rarg->crraa_agent_path, CGRP_AGENT_LEN); + kmem_free(rarg, sizeof (cgrp_rra_arg_t)); + + /* The following is derived from zone_start_init - see comments there */ + if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) { + if (proc_exit(CLD_EXITED, res) != 0) { + mutex_enter(&p->p_lock); + ASSERT(p->p_flag & SEXITLWPS); + lwp_exit(); + } + } else { + id_t cid = curthread->t_cid; + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { 
+ pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + (void) parmsset(&pcparms, curthread); + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + + /* cause the process to return to userland. */ + lwp_rtt(); + } +} + +/* + * Launch the user-level release_agent manager. The event data is the + * pathname (relative to the mount point of the file system) of the newly empty + * cgroup. + * + * The cg_contents mutex is held on entry and dropped before returning. + */ +void +cgrp_rel_agent_event(cgrp_mnt_t *cgm, cgrp_node_t *cn, boolean_t is_exit) +{ + cgrp_node_t *parent; + char nm[MAXNAMELEN]; + char *argstr, *oldstr, *tmp; + id_t cid; + proc_t *p = ttoproc(curthread); + zone_t *z = p->p_zone; + lx_lwp_data_t *plwpd = ttolxlwp(curthread); + cgrp_rra_arg_t *rarg; + + ASSERT(MUTEX_HELD(&cgm->cg_contents)); + + /* Nothing to do if the agent is not set */ + if (cgm->cg_agent[0] == '\0') { + mutex_exit(&cgm->cg_contents); + return; + } + + parent = cn->cgn_parent; + /* Cannot remove the top-level cgroup (only via unmount) */ + if (parent == cn) { + mutex_exit(&cgm->cg_contents); + return; + } + + argstr = kmem_alloc(MAXPATHLEN, KM_SLEEP); + oldstr = kmem_alloc(MAXPATHLEN, KM_SLEEP); + *argstr = '\0'; + + /* + * Iterate up the directory tree to construct the agent argument string. 
+ */ + do { + VERIFY0(cgrp_get_dirname(cn, nm, sizeof (nm))); + DTRACE_PROBE1(cgrp__dir__name, char *, nm); + if (*argstr == '\0') { + (void) snprintf(argstr, MAXPATHLEN, "/%s", nm); + } else { + tmp = oldstr; + oldstr = argstr; + argstr = tmp; + (void) snprintf(argstr, MAXPATHLEN, "/%s%s", nm, + oldstr); + } + + if (cn->cgn_parent == NULL) + break; + cn = cn->cgn_parent; + parent = cn->cgn_parent; + + /* + * The arg path is relative to the mountpoint so we stop when + * we get to the top level. + */ + if (parent == NULL || parent == cn) + break; + } while (parent != cn); + + kmem_free(oldstr, MAXPATHLEN); + + rarg = kmem_alloc(sizeof (cgrp_rra_arg_t), KM_SLEEP); + rarg->crraa_agent_path = kmem_alloc(sizeof (cgm->cg_agent), KM_SLEEP); + (void) strlcpy(rarg->crraa_agent_path, cgm->cg_agent, + sizeof (cgm->cg_agent)); + rarg->crraa_event_path = argstr; + + DTRACE_PROBE2(cgrp__agent__event, cgrp_rra_arg_t *, rarg, + int, plwpd->br_cgroupid); + + /* + * When we're exiting, the release agent process cannot belong to our + * cgroup. When the release agent is called for a move or rmdir, then + * we do not change our cgroupid. + */ + if (is_exit) { + plwpd->br_cgroupid = 0; + } + + /* + * The cg_contents mutex cannot be held while taking the pool lock + * or calling newproc. + */ + mutex_exit(&cgm->cg_contents); + + if (z->zone_defaultcid > 0) { + cid = z->zone_defaultcid; + } else { + pool_lock(); + cid = pool_get_class(z->zone_pool); + pool_unlock(); + } + if (cid == -1) + cid = defaultcid; + + if (newproc(cgrp_run_rel_agent, (void *)rarg, cid, minclsyspri - 1, + NULL, -1) != 0) { + /* There's nothing we can do if creating the proc fails. 
*/ + kmem_free(rarg->crraa_event_path, MAXPATHLEN); + kmem_free(rarg->crraa_agent_path, sizeof (cgm->cg_agent)); + kmem_free(rarg, sizeof (cgrp_rra_arg_t)); + } +} + +/*ARGSUSED*/ +static void +cgrp_lwp_fork_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + + mutex_enter(&cgm->cg_contents); + cn = cgrp_cg_hash_lookup(cgm, cg_id); + ASSERT(cn != NULL); + cn->cgn_task_cnt++; + mutex_exit(&cgm->cg_contents); + + DTRACE_PROBE1(cgrp__lwp__fork, void *, cn); +} + +/*ARGSUSED*/ +static void +cgrp_lwp_exit_helper(vfs_t *vfsp, uint_t cg_id, id_t tid, pid_t tpid) +{ + cgrp_mnt_t *cgm = (cgrp_mnt_t *)VFSTOCGM(vfsp); + cgrp_node_t *cn; + + mutex_enter(&cgm->cg_contents); + cn = cgrp_cg_hash_lookup(cgm, cg_id); + ASSERT(cn != NULL); + if (cn->cgn_task_cnt == 0) { + /* top-level cgroup cnt can be 0 during reboot */ + mutex_exit(&cgm->cg_contents); + return; + } + cn->cgn_task_cnt--; + DTRACE_PROBE1(cgrp__lwp__exit, void *, cn); + + if (cn->cgn_task_cnt == 0 && cn->cgn_dirents == N_DIRENTS(cgm) && + cn->cgn_notify == 1) { + cgrp_rel_agent_event(cgm, cn, B_TRUE); + ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents)); + } else { + mutex_exit(&cgm->cg_contents); + } +} diff --git a/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c new file mode 100644 index 0000000000..0078ad7876 --- /dev/null +++ b/usr/src/uts/common/brand/lx/cgroups/cgrps_vnops.c @@ -0,0 +1,1552 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/user.h>
+#include <sys/time.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/flock.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/cred.h>
+#include <sys/dirent.h>
+#include <sys/pathname.h>
+#include <vm/seg_vn.h>
+#include <sys/cmn_err.h>
+#include <sys/buf.h>
+#include <sys/vm.h>
+#include <sys/prsystm.h>
+#include <sys/policy.h>
+#include <fs/fs_subr.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+#include "cgrps.h"
+
+/*
+ * Selects how a pid written to a pseudo file is interpreted: as a process
+ * id (cgroup.procs - every thread of the process is moved) or as a task,
+ * i.e. thread, id (tasks - only the matching thread is moved).
+ */
+typedef enum cgrp_wr_type {
+	CG_WR_PROCS = 1,
+	CG_WR_TASKS
+} cgrp_wr_type_t;
+
+/*
+ * VOP_OPEN: nothing to set up; only swap-device use of a node is refused.
+ */
+/* ARGSUSED1 */
+static int
+cgrp_open(struct vnode **vpp, int flag, struct cred *cred, caller_context_t *ct)
+{
+	/*
+	 * swapon to a cgrp file is not supported so access is denied on open
+	 * if VISSWAP is set.
+	 */
+	if ((*vpp)->v_flag & VISSWAP)
+		return (EINVAL);
+
+	return (0);
+}
+
+/*
+ * VOP_CLOSE: drop any record locks and share reservations the closing
+ * process holds on the vnode. Always returns 0.
+ */
+/* ARGSUSED1 */
+static int
+cgrp_close(struct vnode *vp, int flag, int count, offset_t offset,
+    struct cred *cred, caller_context_t *ct)
+{
+	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
+	cleanshares(vp, ttoproc(curthread)->p_pid);
+	return (0);
+}
+
+/*
+ * Lookup proc or task based on pid and typ.
+ *
+ * For CG_WR_PROCS pid is a process id and this is a plain prfind(). For
+ * CG_WR_TASKS pid is a thread's br_pid and every eligible process in the
+ * caller's zone is scanned for a thread carrying it.
+ *
+ * pidlock must be held on entry and is held again on return, whether or not
+ * a process was found, but it is dropped and re-taken between process table
+ * slots during a CG_WR_TASKS scan. The returned proc's p_lock is NOT held.
+ * Returns NULL if nothing matched.
+ */
+static proc_t *
+cgrp_p_for_wr(pid_t pid, cgrp_wr_type_t typ)
+{
+	int i;
+	zoneid_t zoneid = curproc->p_zone->zone_id;
+	pid_t schedpid = curproc->p_zone->zone_zsched->p_pid;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	/* getting a proc from a pid is easy */
+	if (typ == CG_WR_PROCS)
+		return (prfind(pid));
+
+	ASSERT(typ == CG_WR_TASKS);
+
+	/*
+	 * We have to scan all of the process entries to find the proc
+	 * containing this task.
+	 */
+	mutex_exit(&pidlock);
+	for (i = 1; i < v.v_proc; i++) {
+		proc_t *p;
+		kthread_t *t;
+
+		/* Re-take pidlock per slot so it is never held for long. */
+		mutex_enter(&pidlock);
+		/*
+		 * Skip indices for which there is no pid_entry, PIDs for
+		 * which there is no corresponding process, system processes,
+		 * a PID of 0, the pid for our zsched process, anything the
+		 * security policy doesn't allow us to look at, processes
+		 * that are not lx-branded and processes that are not in the
+		 * zone.
+		 */
+		if ((p = pid_entry(i)) == NULL ||
+		    p->p_stat == SIDL ||
+		    (p->p_flag & SSYS) != 0 ||
+		    p->p_pid == 0 ||
+		    p->p_pid == schedpid ||
+		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+		    p->p_brand != &lx_brand ||
+		    p->p_zone->zone_id != zoneid) {
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		mutex_enter(&p->p_lock);
+		if ((t = p->p_tlist) == NULL) {
+			/* no threads, skip it */
+			mutex_exit(&p->p_lock);
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		/*
+		 * Check all threads in this proc.
+		 */
+		do {
+			lx_lwp_data_t *plwpd = ttolxlwp(t);
+			if (plwpd != NULL && plwpd->br_pid == pid) {
+				/* Found: return with pidlock still held. */
+				mutex_exit(&p->p_lock);
+				return (p);
+			}
+
+			t = t->t_forw;
+		} while (t != p->p_tlist);
+
+		mutex_exit(&p->p_lock);
+		mutex_exit(&pidlock);
+	}
+
+	/* Not found: restore the lock state the caller expects. */
+	mutex_enter(&pidlock);
+	return (NULL);
+}
+
+/*
+ * Move a thread from one cgroup to another. If the old cgroup is empty
+ * we queue up an agent event. We return true in that case since we've
+ * dropped the locks and the caller needs to reacquire them.
+ */
+static boolean_t
+cgrp_thr_move(cgrp_mnt_t *cgm, lx_lwp_data_t *plwpd, cgrp_node_t *ncn,
+    uint_t cg_id, proc_t *p)
+{
+	cgrp_node_t *ocn;
+
+	ASSERT(MUTEX_HELD(&cgm->cg_contents));
+	ASSERT(MUTEX_HELD(&p->p_lock));
+
+	ocn = cgrp_cg_hash_lookup(cgm, plwpd->br_cgroupid);
+	VERIFY(ocn != NULL);
+
+	ASSERT(ocn->cgn_task_cnt > 0);
+	atomic_dec_32(&ocn->cgn_task_cnt);
+	atomic_inc_32(&ncn->cgn_task_cnt);
+	plwpd->br_cgroupid = cg_id;
+
+	if (ocn->cgn_task_cnt == 0 && ocn->cgn_dirents == N_DIRENTS(cgm) &&
+	    ocn->cgn_notify == 1) {
+		/*
+		 * We want to drop p_lock before queuing the event since
+		 * that might sleep. Dropping p_lock might cause the caller to
+		 * have to restart the move process from the beginning.
+		 */
+		mutex_exit(&p->p_lock);
+		cgrp_rel_agent_event(cgm, ocn, B_FALSE);
+		ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents));
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Assign either all of the threads, or a single thread, for the specified pid
+ * to the new cgroup. Controlled by the typ argument.
+ *
+ * Returns 0 on success (including the cases which are deliberately ignored:
+ * a process which is not lx-branded, has no threads, or is exiting) and
+ * ESRCH if no eligible process or task matches pid.
+ *
+ * For CG_WR_TASKS, finding the task is what makes the write succeed; a task
+ * which is already a member of the target cgroup is simply left where it is.
+ * Previously the "found" test was only reached for threads in a different
+ * cgroup, so re-writing a member's id to the tasks file failed with ESRCH
+ * even though the task exists.
+ */
+static int
+cgrp_proc_set_id(cgrp_mnt_t *cgm, uint_t cg_id, pid_t pid, cgrp_wr_type_t typ)
+{
+	proc_t *p;
+	kthread_t *t;
+	int error;
+	cgrp_node_t *ncn;
+
+	/* pid 1 is translated to the zone's real init process id */
+	if (pid == 1)
+		pid = curproc->p_zone->zone_proc_initpid;
+
+	/*
+	 * Move one or all threads to this cgroup.
+	 */
+	if (typ == CG_WR_TASKS) {
+		error = ESRCH;
+	} else {
+		error = 0;
+	}
+
+restart:
+	mutex_enter(&pidlock);
+
+	p = cgrp_p_for_wr(pid, typ);
+	if (p == NULL) {
+		mutex_exit(&pidlock);
+		return (ESRCH);
+	}
+
+	/*
+	 * Fail writes for pids for which there is no corresponding process,
+	 * system processes, a pid of 0, the pid for our zsched process,
+	 * anything the security policy doesn't allow us to look at, and
+	 * processes that are not in the zone.
+	 */
+	if (p->p_stat == SIDL ||
+	    (p->p_flag & SSYS) != 0 ||
+	    p->p_pid == 0 ||
+	    p->p_pid == curproc->p_zone->zone_zsched->p_pid ||
+	    secpolicy_basic_procinfo(CRED(), p, curproc) != 0 ||
+	    p->p_zone->zone_id != curproc->p_zone->zone_id) {
+		mutex_exit(&pidlock);
+		return (ESRCH);
+	}
+
+	/*
+	 * Ignore writes for PID which is not an lx-branded process or with
+	 * no threads.
+	 */
+
+	mutex_enter(&p->p_lock);
+	mutex_exit(&pidlock);
+	if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL ||
+	    p->p_flag & SEXITING) {
+		mutex_exit(&p->p_lock);
+		return (0);
+	}
+
+	mutex_enter(&cgm->cg_contents);
+
+	ncn = cgrp_cg_hash_lookup(cgm, cg_id);
+	VERIFY(ncn != NULL);
+
+	do {
+		lx_lwp_data_t *plwpd = ttolxlwp(t);
+
+		if (plwpd != NULL && typ == CG_WR_PROCS) {
+			if (plwpd->br_cgroupid != cg_id &&
+			    cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+				/*
+				 * We dropped all of the locks so we
+				 * need to start over.
+				 */
+				goto restart;
+			}
+		} else if (plwpd != NULL && plwpd->br_pid == pid) {
+			/* type is CG_WR_TASKS and we found the task */
+			error = 0;
+			if (plwpd->br_cgroupid != cg_id &&
+			    cgrp_thr_move(cgm, plwpd, ncn, cg_id, p)) {
+				/* The move dropped both locks for us. */
+				goto done;
+			}
+			break;
+		}
+		t = t->t_forw;
+	} while (t != p->p_tlist);
+
+	mutex_exit(&cgm->cg_contents);
+	mutex_exit(&p->p_lock);
+done:
+
+	return (error);
+}
+
+/*
+ * User-level is writing a pid string. We need to get that string and convert
+ * it to a pid. The user-level code has to completely write an entire pid
+ * string at once. The user-level code could write multiple strings (delimited
+ * by newline) although that is frowned upon. However, we must handle this
+ * case too. Thus we consume the input one byte at a time until we get a whole
+ * pid string. We can't consume more than a byte at a time since otherwise we
+ * might be left with a partial pid string.
+ */ +static int +cgrp_get_pid_str(struct uio *uio, pid_t *pid) +{ + char buf[16]; /* big enough for a pid string */ + int i; + int error; + char *p = &buf[0]; + char *ep; + long pidnum; + + bzero(buf, sizeof (buf)); + for (i = 0; uio->uio_resid > 0 && i < sizeof (buf); i++, p++) { + error = uiomove(p, 1, UIO_WRITE, uio); + if (error != 0) + return (error); + if (buf[i] == '\n') { + buf[i] = '\0'; + break; + } + } + + if (buf[0] == '\0' || i >= sizeof (buf)) /* no input or too long */ + return (EINVAL); + + error = ddi_strtol(buf, &ep, 10, &pidnum); + if (error != 0 || *ep != '\0' || pidnum > maxpid || pidnum < 0) + return (EINVAL); + + *pid = (pid_t)pidnum; + return (0); +} + +static int +cgrp_wr_notify(cgrp_node_t *cn, struct uio *uio) +{ + int error; + uint_t value; + + /* + * This is cheesy but since we only take a 0 or 1 value we can + * let the pid_str function do the uio string conversion. + */ + error = cgrp_get_pid_str(uio, (pid_t *)&value); + if (error != 0) + return (error); + + if (value != 0 && value != 1) + return (EINVAL); + + /* + * The flag is on the containing dir. We don't bother taking the + * cg_contents lock since this is a simple assignment. 
+ */ + cn->cgn_parent->cgn_notify = value; + return (0); +} + +static int +cgrp_wr_rel_agent(cgrp_mnt_t *cgm, struct uio *uio) +{ + int error; + int len; + char *wrp; + + len = uio->uio_offset + uio->uio_resid; + if (len > MAXPATHLEN) + return (EFBIG); + + mutex_enter(&cgm->cg_contents); + + wrp = &cgm->cg_agent[uio->uio_offset]; + error = uiomove(wrp, uio->uio_resid, UIO_WRITE, uio); + cgm->cg_agent[len] = '\0'; + if (len > 1 && cgm->cg_agent[len - 1] == '\n') + cgm->cg_agent[len - 1] = '\0'; + + mutex_exit(&cgm->cg_contents); + return (error); +} + +static int +cgrp_wr_proc_or_task(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio, + cgrp_wr_type_t typ) +{ + /* the cgroup ID is on the containing dir */ + uint_t cg_id = cn->cgn_parent->cgn_id; + int error; + pid_t pidnum; + + while (uio->uio_resid > 0) { + error = cgrp_get_pid_str(uio, &pidnum); + if (error != 0) + return (error); + + error = cgrp_proc_set_id(cgm, cg_id, pidnum, typ); + if (error != 0) + return (error); + } + + return (0); +} + +static int +cgrp_wr(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int error = 0; + rlim64_t limit = uio->uio_llimit; + + ASSERT(CGNTOV(cn)->v_type == VREG); + + if (uio->uio_loffset < 0) + return (EINVAL); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + if (uio->uio_loffset >= MAXOFF_T) + return (EFBIG); + + if (uio->uio_resid == 0) + return (0); + + if (limit > MAXOFF_T) + limit = MAXOFF_T; + + switch (cn->cgn_type) { + case CG_NOTIFY: + error = cgrp_wr_notify(cn, uio); + break; + case CG_PROCS: + error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_PROCS); + break; + case CG_REL_AGENT: + error = cgrp_wr_rel_agent(cgm, uio); + break; + case CG_TASKS: + error = cgrp_wr_proc_or_task(cgm, cn, uio, CG_WR_TASKS); + break; + default: + VERIFY(0); + } + + return (error); +} + +/* + * Read value from the notify_on_release pseudo file on the parent node + * (which is the actual cgroup node). 
We don't bother taking the cg_contents + * lock since it's a single instruction so an empty group action/read will + * only see one value or the other. + */ +/* ARGSUSED */ +static int +cgrp_rd_notify(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int len; + int error = 0; + char buf[16]; + char *rdp; + /* the flag is on the containing dir */ + uint_t value = cn->cgn_parent->cgn_notify; + + len = snprintf(buf, sizeof (buf), "%u\n", value); + if (uio->uio_offset > len) + return (0); + + len -= uio->uio_offset; + rdp = &buf[uio->uio_offset]; + len = (uio->uio_resid < len) ? uio->uio_resid : len; + + error = uiomove(rdp, len, UIO_READ, uio); + return (error); +} + +/* + * Read value from the release_agent pseudo file. + */ +static int +cgrp_rd_rel_agent(cgrp_mnt_t *cgm, struct uio *uio) +{ + int len; + int error = 0; + char *rdp; + + mutex_enter(&cgm->cg_contents); + + if (cgm->cg_agent[0] == '\0') { + mutex_exit(&cgm->cg_contents); + return (0); + } + + len = strlen(cgm->cg_agent); + if (uio->uio_offset > len) { + mutex_exit(&cgm->cg_contents); + return (0); + } + + len -= uio->uio_offset; + rdp = &cgm->cg_agent[uio->uio_offset]; + len = (uio->uio_resid < len) ? uio->uio_resid : len; + + error = uiomove(rdp, len, UIO_READ, uio); + + mutex_exit(&cgm->cg_contents); + + return (error); +} + +/* + * Read pids from the cgroup.procs pseudo file. We have to look at all of the + * processes to find applicable ones, then report pids for any process which + * has all of its threads in the same cgroup. 
+ */ +static int +cgrp_rd_procs(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int i; + ssize_t offset = 0; + ssize_t uresid; + zoneid_t zoneid = curproc->p_zone->zone_id; + int error = 0; + pid_t initpid = curproc->p_zone->zone_proc_initpid; + pid_t schedpid = curproc->p_zone->zone_zsched->p_pid; + /* the cgroup ID is on the containing dir */ + uint_t cg_id = cn->cgn_parent->cgn_id; + + /* Scan all of the process entries */ + for (i = 1; i < v.v_proc && (uresid = uio->uio_resid) > 0; i++) { + proc_t *p; + ssize_t len; + pid_t pid; + char buf[16]; + char *rdp; + kthread_t *t; + boolean_t in_cg; + + mutex_enter(&pidlock); + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, system processes, + * a PID of 0, the pid for our zsched process, anything the + * security policy doesn't allow us to look at, processes that + * are not lx-branded, and processes that are not in the zone. + */ + if ((p = pid_entry(i)) == NULL || + p->p_stat == SIDL || + (p->p_flag & SSYS) != 0 || + p->p_pid == 0 || + p->p_pid == schedpid || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0 || + p->p_brand != &lx_brand || + p->p_zone->zone_id != zoneid) { + mutex_exit(&pidlock); + continue; + } + + mutex_enter(&p->p_lock); + if ((t = p->p_tlist) == NULL) { + /* no threads, skip it */ + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + continue; + } + + /* + * Check if all threads are in this cgroup. + */ + in_cg = B_TRUE; + mutex_enter(&cgm->cg_contents); + do { + lx_lwp_data_t *plwpd = ttolxlwp(t); + if (plwpd == NULL || plwpd->br_cgroupid != cg_id) { + in_cg = B_FALSE; + break; + } + + t = t->t_forw; + } while (t != p->p_tlist); + mutex_exit(&cgm->cg_contents); + + mutex_exit(&p->p_lock); + if (!in_cg) { + /* + * This proc, or at least one of its threads, is not + * in this cgroup.
+ */ + mutex_exit(&pidlock); + continue; + } + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc struct. + */ + if (p->p_pid == initpid) { + pid = 1; + } else { + pid = p->p_pid; + } + + mutex_exit(&pidlock); + + /* + * Generate pid line and write all or part of it if we're + * in the right spot within the pseudo file. + */ + len = snprintf(buf, sizeof (buf), "%u\n", pid); + if ((offset + len) > uio->uio_offset) { + int diff = (int)(uio->uio_offset - offset); + + ASSERT(diff < len); + offset += diff; + rdp = &buf[diff]; + len -= diff; + if (len > uresid) + len = uresid; + + error = uiomove(rdp, len, UIO_READ, uio); + if (error != 0) + return (error); + } + offset += len; + } + + return (0); +} + +/* + * We are given a locked process we know is valid, report on any of its threads + * that are in the cgroup. The caller must hold P_PR_LOCK on the process + * (asserted below). Each matching thread is written as one "<id>\n" line; + * the offset argument tracks how far into the pseudo file we are so that + * output lying before uio_offset is skipped rather than written again. + */ +static int +cgrp_rd_proc_tasks(uint_t cg_id, proc_t *p, pid_t initpid, ssize_t *offset, + struct uio *uio) +{ + int error = 0; + uint_t tid; + char buf[16]; + char *rdp; + kthread_t *t; + + ASSERT(p->p_proc_flag & P_PR_LOCK); + + /* + * Report all threads in this cgroup. + */ + t = p->p_tlist; + do { + lx_lwp_data_t *plwpd = ttolxlwp(t); + if (plwpd == NULL) { + t = t->t_forw; + continue; + } + + if (plwpd->br_cgroupid == cg_id) { + int len; + + /* + * Convert taskid to the Linux default of 1 if + * we're the zone's init process.
+ */ + tid = plwpd->br_pid; + if (tid == initpid) + tid = 1; + + len = snprintf(buf, sizeof (buf), "%u\n", tid); + if ((*offset + len) > uio->uio_offset) { + int diff; + + diff = (int)(uio->uio_offset - *offset); + ASSERT(diff < len); + *offset = *offset + diff; + rdp = &buf[diff]; + len -= diff; + if (len > uio->uio_resid) + len = uio->uio_resid; + + error = uiomove(rdp, len, UIO_READ, uio); + if (error != 0) + return (error); + } + *offset = *offset + len; + } + + t = t->t_forw; + } while (t != p->p_tlist && uio->uio_resid > 0); + + return (0); +} + +/* + * Read PIDs from the tasks pseudo file. In order to do this, the process + * table is walked, searching for entries which are in the correct state and + * match this zone. The LX emulated PIDs will be reported from branded entries + * which fulfill the criteria. Since records are being emulated for every task + * in the process, PR_LOCK is acquired to prevent changes during output. + * + * Note: If the buffer is filled and the accessing process is forced into a + * subsequent read, the reported threads may change while the locks are + * dropped in the meantime. + */ +static int +cgrp_rd_tasks(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int i; + ssize_t offset = 0; + zoneid_t zoneid = curproc->p_zone->zone_id; + cred_t *cred = CRED(); + int error = 0; + pid_t initpid = curproc->p_zone->zone_proc_initpid; + /* the cgroup ID is on the containing dir */ + uint_t cg_id = cn->cgn_parent->cgn_id; + + /* Scan all of the process entries */ + for (i = 1; i < v.v_proc && uio->uio_resid > 0; i++) { + proc_t *p; + + mutex_enter(&pidlock); + for (;;) { + if ((p = pid_entry(i)) == NULL) { + /* Quickly move onto the next slot */ + if (++i < v.v_proc) { + continue; + } else { + mutex_exit(&pidlock); + break; + } + } + + /* + * Check if this process would even be of interest to + * cgroupfs before attempting to acquire its PR_LOCK.
+ */ + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + if (p->p_brand != &lx_brand || + p->p_zone->zone_id != zoneid) { + mutex_exit(&p->p_lock); + p = NULL; + break; + } + + /* Attempt to grab P_PR_LOCK. */ + error = sprtrylock_proc(p); + if (error == 0) { + /* Success */ + break; + } else if (error < 0) { + /* + * This process is not in a state where + * P_PR_LOCK can be acquired. It either + * belongs to the system or is a zombie. + * Regardless, give up and move on. + */ + mutex_exit(&p->p_lock); + p = NULL; + break; + } else { + /* + * Wait until P_PR_LOCK is no longer contended + * and attempt to acquire it again. Since the + * process may have changed state, the entry + * lookup must be repeated. + */ + sprwaitlock_proc(p); + mutex_enter(&pidlock); + } + } + + if (p == NULL) { + continue; + } else if (secpolicy_basic_procinfo(cred, p, curproc) != 0) { + sprunlock(p); + continue; + } + + /* Shuffle locks and output the entry. */ + mutex_exit(&p->p_lock); + mutex_enter(&cgm->cg_contents); + error = cgrp_rd_proc_tasks(cg_id, p, initpid, &offset, uio); + mutex_exit(&cgm->cg_contents); + mutex_enter(&p->p_lock); + + sprunlock(p); + if (error != 0) { + return (error); + } + } + + return (0); +} + +static int +cgrp_rd(cgrp_mnt_t *cgm, cgrp_node_t *cn, struct uio *uio) +{ + int error = 0; + + if (uio->uio_loffset >= MAXOFF_T) + return (0); + if (uio->uio_loffset < 0) + return (EINVAL); + if (uio->uio_resid == 0) + return (0); + + switch (cn->cgn_type) { + case CG_NOTIFY: + error = cgrp_rd_notify(cgm, cn, uio); + break; + case CG_PROCS: + error = cgrp_rd_procs(cgm, cn, uio); + break; + case CG_REL_AGENT: + error = cgrp_rd_rel_agent(cgm, uio); + break; + case CG_TASKS: + error = cgrp_rd_tasks(cgm, cn, uio); + break; + default: + VERIFY(0); + } + + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, + struct caller_context *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm = VTOCGM(vp); + 
int error; + + /* + * We don't support reading non-regular files + */ + if (vp->v_type == VDIR) + return (EISDIR); + if (vp->v_type != VREG) + return (EINVAL); + error = cgrp_rd(cgm, cn, uiop); + + return (error); +} + +/* ARGSUSED */ +static int +cgrp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, + struct caller_context *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm = VTOCGM(vp); + int error; + + /* + * We don't support writing to non-regular files + */ + if (vp->v_type != VREG) + return (EINVAL); + + if (ioflag & FAPPEND) { + /* In append mode start at end of file. */ + uiop->uio_loffset = cn->cgn_size; + } + + error = cgrp_wr(cgm, cn, uiop); + + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_getattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, + caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + vap->va_type = vp->v_type; + vap->va_mode = cn->cgn_mode & MODEMASK; + vap->va_uid = cn->cgn_uid; + vap->va_gid = cn->cgn_gid; + vap->va_fsid = cn->cgn_fsid; + vap->va_nodeid = (ino64_t)cn->cgn_nodeid; + vap->va_nlink = cn->cgn_nlink; + vap->va_size = (u_offset_t)cn->cgn_size; + vap->va_atime = cn->cgn_atime; + vap->va_mtime = cn->cgn_mtime; + vap->va_ctime = cn->cgn_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = cn->cgn_rdev; + vap->va_seq = cn->cgn_seq; + + vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + mutex_exit(&cgm->cg_contents); + return (0); +} + +/*ARGSUSED4*/ +static int +cgrp_setattr(struct vnode *vp, struct vattr *vap, int flags, struct cred *cred, + caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + int error = 0; + struct vattr *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR) || + (vap->va_mode & (S_ISUID | S_ISGID)) || (vap->va_mask & AT_SIZE)) + return (EINVAL); + + 
cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + + get = &cn->cgn_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cred, vp, vap, get, flags, cgrp_taccess, + cn); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&cn->cgn_ctime); + +out: + mutex_exit(&cgm->cg_contents); + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_access(struct vnode *vp, int mode, int flags, struct cred *cred, + caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + int error; + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_taccess(cn, mode, cred); + mutex_exit(&cgm->cg_contents); + return (error); +} + +/* ARGSUSED3 */ +static int +cgrp_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, + struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, + caller_context_t *ct, int *direntflags, pathname_t *realpnp) +{ + cgrp_node_t *cn = VTOCGN(dvp); + cgrp_mnt_t *cgm; + cgrp_node_t *ncn = NULL; + int error; + + /* disallow extended attrs */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* + * Null component name is a synonym for directory being searched. 
+ */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(cn); + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_dirlookup(cn, nm, &ncn, cred); + mutex_exit(&cgm->cg_contents); + + if (error == 0) { + ASSERT(ncn); + *vpp = CGNTOV(ncn); + } + + return (error); +} + +/* ARGSUSED */ +static int +cgrp_create(struct vnode *dvp, char *nm, struct vattr *vap, + enum vcexcl exclusive, int mode, struct vnode **vpp, struct cred *cred, + int flag, caller_context_t *ct, vsecattr_t *vsecp) +{ + cgrp_node_t *parent = VTOCGN(dvp); + cgrp_node_t *cn = NULL; + cgrp_mnt_t *cgm; + int error; + + if (*nm == '\0') + return (EPERM); + + cgm = VTOCGM(parent->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_dirlookup(parent, nm, &cn, cred); + if (error == 0) { /* name found */ + ASSERT(cn); + + mutex_exit(&cgm->cg_contents); + /* + * Creating an existing file, allow it except for the following + * errors. + */ + if (exclusive == EXCL) { + error = EEXIST; + } else if ((CGNTOV(cn)->v_type == VDIR) && (mode & VWRITE)) { + error = EISDIR; + } else { + error = cgrp_taccess(cn, mode, cred); + } + if (error != 0) { + cgnode_rele(cn); + return (error); + } + *vpp = CGNTOV(cn); + return (0); + } + mutex_exit(&cgm->cg_contents); + + /* + * cgroups doesn't allow creation of additional, non-subsystem specific + * files in a dir + */ + return (EPERM); +} + +/* ARGSUSED3 */ +static int +cgrp_remove(struct vnode *dvp, char *nm, struct cred *cred, + caller_context_t *ct, int flags) +{ + cgrp_node_t *parent = VTOCGN(dvp); + int error; + cgrp_node_t *cn = NULL; + cgrp_mnt_t *cgm; + + /* + * Removal of subsystem-specific files is not allowed but we need + * to return the correct error if they try to remove a non-existent + * file. 
+ */ + + cgm = VTOCGM(parent->cgn_vnode); + mutex_enter(&cgm->cg_contents); + error = cgrp_dirlookup(parent, nm, &cn, cred); + mutex_exit(&cgm->cg_contents); + if (error) + return (error); + + ASSERT(cn); + cgnode_rele(cn); + return (EPERM); +} + +/* ARGSUSED */ +static int +cgrp_link(struct vnode *dvp, struct vnode *srcvp, char *cnm, struct cred *cred, + caller_context_t *ct, int flags) +{ + /* cgroups doesn't support hard links */ + return (EPERM); +} + +/* + * Rename of subsystem-specific files is not allowed but we can rename + * directories (i.e. sub-groups). We cannot mv subdirs from one group to + * another so the src and dest vnode must be the same. + */ +/* ARGSUSED5 */ +static int +cgrp_rename( + struct vnode *odvp, /* source parent vnode */ + char *onm, /* source name */ + struct vnode *ndvp, /* destination parent vnode */ + char *nnm, /* destination name */ + struct cred *cred, + caller_context_t *ct, + int flags) +{ + cgrp_node_t *fromparent; + cgrp_node_t *toparent; + cgrp_node_t *fromcn = NULL; /* source cgrp_node */ + cgrp_mnt_t *cgm = VTOCGM(odvp); + int error, err; + + fromparent = VTOCGN(odvp); + toparent = VTOCGN(ndvp); + + if (fromparent != toparent) + return (EIO); + + /* discourage additional use of toparent */ + toparent = NULL; + + mutex_enter(&cgm->cg_contents); + + /* + * Look up cgrp_node of file we're supposed to rename. + */ + error = cgrp_dirlookup(fromparent, onm, &fromcn, cred); + if (error) { + mutex_exit(&cgm->cg_contents); + return (error); + } + + if (fromcn->cgn_type != CG_CGROUP_DIR) { + error = EPERM; + goto done; + } + + /* + * Make sure we can delete the old (source) entry. This + * requires write permission on the containing directory. + */ + if (((error = cgrp_taccess(fromparent, VWRITE, cred)) != 0)) + goto done; + + /* + * Check for renaming to or from '.' or '..' or that + * fromcn == fromparent + */ + if ((onm[0] == '.' && + (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) || + (nnm[0] == '.' 
&& + (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) || + (fromparent == fromcn)) { + error = EINVAL; + goto done; + } + + /* + * Link source to new target + */ + error = cgrp_direnter(cgm, fromparent, nnm, DE_RENAME, + fromcn, (struct vattr *)NULL, + (cgrp_node_t **)NULL, cred); + + if (error) + goto done; + + /* + * Unlink from source. + */ + error = err = cgrp_dirdelete(fromparent, fromcn, onm, DR_RENAME, cred); + + /* + * The following handles the case where our source cgrp_node was + * removed before we got to it. + */ + if (error == ENOENT) + error = 0; + + if (err == 0) { + vnevent_rename_src(CGNTOV(fromcn), odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, CGNTOV(fromcn), nnm, ct); + } + +done: + mutex_exit(&cgm->cg_contents); + cgnode_rele(fromcn); + + return (error); +} + +/* ARGSUSED5 */ +static int +cgrp_mkdir(struct vnode *dvp, char *nm, struct vattr *va, struct vnode **vpp, + struct cred *cred, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + cgrp_node_t *parent = VTOCGN(dvp); + cgrp_node_t *self = NULL; + cgrp_mnt_t *cgm = VTOCGM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT + * return from cgrp_dirlookup() is an "ok return". 
+ */ + if (parent->cgn_nlink == 0) + return (ENOENT); + + mutex_enter(&cgm->cg_contents); + error = cgrp_dirlookup(parent, nm, &self, cred); + if (error == 0) { + ASSERT(self != NULL); + mutex_exit(&cgm->cg_contents); + cgnode_rele(self); + return (EEXIST); + } + if (error != ENOENT) { + mutex_exit(&cgm->cg_contents); + return (error); + } + + error = cgrp_direnter(cgm, parent, nm, DE_MKDIR, (cgrp_node_t *)NULL, + va, &self, cred); + if (error) { + mutex_exit(&cgm->cg_contents); + if (self != NULL) + cgnode_rele(self); + return (error); + } + mutex_exit(&cgm->cg_contents); + *vpp = CGNTOV(self); + return (0); +} + +/* ARGSUSED4 */ +static int +cgrp_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred, + caller_context_t *ct, int flags) +{ + cgrp_node_t *parent = VTOCGN(dvp); + cgrp_mnt_t *cgm; + cgrp_node_t *self = NULL; + struct vnode *vp; + int error = 0; + + /* + * Return error when removing . and .. + */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + + cgm = VTOCGM(parent->cgn_vnode); + mutex_enter(&cgm->cg_contents); + + error = cgrp_dirlookup(parent, nm, &self, cred); + if (error) { + mutex_exit(&cgm->cg_contents); + return (error); + } + + vp = CGNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done; + } + if (self->cgn_type != CG_CGROUP_DIR) { + error = ENOTDIR; + goto done; + } + + cgm = (cgrp_mnt_t *)VFSTOCGM(self->cgn_vnode->v_vfsp); + + /* + * Check for the existence of any sub-cgroup directories or tasks in + * the cgroup. 
+ */ + if (self->cgn_task_cnt > 0 || self->cgn_dirents > N_DIRENTS(cgm)) { + error = EEXIST; + /* + * Update atime because checking cn_dirents is logically + * equivalent to reading the directory + */ + gethrestime(&self->cgn_atime); + goto done; + } + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + } else { + error = cgrp_dirdelete(parent, self, nm, DR_RMDIR, cred); + } + + vn_vfsunlock(vp); + + if (parent->cgn_task_cnt == 0 && + parent->cgn_dirents == N_DIRENTS(cgm) && parent->cgn_notify == 1) { + cgrp_rel_agent_event(cgm, parent, B_FALSE); + ASSERT(MUTEX_NOT_HELD(&cgm->cg_contents)); + goto dropped; + } + +done: + mutex_exit(&cgm->cg_contents); +dropped: + vnevent_rmdir(CGNTOV(self), dvp, nm, ct); + cgnode_rele(self); + + return (error); +} + +/* ARGSUSED2 */ +static int +cgrp_readdir(struct vnode *vp, struct uio *uiop, struct cred *cred, int *eofp, + caller_context_t *ct, int flags) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm; + cgrp_dirent_t *cdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + long outcount = 0; + long bufsize; + int reclen; + caddr_t outbuf; + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + cgm = VTOCGM(cn->cgn_vnode); + mutex_enter(&cgm->cg_contents); + + if (cn->cgn_dir == NULL) { + VERIFY(cn->cgn_nlink == 0); + mutex_exit(&cgm->cg_contents); + return (0); + } + + /* + * Get space for multiple directory entries + */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + /* LINTED: alignment */ + dp = (struct dirent64 *)outbuf; + + offset = 0; + cdp = cn->cgn_dir; + while (cdp) { + namelen = strlen(cdp->cgd_name); /* no +1 needed */ + offset = cdp->cgd_offset; + if (offset >= 
uiop->uio_offset) { + reclen = (int)DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) { + /* Buffer too small for any entries. */ + error = EINVAL; + } + break; + } + ASSERT(cdp->cgd_cgrp_node != NULL); + + /* use strncpy(9f) to zero out uninitialized bytes */ + + (void) strncpy(dp->d_name, cdp->cgd_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)cdp->cgd_cgrp_node->cgn_nodeid; + dp->d_off = (offset_t)cdp->cgd_offset + 1; + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + cdp = cdp->cgd_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. + */ + if (!cdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&cn->cgn_atime); + + mutex_exit(&cgm->cg_contents); + + kmem_free(outbuf, bufsize); + return (error); +} + +/* ARGSUSED */ +static int +cgrp_symlink(struct vnode *dvp, char *lnm, struct vattr *cva, char *cnm, + struct cred *cred, caller_context_t *ct, int flags) +{ + /* cgroups doesn't support symlinks */ + return (EPERM); +} + +/* ARGSUSED */ +static void +cgrp_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct) +{ + cgrp_node_t *cn = VTOCGN(vp); + cgrp_mnt_t *cgm = VFSTOCGM(vp->v_vfsp); + + mutex_enter(&cgm->cg_contents); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's little to do -- just drop our hold. 
+ */ + if (vp->v_count > 1 || cn->cgn_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + return; + } + + if (cn->cgn_forw == NULL) + cgm->cg_rootnode->cgn_back = cn->cgn_back; + else + cn->cgn_forw->cgn_back = cn->cgn_back; + cn->cgn_back->cgn_forw = cn->cgn_forw; + + mutex_exit(&vp->v_lock); + mutex_exit(&cgm->cg_contents); + + /* Here's our chance to send invalid event */ + vn_invalid(CGNTOV(cn)); + + vn_free(CGNTOV(cn)); + kmem_free(cn, sizeof (cgrp_node_t)); +} + +/* ARGSUSED */ +static int +cgrp_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); +} + +/* ARGSUSED */ +static int +cgrp_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ + return (write_lock); +} + +/* ARGSUSED */ +static void +cgrp_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp) +{ +} + +static int +cgrp_pathconf(struct vnode *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + switch (cmd) { + case _PC_XATTR_EXISTS: + if (vp->v_vfsp->vfs_flag & VFS_XATTR) { + *valp = 0; /* assume no attributes */ + error = 0; /* okay to ask */ + } else { + error = EINVAL; + } + break; + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_type == VREG || vp->v_type == VDIR); + error = 0; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *cgrp_vnodeops; + +const fs_operation_def_t cgrp_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = cgrp_open }, + VOPNAME_CLOSE, { .vop_close = cgrp_close }, + VOPNAME_READ, { .vop_read = cgrp_read }, + VOPNAME_WRITE, { .vop_write = cgrp_write }, + VOPNAME_GETATTR, { .vop_getattr = cgrp_getattr }, + VOPNAME_SETATTR, { .vop_setattr = cgrp_setattr 
}, + VOPNAME_ACCESS, { .vop_access = cgrp_access }, + VOPNAME_LOOKUP, { .vop_lookup = cgrp_lookup }, + VOPNAME_CREATE, { .vop_create = cgrp_create }, + VOPNAME_REMOVE, { .vop_remove = cgrp_remove }, + VOPNAME_LINK, { .vop_link = cgrp_link }, + VOPNAME_RENAME, { .vop_rename = cgrp_rename }, + VOPNAME_MKDIR, { .vop_mkdir = cgrp_mkdir }, + VOPNAME_RMDIR, { .vop_rmdir = cgrp_rmdir }, + VOPNAME_READDIR, { .vop_readdir = cgrp_readdir }, + VOPNAME_SYMLINK, { .vop_symlink = cgrp_symlink }, + VOPNAME_INACTIVE, { .vop_inactive = cgrp_inactive }, + VOPNAME_RWLOCK, { .vop_rwlock = cgrp_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = cgrp_rwunlock }, + VOPNAME_SEEK, { .vop_seek = cgrp_seek }, + VOPNAME_PATHCONF, { .vop_pathconf = cgrp_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/brand/lx/devfs/lxd.h b/usr/src/uts/common/brand/lx/devfs/lxd.h new file mode 100644 index 0000000000..437b0b6162 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd.h @@ -0,0 +1,244 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LXD_H +#define _LXD_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxd.h: declarations, data structures and macros for lxd (lxd devfs). 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/atomic.h> +#include <vm/anon.h> +#include <sys/lx_types.h> + +#if defined(_KERNEL) + +#include <sys/lx_brand.h> + +/* + * It's unlikely that we need to create more than 50-60 subdirs/symlinks + * as front files so we size the file system hash for 2x that number. + * The back devfs typically has ~80 nodes so this is also a comfortable size + * for the back hash table. + */ +#define LXD_HASH_SZ 128 + +#define LXD_BACK_HASH(v) ((((intptr_t)(v)) >> 10) & ((LXD_HASH_SZ) - 1)) + +#define LXD_NM_HASH(ldn, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(ldn) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + hash &= (LXD_HASH_SZ - 1); \ + } + + +enum lxd_node_type { LXDNT_NONE, LXDNT_BACK, LXDNT_FRONT }; + +typedef struct lxd_dev_attr { + list_node_t lxda_link; + char lxda_name[MAXPATHLEN]; + uid_t lxda_uid; + gid_t lxda_gid; + mode_t lxda_mode; +} lxd_dev_attr_t; + +/* + * lxd per-mount data structure. + * + * All fields are protected by lxdm_contents. + * File renames on a specific file system are protected by lxdm_renamelck. + */ +typedef struct lxd_mnt { + struct vfs *lxdm_vfsp; /* filesystem's vfs struct */ + struct lxd_node *lxdm_rootnode; /* root lxd_node */ + char *lxdm_mntpath; /* name of lxd mount point */ + dev_t lxdm_dev; /* unique dev # of mounted `device' */ + kmutex_t lxdm_contents; /* per-mount lock */ + kmutex_t lxdm_renamelck; /* rename lock for this mount */ + kmutex_t lxdm_attrlck; /* per-mount attr.
file lock */ + list_t lxdm_devattrs; /* list of device attr. settings */ + uint_t lxdm_gen; /* node ID source for files */ + + /* protects buckets in both "dir ent" and "back" hash tables */ + kmutex_t lxdm_hash_mutex[LXD_HASH_SZ]; + + /* per-mount data for "back" vnodes in the fs */ + uint_t lxdm_back_refcnt; /* # outstanding "back" vnodes */ + struct lxd_node *lxdm_back_htable[LXD_HASH_SZ]; + + /* + * Per-mount directory data for "front" nodes in the fs. + * Each front node has a directory entry but directory entries can live + * on either front or back nodes. + */ + uint_t lxdm_dent_refcnt; /* # outstanding dir ents */ + struct lxd_dirent *lxdm_dent_htable[LXD_HASH_SZ]; +} lxd_mnt_t; + +/* + * lxd_node is the file system dependent node for lxd. + * + * The node is used to represent both front and back files. For front files + * the node can represent either a directory or symlink. + */ +typedef struct lxd_node { + enum lxd_node_type lxdn_type; + + /* Data for "front" nodes */ + struct lxd_node *lxdn_prev; /* lnked lst of lxd nodes */ + struct lxd_node *lxdn_next; /* lnked lst of lxd nodes */ + struct lxd_node *lxdn_parent; /* dir containing this node */ + krwlock_t lxdn_rwlock; /* serialize mods/dir updates */ + kmutex_t lxdn_tlock; /* time, flag, and nlink lock */ + + /* these could be in a union ala tmpfs but not really necessary */ + uint_t lxdn_dirents; /* number of dirents */ + struct lxd_dirent *lxdn_dir; /* dirent list */ + char *lxdn_symlink; /* pointer to symlink */ + struct vattr lxdn_attr; /* attributes */ + + /* Hash table link */ + struct lxd_node *lxdn_hnxt; /* link in per-mount entry */ + /* hash table */ + vnode_t *lxdn_vnode; /* vnode for this lxd_node */ + + vnode_t *lxdn_real_vp; /* back file - real vnode */ +} lxd_node_t; + +/* + * Attributes + */ +#define lxdn_mask lxdn_attr.va_mask +#define lxdn_mode lxdn_attr.va_mode +#define lxdn_uid lxdn_attr.va_uid +#define lxdn_gid lxdn_attr.va_gid +#define lxdn_fsid lxdn_attr.va_fsid +#define 
lxdn_nodeid lxdn_attr.va_nodeid +#define lxdn_nlink lxdn_attr.va_nlink +#define lxdn_size lxdn_attr.va_size +#define lxdn_atime lxdn_attr.va_atime +#define lxdn_mtime lxdn_attr.va_mtime +#define lxdn_ctime lxdn_attr.va_ctime +#define lxdn_rdev lxdn_attr.va_rdev +#define lxdn_blksize lxdn_attr.va_blksize +#define lxdn_nblocks lxdn_attr.va_nblocks +#define lxdn_seq lxdn_attr.va_seq + +/* + * lx devfs conversion macros + */ +#define VFSTOLXDM(vfsp) ((lxd_mnt_t *)(vfsp)->vfs_data) +#define VTOLXDM(vp) ((lxd_mnt_t *)(vp)->v_vfsp->vfs_data) +#define VTOLDN(vp) ((lxd_node_t *)(vp)->v_data) +#define LDNTOV(ln) ((ln)->lxdn_vnode) +#define ldnode_hold(ln) VN_HOLD(LDNTOV(ln)) +#define ldnode_rele(ln) VN_RELE(LDNTOV(ln)) + +#define REALVP(vp) (VTOLDN(vp)->lxdn_real_vp) + +/* + * front directories are made up of a linked list of lxd_dirent structures + * hanging off directory lxdn_nodes. File names are not fixed length, but are + * null terminated. + */ +typedef struct lxd_dirent { + lxd_node_t *lddir_node; /* lxd node for this file */ + struct lxd_dirent *lddir_next; /* next directory entry */ + struct lxd_dirent *lddir_prev; /* prev directory entry */ + uint_t lddir_offset; /* "offset" of dir entry */ + uint_t lddir_hash; /* a hash of lddir_name */ + struct lxd_dirent *lddir_link; /* linked via hash table */ + lxd_node_t *lddir_parent; /* parent, dir we are in */ + char *lddir_name; /* null terminated */ +} lxd_dirent_t; + +enum de_op { DE_CREATE, DE_MKDIR, DE_RENAME }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ + +typedef struct lxd_minor_translator { + char *lxd_mt_path; /* illumos minor node path */ + minor_t lxd_mt_minor; /* illumos minor node number */ + int lxd_mt_lx_major; /* linux major node number */ + int lxd_mt_lx_minor; /* linux minor node number */ +} lxd_minor_translator_t; + +enum lxd_xl_tp { DTT_INVALID, DTT_LIST, DTT_CUSTOM }; + +#define xl_list lxd_xl_minor.lxd_xl_list +#define xl_custom lxd_xl_minor.lxd_xl_custom 
+
+typedef struct lxd_devt_translator {
+	char		*lxd_xl_driver;	/* driver name */
+	major_t		lxd_xl_major;	/* driver number */
+
+	enum lxd_xl_tp	lxd_xl_type;	/* dictates how we interpret xl_minor */
+	union {
+		uintptr_t		lxd_xl_foo;	/* required to compile */
+		lxd_minor_translator_t	*lxd_xl_list;	/* see xl_list */
+		void		(*lxd_xl_custom)(dev_t, dev_t *); /* see xl_custom */
+	} lxd_xl_minor;
+} lxd_devt_translator_t;
+
+extern struct vnodeops *lxd_vnodeops;
+extern lxd_devt_translator_t lxd_devt_translators[];
+
+vnode_t *lxd_make_back_node(vnode_t *, lxd_mnt_t *);
+void lxd_free_back_node(lxd_node_t *);
+int lxd_dirdelete(lxd_node_t *, lxd_node_t *, char *, enum dr_op, cred_t *);
+int lxd_direnter(lxd_mnt_t *, lxd_node_t *, char *, enum de_op, lxd_node_t *,
+    lxd_node_t *, struct vattr *, lxd_node_t **, cred_t *);
+void lxd_dirinit(lxd_node_t *, lxd_node_t *);
+int lxd_dirlookup(lxd_node_t *, char *, lxd_node_t **, cred_t *);
+void lxd_dirtrunc(lxd_node_t *);
+void lxd_node_init(lxd_mnt_t *, lxd_node_t *, vnode_t *, vattr_t *, cred_t *);
+int lxd_naccess(void *, int, cred_t *);
+
+void lxd_save_attrs(lxd_mnt_t *, vnode_t *);
+void lxd_apply_db(lxd_mnt_t *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LXD_H */
diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c b/usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c
new file mode 100644
index 0000000000..02d396a36d
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/devfs/lxd_attrdb.c
@@ -0,0 +1,368 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/pathname.h> +#include <sys/debug.h> +#include <sys/sdt.h> +#include <fs/fs_subr.h> + +#include "lxd.h" + +#define LX_ATTR_FILE "/etc/.lxd_dev_attr" + +#define RD_BUFSIZE MAXPATHLEN +#define ENTRY_BUFSIZE (MAXPATHLEN + 32) + +static int +lxd_db_open(int fmode, vnode_t **vpp) +{ + return (vn_open(LX_ATTR_FILE, UIO_SYSSPACE, fmode, + (int)(0644 & MODEMASK), vpp, CRCREAT, PTOU(curproc)->u_cmask)); +} + +static int +lxd_wr_entry(vnode_t *wvn, off_t offset, char *entry) +{ + int len, err; + struct uio auio; + struct iovec aiov; + + len = strlen(entry); + aiov.iov_base = entry; + aiov.iov_len = len; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = offset; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = len; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = FWRITE; + auio.uio_extflg = UIO_COPY_DEFAULT; + + (void) VOP_RWLOCK(wvn, V_WRITELOCK_TRUE, NULL); + err = VOP_WRITE(wvn, &auio, FAPPEND, CRED(), NULL); + VOP_RWUNLOCK(wvn, V_WRITELOCK_TRUE, NULL); + + if (err != 0) + return (0); + return (len); +} + +/* + * Given an entry, apply a uid, gid and mode change to the given device. There + * is no strtok in the kernel but it's easy to tokenize the entry ourselves. 
+ * + * entries have the form (newline removed by caller): + * path uid gid mode\0 + */ +static int +lxd_apply_entry(char *entry, char **dpath, uid_t *uidp, gid_t *gidp, + mode_t *modep) +{ + char *dp, *up, *gp, *mp, *ep; + long uid, gid, mode; + int error, res = 0; + vnode_t *vp; + vattr_t va; + + dp = entry; + + /* find and delimit the first field (device name) */ + for (up = dp; *up != ' ' && *up != '\0'; up++) + ; + if (*up != ' ') + return (-1); + *up++ = '\0'; + + /* find and delimit the second field (uid) */ + for (gp = up; *gp != ' ' && *gp != '\0'; gp++) + ; + if (*gp != ' ') + return (-1); + *gp++ = '\0'; + + /* find and delimit the third field (gid) */ + for (mp = gp; *mp != ' ' && *mp != '\0'; mp++) + ; + if (*mp != ' ') + return (-1); + *mp++ = '\0'; + + /* validate the fourth field (mode) */ + ep = mp + strlen(mp); + if (*ep != '\0') + return (-1); + + if (*dp != '/') + return (-1); + + error = ddi_strtol(up, &ep, 10, &uid); + if (error != 0 || *ep != '\0' || uid > MAXUID || uid < 0) + return (-1); + + error = ddi_strtol(gp, &ep, 10, &gid); + if (error != 0 || *ep != '\0' || gid > MAXUID || gid < 0) + return (-1); + + /* note that the mode is octal */ + error = ddi_strtol(mp, &ep, 8, &mode); + if (error != 0 || *ep != '\0' || mode > 0777 || mode < 0) + return (-1); + + if (lookupname(dp, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp) != 0) { + /* + * It's likely the device is no longer visible to the zone. + * No matter the reason, we indicate failure. + */ + return (-1); + } + + va.va_mask = AT_UID | AT_GID | AT_MODE; + va.va_uid = (uid_t)uid; + va.va_gid = (gid_t)gid; + va.va_mode = (mode_t)mode; + + if (VOP_SETATTR(vp, &va, 0, CRED(), NULL) != 0) + res = -1; + + VN_RELE(vp); + + *dpath = dp; + *uidp = (uid_t)uid; + *gidp = (gid_t)gid; + *modep = (mode_t)mode; + return (res); +} + +/* + * Return true if this is a pre-existing record. 
+ */ +static boolean_t +lxd_save_devattr(lxd_mnt_t *lxdm, char *dpath, uid_t uid, gid_t gid, + mode_t mode) +{ + lxd_dev_attr_t *da; + + da = list_head(&lxdm->lxdm_devattrs); + while (da != NULL) { + if (strcmp(dpath, da->lxda_name) == 0) { + da->lxda_uid = uid; + da->lxda_gid = gid; + da->lxda_mode = mode; + return (B_TRUE); + } + da = list_next(&lxdm->lxdm_devattrs, da); + } + + da = kmem_zalloc(sizeof (lxd_dev_attr_t), KM_SLEEP); + (void) strlcpy(da->lxda_name, dpath, sizeof (da->lxda_name)); + da->lxda_uid = uid; + da->lxda_gid = gid; + da->lxda_mode = mode; + + list_insert_tail(&lxdm->lxdm_devattrs, da); + return (B_FALSE); +} + +static void +lxd_save_db(lxd_mnt_t *lxdm) +{ + lxd_dev_attr_t *da; + char *entry; + vnode_t *wvn; + off_t woff = 0; + + if (list_is_empty(&lxdm->lxdm_devattrs)) { + /* The attribute file is no longer needed. */ + (void) vn_remove(LX_ATTR_FILE, UIO_SYSSPACE, RMFILE); + return; + } + + if (lxd_db_open(FWRITE | FCREAT | FTRUNC, &wvn) != 0) + return; + + entry = kmem_alloc(ENTRY_BUFSIZE, KM_SLEEP); + + woff = lxd_wr_entry(wvn, woff, "# DO NOT EDIT: this file is " + "automatically maintained for lx container devices\n"); + + da = list_head(&lxdm->lxdm_devattrs); + while (da != NULL) { + (void) snprintf(entry, ENTRY_BUFSIZE, "%s %d %d %o\n", + da->lxda_name, da->lxda_uid, da->lxda_gid, + da->lxda_mode & 0777); + woff += lxd_wr_entry(wvn, woff, entry); + da = list_next(&lxdm->lxdm_devattrs, da); + } + + (void) VOP_CLOSE(wvn, FWRITE, 1, woff, CRED(), NULL); + + kmem_free(entry, ENTRY_BUFSIZE); +} + +/* + * This function records the uid, gid and mode information for an lx devfs + * block device node after a chown/chmod setattr operation so that these + * changes can be persistent across reboots. Since the actual setattr has + * already suceeded, the tracking of these changes is done on a "best effort" + * basis. That is, if we fail to record the change for some reason, the setattr + * will still return success. 
+ * The vp passed in is the "real vp" for the back
+ * device node.
+ *
+ * The path buffer is taken from the heap rather than the stack: it is
+ * MAXPATHLEN bytes and vnodetopath() is called with it live.
+ */
+void
+lxd_save_attrs(lxd_mnt_t *lxdm, vnode_t *vp)
+{
+	vattr_t va;
+	char *devpath;
+
+	devpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+	/* the path returned is relative to the zone's root */
+	if (vnodetopath(curproc->p_zone->zone_rootvp, vp, devpath,
+	    MAXPATHLEN, CRED()) != 0)
+		goto out;
+
+	va.va_mask = AT_MODE | AT_UID | AT_GID;
+
+	/*
+	 * We just set attrs, so the getattr shouldn't fail. If the device
+	 * is not a block device we don't persist the change.
+	 */
+	if (VOP_GETATTR(vp, &va, 0, CRED(), NULL) != 0 ||
+	    ((va.va_mode & S_IFBLK) != S_IFBLK))
+		goto out;
+
+	/*
+	 * We serialize all updates to the attribute DB file. In practice this
+	 * should not be a problem since there is rarely concurrent device
+	 * file mode changes.
+	 */
+	mutex_enter(&lxdm->lxdm_attrlck);
+
+	(void) lxd_save_devattr(lxdm, devpath, va.va_uid, va.va_gid,
+	    va.va_mode & 0777);
+	lxd_save_db(lxdm);
+
+	mutex_exit(&lxdm->lxdm_attrlck);
+
+out:
+	kmem_free(devpath, MAXPATHLEN);
+}
+
+/*
+ * Re-apply the persistent attribute settings to the devices when this lx
+ * devfs is mounted. As with lxd_save_attrs, this is done on a best effort and
+ * we won't prevent the mount if there is a problem. No locking is needed
+ * while reading the DB file since this action is performed during the
+ * mount of the devfs.
+ */ +void +lxd_apply_db(lxd_mnt_t *lxdm) +{ + vnode_t *rvn; + char *buf, *entry, *bp, *ep; + struct uio auio; + struct iovec aiov; + size_t cnt, len, ecnt, roff; + char *devpath; + uid_t uid; + gid_t gid; + mode_t mode; + boolean_t needs_update = B_FALSE; + + if (lxd_db_open(FREAD, &rvn) != 0) + return; + + buf = kmem_alloc(RD_BUFSIZE, KM_SLEEP); + entry = kmem_alloc(ENTRY_BUFSIZE, KM_SLEEP); + + roff = 0; + ep = entry; + ecnt = 0; + (void) VOP_RWLOCK(rvn, V_WRITELOCK_FALSE, NULL); +loop: + aiov.iov_base = buf; + aiov.iov_len = RD_BUFSIZE; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = roff; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = RD_BUFSIZE; + auio.uio_fmode = 0; + auio.uio_extflg = UIO_COPY_CACHED; + + (void) VOP_READ(rvn, &auio, 0, CRED(), NULL); + + len = RD_BUFSIZE - auio.uio_resid; + roff += len; + + if (len > 0) { + for (bp = buf, cnt = 0; cnt < len; bp++, cnt++) { + + /* + * We have an improperly formed entry in the file (too + * long). In an attempt to recover we reset the entry + * pointer so we can read the rest of the line and try + * to absorb the bad line. The code in lxd_apply_entry + * will handle any malformed or inapplicable entries. + */ + if (ecnt >= (ENTRY_BUFSIZE - 1)) { + ep = entry; + ecnt = 0; + needs_update = B_TRUE; + } + + if (*bp == '\n') { + *ep = '\0'; + + /* skip comments */ + if (entry[0] != '#') { + if (lxd_apply_entry(entry, &devpath, + &uid, &gid, &mode) != 0 || + lxd_save_devattr(lxdm, devpath, + uid, gid, mode)) { + /* + * An invalid entry, a + * non-existent device node or + * a duplicate entry. 
+ */ + needs_update = B_TRUE; + } + } + ep = entry; + ecnt = 0; + } else { + *ep++ = *bp; + ecnt++; + } + } + goto loop; + } + VOP_RWUNLOCK(rvn, V_WRITELOCK_FALSE, NULL); + + kmem_free(buf, RD_BUFSIZE); + kmem_free(entry, ENTRY_BUFSIZE); + + (void) VOP_CLOSE(rvn, FREAD, 1, 0, CRED(), NULL); + + if (needs_update) + lxd_save_db(lxdm); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_node.c b/usr/src/uts/common/brand/lx/devfs/lxd_node.c new file mode 100644 index 0000000000..30fdeb82a6 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_node.c @@ -0,0 +1,1012 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2021 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/sdt.h> + +#include "lxd.h" + +#define LXD_HASH_SIZE 8192 /* must be power of 2 */ +#define LXD_MUTEX_SIZE 64 + + +#define MODESHIFT 3 + +typedef enum lxd_nodehold { + NOHOLD, + HOLD +} lxd_nodehold_t; + +/* + * The following functions maintain the per-mount "front" files. 
+ *
+ * lxd_save_dirent() hashes a directory entry by parent node and name, links
+ * it at the head of its hash chain and bumps the per-mount entry count.
+ * lxd_rm_dirent() unlinks an entry from the chain recorded in lddir_hash and
+ * drops the count. lxd_find_dirent() looks an entry up by parent and name.
+ * Each chain is only touched with its lxdm_hash_mutex bucket lock held.
+ */
+static void
+lxd_save_dirent(lxd_dirent_t *de)
+{
+	lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent));
+	uint_t hash;
+	kmutex_t *hmtx;
+
+	LXD_NM_HASH(de->lddir_parent, de->lddir_name, hash);
+	de->lddir_hash = hash;	/* remembered for lxd_rm_dirent() */
+
+	hmtx = &lxdm->lxdm_hash_mutex[hash];
+
+	mutex_enter(hmtx);
+	ASSERT(de->lddir_link == NULL);
+	de->lddir_link = lxdm->lxdm_dent_htable[hash];
+	lxdm->lxdm_dent_htable[hash] = de;
+	mutex_exit(hmtx);
+
+	atomic_inc_32(&lxdm->lxdm_dent_refcnt);
+}
+
+static void
+lxd_rm_dirent(lxd_dirent_t *de)
+{
+	lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(de->lddir_parent));
+	uint_t hash;
+	lxd_dirent_t **prevpp;
+	kmutex_t *hmtx;
+
+	hash = de->lddir_hash;
+	hmtx = &lxdm->lxdm_hash_mutex[hash];
+
+	mutex_enter(hmtx);
+	prevpp = &lxdm->lxdm_dent_htable[hash];
+	while (*prevpp != de)	/* no NULL check: de must be on this chain */
+		prevpp = &(*prevpp)->lddir_link;
+	*prevpp = de->lddir_link;
+	de->lddir_link = NULL;
+	mutex_exit(hmtx);
+
+	ASSERT(lxdm->lxdm_dent_refcnt > 0);
+	atomic_dec_32(&lxdm->lxdm_dent_refcnt);
+}
+
+static lxd_dirent_t *
+lxd_find_dirent(char *name, lxd_node_t *parent, lxd_nodehold_t do_hold,
+    lxd_node_t **found)
+{
+	lxd_mnt_t *lxdm = VTOLXDM(LDNTOV(parent));
+	lxd_dirent_t *de;
+	uint_t hash;
+	kmutex_t *hmtx;
+
+	LXD_NM_HASH(parent, name, hash);
+	hmtx = &lxdm->lxdm_hash_mutex[hash];
+
+	mutex_enter(hmtx);
+	de = lxdm->lxdm_dent_htable[hash];
+	while (de) {
+		if (de->lddir_hash == hash && de->lddir_parent == parent &&
+		    strcmp(de->lddir_name, name) == 0) {
+			lxd_node_t *ldn = de->lddir_node;
+
+			if (do_hold == HOLD) {
+				ASSERT(ldn != NULL);
+				ldnode_hold(ldn); /* caller must ldnode_rele() */
+			}
+			if (found != NULL)
+				*found = ldn;
+			mutex_exit(hmtx);
+			return (de);
+		}
+
+		de = de->lddir_link;
+	}
+	mutex_exit(hmtx);
+	return (NULL);	/* no match; *found is left untouched */
+}
+
+int
+lxd_naccess(void *vcp, int mode, cred_t *cr)
+{
+	lxd_node_t *ldn = vcp;
+	int shift = 0;
+	/*
+	 * Check access based on owner, group and public perms in lxd_node.
+	 * The mode is shifted left by MODESHIFT once if the caller is not the
+	 * owner and once more if it is not a group member either. Front
+	 * nodes are then checked here; back nodes defer to VOP_ACCESS on the
+	 * real vnode.
+	 */
+	if (crgetuid(cr) != ldn->lxdn_uid) {
+		shift += MODESHIFT;
+		if (groupmember(ldn->lxdn_gid, cr) == 0)
+			shift += MODESHIFT;
+	}
+
+	if (ldn->lxdn_type == LXDNT_FRONT)
+		return (secpolicy_vnode_access2(cr, LDNTOV(ldn),
+		    ldn->lxdn_uid, ldn->lxdn_mode << shift, mode));
+
+	ASSERT(ldn->lxdn_type == LXDNT_BACK);
+	return (VOP_ACCESS(ldn->lxdn_real_vp, mode, 0, cr, NULL));
+}
+
+static lxd_node_t *
+lxd_find_back(struct vnode *vp, uint_t hash, lxd_mnt_t *lxdm)
+{
+	lxd_node_t *l;
+
+	ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash]));
+
+	for (l = lxdm->lxdm_back_htable[hash]; l != NULL; l = l->lxdn_hnxt) {
+		if (l->lxdn_real_vp == vp) {
+			ASSERT(l->lxdn_type == LXDNT_BACK);
+
+			VN_HOLD(LDNTOV(l));	/* hold is passed to the caller */
+			return (l);
+		}
+	}
+	return (NULL);
+}
+
+static void
+lxd_save_back(lxd_node_t *l, uint_t hash, lxd_mnt_t *lxdm)
+{
+	ASSERT(l->lxdn_type == LXDNT_BACK);
+	ASSERT(l->lxdn_real_vp != NULL);
+	ASSERT(MUTEX_HELD(&lxdm->lxdm_hash_mutex[hash]));
+
+	atomic_inc_32(&lxdm->lxdm_back_refcnt);
+
+	l->lxdn_hnxt = lxdm->lxdm_back_htable[hash];	/* insert at chain head */
+	lxdm->lxdm_back_htable[hash] = l;
+}
+
+
+struct vnode *
+lxd_make_back_node(struct vnode *vp, lxd_mnt_t *lxdm)
+{
+	uint_t hash;
+	kmutex_t *hmtx;
+	lxd_node_t *l;
+
+	hash = LXD_BACK_HASH(vp);	/* Note: hashing with realvp */
+	hmtx = &lxdm->lxdm_hash_mutex[hash];
+	mutex_enter(hmtx);
+
+	l = lxd_find_back(vp, hash, lxdm);
+	if (l == NULL) {
+		vnode_t *nvp;
+
+		l = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP);
+		nvp = vn_alloc(KM_SLEEP);
+
+		rw_init(&l->lxdn_rwlock, NULL, RW_DEFAULT, NULL);
+		mutex_init(&l->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL);
+
+		l->lxdn_vnode = nvp;
+		l->lxdn_type = LXDNT_BACK;
+		l->lxdn_real_vp = vp;	/* hold dropped in lxd_free_back_node */
+
+		VN_SET_VFS_TYPE_DEV(nvp, lxdm->lxdm_vfsp, vp->v_type,
+		    vp->v_rdev);
+		nvp->v_flag |= (vp->v_flag & (VNOMOUNT|VNOMAP|VDIROPEN));
+		vn_setops(nvp, lxd_vnodeops);
+		nvp->v_data = (caddr_t)l;
+
+		lxd_save_back(l, hash, lxdm);
+		vn_exists(vp);	/* NOTE(review): real vnode, not nvp - confirm */
+	} else {
+		VN_RELE(vp);	/* presumably the existing node has its own hold */
+	}
+
+	mutex_exit(hmtx);
+	return (LDNTOV(l));
+}
+
+void
+lxd_free_back_node(lxd_node_t *lp)
+{
+	uint_t hash;
+	kmutex_t *hmtx;
+	lxd_node_t *l;
+	lxd_node_t *lprev = NULL;
+	vnode_t *vp = LDNTOV(lp);
+	vnode_t *realvp = REALVP(vp);
+	lxd_mnt_t *lxdm = VTOLXDM(vp);
+
+	/* in lxd_make_back_node we call lxd_find_back with the realvp */
+	hash = LXD_BACK_HASH(realvp);
+	hmtx = &lxdm->lxdm_hash_mutex[hash];
+	mutex_enter(hmtx);
+
+	mutex_enter(&vp->v_lock);
+	if (vp->v_count > 1) {	/* someone else still holds it: keep the node */
+		vp->v_count--;	/* release our hold from vn_rele */
+		mutex_exit(&vp->v_lock);
+		mutex_exit(hmtx);
+		return;
+	}
+	mutex_exit(&vp->v_lock);
+
+	for (l = lxdm->lxdm_back_htable[hash]; l != NULL;
+	    lprev = l, l = l->lxdn_hnxt) {
+
+		if (l != lp)
+			continue;
+
+		ASSERT(l->lxdn_type == LXDNT_BACK);
+		ASSERT(lxdm->lxdm_back_refcnt > 0);
+
+		atomic_dec_32(&lxdm->lxdm_back_refcnt);
+		vn_invalid(vp);
+
+		if (lprev == NULL) {	/* lp is the head of the chain */
+			lxdm->lxdm_back_htable[hash] = l->lxdn_hnxt;
+		} else {
+			lprev->lxdn_hnxt = l->lxdn_hnxt;
+		}
+
+		mutex_exit(hmtx);
+		rw_destroy(&l->lxdn_rwlock);
+		mutex_destroy(&l->lxdn_tlock);
+		kmem_free(l, sizeof (lxd_node_t));
+		vn_free(vp);
+		VN_RELE(realvp);	/* drop the hold on the real vnode */
+		return;
+	}
+
+	panic("lxd_free_back_node");	/* lp was not on its hash chain */
+	/*NOTREACHED*/
+}
+/*
+ * Search directory 'parent' for entry 'name'.
+ *
+ * 0 is returned on success and *foundnp points
+ * to the found lxd_node with its vnode held.
+ *
+ * ENOTDIR is returned if 'parent' is not a directory and ENOENT if there is
+ * no such entry; a failed VEXEC check on 'parent' returns that error. An
+ * empty name yields 'parent' itself. On any failure *foundnp is NULL.
+ */
+int
+lxd_dirlookup(lxd_node_t *parent, char *name, lxd_node_t **foundnp, cred_t *cr)
+{
+	int error;
+
+	*foundnp = NULL;
+	if (parent->lxdn_vnode->v_type != VDIR)
+		return (ENOTDIR);
+
+	if ((error = lxd_naccess(parent, VEXEC, cr)))
+		return (error);
+
+	if (*name == '\0') {
+		ldnode_hold(parent);
+		*foundnp = parent;
+		return (0);
+	}
+
+	/*
+	 * Search the directory for the matching name
+	 * We need the lock protecting the lxdn_dir list
+	 * so that it doesn't change out from underneath us.
+	 * lxd_find_dirent() will pass back the lxd_node
+	 * with a hold on it.
+	 */
+
+	if (lxd_find_dirent(name, parent, HOLD, foundnp) != NULL) {
+		ASSERT(*foundnp);
+		return (0);
+	}
+
+	return (ENOENT);
+}
+
+/*
+ * Check if the source directory is in the path of the target directory.
+ * The target directory is locked by the caller.
+ *
+ * The walk follows lxdn_parent links upward from 'toparent'. It returns
+ * EINVAL if 'fromnode' is met on the way, ENOENT if a parent link is NULL,
+ * and 0 once a node that is its own parent (the root) is reached.
+ */
+static int
+lxd_dircheckpath(lxd_node_t *fromnode, lxd_node_t *toparent)
+{
+	int error = 0;
+	lxd_node_t *dir, *dotdot;
+
+	ASSERT(RW_WRITE_HELD(&toparent->lxdn_rwlock));
+	ASSERT(toparent->lxdn_vnode->v_type == VDIR);
+
+	dotdot = toparent->lxdn_parent;
+	if (dotdot == NULL)
+		return (ENOENT);
+	ldnode_hold(dotdot);
+
+	if (dotdot == toparent) {
+		/* root of fs. search trivially satisfied. */
+		ldnode_rele(dotdot);
+		return (0);
+	}
+
+	for (;;) {
+		/*
+		 * Return error for cases like "mv c c/d",
+		 * "mv c c/d/e" and so on.
+		 */
+		if (dotdot == fromnode) {
+			ldnode_rele(dotdot);
+			error = EINVAL;
+			break;
+		}
+
+		dir = dotdot;
+		dotdot = dir->lxdn_parent;
+		if (dotdot == NULL) {
+			ldnode_rele(dir);
+			error = ENOENT;
+			break;
+		}
+		ldnode_hold(dotdot);
+
+		/*
+		 * We're okay if we traverse the directory tree up to
+		 * the root directory and don't run into the
+		 * parent directory.
+		 */
+		if (dir == dotdot) {
+			ldnode_rele(dir);
+			ldnode_rele(dotdot);
+			break;
+		}
+		ldnode_rele(dir);
+	}
+
+	return (error);
+}
+
+static int
+lxd_dir_make_node(lxd_node_t *dir, lxd_mnt_t *lxdm, struct vattr *va,
+    enum de_op op, lxd_node_t **newnode, struct cred *cred)
+{
+	lxd_node_t *ldn;
+
+	ASSERT(va != NULL);
+
+	if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
+	    ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
+		return (EOVERFLOW);
+
+	ldn = kmem_zalloc(sizeof (lxd_node_t), KM_SLEEP);
+
+	ldn->lxdn_type = LXDNT_FRONT;
+	lxd_node_init(lxdm, ldn, NULL, va, cred);
+
+	ldn->lxdn_vnode->v_rdev = ldn->lxdn_rdev = NODEV;
+	ldn->lxdn_vnode->v_type = va->va_type;
+	ldn->lxdn_uid = crgetuid(cred);
+	ldn->lxdn_gid = crgetgid(cred);
+	/* NOTE(review): lxd_node_init already assigned an id, under a lock */
+	ldn->lxdn_nodeid = lxdm->lxdm_gen++;
+
+	if (va->va_mask & AT_ATIME)
+		ldn->lxdn_atime = va->va_atime;
+	if (va->va_mask & AT_MTIME)
+		ldn->lxdn_mtime = va->va_mtime;
+
+	if (op == DE_MKDIR) {
+		lxd_dirinit(dir, ldn);	/* gives the new dir its "." and ".." */
+	}
+
+	*newnode = ldn;
+	return (0);
+}
+
+static int
+lxd_diraddentry(lxd_node_t *dir, lxd_node_t *ldn, char *name)
+{
+	lxd_dirent_t *dp, *pdp;
+	size_t namelen, alloc_size;
+	timestruc_t now;
+
+	/*
+	 * Make sure the parent directory wasn't removed from
+	 * underneath the caller.
+	 */
+	if (dir->lxdn_dir == NULL)
+		return (ENOENT);
+
+	/* Check that everything is on the same filesystem. */
+	if (ldn->lxdn_vnode->v_vfsp != dir->lxdn_vnode->v_vfsp)
+		return (EXDEV);
+
+	/* Allocate and initialize directory entry */
+	namelen = strlen(name) + 1;
+	alloc_size = namelen + sizeof (lxd_dirent_t);
+	dp = kmem_zalloc(alloc_size, KM_NOSLEEP_LAZY);
+	if (dp == NULL)
+		return (ENOSPC);
+
+	ldn->lxdn_parent = dir;
+
+	dir->lxdn_size += alloc_size;
+	dir->lxdn_dirents++;
+	dp->lddir_node = ldn;
+	dp->lddir_parent = dir;
+
+	/* The directory entry and its name were allocated sequentially. */
+	dp->lddir_name = (char *)dp + sizeof (lxd_dirent_t);
+	(void) strcpy(dp->lddir_name, name);
+
+	lxd_save_dirent(dp);
+
+	/*
+	 * Some utilities expect the size of a directory to remain
+	 * somewhat static. For example, a routine which removes
+	 * subdirectories between calls to readdir(); the size of the
+	 * directory changes from underneath it and so the real
+	 * directory offset in bytes is invalid. To circumvent
+	 * this problem, we initialize a directory entry with an
+	 * phony offset, and use this offset to determine end of
+	 * file in lxd_readdir.
+	 */
+	pdp = dir->lxdn_dir->lddir_prev;
+	/*
+	 * Install at first empty "slot" in directory list.
+	 */
+	while (pdp->lddir_next != NULL &&
+	    (pdp->lddir_next->lddir_offset - pdp->lddir_offset) <= 1) {
+		ASSERT(pdp->lddir_next != pdp);
+		ASSERT(pdp->lddir_prev != pdp);
+		ASSERT(pdp->lddir_next->lddir_offset > pdp->lddir_offset);
+		pdp = pdp->lddir_next;
+	}
+	dp->lddir_offset = pdp->lddir_offset + 1;
+
+	/*
+	 * If we're at the end of the dirent list and the offset (which
+	 * is necessarily the largest offset in this directory) is more
+	 * than twice the number of dirents, that means the directory is
+	 * 50% holes. At this point we reset the slot pointer back to
+	 * the beginning of the directory so we start using the holes.
+	 * The idea is that if there are N dirents, there must also be
+	 * N holes, so we can satisfy the next N creates by walking at
+	 * most 2N entries; thus the average cost of a create is constant.
+	 * Note that we use the first dirent's lddir_prev as the roving
+	 * slot pointer; it's ugly, but it saves a word in every dirent.
+ */ + if (pdp->lddir_next == NULL && + pdp->lddir_offset > 2 * dir->lxdn_dirents) + dir->lxdn_dir->lddir_prev = dir->lxdn_dir->lddir_next; + else + dir->lxdn_dir->lddir_prev = dp; + + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + + dp->lddir_next = pdp->lddir_next; + if (dp->lddir_next) { + dp->lddir_next->lddir_prev = dp; + } + dp->lddir_prev = pdp; + pdp->lddir_next = dp; + + ASSERT(dp->lddir_next != dp); + ASSERT(dp->lddir_prev != dp); + ASSERT(pdp->lddir_next != pdp); + ASSERT(pdp->lddir_prev != pdp); + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + return (0); +} + +/* + * Enter a directory entry for 'name' into directory 'dir' + * + * Returns 0 on success. + */ +int +lxd_direnter( + lxd_mnt_t *lxdm, + lxd_node_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + lxd_node_t *fromparent, /* original directory if rename */ + lxd_node_t *ldn, /* existing lxd_node, if rename */ + struct vattr *va, + lxd_node_t **rnp, /* return lxd_node, if create/mkdir */ + cred_t *cr) +{ + lxd_dirent_t *dirp; + lxd_node_t *found = NULL; + int error = 0; + char *s; + + /* lxdn_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + /* + * Don't allow '/' characters in pathname component, + */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("lxd_direnter: NULL name"); + + /* + * For rename lock the source entry and check the link count + * to see if it has been removed while it was unlocked. 
+ */ + if (op == DE_RENAME) { + mutex_enter(&ldn->lxdn_tlock); + if (ldn->lxdn_nlink == 0) { + mutex_exit(&ldn->lxdn_tlock); + return (ENOENT); + } + + if (ldn->lxdn_nlink == MAXLINK) { + mutex_exit(&ldn->lxdn_tlock); + return (EMLINK); + } + ldn->lxdn_nlink++; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + } + + /* + * This might be a "dangling detached directory" (it could have been + * removed, but a reference to it kept in u_cwd). Don't bother + * searching it, and with any luck the user will get tired of dealing + * with us and cd to some absolute pathway (thus in ufs, too). + */ + if (dir->lxdn_nlink == 0) { + error = ENOENT; + goto out; + } + + /* + * If this is a rename of a directory and the parent is different + * (".." must be changed), then the source directory must not be in the + * directory hierarchy above the target, as this would orphan + * everything below the source directory. + */ + if (op == DE_RENAME) { + if (ldn == dir) { + error = EINVAL; + goto out; + } + if ((ldn->lxdn_vnode->v_type) == VDIR) { + if ((fromparent != dir) && + (error = lxd_dircheckpath(ldn, dir)) != 0) { + goto out; + } + } + } + + /* Search for an existing entry. */ + dirp = lxd_find_dirent(name, dir, HOLD, &found); + if (dirp != NULL) { + ASSERT(found != NULL); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (rnp != NULL) { + *rnp = found; + error = EEXIST; + } else { + ldnode_rele(found); + } + break; + + case DE_RENAME: + /* + * Note that we only hit this path when we're renaming + * a symlink from one directory to another and there is + * a pre-existing symlink as the target. lxd_rename + * will unlink the src from the original directory but + * here we need to unlink the dest that we collided + * with, then create the new directory entry as we do + * below when there is no pre-existing symlink. 
+ */ + if ((error = lxd_naccess(dir, VWRITE, cr)) != 0) + goto out; + + ASSERT(found->lxdn_vnode->v_type == VLNK); + /* dir rw lock is already held and asserted above */ + rw_enter(&found->lxdn_rwlock, RW_WRITER); + error = lxd_dirdelete(dir, found, name, DR_RENAME, cr); + rw_exit(&found->lxdn_rwlock); + ldnode_rele(found); + if (error != 0) + goto out; + + error = lxd_diraddentry(dir, ldn, name); + if (error == 0 && rnp != NULL) + *rnp = ldn; + break; + } + } else { + + /* + * The directory entry does not exist, but the node might if + * this is a rename. Check write permission in directory to + * see if entry can be created. + */ + if ((error = lxd_naccess(dir, VWRITE, cr)) != 0) + goto out; + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Make new lxd_node and directory entry as required. + */ + error = lxd_dir_make_node(dir, lxdm, va, op, &ldn, cr); + if (error) + goto out; + } + + error = lxd_diraddentry(dir, ldn, name); + if (error != 0) { + if (op == DE_CREATE || op == DE_MKDIR) { + /* + * Unmake the inode we just made. + */ + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + if ((ldn->lxdn_vnode->v_type) == VDIR) { + ASSERT(dirp == NULL); + /* + * cleanup allocs made by lxd_dirinit + */ + lxd_dirtrunc(ldn); + } + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink = 0; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + rw_exit(&ldn->lxdn_rwlock); + ldnode_rele(ldn); + ldn = NULL; + } + } else if (rnp != NULL) { + *rnp = ldn; + } else if (op == DE_CREATE || op == DE_MKDIR) { + ldnode_rele(ldn); + } + } + +out: + if (error && op == DE_RENAME) { + /* Undo bumped link count. */ + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + gethrestime(&ldn->lxdn_ctime); + mutex_exit(&ldn->lxdn_tlock); + } + return (error); +} + +/* + * Delete entry ldn of name "nm" from parent dir. This is used to both remove + * a directory and to remove file nodes within the directory (by recursively + * calling itself). 
+ * It frees the dir entry space and decrements link count on
+ * lxd_node(s).
+ *
+ * The caller must hold the lxdn_rwlock of both dir and ldn as writer.
+ *
+ * Removing "." fails with EINVAL and ".." with EEXIST. ENOENT is returned
+ * if dir has no entry list or no longer has an entry "nm" that refers to
+ * ldn. A failed VEXEC|VWRITE check on dir returns that error.
+ *
+ * Return 0 on success.
+ */
+int
+lxd_dirdelete(lxd_node_t *dir, lxd_node_t *ldn, char *nm, enum dr_op op,
+    cred_t *cred)
+{
+	lxd_dirent_t *dirp;
+	int error;
+	size_t namelen;
+	lxd_node_t *fndnp;
+	timestruc_t now;
+
+	ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock));
+	ASSERT(RW_WRITE_HELD(&ldn->lxdn_rwlock));
+	ASSERT(dir->lxdn_vnode->v_type == VDIR);
+
+	if (nm[0] == '\0')
+		panic("lxd_dirdelete: empty name for 0x%p", (void *)ldn);
+
+	/*
+	 * return error when removing . and ..
+	 */
+	if (nm[0] == '.') {
+		if (nm[1] == '\0')
+			return (EINVAL);
+		if (nm[1] == '.' && nm[2] == '\0')
+			return (EEXIST); /* thus in ufs */
+	}
+
+	if ((error = lxd_naccess(dir, VEXEC|VWRITE, cred)) != 0)
+		return (error);
+
+	if (dir->lxdn_dir == NULL)
+		return (ENOENT);
+
+	if (op == DR_RMDIR) {
+		/*
+		 * This is the top-level removal of a directory. Start by
+		 * removing any file entries from the dir. We do this by
+		 * recursively calling back into this function with a different
+		 * op code. The caller of this function has already verified
+		 * that it is safe to remove this directory.
+		 */
+		lxd_dirent_t *cdp;	/* walks the entries of ldn itself */
+
+		ASSERT(ldn->lxdn_vnode->v_type == VDIR);
+
+		cdp = ldn->lxdn_dir;
+		while (cdp) {
+			lxd_node_t *dn;
+			lxd_dirent_t *nextp;
+
+			if (strcmp(cdp->lddir_name, ".") == 0 ||
+			    strcmp(cdp->lddir_name, "..") == 0) {
+				cdp = cdp->lddir_next;
+				continue;
+			}
+
+			dn = cdp->lddir_node;
+			/* cdp is freed by the call below; save its successor */
+			nextp = cdp->lddir_next;
+
+			/*
+			 * The recursive call requires the child's rwlock to
+			 * be held as writer, just like the rename path in
+			 * lxd_direnter does. The lock is dropped before the
+			 * hold so we never touch it on a released node.
+			 */
+			ldnode_hold(dn);
+			rw_enter(&dn->lxdn_rwlock, RW_WRITER);
+			error = lxd_dirdelete(ldn, dn, cdp->lddir_name,
+			    DR_REMOVE, cred);
+			rw_exit(&dn->lxdn_rwlock);
+			ldnode_rele(dn);
+			if (error != 0)
+				return (error);
+
+			cdp = nextp;
+		}
+	}
+
+	dirp = lxd_find_dirent(nm, dir, NOHOLD, &fndnp);
+	/* These used to be VERIFY(), but in racy conditions they can fail. */
+	if (dirp == NULL) {
+		/* Can't find the directory entry at all now! */
+		return (ENOENT);
+	}
+	if (ldn != fndnp) {
+		/* Returned fndnp isn't our original, so it's also not-there. */
+		return (ENOENT);
+	}
+
+	lxd_rm_dirent(dirp);
+
+	/* Take dirp out of the directory list. */
+	ASSERT(dirp->lddir_next != dirp);
+	ASSERT(dirp->lddir_prev != dirp);
+	if (dirp->lddir_prev) {
+		dirp->lddir_prev->lddir_next = dirp->lddir_next;
+	}
+	if (dirp->lddir_next) {
+		dirp->lddir_next->lddir_prev = dirp->lddir_prev;
+	}
+
+	/*
+	 * If the roving slot pointer happens to match dirp,
+	 * point it at the previous dirent.
+	 */
+	if (dir->lxdn_dir->lddir_prev == dirp) {
+		dir->lxdn_dir->lddir_prev = dirp->lddir_prev;
+	}
+	ASSERT(dirp->lddir_next != dirp);
+	ASSERT(dirp->lddir_prev != dirp);
+
+	/* dirp points to the correct directory entry */
+	namelen = strlen(dirp->lddir_name) + 1;
+
+	kmem_free(dirp, sizeof (lxd_dirent_t) + namelen);
+	dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen);
+	dir->lxdn_dirents--;
+
+	gethrestime(&now);
+	dir->lxdn_mtime = now;
+	dir->lxdn_ctime = now;
+	ldn->lxdn_ctime = now;
+
+	ASSERT(ldn->lxdn_nlink > 0);
+	mutex_enter(&ldn->lxdn_tlock);
+	ldn->lxdn_nlink--;
+	mutex_exit(&ldn->lxdn_tlock);
+	if (op == DR_RMDIR && ldn->lxdn_vnode->v_type == VDIR) {
+		lxd_dirtrunc(ldn);
+		ASSERT(ldn->lxdn_nlink == 0);
+	}
+	return (0);
+}
+
+/*
+ * Initialize a lxd_node and add it to file list under mount point.
+ */ +void +lxd_node_init(lxd_mnt_t *lxdm, lxd_node_t *ldn, vnode_t *realvp, vattr_t *vap, + cred_t *cred) +{ + struct vnode *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&ldn->lxdn_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&ldn->lxdn_tlock, NULL, MUTEX_DEFAULT, NULL); + ldn->lxdn_mode = MAKEIMODE(vap->va_type, vap->va_mode); + ldn->lxdn_mask = 0; + ldn->lxdn_attr.va_type = vap->va_type; + ldn->lxdn_nlink = 1; + ldn->lxdn_size = 0; + + if (cred == NULL) { + ldn->lxdn_uid = vap->va_uid; + ldn->lxdn_gid = vap->va_gid; + } else { + ldn->lxdn_uid = crgetuid(cred); + ldn->lxdn_gid = crgetgid(cred); + } + + ldn->lxdn_fsid = lxdm->lxdm_dev; + ldn->lxdn_rdev = vap->va_rdev; + ldn->lxdn_blksize = PAGESIZE; + ldn->lxdn_nblocks = 0; + gethrestime(&now); + ldn->lxdn_atime = now; + ldn->lxdn_mtime = now; + ldn->lxdn_ctime = now; + ldn->lxdn_seq = 0; + ldn->lxdn_dir = NULL; + + ldn->lxdn_real_vp = realvp; + + ldn->lxdn_vnode = vn_alloc(KM_SLEEP); + vp = LDNTOV(ldn); + vn_setops(vp, lxd_vnodeops); + vp->v_vfsp = lxdm->lxdm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)ldn; + + mutex_enter(&lxdm->lxdm_contents); + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + + /* + * Add new lxd_node to end of linked list of lxd_nodes for this + * lxdevfs. Root directory is handled specially in lxd_mount. + */ + if (lxdm->lxdm_rootnode != (lxd_node_t *)NULL) { + ldn->lxdn_next = NULL; + ldn->lxdn_prev = lxdm->lxdm_rootnode->lxdn_prev; + ldn->lxdn_prev->lxdn_next = lxdm->lxdm_rootnode->lxdn_prev = + ldn; + } + mutex_exit(&lxdm->lxdm_contents); + vn_exists(vp); +} + +/* + * lxd_dirinit is used internally to initialize a directory (dir) + * with '.' and '..' entries without checking permissions and locking + * It also creates the entries for the pseudo file nodes that reside in the + * directory. 
+ */ +void +lxd_dirinit(lxd_node_t *parent, lxd_node_t *dir) +{ + lxd_dirent_t *dot, *dotdot; + timestruc_t now; + lxd_mnt_t *lxdm = VTOLXDM(dir->lxdn_vnode); + struct vattr nattr; + + ASSERT(RW_WRITE_HELD(&parent->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + dir->lxdn_nodeid = lxdm->lxdm_gen++; + + /* + * Initialize the entries + */ + dot = kmem_zalloc(sizeof (lxd_dirent_t) + 2, KM_SLEEP); + dot->lddir_node = dir; + dot->lddir_offset = 0; + dot->lddir_name = (char *)dot + sizeof (lxd_dirent_t); + dot->lddir_name[0] = '.'; + dot->lddir_parent = dir; + lxd_save_dirent(dot); + + dotdot = kmem_zalloc(sizeof (lxd_dirent_t) + 3, KM_SLEEP); + dotdot->lddir_node = parent; + dotdot->lddir_offset = 1; + dotdot->lddir_name = (char *)dotdot + sizeof (lxd_dirent_t); + dotdot->lddir_name[0] = '.'; + dotdot->lddir_name[1] = '.'; + dotdot->lddir_parent = dir; + lxd_save_dirent(dotdot); + + /* + * Initialize directory entry list. + */ + dot->lddir_next = dotdot; + dot->lddir_prev = dotdot; /* dot's lddir_prev holds roving slot ptr */ + dotdot->lddir_next = NULL; + dotdot->lddir_prev = dot; + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + parent->lxdn_nlink++; + parent->lxdn_ctime = now; + + dir->lxdn_dir = dot; + dir->lxdn_size = 2 * sizeof (lxd_dirent_t) + 5; /* dot and dotdot */ + dir->lxdn_dirents = 2; + dir->lxdn_nlink = 2; + dir->lxdn_parent = parent; + + bzero(&nattr, sizeof (struct vattr)); + nattr.va_mode = (mode_t)(0644); + nattr.va_type = VREG; + nattr.va_rdev = 0; +} + +/* + * lxd_dirtrunc is called to remove all directory entries under this directory. 
+ */ +void +lxd_dirtrunc(lxd_node_t *dir) +{ + lxd_dirent_t *ldp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->lxdn_rwlock)); + ASSERT(dir->lxdn_vnode->v_type == VDIR); + + for (ldp = dir->lxdn_dir; ldp; ldp = dir->lxdn_dir) { + size_t namelen; + lxd_node_t *ldn; + + ASSERT(ldp->lddir_next != ldp); + ASSERT(ldp->lddir_prev != ldp); + ASSERT(ldp->lddir_node); + + dir->lxdn_dir = ldp->lddir_next; + namelen = strlen(ldp->lddir_name) + 1; + + /* + * Adjust the link counts to account for this directory entry + * removal. We do hold/rele operations to free up these nodes. + */ + ldn = ldp->lddir_node; + + ASSERT(ldn->lxdn_nlink > 0); + mutex_enter(&ldn->lxdn_tlock); + ldn->lxdn_nlink--; + mutex_exit(&ldn->lxdn_tlock); + + lxd_rm_dirent(ldp); + kmem_free(ldp, sizeof (lxd_dirent_t) + namelen); + dir->lxdn_size -= (sizeof (lxd_dirent_t) + namelen); + dir->lxdn_dirents--; + } + + gethrestime(&now); + dir->lxdn_mtime = now; + dir->lxdn_ctime = now; + + ASSERT(dir->lxdn_dir == NULL); + ASSERT(dir->lxdn_size == 0); + ASSERT(dir->lxdn_dirents == 0); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c new file mode 100644 index 0000000000..69c131d886 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_vfsops.c @@ -0,0 +1,860 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * The lx devfs (lxd) file system is used within lx branded zones to provide + * the Linux view of /dev. + * + * In the past, the Linux /dev was simply a lofs mount pointing at /native/dev. 
+ * lxd now provides the Linux /dev. + * + * The lxd file system is a hybrid of lofs and tmpfs. It supports a "back" file + * system which is the special device and corresponds to the special device in + * a lofs mount. As with lofs, all files in the special device are accessible + * through the lxd mount. Because the zone's devfs is not directly modifiable + * within the zone (also mknod(2) is not generally allowed within a zone) it is + * impossible to create files in devfs. For lx, in some cases it's useful to be + * able to make new symlinks or new directories under /dev. lxd implements + * these operations by creating "files" in memory in the same way as tmpfs + * does. Within lxd these are referred to as "front" files. For operations such + * as lookup or readdir, lxd provides a merged view of both the front and back + * files. lxd does not support regular front files or simple I/O (read/write) + * to front files, since there is no need for that. For back files, all + * operations are simply passed through to the real vnode, as is done with + * lofs. Front files are not allowed to mask back files. + * + * The Linux /dev is now a lxd mount with the special file (i.e. the back + * file system) as /native/dev. + * + * In addition, lx has a need for some illumos/Linux translation for the + * various *stat(2) system calls when used on a device. This translation can + * be centralized within lxd's getattr vnode entry point. + * + * Because the front file system only exists in memory and the back file + * system is the zone's devfs, which is not persistent across reboots, we + * track any device uid/gid/mode changes in a per-zone /etc/.lxd_dev_attr + * file and re-apply those changes when the lx devfs file system is mounted. + * Currently only changes to block device nodes are persistent. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <sys/policy.h> +#include <sys/sdt.h> +#include <sys/ddi.h> +#include <sys/lx_brand.h> +#include <sys/lx_ptm.h> +#include <sys/lx_impl.h> + +#include "lxd.h" + +/* Module level parameters */ +static int lxd_fstype; +static dev_t lxd_dev; + +/* + * lxd_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. The filesystem module must not be + * allowed to go away before the last VFS_FREEVFS() call has been made. Since + * this is just an atomic counter, there's no need for locking. + */ +static uint32_t lxd_mountcount; + +/* + * lxd_minfree is the minimum amount of swap space that lx devfs leaves for + * the rest of the zone. + */ +size_t lxd_minfree = 0; + +/* + * LXDMINFREE -- the value from which lxd_minfree is derived -- should be + * configured to a value that is roughly the smallest practical value for + * memory + swap minus the largest reasonable size for lxd in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow lxd to consume + * no more than ~10% of this, yielding a LXDMINFREE of 12MB. + */ +#define LXDMINFREE 12 * 1024 * 1024 /* 12 Megabytes */ + +extern pgcnt_t swapfs_minfree; + +extern int lxd_symlink(vnode_t *, char *, struct vattr *, char *, cred_t *, + caller_context_t *, int); +extern int stat64(char *, struct stat64 *); + +/* + * lxd vfs operations. 
+ */ +static int lxd_init(int, char *); +static int lxd_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int lxd_unmount(vfs_t *, int, cred_t *); +static int lxd_root(vfs_t *, vnode_t **); +static int lxd_statvfs(vfs_t *, statvfs64_t *); +static void lxd_freevfs(vfs_t *vfsp); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_devfs", + lxd_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "lx brand devfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +/* + * Definitions and translators for devt's. + */ +static void lxd_pts_devt_translator(dev_t, dev_t *); +static void lxd_ptm_devt_translator(dev_t, dev_t *); + +static kmutex_t lxd_xlate_lock; +static boolean_t lxd_xlate_initialized = B_FALSE; + +static lxd_minor_translator_t lxd_mtranslator_mm[] = { + { "/dev/null", 0, 1, 3 }, + { "/dev/zero", 0, 1, 5 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_random[] = { + { "/dev/random", 0, 1, 8 }, + { "/dev/urandom", 0, 1, 9 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_sy[] = { + { "/dev/tty", 0, LX_TTY_MAJOR, 0 }, + { NULL, 0, 0, 0 } +}; +static lxd_minor_translator_t lxd_mtranslator_zcons[] = { + { "/dev/console", 0, LX_TTY_MAJOR, 1 }, + { NULL, 0, 0, 0 } +}; +lxd_devt_translator_t lxd_devt_translators[] = { + { "mm", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_mm }, + { "random", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_random }, + { "sy", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_sy }, + { "zcons", 0, DTT_LIST, (uintptr_t)&lxd_mtranslator_zcons }, + { LX_PTM_DRV, 0, DTT_CUSTOM, (uintptr_t)lxd_ptm_devt_translator }, + { "pts", 0, DTT_CUSTOM, (uintptr_t)lxd_pts_devt_translator }, + { NULL, 0, DTT_INVALID, (uintptr_t)NULL } +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + if 
(lxd_mountcount > 0) + return (EBUSY); + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(lxd_fstype); + vn_freevnodeops(lxd_vnodeops); + mutex_destroy(&lxd_xlate_lock); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * Initialize global locks, etc. Called when loading lxd module. + */ +static int +lxd_init(int fstype, char *name) +{ + static const fs_operation_def_t lxd_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxd_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxd_unmount }, + VFSNAME_ROOT, { .vfs_root = lxd_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxd_statvfs }, + VFSNAME_FREEVFS, { .vfs_freevfs = lxd_freevfs }, + NULL, NULL + }; + extern const struct fs_operation_def lxd_vnodeops_template[]; + int error; + major_t dev; + + lxd_fstype = fstype; + ASSERT(lxd_fstype != 0); + + error = vfs_setfsops(fstype, lxd_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxd_init: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, lxd_vnodeops_template, &lxd_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxd_init: bad vnode ops template"); + return (error); + } + + /* + * lxd_minfree doesn't need to be some function of configured + * swap space since it really is an absolute limit of swap space + * which still allows other processes to execute. + */ + if (lxd_minfree == 0) { + /* Set if not patched */ + lxd_minfree = btopr(LXDMINFREE); + } + + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxd_init: Can't get unique device number."); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxd_dev = makedevice(dev, 0); + + mutex_init(&lxd_xlate_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +/* + * Initialize device translator mapping table. 
+ * + * Note that we cannot do this in lxd_init since that can lead to a recursive + * rw_enter while we're doing lookupnameat (via sdev_lookup/prof_make_maps/ + * devi_attach_node/modload). Thus we do it in the mount path and keep track + * so that we only initialize the table once. + */ +static void +lxd_xlate_init() +{ + int i; + + mutex_enter(&lxd_xlate_lock); + if (lxd_xlate_initialized) { + mutex_exit(&lxd_xlate_lock); + return; + } + + for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) { + lxd_minor_translator_t *mt; + int j; + + lxd_devt_translators[i].lxd_xl_major = + mod_name_to_major(lxd_devt_translators[i].lxd_xl_driver); + + /* if this translator doesn't use a list mapping we're done. */ + if (lxd_devt_translators[i].lxd_xl_type != DTT_LIST) + continue; + + /* for each device listed, lookup the minor node number */ + mt = lxd_devt_translators[i].xl_list; + for (j = 0; mt[j].lxd_mt_path != NULL; j++) { + vnode_t *vp; + struct vattr va; + char *tpath; + char tnm[MAXPATHLEN]; + + /* + * The attach might be triggered in either the global + * zone or in a non-global zone, so we may need to + * adjust the path if we're in a NGZ. 
+ */ + if (curproc->p_zone->zone_id == GLOBAL_ZONEUNIQID) { + tpath = mt[j].lxd_mt_path; + } else { + (void) snprintf(tnm, sizeof (tnm), "/native%s", + mt[j].lxd_mt_path); + tpath = tnm; + } + + if (lookupnameat(tpath, UIO_SYSSPACE, FOLLOW, NULL, + &vp, NULL) != 0) { + mt[j].lxd_mt_minor = UINT_MAX; + continue; + } + + va.va_mask = AT_RDEV; + if (VOP_GETATTR(vp, &va, 0, kcred, NULL) != 0) { + va.va_rdev = NODEV; + } else { + ASSERT(getmajor(va.va_rdev) == + lxd_devt_translators[i].lxd_xl_major); + ASSERT(mt[j].lxd_mt_lx_minor < LX_MAXMIN); + } + + mt[j].lxd_mt_minor = getminor(va.va_rdev); + + VN_RELE(vp); + } + } + + lxd_xlate_initialized = B_TRUE; + mutex_exit(&lxd_xlate_lock); +} + +static int +lxd_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + lxd_mnt_t *lxdm = NULL; + struct lxd_node *ldn; + struct pathname dpn; + int error; + int i; + int nodev; + struct vattr rattr; + vnode_t *realrootvp; + vnode_t *tvp; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + vattr_t vattr; + + nodev = vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL); + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + lxd_xlate_init(); + + /* + * This is the same behavior as with lofs. + * Loopback devices which get "nodevices" added can be done without + * "nodevices" set because we cannot import devices into a zone + * with loopback. Note that we have all zone privileges when + * this happens; if not, we'd have gotten "nosuid". + */ + if (!nodev && vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + vfs_setmntopt(vfsp, MNTOPT_DEVICES, NULL, VFS_NODISPLAY); + + /* + * Only allow mounting within lx zones. 
+ */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (EINVAL); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* lxd doesn't support read-only mounts */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + error = EINVAL; + goto out; + } + + error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (error != 0) + goto out; + + /* + * Find real root + */ + if ((error = lookupname(uap->spec, (uap->flags & MS_SYSSPACE) ? + UIO_SYSSPACE : UIO_USERSPACE, FOLLOW, NULLVPP, &realrootvp))) { + pn_free(&dpn); + return (error); + } + + if ((error = VOP_ACCESS(realrootvp, 0, 0, cr, NULL)) != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (error); + } + + /* If realroot is not a devfs, error out */ + if (strcmp(realrootvp->v_op->vnop_name, "dev") != 0) { + pn_free(&dpn); + VN_RELE(realrootvp); + return (EINVAL); + } + + lxdm = kmem_zalloc(sizeof (*lxdm), KM_SLEEP); + + /* init but don't bother entering the mutex (not on mount list yet) */ + mutex_init(&lxdm->lxdm_contents, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxdm->lxdm_renamelck, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxdm->lxdm_attrlck, NULL, MUTEX_DEFAULT, NULL); + + list_create(&lxdm->lxdm_devattrs, sizeof (lxd_dev_attr_t), + offsetof(lxd_dev_attr_t, lxda_link)); + + /* Initialize the hash table mutexes */ + for (i = 0; i < LXD_HASH_SZ; i++) { + mutex_init(&lxdm->lxdm_hash_mutex[i], NULL, MUTEX_DEFAULT, + NULL); + } + + lxdm->lxdm_vfsp = vfsp; + lxdm->lxdm_gen = 1; /* start inode counter at 1 */ + + vfsp->vfs_data = (caddr_t)lxdm; + vfsp->vfs_fstype = lxd_fstype; + vfsp->vfs_dev = lxd_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, lxd_dev, lxd_fstype); + lxdm->lxdm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, 
KM_SLEEP); + (void) strcpy(lxdm->lxdm_mntpath, dpn.pn_path); + + /* allocate and initialize root lxd_node structure */ + bzero(&rattr, sizeof (struct vattr)); + rattr.va_mode = (mode_t)(S_IFDIR | 0755); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + + tvp = lxd_make_back_node(realrootvp, lxdm); + ldn = VTOLDN(tvp); + + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + LDNTOV(ldn)->v_flag |= VROOT; + + /* + * initialize linked list of lxd_nodes so that the back pointer of + * the root lxd_node always points to the last one on the list + * and the forward pointer of the last node is null + */ + ldn->lxdn_prev = ldn; + ldn->lxdn_next = NULL; + ldn->lxdn_nlink = 0; + lxdm->lxdm_rootnode = ldn; + + ldn->lxdn_nodeid = lxdm->lxdm_gen++; + lxd_dirinit(ldn, ldn); + + rw_exit(&ldn->lxdn_rwlock); + + pn_free(&dpn); + error = 0; + atomic_inc_32(&lxd_mountcount); + + lxzdata = ztolxzd(curproc->p_zone); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vattr.va_mask = AT_TYPE | AT_MODE; + vattr.va_type = VLNK; + vattr.va_mode = 0777; + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZVOL) { + char lnknm[MAXPATHLEN]; + + /* Create a symlink for the actual zvol. */ + (void) snprintf(lnknm, sizeof (lnknm), + "./zvol/dsk/%s", vd->lxvd_real_name); + (void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr, + lnknm, cr, NULL, 0); + } else if (vd->lxvd_type == LXVD_ZFS_DS) { + /* + * Create a symlink for the root "disk" using /dev/zfs + * as the target device. + */ + (void) lxd_symlink(LDNTOV(ldn), vd->lxvd_name, &vattr, + "./zfs", cr, NULL, 0); + } + + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + /* Apply any persistent attribute changes. 
*/ + lxd_apply_db(lxdm); + +out: + if (error == 0) + vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); + + return (error); +} + +static int +lxd_unmount(struct vfs *vfsp, int flag, struct cred *cr) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn, *cancel; + struct vnode *vp; + int error; + uint_t cnt; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + + mutex_enter(&lxdm->lxdm_contents); + + /* + * In the normal unmount case only the root node would have a reference + * count. + * + * With lxdm_contents held, nothing can be added or removed. + * If we find a previously referenced node, undo the holds we have + * placed and fail EBUSY. + */ + ldn = lxdm->lxdm_rootnode; + + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + + if (flag & MS_FORCE) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EINVAL); + } + + cnt = vp->v_count; + if (cnt > 1) { + mutex_exit(&vp->v_lock); + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } + + mutex_exit(&vp->v_lock); + + /* + * Check for open files. An open file causes everything to unwind. + */ + for (ldn = ldn->lxdn_next; ldn; ldn = ldn->lxdn_next) { + vp = LDNTOV(ldn); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); + cancel = lxdm->lxdm_rootnode->lxdn_next; + while (cancel != ldn) { + vp = LDNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->lxdn_next; + } + mutex_exit(&lxdm->lxdm_contents); + return (EBUSY); + } else { + /* + * It may seem incorrect for us to have a vnode with + * a count of 0, but this is modeled on tmpfs and works + * the same way. See lxd_front_inactive. There we allow + * the v_count to go to 0 but rely on the link count to + * keep the vnode alive. 
Since we now want to cleanup + * these vnodes we manually add a VN_HOLD so that the + * VN_RELEs that occur in the lxd_freevfs() cleanup + * will take us down the lxd_inactive code path. We + * can directly add a VN_HOLD since we have the lock. + */ + vp->v_count++; + mutex_exit(&vp->v_lock); + } + } + + /* + * We can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&lxdm->lxdm_contents); + + return (0); +} + +/* + * Implementation of VFS_FREEVFS(). This is called by the vfs framework after + * umount and the last VFS_RELE, to trigger the release of any resources still + * associated with the given vfs_t. This is normally called immediately after + * lxd_unmount. + */ +void +lxd_freevfs(vfs_t *vfsp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn; + struct vnode *vp; + lxd_dev_attr_t *da; + + /* + * Free all kmemalloc'd and anonalloc'd memory associated with + * this filesystem. To do this, we go through the file list twice, + * once to remove all the directory entries, and then to remove + * all the pseudo files. + */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the lxd_mnt_t that + * says we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + + /* + * Remove all directory entries (this doesn't remove top-level dirs). + */ + for (ldn = lxdm->lxdm_rootnode; ldn; ldn = ldn->lxdn_next) { + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + if (ldn->lxdn_vnode->v_type == VDIR) + lxd_dirtrunc(ldn); + rw_exit(&ldn->lxdn_rwlock); + } + + ASSERT(lxdm->lxdm_rootnode != NULL); + + /* + * All links are gone, v_count is keeping nodes in place. + * VN_RELE should make the node disappear, unless somebody + * is holding pages against it. 
Nap and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a + * lxd_node from blowing it away (in lxd_inactive) while we're trying + * to get to it here. Once we have a HOLD on it we know it'll stick + * around. + */ + mutex_enter(&lxdm->lxdm_contents); + + /* + * Remove all the files (except the rootnode) backwards. + */ + while ((ldn = lxdm->lxdm_rootnode->lxdn_prev) != lxdm->lxdm_rootnode) { + mutex_exit(&lxdm->lxdm_contents); + /* + * All nodes will be released here. Note we handled the link + * count above. + */ + vp = LDNTOV(ldn); + ASSERT(vp->v_type == VLNK || vp->v_type == VDIR || + vp->v_type == VSOCK); + VN_RELE(vp); + mutex_enter(&lxdm->lxdm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again - we know + * they'll give it up soon. + */ + if (ldn == lxdm->lxdm_rootnode->lxdn_prev) { + VN_HOLD(vp); + mutex_exit(&lxdm->lxdm_contents); + delay(hz / 4); + mutex_enter(&lxdm->lxdm_contents); + } + } + mutex_exit(&lxdm->lxdm_contents); + + ASSERT(lxdm->lxdm_back_refcnt == 1); + ASSERT(lxdm->lxdm_dent_refcnt == 0); + + VN_RELE(LDNTOV(lxdm->lxdm_rootnode)); + + ASSERT(lxdm->lxdm_mntpath != NULL); + kmem_free(lxdm->lxdm_mntpath, strlen(lxdm->lxdm_mntpath) + 1); + + da = list_remove_head(&lxdm->lxdm_devattrs); + while (da != NULL) { + kmem_free(da, sizeof (lxd_dev_attr_t)); + da = list_remove_head(&lxdm->lxdm_devattrs); + } + list_destroy(&lxdm->lxdm_devattrs); + + mutex_destroy(&lxdm->lxdm_contents); + mutex_destroy(&lxdm->lxdm_renamelck); + mutex_destroy(&lxdm->lxdm_attrlck); + kmem_free(lxdm, sizeof (lxd_mnt_t)); + + /* Allow _fini() to succeed now */ + atomic_dec_32(&lxd_mountcount); +} + +/* + * return root lxdnode for given vnode + */ +static int +lxd_root(struct vfs *vfsp, struct vnode **vpp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + lxd_node_t *ldn = lxdm->lxdm_rootnode; + struct vnode *vp; + + ASSERT(ldn != NULL); + + vp 
= LDNTOV(ldn); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxd_statvfs(struct vfs *vfsp, statvfs64_t *sbp) +{ + lxd_mnt_t *lxdm = (lxd_mnt_t *)VFSTOLXDM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + zp = lxdm->lxdm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > lxd_minfree) + sbp->f_bfree = blocks - lxd_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is just what's available + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a zone with a swap cap, + * then report the capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * The maximum number of files available is approximately the number + * of lxd_nodes we can allocate from the remaining kernel memory + * available to lxdevfs in this zone. This is fairly inaccurate since + * it doesn't take into account the names stored in the directory + * entries. 
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (lxd_node_t) + sizeof (lxd_dirent_t)); + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[lxd_fstype].vsw_name); + (void) strncpy(sbp->f_fstr, lxdm->lxdm_mntpath, sizeof (sbp->f_fstr)); + /* ensure null termination */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static void +lxd_pts_devt_translator(dev_t dev, dev_t *jdev) +{ + minor_t min = getminor(dev); + int lx_maj, lx_min; + + /* + * Linux uses a range of major numbers for pts devices to address the + * relatively small minor number space (20 bits). + */ + + lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MAXMIN); + lx_min = min % LX_MAXMIN; + if (lx_maj > LX_PTS_MAJOR_MAX) { + /* + * The major is outside the acceptable range but there's little + * we can presently do about it short of overhauling the + * translation logic. + */ + lx_unsupported("pts major out of translation range"); + } + + *jdev = LX_MAKEDEVICE(lx_maj, lx_min); +} + +/* ARGSUSED */ +static void +lxd_ptm_devt_translator(dev_t dev, dev_t *jdev) +{ + *jdev = LX_MAKEDEVICE(LX_PTM_MAJOR, LX_PTM_MINOR); +} diff --git a/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c new file mode 100644 index 0000000000..c291e25797 --- /dev/null +++ b/usr/src/uts/common/brand/lx/devfs/lxd_vnops.c @@ -0,0 +1,1520 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/pathname.h> +#include <sys/debug.h> +#include <sys/sdt.h> +#include <fs/fs_subr.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <sys/lx_brand.h> +#include <sys/brand.h> + +#include "lxd.h" + +static int +lxd_open(vnode_t **vpp, int flag, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(*vpp); + vnode_t *vp = *vpp; + vnode_t *rvp; + vnode_t *oldvp; + int error; + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + oldvp = vp; + vp = rvp = REALVP(vp); + /* + * Need to hold new reference to vp since VOP_OPEN() may + * decide to release it. + */ + VN_HOLD(vp); + error = VOP_OPEN(&rvp, flag, cr, ct); + + if (!error && rvp != vp) { + /* + * the FS which we called should have released the + * new reference on vp + */ + *vpp = lxd_make_back_node(rvp, VFSTOLXDM(oldvp->v_vfsp)); + + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + else + *vpp = svp; + } + VN_RELE(oldvp); + } else { + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + } + + return (error); +} + +static int +lxd_close(vnode_t *vp, int flag, int count, offset_t offset, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_CLOSE(vp, flag, count, offset, cr, ct)); +} + +static int +lxd_read(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_READ(vp, uiop, ioflag, cr, ct)); 
+} + +static int +lxd_write(vnode_t *vp, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_WRITE(vp, uiop, ioflag, cr, ct)); +} + +static int +lxd_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, struct cred *cr, + int *rvalp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_IOCTL(vp, cmd, arg, flag, cr, rvalp, ct)); +} + +static int +lxd_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SETFL(vp, oflags, nflags, cr, ct)); +} + +/* + * Translate SunOS devt to Linux devt. 
+ */ +static void +lxd_s2l_devt(dev_t dev, dev_t *rdev) +{ + lxd_minor_translator_t *mt; + int i, j; + major_t maj = getmajor(dev); + minor_t min = getminor(dev); + + /* look for a devt translator for this major number */ + for (i = 0; lxd_devt_translators[i].lxd_xl_driver != NULL; i++) { + if (lxd_devt_translators[i].lxd_xl_major == maj) + break; + } + + if (lxd_devt_translators[i].lxd_xl_driver != NULL) { + /* try to translate the illumos devt to a linux devt */ + switch (lxd_devt_translators[i].lxd_xl_type) { + case DTT_INVALID: + ASSERT(0); + break; + + case DTT_LIST: + mt = lxd_devt_translators[i].xl_list; + for (j = 0; mt[j].lxd_mt_path != NULL; j++) { + if (mt[j].lxd_mt_minor == min) { + ASSERT(mt[j].lxd_mt_minor < LX_MAXMIN); + + /* found a translation */ + *rdev = LX_MAKEDEVICE( + mt[j].lxd_mt_lx_major, + mt[j].lxd_mt_lx_minor); + return; + } + } + break; + + case DTT_CUSTOM: + lxd_devt_translators[i].xl_custom(dev, rdev); + return; + } + } + + /* we don't have a translator for this device */ + *rdev = LX_MAKEDEVICE(maj, min); +} + +static int +lxd_getattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + int error; + vnode_t *rvp; + + if (ldn->lxdn_type == LXDNT_FRONT) { + mutex_enter(&ldn->lxdn_tlock); + + vap->va_type = vp->v_type; + vap->va_mode = ldn->lxdn_mode & MODEMASK; + vap->va_uid = ldn->lxdn_uid; + vap->va_gid = ldn->lxdn_gid; + vap->va_fsid = ldn->lxdn_fsid; + vap->va_nodeid = (ino64_t)ldn->lxdn_nodeid; + vap->va_nlink = ldn->lxdn_nlink; + vap->va_size = (u_offset_t)ldn->lxdn_size; + vap->va_atime = ldn->lxdn_atime; + vap->va_mtime = ldn->lxdn_mtime; + vap->va_ctime = ldn->lxdn_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = 0; /* no devs in front */ + vap->va_seq = ldn->lxdn_seq; + + vap->va_nblocks = (fsblkcnt64_t)btodb(ptob(btopr( + vap->va_size))); + mutex_exit(&ldn->lxdn_tlock); + return (0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + rvp = REALVP(vp); + if 
((error = VOP_GETATTR(rvp, vap, flags, cr, ct))) + return (error); + + /* Skip devt translation for native programs */ + if (curproc->p_brand != &lx_brand) { + return (0); + } else { + /* + * We also skip translation when called from the user-land + * emulation code. + */ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + if (lwpd == NULL || lwpd->br_stack_mode != LX_STACK_MODE_BRAND) + return (0); + } + + if (rvp->v_type == VCHR) { + dev_t ldev; + + lxd_s2l_devt(vap->va_rdev, &ldev); + DTRACE_PROBE3(lxd__devxl, void *, rvp, void *, vap, int, ldev); + vap->va_rdev = ldev; + } + + return (0); +} + +static int +lxd_setattr(vnode_t *vp, struct vattr *vap, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + lxd_mnt_t *lxdm = VTOLXDM(vp); + int res; + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error = 0; + struct vattr *set; + long mask = vap->va_mask; + + /* Cannot set these attributes */ + if ((mask & AT_NOSET) || (mask & AT_XVATTR) || + (mask & AT_MODE && vap->va_mode & (S_ISUID | S_ISGID)) || + (mask & AT_SIZE)) + return (EINVAL); + + mutex_enter(&ldn->lxdn_tlock); + + set = &ldn->lxdn_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. 
+ */ + error = secpolicy_vnode_setattr(cr, vp, vap, set, flags, + lxd_naccess, ldn); + if (error) { + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + if (mask & AT_MODE) { + set->va_mode &= S_IFMT; + set->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + set->va_uid = vap->va_uid; + if (mask & AT_GID) + set->va_gid = vap->va_gid; + if (mask & AT_ATIME) + set->va_atime = vap->va_atime; + if (mask & AT_MTIME) + set->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&ldn->lxdn_ctime); + + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + res = VOP_SETATTR(vp, vap, flags, cr, ct); + if (res == 0 && (vap->va_mask & (AT_MODE | AT_UID | AT_GID))) { + lxd_save_attrs(lxdm, vp); + } + return (res); +} + +static int +lxd_access(vnode_t *vp, int mode, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error; + + mutex_enter(&ldn->lxdn_tlock); + error = lxd_naccess(ldn, mode, cr); + mutex_exit(&ldn->lxdn_tlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + vp = REALVP(vp); + return (VOP_ACCESS(vp, mode, flags, cr, ct)); +} + +static int +lxd_fsync(vnode_t *vp, int syncflag, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (0); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FSYNC(vp, syncflag, cr, ct)); +} + +/* ARGSUSED */ +static void +lxd_front_inactive(struct vnode *vp, struct cred *cred, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + lxd_mnt_t *lxdm = VTOLXDM(vp); + + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + mutex_enter(&ldn->lxdn_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + 
/* + * If we don't have the last hold or the link count is non-zero, + * there's little to do -- just drop our hold. + */ + if (vp->v_count > 1 || ldn->lxdn_nlink != 0) { + vp->v_count--; + + mutex_exit(&vp->v_lock); + mutex_exit(&ldn->lxdn_tlock); + rw_exit(&ldn->lxdn_rwlock); + return; + } + + /* + * We have the last hold *and* the link count is zero, so this node is + * dead from the filesystem's viewpoint. + */ + if (ldn->lxdn_size != 0) { + if (ldn->lxdn_vnode->v_type == VLNK) + kmem_free(ldn->lxdn_symlink, ldn->lxdn_size + 1); + } + + mutex_exit(&vp->v_lock); + mutex_exit(&ldn->lxdn_tlock); + + vn_invalid(LDNTOV(ldn)); + + mutex_enter(&lxdm->lxdm_contents); + if (ldn->lxdn_next == NULL) + lxdm->lxdm_rootnode->lxdn_prev = ldn->lxdn_prev; + else + ldn->lxdn_next->lxdn_prev = ldn->lxdn_prev; + ldn->lxdn_prev->lxdn_next = ldn->lxdn_next; + + mutex_exit(&lxdm->lxdm_contents); + rw_exit(&ldn->lxdn_rwlock); + rw_destroy(&ldn->lxdn_rwlock); + mutex_destroy(&ldn->lxdn_tlock); + + vn_free(LDNTOV(ldn)); + kmem_free(ldn, sizeof (lxd_node_t)); +} + +/*ARGSUSED*/ +static void +lxd_inactive(vnode_t *vp, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + lxd_front_inactive(vp, cr, ct); + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + lxd_free_back_node(ldn); +} + +/* ARGSUSED */ +static int +lxd_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) + return (ENOTSUP); + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FID(vp, fidp, ct)); +} + +/* + * For a front node lookup in the dirent hash table and return a shadow vnode + * (lxd_node_t type) of type LXDNT_FRONT. + * + * For a back node, lookup nm name and return a shadow vnode (lxd_node_t type) + * of the real vnode found. 
+ */ +static int +lxd_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, struct cred *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + vnode_t *vp = NULL; + int error; + vnode_t *realdvp; + lxd_mnt_t *lxdm = VTOLXDM(dvp); + int doingdotdot = 0; + lxd_node_t *ldn = VTOLDN(dvp); + lxd_node_t *nldn = NULL; + + /* + * First check for front file which could be instantiated on either a + * front or back node (e.g. the top-level mount point directory node is + * a back node which can have front files created in it). + */ + + /* disallow extended attrs */ + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for dir being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + + rw_enter(&ldn->lxdn_rwlock, RW_READER); + error = lxd_dirlookup(ldn, nm, &nldn, cr); + rw_exit(&ldn->lxdn_rwlock); + + if (error == 0) { + /* found */ + ASSERT(nldn != NULL); + *vpp = LDNTOV(nldn); + return (0); + } + + /* At this point, if dir node is a front node, error */ + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOENT); + } + + realdvp = REALVP(dvp); + + if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { + doingdotdot++; + /* + * Handle ".." out of mounted filesystem + */ + while ((realdvp->v_flag & VROOT) && realdvp != rootdir) { + realdvp = realdvp->v_vfsp->vfs_vnodecovered; + ASSERT(realdvp != NULL); + } + } + + *vpp = NULL; /* default(error) case */ + + /* + * Do the normal lookup + */ + if ((error = VOP_LOOKUP(realdvp, nm, &vp, pnp, flags, rdir, cr, + ct, direntflags, realpnp)) != 0) { + vp = NULL; + goto out; + } + + /* + * We do this check here to avoid returning a stale file handle to the + * caller. + */ + if (nm[0] == '.' 
&& nm[1] == '\0') { + ASSERT(vp == realdvp); + VN_HOLD(dvp); + VN_RELE(vp); + *vpp = dvp; + return (0); + } + + if (doingdotdot) { + *vpp = lxd_make_back_node(vp, lxdm); + return (0); + } + + /* + * If this vnode is mounted on, then we + * traverse to the vnode which is the root of + * the mounted file system. + */ + if ((error = traverse(&vp)) != 0) + goto out; + + /* + * Make a lxd node for the real vnode. + */ + *vpp = lxd_make_back_node(vp, lxdm); + if (vp->v_type != VDIR) { + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) { + VN_RELE(vp); + error = ENOSYS; + } else { + *vpp = svp; + } + } + return (error); + } + +out: + if (error != 0 && vp != NULL) + VN_RELE(vp); + + return (error); +} + +/*ARGSUSED*/ +static int +lxd_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, + int mode, vnode_t **vpp, struct cred *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + int error; + lxd_node_t *parent = VTOLDN(dvp); + lxd_node_t *lnp = NULL; + + rw_enter(&parent->lxdn_rwlock, RW_READER); + error = lxd_dirlookup(parent, nm, &lnp, cr); + rw_exit(&parent->lxdn_rwlock); + + /* + * If a back node already exists then there is no need to pass + * the create to native devfs -- just set the vpp to the back + * vnode. If the front node already exists then fail because + * it can't represent a regular file. In both cases, enforce + * open(2)'s EEXIST and EISDIR semantics. + */ + if (error == 0) { + if (exclusive == EXCL) { + error = EEXIST; + } else if (LDNTOV(lnp)->v_type == VDIR && + (mode & S_IWRITE)) { + error = EISDIR; + } else if (lnp->lxdn_type == LXDNT_FRONT) { + error = ENOTSUP; + } + + if (error != 0) { + ldnode_rele(lnp); + return (error); + } + + VERIFY3S(lnp->lxdn_type, ==, LXDNT_BACK); + *vpp = lnp->lxdn_vnode; + + return (error); + } + + /* + * We cannot create files in the back devfs but we want to allow for + * O_CREAT on existing files. 
Pass this through and let the back file + * system allow or deny it. + */ + if (parent->lxdn_type == LXDNT_BACK) { + vnode_t *vp = NULL; + + if (*nm == '\0') { + ASSERT(vpp && dvp == *vpp); + vp = REALVP(*vpp); + } + if ((error = VOP_CREATE(REALVP(dvp), nm, va, exclusive, mode, + &vp, cr, flag, ct, vsecp)) == 0) { + *vpp = lxd_make_back_node(vp, VFSTOLXDM(dvp->v_vfsp)); + if (IS_DEVVP(*vpp)) { + vnode_t *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, + (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) { + return (ENOSYS); + } + *vpp = svp; + } + return (0); + } + /* + * If we were unable to perform the VOP_CREATE for any reason + * other than sdev being read-only, we should bail. + */ + if (error != ENOTSUP && error != EROFS) { + return (error); + } + } + + /* + * While we don't allow creating data-containing files under + * lx devfs, we must allow VSOCK front nodes to be created so + * that paths such as /dev/log can be used as AF_UNIX sockets. + */ + if (va->va_type == VSOCK) { + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + + lnp = NULL; + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL, + va, &lnp, cr); + rw_exit(&parent->lxdn_rwlock); + + if (error == 0) { + *vpp = LDNTOV(lnp); + } else if (lnp != NULL) { + /* + * It's possible that a racing process created an entry + * at this name since we last performed the lookup. 
+ */ + ldnode_rele(lnp); + } + } else { + error = ENOTSUP; + } + + return (error); +} + +/* ARGSUSED */ +static int +lxd_remove(vnode_t *dvp, char *nm, struct cred *cr, caller_context_t *ct, + int flags) +{ + lxd_node_t *parent = VTOLDN(dvp); + lxd_node_t *ldn = NULL; + int error; + + /* can only remove existing front nodes */ + error = lxd_dirlookup(parent, nm, &ldn, cr); + if (error) { + return (error); + } + + ASSERT(ldn != NULL); + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + error = lxd_dirdelete(parent, ldn, nm, DR_REMOVE, cr); + + rw_exit(&ldn->lxdn_rwlock); + rw_exit(&parent->lxdn_rwlock); + + ldnode_rele(ldn); + + return (error); +} + +/* ARGSUSED */ +static int +lxd_link(vnode_t *tdvp, vnode_t *vp, char *tnm, struct cred *cr, + caller_context_t *ct, int flags) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static int +lxd_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, struct cred *cr, + caller_context_t *ct, int flags) +{ + lxd_node_t *oldparent = VTOLDN(odvp); + lxd_node_t *newparent; + lxd_mnt_t *lxdm = VTOLXDM(oldparent->lxdn_vnode); + lxd_node_t *fromnode = NULL; + int error; + int samedir = 0; + + if (!vn_matchops(ndvp, lxd_vnodeops)) { + /* cannot rename out of this file system */ + return (EACCES); + } + + mutex_enter(&lxdm->lxdm_renamelck); + + newparent = VTOLDN(ndvp); + + /* + * We can only rename front nodes. + */ + error = lxd_dirlookup(oldparent, onm, &fromnode, cr); + if (error != 0) { + /* not found in front */ + mutex_exit(&lxdm->lxdm_renamelck); + return (error); + } + + /* + * Make sure we can delete the old (source) entry. This + * requires write permission on the containing directory. If + * that directory is "sticky" it requires further checks. + */ + if ((error = lxd_naccess(oldparent, VWRITE, cr)) != 0) + goto done; + + /* + * Check for renaming to or from '.' or '..' or that + * fromnode == oldparent + */ + if ((onm[0] == '.' 
&& + (onm[1] == '\0' || (onm[1] == '.' && onm[2] == '\0'))) || + (nnm[0] == '.' && + (nnm[1] == '\0' || (nnm[1] == '.' && nnm[2] == '\0'))) || + (oldparent == fromnode)) { + error = EINVAL; + goto done; + } + + samedir = (oldparent == newparent); + + /* + * Make sure we can search and rename into the destination directory. + */ + if (!samedir) { + if ((error = lxd_naccess(newparent, VEXEC|VWRITE, cr)) != 0) + goto done; + } + + /* + * Link source to new target + */ + rw_enter(&newparent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, newparent, nnm, DE_RENAME, + oldparent, fromnode, (struct vattr *)NULL, (lxd_node_t **)NULL, + cr); + rw_exit(&newparent->lxdn_rwlock); + + if (error) + goto done; + + /* + * Unlink from source. + */ + rw_enter(&oldparent->lxdn_rwlock, RW_WRITER); + rw_enter(&fromnode->lxdn_rwlock, RW_WRITER); + + error = lxd_dirdelete(oldparent, fromnode, onm, DR_RENAME, cr); + + /* + * The following handles the case where our source node was + * removed before we got to it. 
+ */ + if (error == ENOENT) + error = 0; + + rw_exit(&fromnode->lxdn_rwlock); + rw_exit(&oldparent->lxdn_rwlock); + +done: + ldnode_rele(fromnode); + mutex_exit(&lxdm->lxdm_renamelck); + return (error); +} + +/* ARGSUSED */ +static int +lxd_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, + struct cred *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + int error; + vnode_t *tvp; + lxd_node_t *ndir = NULL; + lxd_node_t *parent = VTOLDN(dvp); + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + + /* check for existence in both front and back */ + if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + /* The entry already exists */ + VN_RELE(tvp); + return (EEXIST); + } + + /* make front directory */ + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_MKDIR, NULL, NULL, + va, &ndir, cr); + rw_exit(&parent->lxdn_rwlock); + + if (error != 0) { + if (ndir != NULL) + ldnode_rele(ndir); + } else { + *vpp = LDNTOV(ndir); + } + + return (error); +} + +static int +lxd_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + *vpp = vp; + return (0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + while (vn_matchops(vp, lxd_vnodeops)) + vp = REALVP(vp); + + if (VOP_REALVP(vp, vpp, ct) != 0) + *vpp = vp; + return (0); +} + +/* ARGSUSED */ +static int +lxd_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, struct cred *cr, + caller_context_t *ct, int flags) +{ + int error; + lxd_node_t *ldn; + struct vnode *vp; + lxd_node_t *parent = VTOLDN(dvp); + + /* + * Return error if trying to remove . or .. 
+ */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); + + error = lxd_dirlookup(VTOLDN(dvp), nm, &ldn, cr); + if (error != 0) { + /* not found in front */ + return (error); + } + + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + + vp = LDNTOV(ldn); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto err; + } + + if (ldn->lxdn_vnode->v_type != VDIR) { + error = ENOTDIR; + goto err; + } + + mutex_enter(&ldn->lxdn_tlock); + if (ldn->lxdn_nlink > 2) { + mutex_exit(&ldn->lxdn_tlock); + error = EEXIST; + goto err; + } + mutex_exit(&ldn->lxdn_tlock); + + /* Check for an empty directory */ + if (ldn->lxdn_dirents > 2) { + error = EEXIST; + gethrestime(&ldn->lxdn_atime); + goto err; + } + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto err; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + vn_vfsunlock(vp); + goto err; + } + + error = lxd_dirdelete(parent, ldn, nm, DR_RMDIR, cr); + vn_vfsunlock(vp); + +err: + rw_exit(&ldn->lxdn_rwlock); + rw_exit(&parent->lxdn_rwlock); + ldnode_rele(ldn); + + return (error); +} + +/* Not static so it can be used during mount. 
*/ +/* ARGSUSED */ +int +lxd_symlink(vnode_t *dvp, char *nm, struct vattr *tva, char *tnm, + struct cred *cr, caller_context_t *ct, int flags) +{ + lxd_node_t *parent = VTOLDN(dvp); + lxd_mnt_t *lxdm = VTOLXDM(parent->lxdn_vnode); + lxd_node_t *self = NULL; + vnode_t *tvp; + char *cp = NULL; + int error; + size_t len; + + /* this will check for existence in both front and back */ + if (lxd_lookup(dvp, nm, &tvp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + /* The entry already exists */ + VN_RELE(tvp); + return (EEXIST); + } + + /* make symlink in the front */ + rw_enter(&parent->lxdn_rwlock, RW_WRITER); + error = lxd_direnter(lxdm, parent, nm, DE_CREATE, NULL, NULL, + tva, &self, cr); + rw_exit(&parent->lxdn_rwlock); + + if (error) { + if (self != NULL) + ldnode_rele(self); + return (error); + } + + len = strlen(tnm) + 1; + cp = kmem_alloc(len, KM_NOSLEEP_LAZY); + if (cp == NULL) { + ldnode_rele(self); + return (ENOSPC); + } + (void) strcpy(cp, tnm); + + self->lxdn_symlink = cp; + self->lxdn_size = len - 1; + ldnode_rele(self); + + return (error); +} + +static int +lxd_readlink(vnode_t *vp, struct uio *uiop, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + int error; + + if (vp->v_type != VLNK) + return (EINVAL); + + rw_enter(&ldn->lxdn_rwlock, RW_READER); + error = uiomove(ldn->lxdn_symlink, ldn->lxdn_size, UIO_READ, + uiop); + gethrestime(&ldn->lxdn_atime); + rw_exit(&ldn->lxdn_rwlock); + return (error); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_READLINK(vp, uiop, cr, ct)); +} + +static int +lx_merge_front(vnode_t *vp, struct uio *uiop, off_t req_off, int *eofp) +{ + lxd_node_t *ldn = VTOLDN(vp); + struct dirent *sd; + lxd_dirent_t *ldp; + enum lxd_node_type type = ldn->lxdn_type; + ssize_t uresid; + off_t front_off; + int error = 0; + int sdlen; + + /* skip the front entries if the back read was incomplete */ + if (*eofp == 0) + return (0); + + /* + * If 
this was a back node then reading that node has completed and we + * may have a partially full uio struct. eof should be set to true. + * Leave it set since we're likely to hit eof for the front nodes (if + * any). + */ + + front_off = uiop->uio_offset + 1; + sdlen = sizeof (struct dirent) + MAXPATHLEN; + /* zalloc to ensure we don't have anything in the d_name buffer */ + sd = (struct dirent *)kmem_zalloc(sdlen, KM_SLEEP); + ldp = ldn->lxdn_dir; + while (ldp != NULL && (uresid = uiop->uio_resid) > 0) { + int namelen; + int reclen; + + /* + * Skip dot and dotdot for back nodes since we have them + * already. + */ + if (type == LXDNT_BACK && + (strcmp(ldp->lddir_name, ".") == 0 || + strcmp(ldp->lddir_name, "..") == 0)) { + ldp = ldp->lddir_next; + continue; + } + + /* + * Might have previously had a partial readdir of the front + * nodes, and now we're back for more, or we may just be + * doing a follow-up readdir after we've previously + * returned all front and back nodes. + */ + if (front_off > req_off) { + namelen = strlen(ldp->lddir_name); /* no +1 needed */ + reclen = (int)DIRENT64_RECLEN(namelen); + + /* + * If the size of the data to transfer is greater + * than that requested, then we can't do it this + * transfer. + */ + if (reclen > uresid) { + *eofp = 0; + /* Buffer too small for any entries. 
*/ + if (front_off == 0) + error = EINVAL; + break; + } + + (void) strncpy(sd->d_name, ldp->lddir_name, + DIRENT64_NAMELEN(reclen)); + sd->d_reclen = (ushort_t)reclen; + sd->d_ino = (ino_t)ldp->lddir_node->lxdn_nodeid; + sd->d_off = front_off; + + /* uiomove will adjust iov_base properly */ + if ((error = uiomove((caddr_t)sd, reclen, UIO_READ, + uiop)) != 0) { + *eofp = 0; + break; + } + } + + /* + * uiomove() above updates both uio_resid and uio_offset by the + * same amount but we want uio_offset to change in increments + * of 1, which is different from the number of bytes being + * returned to the caller, so we set uio_offset explicitly, + * ignoring what uiomove() did. + */ + uiop->uio_offset = front_off; + front_off++; + + ldp = ldp->lddir_next; + } + + kmem_free(sd, sdlen); + return (error); +} + +static int +lxd_readdir(vnode_t *vp, struct uio *uiop, struct cred *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxd_node_t *ldn = VTOLDN(vp); + vnode_t *rvp; + int res; + off_t req_off; + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + req_off = uiop->uio_offset; + + /* First read the back node (if it is one) */ + if (ldn->lxdn_type == LXDNT_BACK) { + rvp = REALVP(vp); + res = VOP_READDIR(rvp, uiop, cr, eofp, ct, flags); + if (res != 0) + return (res); + } else { + /* setup for merge_front */ + ASSERT(ldn->lxdn_type == LXDNT_FRONT); + /* caller should have already called lxd_rwlock */ + ASSERT(RW_READ_HELD(&ldn->lxdn_rwlock)); + + *eofp = 1; + /* + * The merge code starts the offset calculation from uio_offset, + * which is normally already set to the high value by the back + * code, but in this case we need to count up from 0. + */ + uiop->uio_offset = 0; + } + + /* + * Our back nodes can also have front entries hanging on them so we + * need to merge those in. Or, we may simply have a front node (i.e. a + * front subdir). 
+ */ + res = lx_merge_front(vp, uiop, req_off, eofp); + return (res); +} + +static int +lxd_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + if (write_lock) { + rw_enter(&ldn->lxdn_rwlock, RW_WRITER); + } else { + rw_enter(&ldn->lxdn_rwlock, RW_READER); + } + return (write_lock); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_RWLOCK(vp, write_lock, ct)); +} + +static void +lxd_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + rw_exit(&ldn->lxdn_rwlock); + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + VOP_RWUNLOCK(vp, write_lock, ct); +} + +static int +lxd_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SEEK(vp, ooff, noffp, ct)); +} + +static int +lxd_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + while (vn_matchops(vp1, lxd_vnodeops) && + VTOLDN(vp1)->lxdn_type == LXDNT_BACK) { + vp1 = REALVP(vp1); + } + while (vn_matchops(vp2, lxd_vnodeops) && + VTOLDN(vp2)->lxdn_type == LXDNT_BACK) { + vp2 = REALVP(vp2); + } + + if (vn_matchops(vp1, lxd_vnodeops) || vn_matchops(vp2, lxd_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +static int +lxd_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, offset_t offset, + struct flk_callback *flk_cbp, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_FRLOCK(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); +} + +static int +lxd_space(vnode_t *vp, int cmd, struct 
flock64 *bfp, int flag, offset_t offset, + struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SPACE(vp, cmd, bfp, flag, offset, cr, ct)); +} + +static int +lxd_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *prot, + struct page *parr[], size_t psz, struct seg *seg, caddr_t addr, + enum seg_rw rw, struct cred *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_GETPAGE(vp, off, len, prot, parr, psz, seg, addr, rw, cr, + ct)); +} + +static int +lxd_putpage(vnode_t *vp, offset_t off, size_t len, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PUTPAGE(vp, off, len, flags, cr, ct)); +} + +static int +lxd_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, + uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_MAP(vp, off, as, addrp, len, prot, maxprot, flags, cr, ct)); +} + +static int +lxd_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, + uchar_t prot, uchar_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_ADDMAP(vp, off, as, addr, len, prot, maxprot, flags, cr, + ct)); +} + +static int +lxd_delmap(vnode_t *vp, offset_t off, struct as *as, 
caddr_t addr, size_t len, + uint_t prot, uint_t maxprot, uint_t flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_DELMAP(vp, off, as, addr, len, prot, maxprot, flags, cr, + ct)); +} + +static int +lxd_poll(vnode_t *vp, short events, int anyyet, short *reventsp, + struct pollhead **phpp, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_POLL(vp, events, anyyet, reventsp, phpp, ct)); +} + +static int +lxd_dump(vnode_t *vp, caddr_t addr, offset_t bn, offset_t count, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_DUMP(vp, addr, bn, count, ct)); +} + +static int +lxd_pathconf(vnode_t *vp, int cmd, ulong_t *valp, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PATHCONF(vp, cmd, valp, cr, ct)); +} + +static int +lxd_pageio(vnode_t *vp, struct page *pp, u_offset_t io_off, size_t io_len, + int flags, cred_t *cr, caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_PAGEIO(vp, pp, io_off, io_len, flags, cr, ct)); +} + +static void +lxd_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return; + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + if (vp != NULL && !VN_ISKAS(vp)) + 
VOP_DISPOSE(vp, pp, fl, dn, cr, ct); +} + +static int +lxd_setsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOSYS); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + if (vn_is_readonly(vp)) + return (EROFS); + + vp = REALVP(vp); + return (VOP_SETSECATTR(vp, secattr, flags, cr, ct)); +} + +static int +lxd_getsecattr(vnode_t *vp, vsecattr_t *secattr, int flags, struct cred *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (ENOSYS); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_GETSECATTR(vp, secattr, flags, cr, ct)); +} + +static int +lxd_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, + caller_context_t *ct) +{ + lxd_node_t *ldn = VTOLDN(vp); + + if (ldn->lxdn_type == LXDNT_FRONT) { + return (EINVAL); + } + + ASSERT(ldn->lxdn_type == LXDNT_BACK); + vp = REALVP(vp); + return (VOP_SHRLOCK(vp, cmd, shr, flag, cr, ct)); +} + +/* + * Loopback vnode operations vector. 
+ */ + +struct vnodeops *lxd_vnodeops; + +const fs_operation_def_t lxd_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxd_open }, + VOPNAME_CLOSE, { .vop_close = lxd_close }, + VOPNAME_READ, { .vop_read = lxd_read }, + VOPNAME_WRITE, { .vop_write = lxd_write }, + VOPNAME_IOCTL, { .vop_ioctl = lxd_ioctl }, + VOPNAME_SETFL, { .vop_setfl = lxd_setfl }, + VOPNAME_GETATTR, { .vop_getattr = lxd_getattr }, + VOPNAME_SETATTR, { .vop_setattr = lxd_setattr }, + VOPNAME_ACCESS, { .vop_access = lxd_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxd_lookup }, + VOPNAME_CREATE, { .vop_create = lxd_create }, + VOPNAME_REMOVE, { .vop_remove = lxd_remove }, + VOPNAME_LINK, { .vop_link = lxd_link }, + VOPNAME_RENAME, { .vop_rename = lxd_rename }, + VOPNAME_MKDIR, { .vop_mkdir = lxd_mkdir }, + VOPNAME_RMDIR, { .vop_rmdir = lxd_rmdir }, + VOPNAME_READDIR, { .vop_readdir = lxd_readdir }, + VOPNAME_SYMLINK, { .vop_symlink = lxd_symlink }, + VOPNAME_READLINK, { .vop_readlink = lxd_readlink }, + VOPNAME_FSYNC, { .vop_fsync = lxd_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = lxd_inactive }, + VOPNAME_FID, { .vop_fid = lxd_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = lxd_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = lxd_rwunlock }, + VOPNAME_SEEK, { .vop_seek = lxd_seek }, + VOPNAME_CMP, { .vop_cmp = lxd_cmp }, + VOPNAME_FRLOCK, { .vop_frlock = lxd_frlock }, + VOPNAME_SPACE, { .vop_space = lxd_space }, + VOPNAME_REALVP, { .vop_realvp = lxd_realvp }, + VOPNAME_GETPAGE, { .vop_getpage = lxd_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = lxd_putpage }, + VOPNAME_MAP, { .vop_map = lxd_map }, + VOPNAME_ADDMAP, { .vop_addmap = lxd_addmap }, + VOPNAME_DELMAP, { .vop_delmap = lxd_delmap }, + VOPNAME_POLL, { .vop_poll = lxd_poll }, + VOPNAME_DUMP, { .vop_dump = lxd_dump }, + VOPNAME_DUMPCTL, { .error = fs_error }, + VOPNAME_PATHCONF, { .vop_pathconf = lxd_pathconf }, + VOPNAME_PAGEIO, { .vop_pageio = lxd_pageio }, + VOPNAME_DISPOSE, { .vop_dispose = lxd_dispose }, + VOPNAME_SETSECATTR, { 
.vop_setsecattr = lxd_setsecattr }, + VOPNAME_GETSECATTR, { .vop_getsecattr = lxd_getsecattr }, + VOPNAME_SHRLOCK, { .vop_shrlock = lxd_shrlock }, + NULL, NULL +}; diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c new file mode 100644 index 0000000000..a55b436088 --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c @@ -0,0 +1,499 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + */ + + +#include <sys/modctl.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stat.h> +#include <sys/conf.h> +#include <sys/frame.h> +#include <sys/dtrace.h> +#include <sys/dtrace_impl.h> + +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> + +/* + * We store the syscall number in the low 16 bits (which limits us to 64k + * syscalls). The next bit indicates entry/return probe and the next bit + * indicates 64bit/32bit syscall. 
+ */ +#define SCALL_MASK 0xffff +#define ENTRY_FLAG 0x10000 +#define SYSC_64_BIT 0x100000 + +#define LX_SYSTRACE_IS64BIT(x) ((int)(x) & SYSC_64_BIT) +#define LX_SYSTRACE_ISENTRY(x) ((int)(x) & ENTRY_FLAG) +#define LX_SYSTRACE_SYSNUM(x) ((int)(x) & SCALL_MASK) + +#define LX_SYSTRACE32_ENTRY(id) (ENTRY_FLAG | (id)) +#define LX_SYSTRACE32_RETURN(id) (id) + +#define LX_SYSTRACE64_ENTRY(id) (SYSC_64_BIT | ENTRY_FLAG | (id)) +#define LX_SYSTRACE64_RETURN(id) (SYSC_64_BIT | id) + +#define LX_SYSTRACE_ENTRY_AFRAMES 2 +#define LX_SYSTRACE_RETURN_AFRAMES 4 + +typedef struct lx_systrace_sysent { + const char *lss_name; + dtrace_id_t lss_entry; + dtrace_id_t lss_return; +} lx_systrace_sysent_t; + +static dev_info_t *lx_systrace_devi; +static dtrace_provider_id_t lx_systrace_id; +static kmutex_t lx_systrace_lock; +static uint_t lx_systrace_nenabled; + +static int lx_systrace_nsysent32; +static lx_systrace_sysent_t *lx_systrace_sysent32; + +#if defined(_LP64) +static int lx_systrace_nsysent64; +static lx_systrace_sysent_t *lx_systrace_sysent64; +#endif + +/*ARGSUSED*/ +static void +lx_systrace_entry(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + +#if defined(_LP64) + if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) { + if (sysnum >= lx_systrace_nsysent64) + return; + id = lx_systrace_sysent64[sysnum].lss_entry; + } else +#endif + { + if (sysnum >= lx_systrace_nsysent32) + return; + id = lx_systrace_sysent32[sysnum].lss_entry; + } + + if (id == DTRACE_IDNONE) + return; + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_return(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + +#if defined(_LP64) + if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) { + if (sysnum >= lx_systrace_nsysent64) + return; + id = lx_systrace_sysent64[sysnum].lss_return; + } else +#endif + { + if (sysnum 
>= lx_systrace_nsysent32) + return; + id = lx_systrace_sysent32[sysnum].lss_return; + } + + if (id == DTRACE_IDNONE) + return; + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_provide(void *arg, const dtrace_probedesc_t *desc) +{ + int i; + + if (desc != NULL) + return; + + for (i = 0; i < lx_systrace_nsysent32; i++) { + if (dtrace_probe_lookup(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "entry") != 0) + continue; + + (void) dtrace_probe_create(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "entry", + LX_SYSTRACE_ENTRY_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE32_ENTRY(i))); + + (void) dtrace_probe_create(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "return", + LX_SYSTRACE_RETURN_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE32_RETURN(i))); + + lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE; + } + +#if defined(_LP64) + for (i = 0; i < lx_systrace_nsysent64; i++) { + if (dtrace_probe_lookup(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "entry") != 0) + continue; + + (void) dtrace_probe_create(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "entry", + LX_SYSTRACE_ENTRY_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE64_ENTRY(i))); + + (void) dtrace_probe_create(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "return", + LX_SYSTRACE_RETURN_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE64_RETURN(i))); + + lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE; + } +#endif +} + +/*ARGSUSED*/ +static int +lx_systrace_enable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + + mutex_enter(&lx_systrace_lock); + if (lx_systrace_nenabled++ == 0) + lx_brand_systrace_enable(); + mutex_exit(&lx_systrace_lock); + +#if defined(_LP64) + if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) { + ASSERT(sysnum < 
lx_systrace_nsysent64); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent64[sysnum].lss_entry = id; + } else { + lx_systrace_sysent64[sysnum].lss_return = id; + } + } else +#endif + { + ASSERT(sysnum < lx_systrace_nsysent32); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent32[sysnum].lss_entry = id; + } else { + lx_systrace_sysent32[sysnum].lss_return = id; + } + } + return (0); +} + +/*ARGSUSED*/ +static void +lx_systrace_disable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + +#if defined(_LP64) + if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) { + ASSERT(sysnum < lx_systrace_nsysent64); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent64[sysnum].lss_entry = DTRACE_IDNONE; + } else { + lx_systrace_sysent64[sysnum].lss_return = DTRACE_IDNONE; + } + } else +#endif + { + ASSERT(sysnum < lx_systrace_nsysent32); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent32[sysnum].lss_entry = DTRACE_IDNONE; + } else { + lx_systrace_sysent32[sysnum].lss_return = DTRACE_IDNONE; + } + } + + mutex_enter(&lx_systrace_lock); + if (--lx_systrace_nenabled == 0) + lx_brand_systrace_disable(); + mutex_exit(&lx_systrace_lock); +} + +/*ARGSUSED*/ +static void +lx_systrace_destroy(void *arg, dtrace_id_t id, void *parg) +{ +} + +/*ARGSUSED*/ +static uint64_t +lx_systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, + int aframes) +{ + struct frame *fp = (struct frame *)dtrace_getfp(); + uintptr_t *stack; + uint64_t val = 0; + int i; + + if (argno >= 6) + return (0); + + /* + * Walk the four frames down the stack to the entry or return callback. + * Our callback calls dtrace_probe() which calls dtrace_dif_variable() + * which invokes this function to get the extended arguments. We get + * the frame pointer in via call to dtrace_getfp() above which makes for + * four frames. 
+ */ + for (i = 0; i < 4; i++) { + fp = (struct frame *)fp->fr_savfp; + } + + stack = (uintptr_t *)&fp[1]; + + /* + * Skip the first argument to the callback -- the system call number. + */ + argno++; + +#ifdef __amd64 + /* + * On amd64, the first 6 arguments are passed in registers while + * subsequent arguments are on the stack. + */ + argno -= 6; +#endif + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + val = stack[argno]; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + return (val); +} + + +static const dtrace_pattr_t lx_systrace_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +}; + +static dtrace_pops_t lx_systrace_pops = { + lx_systrace_provide, + NULL, + lx_systrace_enable, + lx_systrace_disable, + NULL, + NULL, + NULL, + lx_systrace_getarg, + NULL, + lx_systrace_destroy +}; + +static int +lx_systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + int i; + + switch (cmd) { + case DDI_ATTACH: + break; + case DDI_RESUME: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "lx_systrace", S_IFCHR, + 0, DDI_PSEUDO, 0) == DDI_FAILURE || + dtrace_register("lx-syscall", &lx_systrace_attr, + DTRACE_PRIV_USER, 0, &lx_systrace_pops, NULL, + &lx_systrace_id) != 0) { + ddi_remove_minor_node(devi, NULL); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + lx_systrace_devi = devi; + + /* + * Initialize the 32-bit table. 
+ */ + VERIFY(lx_nsysent32 > 0); + lx_systrace_nsysent32 = lx_nsysent32; + lx_systrace_sysent32 = kmem_zalloc(lx_systrace_nsysent32 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); + + for (i = 0; i < lx_systrace_nsysent32; i++) { + lx_systrace_sysent32[i].lss_name = lx_sysent32[i].sy_name; + lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE; + } + +#if defined(_LP64) + /* + * Initialize the 64-bit table. + */ + VERIFY(lx_nsysent64 > 0); + lx_systrace_nsysent64 = lx_nsysent64; + lx_systrace_sysent64 = kmem_zalloc(lx_systrace_nsysent64 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); + + for (i = 0; i < lx_systrace_nsysent64; i++) { + lx_systrace_sysent64[i].lss_name = lx_sysent64[i].sy_name; + lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE; + } +#endif + + /* + * Install probe triggers. + */ + lx_systrace_entry_ptr = lx_systrace_entry; + lx_systrace_return_ptr = lx_systrace_return; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + case DDI_SUSPEND: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (dtrace_unregister(lx_systrace_id) != 0) + return (DDI_FAILURE); + + /* + * Free tables. + */ + kmem_free(lx_systrace_sysent32, lx_systrace_nsysent32 * + sizeof (lx_systrace_sysent_t)); + lx_systrace_sysent32 = NULL; + lx_systrace_nsysent32 = 0; + +#if defined(_LP64) + kmem_free(lx_systrace_sysent64, lx_systrace_nsysent64 * + sizeof (lx_systrace_sysent_t)); + lx_systrace_sysent64 = NULL; + lx_systrace_nsysent64 = 0; +#endif + + /* + * Reset probe triggers. 
+ */ + lx_systrace_entry_ptr = NULL; + lx_systrace_return_ptr = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_systrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + return (0); +} + +static struct cb_ops lx_systrace_cb_ops = { + lx_systrace_open, /* open */ + nodev, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + nodev, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops lx_systrace_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + ddi_getinfo_1to1, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + lx_systrace_attach, /* attach */ + lx_systrace_detach, /* detach */ + nodev, /* reset */ + &lx_systrace_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +/* + * Module linkage information for the kernel. 
+ */ +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "Linux Brand System Call Tracing", /* name of module */ + &lx_systrace_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf new file mode 100644 index 0000000000..e4499c8a5b --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_systrace" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/io/lx_netlink.c b/usr/src/uts/common/brand/lx/io/lx_netlink.c new file mode 100644 index 0000000000..b2c7589abc --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_netlink.c @@ -0,0 +1,2234 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + */ + +/* + * Compatibility for the Linux netlink(7) kernel/user transport, as well as + * for in-kernel netlink(7) providers like rtnetlink(7). See RFC 3549 for + * details of the protocol, and the Linux man pages for details of the Linux + * implementation that we're mimicking. 
+ */ + +#include <sys/strsubr.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> +#include <sys/sockio.h> +#include <sys/brand.h> +#include <sys/debug.h> +#include <sys/ucred.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/ip_impl.h> +#include <inet/ip_ire.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_socket.h> +#include <sys/lx_impl.h> +#include <sys/lx_audit.h> +#include <sys/ethernet.h> +#include <sys/dlpi.h> +#include <sys/policy.h> +#include <sys/ddi.h> + +/* + * Flags in netlink header + * See Linux include/uapi/linux/netlink.h + * Additional flags for "GET" requests + */ +#define LX_NETLINK_NLM_F_REQUEST 1 +#define LX_NETLINK_NLM_F_MULTI 2 +#define LX_NETLINK_NLM_F_ACK 4 +#define LX_NETLINK_NLM_F_ECHO 8 +#define LX_NETLINK_NLM_F_DUMP_INTR 16 +#define LX_NETLINK_NLM_F_ROOT 0x100 +#define LX_NETLINK_NLM_F_MATCH 0x200 +#define LX_NETLINK_NLM_F_ATOMIC 0x400 + +/* + * Generic message type constants + */ +#define LX_NETLINK_NLMSG_NONE 0 +#define LX_NETLINK_NLMSG_NOOP 1 +#define LX_NETLINK_NLMSG_ERROR 2 +#define LX_NETLINK_NLMSG_DONE 3 +#define LX_NETLINK_NLMSG_OVERRUN 4 + +/* + * Protocol constants. 
+ */ +#define LX_NETLINK_ROUTE 0 +#define LX_NETLINK_UNUSED 1 +#define LX_NETLINK_USERSOCK 2 +#define LX_NETLINK_FIREWALL 3 +#define LX_NETLINK_SOCK_DIAG 4 +#define LX_NETLINK_NFLOG 5 +#define LX_NETLINK_XFRM 6 +#define LX_NETLINK_SELINUX 7 +#define LX_NETLINK_ISCSI 8 +#define LX_NETLINK_AUDIT 9 +#define LX_NETLINK_FIB_LOOKUP 10 +#define LX_NETLINK_CONNECTOR 11 +#define LX_NETLINK_NETFILTER 12 +#define LX_NETLINK_IP6_FW 13 +#define LX_NETLINK_DNRTMSG 14 +#define LX_NETLINK_KOBJECT_UEVENT 15 +#define LX_NETLINK_GENERIC 16 +#define LX_NETLINK_SCSITRANSPORT 18 +#define LX_NETLINK_ECRYPTFS 19 +#define LX_NETLINK_RDMA 20 +#define LX_NETLINK_CRYPTO 21 + +/* + * rtnetlink(7) attribute-related constants + */ +#define LX_NETLINK_NLA_ALIGNTO 4 + +#define LX_NETLINK_RTM_NEWLINK 16 +#define LX_NETLINK_RTM_DELLINK 17 +#define LX_NETLINK_RTM_GETLINK 18 +#define LX_NETLINK_RTM_SETLINK 19 +#define LX_NETLINK_RTM_NEWADDR 20 +#define LX_NETLINK_RTM_DELADDR 21 +#define LX_NETLINK_RTM_GETADDR 22 +#define LX_NETLINK_RTM_NEWROUTE 24 +#define LX_NETLINK_RTM_DELROUTE 25 +#define LX_NETLINK_RTM_GETROUTE 26 +#define LX_NETLINK_RTM_NEWNEIGH 28 +#define LX_NETLINK_RTM_DELNEIGH 29 +#define LX_NETLINK_RTM_GETNEIGH 30 +#define LX_NETLINK_RTM_NEWRULE 32 +#define LX_NETLINK_RTM_DELRULE 33 +#define LX_NETLINK_RTM_GETRULE 34 +#define LX_NETLINK_RTM_NEWQDISC 36 +#define LX_NETLINK_RTM_DELQDISC 37 +#define LX_NETLINK_RTM_GETQDISC 38 +#define LX_NETLINK_RTM_NEWTCLASS 40 +#define LX_NETLINK_RTM_DELTCLASS 41 +#define LX_NETLINK_RTM_GETTCLASS 42 +#define LX_NETLINK_RTM_NEWTFILTER 44 +#define LX_NETLINK_RTM_DELTFILTER 45 +#define LX_NETLINK_RTM_GETTFILTER 46 +#define LX_NETLINK_RTM_NEWACTION 48 +#define LX_NETLINK_RTM_DELACTION 49 +#define LX_NETLINK_RTM_GETACTION 50 +#define LX_NETLINK_RTM_NEWPREFIX 52 +#define LX_NETLINK_RTM_GETMULTICAST 58 +#define LX_NETLINK_RTM_GETANYCAST 62 +#define LX_NETLINK_RTM_NEWNEIGHTBL 64 +#define LX_NETLINK_RTM_GETNEIGHTBL 66 +#define LX_NETLINK_RTM_SETNEIGHTBL 67 +#define 
LX_NETLINK_RTM_NEWNDUSEROPT 68 +#define LX_NETLINK_RTM_NEWADDRLABEL 72 +#define LX_NETLINK_RTM_DELADDRLABEL 73 +#define LX_NETLINK_RTM_GETADDRLABEL 74 +#define LX_NETLINK_RTM_GETDCB 78 +#define LX_NETLINK_RTM_SETDCB 79 +#define LX_NETLINK_RTM_NEWNETCONF 80 +#define LX_NETLINK_RTM_GETNETCONF 82 +#define LX_NETLINK_RTM_NEWMDB 84 +#define LX_NETLINK_RTM_DELMDB 85 +#define LX_NETLINK_RTM_GETMDB 86 +#define LX_NETLINK_RTM_MAX 87 + +/* + * rtnetlink(7) attribute constants + */ +#define LX_NETLINK_RTA_UNSPEC 0 +#define LX_NETLINK_RTA_DST 1 +#define LX_NETLINK_RTA_SRC 2 +#define LX_NETLINK_RTA_IIF 3 +#define LX_NETLINK_RTA_OIF 4 +#define LX_NETLINK_RTA_GATEWAY 5 +#define LX_NETLINK_RTA_PRIORITY 6 +#define LX_NETLINK_RTA_PREFSRC 7 +#define LX_NETLINK_RTA_METRICS 8 +#define LX_NETLINK_RTA_MULTIPATH 9 +#define LX_NETLINK_RTA_PROTOINFO 10 +#define LX_NETLINK_RTA_FLOW 11 +#define LX_NETLINK_RTA_CACHEINFO 12 +#define LX_NETLINK_RTA_SESSION 13 +#define LX_NETLINK_RTA_MP_ALGO 14 +#define LX_NETLINK_RTA_TABLE 15 +#define LX_NETLINK_RTA_MARK 16 +#define LX_NETLINK_RTA_MFC_STATS 17 +#define LX_NETLINK_MAX_RTA LX_NETLINK_RTA_MFC_STATS + +/* + * rtnetlink(7) NEWLINK/DELLINK/GETLINK constants + */ +#define LX_NETLINK_IFLA_UNSPEC 0 +#define LX_NETLINK_IFLA_ADDRESS 1 +#define LX_NETLINK_IFLA_BROADCAST 2 +#define LX_NETLINK_IFLA_IFNAME 3 +#define LX_NETLINK_IFLA_MTU 4 +#define LX_NETLINK_IFLA_LINK 5 +#define LX_NETLINK_IFLA_QDISC 6 +#define LX_NETLINK_IFLA_STATS 7 +#define LX_NETLINK_IFLA_COST 8 +#define LX_NETLINK_IFLA_PRIORITY 9 +#define LX_NETLINK_IFLA_MASTER 10 +#define LX_NETLINK_IFLA_WIRELESS 11 +#define LX_NETLINK_IFLA_PROTINFO 12 +#define LX_NETLINK_IFLA_TXQLEN 13 +#define LX_NETLINK_IFLA_MAP 14 +#define LX_NETLINK_IFLA_WEIGHT 15 +#define LX_NETLINK_IFLA_OPERSTATE 16 +#define LX_NETLINK_IFLA_LINKMODE 17 +#define LX_NETLINK_IFLA_LINKINFO 18 +#define LX_NETLINK_IFLA_NET_NS_PID 19 +#define LX_NETLINK_IFLA_IFALIAS 20 +#define LX_NETLINK_IFLA_NUM_VF 21 +#define 
LX_NETLINK_IFLA_VFINFO_LIST 22 +#define LX_NETLINK_IFLA_STATS64 23 +#define LX_NETLINK_IFLA_VF_PORTS 24 +#define LX_NETLINK_IFLA_PORT_SELF 25 +#define LX_NETLINK_IFLA_AF_SPEC 26 +#define LX_NETLINK_IFLA_GROUP 27 +#define LX_NETLINK_IFLA_NET_NS_FD 28 +#define LX_NETLINK_IFLA_EXT_MASK 29 +#define LX_NETLINK_IFLA_PROMISCUITY 30 +#define LX_NETLINK_IFLA_NUM_TX_QUEUES 31 +#define LX_NETLINK_IFLA_NUM_RX_QUEUES 32 +#define LX_NETLINK_IFLA_CARRIER 33 +#define LX_NETLINK_IFLA_PHYS_PORT_ID 34 +#define LX_NETLINK_IFLA_CARRIER_CHANGES 35 +#define LX_NETLINK_IFLA_MAX 36 + +/* + * rtnetlink(7) NEWADDR/DELADDR/GETADDR constants + */ +#define LX_NETLINK_IFA_UNSPEC 0 +#define LX_NETLINK_IFA_ADDRESS 1 +#define LX_NETLINK_IFA_LOCAL 2 +#define LX_NETLINK_IFA_LABEL 3 +#define LX_NETLINK_IFA_BROADCAST 4 +#define LX_NETLINK_IFA_ANYCAST 5 +#define LX_NETLINK_IFA_CACHEINFO 6 +#define LX_NETLINK_IFA_MULTICAST 7 +#define LX_NETLINK_IFA_FLAGS 8 +#define LX_NETLINK_IFA_MAX 9 + +#define LX_NETLINK_IFA_F_SECONDARY 0x01 +#define LX_NETLINK_IFA_F_TEMPORARY LX_NETLINK_IFA_F_SECONDARY +#define LX_NETLINK_IFA_F_NODAD 0x02 +#define LX_NETLINK_IFA_F_OPTIMISTIC 0x04 +#define LX_NETLINK_IFA_F_DADFAILED 0x08 +#define LX_NETLINK_IFA_F_HOMEADDRESS 0x10 +#define LX_NETLINK_IFA_F_DEPRECATED 0x20 +#define LX_NETLINK_IFA_F_TENTATIVE 0x40 +#define LX_NETLINK_IFA_F_PERMANENT 0x80 +#define LX_NETLINK_IFA_F_MANAGETEMPADDR 0x100 +#define LX_NETLINK_IFA_F_NOPREFIXROUTE 0x200 + +/* + * Linux interface flags. 
+ */ +#define LX_IFF_UP (1<<0) +#define LX_IFF_BROADCAST (1<<1) +#define LX_IFF_DEBUG (1<<2) +#define LX_IFF_LOOPBACK (1<<3) +#define LX_IFF_POINTOPOINT (1<<4) +#define LX_IFF_NOTRAILERS (1<<5) +#define LX_IFF_RUNNING (1<<6) +#define LX_IFF_NOARP (1<<7) +#define LX_IFF_PROMISC (1<<8) +#define LX_IFF_ALLMULTI (1<<9) +#define LX_IFF_MASTER (1<<10) +#define LX_IFF_SLAVE (1<<11) +#define LX_IFF_MULTICAST (1<<12) +#define LX_IFF_PORTSEL (1<<13) +#define LX_IFF_AUTOMEDIA (1<<14) +#define LX_IFF_DYNAMIC (1<<15) +#define LX_IFF_LOWER_UP (1<<16) +#define LX_IFF_DORMANT (1<<17) +#define LX_IFF_ECHO (1<<18) + +/* rtm_table */ +#define LX_ROUTE_TABLE_MAIN 254 + +/* rtm_type */ +#define LX_RTN_UNSPEC 0 +#define LX_RTN_UNICAST 1 +#define LX_RTN_LOCAL 2 +#define LX_RTN_BROADCAST 3 +#define LX_RTN_ANYCAST 4 +#define LX_RTN_MULTICAST 5 +#define LX_RTN_BLACKHOLE 6 +#define LX_RTN_UNREACHABLE 7 +#define LX_RTN_PROHIBIT 8 +#define LX_RTN_THROW 9 +#define LX_RTN_NAT 10 +#define LX_RTN_XRESOLVE 11 + +/* rtm_protocol */ +#define LX_RTPROT_UNSPEC 0 +#define LX_RTPROT_REDIRECT 1 /* From ICMP redir */ +#define LX_RTPROT_KERNEL 2 /* From kernel */ +#define LX_RTPROT_BOOT 3 /* From boot */ +#define LX_RTPROT_STATIC 4 /* From administrator */ +#define LX_RTPROT_NULL 0xff /* Uninitialized */ + +/* rtm_scope */ +#define LX_RTSCOPE_UNIVERSE 0 +#define LX_RTSCOPE_SITE 200 +#define LX_RTSCOPE_LINK 253 +#define LX_RTSCOPE_HOST 254 +#define LX_RTSCOPE_NOWHERE 255 + +/* + * Audit message types (lxnh_type in the lx_netlink_hdr_t msg header) + * See Linux include/uapi/linux/audit.h and user-level auditd source + * lib/libaudit.h. 
+ * + * The types fall into range blocks: + * 1000-1099 is for audit system control commands + * 1100-2999 various messages, as detailed in include/uapi/linux/audit.h + */ +#define LX_AUDIT_GET 1000 /* get audit system status */ +#define LX_AUDIT_SET 1001 /* set audit system status */ +#define LX_AUDIT_WATCH_INS 1007 /* insert file watch */ +#define LX_AUDIT_WATCH_REM 1008 /* remove file watch */ +#define LX_AUDIT_WATCH_LIST 1009 /* list file watches */ +#define LX_AUDIT_ADD_RULE 1011 /* add syscall rule */ +#define LX_AUDIT_DEL_RULE 1012 /* del syscall rule */ +#define LX_AUDIT_LIST_RULES 1013 /* list syscall rules */ +#define LX_AUDIT_SET_FEATURE 1018 +#define LX_AUDIT_GET_FEATURE 1019 +#define LX_AUDIT_USER_MSG_START 1100 + +/* + * Netlink sockopts + */ +#define SOL_LX_NETLINK 270 + +/* See Linux include/uapi/linux/netlink.h */ +#define LX_NETLINK_SO_ADD_MEMBERSHIP 1 +#define LX_NETLINK_SO_DROP_MEMBERSHIP 2 +#define LX_NETLINK_SO_PKTINFO 3 +#define LX_NETLINK_SO_BROADCAST_ERROR 4 +#define LX_NETLINK_SO_NO_ENOBUFS 5 +#define LX_NETLINK_SO_RX_RING 6 +#define LX_NETLINK_SO_TX_RING 7 +#define LX_NETLINK_SO_LISTEN_ALL_NSID 8 +#define LX_NETLINK_SO_LIST_MEMBERSHIPS 9 +#define LX_NETLINK_SO_CAP_ACK 10 + +/* Internal socket flags */ +#define LXNLF_RECVUCRED 0x1 +#define LXNLF_AUDITD 0x2 + +/* + * nlmsg structure macros. These are expressed purely in terms of the + * LXNLMSG_* names and lx_netlink_hdr_t (whose length field is lxnh_len), so + * they do not depend on Linux-named NLMSG_* macros or an nlmsg_len field. + */ +#define LXNLMSG_ALIGNTO 4 +#define LXNLMSG_ALIGN(len) \ + (((len) + LXNLMSG_ALIGNTO - 1) & ~(LXNLMSG_ALIGNTO - 1)) +#define LXNLMSG_HDRLEN \ + ((int)LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t))) +#define LXNLMSG_LENGTH(len) ((len) + LXNLMSG_HDRLEN) +#define LXNLMSG_SPACE(len) LXNLMSG_ALIGN(LXNLMSG_LENGTH(len)) +#define LXNLMSG_DATA(nlh) \ + ((void*)(((char *)(nlh)) + LXNLMSG_LENGTH(0))) +#define LXNLMSG_PAYLOAD(nlh, len) \ + ((nlh)->lxnh_len - LXNLMSG_SPACE((len))) + +#define LXATTR_PAYLOAD(lxa) \ + ((void*)((caddr_t)(lxa) + sizeof (lx_netlink_attr_t))) +#define LXATTR_HDRLEN LXNLMSG_ALIGN(sizeof (lx_netlink_attr_t)) +#define LXATTR_LEN(len) (LXATTR_HDRLEN + 
LXNLMSG_ALIGN(len)) + +typedef struct lx_netlink_hdr { + uint32_t lxnh_len; /* length of message */ + uint16_t lxnh_type; /* type of message */ + uint16_t lxnh_flags; /* flags */ + uint32_t lxnh_seq; /* sequence number */ + uint32_t lxnh_pid; /* sending pid */ +} lx_netlink_hdr_t; + +typedef struct lx_netlink_err { + lx_netlink_hdr_t lxne_hdr; /* header */ + int32_t lxne_errno; /* errno */ + lx_netlink_hdr_t lxne_failed; /* header of err */ +} lx_netlink_err_t; + +typedef struct lx_netlink_attr { + uint16_t lxna_len; /* length of attribute */ + uint16_t lxna_type; /* type of attribute */ +} lx_netlink_attr_t; + +typedef struct lx_netlink_ifinfomsg { + uint8_t lxnl_ifi_family; /* family: AF_UNSPEC */ + uint8_t lxnl_ifi__pad; + uint16_t lxnl_ifi_type; /* device type */ + uint32_t lxnl_ifi_index; /* interface index */ + uint32_t lxnl_ifi_flags; /* device flags */ + uint32_t lxnl_ifi_change; /* unused; must be -1 */ +} lx_netlink_ifinfomsg_t; + +/* + * Wire layout of the Linux struct ifaddrmsg: four 8-bit fields followed by a + * 32-bit interface index (same width as lxnl_ifi_index above). An 8-bit + * index would truncate interface indices above 255 and shrink the struct. + */ +typedef struct lx_netlink_ifaddrmsg { + uint8_t lxnl_ifa_family; /* address type */ + uint8_t lxnl_ifa_prefixlen; /* prefix length of address */ + uint8_t lxnl_ifa_flags; /* address flags */ + uint8_t lxnl_ifa_scope; /* address scope */ + uint32_t lxnl_ifa_index; /* interface index */ +} lx_netlink_ifaddrmsg_t; + +typedef struct lx_netlink_rtmsg { + uint8_t rtm_family; /* route AF */ + uint8_t rtm_dst_len; /* destination addr length */ + uint8_t rtm_src_len; /* source addr length */ + uint8_t rtm_tos; /* TOS filter */ + uint8_t rtm_table; /* routing table ID */ + uint8_t rtm_protocol; /* routing protocol */ + uint8_t rtm_scope; + uint8_t rtm_type; + uint32_t rtm_flags; +} lx_netlink_rtmsg_t; + +typedef struct lx_netlink_sockaddr { + sa_family_t lxnl_family; /* AF_LX_NETLINK */ + uint16_t lxnl_pad; /* padding */ + uint32_t lxnl_port; /* port id */ + uint32_t lxnl_groups; /* multicast groups mask */ +} lx_netlink_sockaddr_t; + +typedef struct lx_netlink_sock { + struct lx_netlink_sock *lxns_next; /* list of lx_netlink sockets */ + 
sock_upcalls_t *lxns_upcalls; /* pointer to socket upcalls */ + sock_upper_handle_t lxns_uphandle; /* socket upcall handle */ + ldi_handle_t lxns_iphandle; /* handle to /dev/ip */ + ldi_handle_t lxns_ip6handle; /* handle to /dev/ip6 */ + ldi_handle_t lxns_current; /* current ip handle */ + int lxns_proto; /* protocol */ + uint32_t lxns_port; /* port identifier */ + uint32_t lxns_groups; /* group subscriptions */ + uint32_t lxns_bufsize; /* buffer size */ + uint32_t lxns_flags; /* socket flags */ + kmutex_t lxns_flowctl_mtx; /* protects lxns_flowctrled */ + boolean_t lxns_flowctrled; /* sock is flow-controlled */ +} lx_netlink_sock_t; + +typedef struct lx_netlink_reply { + lx_netlink_hdr_t lxnr_hdr; /* header that we're reply to */ + lx_netlink_sock_t *lxnr_sock; /* socket */ + uint32_t lxnr_seq; /* sequence number */ + uint16_t lxnr_type; /* type of reply */ + mblk_t *lxnr_mp; /* current mblk */ + mblk_t *lxnr_err; /* error mblk */ + mblk_t *lxnr_mp1; /* T_UNITDATA_IND mblk */ + int lxnr_errno; /* errno, if any */ +} lx_netlink_reply_t; + +static lx_netlink_sock_t *lx_netlink_head; /* head of lx_netlink sockets */ +static uint_t lx_netlink_audit_cnt; /* prevent unload for audit */ +static kmutex_t lx_netlink_lock; /* lock to protect state */ +static ldi_ident_t lx_netlink_ldi; /* LDI handle */ +static int lx_netlink_bufsize = 4096; /* default buffer size */ +static int lx_netlink_flowctrld; /* # of times flow controlled */ + +typedef enum { + LXNL_BIND, + LXNL_SENDMSG +} lx_netlink_action_t; + +#define LX_UNSUP_BUFSZ 64 + +/* + * On Linux, CAP_NET_ADMIN is required to take certain netlink actions. This + * restriction is loosened for certain protocol types, provided the activity is + * limited to communicating directly with the kernel (rather than transmitting + * to the various multicast groups) + */ +static int +lx_netlink_access(lx_netlink_sock_t *lns, cred_t *cr, lx_netlink_action_t act) +{ + /* Simple actions are allowed on these netlink protocols. 
*/ + if (act != LXNL_SENDMSG) { + switch (lns->lxns_proto) { + case LX_NETLINK_ROUTE: + case LX_NETLINK_AUDIT: + case LX_NETLINK_KOBJECT_UEVENT: + return (0); + default: + break; + } + } + + /* CAP_NET_ADMIN roughly maps to PRIV_SYS_IP_CONFIG. */ + if (secpolicy_ip_config(cr, B_FALSE) != 0) { + return (EACCES); + } + + return (0); +} + +/*ARGSUSED*/ +static void +lx_netlink_activate(sock_lower_handle_t handle, + sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, + int flags, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | + SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ | + SOCKOPT_MAXBLK | SOCKOPT_MINPSZ; + sopp.sopp_wroff = 0; + sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; + sopp.sopp_rxlowat = SOCKET_RECVLOWATER; + sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl); + sopp.sopp_maxpsz = INFPSZ; + sopp.sopp_maxblk = INFPSZ; + sopp.sopp_minpsz = 0; + + lxsock->lxns_upcalls = sock_upcalls; + lxsock->lxns_uphandle = sock_handle; + + sock_upcalls->su_set_proto_props(sock_handle, &sopp); +} + +/*ARGSUSED*/ +static int +lx_netlink_setsockopt(sock_lower_handle_t handle, int level, + int option_name, const void *optval, socklen_t optlen, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + + if (level == SOL_SOCKET && option_name == SO_RECVUCRED) { + int *ival; + if (optlen != sizeof (int)) { + return (EINVAL); + } + ival = (int *)optval; + if (*ival == 0) { + lxsock->lxns_flags &= ~LXNLF_RECVUCRED; + } else { + lxsock->lxns_flags |= LXNLF_RECVUCRED; + } + return (0); + } else if (level == SOL_SOCKET) { + /* Punt on the other SOL_SOCKET options */ + return (0); + } else if (level != SOL_LX_NETLINK) { + return (EOPNOTSUPP); + } + + switch (option_name) { + case LX_NETLINK_SO_ADD_MEMBERSHIP: + case LX_NETLINK_SO_DROP_MEMBERSHIP: + case LX_NETLINK_SO_PKTINFO: + case LX_NETLINK_SO_BROADCAST_ERROR: + case 
LX_NETLINK_SO_NO_ENOBUFS: + case LX_NETLINK_SO_RX_RING: + case LX_NETLINK_SO_TX_RING: + /* Blatant lie */ + return (0); + default: + return (EINVAL); + } +} + +/*ARGSUSED*/ +static int +lx_netlink_getsockopt(sock_lower_handle_t handle, int level, + int option_name, void *optval, socklen_t *optlen, cred_t *cr) +{ + if (level != SOL_LX_NETLINK) { + return (EOPNOTSUPP); + } + + switch (option_name) { + case LX_NETLINK_SO_LIST_MEMBERSHIPS: + /* Report that we have 0 members to allow systemd to proceed. */ + *optlen = 0; + return (0); + default: + return (EINVAL); + } +} + +/*ARGSUSED*/ +static int +lx_netlink_bind(sock_lower_handle_t handle, struct sockaddr *name, + socklen_t namelen, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)name; + + if (namelen < sizeof (lx_netlink_sockaddr_t) || + lxsa->lxnl_family != AF_LX_NETLINK) { + return (EINVAL); + } + + /* + * Perform access checks if attempting to bind on any multicast groups. + */ + if (lxsa->lxnl_groups != 0) { + int err; + + if ((err = lx_netlink_access(lxsock, cr, LXNL_BIND)) != 0) { + return (err); + } + + /* Lie about group subscription for now */ + lxsock->lxns_groups = lxsa->lxnl_groups; + } + + /* + * Linux netlink uses nl_port to identify distinct netlink sockets. + * Binding to an address of nl_port=0 triggers the kernel to + * automatically assign a free nl_port identifier. Originally, + * consumers of lx_netlink were required to bind with that automatic + * address. We now support non-zero values for nl_port although strict + * checking to identify conflicts is not performed. Use of the + * id_space facility could be a convenient solution, if a need arose. + */ + if (lxsa->lxnl_port == 0) { + /* + * Because we are not doing conflict detection, there is no + * need to expend effort selecting a unique port for automatic + * addressing during bind. 
+ */ + lxsock->lxns_port = curproc->p_pid; + } else { + lxsock->lxns_port = lxsa->lxnl_port; + } + + return (0); +} + +/*ARGSUSED*/ +static int +lx_netlink_getsockname(sock_lower_handle_t handle, struct sockaddr *sa, + socklen_t *len, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)sa; + + if (*len < sizeof (lx_netlink_sockaddr_t)) + return (EINVAL); + + lxsa->lxnl_family = AF_LX_NETLINK; + lxsa->lxnl_pad = 0; + lxsa->lxnl_port = lxsock->lxns_port; + lxsa->lxnl_groups = lxsock->lxns_groups; + + *len = sizeof (lx_netlink_sockaddr_t); + + return (0); +} + +static mblk_t * +lx_netlink_alloc_mp1(lx_netlink_sock_t *lxsock) +{ + mblk_t *mp; + size_t size; + struct T_unitdata_ind *tunit; + lx_netlink_sockaddr_t *lxsa; + boolean_t send_ucred; + + /* + * Certain netlink clients (such as systemd) will set SO_RECVUCRED + * (via the Linux SCM_CREDENTIALS) on the expectation that all replies + * will contain credentials passed via cmsg. They require this to + * authenticate those messages as having originated in the kernel by + * checking uc_pid == 0. 
+ */ + VERIFY(lxsock != NULL); + send_ucred = ((lxsock->lxns_flags & LXNLF_RECVUCRED) != 0); + + /* + * Message structure: + * +----------------------------+ + * | struct T_unit_data_ind | + * +----------------------------+ + * | lx_netlink_sockaddr_t | + * +----------------------------+ -+ + * | struct cmsghdr (SCM_UCRED) | | + * +----------------------------+ +-(optional) + * | struct ucred_s (cmsg data) | | + * +----------------------------+ -+ + */ + size = sizeof (*tunit) + sizeof (*lxsa); + if (send_ucred) { + size += sizeof (struct cmsghdr) + + ROUNDUP_cmsglen(sizeof (struct ucred_s)); + } + mp = allocb(size, 0); + if (mp == NULL) { + return (NULL); + } + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + tunit = (struct T_unitdata_ind *)mp->b_rptr; + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + lxsa = (lx_netlink_sockaddr_t *)((caddr_t)tunit + sizeof (*tunit)); + mp->b_wptr += size; + + mp->b_datap->db_type = M_PROTO; + tunit->PRIM_type = T_UNITDATA_IND; + tunit->SRC_length = sizeof (*lxsa); + tunit->SRC_offset = sizeof (*tunit); + + lxsa->lxnl_family = AF_LX_NETLINK; + lxsa->lxnl_port = 0; + lxsa->lxnl_groups = 0; + lxsa->lxnl_pad = 0; + + if (send_ucred) { + struct cmsghdr *cmsg; + struct ucred_s *ucred; + + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + cmsg = (struct cmsghdr *)((caddr_t)lxsa + sizeof (*lxsa)); + ucred = (struct ucred_s *)CMSG_CONTENT(cmsg); + cmsg->cmsg_len = sizeof (*cmsg) + sizeof (*ucred); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_UCRED; + bzero(ucred, sizeof (*ucred)); + ucred->uc_size = sizeof (*ucred); + ucred->uc_zoneid = getzoneid(); + + tunit->OPT_length = sizeof (*cmsg) + + ROUNDUP_cmsglen(sizeof (*ucred)); + tunit->OPT_offset = tunit->SRC_offset + tunit->SRC_length; + } else { + tunit->OPT_length = 0; + tunit->OPT_offset = 0; + } + + return (mp); +} + +static lx_netlink_reply_t * +lx_netlink_reply(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, uint16_t type) +{ + lx_netlink_reply_t *reply; + mblk_t *err, *mp1; + + /* + * We always 
allocate an error block to assure that even if subsequent + * allocations fail, we can return an error. + */ + if ((err = allocb(sizeof (lx_netlink_err_t), 0)) == NULL) + return (NULL); + + if ((mp1 = lx_netlink_alloc_mp1(lxsock)) == NULL) { + freeb(err); + return (NULL); + } + + reply = kmem_zalloc(sizeof (lx_netlink_reply_t), KM_SLEEP); + reply->lxnr_err = err; + reply->lxnr_sock = lxsock; + reply->lxnr_hdr = *hdr; + reply->lxnr_type = type; + reply->lxnr_mp1 = mp1; + + return (reply); +} + +static void +lx_netlink_reply_add(lx_netlink_reply_t *reply, void *payload, uint32_t size) +{ + lx_netlink_hdr_t *hdr; + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + uint32_t aligned; + mblk_t *mp = reply->lxnr_mp; + + if (reply->lxnr_errno) + return; + + aligned = LXNLMSG_ALIGN(size); + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + + if (hdr->lxnh_len + aligned > lxsock->lxns_bufsize) { + reply->lxnr_errno = E2BIG; + return; + } + + bcopy(payload, mp->b_wptr, size); + hdr->lxnh_len += aligned; + mp->b_wptr += aligned; +} + +static void +lx_netlink_reply_msg(lx_netlink_reply_t *reply, void *payload, uint32_t size) +{ + lx_netlink_hdr_t *hdr; + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + mblk_t *mp; + + if (reply->lxnr_errno) + return; + + VERIFY(reply->lxnr_mp == NULL); + + if ((reply->lxnr_mp = mp = allocb(lxsock->lxns_bufsize, 0)) == NULL) { + reply->lxnr_errno = ENOMEM; + return; + } + + bzero(mp->b_rptr, lxsock->lxns_bufsize); + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + hdr->lxnh_flags = LX_NETLINK_NLM_F_MULTI; + hdr->lxnh_len = LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_pid = lxsock->lxns_port; + + mp->b_wptr += LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + + if (payload == NULL) { + /* + * A NULL payload denotes a "done" message. 
+ */ + hdr->lxnh_type = LX_NETLINK_NLMSG_DONE; + } else { + hdr->lxnh_type = reply->lxnr_type; + lx_netlink_reply_add(reply, payload, size); + } +} + +static void +lx_netlink_reply_attr(lx_netlink_reply_t *reply, uint16_t type, + void *payload, uint32_t size) +{ + lx_netlink_attr_t attr; + + attr.lxna_len = size + sizeof (lx_netlink_attr_t); + attr.lxna_type = type; + + lx_netlink_reply_add(reply, &attr, sizeof (attr)); + lx_netlink_reply_add(reply, payload, size); +} + +static void +lx_netlink_reply_attr_string(lx_netlink_reply_t *reply, + uint16_t type, const char *str) +{ + lx_netlink_reply_attr(reply, type, (void *)str, strlen(str) + 1); +} + +static void +lx_netlink_reply_attr_int32(lx_netlink_reply_t *reply, + uint16_t type, int32_t val) +{ + int32_t v = val; + + lx_netlink_reply_attr(reply, type, &v, sizeof (int32_t)); +} + +static int +lx_netlink_reply_ioctl(lx_netlink_reply_t *reply, int cmd, void *arg) +{ + int rval; + + if (reply->lxnr_errno != 0) + return (reply->lxnr_errno); + + if ((rval = ldi_ioctl(reply->lxnr_sock->lxns_current, + cmd, (intptr_t)arg, FKIOCTL, kcred, NULL)) != 0) { + reply->lxnr_errno = rval; + } + + return (rval); +} + +static void +lx_netlink_reply_sendup(lx_netlink_reply_t *reply, mblk_t *mp, mblk_t *mp1) +{ + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + int error; + + /* + * To prevent the stream head from coalescing messages and to indicate + * their origin, we send them as T_UNITDATA_IND messages, not as raw + * M_DATA. 
+ */ + mp1->b_cont = mp; + + lxsock->lxns_upcalls->su_recv(lxsock->lxns_uphandle, mp1, + msgdsize(mp1), 0, &error, NULL); + + if (error != 0) + lx_netlink_flowctrld++; +} + +static void +lx_netlink_reply_send(lx_netlink_reply_t *reply) +{ + mblk_t *mp1; + + if (reply->lxnr_errno) + return; + + if ((mp1 = lx_netlink_alloc_mp1(reply->lxnr_sock)) == NULL) { + reply->lxnr_errno = ENOMEM; + return; + } + + lx_netlink_reply_sendup(reply, reply->lxnr_mp, mp1); + reply->lxnr_mp = NULL; +} + +static void +lx_netlink_reply_done(lx_netlink_reply_t *reply) +{ + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + mblk_t *mp; + + /* + * Denote that we're done via a message with a NULL payload. + */ + lx_netlink_reply_msg(reply, NULL, 0); + + if (reply->lxnr_errno) { + /* + * If anything failed, we'll send up an error message. + */ + lx_netlink_hdr_t *hdr; + lx_netlink_err_t *err; + + if (reply->lxnr_mp != NULL) { + freeb(reply->lxnr_mp); + reply->lxnr_mp = NULL; + } + + mp = reply->lxnr_err; + VERIFY(mp != NULL); + reply->lxnr_err = NULL; + err = (lx_netlink_err_t *)mp->b_rptr; + hdr = &err->lxne_hdr; + mp->b_wptr += sizeof (lx_netlink_err_t); + + err->lxne_failed = reply->lxnr_hdr; + err->lxne_errno = reply->lxnr_errno; + hdr->lxnh_type = LX_NETLINK_NLMSG_ERROR; + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_len = sizeof (lx_netlink_err_t); + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_pid = lxsock->lxns_port; + hdr->lxnh_flags = 0; + } else { + uint32_t status = 0; + + /* + * More recent versions of the iproute2 utils expect a status + * value after the header, even in the absence of errors. + */ + lx_netlink_reply_add(reply, &status, sizeof (status)); + + /* + * "done" is also the most minimal response possible. If + * lx_netlink_reply_msg() does not set lxnr_errno, we should + * be guaranteed enough room to hold this (i.e. our + * lx_netlink_reply_add() call should never end up setting + * lxnr_errno). 
+ */ + VERIFY0(reply->lxnr_errno); + + mp = reply->lxnr_mp; + VERIFY(mp != NULL); + reply->lxnr_mp = NULL; + } + + lx_netlink_reply_sendup(reply, mp, reply->lxnr_mp1); + + if (reply->lxnr_mp != NULL) + freeb(reply->lxnr_mp); + + if (reply->lxnr_err != NULL) + freeb(reply->lxnr_err); + + kmem_free(reply, sizeof (lx_netlink_reply_t)); +} + +static int +lx_netlink_reply_error(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, int errno) +{ + /* + * The type of the message doesn't matter, as we're going to explicitly + * set lxnr_errno and therefore send only an error message. + */ + lx_netlink_reply_t *reply = lx_netlink_reply(lxsock, hdr, 0); + + if (reply == NULL) + return (ENOMEM); + + reply->lxnr_errno = errno; + lx_netlink_reply_done(reply); + + return (0); +} + +/* + * Send an ack message with an explicit errno of 0. + * TODO: this needs more work + */ +/* + * static void + * lx_netlink_reply_ack(lx_netlink_reply_t *reply) + * { + * lx_netlink_sock_t *lxsock = reply->lxnr_sock; + * mblk_t *mp; + * lx_netlink_hdr_t *hdr; + * lx_netlink_err_t *err; + * + * lx_netlink_reply_msg(reply, NULL, 0); + * + * mp = reply->lxnr_err; + * VERIFY(mp != NULL); + * reply->lxnr_err = NULL; + * err = (lx_netlink_err_t *)mp->b_rptr; + * hdr = &err->lxne_hdr; + * + * err->lxne_failed = reply->lxnr_hdr; + * err->lxne_errno = 0; + * hdr->lxnh_type = LX_NETLINK_NLMSG_ERROR; + * hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + * hdr->lxnh_len = sizeof (lx_netlink_err_t); + * hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + * hdr->lxnh_pid = lxsock->lxns_port; + * + * lx_netlink_reply_sendup(reply, mp, reply->lxnr_mp1); + * + * kmem_free(reply, sizeof (lx_netlink_reply_t)); + * } + */ + +static int +lx_netlink_parse_msg_attrs(mblk_t *mp, void **msgp, unsigned int msg_size, + lx_netlink_attr_t **attrp, unsigned int *attr_max) +{ + lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr; + lx_netlink_attr_t *lxa; + unsigned char *buf = mp->b_rptr + LXNLMSG_HDRLEN; + unsigned int i; + uint32_t 
buf_left = MBLKL(mp) - LXNLMSG_HDRLEN; + uint32_t msg_left = hdr->lxnh_len; + + msg_size = LXNLMSG_ALIGN(msg_size); + if (msg_size > buf_left || msg_size > msg_left) { + return (-1); + } + + *msgp = (void *)buf; + buf += msg_size; + buf_left -= msg_size; + msg_left -= msg_size; + + /* Do not bother with attr parsing if not requested */ + if (attrp == NULL || *attr_max == 0) { + return (0); + } + + for (i = 0; i < *attr_max; i++) { + if (buf_left < LXATTR_HDRLEN || msg_left < LXATTR_HDRLEN) { + break; + } + + lxa = (lx_netlink_attr_t *)buf; + if (lxa->lxna_len > buf_left || lxa->lxna_len > msg_left) { + return (-1); + } + + attrp[i] = lxa; + buf += lxa->lxna_len; + buf_left -= lxa->lxna_len; + msg_left -= lxa->lxna_len; + } + *attr_max = i; + + return (0); +} + +/* + * Takes an IPv4 address (in network byte order) and returns the address scope. + */ +static uint8_t +lx_ipv4_rtscope(in_addr_t nbo_addr) +{ + in_addr_t addr = ntohl(nbo_addr); + if ((addr >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { + return (LX_RTSCOPE_HOST); + } else if ((addr & IN_AUTOCONF_MASK) == IN_AUTOCONF_NET) { + return (LX_RTSCOPE_LINK); + } else if ((addr & IN_PRIVATE8_MASK) == IN_PRIVATE8_NET || + (addr & IN_PRIVATE12_MASK) == IN_PRIVATE12_NET || + (addr & IN_PRIVATE16_MASK) == IN_PRIVATE16_NET) { + return (LX_RTSCOPE_SITE); + } else { + return (LX_RTSCOPE_UNIVERSE); + } +} + +/* + * Takes an IPv6 address and returns the address scope. 
+ */ +static uint8_t +lx_ipv6_rtscope(const in6_addr_t *addr) +{ + if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) { + return (LX_RTSCOPE_HOST); + } else if (IN6_IS_ADDR_LINKLOCAL(addr)) { + return (LX_RTSCOPE_LINK); + } else if (IN6_IS_ADDR_SITELOCAL(addr)) { + return (LX_RTSCOPE_SITE); + } else { + return (LX_RTSCOPE_UNIVERSE); + } +} + +static void +lx_netlink_getlink_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr) +{ + lx_netlink_ifinfomsg_t ifi; + int i; + char if_name[IFNAMSIZ]; + struct sockaddr_dl *sdl; + struct sockaddr hwaddr; + int hwaddr_size; + boolean_t is_loopback; + + struct { + int native; + int lx; + } flags[] = { + { IFF_UP, LX_IFF_UP }, + { IFF_BROADCAST, LX_IFF_BROADCAST }, + { IFF_DEBUG, LX_IFF_DEBUG }, + { IFF_LOOPBACK, LX_IFF_LOOPBACK }, + { IFF_POINTOPOINT, LX_IFF_POINTOPOINT }, + { IFF_NOTRAILERS, LX_IFF_NOTRAILERS }, + { IFF_RUNNING, LX_IFF_RUNNING }, + { IFF_NOARP, LX_IFF_NOARP }, + { IFF_PROMISC, LX_IFF_PROMISC }, + { IFF_ALLMULTI, LX_IFF_ALLMULTI }, + { IFF_MULTICAST, LX_IFF_MULTICAST }, + { 0 } + }; + + /* + * illumos interfaces that contain a ':' are non-zero logical + * interfaces. We should only emit the name of the zeroth logical + * interface, since RTM_GETLINK only expects to see the name of + * devices. The addresses of all logical devices will be + * returned via an RTM_GETADDR. + */ + if (strchr(lifr->lifr_name, ':') != NULL) + return; + + /* + * Most of the lx_netlink module is architected to emit information in + * an illumos-native manner. Socket syscalls such as getsockname will + * not translate fields to values Linux programs would expect since + * that conversion is performed by the generic socket emulation. + * + * This is _not_ true of the actual protocol output from lx_netlink. + * Since translating it at the socket layer would be onerous, all + * output (including constants and names) is pre-translated to values + * valid for Linux. 
+ */ + + bzero(&ifi, sizeof (ifi)); + ifi.lxnl_ifi_family = AF_UNSPEC; + ifi.lxnl_ifi_change = (uint32_t)-1; + + /* Convert the name to be Linux-friendly */ + (void) strlcpy(if_name, lifr->lifr_name, IFNAMSIZ); + lx_ifname_convert(if_name, LX_IF_FROMNATIVE); + is_loopback = (strncmp(if_name, "lo", 2) == 0); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0) + return; + + ifi.lxnl_ifi_index = lifr->lifr_index; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0) + return; + + for (i = 0; flags[i].native; i++) { + if (lifr->lifr_flags & flags[i].native) + ifi.lxnl_ifi_flags |= flags[i].lx; + } + + /* + * Query the datalink address. + * The interface type will be included in the outgoing infomsg while + * the address itself will be output separately. + */ + sdl = (struct sockaddr_dl *)&lifr->lifr_addr; + bzero(sdl, sizeof (*sdl)); + if (!is_loopback) { + (void) lx_netlink_reply_ioctl(reply, SIOCGLIFHWADDR, lifr); + } else { + /* Simulate an empty hwaddr for loopback */ + sdl->sdl_type = DL_LOOP; + sdl->sdl_alen = ETHERADDRL; + } + lx_stol_hwaddr(sdl, &hwaddr, &hwaddr_size); + + ifi.lxnl_ifi_type = hwaddr.sa_family; + lx_netlink_reply_msg(reply, &ifi, sizeof (lx_netlink_ifinfomsg_t)); + + lx_netlink_reply_attr_string(reply, LX_NETLINK_IFLA_IFNAME, if_name); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFMTU, lifr) != 0) + return; + + lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_MTU, lifr->lifr_mtu); + + if (hwaddr_size != 0) { + lx_netlink_reply_attr(reply, LX_NETLINK_IFLA_ADDRESS, + hwaddr.sa_data, hwaddr_size); + } + + /* Emulate a txqlen of 1. (0 for loopbacks) */ + lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_TXQLEN, + (is_loopback) ? 
0 : 1); + + lx_netlink_reply_send(reply); +} + +static void +lx_netlink_reply_eachfamily(lx_netlink_reply_t *reply, + void (*func)(lx_netlink_reply_t *, struct lifreq *), boolean_t distinct) +{ + lx_netlink_sock_t *sock = reply->lxnr_sock; + int nlifr, i; + + struct { + int family; + ldi_handle_t handle; + struct lifconf lifc; + struct lifnum lifn; + } families[] = { + { AF_INET, sock->lxns_iphandle }, + { AF_INET6, sock->lxns_ip6handle }, + { AF_UNSPEC } + }, *family, *check; + + for (family = families; family->family != AF_UNSPEC; family++) { + struct lifconf *lifc = &family->lifc; + struct lifnum *lifn = &family->lifn; + + lifn->lifn_family = family->family; + sock->lxns_current = family->handle; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFNUM, lifn) != 0) + break; + + lifc->lifc_family = lifn->lifn_family; + lifc->lifc_flags = 0; + lifc->lifc_len = lifn->lifn_count * sizeof (struct lifreq); + if (lifn->lifn_count == 0) { + lifc->lifc_buf = NULL; + continue; + } + lifc->lifc_buf = kmem_alloc(lifc->lifc_len, KM_SLEEP); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFCONF, lifc) != 0) + break; + + nlifr = lifc->lifc_len / sizeof (lifc->lifc_req[0]); + + for (i = 0; i < nlifr; i++) { + if (!distinct) { + func(reply, &lifc->lifc_req[i]); + continue; + } + + /* + * If we have been asked to provide each interface + * exactly once, we need to (annoyingly) check this + * name against others that we've already processed for + * other families. Yes, this is quadratic time -- but + * the number of interfaces per family is expected to + * be very small. 
+ */ + for (check = families; check != family; check++) { + struct lifconf *clifc = &check->lifc; + int cnlifr = clifc->lifc_len / + sizeof (clifc->lifc_req[0]), j; + char *nm = lifc->lifc_req[i].lifr_name, *cnm; + + for (j = 0; j < cnlifr; j++) { + cnm = clifc->lifc_req[j].lifr_name; + + if (strcmp(nm, cnm) == 0) + break; + } + + if (j != cnlifr) + break; + } + + if (check != family) + continue; + + func(reply, &lifc->lifc_req[i]); + } + } + + for (family = families; family->family != AF_UNSPEC; family++) { + struct lifconf *lifc = &family->lifc; + + if (lifc->lifc_buf != NULL) + kmem_free(lifc->lifc_buf, lifc->lifc_len); + } +} + +/*ARGSUSED*/ +static int +lx_netlink_getlink(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWLINK); + + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_eachfamily(reply, lx_netlink_getlink_lifreq, B_TRUE); + lx_netlink_reply_done(reply); + + return (0); +} + +static void +lx_netlink_getaddr_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr) +{ + lx_netlink_ifaddrmsg_t ifa; + + bzero(&ifa, sizeof (ifa)); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0) + return; + + ifa.lxnl_ifa_index = lifr->lifr_index; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0) + return; + + /* + * Don't report on-link subnets + */ + if ((lifr->lifr_flags & IFF_NOLOCAL) != 0) + return; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFSUBNET, lifr) != 0) + return; + + ifa.lxnl_ifa_prefixlen = lifr->lifr_addrlen; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFADDR, lifr) != 0) + return; + + if (lifr->lifr_addr.ss_family == AF_INET) { + struct sockaddr_in *sin; + + ifa.lxnl_ifa_family = LX_AF_INET; + + sin = (struct sockaddr_in *)&lifr->lifr_addr; + ifa.lxnl_ifa_scope = lx_ipv4_rtscope( + sin->sin_addr.s_addr); + + lx_netlink_reply_msg(reply, &ifa, + sizeof (lx_netlink_ifaddrmsg_t)); + + 
lx_netlink_reply_attr_int32(reply, + LX_NETLINK_IFA_ADDRESS, sin->sin_addr.s_addr); + } else { + struct sockaddr_in6 *sin; + + ifa.lxnl_ifa_family = LX_AF_INET6; + + sin = (struct sockaddr_in6 *)&lifr->lifr_addr; + ifa.lxnl_ifa_scope = lx_ipv6_rtscope(&sin->sin6_addr); + + lx_netlink_reply_msg(reply, &ifa, + sizeof (lx_netlink_ifaddrmsg_t)); + + lx_netlink_reply_attr(reply, LX_NETLINK_IFA_ADDRESS, + &sin->sin6_addr, sizeof (sin->sin6_addr)); + } + + lx_netlink_reply_send(reply); +} + +/*ARGSUSED*/ +static int +lx_netlink_getaddr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWADDR); + + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_eachfamily(reply, lx_netlink_getaddr_lifreq, B_FALSE); + lx_netlink_reply_done(reply); + + return (0); +} + +struct lx_getroute_ctx { + lx_netlink_reply_t *lgrtctx_reply; + lx_netlink_rtmsg_t *lgrtctx_rtmsg; + lx_netlink_attr_t *lgrtctx_attrs[LX_NETLINK_MAX_RTA]; + unsigned int lgrtctx_max_attr; + lx_netlink_attr_t *lgrtctx_rtadst; +}; + +static void +lx_netlink_getroute_ipv4(ire_t *ire, struct lx_getroute_ctx *ctx) +{ + lx_netlink_reply_t *reply = ctx->lgrtctx_reply; + lx_netlink_rtmsg_t *rtmsg = ctx->lgrtctx_rtmsg; + lx_netlink_attr_t *rtadst = ctx->lgrtctx_rtadst; + lx_netlink_rtmsg_t res; + ill_t *ill = NULL; + + /* Certain IREs are too specific for netlink */ + if ((ire->ire_type & (IRE_BROADCAST | IRE_MULTICAST | IRE_NOROUTE | + IRE_LOOPBACK | IRE_LOCAL)) != 0 || ire->ire_testhidden != 0) { + return; + } + /* + * When listing routes, CLONE entries are undesired. + * They are required for 'ip route get' on a local address. 
+ */ + if (rtmsg->rtm_dst_len == 0 && (ire->ire_type & IRE_IF_CLONE) != 0) { + return; + } + + bzero(&res, sizeof (res)); + res.rtm_family = LX_AF_INET; + res.rtm_table = LX_ROUTE_TABLE_MAIN; + res.rtm_type = LX_RTN_UNICAST; + res.rtm_dst_len = ire->ire_masklen; + + if (ire->ire_type & (IRE_IF_NORESOLVER|IRE_IF_RESOLVER)) { + /* Interface-local networks considered kernel-created */ + res.rtm_protocol = LX_RTPROT_KERNEL; + res.rtm_scope = LX_RTSCOPE_LINK; + } else if (ire->ire_flags & RTF_STATIC) { + res.rtm_protocol = LX_RTPROT_STATIC; + } + + if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) { + /* + * Specify single-destination route. + * RTA_DST details will be added later. + */ + res.rtm_dst_len = rtmsg->rtm_dst_len; + } + + + lx_netlink_reply_msg(reply, &res, sizeof (res)); + + if (rtmsg->rtm_dst_len == 0x20 && rtadst != NULL) { + /* Add RTA_DST details for single-destination route. */ + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST, + LXATTR_PAYLOAD(rtadst), sizeof (ipaddr_t)); + } else if (ire->ire_masklen != 0) { + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_DST, + &ire->ire_addr, sizeof (ire->ire_addr)); + } + + if (ire->ire_ill != NULL) { + ill = ire->ire_ill; + } else if (ire->ire_dep_parent != NULL) { + ill = ire->ire_dep_parent->ire_ill; + } + + if (ill != NULL) { + uint32_t ifindex, addr_src; + + ifindex = ill->ill_phyint->phyint_ifindex; + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_OIF, + &ifindex, sizeof (ifindex)); + + addr_src = ill->ill_ipif->ipif_lcl_addr; + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_PREFSRC, + &addr_src, sizeof (addr_src)); + } + + if (ire->ire_flags & RTF_GATEWAY) { + lx_netlink_reply_attr(reply, LX_NETLINK_RTA_GATEWAY, + &ire->ire_gateway_addr, sizeof (ire->ire_gateway_addr)); + } + + lx_netlink_reply_send(reply); +} + +/*ARGSUSED*/ +static int +lx_netlink_getroute(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, + mblk_t *mp) +{ + struct lx_getroute_ctx ctx; + lx_netlink_reply_t *reply; + lx_netlink_rtmsg_t rtmsg, 
*rtmsgp; + int rtmsg_size = sizeof (rtmsg); + netstack_t *ns; + int i; + + bzero(&ctx, sizeof (ctx)); + ctx.lgrtctx_max_attr = LX_NETLINK_MAX_RTA; + + if (lx_netlink_parse_msg_attrs(mp, (void **)&rtmsgp, + rtmsg_size, ctx.lgrtctx_attrs, &ctx.lgrtctx_max_attr) != 0) { + return (EPROTO); + } + + /* + * Older versions of libnetlink send a truncated rtmsg struct for + * certain RTM_GETROUTE queries. We must detect this condition and + * truncate our input to prevent later confusion. + */ + if (curproc->p_zone->zone_brand == &lx_brand && + lx_kern_release_cmp(curproc->p_zone, "2.6.32") <= 0 && + rtmsgp->rtm_dst_len == 0) { + rtmsg_size = sizeof (rtmsg.rtm_family); + } + bzero(&rtmsg, sizeof (rtmsg)); + bcopy(rtmsgp, &rtmsg, rtmsg_size); + ctx.lgrtctx_rtmsg = &rtmsg; + + /* If RTA_DST was passed, it affects later decisions */ + for (i = 0; i < ctx.lgrtctx_max_attr; i++) { + lx_netlink_attr_t *attr = ctx.lgrtctx_attrs[i]; + + if (attr->lxna_type == LX_NETLINK_RTA_DST && + attr->lxna_len == LXATTR_LEN(sizeof (ipaddr_t))) { + ctx.lgrtctx_rtadst = attr; + break; + } + } + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWROUTE); + if (reply == NULL) { + return (ENOMEM); + } + ctx.lgrtctx_reply = reply; + + /* Do not report anything outside the main table */ + if (rtmsg.rtm_table != LX_ROUTE_TABLE_MAIN && + rtmsg.rtm_table != 0) { + lx_netlink_reply_done(reply); + return (0); + } + + ns = netstack_get_current(); + if (ns == NULL) { + lx_netlink_reply_done(reply); + return (0); + } + if (rtmsg.rtm_family == LX_AF_INET || rtmsg.rtm_family == 0) { + if (rtmsg.rtm_dst_len == 0x20 && ctx.lgrtctx_rtadst != NULL) { + /* resolve route for host */ + ipaddr_t *dst = LXATTR_PAYLOAD(ctx.lgrtctx_rtadst); + ire_t *ire_dst; + + ire_dst = ire_route_recursive_dstonly_v4(*dst, 0, 0, + ns->netstack_ip); + lx_netlink_getroute_ipv4(ire_dst, &ctx); + ire_refrele(ire_dst); + } else { + /* get route listing */ + ire_walk_v4(&lx_netlink_getroute_ipv4, &ctx, ALL_ZONES, + ns->netstack_ip); + } + 
} + if (rtmsg.rtm_family == LX_AF_INET6) { + /* punt on ipv6 for now */ + netstack_rele(ns); + lx_netlink_reply_done(reply); + return (EPROTO); + } + netstack_rele(ns); + + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Auditing callback to emit response. + */ +static void +lx_netlink_au_cb(void *r, void *b, uint_t blen) +{ + lx_netlink_reply_t *reply = (lx_netlink_reply_t *)r; + + lx_netlink_reply_msg(reply, b, blen); +} + +/* + * Audit get + */ +static int +lx_netlink_au_get(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_GET); + if (reply == NULL) + return (ENOMEM); + + lx_audit_get(reply, lx_netlink_au_cb); + lx_netlink_reply_send(reply); + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Set or clear flag indicating socket is being used to communicate with the + * user-level auditd. Also update the counter which prevents this module + * from unloading while auditing is using the socket to the auditd. 
+ */ +static void +lx_netlink_au_sock_cb(void *s, boolean_t set) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)s; + + if (set) { + lxsock->lxns_flags |= LXNLF_AUDITD; + mutex_enter(&lx_netlink_lock); + lx_netlink_audit_cnt++; + mutex_exit(&lx_netlink_lock); + } else { + lxsock->lxns_flags &= ~LXNLF_AUDITD; + mutex_enter(&lx_netlink_lock); + VERIFY(lx_netlink_audit_cnt > 0); + lx_netlink_audit_cnt--; + mutex_exit(&lx_netlink_lock); + } +} + +static int +lx_netlink_au_set(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + void *datap; + size_t datalen; + int err; + + datap = (void *)(mp->b_rptr + sizeof (lx_netlink_hdr_t)); + datalen = MBLKL(mp) - sizeof (lx_netlink_hdr_t); + + err = lx_audit_set(lxsock, datap, datalen, lx_netlink_au_sock_cb); + if (err != 0) + return (err); + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_SET); + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Audit append rule + */ +static int +lx_netlink_au_ar(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + void *datap; + size_t datalen; + int err; + + /* + * TODO: At this time, everything we support fits in a single mblk, + * but as we add additional field support, eventually we might need + * to handle an mblk chain for really long string data in the + * rulep->lxar_buf. 
+ */ + if (mp->b_cont != NULL) + return (EINVAL); + + datap = (void *)(mp->b_rptr + sizeof (lx_netlink_hdr_t)); + datalen = MBLKL(mp) - sizeof (lx_netlink_hdr_t); + + if ((err = lx_audit_append_rule(datap, datalen)) != 0) + return (err); + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_ADD_RULE); + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Audit delete rule + */ +static int +lx_netlink_au_dr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + void *datap; + size_t datalen; + int err; + + /* + * TODO: At this time, everything we support fits in a single mblk, + * but as we add additional field support, eventually we might need + * to handle an mblk chain for really long string data in the + * rulep->lxar_buf. + */ + if (mp->b_cont != NULL) + return (EINVAL); + + datap = (void *)(mp->b_rptr + sizeof (lx_netlink_hdr_t)); + datalen = MBLKL(mp) - sizeof (lx_netlink_hdr_t); + + if ((err = lx_audit_delete_rule(datap, datalen)) != 0) + return (err); + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_DEL_RULE); + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Auditing callback to emit rule list. 
+ */ +static void +lx_netlink_au_lr_cb(void *r, void *b0, uint_t b0_len, void *b1, uint_t b1_len) +{ + lx_netlink_reply_t *reply = (lx_netlink_reply_t *)r; + + lx_netlink_reply_msg(reply, b0, b0_len); + lx_netlink_reply_add(reply, b1, b1_len); + lx_netlink_reply_send(reply); +} + +/* + * Audit list rules + */ +static int +lx_netlink_au_lr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_LIST_RULES); + if (reply == NULL) + return (ENOMEM); + + lx_audit_list_rules(reply, lx_netlink_au_lr_cb); + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Audit get feature + */ +static int +lx_netlink_au_gf(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_AUDIT_GET_FEATURE); + if (reply == NULL) + return (ENOMEM); + + lx_audit_get_feature(reply, lx_netlink_au_cb); + lx_netlink_reply_send(reply); + lx_netlink_reply_done(reply); + return (0); +} + +/* + * Audit user message + * User messages are submitted as free-form messages which need to get sent + * back up to the auditd. This includes informative messages such as starting + * or stopping auditing. + */ +static int +lx_netlink_au_um(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + size_t datalen; + void *bp; + + bp = mp->b_rptr + sizeof (lx_netlink_hdr_t); + datalen = MBLKL(mp) - (sizeof (lx_netlink_hdr_t)); + + /* + * TODO: At this time, everything we support fits in a single mblk, + * but eventually we might need to handle an mblk chain for a really + * long user message. 
+ */ + if (mp->b_cont != NULL) + return (EINVAL); + + lx_audit_emit_user_msg(hdr->lxnh_type, datalen, bp); + + if (hdr->lxnh_flags & LX_NETLINK_NLM_F_ACK) { + reply = lx_netlink_reply(lxsock, hdr, hdr->lxnh_type); + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_done(reply); + } + return (0); +} + +static int +lx_netlink_au_emit_cb(void *s, uint_t type, const char *msg, uint_t size) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)s; + lx_netlink_hdr_t *hdr; + mblk_t *mp, *mp1; + int error; + uint32_t len; + + len = LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + if (msg != NULL) { + len += LXNLMSG_ALIGN(size); + if (len > lxsock->lxns_bufsize) + return (E2BIG); + } + + if ((mp = allocb(lxsock->lxns_bufsize, 0)) == NULL) { + return (ENOMEM); + } + + bzero(mp->b_rptr, lxsock->lxns_bufsize); + /* LINTED: E_BAD_PTR_CAST_ALIGN */ + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + hdr->lxnh_flags = LX_NETLINK_NLM_F_MULTI; + hdr->lxnh_len = len; + hdr->lxnh_type = (msg == NULL ? LX_NETLINK_NLMSG_DONE : type); + hdr->lxnh_seq = 0; + hdr->lxnh_pid = 0; + + mp->b_wptr += LXNLMSG_ALIGN(sizeof (lx_netlink_hdr_t)); + if (msg != NULL) { + bcopy(msg, mp->b_wptr, size); + mp->b_wptr += LXNLMSG_ALIGN(size); + } + + /* As in lx_netlink_reply_sendup, send as T_UNITDATA_IND message. */ + if ((mp1 = lx_netlink_alloc_mp1(lxsock)) == NULL) { + freeb(mp); + return (ENOMEM); + } + mp1->b_cont = mp; + + /* + * If the socket is currently flow-controlled, do not allow further + * data to be sent out. Messages of the NLMSG_DONE type, triggered by + * passing msg == NULL, are exempt from this restriction. + */ + mutex_enter(&lxsock->lxns_flowctl_mtx); + if (lxsock->lxns_flowctrled && msg != NULL) { + mutex_exit(&lxsock->lxns_flowctl_mtx); + freemsg(mp1); + return (ENOSPC); + } + + lxsock->lxns_upcalls->su_recv(lxsock->lxns_uphandle, mp1, + msgdsize(mp1), 0, &error, NULL); + + /* + * The socket indicated that it is now flow-controlled. 
That said, it + * still queued the last message, so indicate success (but track the + * flow-controlled state). + */ + if (error == ENOSPC) { + lxsock->lxns_flowctrled = B_TRUE; + lx_netlink_flowctrld++; + error = 0; + } + mutex_exit(&lxsock->lxns_flowctl_mtx); + + return (error); +} + +static int +lx_netlink_audit(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + /* + * This is paranoia, in case our socket somehow escaped the zone. + */ + if (curproc->p_zone->zone_brand != &lx_brand) + return (ECONNREFUSED); + + if (MBLKL(mp) < sizeof (lx_netlink_hdr_t)) + return (EINVAL); + + /* + * Ensure audit state is set up whenever we get an audit control msg. + * However, we skip initialization for user messages since some apps + * (e.g. systemd) blindly send audit messages, even though auditing + * is not installed or in use. Uninitialized state is handled in + * lx_audit_user_msg(). + */ + if (hdr->lxnh_type < LX_AUDIT_USER_MSG_START) + lx_audit_init(lx_netlink_au_emit_cb); + + /* + * Within Linux, when a netlink message requests an ack, the code + * first sends the ack as an error response (NLMSG_ERROR) with an + * error code of 0. + * + * TODO: this needs more work, but is unnecessary for now. 
+ * if (hdr->lxnh_flags & LX_NETLINK_NLM_F_ACK) { + * reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_NLMSG_ERROR); + * if (reply == NULL) + * return (ENOMEM); + * lx_netlink_reply_ack(reply); + * } + */ + + if (hdr->lxnh_type >= LX_AUDIT_USER_MSG_START) { + return (lx_netlink_au_um(lxsock, hdr, mp)); + } + + switch (hdr->lxnh_type) { + case LX_AUDIT_GET: + return (lx_netlink_au_get(lxsock, hdr)); + case LX_AUDIT_SET: + return (lx_netlink_au_set(lxsock, hdr, mp)); + case LX_AUDIT_ADD_RULE: + return (lx_netlink_au_ar(lxsock, hdr, mp)); + case LX_AUDIT_DEL_RULE: + return (lx_netlink_au_dr(lxsock, hdr, mp)); + case LX_AUDIT_LIST_RULES: + return (lx_netlink_au_lr(lxsock, hdr)); + case LX_AUDIT_GET_FEATURE: + return (lx_netlink_au_gf(lxsock, hdr)); + } + + /* + * For all other auditing messages (i.e. one we don't yet support), we + * return ECONNREFUSED. + */ + return (ECONNREFUSED); +} + +/*ARGSUSED*/ +static int +lx_netlink_kobject_uevent(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + /* + * For udev, we just silently accept all writes and never actually + * reply with anything -- which appears to be sufficient for things + * to work. 
+ */ + return (0); +} + +/*ARGSUSED*/ +static int +lx_netlink_send(sock_lower_handle_t handle, mblk_t *mp, + struct nmsghdr *msg, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr; + int i, rval; + + static struct { + int proto; + uint16_t type; + int (*func)(lx_netlink_sock_t *, lx_netlink_hdr_t *, mblk_t *); + } handlers[] = { + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETLINK, lx_netlink_getlink }, + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETADDR, lx_netlink_getaddr }, + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETROUTE, lx_netlink_getroute }, + { LX_NETLINK_AUDIT, + LX_NETLINK_NLMSG_NONE, lx_netlink_audit }, + { LX_NETLINK_KOBJECT_UEVENT, + LX_NETLINK_NLMSG_NONE, lx_netlink_kobject_uevent }, + { LX_NETLINK_NLMSG_NOOP, LX_NETLINK_NLMSG_NONE, NULL } + }; + + if (msg->msg_name != NULL) { + lx_netlink_sockaddr_t *lxsa = + (lx_netlink_sockaddr_t *)msg->msg_name; + + if (msg->msg_namelen < sizeof (lx_netlink_sockaddr_t) || + lxsa->lxnl_family != AF_LX_NETLINK) { + return (EINVAL); + } + + /* + * If this message is targeted beyond just the OS kernel, an + * access check must be made. + */ + if (lxsa->lxnl_port != 0 || lxsa->lxnl_groups != 0) { + int err; + char buf[LX_UNSUP_BUFSZ]; + + err = lx_netlink_access(lxsock, cr, LXNL_SENDMSG); + if (err != 0) { + return (err); + } + + /* + * Support for netlink messages beyond rtnetlink(7) is + * non-existent at this time. These messages are + * tolerated, rather than tossing a potentially fatal + * error to the application. 
+ */ + (void) snprintf(buf, LX_UNSUP_BUFSZ, + "netlink sendmsg addr port:%X groups:%08X", + lxsa->lxnl_port, lxsa->lxnl_groups); + lx_unsupported(buf); + } + } + + if (DB_TYPE(mp) != M_DATA || MBLKL(mp) < sizeof (lx_netlink_hdr_t)) { + freemsg(mp); + return (EPROTO); + } + + for (i = 0; handlers[i].func != NULL; i++) { + if (lxsock->lxns_proto != handlers[i].proto) + continue; + + if (handlers[i].type != LX_NETLINK_NLMSG_NONE && + hdr->lxnh_type != handlers[i].type) + continue; + + rval = handlers[i].func(lxsock, hdr, mp); + freemsg(mp); + + return (rval); + } + + /* + * An unrecognized message. We will bounce up an EOPNOTSUPP reply. + */ + rval = lx_netlink_reply_error(lxsock, hdr, EOPNOTSUPP); + freemsg(mp); + + return (rval); +} + +static void +lx_netlink_clr_flowctrl(sock_lower_handle_t handle) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + + mutex_enter(&lxsock->lxns_flowctl_mtx); + lxsock->lxns_flowctrled = B_FALSE; + mutex_exit(&lxsock->lxns_flowctl_mtx); +} + +/*ARGSUSED*/ +static int +lx_netlink_close(sock_lower_handle_t handle, int flags, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle, *sock, **prev; + + if (lxsock->lxns_flags & LXNLF_AUDITD) + lx_audit_stop_worker(lxsock, lx_netlink_au_sock_cb); + + mutex_enter(&lx_netlink_lock); + + prev = &lx_netlink_head; + + for (sock = *prev; sock != lxsock; sock = sock->lxns_next) + prev = &sock->lxns_next; + + *prev = sock->lxns_next; + + mutex_exit(&lx_netlink_lock); + + (void) ldi_close(lxsock->lxns_iphandle, FREAD, kcred); + (void) ldi_close(lxsock->lxns_ip6handle, FREAD, kcred); + mutex_destroy(&lxsock->lxns_flowctl_mtx); + kmem_free(lxsock, sizeof (lx_netlink_sock_t)); + + return (0); +} + +static sock_downcalls_t sock_lx_netlink_downcalls = { + lx_netlink_activate, /* sd_activate */ + sock_accept_notsupp, /* sd_accept */ + lx_netlink_bind, /* sd_bind */ + sock_listen_notsupp, /* sd_listen */ + sock_connect_notsupp, /* sd_connect */ + sock_getpeername_notsupp, /* 
sd_getpeername */ + lx_netlink_getsockname, /* sd_getsockname */ + lx_netlink_getsockopt, /* sd_getsockopt */ + lx_netlink_setsockopt, /* sd_setsockopt */ + lx_netlink_send, /* sd_send */ + NULL, /* sd_send_uio */ + NULL, /* sd_recv_uio */ + NULL, /* sd_poll */ + sock_shutdown_notsupp, /* sd_shutdown */ + lx_netlink_clr_flowctrl, /* sd_clr_flowctrl */ + sock_ioctl_notsupp, /* sd_ioctl */ + lx_netlink_close /* sd_close */ +}; + +/*ARGSUSED*/ +static sock_lower_handle_t +lx_netlink_create(int family, int type, int proto, + sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, + int flags, cred_t *credp) +{ + lx_netlink_sock_t *lxsock; + ldi_handle_t handle, handle6; + cred_t *kcred = zone_kcred(); + int err; + + if (family != AF_LX_NETLINK || + (type != SOCK_DGRAM && type != SOCK_RAW)) { + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + switch (proto) { + case LX_NETLINK_ROUTE: + case LX_NETLINK_AUDIT: + case LX_NETLINK_KOBJECT_UEVENT: + break; + + default: + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + if ((err = ldi_open_by_name(DEV_IP, FREAD, kcred, + &handle, lx_netlink_ldi)) != 0) { + *errorp = err; + return (NULL); + } + + if ((err = ldi_open_by_name(DEV_IP6, FREAD, kcred, + &handle6, lx_netlink_ldi)) != 0) { + (void) ldi_close(handle, FREAD, kcred); + *errorp = err; + return (NULL); + } + + *sock_downcalls = &sock_lx_netlink_downcalls; + *smodep = SM_ATOMIC; + + lxsock = kmem_zalloc(sizeof (lx_netlink_sock_t), KM_SLEEP); + lxsock->lxns_iphandle = handle; + lxsock->lxns_ip6handle = handle6; + lxsock->lxns_bufsize = lx_netlink_bufsize; + lxsock->lxns_proto = proto; + mutex_init(&lxsock->lxns_flowctl_mtx, NULL, MUTEX_DEFAULT, NULL); + + mutex_enter(&lx_netlink_lock); + + lxsock->lxns_next = lx_netlink_head; + lx_netlink_head = lxsock; + + mutex_exit(&lx_netlink_lock); + + return ((sock_lower_handle_t)lxsock); +} + +static void +lx_netlink_init(void) +{ + major_t major = mod_name_to_major("ip"); + int err; + + VERIFY(major != 
DDI_MAJOR_T_NONE); + + err = ldi_ident_from_major(major, &lx_netlink_ldi); + VERIFY(err == 0); +} + +static void +lx_netlink_fini(void) +{ + ldi_ident_release(lx_netlink_ldi); +} + +static smod_reg_t sinfo = { + SOCKMOD_VERSION, + "lx_netlink", + SOCK_UC_VERSION, + SOCK_DC_VERSION, + lx_netlink_create, + NULL +}; + +/* modldrv structure */ +static struct modlsockmod sockmod = { + &mod_sockmodops, "AF_LX_NETLINK socket module", &sinfo +}; + +/* modlinkage structure */ +static struct modlinkage ml = { + MODREV_1, + &sockmod, + NULL +}; + +int +_init(void) +{ + int err; + + lx_netlink_init(); + + if ((err = mod_install(&ml)) != 0) + lx_netlink_fini(); + + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&ml, modinfop)); +} + +int +_fini(void) +{ + int err = 0; + + mutex_enter(&lx_netlink_lock); + + if (lx_netlink_head != NULL || lx_netlink_audit_cnt != 0) + err = EBUSY; + + mutex_exit(&lx_netlink_lock); + + if (err == 0) { + lx_audit_cleanup(); + if ((err = mod_remove(&ml)) == 0) + lx_netlink_fini(); + } + + return (err); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.c b/usr/src/uts/common/brand/lx/io/lx_ptm.c new file mode 100644 index 0000000000..23e0c6f459 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_ptm.c @@ -0,0 +1,1188 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + + +/* + * This driver attempts to emulate some of the behaviors of + * Linux terminal devices (/dev/ptmx and /dev/pts/[0-9][0-9]*) on Solaris + * + * It does this by layering over the /dev/ptmx device and intercepting + * opens to it. + * + * This driver makes the following assumptions about the way the ptm/pts + * drivers on Solaris work: + * + * - all opens of the /dev/ptmx device node return a unique dev_t. + * + * - the dev_t minor node value for each open ptm instance corresponds + * to its associated slave terminal device number. i.e. the path to + * the slave terminal device associated with an open ptm instance + * whose dev_t minor node value is 5, is /dev/pts/5. + * + * - the ptm driver always allocates the lowest numbered slave terminal + * device possible. + */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/devops.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/kstr.h> +#include <sys/lx_ptm.h> +#include <sys/modctl.h> +#include <sys/pathname.h> +#include <sys/ptms.h> +#include <sys/ptyvar.h> +#include <sys/stat.h> +#include <sys/stropts.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/sdt.h> + +#define LP_PTM_PATH "/dev/ptmx" +#define LP_PTS_PATH "/dev/pts/" +#define LP_PTS_DRV_NAME "pts" +#define LP_PTS_USEC_DELAY (5 * 1000) /* 5 ms */ +#define LP_PTS_USEC_DELAY_MAX (5 * MILLISEC) /* 5 ms */ + +/* + * this driver is layered on top of the ptm driver.
we'd like to + * make this driver's minor name space a mirror of the ptm driver's + * namespace, but we can't actually do this. the reason is that the + * ptm driver is opened via the clone driver. therefore no minor nodes + * of the ptm driver are actually accessible via the filesystem. + * since we're not a streams device we can't be opened by the clone + * driver. therefore we need to have at least one minor node accessible + * via the filesystem so that consumers can open it. we use the device + * node with a minor number of 0 for this purpose. what this means is + * that minor node 0 can't be used to map ptm minor node 0. since this + * minor node is now reserved we need to shift our ptm minor node + * mappings by one. i.e. a ptm minor node with a value of 0 will + * correspond to our minor node with a value of 1. these mappings are + * managed with the following macros. + */ +#define DEVT_TO_INDEX(x) LX_PTM_DEV_TO_PTS(x) +#define INDEX_TO_MINOR(x) ((x) + 1) + +/* + * grow our layered handle array by the same size increment that the ptm + * driver uses to grow the pty device space - PTY_MAXDELTA + */ +#define LP_PTY_INC 128 + +/* + * lx_ptm_ops contains state information about outstanding operations on the + * underlying master terminal device. Currently we only track information + * for read operations. + * + * Note that this data has not been rolled directly into the lx_ptm_handle + * structure because we can't put mutexes or condition variables into + * the lx_ptm_handle structure. The reason is that the array of lx_ptm_handle + * structures linked to from the global lx_ptm state can be resized + * dynamically, and when it's resized, the new array is at a different + * memory location and the old array memory is discarded. Mutexes and cvs + * are accessed based off their address, so if this array was re-sized while + * there were outstanding operations on any mutexes or cvs in the array + * then the system would tip over.
In the future the lx_ptm_handle structure + * array should probably be replaced with either an array of pointers to + * lx_ptm_handle structures or some other kind of data structure containing + * pointers to lx_ptm_handle structures. Then the lx_ptm_ops structure + * could be folded directly into the lx_ptm_handle structures. (This will + * also require the definition of a new locking mechanism to protect the + * contents of lx_ptm_handle structures.) + */ +typedef struct lx_ptm_ops { + int lpo_rops; + kcondvar_t lpo_rops_cv; + kmutex_t lpo_rops_lock; +} lx_ptm_ops_t; + +/* + * Every open of the master terminal device in a zone results in a new + * lx_ptm_handle handle allocation. These handles are stored in an array + * hanging off the lx_ptm_state structure. + */ +typedef struct lx_ptm_handle { + /* Device handle to the underlying real /dev/ptmx master terminal. */ + ldi_handle_t lph_handle; + + /* Flag to indicate if TIOCPKT mode has been enabled. */ + int lph_pktio; + + /* Number of times the slave device has been opened/closed. */ + int lph_eofed; + + /* Callback handler in the ptm driver to check if slave is open. */ + ptmptsopencb_t lph_ppocb; + + /* Pointer to state for operations on underlying device. */ + lx_ptm_ops_t *lph_lpo; +} lx_ptm_handle_t; + +/* + * Global state for the lx_ptm driver. + */ +typedef struct lx_ptm_state { + /* lx_ptm device devinfo pointer */ + dev_info_t *lps_dip; + + /* LDI ident used to open underlying real /dev/ptmx master terminals. */ + ldi_ident_t lps_li; + + /* pts drivers major number */ + major_t lps_pts_major; + + /* rw lock used to manage access and growth of lps_lh_array */ + krwlock_t lps_lh_rwlock; + + /* number of elements in lps_lh_array */ + uint_t lps_lh_count; + + /* Array of handles to underlying real /dev/ptmx master terminals. */ + lx_ptm_handle_t *lps_lh_array; +} lx_ptm_state_t; + +/* Pointer to the lx_ptm global state structure. 
*/ +static lx_ptm_state_t lps; + +/* + * List of modules to be autopushed onto slave terminal devices when they + * are opened in an lx branded zone. + */ +static char *lx_pts_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +static void +lx_ptm_lh_grow(uint_t index) +{ + uint_t new_lh_count, old_lh_count; + lx_ptm_handle_t *new_lh_array, *old_lh_array; + + /* + * allocate a new array. we drop the rw lock on the array so that + * readers can still access devices in case our memory allocation + * blocks. + */ + new_lh_count = MAX(lps.lps_lh_count + LP_PTY_INC, index + 1); + new_lh_array = + kmem_zalloc(sizeof (lx_ptm_handle_t) * new_lh_count, KM_SLEEP); + + /* + * double check that we still actually need to increase the size + * of the array + */ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + if (index < lps.lps_lh_count) { + /* someone beat us to it so there's nothing more to do */ + rw_exit(&lps.lps_lh_rwlock); + kmem_free(new_lh_array, + sizeof (lx_ptm_handle_t) * new_lh_count); + return; + } + + /* copy the existing data into the new array */ + ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL)); + ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL)); + if (lps.lps_lh_count != 0) { + bcopy(lps.lps_lh_array, new_lh_array, + sizeof (lx_ptm_handle_t) * lps.lps_lh_count); + } + + /* save info on the old array */ + old_lh_array = lps.lps_lh_array; + old_lh_count = lps.lps_lh_count; + + /* install the new array */ + lps.lps_lh_array = new_lh_array; + lps.lps_lh_count = new_lh_count; + + rw_exit(&lps.lps_lh_rwlock); + + /* free the old array */ + if (old_lh_array != NULL) { + kmem_free(old_lh_array, + sizeof (lx_ptm_handle_t) * old_lh_count); + } +} + +static void +lx_ptm_lh_insert(uint_t index, ldi_handle_t lh) +{ + lx_ptm_ops_t *lpo; + + ASSERT(lh != NULL); + + /* Allocate and initialize the ops structure */ + lpo = kmem_zalloc(sizeof (lx_ptm_ops_t), KM_SLEEP); + mutex_init(&lpo->lpo_rops_lock, NULL, MUTEX_DEFAULT, NULL); + 
cv_init(&lpo->lpo_rops_cv, NULL, CV_DEFAULT, NULL); + + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + /* check if we need to grow the size of the layered handle array */ + if (index >= lps.lps_lh_count) { + rw_exit(&lps.lps_lh_rwlock); + lx_ptm_lh_grow(index); + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + } + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle == NULL); + ASSERT(lps.lps_lh_array[index].lph_pktio == 0); + ASSERT(lps.lps_lh_array[index].lph_eofed == 0); + ASSERT(lps.lps_lh_array[index].lph_lpo == NULL); + + /* insert the new handle and return */ + lps.lps_lh_array[index].lph_handle = lh; + lps.lps_lh_array[index].lph_pktio = 0; + lps.lps_lh_array[index].lph_eofed = 0; + lps.lps_lh_array[index].lph_lpo = lpo; + + rw_exit(&lps.lps_lh_rwlock); +} + +/* + * Clear the slot for index and return the ldi handle it held so that the + * caller can close it. The slot's lx_ptm_ops_t is torn down and freed here. + */ +static ldi_handle_t +lx_ptm_lh_remove(uint_t index) +{ + ldi_handle_t lh; + + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + ASSERT(lps.lps_lh_array[index].lph_lpo->lpo_rops == 0); + ASSERT(!MUTEX_HELD(&lps.lps_lh_array[index].lph_lpo->lpo_rops_lock)); + + /* destroy and free the ops structure (it carries a mutex and a cv) */ + cv_destroy(&lps.lps_lh_array[index].lph_lpo->lpo_rops_cv); + mutex_destroy(&lps.lps_lh_array[index].lph_lpo->lpo_rops_lock); + kmem_free(lps.lps_lh_array[index].lph_lpo, sizeof (lx_ptm_ops_t)); + lps.lps_lh_array[index].lph_lpo = NULL; + + /* remove the handle and return it */ + lh = lps.lps_lh_array[index].lph_handle; + lps.lps_lh_array[index].lph_handle = NULL; + lps.lps_lh_array[index].lph_pktio = 0; + lps.lps_lh_array[index].lph_eofed = 0; + rw_exit(&lps.lps_lh_rwlock); + return (lh); +} + +/* + * Copy out the ptm "is the slave open" callback saved for index. This only + * reads the slot, so a reader hold on the array lock is sufficient. + */ +static void +lx_ptm_lh_get_ppocb(uint_t index, ptmptsopencb_t *ppocb) +{ + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + *ppocb = lps.lps_lh_array[index].lph_ppocb; + rw_exit(&lps.lps_lh_rwlock); +} + +static void +lx_ptm_lh_set_ppocb(uint_t index, ptmptsopencb_t *ppocb) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); +
ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + lps.lps_lh_array[index].lph_ppocb = *ppocb; + rw_exit(&lps.lps_lh_rwlock); +} + +static ldi_handle_t +lx_ptm_lh_lookup(uint_t index) +{ + ldi_handle_t lh; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the handle */ + lh = lps.lps_lh_array[index].lph_handle; + rw_exit(&lps.lps_lh_rwlock); + return (lh); +} + +static lx_ptm_ops_t * +lx_ptm_lpo_lookup(uint_t index) +{ + lx_ptm_ops_t *lpo; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_lpo != NULL); + + /* return the handle */ + lpo = lps.lps_lh_array[index].lph_lpo; + rw_exit(&lps.lps_lh_rwlock); + return (lpo); +} + +static int +lx_ptm_lh_pktio_get(uint_t index) +{ + int pktio; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the pktio state */ + pktio = lps.lps_lh_array[index].lph_pktio; + rw_exit(&lps.lps_lh_rwlock); + return (pktio); +} + +static void +lx_ptm_lh_pktio_set(uint_t index, int pktio) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* set the pktio state */ + lps.lps_lh_array[index].lph_pktio = pktio; + rw_exit(&lps.lps_lh_rwlock); +} + +static int +lx_ptm_lh_eofed_get(uint_t index) +{ + int eofed; + + rw_enter(&lps.lps_lh_rwlock, RW_READER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* return the eofed state */ + eofed = lps.lps_lh_array[index].lph_eofed; + rw_exit(&lps.lps_lh_rwlock); + return (eofed); +} + +static void +lx_ptm_lh_eofed_set(uint_t index) +{ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle != NULL); + + /* set the eofed state */ + 
lps.lps_lh_array[index].lph_eofed++; + rw_exit(&lps.lps_lh_rwlock); +} + +static int +lx_ptm_read_start(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* Wait for other read operations to finish */ + while (lpo->lpo_rops != 0) { + if (cv_wait_sig(&lpo->lpo_rops_cv, &lpo->lpo_rops_lock) == 0) { + mutex_exit(&lpo->lpo_rops_lock); + return (-1); + } + } + + /* Start a read operation */ + VERIFY(++lpo->lpo_rops == 1); + mutex_exit(&lpo->lpo_rops_lock); + return (0); +} + +static void +lx_ptm_read_end(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* End a read operation */ + VERIFY(--lpo->lpo_rops == 0); + cv_signal(&lpo->lpo_rops_cv); + + mutex_exit(&lpo->lpo_rops_lock); +} + +static int +lx_ptm_pts_isopen(dev_t dev) +{ + ptmptsopencb_t ppocb; + + lx_ptm_lh_get_ppocb(DEVT_TO_INDEX(dev), &ppocb); + return (ppocb.ppocb_func(ppocb.ppocb_arg)); +} + +static void +lx_ptm_eof_read(ldi_handle_t lh) +{ + struct uio uio; + iovec_t iov; + char junk[1]; + + /* + * We can remove any EOF message from the head of the stream by + * doing a zero byte read from the stream. + */ + iov.iov_len = 0; + iov.iov_base = junk; + uio.uio_iovcnt = 1; + uio.uio_iov = &iov; + uio.uio_resid = iov.iov_len; + uio.uio_offset = 0; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = 0; + uio.uio_llimit = MAXOFFSET_T; + (void) ldi_read(lh, &uio, kcred); +} + +static int +lx_ptm_eof_drop_1(dev_t dev, int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, msg_size, msg_count; + + *rvalp = 0; + + /* + * Check if there is an EOF message (represented by a zero length + * data message) at the head of the stream. Note that the + * I_NREAD ioctl is a streams framework ioctl so it will succeed + * even if there have been previous write errors on this stream. 
+ */ + if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size, + FKIOCTL, kcred, &msg_count)) != 0) + return (err); + + if ((msg_count == 0) || (msg_size != 0)) { + /* No EOF message found */ + return (0); + } + + /* Record the fact that the slave device has been closed. */ + lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev)); + + /* drop the EOF */ + lx_ptm_eof_read(lh); + *rvalp = 1; + return (0); +} + +static int +lx_ptm_eof_drop(dev_t dev, int *rvalp) +{ + int rval, err; + + if (rvalp != NULL) + *rvalp = 0; + for (;;) { + if ((err = lx_ptm_eof_drop_1(dev, &rval)) != 0) + return (err); + if (rval == 0) + return (0); + if (rvalp != NULL) + *rvalp = 1; + } +} + +static int +lx_ptm_data_check(dev_t dev, int ignore_eof, int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + *rvalp = 0; + if (ignore_eof) { + int size, rval; + + if ((err = ldi_ioctl(lh, FIONREAD, (intptr_t)&size, + FKIOCTL, kcred, &rval)) != 0) + return (err); + if (size != 0) + *rvalp = 1; + } else { + int msg_size, msg_count; + + if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size, + FKIOCTL, kcred, &msg_count)) != 0) + return (err); + if (msg_count != 0) + *rvalp = 1; + } + return (0); +} + +static int +lx_ptm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int err; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ddi_create_minor_node(dip, LX_PTM_MINOR_NODE, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS) + return (DDI_FAILURE); + + err = ldi_ident_from_dip(dip, &lps.lps_li); + if (err != 0) { + ddi_remove_minor_node(dip, ddi_get_name(dip)); + return (DDI_FAILURE); + } + + lps.lps_dip = dip; + lps.lps_pts_major = ddi_name_to_major(LP_PTS_DRV_NAME); + + rw_init(&lps.lps_lh_rwlock, NULL, RW_DRIVER, NULL); + lps.lps_lh_count = 0; + lps.lps_lh_array = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_ptm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + 
ldi_ident_release(lps.lps_li); + lps.lps_dip = NULL; + + ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL)); + ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL)); + if (lps.lps_lh_array != NULL) { + kmem_free(lps.lps_lh_array, + sizeof (lx_ptm_handle_t) * lps.lps_lh_count); + lps.lps_lh_array = NULL; + lps.lps_lh_count = 0; + } + + /* release the array lock that lx_ptm_attach() initialized */ + rw_destroy(&lps.lps_lh_rwlock); + + return (DDI_SUCCESS); +} + +/* + * Open entry point: rejects FNDELAY/FNONBLOCK, then opens the underlying + * /dev/ptmx through LDI and derives the dev_t we return from its minor number. + */ +/*ARGSUSED*/ +static int +lx_ptm_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + struct strioctl iocb; + ptmptsopencb_t ppocb = { NULL, NULL }; + ldi_handle_t lh; + major_t maj, our_major = getmajor(*devp); + minor_t min, lastmin; + uint_t index, anchor = 1; + dev_t ptm_dev; + int err, rval = 0; + + /* + * Don't support the FNDELAY flag and FNONBLOCK until we either + * find a Linux app that opens /dev/ptmx with the O_NDELAY + * or O_NONBLOCK flags explicitly, or until we create test cases + * to determine how reads of master terminal devices opened with + * these flags behave in different situations on Linux. Supporting + * these flags will involve enhancing our read implementation + * and changing the way it deals with EOF notifications. + */ + if (flag & (FNDELAY | FNONBLOCK)) + return (ENOTSUP); + + /* + * we're layered on top of the ptm driver so open that driver + * first. (note that we're opening /dev/ptmx in the global + * zone, not ourselves in the lx zone.) + */ + err = ldi_open_by_name(LP_PTM_PATH, flag, credp, &lh, lps.lps_li); + if (err != 0) + return (err); + + /* get the devt returned by the ptmx open */ + err = ldi_get_dev(lh, &ptm_dev); + if (err != 0) { + (void) ldi_close(lh, flag, credp); + return (err); + } + + /* + * we're a cloning driver so here's where we'll change the devt that we + * return. the ptmx is also a cloning driver so we'll just use + * its minor number as our minor number (it already manages its + * minor name space so no reason to duplicate the effort.)
+ */ + index = getminor(ptm_dev); + *devp = makedevice(our_major, INDEX_TO_MINOR(index)); + + /* Get a callback function to query if the pts device is open. */ + iocb.ic_cmd = PTMPTSOPENCB; + iocb.ic_timout = 0; + iocb.ic_len = sizeof (ppocb); + iocb.ic_dp = (char *)&ppocb; + + err = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, kcred, &rval); + if ((err != 0) || (rval != 0)) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + ASSERT(ppocb.ppocb_func != NULL); + + /* + * now setup autopush for the terminal slave device. this is + * necessary so that when a Linux program opens the device we + * can push required strmod modules onto the stream. in Solaris + * this is normally done by the application that actually + * allocates the terminal. + */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + err = kstr_autopush(SET_AUTOPUSH, &maj, &min, &lastmin, + &anchor, lx_pts_mods); + if (err != 0 && err != EEXIST) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + + /* save off this layered handle for future accesses */ + lx_ptm_lh_insert(index, lh); + lx_ptm_lh_set_ppocb(index, &ppocb); + return (0); +} + +/*ARGSUSED*/ +static int +lx_ptm_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + ldi_handle_t lh; + major_t maj; + minor_t min, lastmin; + uint_t index; + int err; + int i; + + index = DEVT_TO_INDEX(dev); + + /* + * we must cleanup all the state associated with this major/minor + * terminal pair before actually closing the ptm master device. + * this is required because once the close of the ptm device is + * complete major/minor terminal pair is immediatly available for + * re-use in any zone. 
+ */ + + /* free up our saved reference for this layered handle */ + lh = lx_ptm_lh_remove(index); + + /* unconfigure autopush for the associated terminal slave device */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + for (i = 0; i < 5; i++) { + /* + * we loop here because we don't want to release this ptm + * node if autopush can't be disabled on the associated + * slave device because then bad things could happen if + * another brand were to get this terminal allocated + * to them. If we keep failing we eventually drive on so that + * things don't hang. + */ + err = kstr_autopush(CLR_AUTOPUSH, &maj, &min, &lastmin, + 0, NULL); + if (err == 0) + break; + + cmn_err(CE_WARN, "lx zoneid %d: error %d on kstr_autopush", + getzoneid(), err); + + /* wait one second and try again */ + delay(drv_usectohz(1000000)); + } + + err = ldi_close(lh, flag, credp); + + /* + * note that we don't have to bother with changing the permissions + * on the associated slave device here. the reason is that no one + * can actually open the device untill it's associated master + * device is re-opened, which will result in the permissions on + * it being reset. + */ + return (err); +} + +static int +lx_ptm_read_loop(dev_t dev, struct uio *uiop, cred_t *credp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, rval; + struct uio uio = *uiop; + + *loop = 0; + + /* + * Here's another way that Linux master terminals behave differently + * from Solaris master terminals. If you do a read on a Linux + * master terminal (that was opened witout NDELAY and NONBLOCK) + * who's corrosponding slave terminal is currently closed and + * has been opened and closed at least once, Linux return -1 and + * set errno to EIO where as Solaris blocks. + */ + if (lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev))) { + /* Slave has been opened and closed at least once. */ + if (lx_ptm_pts_isopen(dev) == 0) { + /* + * Slave is closed. 
Make sure that data is avaliable + * before attempting a read. + */ + if ((err = lx_ptm_data_check(dev, 0, &rval)) != 0) + return (err); + + /* If there is no data available then return. */ + if (rval == 0) + return (EIO); + } + } + + /* Actually do the read operation. */ + if ((err = ldi_read(lh, uiop, credp)) != 0) + return (err); + + /* If read returned actual data then return. */ + if (uio.uio_resid != uiop->uio_resid) + return (0); + + /* + * This was a zero byte read (ie, an EOF). This indicates + * that the slave terinal device has been closed. Record + * the fact that the slave device has been closed and retry + * the read operation. + */ + lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev)); + *loop = 1; + return (0); +} + +static int +lx_ptm_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int pktio = lx_ptm_lh_pktio_get(DEVT_TO_INDEX(dev)); + int err, loop; + struct uio uio; + struct iovec iovp; + + ASSERT(uiop->uio_iovcnt > 0); + + /* + * If packet mode has been enabled (via TIOCPKT) we need to pad + * all read requests with a leading byte that indicates any + * relevant control status information. + */ + if (pktio != 0) { + /* + * We'd like to write the control information into + * the current buffer but we can't yet. We don't + * want to modify userspace memory here only to have + * the read operation fail later. So instead + * what we'll do here is read one character from the + * beginning of the memory pointed to by the uio + * structure. This will advance the output pointer + * by one. Then when the read completes successfully + * we can update the byte that we passed over. Before + * we do the read make a copy of the current uiop and + * iovec structs so we can write to them later. + */ + uio = *uiop; + iovp = *uiop->uio_iov; + uio.uio_iov = &iovp; + + if (uwritec(uiop) == -1) + return (EFAULT); + } + + do { + /* + * Before we actually attempt a read operation we need + * to make sure there's some buffer space to actually + * read in some data. 
We do this because if we're in + * pktio mode and the caller only requested one byte, + * then we've already used up that one byte and we + * don't want to pass this read request. Doing a 0 + * byte read (unless there is a problem with the stream + * head) always returns succcess. Normally when a streams + * read returns 0 bytes we interpret that as an EOF on + * the stream (ie, the slave side has been opened and + * closed) and we ignore it and re-try the read operation. + * So if we pass on a 0 byte read here lx_ptm_read_loop() + * will tell us to loop around and we'll end up in an + * infinite loop. + */ + if (uiop->uio_resid == 0) + break; + + /* + * Serialize all reads. We need to do this so that we can + * properly emulate the behavior of master terminals on Linux. + * In reality this serializaion should not pose any kind of + * performance problem since it would be very strange to have + * multiple threads trying to read from the same master + * terminal device concurrently. + */ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_read_loop(dev, uiop, credp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + + if (pktio != 0) { + uint8_t pktio_data = TIOCPKT_DATA; + + /* + * Note that the control status information we + * pass back is faked up in the sense that we + * don't actually report any events, we always + * report a status of 0. 
+ */ + if (uiomove(&pktio_data, 1, UIO_READ, &uio) != 0) + return (EFAULT); + } + + return (0); +} + +static int +lx_ptm_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + err = ldi_write(lh, uiop, credp); + + return (err); +} + +static int +lx_ptm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + /* + * here we need to make sure that we never allow the + * I_SETSIG and I_ESETSIG ioctls to pass through. we + * do this because we can't support them. + * + * the native Solaris ptm device supports these ioctls because + * they are streams framework ioctls and all streams devices + * support them by default. these ioctls cause the current + * process to be registered with a stream and receive signals + * when certain stream events occur. + * + * a problem arises with cleanup of these registrations + * for layered drivers. + * + * normally the streams framework is notified whenever a + * process closes any reference to a stream and it goes ahead + * and cleans up these registrations. but actual device drivers + * are not notified when a process performs a close operation + * unless the process is closing the last opened reference to + * the device on the entire system. + * + * so while we could pass these ioctls on and allow processes + * to register for signal delivery, we would never receive + * any notification when those processes exit (or close a + * stream) and we wouldn't be able to unregister them. + * + * luckily these operations are streams specific and Linux + * doesn't support streams devices. so it doesn't actually + * seem like we need to support these ioctls. if it turns + * out that we do need to support them for some reason in + * the future, the current driver model will have to be + * enhanced to better support streams device layering. 
+ */ + if ((cmd == I_SETSIG) || (cmd == I_ESETSIG)) + return (EINVAL); + + /* + * here we fake up support for TIOCPKT. Linux applications expect + * /dev/ptmx to support this ioctl, but on Solaris it doesn't. + * (it is supported on older bsd style ptys.) so we'll fake + * up support for it here. + * + * the reason that this ioctl is emulated here instead of in + * userland is that this ioctl affects the results returned + * from read() operations. if this ioctl was emulated in + * userland the brand library would need to intercept all + * read operations and check to see if pktio was enabled + * for the fd being read from. since this ioctl only needs + * to be supported on the ptmx device it makes more sense + * to support it here where we can easily update the results + * returned for read() operations performed on ourselves. + */ + if (cmd == TIOCPKT) { + int pktio; + + if (ddi_copyin((void *)arg, &pktio, sizeof (pktio), + mode) != DDI_SUCCESS) + return (EFAULT); + + if (pktio == 0) + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 0); + else + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 1); + + return (0); + } + + err = ldi_ioctl(lh, cmd, arg, mode, credp, rvalp); + + /* + * On recent versions of Linux some apps issue the following ioctls to + * the master side of the ptm before opening the slave side. Because + * our streams modules (specifically ptem) aren't autopushed until the + * slave side has been opened, these ioctls will fail. To alleviate the + * issue we simply pretend that these ioctls have succeeded. + * + * We could push our own "lx_ptem" module onto the master side of the + * stream in lx_ptm_open if we need better emulation, but that would + * require an "lx_ptem" module which duplicates most of ptem. ptem + * doesn't work properly when pushed on the master side.
+ */ + if (err == EINVAL && (cmd == TIOCSWINSZ || cmd == TCSETS) && + lx_ptm_pts_isopen(dev) == 0) { + /* slave side not open, assume we need to succeed */ + DTRACE_PROBE1(lx_ptm_ioctl__override, int, cmd); + return (0); + } + + return (err); +} + +static int +lx_ptm_poll_loop(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + short reventsp2; + int err, rval; + + *loop = 0; + + /* + * If the slave device has been opened and closed at least + * once and the slave device is currently closed, then poll + * always needs to return immediately. + */ + if ((lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev)) != 0) && + (lx_ptm_pts_isopen(dev) == 0)) { + /* In this case always return POLLHUP */ + *reventsp = POLLHUP; + + /* + * Check if there really is data on the stream. + * If so set the correct return flags. + */ + if ((err = lx_ptm_data_check(dev, 1, &rval)) != 0) { + /* Something went wrong. */ + return (err); + } + if (rval != 0) + *reventsp |= (events & (POLLIN | POLLRDNORM)); + + /* + * Is the user checking for writability? Note that for ptm + * devices Linux seems to ignore the POLLWRBAND write flag. + */ + if ((events & POLLWRNORM) == 0) + return (0); + + /* + * To check if the stream is writable we have to actually + * call poll, but make sure to set anyyet to 1 to prevent + * the streams framework from setting up callbacks. + */ + if ((err = ldi_poll(lh, POLLWRNORM, 1, &reventsp2, NULL)) != 0) + return (err); + + *reventsp |= (reventsp2 & POLLWRNORM); + } else { + int lockstate; + + /* The slave device is open, do the poll */ + if ((err = ldi_poll(lh, events, anyyet, reventsp, phpp)) != 0) + return (err); + + /* + * Drop any leading EOFs on the stream. + * + * Note that we have to use pollunlock() here to avoid + * recursive mutex enters in the poll framework.
The + * reason is that if there is an EOF message on the stream + * then the act of reading from the queue to remove the + * message can cause the ptm driver's event service + * routine to be invoked, and if there is no open + * slave device then the ptm driver may generate + * error messages and put them on the stream. This + * in turn will generate a poll event and the poll + * framework will try to invoke any poll callbacks + * associated with the stream. In the process of + * doing that the poll framework will try to acquire + * locks that we are already holding. So we need to + * drop those locks here before we do our read. + */ + if (pollunlock(&lockstate) != 0) { + *reventsp = POLLNVAL; + return (0); + } + err = lx_ptm_eof_drop(dev, &rval); + pollrelock(lockstate); + if (err) + return (err); + + /* If no EOF was dropped then return */ + if (rval == 0) + return (0); + + /* + * An EOF was removed from the stream. Retry the entire + * poll operation from the top because polls on the ptm + * device should behave differently now. + */ + *loop = 1; + } + return (0); +} + +static int +lx_ptm_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int loop, err; + + do { + /* Serialize ourself wrt read operations.
*/ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_poll_loop(dev, + events, anyyet, reventsp, phpp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + return (0); +} + +static struct cb_ops lx_ptm_cb_ops = { + lx_ptm_open, /* open */ + lx_ptm_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + lx_ptm_read, /* read */ + lx_ptm_write, /* write */ + lx_ptm_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + lx_ptm_poll, /* chpoll */ + ddi_prop_op, /* prop_op */ + NULL, /* cb_str */ + D_NEW | D_MP, + CB_REV, + NULL, + NULL +}; + +static struct dev_ops lx_ptm_ops = { + DEVO_REV, + 0, + ddi_getinfo_1to1, + nulldev, + nulldev, + lx_ptm_attach, + lx_ptm_detach, + nodev, + &lx_ptm_cb_ops, + NULL, + NULL, + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* type of module */ + "Linux master terminal driver", /* description of module */ + &lx_ptm_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.conf b/usr/src/uts/common/brand/lx/io/lx_ptm.conf new file mode 100644 index 0000000000..481b4e3c74 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_ptm.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_ptm" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/os/lx_acct.c b/usr/src/uts/common/brand/lx/os/lx_acct.c new file mode 100644 index 0000000000..7f38a240ab --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_acct.c @@ -0,0 +1,198 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/acct.h> +#include <sys/proc.h> +#include <sys/user.h> +#include <sys/cred.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/session.h> +#include <sys/wait.h> +#include <sys/ddi.h> +#include <sys/zone.h> +#include <sys/lx_types.h> + +/* + * Based on the Linux acct(5) man page, their comp_t definition is the same + * as ours. lxac_etime is encoded as a float for v3 accounting records. + */ + +#define LX_ACCT_VERSION 3 + +/* + * Bit flags in lxac_flag. The Linux AFORK and ASU match native. The rest of + * the flags diverge. 
+ */ +#define LX_AFORK 0x01 /* executed fork, but no exec */ +#define LX_ASU 0x02 /* used superuser privileges */ +#define LX_ACORE 0x08 /* dumped core */ +#define LX_AXSIG 0x10 /* killed by a signal */ + +typedef struct lx_acct { + char lxac_flag; + char lxac_version; + uint16_t lxac_tty; + uint32_t lxac_exitcode; + uint32_t lxac_uid; + uint32_t lxac_gid; + uint32_t lxac_pid; + uint32_t lxac_ppid; + uint32_t lxac_btime; /* seconds since the epoch */ + uint32_t lxac_etime; /* float representation of ticks */ + comp_t lxac_utime; + comp_t lxac_stime; + comp_t lxac_mem; /* kb */ + comp_t lxac_io; /* unused */ + comp_t lxac_rw; /* unused */ + comp_t lxac_minflt; + comp_t lxac_majflt; + comp_t lxac_swaps; /* unused */ + char lxac_comm[16]; +} lx_acct_t; + +/* + * Same functionality as acct_compress(). Produce a pseudo-floating point + * representation with 3 bits base-8 exponent, 13 bits fraction. + */ +static comp_t +lx_acct_compt(ulong_t t) +{ + int exp = 0, round = 0; + + while (t >= 8192) { + exp++; + round = t & 04; + t >>= 3; + } + if (round) { + t++; + if (t >= 8192) { + t >>= 3; + exp++; + } + } +#ifdef _LP64 + if (exp > 7) { + /* prevent wraparound */ + t = 8191; + exp = 7; + } +#endif + return ((exp << 13) + t); +} + +/* + * 32-bit IEEE float encoding as-per Linux. + */ +static uint32_t +lx_acct_float(int64_t t) +{ + uint32_t val, exp = 190; + + if (t == 0) + return (0); + + while (t > 0) { + t <<= 1; + exp--; + } + val = (uint32_t)(t >> 40) & 0x7fffffu; + + return (val | (exp << 23)); +} + +/* + * Write a Linux-formatted record to the accounting file. 
+ */ +void +lx_acct_out(vnode_t *vp, int exit_status) +{ + struct proc *p; + user_t *ua; + struct cred *cr; + dev_t d; + pid_t pid, ppid; + struct vattr va; + ssize_t resid = 0; + int err; + lx_acct_t a; + + p = curproc; + ua = PTOU(p); + cr = CRED(); + + bzero(&a, sizeof (a)); + + a.lxac_flag = ua->u_acflag & (LX_AFORK | LX_ASU); + a.lxac_version = LX_ACCT_VERSION; + d = cttydev(p); + a.lxac_tty = LX_MAKEDEVICE(getmajor(d), getminor(d)); + if (WIFEXITED(exit_status)) { + a.lxac_exitcode = WEXITSTATUS(exit_status); + } else if (WIFSIGNALED(exit_status)) { + a.lxac_flag |= LX_AXSIG; + if (WCOREDUMP(exit_status)) { + a.lxac_flag |= LX_ACORE; + } + } + a.lxac_uid = crgetruid(cr); + a.lxac_gid = crgetrgid(cr); + pid = p->p_pid; + ppid = p->p_ppid; + /* Perform pid translation ala lxpr_fixpid(). */ + if (pid == curzone->zone_proc_initpid) { + pid = 1; + ppid = 0; + } else { + if (ppid == curzone->zone_proc_initpid) { + ppid = 1; + } else if (ppid == curzone->zone_zsched->p_pid || + (p->p_flag & SZONETOP) != 0) { + ppid = 1; + } + } + a.lxac_pid = pid; + a.lxac_ppid = ppid; + a.lxac_btime = ua->u_start.tv_sec; + /* For Linux v3 accounting record, this is an encoded float. */ + a.lxac_etime = lx_acct_float(ddi_get_lbolt() - ua->u_ticks); + a.lxac_utime = lx_acct_compt(NSEC_TO_TICK(p->p_acct[LMS_USER])); + a.lxac_stime = lx_acct_compt( + NSEC_TO_TICK(p->p_acct[LMS_SYSTEM] + p->p_acct[LMS_TRAP])); + a.lxac_mem = lx_acct_compt((ulong_t)(ptob(ua->u_mem) / 1024)); + /* a.lxac_io unused */ + /* a.lxac_rw unused */ + a.lxac_minflt = lx_acct_compt((ulong_t)p->p_ru.minflt); + a.lxac_majflt = lx_acct_compt((ulong_t)p->p_ru.majflt); + /* a.lxac_swaps unused */ + bcopy(ua->u_comm, a.lxac_comm, sizeof (a.lxac_comm)); + + /* + * As with the native acct() handling, we save the size so that if the + * write fails, we can reset the size to avoid corrupting the accounting + * file. 
+ */ + va.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &va, 0, kcred, NULL) == 0) { + err = vn_rdwr(UIO_WRITE, vp, (caddr_t)&a, sizeof (a), 0LL, + UIO_SYSSPACE, FAPPEND, (rlim64_t)MAXOFF_T, kcred, &resid); + if (err != 0 || resid != 0) + (void) VOP_SETATTR(vp, &va, 0, kcred, NULL); + } +} diff --git a/usr/src/uts/common/brand/lx/os/lx_acl.c b/usr/src/uts/common/brand/lx/os/lx_acl.c new file mode 100644 index 0000000000..184f05b6ed --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_acl.c @@ -0,0 +1,213 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/sunddi.h> +#include <sys/pathname.h> +#include <sys/acl.h> +#include <acl/acl_common.h> +#include <sys/lx_acl.h> + + +typedef struct { + uint16_t lpaxe_tag; + uint16_t lpaxe_perm; + uint32_t lpaxe_id; +} lx_posix_acl_xattr_entry_t; + +typedef struct { + uint32_t lpaxh_version; + lx_posix_acl_xattr_entry_t lpaxh_entries[]; +} lx_posix_acl_xattr_header_t; + +#define LX_POSIX_ACL_XATTR_VERSION 0x0002 + +/* e_tag entry in struct posix_acl_entry */ +#define LX_ACL_USER_OBJ 0x01 /* USER_OBJ */ +#define LX_ACL_USER 0x02 /* USER */ +#define LX_ACL_GROUP_OBJ 0x04 /* GROUP_OBJ */ +#define LX_ACL_GROUP 0x08 /* GROUP */ +#define LX_ACL_MASK 0x10 /* CLASS_OBJ */ +#define LX_ACL_OTHER 0x20 /* OTHER_OBJ */ + + +static int +lx_acl_from_xattr(enum lx_acl_type atype, void *xattr, uint_t xlen, + acl_t **aclpp) +{ + lx_posix_acl_xattr_header_t *head = xattr; + lx_posix_acl_xattr_entry_t *entry; + int err = 0; + 
uint_t count, sz = xlen; + const uint_t mask = (atype == LX_ACL_DEFAULT) ? ACL_DEFAULT : 0; + acl_t *acl; + aclent_t *acle; + + if (xattr == NULL) { + /* Handle zero-length set operations */ + acl = acl_alloc(ACLENT_T); + *aclpp = acl; + return (0); + } + + if (xlen < sizeof (*head)) { + return (EINVAL); + } else if (head->lpaxh_version != LX_POSIX_ACL_XATTR_VERSION) { + return (EOPNOTSUPP); + } + + sz -= sizeof (lx_posix_acl_xattr_header_t); + if (sz % sizeof (lx_posix_acl_xattr_entry_t) != 0) { + return (EINVAL); + } + count = sz / sizeof (lx_posix_acl_xattr_entry_t); + + acl = acl_alloc(ACLENT_T); + if (count == 0) { + *aclpp = acl; + return (0); + } + + acle = kmem_alloc(count * sizeof (aclent_t), KM_SLEEP); + acl->acl_cnt = count; + acl->acl_aclp = acle; + entry = head->lpaxh_entries; + for (uint_t i = 0; i < count && err == 0; i++, entry++, acle++) { + switch (entry->lpaxe_tag) { + case LX_ACL_USER_OBJ: + case LX_ACL_GROUP_OBJ: + case LX_ACL_OTHER: + case LX_ACL_MASK: + break; + case LX_ACL_USER: + case LX_ACL_GROUP: + if (entry->lpaxe_id > MAXUID) { + err = EINVAL; + } + break; + default: + err = EINVAL; + break; + } + acle->a_id = entry->lpaxe_id | mask; + acle->a_type = entry->lpaxe_tag; + acle->a_perm = entry->lpaxe_perm; + } + if (err != 0) { + acl_free(acl); + return (err); + } + + *aclpp = acl; + return (0); +} + +/* ARGSUSED */ +int +lx_acl_setxattr(vnode_t *vp, enum lx_acl_type atype, void *data, size_t len) +{ + const boolean_t is_dir = (vp->v_type == VDIR); + acl_t *acl = NULL; + cred_t *cr = CRED(); + int err; + + if (vp->v_type == VLNK) { + return (ENOTSUP); + } else if (atype == LX_ACL_DEFAULT && !is_dir) { + return (EACCES); + } + + /* + * Copyin and verify the input, even through there is little to be done + * with the result. 
+ */ + if ((err = lx_acl_from_xattr(atype, data, len, &acl)) != 0) { + return (err); + } + + /* + * Because systemd has decided to scope-creep its way into a position + * of moribund domination over all things system software, there exist + * work-arounds which are required to address its numerous bugs and + * shortcomings. One such case involves the FreeIPA installer needing + * to perform setfacl(3) on /run/systemd/ask-password. + * + * Between the fact that meaningful ACL translation can be challenging + * and that the path in question resides on tmpfs (which doesn't yet + * support ACLs at all on illumos), faked success is the only palatable + * course of action for now. Atonement will follow. + * + * See also: https://bugzilla.redhat.com/show_bug.cgi?id=1322167 + */ + err = ENOTSUP; + if (crgetuid(cr) == 0) { + char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if (vnodetopath(NULL, vp, path, MAXPATHLEN, cr) == 0 && + strncmp(path, "/run/systemd/", 13) == 0) { + /* Saccharin-sweet fake success */ + err = 0; + } + kmem_free(path, MAXPATHLEN); + } + acl_free(acl); + + return (err); +} + +/* ARGSUSED */ +int +lx_acl_getxattr(vnode_t *vp, enum lx_acl_type atype, void *data, size_t slen, + ssize_t *solen) +{ + const boolean_t is_dir = (vp->v_type == VDIR); + vsecattr_t vsattr; + int err; + + if (vp->v_type == VLNK) { + return (ENOTSUP); + } else if (atype == LX_ACL_DEFAULT && !is_dir) { + return (ENODATA); + } + + bzero(&vsattr, sizeof (vsattr)); + vsattr.vsa_mask = VSA_ACECNT; + if ((err = VOP_GETSECATTR(vp, &vsattr, 0, CRED(), NULL)) != 0) { + err = (err == ENOENT) ? 
ENODATA : err; + return (err); + } + + if (vsattr.vsa_aclentp != NULL) + kmem_free(vsattr.vsa_aclentp, vsattr.vsa_aclentsz); + + return (ENODATA); +} + +/* ARGSUSED */ +int +lx_acl_removexattr(vnode_t *vp, enum lx_acl_type atype) +{ + return (ENODATA); +} + +/* ARGSUSED */ +int +lx_acl_listxattr(vnode_t *vp, uio_t *uio) +{ + return (0); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_audit.c b/usr/src/uts/common/brand/lx/os/lx_audit.c new file mode 100644 index 0000000000..6e522e6d8d --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_audit.c @@ -0,0 +1,1604 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * The Linux auditing system provides a fairly complex rule-based syntax + * for configuring what actions are to be audited. The user-level details + * are generally described in the Linux audit.rules(7), auditctl(8), and + * auditd(8) man pages. The user/kernel netlink API does not seem to be + * documented. The Linux kernel source and the user-level auditd source must + * be used to understand the interface we have to emulate. The relevant Linux + * source files are: + * include/uapi/linux/audit.h + * include/linux/audit.h + * kernel/audit.c + * + * The lx_netlink module implements the API used for getting or changing the + * audit configuration. For rule-oriented operations (list, append, delete), + * an lx_audit_rule_t structure (or sequence when listing) is passed in/out of + * the kernel. The netlink code calls into the lx_audit_append_rule or + * lx_audit_delete_rule functions here to perform the relevant operation. 
+ * Within the lx_audit_rule_t structure, each member has the following + * meaning: + * lxar_flag: corresponds to user-level list (e.g. "exit" for syscall return) + * lxar_action: user-level action (e.g. "always") + * lxar_fld_cnt: number of fields specified in lxar_fields, lxar_values, and + * lxar_fld_flag arrays + * lxar_mask: syscall number bitmask the rule applies to (bit position in + * the array corresponds to the syscall number) + * lxar_fields: array of fields in the rule (i.e. each -F on user-level rule). + * A numeric code (e.g. LX_RF_AUDIT_ARCH) is assigned to each + * possible field. + * lxar_values: array of numeric field values (e.g. the internal b64 value on + * the -F AUDIT_ARCH=b64 rule) + * lxar_fld_flag: array of field operators (e.g. the '=' operator on the + * -F AUDIT_ARCH=b64 rule) + * lxar_buflen: length of the buffer data immediately following + * lxar_buf: A variable amount of additional field string data. Non-numeric + * field values are passed here. For example, the string associated + * with the '-F key=...' or '-F path=...' rules. For string values, + * the corresponding lxar_values entry is the length of the string. + * The strings in lxar_buf are not C strings because they are not + * NULL terminated. The character data is pulled out of lxar_buf + * in chunks specified by the value and the pointer into the buf + * is advanced accordingly. + * + * There are two primary kinds of actions which we are currently interested in + * auditing; + * 1) system call return + * this corresponds to user-level "exit" rule actions + * 2) file system related actions + * this corresponds to user-level file system watch rules (-w) + * + * Only system call return is currently implemented, and only a very limited + * subset of all of the possible rule selection behavior. + * + * The Linux audit rule syntax defines that all selection criteria within a + * rule is ANDed together before an audit record is created.
However, multiple + * rules can be defined for a specific syscall. For example, this user-level + * syntax defines two different rules for the "open" syscall: + * -a always,exit -F arch=b64 -S open -F auid>=1000 -F key=user-open + * -a always,exit -F arch=b64 -S open -F auid=0 -F key=priv-open + * The first rule would cause an audit record to be created when an "open" + * syscall returns and the syscall was performed by a process with a + * loginuid >= 1000. The key added to that audit record would be "user-open". + * The second rule would create an audit record if the loginuid was 0 and the + * record's key would be "priv-open". + * + * When auditing is enabled for a syscall return, we have to look at multiple + * rules and create an audit record for each rule that matches the selection + * criteria. + * + * Although the current implementation is limited, the overall structure is + * designed to be enhanced as more auditing support is added over time. + * + * By default, auditing is not enabled for a zone and no internal audit data + * exists. When the first netlink audit msg is received, the zone's audit state + * (lx_audit_state_t) is allocated (via lx_audit_init) and attached to the + * zone's lx brand-specific data (lxzd_audit_state). Once allocated, the audit + * data will persist until the zone halts. + * + * Audit records are enqueued onto the lxast_ev_queue and a worker thread + * (lx_audit_worker) is responsible for dequeueing the audit records and + * sending them up to the user-level auditd. + * + * Audit rules are stored in the lxast_rules list. This is an internal list + * consisting of elements of type lx_audit_rule_ent_t. Each element contains + * the input rule (lxare_rule) along with some additional data parsed out of + * the rule when it is appended (currently only the arch and key). 
+ * + * When auditing is enabled for a syscall, the appropriate entry in the + * lxast_sys64_rulep (or lxast_sys32_rulep) array will point to the first + * rule that is applicable to the syscall. When that syscall returns, rule + * matching proceeds from that rule to the end of the rule list. + * + * New rules are always appended at the end of the list and Linux expects that + * rules are matched in order. + * + * If the rule list ever gets large enough that a linear search, anchored off + * the syscall pointer, becomes a performance bottleneck, then we'll have to + * explore alternate implementations. However, use of auditing is not that + * common to begin with, and most syscalls are typically not audited, so as + * long as the number of rules is in the order of tens, then the current + * implementation should be fine. + * + * When a rule is deleted, all associated syscall entries (lxast_sys64_rulep or + * lxast_sys32_rulep) are cleared, then the rule list is searched to see if + * there are any remaining rules which are applicable to the syscall(s). If so, + * pointers are reestablished in the relevant lxast_sys64_rulep (or 32) array. 
+ */ + +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/ddi.h> +#include <sys/zone.h> +#include <sys/strsubr.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sunddi.h> +#include <sys/strsun.h> +#include <sys/tihdr.h> +#include <sys/sockio.h> +#include <sys/brand.h> +#include <sys/debug.h> +#include <sys/ucred.h> +#include <sys/session.h> +#include <sys/lx_types.h> +#include <sys/lx_audit.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_socket.h> +#include <sys/bitmap.h> +#include <sockcommon.h> + +#define LX_AUDIT_FEATURE_VERSION 1 + +/* + * Audit status mask values (lxas_mask in structure defined below) + * See Linux include/uapi/linux/audit.h + */ +#define LX_AUDIT_STATUS_ENABLED 0x001 +#define LX_AUDIT_STATUS_FAILURE 0x002 +#define LX_AUDIT_STATUS_PID 0x004 +#define LX_AUDIT_STATUS_RATE_LIMIT 0x008 +#define LX_AUDIT_STATUS_BACKLOG_LIMIT 0x010 +#define LX_AUDIT_STATUS_BACKLOG_WAIT_TIME 0x020 +#define LX_AUDIT_STATUS_LOST 0x040 + +/* + * Audit features + * See Linux include/uapi/linux/audit.h + */ +#define LX_AUDIT_F_BACKLOG_LIMIT 0x001 +#define LX_AUDIT_F_BACKLOG_WAIT_TIME 0x002 +#define LX_AUDIT_F_EXECUTABLE_PATH 0x004 +#define LX_AUDIT_F_EXCLUDE_EXTEND 0x008 +#define LX_AUDIT_F_SESSIONID_FILTER 0x010 +#define LX_AUDIT_F_LOST_RESET 0x020 +#define LX_AUDIT_F_FILTER_FS 0x040 + +#define LX_AUDIT_FEATURE_ALL (LX_AUDIT_F_BACKLOG_LIMIT | \ + LX_AUDIT_F_BACKLOG_WAIT_TIME | LX_AUDIT_F_EXECUTABLE_PATH | \ + LX_AUDIT_F_EXCLUDE_EXTEND | LX_AUDIT_F_SESSIONID_FILTER | \ + LX_AUDIT_F_LOST_RESET | LX_AUDIT_F_FILTER_FS) + + +/* Audit events */ +#define LX_AUDIT_SYSCALL 1300 /* syscall */ +#define LX_AUDIT_PATH 1302 /* file path */ +#define LX_AUDIT_CONFIG_CHANGE 1305 /* configuration change */ +#define LX_AUDIT_CWD 1307 /* current working directory */ +#define LX_AUDIT_EXECVE 1309 /* exec args */ +#define LX_AUDIT_EOE 1320 /* end of multi-record event */ + +#define 
LX_AUDIT_BITMASK_SIZE 64 +#define LX_AUDIT_MAX_KEY_LEN 256 + +/* Audit rule filter type */ +#define LX_AUDIT_FILTER_USER 0 /* user generated msgs */ +#define LX_AUDIT_FILTER_TASK 1 /* task creation */ +#define LX_AUDIT_FILTER_ENTRY 2 /* syscall entry - obsolete */ +#define LX_AUDIT_FILTER_WATCH 3 /* fs watch */ +#define LX_AUDIT_FILTER_EXIT 4 /* syscall return */ +#define LX_AUDIT_FILTER_TYPE 5 /* audit log start */ +#define LX_AUDIT_FILTER_FS 6 /* audit inode child */ + +/* Audit rule action type */ +#define LX_AUDIT_ACT_NEVER 0 +#define LX_AUDIT_ACT_POSSIBLE 1 +#define LX_AUDIT_ACT_ALWAYS 2 /* the common case */ + +#define LX_AUDIT_RULE_MAX_FIELDS 64 + +/* Linux defaults */ +#define LX_AUDIT_DEF_BACKLOG_LIMIT 64 +#define LX_AUDIT_DEF_WAIT_TIME (60 * HZ_TO_LX_USERHZ(hz)) + +/* + * Audit rule field types + * Linux defines a lot of Rule Field values in include/uapi/linux/audit.h. + * We currently only handle a few. + */ +#define LX_RF_AUDIT_LOGINUID 9 /* e.g. auid>=1000 */ +#define LX_RF_AUDIT_ARCH 11 /* e.g. -F arch=b64 */ +#define LX_RF_AUDIT_WATCH 105 /* user-level -w rule */ +#define LX_RF_AUDIT_PERM 106 /* user-level -p option */ +#define LX_RF_AUDIT_FILTERKEY 210 /* user-level -k key option */ + +/* + * Audit rule field operators + * Linux defines the operator values in include/uapi/linux/audit.h. + * These 4 bits are combined in various ways for additional operators. + */ +#define LX_OF_AUDIT_BM 0x08000000 /* bit mask (&) */ +#define LX_OF_AUDIT_LT 0x10000000 +#define LX_OF_AUDIT_GT 0x20000000 +#define LX_OF_AUDIT_EQ 0x40000000 +#define LX_OF_AUDIT_NE (LX_OF_AUDIT_LT | LX_OF_AUDIT_GT) +#define LX_OF_AUDIT_BT (LX_OF_AUDIT_BM | LX_OF_AUDIT_EQ) /* bit test (&=) */ +#define LX_OF_AUDIT_LE (LX_OF_AUDIT_LT | LX_OF_AUDIT_EQ) +#define LX_OF_AUDIT_GE (LX_OF_AUDIT_GT | LX_OF_AUDIT_EQ) +#define LX_OF_AUDIT_ALL (LX_OF_AUDIT_EQ | LX_OF_AUDIT_NE | LX_OF_AUDIT_BM) + +/* + * Audit rule arch specification + * See Linux EM_X86_64 and EM_386 defs. 
+ * -F arch=b64 looks like: 0xc000003e + * -F arch=b32 looks like: 0x40000003 + * If no arch is specified (possible with '-S syslog', '-S all', or '-w <file>') + * the rule applies to both architectures and LX_RF_AUDIT_ARCH is not passed. + */ +#define LX_AUDIT_ARCH64 0xc000003e +#define LX_AUDIT_ARCH32 0x40000003 + +/* + * See Linux include/uapi/linux/audit.h, AUDIT_MESSAGE_TEXT_MAX is 8560. + * The auditd src has MAX_AUDIT_MESSAGE_LENGTH as 8970. + * Until necessary, we'll limit ourselves to a smaller length. + */ +#define LX_AUDIT_MESSAGE_TEXT_MAX 1024 + +typedef struct lx_audit_features { + uint32_t lxaf_version; + uint32_t lxaf_mask; + uint32_t lxaf_features; + uint32_t lxaf_lock; +} lx_audit_features_t; + +typedef struct lx_audit_status { + uint32_t lxas_mask; + uint32_t lxas_enabled; + uint32_t lxas_failure; + uint32_t lxas_pid; + uint32_t lxas_rate_limit; + uint32_t lxas_backlog_limit; + uint32_t lxas_lost; + uint32_t lxas_backlog; + /* LINTED: E_ANONYMOUS_UNION_DECL */ + union { + uint32_t lxas_version; + uint32_t lxas_feature_bitmap; + }; + uint32_t lxas_backlog_wait_time; +} lx_audit_status_t; + +typedef struct lx_audit_rule { + uint32_t lxar_flag; + uint32_t lxar_action; + uint32_t lxar_fld_cnt; + uint32_t lxar_mask[LX_AUDIT_BITMASK_SIZE]; + uint32_t lxar_fields[LX_AUDIT_RULE_MAX_FIELDS]; + uint32_t lxar_values[LX_AUDIT_RULE_MAX_FIELDS]; + uint32_t lxar_fld_flag[LX_AUDIT_RULE_MAX_FIELDS]; + uint32_t lxar_buflen; + /* LINTED: E_ZERO_OR_NEGATIVE_SUBSCRIPT */ + char lxar_buf[0]; +} lx_audit_rule_t; + +/* + * Internal structure for an audit rule. + * Each rule is on the zone's top-level list of all rules (lxast_rules). + * This structure also holds the parsed character string fields from the + * original input rule (lxar_buf) so that we don't need to re-parse that + * data on every match. 
+ */ +typedef struct lx_audit_rule_ent { + list_node_t lxare_link; + lx_audit_rule_t lxare_rule; + char *lxare_buf; + boolean_t lxare_is32bit; + boolean_t lxare_is64bit; + char *lxare_key; +} lx_audit_rule_ent_t; + +typedef enum lx_audit_fail { + LXAE_SILENT, + LXAE_PRINT, /* default */ + LXAE_PANIC /* reboot the zone */ +} lx_audit_fail_t; + +typedef struct lx_audit_record { + list_node_t lxar_link; + uint32_t lxar_type; + char *lxar_msg; +} lx_audit_record_t; + +/* + * Per-zone audit state + * Lazy allocated when first needed. + * + * lxast_rate_limit + * Currently unused, but can be get/set. Linux default is 0. + * lxast_backlog_limit + * The maximum number of outstanding audit events allowed (the Linux kernel + * default is 64). If the limit is reached, lxast_failure determines what + * to do. + * lxast_backlog_wait_time + * Currently unused, but can be get/set. Linux default is 60HZ. + */ +typedef struct lx_audit_state { + lx_audit_fail_t lxast_failure; /* failure behavior */ + uint32_t lxast_rate_limit; + uint32_t lxast_backlog_limit; + uint32_t lxast_backlog_wait_time; + lx_audit_rule_ent_t *lxast_sys32_rulep[LX_NSYSCALLS]; + lx_audit_rule_ent_t *lxast_sys64_rulep[LX_NSYSCALLS]; + kcondvar_t lxast_worker_cv; + kmutex_t lxast_lock; /* protects members below */ + pid_t lxast_pid; /* auditd pid */ + uint64_t lxast_seq; /* event sequence num */ + uint32_t lxast_backlog; /* num of queued events */ + uint32_t lxast_lost; /* num of lost events */ + void *lxast_sock; /* auditd lx_netlink_sock_t */ + boolean_t lxast_exit; /* taskq worker should quit */ + boolean_t lxast_panicing; /* audit forcing reboot? */ + kthread_t *lxast_worker; + list_t lxast_ev_queue; /* audit record queue */ + list_t lxast_rules; /* the list of rules */ +} lx_audit_state_t; + +/* + * Function pointer to netlink function used by audit worker threads to send + * audit messages up to the user-level auditd. 
+ */ +static int (*lx_audit_emit_msg)(void *, uint_t, const char *, uint_t); +static kmutex_t lx_audit_em_lock; /* protects emit_msg above */ + +/* From uts/common/brand/lx/syscall/lx_socket.c */ +extern long lx_socket(int, int, int); +/* From uts/common/syscall/close.c */ +extern int close(int); + +static int +lx_audit_emit_syscall_event(uint_t mtype, void *lxsock, const char *msg) +{ + int err; + + err = lx_audit_emit_msg(lxsock, mtype, msg, LX_AUDIT_MESSAGE_TEXT_MAX); + if (err != 0) + return (err); + err = lx_audit_emit_msg(lxsock, 0, NULL, 0); + return (err); +} + +/* + * Worker thread for audit record output up to user-level auditd. + */ +static void +lx_audit_worker(void *a) +{ + lx_audit_state_t *asp = (lx_audit_state_t *)a; + lx_audit_record_t *rp; + int err; + + VERIFY(asp != NULL); + + mutex_enter(&asp->lxast_lock); + + while (!asp->lxast_exit) { + + if (asp->lxast_backlog == 0 || asp->lxast_sock == NULL || + asp->lxast_pid == 0) { + cv_wait(&asp->lxast_worker_cv, &asp->lxast_lock); + continue; + } + + rp = list_remove_head(&asp->lxast_ev_queue); + asp->lxast_backlog--; + + err = lx_audit_emit_syscall_event(rp->lxar_type, + asp->lxast_sock, rp->lxar_msg); + if (err != ENOMEM && err != ENOSPC) { + kmem_free(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX); + kmem_free(rp, sizeof (lx_audit_record_t)); + } else { + /* + * Put it back on the list, drop the mutex so that + * any other audit-related action could occur (such as + * socket deletion), then wait briefly before retry. 
+ */ + list_insert_head(&asp->lxast_ev_queue, rp); + asp->lxast_backlog++; + mutex_exit(&asp->lxast_lock); + /* wait 1/10th second and try again */ + delay(drv_usectohz(100000)); + mutex_enter(&asp->lxast_lock); + } + } + + /* Leave state ready for new worker when auditing restarted */ + asp->lxast_exit = B_FALSE; + mutex_exit(&asp->lxast_lock); + + thread_exit(); +} + +static void +lx_audit_set_worker(uint32_t pid, void *lxsock, + void (*cb)(void *, boolean_t)) +{ + lx_audit_state_t *asp = ztolxzd(curzone)->lxzd_audit_state; + + ASSERT(asp != NULL); + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + /* First, stop any existing worker thread */ + while (asp->lxast_sock != NULL) { + mutex_exit(&asp->lxast_lock); + lx_audit_stop_worker(NULL, cb); + mutex_enter(&asp->lxast_lock); + /* unlikely we loop, but handle racing setters */ + } + + VERIFY(asp->lxast_pid == 0); + VERIFY(asp->lxast_sock == NULL); + VERIFY(asp->lxast_exit == B_FALSE); + VERIFY(asp->lxast_worker == NULL); + if (pid != 0) { + /* Start a worker with the new socket */ + asp->lxast_sock = lxsock; + cb(asp->lxast_sock, B_TRUE); + asp->lxast_pid = pid; + asp->lxast_worker = thread_create(NULL, 0, lx_audit_worker, + asp, 0, curzone->zone_zsched, TS_RUN, minclsyspri); + } +} + +static boolean_t +lx_audit_match_val(uint32_t op, uint32_t ruleval, uint32_t curval) +{ + switch (op) { + case LX_OF_AUDIT_LT: + return (curval < ruleval); + case LX_OF_AUDIT_GT: + return (curval > ruleval); + case LX_OF_AUDIT_EQ: + return (curval == ruleval); + case LX_OF_AUDIT_NE: + return (curval != ruleval); + case LX_OF_AUDIT_LE: + return (curval <= ruleval); + case LX_OF_AUDIT_GE: + return (curval >= ruleval); + case LX_OF_AUDIT_BM: /* bit mask - any bit is set? 
*/ + return ((curval & ruleval) != 0); + case LX_OF_AUDIT_BT: /* bit test - all bits must be set */ + return ((curval & ruleval) == ruleval); + default: + break; + } + return (B_FALSE); +} + +/* + * Per the Linux audit.rules(7) man page, a rule with an auid of -1 means the + * process does not have a loginuid. We'll use the absence of a session on the + * process to mimic this behavior. + */ +static uint32_t +lx_audit_get_auid() +{ + sess_t *s; + uint32_t v; + + /* + * A process with no session has: + * s_dev == 0xffffffffffffffff + * s_vp == NULL + * s_cred == NULL + */ + s = curproc->p_sessp; + if (s != NULL && s->s_vp != NULL) { + v = crgetsuid(CRED()); + } else { + v = UINT32_MAX; /* emulate auid of -1 */ + } + + return (v); +} + +/* + * Determine if the rule matches. + * Currently, we're actually just checking LX_RF_AUDIT_LOGINUID (-F auid) + * fields, but as we add support for additional field matching, this function + * should be enhanced. + */ +static boolean_t +lx_audit_syscall_rule_match(lx_audit_rule_ent_t *erp) +{ + uint32_t i, v; + lx_audit_rule_t *rp = &erp->lxare_rule; + + for (i = 0; i < rp->lxar_fld_cnt; i++) { + uint32_t ftype, fval, fop; + + ftype = rp->lxar_fields[i]; + if (ftype != LX_RF_AUDIT_LOGINUID) + continue; + + fop = rp->lxar_fld_flag[i]; + fval = rp->lxar_values[i]; + v = lx_audit_get_auid(); + + if (!lx_audit_match_val(fop, fval, v)) + return (B_FALSE); + } + return (B_TRUE); +} + +static int +lx_audit_write(file_t *fp, const char *msg) +{ + int fflag; + ssize_t count; + size_t nwrite = 0; + struct uio auio; + struct iovec aiov; + + count = strlen(msg); + fflag = fp->f_flag; + + aiov.iov_base = (void *) msg; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + return (lx_write_common(fp, &auio, &nwrite, B_FALSE)); +} 
+ +/* + * We first try to send the msg out to the zone's logging service, then + * fallback to the zone's console, although in practice, that is unlikely to + * be useful to most users. + */ +static void +lx_audit_log_msg(const char *msg) +{ + int fd; + struct sockaddr_un addr; + struct sonode *so; + uint_t alen; + uint_t sizediff = (sizeof (addr) - sizeof (addr.sun_path)); + file_t *fp; + int err; + vnode_t *vp; + + ttolwp(curthread)->lwp_errno = 0; + fd = lx_socket(LX_AF_UNIX, LX_SOCK_DGRAM, 0); + if (ttolwp(curthread)->lwp_errno != 0) + goto trycons; + + bzero((char *)&addr, sizeof (addr)); + addr.sun_family = AF_UNIX; + (void) strncpy(addr.sun_path, "/dev/log", sizeof (addr.sun_path) - 1); + alen = strlen(addr.sun_path) + 1 + sizediff; + + /* + * We can't use lx_connect here since that expects to be called from + * user-land, so we do the (streamlined) connect ourselves. + */ + if ((so = getsonode(fd, &err, &fp)) == NULL) { + (void) close(fd); + goto trycons; + } + + err = socket_connect(so, (struct sockaddr *)&addr, alen, fp->f_flag, + _SOCONNECT_XPG4_2, CRED()); + + if (err == 0) + err = lx_audit_write(fp, msg); + + releasef(fd); /* release getsonode hold */ + (void) close(fd); + + if (err == 0) + return; + +trycons: + /* "open" the console device */ + if (lookupnameatcred("/dev/console", UIO_SYSSPACE, FOLLOW, NULLVPP, + &vp, NULL, CRED()) != 0) + return; + + if (falloc(vp, FWRITE, &fp, &fd) != 0) { + VN_RELE(vp); + return; + } + mutex_exit(&fp->f_tlock); + setf(fd, fp); + + /* nothing left to do if console write fails */ + (void) lx_audit_write(fp, msg); + close(fd); +} + +static void +lx_audit_fail(lx_audit_state_t *asp, const char *msg) +{ + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + if (asp->lxast_failure == LXAE_PRINT || + asp->lxast_failure == LXAE_PANIC) { + /* + * Linux can ratelimit the amount of log spam here, so we'll + * do something similar, especially since this could be called + * on many syscall returns if the audit daemon is down or + * not 
consuming audit records for some other reason. + */ + if (asp->lxast_lost % 100 == 0) + lx_audit_log_msg(msg); + if (asp->lxast_failure == LXAE_PANIC && + !asp->lxast_panicing) { + /* + * Reboot the zone so that no audit records are lost. + * We delay a second to give the zone's logger a chance + * to handle the log message. We have to drop the lock + * here in case the zone's logger itself is making + * syscalls which would be audited, although that + * wouldn't be the ideal configuration. + */ + asp->lxast_panicing = B_TRUE; + mutex_exit(&asp->lxast_lock); + lx_audit_log_msg("audit: panic"); + delay(drv_usectohz(1000000)); + zone_kadmin(A_SHUTDOWN, AD_BOOT, NULL, kcred); + mutex_enter(&asp->lxast_lock); + } + } + asp->lxast_lost++; +} + +/* + * This formats the input string into a format that matches Linux. The input + * strings are small right now (<= PSARGSZ) so for simplicity we're using + * a temporary buffer of adequate size. + * + * The temporary copy is cut at the first space. If that copy contains no + * double quote it is written to dst wrapped in double quotes; otherwise the + * bytes of str are written to dst as two-digit hex values, bounded by dlen. + */ +static void +lx_audit_fmt_str(char *dst, char *str, uint_t dlen) +{ + char *sp, tmp[100]; + + (void) strlcpy(tmp, str, sizeof (tmp)); + if ((sp = strchr(tmp, ' ')) != NULL) + *sp = '\0'; + + if ((sp = strchr(tmp, '"')) == NULL) { + (void) snprintf(dst, dlen, "\"%s\"", tmp); + } else { + char *p, *dp; + uint_t olen = 0; + + ASSERT(dlen > 2); + dlen -= 2; /* leave room for terminating nul */ + dp = dst; + for (p = str; *p != '\0' && olen < dlen; p++) { + /* + * Cast through uchar_t: a negative char would be + * sign-extended and printed as 8 hex digits, writing + * past the 2 bytes accounted for per input byte. + */ + (void) sprintf(dp, "%02x", (uchar_t)*p); + dp += 2; + olen += 2; + } + *dp = '\0'; + } +} + +/* + * Format and enqueue a syscall audit record. 
+ */ +static void +lx_audit_syscall_fmt_rcd(int sysnum, uint32_t arch, long ret, + lx_audit_state_t *asp, lx_audit_rule_ent_t *erp, uint64_t seq, + timestruc_t *tsp) +{ + klwp_t *lwp; + proc_t *p; + uint32_t items, sessid; + lx_lwp_data_t *lwpd; + lx_audit_record_t *rp; + cred_t *cr = CRED(); + minor_t minor; + /* room for key="%s" formatting: 4 for key=, 2 quotes, 1 nul */ + char key[LX_AUDIT_MAX_KEY_LEN + 7]; + char exe[PSARGSZ * 2 + 8], comm[MAXCOMLEN * 2 + 8]; + + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + /* Drop the record (and count it as lost) once the queue is full. */ + if (asp->lxast_backlog >= asp->lxast_backlog_limit) { + lx_audit_fail(asp, "audit: backlog limit exceeded"); + return; + } + + if (arch == LX_AUDIT_ARCH32) { + items = MIN(4, lx_sysent32[sysnum].sy_narg); + } else { + ASSERT3U(arch, ==, LX_AUDIT_ARCH64); + items = MIN(4, lx_sysent64[sysnum].sy_narg); + } + + lwp = ttolwp(curthread); + lwpd = lwptolxlwp(lwp); + p = curproc; + + /* + * For the key, if no key has been set on the rule, Linux formats the + * string "(null)" (with no quotes - i.e. key=(null)). + */ + if (erp->lxare_key != NULL) { + (void) snprintf(key, sizeof (key), "key=\"%s\"", + erp->lxare_key); + } else { + (void) snprintf(key, sizeof (key), "key=(null)"); + } + + /* The caller holds lxast_lock, so these allocations must not sleep. */ + rp = kmem_alloc(sizeof (lx_audit_record_t), KM_NOSLEEP); + if (rp == NULL) { + lx_audit_fail(asp, "audit: no kernel memory"); + return; + } + rp->lxar_msg = kmem_zalloc(LX_AUDIT_MESSAGE_TEXT_MAX, KM_NOSLEEP); + if (rp->lxar_msg == NULL) { + kmem_free(rp, sizeof (lx_audit_record_t)); + lx_audit_fail(asp, "audit: no kernel memory"); + return; + } + rp->lxar_type = LX_AUDIT_SYSCALL; + + mutex_enter(&p->p_splock); + sessid = p->p_sessp->s_sid; + minor = getminor(p->p_sessp->s_dev); + mutex_exit(&p->p_splock); + + mutex_enter(&p->p_lock); + lx_audit_fmt_str(exe, p->p_user.u_psargs, sizeof (exe)); + lx_audit_fmt_str(comm, p->p_user.u_comm, sizeof (comm)); + mutex_exit(&p->p_lock); + + /* + * See Linux audit_log_exit() for how a syscall exit record is + * formatted. 
+ * + * For "arch" value, see Linux AUDIT_ARCH_IA64, AUDIT_ARCH_I386, + * __AUDIT_ARCH_64BIT and __AUDIT_ARCH_LE definitions. + * + * For fsuid/fsgid, see lx_setfsuid/lx_setfsgid for how we handle that. + */ + (void) snprintf(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX, + "audit(%lu.%03lu:%lu): arch=%x syscall=%u " + "success=%s exit=%ld a0=%lu a1=%lu a2=%lu a3=%lu items=%u " + "ppid=%u pid=%u auid=%u uid=%u gid=%u euid=%u suid=%u " + "fsuid=%u egid=%u sgid=%u fsgid=%u tty=pts%u ses=%u " + "comm=%s exe=%s %s", + (uint64_t)tsp->tv_sec, /* zone's timestamp */ + (uint64_t)tsp->tv_nsec / 1000000, + seq, /* serial number */ + arch, /* arch */ + sysnum, /* syscall */ + (lwp->lwp_errno == 0 ? "yes" : "no"), /* success */ + ret, /* exit */ + lwpd->br_syscall_args[0], /* a0 */ + lwpd->br_syscall_args[1], /* a1 */ + lwpd->br_syscall_args[2], /* a2 */ + lwpd->br_syscall_args[3], /* a3 */ + items, /* items */ + lx_lwp_ppid(lwp, NULL, NULL), /* ppid */ + (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid), + lx_audit_get_auid(), /* auid */ + crgetruid(cr), /* uid */ + crgetrgid(cr), /* gid */ + crgetuid(cr), /* euid */ + crgetsuid(cr), /* saved uid */ + crgetuid(cr), /* fsuid */ + crgetgid(cr), /* egid */ + crgetsgid(cr), /* saved gid */ + crgetgid(cr), /* fsgid */ + minor, /* tty */ + sessid, /* ses */ + comm, /* comm */ + exe, /* exe */ + key); /* key="VAL" */ + + list_insert_tail(&asp->lxast_ev_queue, rp); + if (asp->lxast_backlog == 0) + cv_signal(&asp->lxast_worker_cv); + asp->lxast_backlog++; +} + +/* + * Get the next rule in the list that is generally applicable to the given + * syscall. 
+ */ +static lx_audit_rule_ent_t * +lx_audit_next_applicable_rule(int sysnum, uint32_t arch, lx_audit_state_t *asp, + lx_audit_rule_ent_t *erp) +{ + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + for (erp = list_next(&asp->lxast_rules, erp); + erp != NULL; + erp = list_next(&asp->lxast_rules, erp)) { + lx_audit_rule_t *r = &erp->lxare_rule; + + /* Determine if the rule in the list has the same ARCH. */ + if (arch == LX_AUDIT_ARCH32 && !erp->lxare_is32bit) + continue; + if (arch == LX_AUDIT_ARCH64 && !erp->lxare_is64bit) + continue; + + /* Determine if this rule applies to the relevant syscall. */ + if (BT_TEST32(r->lxar_mask, sysnum)) + return (erp); + } + + return (NULL); +} + +void +lx_audit_syscall_exit(int sysnum, long ret) +{ + lx_zone_data_t *lxzd = ztolxzd(curzone); + lx_audit_state_t *asp; + uint64_t seq; + lx_audit_rule_ent_t *erp; + timestruc_t ts; + uint32_t arch; + + if (lxzd->lxzd_audit_enabled == LXAE_DISABLED) + return; + + if (sysnum >= LX_NSYSCALLS) + return; + + asp = lxzd->lxzd_audit_state; + ASSERT(asp != NULL); + + if (get_udatamodel() == DATAMODEL_ILP32) { + arch = LX_AUDIT_ARCH32; + } else { + ASSERT(get_udatamodel() == DATAMODEL_LP64); + arch = LX_AUDIT_ARCH64; + } + + /* + * Fast top-level check to see if we're auditing this syscall. + * We don't take the mutex for this since there is no need. + */ + if (arch == LX_AUDIT_ARCH32) { + if (asp->lxast_sys32_rulep[sysnum] == NULL) + return; + } else { + if (asp->lxast_sys64_rulep[sysnum] == NULL) + return; + } + + mutex_enter(&asp->lxast_lock); + if (arch == LX_AUDIT_ARCH32) { + erp = asp->lxast_sys32_rulep[sysnum]; + } else { + erp = asp->lxast_sys64_rulep[sysnum]; + } + + if (erp == NULL) { + /* Hit a race and the syscall is no longer being audited */ + mutex_exit(&asp->lxast_lock); + return; + } + + /* + * All of the records in the set (i.e. same serial number) have + * the same timestamp. 
+ */ + seq = asp->lxast_seq++; + gethrestime(&ts); + ts.tv_sec -= curzone->zone_boot_time; + + /* + * We have to determine if the first rule associated with the syscall, + * or any subsequent applicable rules, match. + * + * The first rule associated with the syscall may (or may not) match, + * but there can be additional rules which might also match. The first + * possible rule is always the one that enables the syscall auditing, + * but we also have to iterate to the end of the list to see if any + * other rules are applicable to this syscall. + */ + for (; erp != NULL; + erp = lx_audit_next_applicable_rule(sysnum, arch, asp, erp)) { + if (!lx_audit_syscall_rule_match(erp)) + continue; + + lx_audit_syscall_fmt_rcd(sysnum, arch, ret, asp, erp, seq, &ts); + } + + /* + * TODO: Currently we only output a single SYSCALL record. + * Real Linux emits a set of audit records for a syscall exit event + * (e.g. for an unlink syscall): + * type=SYSCALL + * type=CWD + * type=PATH - one for the parent dir + * type=PATH - one for the actual file unlinked + * type=PROCTITLE - (this one seems worthless) + * followed by an AUDIT_EOE message (which seems to be ignored). + * + * For syscalls that don't change files in the file system (e.g. ioctl) + * there are no PATH records. + */ + mutex_exit(&asp->lxast_lock); +} + +/* + * Determine which syscalls this rule applies to and set up a fast pointer for + * the syscall to enable its rule match. + * + * We have to look at each bit and translate the external syscall bits into the + * internal syscall number. 
+ */ +static void +lx_enable_syscall_rule(lx_audit_state_t *asp, lx_audit_rule_t *rulep, + lx_audit_rule_ent_t *rp) +{ + uint_t sysnum; + + ASSERT(MUTEX_HELD(&asp->lxast_lock)); + + for (sysnum = 0; sysnum < LX_NSYSCALLS; sysnum++) { + if (BT_TEST32(rulep->lxar_mask, sysnum)) { + if (rp->lxare_is32bit) { + if (asp->lxast_sys32_rulep[sysnum] == NULL) + asp->lxast_sys32_rulep[sysnum] = rp; + } + if (rp->lxare_is64bit) { + if (asp->lxast_sys64_rulep[sysnum] == NULL) + asp->lxast_sys64_rulep[sysnum] = rp; + } + } + } +} + +int +lx_audit_append_rule(void *r, uint_t datalen) +{ + lx_audit_rule_t *rulep = (lx_audit_rule_t *)r; + char *datap; + uint_t i; + lx_audit_rule_ent_t *rp; + lx_audit_state_t *asp; + boolean_t is_32bit = B_TRUE, is_64bit = B_TRUE, sys_found = B_FALSE; + char *tdp; + char key[LX_AUDIT_MAX_KEY_LEN + 1]; + uint32_t tlen; + + if (ztolxzd(curproc->p_zone)->lxzd_audit_enabled == LXAE_LOCKED) + return (EPERM); + + if (datalen < sizeof (lx_audit_rule_t)) + return (EINVAL); + datalen -= sizeof (lx_audit_rule_t); + + if (rulep->lxar_fld_cnt > LX_AUDIT_RULE_MAX_FIELDS) + return (EINVAL); + + if (rulep->lxar_buflen > datalen) + return (EINVAL); + + datap = rulep->lxar_buf; + + /* + * First check the rule to determine if we support the flag, actions, + * and all of the fields specified (since currently, our rule support + * is incomplete). + * + * NOTE: We currently only handle syscall exit rules. 
+ */ + if (rulep->lxar_flag != LX_AUDIT_FILTER_EXIT || + rulep->lxar_action != LX_AUDIT_ACT_ALWAYS) + return (ENOTSUP); + if (rulep->lxar_fld_cnt > LX_AUDIT_RULE_MAX_FIELDS) + return (EINVAL); + tdp = datap; + tlen = rulep->lxar_buflen; + key[0] = '\0'; + for (i = 0; i < rulep->lxar_fld_cnt; i++) { + uint32_t ftype, fval, fop; + + fop = rulep->lxar_fld_flag[i]; + ftype = rulep->lxar_fields[i]; + fval = rulep->lxar_values[i]; + DTRACE_PROBE3(lx__audit__field, uint32_t, fop, + uint32_t, ftype, uint32_t, fval); + + if (ftype == LX_RF_AUDIT_ARCH) { + if (fop != LX_OF_AUDIT_EQ) + return (ENOTSUP); + if (!is_32bit || !is_64bit) + return (EINVAL); + if (fval == LX_AUDIT_ARCH64) { + is_32bit = B_FALSE; + } else if (fval == LX_AUDIT_ARCH32) { + is_64bit = B_FALSE; + } else { + return (ENOTSUP); + } + } else if (ftype == LX_RF_AUDIT_LOGINUID) { + if ((fop & LX_OF_AUDIT_ALL) == 0) + return (ENOTSUP); + } else if (ftype == LX_RF_AUDIT_FILTERKEY) { + if (fop != LX_OF_AUDIT_EQ) + return (ENOTSUP); + if (tlen < fval || fval > LX_AUDIT_MAX_KEY_LEN) + return (EINVAL); + if (key[0] != '\0') + return (EINVAL); + /* while we're here, save the parsed key */ + bcopy(tdp, key, fval); + key[fval] = '\0'; + tdp += fval; + tlen -= fval; + } else { + /* + * TODO: expand the support for additional Linux field + * options. + */ + return (ENOTSUP); + } + } + for (i = 0; i < LX_NSYSCALLS; i++) { + if (BT_TEST32(rulep->lxar_mask, i)) { + /* At least one syscall enabled in this mask entry */ + sys_found = B_TRUE; + break; + } + } + if (!sys_found) + return (ENOTSUP); + + asp = ztolxzd(curzone)->lxzd_audit_state; + ASSERT(asp != NULL); + + /* + * We have confirmed that we can handle the rule specified. + * Before taking the lock, allocate and setup the internal rule struct. 
+ */ + rp = kmem_alloc(sizeof (lx_audit_rule_ent_t), KM_SLEEP); + bcopy(rulep, &rp->lxare_rule, sizeof (lx_audit_rule_t)); + rp->lxare_buf = kmem_alloc(rulep->lxar_buflen, KM_SLEEP); + bcopy(datap, rp->lxare_buf, rulep->lxar_buflen); + rp->lxare_is32bit = is_32bit; + rp->lxare_is64bit = is_64bit; + if (key[0] == '\0') { + rp->lxare_key = NULL; + } else { + int slen = strlen(key); + rp->lxare_key = kmem_alloc(slen + 1, KM_SLEEP); + (void) strlcpy(rp->lxare_key, key, slen + 1); + } + + mutex_enter(&asp->lxast_lock); + /* Save the rule on our top-level list. */ + list_insert_tail(&asp->lxast_rules, rp); + /* Enable tracing on the relevant syscalls. */ + lx_enable_syscall_rule(asp, rulep, rp); + mutex_exit(&asp->lxast_lock); + + return (0); +} + +int +lx_audit_delete_rule(void *r, uint_t datalen) +{ + lx_audit_rule_t *rulep = (lx_audit_rule_t *)r; + char *datap; + uint_t sysnum; + lx_audit_state_t *asp; + lx_audit_rule_ent_t *erp; + + if (ztolxzd(curproc->p_zone)->lxzd_audit_enabled == LXAE_LOCKED) + return (EPERM); + + if (datalen < sizeof (lx_audit_rule_t)) + return (EINVAL); + datalen -= sizeof (lx_audit_rule_t); + + if (rulep->lxar_fld_cnt > LX_AUDIT_RULE_MAX_FIELDS) + return (EINVAL); + + if (rulep->lxar_buflen > datalen) + return (EINVAL); + + datap = rulep->lxar_buf; + + asp = ztolxzd(curzone)->lxzd_audit_state; + ASSERT(asp != NULL); + + mutex_enter(&asp->lxast_lock); + + /* Find the matching rule from the rule list */ + for (erp = list_head(&asp->lxast_rules); + erp != NULL; + erp = list_next(&asp->lxast_rules, erp)) { + lx_audit_rule_t *r; + uint_t i; + boolean_t mtch; + + r = &erp->lxare_rule; + if (rulep->lxar_flag != r->lxar_flag) + continue; + if (rulep->lxar_action != r->lxar_action) + continue; + if (rulep->lxar_fld_cnt != r->lxar_fld_cnt) + continue; + for (i = 0, mtch = B_TRUE; i < LX_AUDIT_BITMASK_SIZE; i++) { + if (rulep->lxar_mask[i] != r->lxar_mask[i]) { + mtch = B_FALSE; + break; + } + } + if (!mtch) + continue; + + for (i = 0, mtch = B_TRUE; i < 
rulep->lxar_fld_cnt; i++) { + if (rulep->lxar_fields[i] != r->lxar_fields[i] || + rulep->lxar_values[i] != r->lxar_values[i] || + rulep->lxar_fld_flag[i] != r->lxar_fld_flag[i]) { + mtch = B_FALSE; + break; + } + } + if (!mtch) + continue; + if (rulep->lxar_buflen != r->lxar_buflen) + continue; + if (bcmp(datap, erp->lxare_buf, r->lxar_buflen) == 0) + break; + } + + /* There is no matching rule */ + if (erp == NULL) { + mutex_exit(&asp->lxast_lock); + return (ENOENT); + } + + /* + * Disable each relevant syscall enabling. + */ + for (sysnum = 0; sysnum < LX_NSYSCALLS; sysnum++) { + if (BT_TEST32(rulep->lxar_mask, sysnum)) { + /* + * If this was the first rule on the list for the + * given syscall (likely, since usually only one rule + * per syscall) then either disable tracing for that + * syscall, or point to the next applicable rule in the + * list. + */ + if (erp->lxare_is32bit) { + if (asp->lxast_sys32_rulep[sysnum] == erp) { + asp->lxast_sys32_rulep[sysnum] = + lx_audit_next_applicable_rule( + sysnum, LX_AUDIT_ARCH32, asp, erp); + } + } + if (erp->lxare_is64bit) { + if (asp->lxast_sys64_rulep[sysnum] == erp) { + asp->lxast_sys64_rulep[sysnum] = + lx_audit_next_applicable_rule( + sysnum, LX_AUDIT_ARCH64, asp, erp); + } + } + } + } + + /* Remove the rule from the top-level list */ + list_remove(&asp->lxast_rules, erp); + + kmem_free(erp->lxare_buf, erp->lxare_rule.lxar_buflen); + if (erp->lxare_key != NULL) + kmem_free(erp->lxare_key, strlen(erp->lxare_key) + 1); + kmem_free(erp, sizeof (lx_audit_rule_ent_t)); + + mutex_exit(&asp->lxast_lock); + return (0); +} + +void +lx_audit_emit_user_msg(uint_t mtype, uint_t len, char *datap) +{ + lx_zone_data_t *lxzd = ztolxzd(curzone); + lx_audit_state_t *asp; + lx_audit_record_t *rp; + timestruc_t ts; + uint_t sessid; + proc_t *p = curproc; + lx_lwp_data_t *lwpd = lwptolxlwp(ttolwp(curthread)); + uint_t prelen, alen; + char msg[LX_AUDIT_MESSAGE_TEXT_MAX]; + + /* + * For user messages, auditing may not actually be 
initialized. If not, + * just return. + */ + if (lxzd->lxzd_audit_enabled == LXAE_DISABLED || + lxzd->lxzd_audit_state == NULL) + return; + + if (len >= sizeof (msg)) + len = sizeof (msg) - 1; + + mutex_enter(&p->p_splock); + sessid = p->p_sessp->s_sid; + mutex_exit(&p->p_splock); + + asp = lxzd->lxzd_audit_state; + ASSERT(asp != NULL); + + mutex_enter(&asp->lxast_lock); + + if (asp->lxast_backlog >= asp->lxast_backlog_limit) { + lx_audit_fail(asp, "audit: backlog limit exceeded"); + mutex_exit(&asp->lxast_lock); + return; + } + + rp = kmem_alloc(sizeof (lx_audit_record_t), KM_NOSLEEP); + if (rp == NULL) { + lx_audit_fail(asp, "audit: no kernel memory"); + mutex_exit(&asp->lxast_lock); + return; + } + rp->lxar_msg = kmem_zalloc(LX_AUDIT_MESSAGE_TEXT_MAX, KM_NOSLEEP); + if (rp->lxar_msg == NULL) { + lx_audit_fail(asp, "audit: no kernel memory"); + mutex_exit(&asp->lxast_lock); + kmem_free(rp, sizeof (lx_audit_record_t)); + return; + } + rp->lxar_type = mtype; + bcopy(datap, msg, len); + msg[len] = '\0'; + + gethrestime(&ts); + ts.tv_sec -= curzone->zone_boot_time; + + (void) snprintf(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX, + "audit(%lu.%03lu:%lu): pid=%u uid=%u auid=%u ses=%u msg=\'", + (uint64_t)ts.tv_sec, /* zone's timestamp */ + (uint64_t)ts.tv_nsec / 1000000, + asp->lxast_seq++, /* serial number */ + (lwpd->br_pid == curzone->zone_proc_initpid ? 
1 : lwpd->br_pid), + crgetruid(CRED()), /* uid */ + lx_audit_get_auid(), /* auid */ + sessid); /* ses */ + + prelen = strlen(rp->lxar_msg); + alen = LX_AUDIT_MESSAGE_TEXT_MAX - prelen - 2; + (void) strlcat(rp->lxar_msg + prelen, msg, alen); + (void) strlcat(rp->lxar_msg, "\'", LX_AUDIT_MESSAGE_TEXT_MAX); + + list_insert_tail(&asp->lxast_ev_queue, rp); + if (asp->lxast_backlog == 0) + cv_signal(&asp->lxast_worker_cv); + asp->lxast_backlog++; + mutex_exit(&asp->lxast_lock); +} + +void +lx_audit_list_rules(void *reply, + void (*cb)(void *, void *, uint_t, void *, uint_t)) +{ + lx_audit_state_t *asp; + lx_audit_rule_ent_t *rp; + + asp = ztolxzd(curzone)->lxzd_audit_state; + ASSERT(asp != NULL); + + /* + * Output the rule list + */ + mutex_enter(&asp->lxast_lock); + for (rp = list_head(&asp->lxast_rules); rp != NULL; + rp = list_next(&asp->lxast_rules, rp)) { + cb(reply, &rp->lxare_rule, sizeof (lx_audit_rule_t), + rp->lxare_buf, rp->lxare_rule.lxar_buflen); + } + mutex_exit(&asp->lxast_lock); +} + +void +lx_audit_get_feature(void *reply, void (*cb)(void *, void *, uint_t)) +{ + lx_audit_features_t af; + + af.lxaf_version = LX_AUDIT_FEATURE_VERSION; + af.lxaf_mask = 0xffffffff; + af.lxaf_features = 0; + af.lxaf_lock = 0; + + cb(reply, &af, sizeof (af)); +} + +void +lx_audit_get(void *reply, void (*cb)(void *, void *, uint_t)) +{ + lx_audit_status_t status; + lx_zone_data_t *lxzd; + lx_audit_state_t *asp; + + lxzd = ztolxzd(curproc->p_zone); + asp = lxzd->lxzd_audit_state; + ASSERT(asp != NULL); + + bzero(&status, sizeof (status)); + + mutex_enter(&asp->lxast_lock); + status.lxas_enabled = lxzd->lxzd_audit_enabled; + status.lxas_failure = asp->lxast_failure; + status.lxas_pid = asp->lxast_pid; + status.lxas_rate_limit = asp->lxast_rate_limit; + status.lxas_backlog_limit = asp->lxast_backlog_limit; + status.lxas_lost = asp->lxast_lost; + status.lxas_backlog = asp->lxast_backlog; + status.lxas_backlog_wait_time = asp->lxast_backlog_wait_time; + status.lxas_feature_bitmap = 
LX_AUDIT_FEATURE_ALL; + mutex_exit(&asp->lxast_lock); + + cb(reply, &status, sizeof (status)); +} + +int +lx_audit_set(void *lxsock, void *s, uint_t datalen, + void (*cb)(void *, boolean_t)) +{ + lx_audit_status_t *statusp = (lx_audit_status_t *)s; + lx_zone_data_t *lxzd; + lx_audit_state_t *asp; + + /* + * Unfortunately, some user-level code does not send down a full + * lx_audit_status_t structure in the message (e.g. this occurs on + * CentOS7). Only the structure up to, but not including, the embedded + * union is being sent in. This appears to be a result of the user-level + * code being built for older versions of the kernel. To handle this, + * we have to subtract the last 8 bytes from the size in order to + * accommodate this code. We'll revalidate with the full size if + * LX_AUDIT_STATUS_BACKLOG_WAIT_TIME were to be set in the mask. + */ + if (datalen < sizeof (lx_audit_status_t) - 8) + return (EINVAL); + + lxzd = ztolxzd(curproc->p_zone); + asp = lxzd->lxzd_audit_state; + ASSERT(asp != NULL); + + /* Once the config is locked, we only allow changing the auditd pid */ + mutex_enter(&asp->lxast_lock); + if (lxzd->lxzd_audit_enabled == LXAE_LOCKED && + (statusp->lxas_mask & ~LX_AUDIT_STATUS_PID)) { + mutex_exit(&asp->lxast_lock); + return (EPERM); + } + + if (statusp->lxas_mask & LX_AUDIT_STATUS_FAILURE) { + switch (statusp->lxas_failure) { + case LXAE_SILENT: + case LXAE_PRINT: + case LXAE_PANIC: + asp->lxast_failure = statusp->lxas_failure; + break; + default: + mutex_exit(&asp->lxast_lock); + return (EINVAL); + } + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_PID) { + /* + * The process that sets the pid is the daemon, so this is the + * socket we'll write audit records out to. 
+ */ + lx_audit_set_worker(statusp->lxas_pid, lxsock, cb); + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_RATE_LIMIT) { + asp->lxast_rate_limit = statusp->lxas_rate_limit; + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_BACKLOG_LIMIT) { + asp->lxast_backlog_limit = statusp->lxas_backlog_limit; + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_BACKLOG_WAIT_TIME) { + /* + * See the comment above. We have to revalidate the full struct + * size since we previously only validated for a shorter struct. + */ + if (datalen < sizeof (lx_audit_status_t)) { + mutex_exit(&asp->lxast_lock); + return (EINVAL); + } + asp->lxast_backlog_wait_time = statusp->lxas_backlog_wait_time; + } + if (statusp->lxas_mask & LX_AUDIT_STATUS_LOST) { + asp->lxast_lost = statusp->lxas_lost; + } + + if (statusp->lxas_mask & LX_AUDIT_STATUS_ENABLED) { + switch (statusp->lxas_enabled) { + case 0: + lxzd->lxzd_audit_enabled = LXAE_DISABLED; + break; + case 1: + lxzd->lxzd_audit_enabled = LXAE_ENABLED; + break; + case 2: + lxzd->lxzd_audit_enabled = LXAE_LOCKED; + break; + default: + mutex_exit(&asp->lxast_lock); + return (EINVAL); + } + } + mutex_exit(&asp->lxast_lock); + + return (0); +} + +void +lx_audit_stop_worker(void *s, void (*cb)(void *, boolean_t)) +{ + lx_audit_state_t *asp = ztolxzd(curzone)->lxzd_audit_state; + kt_did_t tid = 0; + + ASSERT(asp != NULL); + mutex_enter(&asp->lxast_lock); + if (s == NULL) { + s = asp->lxast_sock; + } else { + VERIFY(s == asp->lxast_sock); + } + asp->lxast_sock = NULL; + asp->lxast_pid = 0; + if (asp->lxast_worker != NULL) { + tid = asp->lxast_worker->t_did; + asp->lxast_worker = NULL; + asp->lxast_exit = B_TRUE; + cv_signal(&asp->lxast_worker_cv); + } + if (s != NULL) + cb(s, B_FALSE); + mutex_exit(&asp->lxast_lock); + + if (tid != 0) + thread_join(tid); +} + +/* + * Called when audit netlink message received, in order to perform lazy + * allocation of audit state for the zone. 
We also perform the one-time step to + * cache the netlink callback used by the audit worker thread to send messages + * up to the auditd. + */ +void +lx_audit_init(int (*cb)(void *, uint_t, const char *, uint_t)) +{ + lx_zone_data_t *lxzd = ztolxzd(curzone); + lx_audit_state_t *asp; + + mutex_enter(&lxzd->lxzd_lock); + + if (lxzd->lxzd_audit_state != NULL) { + mutex_exit(&lxzd->lxzd_lock); + return; + } + + asp = kmem_zalloc(sizeof (lx_audit_state_t), KM_SLEEP); + + mutex_init(&asp->lxast_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&asp->lxast_worker_cv, NULL, CV_DEFAULT, NULL); + list_create(&asp->lxast_ev_queue, sizeof (lx_audit_record_t), + offsetof(lx_audit_record_t, lxar_link)); + list_create(&asp->lxast_rules, sizeof (lx_audit_rule_ent_t), + offsetof(lx_audit_rule_ent_t, lxare_link)); + asp->lxast_failure = LXAE_PRINT; + asp->lxast_backlog_limit = LX_AUDIT_DEF_BACKLOG_LIMIT; + asp->lxast_backlog_wait_time = LX_AUDIT_DEF_WAIT_TIME; + + lxzd->lxzd_audit_state = asp; + + mutex_exit(&lxzd->lxzd_lock); + + mutex_enter(&lx_audit_em_lock); + if (lx_audit_emit_msg == NULL) + lx_audit_emit_msg = cb; + mutex_exit(&lx_audit_em_lock); +} + +/* + * Called when netlink module is unloading so that we can clear the cached + * netlink callback used by the audit worker thread to send messages up to the + * auditd. + */ +void +lx_audit_cleanup(void) +{ + mutex_enter(&lx_audit_em_lock); + lx_audit_emit_msg = NULL; + mutex_exit(&lx_audit_em_lock); +} + +/* + * Called when the zone is being destroyed, not when auditing is being disabled. + * Note that zsched has already exited and any lxast_worker thread has exited. 
+ */
+void
+lx_audit_fini(zone_t *zone)
+{
+	lx_zone_data_t *lxzd = ztolxzd(zone);
+	lx_audit_state_t *asp;
+	lx_audit_record_t *rp;
+	lx_audit_rule_ent_t *erp;
+
+	/* The caller must already hold the zone's brand data lock. */
+	ASSERT(MUTEX_HELD(&lxzd->lxzd_lock));
+
+	/* Auditing was never initialized for this zone. */
+	if ((asp = lxzd->lxzd_audit_state) == NULL)
+		return;
+
+	mutex_enter(&asp->lxast_lock);
+
+	VERIFY(asp->lxast_worker == NULL);
+
+	/* Free every record still sitting on the event queue. */
+	while ((rp = list_remove_head(&asp->lxast_ev_queue)) != NULL) {
+		kmem_free(rp->lxar_msg, LX_AUDIT_MESSAGE_TEXT_MAX);
+		kmem_free(rp, sizeof (lx_audit_record_t));
+	}
+
+	list_destroy(&asp->lxast_ev_queue);
+	asp->lxast_backlog = 0;
+	asp->lxast_pid = 0;
+
+	/* Free every rule, together with its buffer and optional key. */
+	while ((erp = list_remove_head(&asp->lxast_rules)) != NULL) {
+		kmem_free(erp->lxare_buf, erp->lxare_rule.lxar_buflen);
+		if (erp->lxare_key != NULL)
+			kmem_free(erp->lxare_key, strlen(erp->lxare_key) + 1);
+		kmem_free(erp, sizeof (lx_audit_rule_ent_t));
+	}
+	list_destroy(&asp->lxast_rules);
+
+	mutex_exit(&asp->lxast_lock);
+
+	cv_destroy(&asp->lxast_worker_cv);
+	mutex_destroy(&asp->lxast_lock);
+	lxzd->lxzd_audit_state = NULL;
+	kmem_free(asp, sizeof (lx_audit_state_t));
+}
+
+/*
+ * Audit initialization/cleanup when lx brand module is loaded and
+ * unloaded.  These create and destroy the lock which protects the cached
+ * netlink callback (lx_audit_emit_msg).
+ */
+void
+lx_audit_ld(void)
+{
+	mutex_init(&lx_audit_em_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+lx_audit_unld(void)
+{
+	mutex_destroy(&lx_audit_em_lock);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c
new file mode 100644
index 0000000000..31bb86cce1
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_brand.c
@@ -0,0 +1,2728 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2020 Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + */ + +/* + * The LX Brand: emulation of a Linux operating environment within a zone. + * + * OVERVIEW + * + * The LX brand enables a full Linux userland -- including a C library, + * init(1) framework, and some set of applications -- to run unmodified + * within an illumos zone. Unlike illumos, where applications are expected + * to link against and consume functions exported from libraries, the + * supported Linux binary compatibility boundary is the system call + * interface. By accurately emulating the behaviour of Linux system calls, + * Linux software can be executed in this environment as if it were running + * on a native Linux system. + * + * EMULATING LINUX SYSTEM CALLS + * + * Linux system calls are made in 32-bit processes via the "int 0x80" + * instruction; in 64-bit processes the "syscall" instruction is used, as it + * is with native illumos processes. In both cases, arguments to system + * calls are generally passed in registers and the usermode stack is not + * interpreted or modified by the Linux kernel. + * + * When the emulated Linux process makes a system call, it traps into the + * illumos kernel. 
The in-kernel brand module contains various emulation + * routines, and can fully service some emulated system calls; e.g. read(2) + * and write(2). Other system calls require assistance from the illumos + * libc, bouncing back out to the brand library ("lx_brand.so.1") for + * emulation. + * + * The brand mechanism allows for the provision of an alternative trap + * handler for the various system call mechanisms. Traditionally this was + * used to immediately revector execution to the usermode emulation library, + * which was responsible for handling all system calls. In the interests of + * more accurate emulation and increased performance, much of the regular + * illumos system call path is now invoked. Only the argument processing and + * handler dispatch are replaced by the brand, via the per-LWP + * "lwp_brand_syscall" interposition function pointer. + * + * THE NATIVE AND BRAND STACKS + * + * Some runtime environments (e.g. the Go language) allocate very small + * thread stacks, preferring to grow or split the stack as necessary. The + * Linux kernel generally does not use the usermode stack when servicing + * system calls, so this is not a problem. In order for our emulation to + * have the same zero stack impact, we must execute usermode emulation + * routines on an _alternate_ stack. This is similar, in principle, to the + * use of sigaltstack(3C) to run signal handlers off the main thread stack. + * + * To this end, the brand library allocates and installs an alternate stack + * (called the "native" stack) for each LWP. The in-kernel brand code uses + * this stack for usermode emulation calls and interposed signal delivery, + * while the emulated Linux process sees only the data on the main thread + * stack, known as the "brand" stack. The stack mode is tracked in the + * per-LWP brand-private data, using the LX_STACK_MODE_* enum. + * + * The stack mode doubles as a system call "mode bit". 
When in the + * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux + * system calls. In other modes, system calls are assumed to be native + * illumos system calls as made during brand library initialisation and + * usermode emulation. + * + * USERMODE EMULATION + * + * When a Linux system call cannot be emulated within the kernel, we preserve + * the register state of the Linux process and revector the LWP to the brand + * library usermode emulation handler: the "lx_emulate()" function in + * "lx_brand.so.1". This revectoring is modelled on the delivery of signals, + * and is performed in "lx_emulate_user()". + * + * First, the emulated process state is written out to the usermode stack of + * the process as a "ucontext_t" object. Arguments to the emulation routine + * are passed on the stack or in registers, depending on the ABI. When the + * usermode emulation is complete, the result is passed back to the kernel + * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context + * for restoration. + * + * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT + * + * When servicing emulated system calls in the usermode brand library, or + * during signal delivery, various state is preserved by the kernel so that + * the running LWP may be revectored to a handling routine. The context + * allows the kernel to restart the program at the point of interruption, + * either at the return of the signal handler, via setcontext(3C); or after + * the usermode emulation request has been serviced, via B_EMULATION_DONE. + * + * In illumos native processes, the saved context (a "ucontext_t" object) + * includes the state of registers and the current signal mask at the point + * of interruption. The context also includes a link to the most recently + * saved context, forming a chain to be unwound as requests complete. 
The LX + * brand requires additional book-keeping to describe the machine state: in + * particular, the current stack mode and the occupied extent of the native + * stack. + * + * The brand code is able to interpose on the context save and restore + * operations in the kernel -- see "lx_savecontext()" and + * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to + * function correctly in the face of a dual stack LWP. The brand also + * interposes on the signal delivery mechanism -- see "lx_sendsig()" and + * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand + * library interposer on the native stack, regardless of the interrupted + * execution mode. Linux sigaltstack(2) emulation is performed entirely by + * the usermode brand library during signal handler interposition. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/exec.h> +#include <sys/lx_impl.h> +#include <sys/machbrand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_misc.h> +#include <sys/lx_futex.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_userhz.h> +#include <sys/param.h> +#include <sys/termios.h> +#include <sys/sunddi.h> +#include <sys/ddi.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/auxv.h> +#include <sys/priv.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/archsystm.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/sdt.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> +#include <sys/core.h> +#include <sys/stack.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <lx_signum.h> +#include <util/sscanf.h> +#include <sys/lx_brand.h> +#include <sys/zfs_ioctl.h> +#include <inet/tcp_impl.h> +#include <inet/udp_impl.h> + +int lx_debug = 0; 
+uint_t lx_hz_scale = 0; + +void lx_init_brand_data(zone_t *, kmutex_t *); +void lx_free_brand_data(zone_t *); +void lx_setbrand(proc_t *); +int lx_getattr(zone_t *, int, void *, size_t *); +int lx_setattr(zone_t *, int, void *, size_t); +int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, uintptr_t); +void lx_set_kern_version(zone_t *, char *); +void lx_copy_procdata(proc_t *, proc_t *); + +extern int getsetcontext(int, void *); +extern int waitsys(idtype_t, id_t, siginfo_t *, int); +#if defined(_SYSCALL32_IMPL) +extern int getsetcontext32(int, void *); +extern int waitsys32(idtype_t, id_t, siginfo_t *, int); +#endif + +extern int zvol_name2minor(const char *, minor_t *); +extern int zvol_create_minor(const char *); + +extern void lx_proc_exit(proc_t *); +extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); + +extern void lx_io_clear(lx_proc_data_t *); +extern void lx_io_cleanup(proc_t *); + +extern void lx_ioctl_init(); +extern void lx_ioctl_fini(); +extern void lx_socket_init(); +extern void lx_socket_fini(); + +extern int lx_start_nfs_lockd(); +extern void lx_upcall_statd(); + +lx_systrace_f *lx_systrace_entry_ptr; +lx_systrace_f *lx_systrace_return_ptr; + +static int lx_systrace_enabled; + +/* + * cgroup file system maintenance functions which are set when cgroups loads. + */ +void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t); +void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t); + +/* + * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly + * want an MMU dependency here (and should there be a microprocessor without + * a hole, we don't want to start allocating from the top of the VA range). 
+ */
+#define	LX_MAXSTACK64	0x7ffffff00000
+
+uint64_t lx_maxstack64 = LX_MAXSTACK64;
+
+static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args,
+    struct intpdata *idata, int level, size_t *execsz, int setid,
+    caddr_t exec_file, struct cred *cred, int *brand_action);
+
+static boolean_t lx_native_exec(uint8_t, const char **);
+static uint32_t lx_map32limit(proc_t *);
+
+static void lx_savecontext(ucontext_t *);
+static void lx_restorecontext(ucontext_t *);
+static caddr_t lx_sendsig_stack(int);
+static void lx_sendsig(int);
+#if defined(_SYSCALL32_IMPL)
+static void lx_savecontext32(ucontext32_t *);
+#endif
+static int lx_setid_clear(vattr_t *, cred_t *);
+#if defined(_LP64)
+static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type,
+    enum seg_rw);
+#endif
+static void lx_clearbrand(proc_t *, boolean_t);
+
+/*
+ * Work-list entry used while walking ZFS datasets: the dataset name and the
+ * iteration cookie for listing its children.
+ */
+typedef struct lx_zfs_ds {
+	list_node_t	ds_link;
+	char		ds_name[MAXPATHLEN];
+	uint64_t	ds_cookie;
+} lx_zfs_ds_t;
+
+/* lx brand */
+struct brand_ops lx_brops = {
+	lx_init_brand_data,		/* b_init_brand_data */
+	lx_free_brand_data,		/* b_free_brand_data */
+	lx_brandsys,			/* b_brandsys */
+	lx_setbrand,			/* b_setbrand */
+	lx_getattr,			/* b_getattr */
+	lx_setattr,			/* b_setattr */
+	lx_copy_procdata,		/* b_copy_procdata */
+	lx_proc_exit,			/* b_proc_exit */
+	lx_exec,			/* b_exec */
+	lx_setrval,			/* b_lwp_setrval */
+	lx_lwpdata_alloc,		/* b_lwpdata_alloc */
+	lx_lwpdata_free,		/* b_lwpdata_free */
+	lx_initlwp,			/* b_initlwp */
+	lx_initlwp_post,		/* b_initlwp_post */
+	lx_forklwp,			/* b_forklwp */
+	lx_freelwp,			/* b_freelwp */
+	lx_exitlwp,			/* b_lwpexit */
+	lx_elfexec,			/* b_elfexec */
+	NULL,				/* b_sigset_native_to_brand */
+	NULL,				/* b_sigset_brand_to_native */
+	lx_sigfd_translate,		/* b_sigfd_translate */
+	NSIG,				/* b_nsig */
+	lx_exit_with_sig,		/* b_exit_with_sig */
+	lx_wait_filter,			/* b_wait_filter */
+	lx_native_exec,			/* b_native_exec */
+	lx_map32limit,			/* b_map32limit */
+	lx_stop_notify,			/* b_stop_notify */
+	lx_waitid_helper,		/* b_waitid_helper */
+	lx_sigcld_repost,		/* b_sigcld_repost */
+	lx_ptrace_issig_stop,		/* b_issig_stop */
+	lx_ptrace_sig_ignorable,	/* b_sig_ignorable */
+	lx_savecontext,			/* b_savecontext */
+#if defined(_SYSCALL32_IMPL)
+	lx_savecontext32,		/* b_savecontext32 */
+#endif
+	lx_restorecontext,		/* b_restorecontext */
+	lx_sendsig_stack,		/* b_sendsig_stack */
+	lx_sendsig,			/* b_sendsig */
+	lx_setid_clear,			/* b_setid_clear */
+#if defined(_LP64)
+	lx_pagefault,			/* b_pagefault */
+#else
+	NULL,
+#endif
+	B_FALSE,			/* b_intp_parse_arg */
+	lx_clearbrand,			/* b_clearbrand */
+	lx_upcall_statd,		/* b_rpc_statd */
+	lx_acct_out			/* b_acct_out */
+};
+
+struct brand_mach_ops lx_mops = {
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	lx_fixsegreg,
+	lx_fsbase
+};
+
+struct brand lx_brand = {
+	BRAND_VER_1,
+	"lx",
+	&lx_brops,
+	&lx_mops,
+	sizeof (struct lx_proc_data)
+};
+
+static struct modlbrand modlbrand = {
+	&mod_brandops, "lx brand", &lx_brand
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modlbrand, NULL
+};
+
+/*
+ * b_proc_exit hook.  Tears down the clone group membership and any
+ * outstanding aio contexts and then, if this process has children that asked
+ * for it (LX_PROC_CHILD_DEATHSIG), posts each such child's chosen
+ * parent-death signal.
+ */
+void
+lx_proc_exit(proc_t *p)
+{
+	lx_proc_data_t *lxpd;
+	proc_t *cp;
+
+	lx_clone_grp_exit(p, B_FALSE);
+	/* Cleanup any outstanding aio contexts */
+	lx_io_cleanup(p);
+
+	mutex_enter(&p->p_lock);
+	VERIFY((lxpd = ptolxproc(p)) != NULL);
+	VERIFY(lxpd->l_ptrace == 0);
+	if ((lxpd->l_flags & LX_PROC_CHILD_DEATHSIG) == 0) {
+		mutex_exit(&p->p_lock);
+		return;
+	}
+	mutex_exit(&p->p_lock);
+
+	/* Check for children which desire notification of parental death. */
+	mutex_enter(&pidlock);
+	for (cp = p->p_child; cp != NULL; cp = cp->p_sibling) {
+		mutex_enter(&cp->p_lock);
+		if ((lxpd = ptolxproc(cp)) == NULL) {
+			/* No lx process data on this child; skip it. */
+			mutex_exit(&cp->p_lock);
+			continue;
+		}
+		if (lxpd->l_parent_deathsig != 0) {
+			sigtoproc(cp, NULL, lxpd->l_parent_deathsig);
+		}
+		mutex_exit(&cp->p_lock);
+	}
+	mutex_exit(&pidlock);
+}
+
+/* b_setbrand hook: establish per-process defaults for a newly branded proc. */
+void
+lx_setbrand(proc_t *p)
+{
+	/* Send SIGCHLD to parent by default when child exits */
+	ptolxproc(p)->l_signal = stol_signo[SIGCHLD];
+}
+
+/*
+ * b_setattr hook: set a brand attribute of the zone from the user buffer
+ * "ubuf" of "ubufsz" bytes.
+ *
+ * Returns 0 on success, ERANGE if the buffer size is not acceptable for the
+ * attribute, EFAULT if the copyin fails and EINVAL for an unknown attribute.
+ */
+/* ARGSUSED */
+int
+lx_setattr(zone_t *zone, int attr, void *ubuf, size_t ubufsz)
+{
+	lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
+
+	switch (attr) {
+	case LX_ATTR_KERN_RELEASE: {
+		char buf[LX_KERN_RELEASE_MAX];
+		/*
+		 * buf is zeroed and at most (size - 1) bytes are copied in,
+		 * so it is always NUL-terminated.
+		 */
+		bzero(buf, LX_KERN_RELEASE_MAX);
+		if (ubufsz >= LX_KERN_RELEASE_MAX) {
+			return (ERANGE);
+		}
+		if (copyin(ubuf, buf, ubufsz) != 0) {
+			return (EFAULT);
+		}
+		mutex_enter(&lxzd->lxzd_lock);
+		(void) strlcpy(lxzd->lxzd_kernel_release, buf,
+		    LX_KERN_RELEASE_MAX);
+		mutex_exit(&lxzd->lxzd_lock);
+		return (0);
+	}
+	case LX_ATTR_KERN_VERSION: {
+		char buf[LX_KERN_VERSION_MAX];
+		bzero(buf, LX_KERN_VERSION_MAX);
+		if (ubufsz >= LX_KERN_VERSION_MAX) {
+			return (ERANGE);
+		}
+		if (copyin(ubuf, buf, ubufsz) != 0) {
+			return (EFAULT);
+		}
+		mutex_enter(&lxzd->lxzd_lock);
+		(void) strlcpy(lxzd->lxzd_kernel_version, buf,
+		    LX_KERN_VERSION_MAX);
+		mutex_exit(&lxzd->lxzd_lock);
+		return (0);
+	}
+	case LX_ATTR_TTY_GID: {
+		gid_t gid;
+		if (ubufsz != sizeof (gid)) {
+			return (ERANGE);
+		}
+		if (copyin(ubuf, &gid, ubufsz) != 0) {
+			return (EFAULT);
+		}
+		mutex_enter(&lxzd->lxzd_lock);
+		lxzd->lxzd_ttygrp = gid;
+		mutex_exit(&lxzd->lxzd_lock);
+		return (0);
+	}
+	default:
+		return (EINVAL);
+	}
+}
+
+/*
+ * Copy the string attribute "src" (a field of "lxzd", read under lxzd_lock)
+ * out to the user buffer "ubuf", including its terminating NUL.  "buf" is a
+ * caller-provided bounce buffer of "bufsz" bytes, which must be the size of
+ * the field.
+ *
+ * The snapshot is taken with strlcpy() so that it is NUL-terminated even if
+ * the field were not, and the length is measured on the snapshot; we can
+ * therefore never copy out more than "bufsz" bytes.
+ *
+ * On entry *ubufsz is the capacity of "ubuf"; on success it is updated to
+ * the number of bytes copied out.  Returns 0, ERANGE if the user buffer is
+ * too small, or EFAULT if the copyout fails.
+ */
+static int
+lx_getattr_str(lx_zone_data_t *lxzd, const char *src, char *buf, size_t bufsz,
+    void *ubuf, size_t *ubufsz)
+{
+	size_t len;
+
+	mutex_enter(&lxzd->lxzd_lock);
+	(void) strlcpy(buf, src, bufsz);
+	mutex_exit(&lxzd->lxzd_lock);
+
+	len = strlen(buf) + 1;
+	if (*ubufsz < len) {
+		return (ERANGE);
+	}
+	if (copyout(buf, ubuf, len) != 0) {
+		return (EFAULT);
+	}
+	*ubufsz = len;
+	return (0);
+}
+
+/*
+ * b_getattr hook: fetch a brand attribute of the zone into the user buffer
+ * "ubuf", whose capacity is passed in *ubufsz and whose used length is
+ * returned there.
+ *
+ * Returns 0 on success, ERANGE if the user buffer is too small, EFAULT if the
+ * copyout fails and EINVAL for an unknown attribute.
+ */
+/* ARGSUSED */
+int
+lx_getattr(zone_t *zone, int attr, void *ubuf, size_t *ubufsz)
+{
+	lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data;
+
+	switch (attr) {
+	case LX_ATTR_KERN_RELEASE: {
+		char buf[LX_KERN_RELEASE_MAX];
+
+		return (lx_getattr_str(lxzd, lxzd->lxzd_kernel_release, buf,
+		    sizeof (buf), ubuf, ubufsz));
+	}
+	case LX_ATTR_KERN_VERSION: {
+		char buf[LX_KERN_VERSION_MAX];
+
+		return (lx_getattr_str(lxzd, lxzd->lxzd_kernel_version, buf,
+		    sizeof (buf), ubuf, ubufsz));
+	}
+	default:
+		return (EINVAL);
+	}
+}
+
+/* b_map32limit hook: upper bound for MAP_32BIT mappings in process "p". */
+static uint32_t
+lx_map32limit(proc_t *p)
+{
+	/*
+	 * To be bug-for-bug compatible with Linux, we have MAP_32BIT only
+	 * allow mappings in the first 31 bits.  This was a nuance in the
+	 * original Linux implementation circa 2002, and applications have
+	 * come to depend on its behavior.
+	 *
+	 * This is only relevant for 64-bit processes.
+	 */
+	if (p->p_model == DATAMODEL_LP64)
+		return ((uint32_t)1 << 31);
+
+	return ((uint32_t)USERLIMIT32);
+}
+
+/* Mark lx systrace as enabled; it must not already be enabled. */
+void
+lx_brand_systrace_enable(void)
+{
+	VERIFY(!lx_systrace_enabled);
+
+	lx_systrace_enabled = 1;
+}
+
+/* Mark lx systrace as disabled; it must currently be enabled. */
+void
+lx_brand_systrace_disable(void)
+{
+	VERIFY(lx_systrace_enabled);
+
+	lx_systrace_enabled = 0;
+}
+
+/*
+ * Record "new_sp" as the current in-use extent of the LWP's native stack.
+ * A native stack must already have been installed for the LWP.
+ */
+void
+lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp)
+{
+	VERIFY(lwpd->br_ntv_stack != 0);
+
+	/*
+	 * The "brand-lx-set-ntv-stack-current" probe has arguments:
+	 *   arg0: stack pointer before change
+	 *   arg1: stack pointer after change
+	 *   arg2: current stack base
+	 */
+	DTRACE_PROBE3(brand__lx__set__ntv__stack__current,
+	    uintptr_t, lwpd->br_ntv_stack_current,
+	    uintptr_t, new_sp,
+	    uintptr_t, lwpd->br_ntv_stack);
+
+	lwpd->br_ntv_stack_current = new_sp;
+}
+
+#if defined(_LP64)
+/*
+ * b_pagefault hook.  Returns 0 if the fault was a vsyscall invocation which
+ * has now been serviced, or -1 if the fault is not ours to handle.
+ */
+static int
+lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type,
+    enum seg_rw rw)
+{
+	int syscall_num;
+
+	/*
+	 * We only want to handle a very specific set of circumstances.
+	 * Namely: this is a 64-bit LX-branded process attempting to execute an
+	 * address in a page for which it does not have a valid mapping.  If
+	 * this is not the case, we bail out as fast as possible.
+	 */
+	VERIFY(PROC_IS_BRANDED(p));
+	if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) !=
+	    DATAMODEL_NATIVE) {
+		return (-1);
+	}
+
+	if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) {
+		return (-1);
+	}
+
+	/*
+	 * This is a valid vsyscall address.  We service the system call and
+	 * return 0 to signal that the pagefault has been handled completely.
+	 */
+	lx_vsyscall_enter(p, lwp, syscall_num);
+	return (0);
+}
+#endif
+
+/* b_clearbrand hook: leave the clone group as the brand is removed. */
+static void
+lx_clearbrand(proc_t *p, boolean_t lwps_ok)
+{
+	lx_clone_grp_exit(p, lwps_ok);
+}
+
+/*
+ * This hook runs prior to sendsig() processing and allows us to nominate
+ * an alternative stack pointer for delivery of the signal handling frame.
+ * Critically, this routine should _not_ modify any LWP state as the
+ * savecontext() does not run until after this hook.
+ */
+/* ARGSUSED */
+static caddr_t
+lx_sendsig_stack(int sig)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+	/*
+	 * We want to take signal delivery on the native stack, but only if
+	 * one has been allocated and installed for this LWP.
+	 */
+	if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+		/*
+		 * The program is not running on the native stack.  Return
+		 * the native stack pointer from our brand-private data so
+		 * that we may switch to it for signal handling.
+		 */
+		return ((caddr_t)lwpd->br_ntv_stack_current);
+	} else {
+		struct regs *rp = lwptoregs(lwp);
+
+		/*
+		 * Either the program is already running on the native stack,
+		 * or one has not yet been allocated for this LWP.  Use the
+		 * current stack pointer value.
+		 */
+		return ((caddr_t)rp->r_sp);
+	}
+}
+
+/*
+ * This hook runs after sendsig() processing and allows us to update the
+ * per-LWP mode flags for system calls and stacks.  The pre-signal
+ * context has already been saved and delivered to the user at this point.
+ */
+/* ARGSUSED */
+static void
+lx_sendsig(int sig)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	struct regs *rp = lwptoregs(lwp);
+
+	switch (lwpd->br_stack_mode) {
+	case LX_STACK_MODE_BRAND:
+	case LX_STACK_MODE_NATIVE:
+		/*
+		 * In lx_sendsig_stack(), we nominated a stack pointer from the
+		 * native stack.  Update the stack mode, and the current in-use
+		 * extent of the native stack, accordingly:
+		 */
+		lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+		lx_lwp_set_native_stack_current(lwpd, rp->r_sp);
+
+		/*
+		 * Fix up segment registers, etc.
+		 */
+		lx_switch_to_native(lwp);
+		break;
+
+	default:
+		/*
+		 * Otherwise, the brand library has not yet installed the
+		 * alternate stack for this LWP.  Signals will be handled on
+		 * the regular thread stack.
+		 */
+		return;
+	}
+}
+
+/*
+ * This hook runs prior to the context restoration, allowing us to take action
+ * or modify the context before it is loaded.
+ */
+static void
+lx_restorecontext(ucontext_t *ucp)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0];
+	caddr_t sp = ucp->uc_brand_data[1];
+
+	if (lwpd->br_stack_mode == LX_STACK_MODE_PREINIT) {
+		/*
+		 * Since we're here with stack_mode as LX_STACK_MODE_PREINIT,
+		 * that can only mean we took a signal really early in this
+		 * thread's lifetime, before we had a chance to setup a native
+		 * stack and start running the thread's code.  Since we're still
+		 * handling everything on the single stack, we can't do any of
+		 * the usual work below.  Note: this means we cannot look at
+		 * "flags" since the uc_brand_data may not have been properly
+		 * set, depending on where we were when we took the signal.
+		 */
+		return;
+	}
+
+	/*
+	 * We have a saved native stack pointer value that we must restore
+	 * into the per-LWP data.
+	 */
+	if (flags & LX_UC_RESTORE_NATIVE_SP) {
+		lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp);
+	}
+
+	/*
+	 * We do not wish to restore the value of uc_link in this context,
+	 * so replace it with the value currently in the LWP.
+	 */
+	if (flags & LX_UC_IGNORE_LINK) {
+		ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext;
+	}
+
+	/*
+	 * Set or restore the stack mode.  Usually this restores the mode, but
+	 * the lx_runexe code flow also uses this to set the mode from
+	 * LX_STACK_MODE_INIT to LX_STACK_MODE_BRAND (by passing the
+	 * LX_UC_STACK_BRAND flag).
+	 */
+	if (flags & LX_UC_STACK_NATIVE) {
+		lwpd->br_stack_mode = LX_STACK_MODE_NATIVE;
+	} else if (flags & LX_UC_STACK_BRAND) {
+		lwpd->br_stack_mode = LX_STACK_MODE_BRAND;
+	}
+
+#if defined(__amd64)
+	/*
+	 * Override the fs/gsbase in the context with the value provided
+	 * through the Linux arch_prctl(2) system call.
+	 */
+	if (flags & LX_UC_STACK_BRAND) {
+		if (lwpd->br_lx_fsbase != 0) {
+			ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase;
+		}
+		if (lwpd->br_lx_gsbase != 0) {
+			ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase;
+		}
+	}
+#endif
+}
+
+/* b_savecontext hook: stash brand-private state in a context being saved. */
+static void
+lx_savecontext(ucontext_t *ucp)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	uintptr_t flags = 0;
+
+	/*
+	 * The ucontext_t affords us three private pointer-sized members in
+	 * "uc_brand_data".  We pack a variety of flags into the first element,
+	 * and an optional stack pointer in the second element.  The flags
+	 * determine which stack pointer (native or brand), if any, is stored
+	 * in the second element.  The third element may contain the system
+	 * call number; this is analogous to the "orig_[er]ax" member of a
+	 * Linux "user_regs_struct".
+	 */
+
+	if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
+	    lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+		/*
+		 * Record the value of the native stack pointer to restore
+		 * when returning to this branded context:
+		 */
+		flags |= LX_UC_RESTORE_NATIVE_SP;
+		ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current;
+	}
+
+	/*
+	 * Save the stack mode:
+	 */
+	if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
+		flags |= LX_UC_STACK_NATIVE;
+	} else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+		flags |= LX_UC_STACK_BRAND;
+	}
+
+	/*
+	 * If we might need to restart this system call, save that information
+	 * in the context:
+	 */
+	if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+		ucp->uc_brand_data[2] =
+		    (void *)(uintptr_t)lwpd->br_syscall_num;
+		if (lwpd->br_syscall_restart) {
+			flags |= LX_UC_RESTART_SYSCALL;
+		}
+	} else {
+		ucp->uc_brand_data[2] = NULL;
+	}
+
+	ucp->uc_brand_data[0] = (void *)flags;
+}
+
+#if defined(_SYSCALL32_IMPL)
+/* b_savecontext32 hook: as lx_savecontext(), for a 32-bit context. */
+static void
+lx_savecontext32(ucontext32_t *ucp)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	unsigned int flags = 0;
+
+	/*
+	 * The ucontext_t affords us three private pointer-sized members in
+	 * "uc_brand_data".  We pack a variety of flags into the first element,
+	 * and an optional stack pointer in the second element.  The flags
+	 * determine which stack pointer (native or brand), if any, is stored
+	 * in the second element.  The third element may contain the system
+	 * call number; this is analogous to the "orig_[er]ax" member of a
+	 * Linux "user_regs_struct".
+	 */
+
+	if (lwpd->br_stack_mode != LX_STACK_MODE_INIT &&
+	    lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) {
+		/*
+		 * Record the value of the native stack pointer to restore
+		 * when returning to this branded context:
+		 */
+		flags |= LX_UC_RESTORE_NATIVE_SP;
+		ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current;
+	}
+
+	/*
+	 * Save the stack mode:
+	 */
+	if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) {
+		flags |= LX_UC_STACK_NATIVE;
+	} else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+		flags |= LX_UC_STACK_BRAND;
+	}
+
+	/*
+	 * If we might need to restart this system call, save that information
+	 * in the context:
+	 */
+	if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) {
+		ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num;
+		if (lwpd->br_syscall_restart) {
+			flags |= LX_UC_RESTART_SYSCALL;
+		}
+	} else {
+		ucp->uc_brand_data[2] = (caddr32_t)(uintptr_t)NULL;
+	}
+
+	ucp->uc_brand_data[0] = flags;
+}
+#endif
+
+/*
+ * Issue the ZFS ioctl "cmd" on "lh" with the in-kernel command block "zc".
+ *
+ * If "dst_alloc_size" is NULL the ioctl is simply issued once.  Otherwise a
+ * reply buffer is allocated and attached to zc->zc_nvlist_dst; if the ioctl
+ * fails with ENOMEM and reports a larger required size in
+ * zc->zc_nvlist_dst_size, the buffer is reallocated at that size, zc_cookie is
+ * reset to its value on entry, and the ioctl is retried.
+ *
+ * Buffer ownership:
+ *   - On success (0) the buffer stays attached to "zc" and its allocated size
+ *     is stored in *dst_alloc_size; the caller must kmem_free() it with that
+ *     size.
+ *   - On failure the buffer is freed here, zc_nvlist_dst and
+ *     zc_nvlist_dst_size are cleared and *dst_alloc_size is set to 0, so a
+ *     caller which just bails out does not leak it.
+ *
+ * Returns 0 or the errno from ldi_ioctl().
+ */
+static int
+lx_zfs_ioctl(ldi_handle_t lh, int cmd, zfs_cmd_t *zc, size_t *dst_alloc_size)
+{
+	uint64_t cookie;
+	size_t dstsize, newsize;
+	void *buf;
+	int rc, unused;
+
+	if (dst_alloc_size == NULL) {
+		/* No reply buffer wanted, so there is nothing to retry. */
+		return (ldi_ioctl(lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
+		    &unused));
+	}
+
+	cookie = zc->zc_cookie;
+	dstsize = 8192;
+
+	for (;;) {
+		buf = kmem_alloc(dstsize, KM_SLEEP);
+		zc->zc_nvlist_dst = (uint64_t)(intptr_t)buf;
+		zc->zc_nvlist_dst_size = dstsize;
+
+		rc = ldi_ioctl(lh, cmd, (intptr_t)zc, FKIOCTL, kcred, &unused);
+		if (rc == 0) {
+			*dst_alloc_size = dstsize;
+			return (0);
+		}
+
+		/*
+		 * The call failed, so nobody will consume the buffer.  Note
+		 * the size ZFS reported before we detach it.
+		 */
+		newsize = zc->zc_nvlist_dst_size;
+		kmem_free(buf, dstsize);
+		zc->zc_nvlist_dst = 0;
+		zc->zc_nvlist_dst_size = 0;
+
+		/*
+		 * Only ENOMEM accompanied by a larger required size means
+		 * "buffer too small"; anything else (including ENOMEM with no
+		 * size increase, which would otherwise have us retry forever)
+		 * is a real failure.
+		 */
+		if (rc != ENOMEM || newsize <= dstsize) {
+			break;
+		}
+
+		dstsize = newsize;
+		zc->zc_cookie = cookie;
+	}
+
+	*dst_alloc_size = 0;
+	return (rc);
+}
+
+/*
+ * Open /dev/zfs through LDI, returning the handle in *lh and its dev_t in
+ * *zfs_dev.  Returns 0 on success or -1 on failure; on failure no handle is
+ * left open.
+ */
+static int
+lx_zone_zfs_open(ldi_handle_t *lh, dev_t *zfs_dev)
+{
+	ldi_ident_t li;
+
+	if (ldi_ident_from_mod(&modlinkage, &li) != 0) {
+		return (-1);
+	}
+	if (ldi_open_by_name("/dev/zfs", FREAD|FWRITE, kcred, lh, li) != 0) {
+		ldi_ident_release(li);
+		return (-1);
+	}
+	/* The identifier is only needed for the open itself. */
+	ldi_ident_release(li);
+	if (ldi_get_dev(*lh, zfs_dev) != 0) {
+		(void) ldi_close(*lh, FREAD|FWRITE, kcred);
+		return (-1);
+	}
+	return (0);
+}
+
+/*
+ * We only get the relevant properties for zvols. This is because we're
+ * essentially iterating all of the ZFS datasets/zvols on the entire system
+ * when we boot the zone and there is a significant performance penalty if we
+ * have to retrieve all of the properties for everything. Especially since we
+ * don't care about any of them except the zvols actually in our delegated
+ * datasets.
+ *
+ * Note that the two properties we care about, volsize & volblocksize, are
+ * mandatory for zvols and should always be present. Also, note that the
+ * blocksize property value cannot change after the zvol has been created.
+ *
+ * On any failure *vsz and *bsz are simply left untouched.
+ */
+static void
+lx_zvol_props(ldi_handle_t lh, zfs_cmd_t *zc, uint64_t *vsz, uint64_t *bsz)
+{
+	int rc;
+	size_t size;
+	nvlist_t *nv = NULL, *nv2;
+
+	rc = lx_zfs_ioctl(lh, ZFS_IOC_OBJSET_STATS, zc, &size);
+	if (rc != 0) {
+		/*
+		 * A failed request may still have a reply buffer attached to
+		 * the command block; release it so that it is not leaked.
+		 */
+		if (zc->zc_nvlist_dst != 0) {
+			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+			zc->zc_nvlist_dst = (uintptr_t)NULL;
+			zc->zc_nvlist_dst_size = 0;
+		}
+		return;
+	}
+
+	rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst,
+	    zc->zc_nvlist_dst_size, &nv, 0);
+	ASSERT(rc == 0);
+
+	/* The packed reply is no longer needed once it has been unpacked. */
+	kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+	zc->zc_nvlist_dst = (uintptr_t)NULL;
+	zc->zc_nvlist_dst_size = 0;
+
+	/* Each property is a nested nvlist carrying a ZPROP_VALUE entry. */
+	if ((rc = nvlist_lookup_nvlist(nv, "volsize", &nv2)) == 0) {
+		uint64_t val;
+
+		rc = nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val);
+		if (rc == 0) {
+			*vsz = val;
+		}
+	}
+
+	if ((rc = nvlist_lookup_nvlist(nv, "volblocksize", &nv2)) == 0) {
+		uint64_t val;
+
+		rc = nvlist_lookup_uint64(nv2, ZPROP_VALUE, &val);
+		if (rc == 0) {
+			*bsz = val;
+		}
+	}
+
+	nvlist_free(nv);
+}
+
+/*
+ * Unlike ZFS proper, which does dynamic zvols, we currently only generate the
+ * zone's "disk" list once at zone boot time and use that consistently in all
+ * of the various subsystems (devfs, sysfs, procfs). This allows us to avoid
+ * re-iterating the datasets every time one of those subsystems accesses a
+ * "disk" and allows us to keep the view consistent across all subsystems, but
+ * it does mean a reboot is required to see new "disks". This is somewhat
+ * mitigated by its similarity to actual disk drives on a real system.
+ *
+ * Every zvol visible in the zone is appended to lxzd_vdisks, and *emul_minor
+ * is advanced once per entry added.
+ */
+static void
+lx_zone_get_zvols(zone_t *zone, ldi_handle_t lh, minor_t *emul_minor)
+{
+	lx_zone_data_t *lxzd;
+	list_t *zvol_lst, ds_lst;
+	int rc;
+	unsigned int devnum = 0;
+	size_t size;
+	zfs_cmd_t *zc;
+	nvpair_t *elem = NULL;
+	nvlist_t *pnv = NULL;
+
+	lxzd = ztolxzd(zone);
+	ASSERT(lxzd != NULL);
+	zvol_lst = lxzd->lxzd_vdisks;
+
+	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	if (lx_zfs_ioctl(lh, ZFS_IOC_POOL_CONFIGS, zc, &size) != 0) {
+		/*
+		 * A failed request may still have a reply buffer attached to
+		 * the command block; release it so that it is not leaked.
+		 */
+		if (zc->zc_nvlist_dst != 0) {
+			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+			zc->zc_nvlist_dst = 0;
+			zc->zc_nvlist_dst_size = 0;
+		}
+		goto out;
+	}
+	ASSERT(zc->zc_cookie > 0);
+
+	rc = nvlist_unpack((char *)(uintptr_t)zc->zc_nvlist_dst,
+	    zc->zc_nvlist_dst_size, &pnv, 0);
+	kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
+	if (rc != 0)
+		goto out;
+
+	/*
+	 * We use a dataset list to process all of the datasets in the pool
+	 * without doing recursion so that we don't risk blowing the kernel
+	 * stack.
+	 */
+	list_create(&ds_lst, sizeof (lx_zfs_ds_t),
+	    offsetof(lx_zfs_ds_t, ds_link));
+
+	/* Each top-level pair in the pool config is named after a pool. */
+	while ((elem = nvlist_next_nvpair(pnv, elem)) != NULL) {
+		lx_zfs_ds_t *ds;
+
+		ds = kmem_zalloc(sizeof (lx_zfs_ds_t), KM_SLEEP);
+		(void) strlcpy(ds->ds_name, nvpair_name(elem),
+		    sizeof (ds->ds_name));
+		list_insert_head(&ds_lst, ds);
+
+		while (ds != NULL) {
+			int w; /* dummy variable */
+
+			/* Ask for the next child of the current dataset. */
+			bzero(zc, sizeof (zfs_cmd_t));
+			zc->zc_cookie = ds->ds_cookie;
+			(void) strlcpy(zc->zc_name, ds->ds_name,
+			    sizeof (zc->zc_name));
+
+			rc = lx_zfs_ioctl(lh, ZFS_IOC_DATASET_LIST_NEXT,
+			    zc, NULL);
+			/* Update the cookie before doing anything else. */
+			ds->ds_cookie = zc->zc_cookie;
+
+			if (rc != 0) {
+				/*
+				 * No more children here: drop this dataset
+				 * and resume with the entry now at the tail
+				 * of the list (or finish if it is empty).
+				 */
+				list_remove(&ds_lst, ds);
+				kmem_free(ds, sizeof (lx_zfs_ds_t));
+				ds = list_tail(&ds_lst);
+				continue;
+			}
+
+			/* Reserved internal names, skip over these. */
+			if (strchr(zc->zc_name, '$') != NULL ||
+			    strchr(zc->zc_name, '%') != NULL)
+				continue;
+
+			if (!zone_dataset_visible_inzone(zone, zc->zc_name, &w))
+				continue;
+
+			if (zc->zc_objset_stats.dds_type == DMU_OST_ZVOL) {
+				lx_virt_disk_t *vd;
+				minor_t m = 0;
+				char *znm = zc->zc_name;
+
+				/* Create a virtual disk entry for the zvol */
+				vd = kmem_zalloc(sizeof (lx_virt_disk_t),
+				    KM_SLEEP);
+				vd->lxvd_type = LXVD_ZVOL;
+				(void) snprintf(vd->lxvd_name,
+				    sizeof (vd->lxvd_name),
+				    "zvol%u", devnum++);
+				(void) strlcpy(vd->lxvd_real_name,
+				    zc->zc_name,
+				    sizeof (vd->lxvd_real_name));
+
+				/* Record emulated and real dev_t values */
+				vd->lxvd_emul_dev = makedevice(LX_MAJOR_DISK,
+				    (*emul_minor)++);
+				if (zvol_name2minor(znm, &m) != 0) {
+					(void) zvol_create_minor(znm);
+					VERIFY(zvol_name2minor(znm, &m) == 0);
+				}
+				if (m != 0) {
+					vd->lxvd_real_dev = makedevice(
+					    getmajor(lxzd->lxzd_zfs_dev), m);
+				}
+
+				/*
+				 * Query volume size properties.  This reuses
+				 * (and modifies) zc, which is fine as zc is
+				 * rebuilt from ds at the top of the loop.
+				 */
+				lx_zvol_props(lh, zc, &vd->lxvd_volsize,
+				    &vd->lxvd_blksize);
+
+				list_insert_tail(zvol_lst, vd);
+			} else {
+				lx_zfs_ds_t *nds;
+
+				/* Create a new ds_t for the child. */
+				nds = kmem_zalloc(sizeof (lx_zfs_ds_t),
+				    KM_SLEEP);
+				(void) strlcpy(nds->ds_name, zc->zc_name,
+				    sizeof (nds->ds_name));
+				list_insert_after(&ds_lst, ds, nds);
+
+				/* Depth-first, so do the one just created. */
+				ds = nds;
+			}
+		}
+
+		ASSERT(list_is_empty(&ds_lst));
+	}
+
+	list_destroy(&ds_lst);
+
+out:
+	nvlist_free(pnv);
+	kmem_free(zc, sizeof (zfs_cmd_t));
+}
+
+/*
+ * If the zone's root file system lives on a ZFS dataset, add a virtual disk
+ * entry ("zfsds0") for it to lxzd_vdisks, advancing *emul_minor.
+ */
+static void
+lx_zone_get_zfsds(zone_t *zone, minor_t *emul_minor)
+{
+	lx_zone_data_t *lxzd = ztolxzd(zone);
+	vfs_t *vfsp = zone->zone_rootvp->v_vfsp;
+
+	/*
+	 * Only the root will be mounted at zone init time.
+	 * Finding means of discovering other datasets mounted in the zone
+	 * would be a good enhancement later.
+	 */
+	if (getmajor(vfsp->vfs_dev) == getmajor(lxzd->lxzd_zfs_dev)) {
+		lx_virt_disk_t *vd;
+
+		vd = kmem_zalloc(sizeof (lx_virt_disk_t), KM_SLEEP);
+		vd->lxvd_type = LXVD_ZFS_DS;
+		vd->lxvd_real_dev = vfsp->vfs_dev;
+		vd->lxvd_emul_dev = makedevice(LX_MAJOR_DISK, (*emul_minor)++);
+		(void) snprintf(vd->lxvd_name, sizeof (vd->lxvd_name),
+		    "zfsds%u", 0);
+		(void) strlcpy(vd->lxvd_real_name,
+		    refstr_value(vfsp->vfs_resource),
+		    sizeof (vd->lxvd_real_name));
+
+		list_insert_tail(lxzd->lxzd_vdisks, vd);
+	}
+}
+
+/* Cleanup virtual disk list */
+static void
+lx_zone_cleanup_vdisks(lx_zone_data_t *lxzd)
+{
+	lx_virt_disk_t *vd;
+
+	ASSERT(lxzd->lxzd_vdisks != NULL);
+	vd = (list_remove_head(lxzd->lxzd_vdisks));
+	while (vd != NULL) {
+		kmem_free(vd, sizeof (lx_virt_disk_t));
+		vd = list_remove_head(lxzd->lxzd_vdisks);
+	}
+
+	/* The list head itself is a separate allocation; free it as well. */
+	list_destroy(lxzd->lxzd_vdisks);
+	kmem_free(lxzd->lxzd_vdisks, sizeof (list_t));
+	lxzd->lxzd_vdisks = NULL;
+}
+
+/*
+ * By default illumos restricts access to ULP_DEF_EPRIV_PORT1 and
+ * ULP_DEF_EPRIV_PORT2 for TCP and UDP, even though these ports are outside of
+ * the privileged port range. Linux does not do this, so we need to remove
+ * these defaults.
+ *
+ * See also: mod_set_extra_privports
+ */
+static void
+lx_fix_ns_eports(netstack_t *ns)
+{
+	tcp_stack_t *tcps;
+	udp_stack_t *udps;
+	in_port_t *ports;
+	uint_t i, nports;
+	kmutex_t *lock;
+
+	/* Zero every extra privileged port slot in the TCP stack ... */
+	tcps = ns->netstack_tcp;
+	ports = tcps->tcps_g_epriv_ports;
+	nports = tcps->tcps_g_num_epriv_ports;
+	lock = &tcps->tcps_epriv_port_lock;
+	mutex_enter(lock);
+	for (i = 0; i < nports; i++)
+		ports[i] = 0;
+	mutex_exit(lock);
+
+	/* ... and likewise in the UDP stack. */
+	udps = ns->netstack_udp;
+	ports = udps->us_epriv_ports;
+	nports = udps->us_num_epriv_ports;
+	lock = &udps->us_epriv_port_lock;
+	mutex_enter(lock);
+	for (i = 0; i < nports; i++)
+		ports[i] = 0;
+	mutex_exit(lock);
+}
+
+/*
+ * The default limit for TCP buffer sizing on illumos is smaller than its
+ * counterparts on Linux.  Adjust it to meet minimum expectations.
+ */ +static void +lx_fix_ns_buffers(netstack_t *ns) +{ + mod_prop_info_t *pinfo; + ulong_t target, parsed; + char buf[16]; + + /* + * Prior to kernel 3.4, Linux defaulted to a max of 4MB for both the + * tcp_rmem and tcp_wmem tunables. Kernels since then increase the + * tcp_rmem default max to 6MB. Since illumos lacks separate tunables + * to cap sizing for read and write buffers, the higher value is + * selected for compatibility. + */ + if (lx_kern_release_cmp(curzone, "3.4.0") < 0) { + target = 4*1024*1024; + } else { + target = 6*1024*1024; + } + + pinfo = mod_prop_lookup(ns->netstack_tcp->tcps_propinfo_tbl, + "max_buf", MOD_PROTO_TCP); + if (pinfo == NULL || + pinfo->mpi_getf(ns, pinfo, NULL, buf, sizeof (buf), 0) != 0 || + ddi_strtoul(buf, NULL, 10, &parsed) != 0 || + parsed >= target) { + return; + } + + (void) snprintf(buf, sizeof (buf), "%lu", target); + (void) pinfo->mpi_setf(ns, CRED(), pinfo, NULL, buf, 0); +} + +static void +lx_bootup_hooks() +{ + netstack_t *ns; + + ns = netstack_get_current(); + if (ns == NULL) + return; + + lx_fix_ns_eports(ns); + lx_fix_ns_buffers(ns); + + netstack_rele(ns); +} + +void +lx_init_brand_data(zone_t *zone, kmutex_t *zsl) +{ + lx_zone_data_t *data; + ldi_handle_t lh; + + ASSERT(MUTEX_HELD(zsl)); + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(zone->zone_brand_data == NULL); + + data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP); + mutex_init(&data->lxzd_lock, NULL, MUTEX_DEFAULT, NULL); + + /* No need to hold mutex now since zone_brand_data is not set yet. */ + + /* + * Set the default lxzd_kernel_version to 2.4. + * This can be changed by a call to setattr() during zone boot. 
+ */ + (void) strlcpy(data->lxzd_kernel_release, "2.4.21", + LX_KERN_RELEASE_MAX); + (void) strlcpy(data->lxzd_kernel_version, "BrandZ virtual linux", + LX_KERN_VERSION_MAX); + data->lxzd_pipe_max_sz = lx_pipe_max_default; + + zone->zone_brand_data = data; + + /* + * In Linux, if the init(1) process terminates the system panics. + * The zone must reboot to simulate this behaviour. + */ + zone->zone_reboot_on_init_exit = B_TRUE; + + /* + * We cannot hold the zone_status_lock while performing zfs operations + * so we drop the lock, get the zfs devs as the last step in this + * function, then reaquire the lock. Don't add any code after this + * which requires that the zone_status_lock was continuously held. + */ + mutex_exit(zsl); + + data->lxzd_vdisks = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(data->lxzd_vdisks, sizeof (lx_virt_disk_t), + offsetof(lx_virt_disk_t, lxvd_link)); + + if (lx_zone_zfs_open(&lh, &data->lxzd_zfs_dev) == 0) { + minor_t emul_minor = 1; + + lx_zone_get_zfsds(zone, &emul_minor); + lx_zone_get_zvols(zone, lh, &emul_minor); + (void) ldi_close(lh, FREAD|FWRITE, kcred); + } else { + /* Avoid matching any devices */ + data->lxzd_zfs_dev = makedevice(-1, 0); + } + mutex_enter(zsl); +} + +void +lx_free_brand_data(zone_t *zone) +{ + lx_zone_data_t *data = ztolxzd(zone); + ASSERT(data != NULL); + mutex_enter(&data->lxzd_lock); + lx_audit_fini(zone); + if (data->lxzd_ioctl_sock != NULL) { + /* + * Since zone_kcred has been cleaned up already, close the + * socket using the global kcred. 
+ */ + (void) ksocket_close(data->lxzd_ioctl_sock, kcred); + data->lxzd_ioctl_sock = NULL; + } + ASSERT(data->lxzd_cgroup == NULL); + + lx_zone_cleanup_vdisks(data); + + mutex_exit(&data->lxzd_lock); + zone->zone_brand_data = NULL; + mutex_destroy(&data->lxzd_lock); + kmem_free(data, sizeof (*data)); +} + +void +lx_unsupported(char *dmsg) +{ + lx_proc_data_t *pd = ttolxproc(curthread); + + DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg); + + if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) { + /* + * If this process was run with strict mode enabled + * (via LX_STRICT in the environment), we mark this + * LWP as having triggered an unsupported behaviour. + * This flag will be checked at an appropriate point + * by lx_check_strict_failure(). + */ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + lwpd->br_strict_failure = B_TRUE; + } +} + +void +lx_check_strict_failure(lx_lwp_data_t *lwpd) +{ + proc_t *p; + + if (!lwpd->br_strict_failure) { + return; + } + + lwpd->br_strict_failure = B_FALSE; + + /* + * If this process is operating in strict mode (via LX_STRICT in + * the environment), and has triggered a call to + * lx_unsupported(), we drop SIGSYS on it as we return. + */ + p = curproc; + mutex_enter(&p->p_lock); + sigtoproc(p, curthread, SIGSYS); + mutex_exit(&p->p_lock); +} + +void +lx_trace_sysenter(int syscall_num, uintptr_t *args) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_entry_ptr != NULL); + + (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1], + args[2], args[3], args[4], args[5]); + } +} + +void +lx_trace_sysreturn(int syscall_num, long ret) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_return_ptr != NULL); + + (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0); + } +} + +/* + * Get the addresses of the user-space system call handler and attach it to + * the proc structure. Returning 0 indicates success; the value returned + * by the system call is the value stored in rval. 
Returning a non-zero + * value indicates a failure; the value returned is used to set errno, -1 + * is returned from the syscall and the contents of rval are ignored. To + * set errno and have the syscall return a value other than -1 we can + * manually set errno and rval and return 0. + */ +int +lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + lx_proc_data_t *pd; + struct termios *termios; + uint_t termios_len; + int error; + int code; + int sig; + lx_brand_registration_t reg; + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * There is one operation that is suppored for non-branded + * process. B_EXEC_BRAND. This is the equilivant of an + * exec call, but the new process that is created will be + * a branded process. + */ + if (cmd == B_EXEC_BRAND) { + VERIFY(p->p_zone != NULL); + VERIFY(p->p_zone->zone_brand == &lx_brand); + return (exec_common( + (char *)arg1, (const char **)arg2, (const char **)arg3, + EBA_BRAND)); + } + + /* For all other operations this must be a branded process. */ + if (p->p_brand == NULL) + return (ENOSYS); + + /* + * Certain native applications may wish to start the lx_lockd process. + * Every other process that's not branded should be denied. 
+ */ + if (p->p_brand != &lx_brand && cmd != B_START_NFS_LOCKD) + return (ENOSYS); + + if (cmd != B_START_NFS_LOCKD) + VERIFY(p->p_brand_data != NULL); + + switch (cmd) { + case B_REGISTER: + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("stack mode was not PREINIT during " + "REGISTER\n"); + return (EINVAL); + } + + if (p->p_model == DATAMODEL_NATIVE) { + if (copyin((void *)arg1, ®, sizeof (reg)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + } +#ifdef _LP64 + else { + /* 32-bit userland on 64-bit kernel */ + lx_brand_registration32_t reg32; + + if (copyin((void *)arg1, ®32, sizeof (reg32)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + + reg.lxbr_version = (uint_t)reg32.lxbr_version; + reg.lxbr_handler = + (void *)(uintptr_t)reg32.lxbr_handler; + reg.lxbr_flags = reg32.lxbr_flags; + } +#endif + + if (reg.lxbr_version != LX_VERSION_1) { + lx_print("Invalid brand library version (%u)\n", + reg.lxbr_version); + return (EINVAL); + } + + if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) { + lx_print("Invalid brand flags (%u)\n", + reg.lxbr_flags); + return (EINVAL); + } + + lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", + (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); + pd = p->p_brand_data; + pd->l_handler = (uintptr_t)reg.lxbr_handler; + pd->l_flags = reg.lxbr_flags & LX_PROC_ALL; + + /* + * There are certain setup tasks which cannot be performed + * during the lx_init_brand_data hook due to the calling + * context from zoneadmd (in the GZ). This work is instead + * delayed until the init process starts inside the zone. + */ + if (p->p_pid == p->p_zone->zone_proc_initpid) { + lx_bootup_hooks(); + } + + return (0); + + case B_TTYMODES: + /* This is necessary for emulating TCGETS ioctls. 
*/ + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, + &termios_len) != DDI_SUCCESS) + return (EIO); + + ASSERT(termios_len == sizeof (*termios)); + + if (copyout(&termios, (void *)arg1, sizeof (termios)) != 0) { + ddi_prop_free(termios); + return (EFAULT); + } + + ddi_prop_free(termios); + return (0); + + case B_ELFDATA: { + mutex_enter(&p->p_lock); + pd = curproc->p_brand_data; + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_elf_data_t led; + + bcopy(&pd->l_elf_data, &led, sizeof (led)); + mutex_exit(&p->p_lock); + + if (copyout(&led, (void *)arg1, + sizeof (lx_elf_data_t)) != 0) { + return (EFAULT); + } + } +#if defined(_LP64) + else { + /* 32-bit userland on 64-bit kernel */ + lx_elf_data32_t led32; + + led32.ed_phdr = (int)pd->l_elf_data.ed_phdr; + led32.ed_phent = (int)pd->l_elf_data.ed_phent; + led32.ed_phnum = (int)pd->l_elf_data.ed_phnum; + led32.ed_entry = (int)pd->l_elf_data.ed_entry; + led32.ed_base = (int)pd->l_elf_data.ed_base; + led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry; + mutex_exit(&p->p_lock); + + if (copyout(&led32, (void *)arg1, + sizeof (led32)) != 0) { + return (EFAULT); + } + } +#endif + return (0); + } + + case B_EXEC_NATIVE: + return (exec_common((char *)arg1, (const char **)arg2, + (const char **)arg3, EBA_NATIVE)); + + /* + * The B_TRUSS_POINT subcommand is used so that we can make a no-op + * syscall for debugging purposes (dtracing) from within the user-level + * emulation. + */ + case B_TRUSS_POINT: + return (0); + + case B_LPID_TO_SPAIR: { + /* + * Given a Linux pid as arg1, return the Solaris pid in arg2 and + * the Solaris LWP in arg3. We also translate pid 1 (which is + * hardcoded in many applications) to the zone's init process. 
+ */ + pid_t s_pid; + id_t s_tid; + + if ((pid_t)arg1 == 1) { + s_pid = p->p_zone->zone_proc_initpid; + /* handle the dead/missing init(1M) case */ + if (s_pid == -1) + s_pid = 1; + s_tid = 1; + } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) { + return (ESRCH); + } + + if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 || + copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) { + return (EFAULT); + } + + return (0); + } + + case B_PTRACE_STOP_FOR_OPT: + return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ? + B_FALSE : B_TRUE, (ulong_t)arg3, arg4)); + + case B_PTRACE_CLONE_BEGIN: + /* + * Leverage ptrace brand call to create a clone group for this + * proc if necessary. + */ + lx_clone_grp_create((uint_t)arg3); + + return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ? + B_FALSE : B_TRUE)); + + case B_PTRACE_SIG_RETURN: { + /* + * Our ptrace emulation must emit PR_SYSEXIT for rt_sigreturn. + * Since that syscall does not pass through the normal + * emulation, which would call lx_syscall_return, the event is + * emitted manually. A successful result of the syscall is + * assumed since there is little to be done in the face of + * failure. + */ + struct regs *rp = lwptoregs(lwp); + + rp->r_r0 = 0; + (void) lx_ptrace_stop(LX_PR_SYSEXIT); + return (0); + } + + case B_UNSUPPORTED: { + char dmsg[256]; + + if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) { + lx_print("Failed to copyin unsupported msg " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + dmsg[255] = '\0'; + lx_unsupported(dmsg); + + lx_check_strict_failure(lwpd); + + return (0); + } + + case B_STORE_ARGS: { + /* + * B_STORE_ARGS subcommand + * arg1 = address of struct to be copied in + * arg2 = size of the struct being copied in + * arg3-arg6 ignored + * rval = the amount of data copied. 
+ */ + void *buf; + + /* only have upper limit because arg2 is unsigned */ + if (arg2 > LX_BR_ARGS_SIZE_MAX) { + return (EINVAL); + } + + buf = kmem_alloc(arg2, KM_SLEEP); + if (copyin((void *)arg1, buf, arg2) != 0) { + lx_print("Failed to copyin scall arg at 0x%p\n", + (void *) arg1); + kmem_free(buf, arg2); + /* + * Purposely not setting br_scall_args to NULL + * to preserve data for debugging. + */ + return (EFAULT); + } + + if (lwpd->br_scall_args != NULL) { + ASSERT(lwpd->br_args_size > 0); + kmem_free(lwpd->br_scall_args, + lwpd->br_args_size); + } + + lwpd->br_scall_args = buf; + lwpd->br_args_size = arg2; + *rval = arg2; + return (0); + } + + case B_HELPER_CLONE: + return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3, + (void *)arg4)); + + case B_HELPER_SETGROUPS: + return (lx_helper_setgroups(arg1, (gid_t *)arg2)); + + case B_HELPER_SIGQUEUE: + return (lx_helper_rt_sigqueueinfo(arg1, arg2, + (siginfo_t *)arg3)); + + case B_HELPER_TGSIGQUEUE: + return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3, + (siginfo_t *)arg4)); + + case B_GETPID: + /* + * The usermode clone(2) code needs to be able to call + * lx_getpid() from native code: + */ + *rval = lx_getpid(); + return (0); + + case B_SET_NATIVE_STACK: + /* + * B_SET_NATIVE_STACK subcommand + * arg1 = the base of the stack to use for emulation + */ + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("B_SET_NATIVE_STACK when stack was already " + "set to %p\n", (void *)arg1); + return (EEXIST); + } + + /* + * We move from the PREINIT state, where we have no brand + * emulation stack, to the INIT state. Here, we are still + * running on what will become the BRAND stack, but are running + * emulation (i.e. native) code. Once the initialisation + * process for this thread has finished, we will jump to + * brand-specific code, while moving to the BRAND mode. + * + * When a new LWP is created, lx_initlwp() will clear the + * stack data. 
If that LWP is actually being duplicated + * into a child process by fork(2), lx_forklwp() will copy + * it so that the cloned thread will keep using the same + * alternate stack. + */ + lwpd->br_ntv_stack = arg1; + lwpd->br_stack_mode = LX_STACK_MODE_INIT; + lx_lwp_set_native_stack_current(lwpd, arg1); + + return (0); + + case B_GET_CURRENT_CONTEXT: + /* + * B_GET_CURRENT_CONTEXT subcommand: + * arg1 = address for pointer to current ucontext_t + */ + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext; + + error = copyout(&addr, (void *)arg1, sizeof (addr)); + } else +#endif + { + error = copyout(&lwp->lwp_oldcontext, (void *)arg1, + sizeof (lwp->lwp_oldcontext)); + } + + return (error != 0 ? EFAULT : 0); + + case B_JUMP_TO_LINUX: + /* + * B_JUMP_TO_LINUX subcommand: + * arg1 = ucontext_t pointer for jump state + */ + + if (arg1 == (uintptr_t)NULL) + return (EINVAL); + + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_NATIVE: { + struct regs *rp = lwptoregs(lwp); + + /* + * We are on the NATIVE stack, so we must preserve + * the extent of that stack. The pointer will be + * reset by a future setcontext(). + */ + lx_lwp_set_native_stack_current(lwpd, + (uintptr_t)rp->r_sp); + break; + } + + case LX_STACK_MODE_INIT: + /* + * The LWP is transitioning to Linux code for the first + * time. + */ + break; + + case LX_STACK_MODE_PREINIT: + /* + * This LWP has not installed an alternate stack for + * usermode emulation handling. + */ + return (ENOENT); + + case LX_STACK_MODE_BRAND: + /* + * The LWP should not be on the BRAND stack. 
+ */ + exit(CLD_KILLED, SIGSYS); + return (0); + } + + /* + * Transfer control to Linux: + */ + return (lx_runexe(lwp, (void *)arg1)); + + case B_EMULATION_DONE: + /* + * B_EMULATION_DONE subcommand: + * arg1 = ucontext_t * to restore + * arg2 = system call number + * arg3 = return code + * arg4 = if operation failed, the errno value + */ + + /* + * The first part of this operation is a setcontext() to + * restore the register state to the copy we preserved + * before vectoring to the usermode emulation routine. + * If that fails, we return (hopefully) to the emulation + * routine and it will handle the error. + */ +#if (_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + error = getsetcontext32(SETCONTEXT, (void *)arg1); + } else +#endif + { + error = getsetcontext(SETCONTEXT, (void *)arg1); + } + + if (error != 0) { + return (error); + } + + /* + * The saved Linux context has been restored. We handle the + * return value or errno with code common to the in-kernel + * system call emulation. 
+ */ + if ((error = (int)arg4) != 0) { + /* + * lx_syscall_return() looks at the errno in the LWP, + * so set it here: + */ + (void) set_errno(error); + } + lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3); + + return (0); + + case B_EXIT_AS_SIG: + code = CLD_KILLED; + sig = (int)arg1; + proc_is_exiting(p); + if (exitlwps(1) != 0) { + mutex_enter(&p->p_lock); + lwp_exit(); + } + ttolwp(curthread)->lwp_cursig = sig; + if (sig == SIGSEGV) { + if (core(sig, 0) == 0) + code = CLD_DUMPED; + } + exit(code, sig); + /* NOTREACHED */ + break; + + case B_OVERRIDE_KERN_VER: { + void *urel = (void *)arg1; + void *uver = (void *)arg2; + size_t len; + + pd = ptolxproc(p); + if (urel != NULL) { + if (copyinstr(urel, pd->l_uname_release, + LX_KERN_RELEASE_MAX, &len) != 0) { + return (EFAULT); + } + pd->l_uname_release[LX_KERN_RELEASE_MAX - 1] = '\0'; + } + if (uver != NULL) { + if (copyinstr(uver, pd->l_uname_version, + LX_KERN_VERSION_MAX, &len) != 0) { + return (EFAULT); + } + pd->l_uname_version[LX_KERN_VERSION_MAX - 1] = '\0'; + } + + return (0); + } + + case B_GET_PERSONALITY: { + unsigned int result; + + mutex_enter(&p->p_lock); + pd = ptolxproc(p); + result = pd->l_personality; + mutex_exit(&p->p_lock); + return (result); + } + + case B_START_NFS_LOCKD: + (void) lx_start_nfs_lockd(); + return (0); + + case B_BLOCK_ALL_SIGS: { + uint_t result = 0; + + mutex_enter(&p->p_lock); + pd = ptolxproc(p); + /* + * This is used to block handling of all signals during vfork() + * or clone(LX_CLONE_VFORK) emulation to match Linux semantics + * and prevent the parent's signal handlers being called before + * they are properly reset. 
+ */ + if (pd->l_block_all_signals != 0) { + result = set_errno(EAGAIN); + } else { + pd->l_block_all_signals = 1; + } + mutex_exit(&p->p_lock); + return (result); + } + + case B_UNBLOCK_ALL_SIGS: { + uint_t result = 0; + + mutex_enter(&p->p_lock); + pd = ptolxproc(p); + if (pd->l_block_all_signals == 0) { + result = set_errno(EINVAL); + } else { + pd->l_block_all_signals = 0; + } + mutex_exit(&p->p_lock); + return (result); + } + + case B_ALL_SIGS_BLOCKED: { + uint_t result; + + mutex_enter(&p->p_lock); + pd = ptolxproc(p); + result = pd->l_block_all_signals; + mutex_exit(&p->p_lock); + return (result); + } + } + + return (EINVAL); +} + +/* + * Compare linux kernel version to the one set for the zone. + * Returns greater than 0 if zone version is higher, less than 0 if the zone + * version is lower, and 0 if the versions are equal. + */ +int +lx_kern_release_cmp(zone_t *zone, const char *vers) +{ + int zvers[3] = {0, 0, 0}; + int cvers[3] = {0, 0, 0}; + int i; + lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; + + VERIFY(zone->zone_brand == &lx_brand); + + mutex_enter(&lxzd->lxzd_lock); + (void) sscanf(lxzd->lxzd_kernel_release, "%d.%d.%d", &zvers[0], + &zvers[1], &zvers[2]); + mutex_exit(&lxzd->lxzd_lock); + (void) sscanf(vers, "%d.%d.%d", &cvers[0], &cvers[1], &cvers[2]); + + for (i = 0; i < 3; i++) { + if (zvers[i] > cvers[i]) { + return (1); + } else if (zvers[i] < cvers[i]) { + return (-1); + } + } + return (0); +} + +/* + * Linux unconditionally removes the setuid and setgid bits when changing + * file ownership. This brand hook overrides the illumos native behaviour, + * which is based on the PRIV_FILE_SETID privilege. 
+ */ +/* ARGSUSED */ +static int +lx_setid_clear(vattr_t *vap, cred_t *cr) +{ + if (S_ISDIR(vap->va_mode)) { + return (0); + } + + if (vap->va_mode & S_ISUID) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~S_ISUID; + } + if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~S_ISGID; + } + + return (0); +} + +/* + * Copy the per-process brand data from a parent proc to a child. + */ +void +lx_copy_procdata(proc_t *cp, proc_t *pp) +{ + lx_proc_data_t *cpd, *ppd; + + /* + * Since b_copy_procdata is called during getproc(), while the child + * process is still being initialized, acquiring cp->p_lock should not + * be required. + */ + VERIFY(cp->p_brand == &lx_brand); + VERIFY((cpd = cp->p_brand_data) != NULL); + + mutex_enter(&pp->p_lock); + VERIFY(pp->p_brand == &lx_brand); + VERIFY((ppd = pp->p_brand_data) != NULL); + + bcopy(ppd, cpd, sizeof (lx_proc_data_t)); + mutex_exit(&pp->p_lock); + + /* Clear any aio contexts from child */ + lx_io_clear(cpd); + + /* + * The l_ptrace count is normally manipulated only while under holding + * p_lock. Since this is a freshly created process, it's safe to zero + * out. If it is to be inherited, the attach will occur later. 
+ */ + cpd->l_ptrace = 0; + + cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY; + cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY; + + cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20; + cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20; + + cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY; + cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY; + + cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY; + cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY; + + bzero(cpd->l_clone_grps, sizeof (cpd->l_clone_grps)); +} + +#if defined(_LP64) +static void +Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst) +{ + bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident)); + dst->e_type = src->e_type; + dst->e_machine = src->e_machine; + dst->e_version = src->e_version; + dst->e_entry = src->e_entry; + dst->e_phoff = src->e_phoff; + dst->e_shoff = src->e_shoff; + dst->e_flags = src->e_flags; + dst->e_ehsize = src->e_ehsize; + dst->e_phentsize = src->e_phentsize; + dst->e_phnum = src->e_phnum; + dst->e_shentsize = src->e_shentsize; + dst->e_shnum = src->e_shnum; + dst->e_shstrndx = src->e_shstrndx; +} +#endif /* _LP64 */ + +static void +restoreexecenv(struct execenv *ep, stack_t *sp) +{ + klwp_t *lwp = ttolwp(curthread); + + setexecenv(ep); + lwp->lwp_sigaltstack.ss_sp = sp->ss_sp; + lwp->lwp_sigaltstack.ss_size = sp->ss_size; + lwp->lwp_sigaltstack.ss_flags = sp->ss_flags; +} + +static uintptr_t +lx_map_vdso(struct uarg *args, struct cred *cred) +{ + int err; + char *fpath = LX_VDSO_PATH; + vnode_t *vp; + vattr_t attr; + caddr_t addr; + +#if defined(_LP64) + if (args->to_model != DATAMODEL_NATIVE) { + fpath = LX_VDSO_PATH32; + } +#endif + + /* + * The comm page should have been mapped in already. + */ + if (args->commpage == (uintptr_t)NULL) { + return ((uintptr_t)NULL); + } + + /* + * Ensure the VDSO library is present and appropriately sized. 
+ * This lookup is started at the zone root to avoid complications for + * processes which have chrooted. For the specified lookup root to be + * used, the leading slash must be dropped from the path. + */ + ASSERT(fpath[0] == '/'); + fpath++; + if (lookupnameat(fpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, + curzone->zone_rootvp) != 0) { + return ((uintptr_t)NULL); + } + + /* + * The VDSO requires data exposed via the comm page in order to + * function properly. The VDSO is always mapped in at a fixed known + * offset from the comm page, providing an easy means to locate it. + */ + addr = (caddr_t)(args->commpage - LX_VDSO_SIZE); + attr.va_mask = AT_SIZE; + if (VOP_GETATTR(vp, &attr, 0, cred, NULL) != 0 || + attr.va_size > LX_VDSO_SIZE) { + VN_RELE(vp); + return ((uintptr_t)NULL); + } + + err = execmap(vp, addr, attr.va_size, 0, 0, + PROT_USER|PROT_READ|PROT_EXEC, 1, 0); + VN_RELE(vp); + if (err != 0) { + return ((uintptr_t)NULL); + } + return ((uintptr_t)addr); +} + +/* + * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux + * binaries. + */ +/* ARGSUSED4 */ +static int +lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, + struct intpdata *idata, int level, size_t *execsz, int setid, + caddr_t exec_file, struct cred *cred, int *brand_action) +{ + int error; + vnode_t *nvp; + Ehdr ehdr; + Addr uphdr_vaddr; + intptr_t voffset; + char *interp = NULL; + uintptr_t ldaddr = (uintptr_t)NULL; + proc_t *p = ttoproc(curthread); + klwp_t *lwp = ttolwp(curthread); + lx_proc_data_t *lxpd = ptolxproc(p); + struct execenv env, origenv; + stack_t orig_sigaltstack; + struct user *up = PTOU(ttoproc(curthread)); + lx_elf_data_t edp; + char *lib_path = LX_LIB_PATH; + boolean_t execstk = B_TRUE; + unsigned int personality; + + ASSERT(p->p_brand == &lx_brand); + ASSERT(lxpd != NULL); + + /* + * Start with a separate struct for ELF data instead of inheriting + * values from the currently running binary. 
This ensures that fields + * such as ed_base are cleared if the new binary does not utilize an + * interpreter. + */ + bzero(&edp, sizeof (edp)); + +#if defined(_LP64) + if (args->to_model != DATAMODEL_NATIVE) { + lib_path = LX_LIB_PATH32; + } +#endif + + /* + * Set the brandname and library name for the new process so that + * elfexec() puts them onto the stack. + */ + args->brandname = LX_BRANDNAME; + args->emulator = lib_path; + +#if defined(_LP64) + /* + * To conform with the way Linux lays out the address space, we clamp + * the stack to be the top of the lower region of the x86-64 canonical + * form address space -- which has the side-effect of laying out the + * entire address space in that lower region. Note that this only + * matters on 64-bit processes (this value will always be greater than + * the size of a 32-bit address space) and doesn't actually affect + * USERLIMIT: if a Linux-branded processes wishes to map something + * into the top half of the address space, it can do so -- but with + * the user stack starting at the top of the bottom region, those high + * virtual addresses won't be used unless explicitly directed. + */ + args->maxstack = lx_maxstack64; +#endif + + /* + * Search the binary for a PT_GNU_STACK header. The PF_X bit contained + * within is used to dictate protection defaults for the stack, among + * other things. 
+ */ + if (args->to_model == DATAMODEL_NATIVE) { + Ehdr ehdr; + Phdr *phdrp; + caddr_t phdrbase = NULL; + size_t phdrsize = 0; + uint_t nphdrs, hsize; + + if ((error = elfreadhdr(vp, cred, &ehdr, &nphdrs, &phdrbase, + &phdrsize)) != 0) { + return (error); + } + + hsize = ehdr.e_phentsize; + /* LINTED: alignment */ + phdrp = (Phdr *)phdrbase; + for (uint_t i = nphdrs; i > 0; i--) { + switch (phdrp->p_type) { + case PT_GNU_STACK: + if ((phdrp->p_flags & PF_X) == 0) { + execstk = B_FALSE; + } + break; + } + /* LINTED: alignment */ + phdrp = (Phdr *)((caddr_t)phdrp + hsize); + } + kmem_free(phdrbase, phdrsize); + } +#if defined(_LP64) + else { + Elf32_Ehdr ehdr; + Elf32_Phdr *phdrp; + caddr_t phdrbase = NULL; + size_t phdrsize = 0; + uint_t nphdrs, hsize; + + if ((error = elf32readhdr(vp, cred, &ehdr, &nphdrs, &phdrbase, + &phdrsize)) != 0) { + return (error); + } + + hsize = ehdr.e_phentsize; + /* LINTED: alignment */ + phdrp = (Elf32_Phdr *)phdrbase; + for (uint_t i = nphdrs; i > 0; i--) { + switch (phdrp->p_type) { + case PT_GNU_STACK: + if ((phdrp->p_flags & PF_X) == 0) { + execstk = B_FALSE; + } + break; + } + /* LINTED: alignment */ + phdrp = (Elf32_Phdr *)((caddr_t)phdrp + hsize); + } + kmem_free(phdrbase, phdrsize); + } +#endif + + /* + * Revert the base personality while maintaining any existing flags. + */ + personality = LX_PER_LINUX | (lxpd->l_personality & ~LX_PER_MASK); + + /* + * Linux defaults to an executable stack unless the aformentioned + * PT_GNU_STACK entry in the elf header dictates otherwise. Enabling + * the READ_IMPLIES_EXEC personality flag is also implied in this case. + */ + if (execstk) { + args->stk_prot |= PROT_EXEC; + args->stk_prot_override = B_TRUE; + personality |= LX_PER_READ_IMPLIES_EXEC; + } + + /* + * We will first exec the brand library, then map in the linux + * executable and the linux linker. 
+ */ + if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP, + &nvp))) { + uprintf("%s: not found.", lib_path); + return (error); + } + + /* + * We will eventually set the p_exec member to be the vnode for the new + * executable when we call setexecenv(). However, if we get an error + * before that call we need to restore the execenv to its original + * values so that when we return to the caller fop_close() works + * properly while cleaning up from the failed exec(). Restoring the + * original value will also properly decrement the 2nd VN_RELE that we + * took on the brand library. + */ + origenv.ex_bssbase = p->p_bssbase; + origenv.ex_brkbase = p->p_brkbase; + origenv.ex_brksize = p->p_brksize; + origenv.ex_vp = p->p_exec; + orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp; + orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size; + orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags; + + if (args->to_model == DATAMODEL_NATIVE) { + error = elfexec(nvp, uap, args, idata, INTP_MAXDEPTH + 1, + execsz, setid, exec_file, cred, brand_action); + } +#if defined(_LP64) + else { + error = elf32exec(nvp, uap, args, idata, INTP_MAXDEPTH + 1, + execsz, setid, exec_file, cred, brand_action); + } +#endif + VN_RELE(nvp); + if (error != 0) { + restoreexecenv(&origenv, &orig_sigaltstack); + return (error); + } + + /* + * exec-ed in the brand library above. + * The u_auxv vectors are now setup by elfexec to point to the + * brand emulation library and its linker. + */ + + /* + * After execing the brand library (which should have implicitly mapped + * in the comm page), map the VDSO into the approprate place in the AS. 
+ */ + lxpd->l_vdso = lx_map_vdso(args, cred); + + bzero(&env, sizeof (env)); + + /* + * map in the the Linux executable + */ + if (args->to_model == DATAMODEL_NATIVE) { + error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, + &voffset, exec_file, &interp, &env.ex_bssbase, + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); + } +#if defined(_LP64) + else { + Elf32_Ehdr ehdr32; + Elf32_Addr uphdr_vaddr32; + + error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, + &voffset, exec_file, &interp, &env.ex_bssbase, + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); + + Ehdr32to64(&ehdr32, &ehdr); + + if (uphdr_vaddr32 == (Elf32_Addr)-1) + uphdr_vaddr = (Addr)-1; + else + uphdr_vaddr = uphdr_vaddr32; + } +#endif + if (error != 0) { + restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + + return (error); + } + + /* + * Save off the important properties of the lx executable. The brand + * library will ask us for this data later, when it is ready to set + * things up for the lx executable. + */ + edp.ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff : + voffset + uphdr_vaddr; + edp.ed_entry = voffset + ehdr.e_entry; + edp.ed_phent = ehdr.e_phentsize; + edp.ed_phnum = ehdr.e_phnum; + + if (interp != NULL) { + if (ehdr.e_type == ET_DYN) { + /* + * This is a shared object executable, so we need to + * pick a reasonable place to put the heap. Just don't + * use the first page. + */ + env.ex_brkbase = (caddr_t)PAGESIZE; + env.ex_bssbase = (caddr_t)PAGESIZE; + } + + /* + * If the program needs an interpreter (most do), map it in and + * store relevant information about it in the aux vector, where + * the brand library can find it. 
+ */ + if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW, + NULLVPP, &nvp))) { + uprintf("%s: not found.", interp); + restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); + return (error); + } + + kmem_free(interp, MAXPATHLEN); + interp = NULL; + + /* + * map in the Linux linker + */ + if (args->to_model == DATAMODEL_NATIVE) { + error = mapexec_brand(nvp, args, &ehdr, + &uphdr_vaddr, &voffset, exec_file, NULL, NULL, + NULL, NULL, NULL, &ldaddr); + } +#if defined(_LP64) + else { + Elf32_Ehdr ehdr32; + Elf32_Addr uphdr_vaddr32; + + error = mapexec32_brand(nvp, args, &ehdr32, + &uphdr_vaddr32, &voffset, exec_file, NULL, NULL, + NULL, NULL, NULL, &ldaddr); + + Ehdr32to64(&ehdr32, &ehdr); + + if (uphdr_vaddr32 == (Elf32_Addr)-1) + uphdr_vaddr = (Addr)-1; + else + uphdr_vaddr = uphdr_vaddr32; + } +#endif + + VN_RELE(nvp); + if (error != 0) { + restoreexecenv(&origenv, &orig_sigaltstack); + return (error); + } + + /* + * Now that we know the base address of the brand's linker, + * we also save this for later use by the brand library. + */ + edp.ed_base = voffset; + edp.ed_ldentry = voffset + ehdr.e_entry; + } else { + /* + * This program has no interpreter. The lx brand library will + * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector, + * so in this case, put the entry point of the main executable + * there. + */ + if (ehdr.e_type == ET_EXEC) { + /* + * An executable with no interpreter, this must be a + * statically linked executable, which means we loaded + * it at the address specified in the elf header, in + * which case the e_entry field of the elf header is an + * absolute address. + */ + edp.ed_ldentry = ehdr.e_entry; + edp.ed_entry = ehdr.e_entry; + } else { + /* + * A shared object with no interpreter, we use the + * calculated address from above. 
+ */ + edp.ed_ldentry = edp.ed_entry; + + /* + * In all situations except an ET_DYN elf object with no + * interpreter, we want to leave the brk and base + * values set by mapexec_brand alone. Normally when + * running ET_DYN objects on Solaris (most likely + * /lib/ld.so.1) the kernel sets brk and base to 0 since + * it doesn't know where to put the heap, and later the + * linker will call brk() to initialize the heap in: + * usr/src/cmd/sgs/rtld/common/setup.c:setup() + * after it has determined where to put it. (This + * decision is made after the linker loads and inspects + * elf properties of the target executable being run.) + * + * The Linux linker does not do this, though, and + * doesn't understand the semantics that we give the + * native linker (namely, that the first non-zero arg + * call to brk() will *set* the brkbase but leave + * brksize at 0 -- Linux binaries instead expect that + * this would extend the brk from 0 upto that arg). + * + * So we should never leave here with ex_brkbase == 0 + * or else we will get segfaults as Linux binaries + * misinterpret what we return from brk(). + * + * It's probably not great, but we'll just set brkbase + * to PAGESIZE. If there's something down there in the + * way then libc/malloc should fall back to mmap() when + * we fail to extend the brk for them. + */ + if (ehdr.e_type == ET_DYN) { + env.ex_bssbase = (caddr_t)PAGESIZE; + env.ex_brkbase = (caddr_t)PAGESIZE; + env.ex_brksize = 0; + } + } + } + + /* + * Never leave this code with a 0 brkbase, or else we expose the + * native linker semantics of initial brk() to Linux binaries which + * will misinterpret them. + */ + ASSERT3U(env.ex_brkbase, !=, (caddr_t)0); + + env.ex_vp = vp; + setexecenv(&env); + + /* + * We try to keep /proc's view of the aux vector consistent with + * what's on the process stack. See the comment on the lx_times + * syscall for an explanation of the hardcoded LX_USERHZ. 
+ */ + if (args->to_model == DATAMODEL_NATIVE) { + auxv_t phdr_auxv[4] = { + { AT_SUN_BRAND_LX_PHDR, 0 }, + { AT_SUN_BRAND_LX_INTERP, 0 }, + { AT_SUN_BRAND_LX_CLKTCK, 0 }, + { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 } + }; + phdr_auxv[0].a_un.a_val = edp.ed_phdr; + phdr_auxv[1].a_un.a_val = ldaddr; + phdr_auxv[2].a_un.a_val = LX_USERHZ; + phdr_auxv[3].a_un.a_val = lxpd->l_vdso; + + if (copyout(&phdr_auxv, args->auxp_brand, + sizeof (phdr_auxv)) == -1) + return (EFAULT); + } +#if defined(_LP64) + else { + auxv32_t phdr_auxv32[4] = { + { AT_SUN_BRAND_LX_PHDR, 0 }, + { AT_SUN_BRAND_LX_INTERP, 0 }, + { AT_SUN_BRAND_LX_CLKTCK, 0 }, + { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 } + }; + phdr_auxv32[0].a_un.a_val = edp.ed_phdr; + phdr_auxv32[1].a_un.a_val = ldaddr; + phdr_auxv32[2].a_un.a_val = hz; + phdr_auxv32[3].a_un.a_val = lxpd->l_vdso; + + if (copyout(&phdr_auxv32, args->auxp_brand, + sizeof (phdr_auxv32)) == -1) + return (EFAULT); + } +#endif + + /* + * /proc uses the AT_ENTRY aux vector entry to deduce + * the location of the executable in the address space. The user + * structure contains a copy of the aux vector that needs to have those + * entries patched with the values of the real lx executable (they + * currently contain the values from the lx brand library that was + * elfexec'd, above). + * + * For live processes, AT_BASE is used to locate the linker segment, + * which /proc and friends will later use to find Solaris symbols + * (such as rtld_db_preinit). However, for core files, /proc uses + * AT_ENTRY to find the right segment to label as the executable. + * So we set AT_ENTRY to be the entry point of the linux executable, + * but leave AT_BASE to be the address of the Solaris linker. 
+ */ + for (uint_t i = 0; i < __KERN_NAUXV_IMPL; i++) { + switch (up->u_auxv[i].a_type) { + case AT_ENTRY: + up->u_auxv[i].a_un.a_val = edp.ed_entry; + break; + + case AT_SUN_BRAND_LX_PHDR: + up->u_auxv[i].a_un.a_val = edp.ed_phdr; + break; + + case AT_SUN_BRAND_LX_INTERP: + up->u_auxv[i].a_un.a_val = ldaddr; + break; + + case AT_SUN_BRAND_LX_CLKTCK: + up->u_auxv[i].a_un.a_val = hz; + break; + + default: + break; + } + } + + /* + * Record the brand ELF data and new personality now that the exec has + * proceeded successfully. + */ + bcopy(&edp, &lxpd->l_elf_data, sizeof (edp)); + lxpd->l_personality = personality; + + return (0); +} + +boolean_t +lx_native_exec(uint8_t osabi, const char **interp) +{ + if (osabi != ELFOSABI_SOLARIS) + return (B_FALSE); + + /* + * If the process root matches the zone root, prepend /native to the + * interpreter path for native executables. Absolute precision from + * VN_CMP is not necessary since any change of process root is likely + * to make native binaries inaccessible via /native. + * + * Processes which chroot directly into /native will be able to + * function as expected with no need for the prefix. + */ + mutex_enter(&curproc->p_lock); + if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) { + *interp = "/native"; + } + mutex_exit(&curproc->p_lock); + + return (B_TRUE); +} + +static void +lx_syscall_init(void) +{ + int i; + + /* + * Count up the 32-bit Linux system calls. Note that lx_sysent32 + * has (LX_NSYSCALLS + 1) entries. + */ + for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++) + continue; + lx_nsysent32 = i; + +#if defined(_LP64) + /* + * Count up the 64-bit Linux system calls. Note that lx_sysent64 + * has (LX_NSYSCALLS + 1) entries. 
+ */ + for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++) + continue; + lx_nsysent64 = i; +#endif +} + +int +_init(void) +{ + int err = 0; + + /* Initialize USER_HZ scaling factor */ + ASSERT(hz >= LX_USERHZ); + lx_hz_scale = hz / LX_USERHZ; + + lx_syscall_init(); + lx_pid_init(); + lx_ioctl_init(); + lx_futex_init(); + lx_ptrace_init(); + lx_socket_init(); + lx_audit_ld(); + + err = mod_install(&modlinkage); + if (err != 0) { + cmn_err(CE_WARN, "Couldn't install lx brand module"); + + /* + * This looks drastic, but it should never happen. These + * two data structures should be completely free-able until + * they are used by Linux processes. Since the brand + * wasn't loaded there should be no Linux processes, and + * thus no way for these data structures to be modified. + */ + lx_pid_fini(); + lx_ioctl_fini(); + if (lx_futex_fini()) + panic("lx brand module cannot be loaded or unloaded."); + } + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + int futex_done = 0; + + /* + * If there are any zones using this brand, we can't allow it to be + * unloaded. + */ + if (brand_zone_count(&lx_brand)) + return (EBUSY); + + lx_ptrace_fini(); + lx_pid_fini(); + lx_ioctl_fini(); + lx_socket_fini(); + lx_audit_unld(); + + if ((err = lx_futex_fini()) != 0) { + goto done; + } + futex_done = 1; + + err = mod_remove(&modlinkage); + +done: + if (err) { + /* + * If we can't unload the module, then we have to get it + * back into a sane state. 
+ */ + lx_ptrace_init(); + lx_pid_init(); + lx_ioctl_init(); + lx_socket_init(); + + if (futex_done) { + lx_futex_init(); + } + } + + return (err); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_lockd.c b/usr/src/uts/common/brand/lx/os/lx_lockd.c new file mode 100644 index 0000000000..37b744b0e8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_lockd.c @@ -0,0 +1,351 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * lx_start_nfs_lockd() starts an NFS lockd (lx_lockd) process inside the zone. + * This uses the same technique as used in our lx cgroupfs to launch a release + * agent process. This is called implicitly when an NFS mount syscall occurs + * within the zone. See the user-level lx_lockd source for the "big theory" + * comment behind this. + * + * lx_upcall_statd() is a brand hook that interposes on the rpc.statd RPC + * handling so that we can interface to a Linux rpc.statd that must run + * when NFSv3 locking is in use. The rpc.statd handles server or client reboots + * and interacts with the lockd to reclaim locks after the server reboots. The + * rcp.statd also informs the server when we reboot, so the server can release + * the locks we held. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/errno.h> +#include <sys/cred.h> +#include <sys/systm.h> +#include <sys/policy.h> +#include <sys/vmparam.h> +#include <sys/contract_impl.h> +#include <sys/pool.h> +#include <sys/stack.h> +#include <sys/var.h> +#include <sys/rt.h> +#include <sys/fx.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/pathname.h> +#include <rpcsvc/nlm_prot.h> +#include <rpcsvc/sm_inter.h> +#include <klm/nlm_impl.h> + +#define LX_LOCKD_PATH "/native/usr/lib/brand/lx/lx_lockd" + +/* Linux lockd RPC called by statd when it detects an NFS server reboot */ +#define LX_NLMPROC_NSM_NOTIFY 16 + +/* From uts/common/klm/nlm_impl.c */ +extern void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *); +extern void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *); + +/* + * Check if the current lockd is still running. + */ +static boolean_t +lx_lockd_alive(pid_t lockd_pid) +{ + boolean_t ret = B_FALSE; + proc_t *p; + vnode_t *vp; + char path[MAXPATHLEN]; + + mutex_enter(&pidlock); + p = prfind(lockd_pid); + if (p == NULL) { + mutex_exit(&pidlock); + return (B_FALSE); + } + + mutex_enter(&p->p_lock); + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + return (B_FALSE); + } + vp = p->p_exec; + VN_HOLD(vp); + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + + if (vnodetopath(NULL, vp, path, sizeof (path), CRED()) == 0 && + strcmp(path, LX_LOCKD_PATH) == 0) { + ret = B_TRUE; + } + + VN_RELE(vp); + return (ret); +} + +static void +lx_run_lockd(void *a) +{ + proc_t *p = curproc; + zone_t *z = curzone; + struct core_globals *cg; + lx_zone_data_t *lxzd = ztolxzd(z); + int res; + + ASSERT(!INGLOBALZONE(p)); + VERIFY(lxzd != NULL); + + /* The following block is derived from start_init_common */ + ASSERT_STACK_ALIGNED(); + + p->p_cstime = p->p_stime = p->p_cutime = p->p_utime = 0; + p->p_usrstack = (caddr_t)USRSTACK32; + p->p_model = 
DATAMODEL_ILP32; + p->p_stkprot = PROT_ZFOD & ~PROT_EXEC; + p->p_datprot = PROT_ZFOD & ~PROT_EXEC; + p->p_stk_ctl = INT32_MAX; + + p->p_as = as_alloc(); + p->p_as->a_proc = p; + p->p_as->a_userlimit = (caddr_t)USERLIMIT32; + (void) hat_setup(p->p_as->a_hat, HAT_INIT); + + VERIFY((cg = zone_getspecific(core_zone_key, z)) != NULL); + + corectl_path_hold(cg->core_default_path); + corectl_content_hold(cg->core_default_content); + + p->p_corefile = cg->core_default_path; + p->p_content = cg->core_default_content; + + init_mstate(curthread, LMS_SYSTEM); + res = exec_init(LX_LOCKD_PATH, NULL); + + /* End of code derived from start_init_common */ + + /* The following is derived from zone_start_init - see comments there */ + if (res != 0 || zone_status_get(global_zone) >= ZONE_IS_SHUTTING_DOWN) { + if (proc_exit(CLD_EXITED, res) != 0) { + mutex_enter(&p->p_lock); + ASSERT(p->p_flag & SEXITLWPS); + lwp_exit(); + } + } else { + id_t cid = curthread->t_cid; + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&p->p_lock); + (void) parmsset(&pcparms, curthread); + mutex_exit(&p->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + + /* + * Set our pid as the lockd pid in the zone data, or exit + * if another process raced and already did so. 
+ */ + mutex_enter(&lxzd->lxzd_lock); + if (lxzd->lxzd_lockd_pid != 0) { + /* another mount raced and created a new lockd */ + mutex_exit(&lxzd->lxzd_lock); + if (proc_exit(CLD_EXITED, 0) != 0) { + mutex_enter(&p->p_lock); + ASSERT(p->p_flag & SEXITLWPS); + lwp_exit(); + } + return; + } + lxzd->lxzd_lockd_pid = p->p_pid; + mutex_exit(&lxzd->lxzd_lock); + + /* cause the process to return to userland. */ + lwp_rtt(); + } +} + +/* + * Launch the user-level, native, lx_lockd process. + */ +int +lx_start_nfs_lockd() +{ + id_t cid; + proc_t *p = ttoproc(curthread); + zone_t *z = p->p_zone; + lx_zone_data_t *lxzd = ztolxzd(z); + + ASSERT(!INGLOBALZONE(p)); + ASSERT(lxzd != NULL); + + /* + * This should only be called by the mount emulation, which must have + * 'root' privileges in order to have performed a mount, but + * double-check. + */ + if (crgetuid(CRED()) != 0) + return (EPERM); + + mutex_enter(&lxzd->lxzd_lock); + if (lxzd->lxzd_lockd_pid != 0) { + /* verify lockd is still alive */ + pid_t lockd_pid; + + lockd_pid = lxzd->lxzd_lockd_pid; + mutex_exit(&lxzd->lxzd_lock); + + if (lx_lockd_alive(lockd_pid)) + return (EEXIST); + + mutex_enter(&lxzd->lxzd_lock); + if (lxzd->lxzd_lockd_pid != lockd_pid) { + /* another mount raced and created a new lockd */ + mutex_exit(&lxzd->lxzd_lock); + return (EEXIST); + } + + /* old lockd is dead, launch a new one */ + lxzd->lxzd_lockd_pid = 0; + } + mutex_exit(&lxzd->lxzd_lock); + + if (z->zone_defaultcid > 0) { + cid = z->zone_defaultcid; + } else { + pool_lock(); + cid = pool_get_class(z->zone_pool); + pool_unlock(); + } + if (cid == -1) + cid = defaultcid; + + /* + * There's nothing to do here if creating the proc fails, but we + * return the result to make it obvious while DTracing. 
+ */ + return (newproc(lx_run_lockd, NULL, cid, minclsyspri - 1, NULL, -1)); +} + +void +lx_upcall_statd(int op, struct nlm_globals *g, struct nlm_host *host) +{ + struct nlm_nsm *nsm; + struct mon args; + struct mon_id *mip = &args.mon_id; + int family; + netobj obj; + enum clnt_stat stat; + + /* + * For Linux rpc.statd monitor registration, the Linux NSMPROC_MON and + * NSMPROC_UNMON RPC upcalls correspond almost directly to the native + * SM_MON and SM_UNMON RPC upcalls. The key differences with the native + * registration is that in our nlm_host_monitor function we make two + * RPC calls: + * - the first RPC (nsmaddrproc1_reg_1) uses our private 'nsm_addr' + * RPC protocol to register the lockd RPC information that statd + * should call when it detects that the remote server rebooted + * - the second RPC (sm_mon_1) tells statd the information about the + * remote server to be monitored + * For Linux, there is only a single RPC from the kernel to the local + * statd. This RPC is equivalent to our sm_mon_1 code, but it uses the + * Linux-private NLMPROC_NSM_NOTIFY lockd procedure in the 'my_proc' + * RPC parameter. This corresponds to our private 'nsm_addr' code, and + * tells statd which lockd RPC to call when it detects a server reboot. + * + * Because our sm_mon_1 RPC is so similar to the Linux RPC, we can use + * that directly and simply set the expected value in the 'my_proc' + * argument. + * + * Within the kernel lockd RPC handling, the nlm_prog_3_dtable dispatch + * table has an entry for each lockd RPC function. Thus, this table also + * contains an entry for the Linux NLMPROC_NSM_NOTIFY procedure. That + * procedure number is unused by the native lockd code, so there is no + * conflict with dispatching that procedure. The implementation of the + * procedure corresponds to the native, private NLM_SM_NOTIFY1 + * procedure which is called by the native rpc.statd. 
+ * + * The Linux RPC call to "unmonitor" a host expects the same arguments + * as we pass to monitor, so that is also handled here by this same + * brand hook. + */ + + /* + * If the NLM was set up to be "v4 only" (i.e. no RPC call handlers + * to localhost at configure time), the semaphore is uninitialized, + * and will indefinitely hang. FURTHERMORE if just the semaphore + * was initialized, we'd still panic with a NULL nsm->ns_handle. + */ + if (g->nlm_v4_only) { + stat = RPC_SYSTEMERROR; + goto bail; + } + + nlm_netbuf_to_netobj(&host->nh_addr, &family, &obj); + nsm = &g->nlm_nsm; + + bzero(&args, sizeof (args)); + + mip->mon_name = host->nh_name; + mip->my_id.my_name = uts_nodename(); + mip->my_id.my_prog = NLM_PROG; + mip->my_id.my_vers = NLM_SM; + mip->my_id.my_proc = LX_NLMPROC_NSM_NOTIFY; + if (op == SM_MON) { + bcopy(&host->nh_sysid, args.priv, sizeof (uint16_t)); + } + + sema_p(&nsm->ns_sem); + nlm_nsm_clnt_init(nsm->ns_handle, nsm); + if (op == SM_MON) { + struct sm_stat_res mres; + + bzero(&mres, sizeof (mres)); + stat = sm_mon_1(&args, &mres, nsm->ns_handle); + } else { + struct sm_stat ures; + + ASSERT(op == SM_UNMON); + bzero(&ures, sizeof (ures)); + stat = sm_unmon_1(mip, &ures, nsm->ns_handle); + } + sema_v(&nsm->ns_sem); + +bail: + if (stat != RPC_SUCCESS) { + NLM_WARN("Failed to contact local statd, stat=%d", stat); + if (op == SM_MON) { + mutex_enter(&g->lock); + host->nh_flags &= ~NLM_NH_MONITORED; + mutex_exit(&g->lock); + } + } +} diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c new file mode 100644 index 0000000000..a97e025200 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_misc.c @@ -0,0 +1,1136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2022 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/archsystm.h> +#include <sys/privregs.h> +#include <sys/exec.h> +#include <sys/lwp.h> +#include <sys/sem.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_siginfo.h> +#include <sys/lx_futex.h> +#include <lx_errno.h> +#include <sys/lx_userhz.h> +#include <sys/cmn_err.h> +#include <sys/siginfo.h> +#include <sys/contract/process_impl.h> +#include <sys/x86_archext.h> +#include <sys/sdt.h> +#include <lx_signum.h> +#include <lx_syscall.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <net/if.h> +#include <inet/ip6.h> +#include <sys/sunddi.h> +#include <sys/dlpi.h> +#include <sys/sysmacros.h> + +/* Linux specific functions and definitions */ +static void lx_save(void *); +static void lx_restore(void *); + +/* Context op template. 
*/ +static const struct ctxop_template lx_ctxop_template = { + .ct_rev = CTXOP_TPL_REV, + .ct_save = lx_save, + .ct_restore = lx_restore, + .ct_exit = lx_save, +}; + +/* + * Set the return code for the forked child, always zero + */ +/*ARGSUSED*/ +void +lx_setrval(klwp_t *lwp, int v1, int v2) +{ + lwptoregs(lwp)->r_r0 = 0; +} + +/* + * Reset process state on exec(2) + */ +void +lx_exec() +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p = ttoproc(curthread); + lx_proc_data_t *pd = ptolxproc(p); + struct regs *rp = lwptoregs(lwp); + + /* b_exec is called without p_lock held */ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * Any l_handler handlers set as a result of B_REGISTER are now + * invalid; clear them. + */ + pd->l_handler = (uintptr_t)NULL; + + /* + * If this was a multi-threaded Linux process and this lwp wasn't the + * main lwp, then we need to make its Illumos and Linux PIDs match. + */ + if (curthread->t_tid != 1) { + lx_pid_reassign(curthread); + } + + /* + * Inform ptrace(2) that we are processing an execve(2) call so that if + * we are traced we can post either the PTRACE_EVENT_EXEC event or the + * legacy SIGTRAP. + */ + (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0); + + /* clear the fs/gsbase values until the app. can reinitialize them */ + lwpd->br_lx_fsbase = (uintptr_t)NULL; + lwpd->br_ntv_fsbase = (uintptr_t)NULL; + lwpd->br_lx_gsbase = (uintptr_t)NULL; + lwpd->br_ntv_gsbase = (uintptr_t)NULL; + + /* + * Clear the native stack flags. This will be reinitialised by + * lx_init() in the new process image. 
/*
 * Per-LWP brand cleanup shared by lx_exitlwp() and lx_freelwp().
 *
 * In order, this: runs lx_ptrace_exit() unless the LWP is already flagged
 * LX_PTF_EXITING, clears TP_KTHREAD on the thread, detaches and processes
 * any robust futex list, and finally runs the lx_save() context operation
 * by hand.  The LWP must still carry its brand data (lwpd != NULL).
 *
 * NOTE(review): p_lock is taken here with mutex_enter() but there is no
 * matching mutex_exit() in this function; presumably sprunlock() is what
 * drops it -- confirm against sprlock_proc()/sprunlock().
 */
static void
lx_cleanlwp(klwp_t *lwp, proc_t *p)
{
	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
	void *rb_list = NULL;

	VERIFY(lwpd != NULL);

	mutex_enter(&p->p_lock);
	/* Skip the ptrace teardown if it has already been started. */
	if ((lwpd->br_ptrace_flags & LX_PTF_EXITING) == 0) {
		lx_ptrace_exit(p, lwp);
	}

	/*
	 * While we have p_lock, clear the TP_KTHREAD flag. This is needed
	 * to prevent races within lx procfs. It's fine for prchoose() to pick
	 * this thread now since it is exiting and no longer blocked in the
	 * kernel.
	 */
	lwptot(lwp)->t_proc_flag &= ~TP_KTHREAD;

	/*
	 * While we have p_lock, safely grab any robust_list references and
	 * clear the lwp field.  The list pointer is taken over locally so
	 * that it is processed at most once even if this function runs
	 * again for the same LWP.
	 */
	sprlock_proc(p);
	rb_list = lwpd->br_robust_list;
	lwpd->br_robust_list = NULL;
	sprunlock(p);

	/* Handle the robust list only after the process lock is released. */
	if (rb_list != NULL) {
		lx_futex_robust_exit((uintptr_t)rb_list, lwpd->br_pid);
	}

	/*
	 * We need to run our context exit operation (lx_save) here to ensure
	 * we don't leave any garbage around. This is necessary to handle the
	 * following calling sequence:
	 *     exit -> proc_exit -> lx_freelwp -> removectx
	 * That is, when our branded process exits, proc_exit will call our
	 * lx_freelwp brand hook which does call this function (lx_cleanlwp),
	 * but lx_freelwp also removes our context exit operation. The context
	 * exit functions are run by exitctx, which is called by either
	 * lwp_exit or thread_exit. The thread_exit function is called at the
	 * end of proc_exit when we'll swtch() to another thread, but by then
	 * our context exit function has been removed.
	 *
	 * It's ok if this function happens to be called more than once (for
	 * example, if we exec a native binary).
	 */
	kpreempt_disable();
	lx_save(lwp);
	kpreempt_enable();
}
+			 */
+			if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) ||
+			    (ptid == -1))
+				goto free;
+
+			mutex_enter(&pidlock);
+			if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) {
+				mutex_exit(&pidlock);
+				goto free;
+			}
+			mutex_enter(&p->p_lock);
+			mutex_exit(&pidlock);
+
+			if ((t = idtot(p, ptid)) == NULL) {
+				mutex_exit(&p->p_lock);
+				goto free;
+			}
+		}
+
+		sqp->sq_info.si_signo = lwpd->br_signal;
+		sqp->sq_info.si_code = lwpd->br_exitwhy;
+		sqp->sq_info.si_status = lwpd->br_exitwhat;
+		sqp->sq_info.si_pid = lwpd->br_pid;
+		sqp->sq_info.si_uid = crgetruid(CRED());
+		sigaddqa(p, t, sqp);
+		mutex_exit(&p->p_lock);
+		sqp = NULL;
+	}
+
+free:
+	if (lwpd->br_scall_args != NULL) {
+		ASSERT(lwpd->br_args_size > 0);
+		kmem_free(lwpd->br_scall_args, lwpd->br_args_size);
+	}
+	if (sqp)
+		kmem_free(sqp, sizeof (sigqueue_t));
+}
+/*
+ * Release the lx brand state attached to an LWP. LWPs that carry no brand
+ * data are ignored. Otherwise this notifies the zone's cgroup vfs (when
+ * lxzd_cgroup is set), runs lx_cleanlwp, removes the syscall interposer and
+ * the lx context ops, releases the LWP's lx pid mapping when br_pid is
+ * non-zero, and frees the affinity mask and the lx_lwp_data itself.
+ *
+ * The caller must not hold p_lock.
+ */
+void
+lx_freelwp(klwp_t *lwp)
+{
+	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+	proc_t *p = lwptoproc(lwp);
+	lx_zone_data_t *lxzdata;
+	vfs_t *cgrp;
+
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	if (lwpd == NULL) {
+		/*
+		 * There is one case where an LX branded process will possess
+		 * LWPs which lack their own brand data. During the course of
+		 * executing a native binary, the process will be preemptively
+		 * branded to allow hooks such as b_native_exec to function.
+		 * If that process possesses multiple LWPs, they will _not_ be
+		 * branded since they will exit if the exec succeeds. It's
+		 * during this LWP exit that lx_freelwp would be called on an
+		 * unbranded LWP. When that is the case, it is acceptable to
+		 * bypass the hook.
+		 */
+		return;
+	}
+
+	/*
+	 * cgroup integration: take a hold on the cgroup vfs under lxzd_lock,
+	 * then drop the lock before calling into the cgroup module.
+	 */
+	lxzdata = ztolxzd(p->p_zone);
+	mutex_enter(&lxzdata->lxzd_lock);
+	cgrp = lxzdata->lxzd_cgroup;
+	if (cgrp != NULL) {
+		VFS_HOLD(cgrp);
+		mutex_exit(&lxzdata->lxzd_lock);
+		ASSERT(lx_cgrp_freelwp != NULL);
+		(*lx_cgrp_freelwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
+		    lwpd->br_pid);
+		VFS_RELE(cgrp);
+	} else {
+		mutex_exit(&lxzdata->lxzd_lock);
+	}
+
+	/*
+	 * It is possible for the lx_freelwp hook to be called without a prior
+	 * call to lx_exitlwp being made. This happens as part of lwp
+	 * de-branding when a native binary is executed from a branded process.
+	 *
+	 * To cover all cases, lx_cleanlwp is called from lx_exitlwp as well as
+	 * here in lx_freelwp. When the second call is redundant, the
+	 * resources will already be freed and no work will be needed.
+	 */
+	lx_cleanlwp(lwp, p);
+
+	/*
+	 * Remove our system call interposer.
+	 */
+	lwp->lwp_brand_syscall = NULL;
+
+	/*
+	 * If this process is being de-branded during an exec(),
+	 * the LX ctxops may have already been removed, so the result
+	 * from ctxop_remove is irrelevant.
+	 */
+	(void) ctxop_remove(lwptot(lwp), &lx_ctxop_template, lwp);
+	if (lwpd->br_pid != 0) {
+		lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid);
+	}
+
+	/*
+	 * Discard the affinity mask.
+	 */
+	VERIFY(lwpd->br_affinitymask != NULL);
+	cpuset_free(lwpd->br_affinitymask);
+	lwpd->br_affinitymask = NULL;
+
+	/*
+	 * Ensure that lx_ptrace_exit() has been called to detach
+	 * ptrace(2) tracers and tracees.
+	 */
+	VERIFY(lwpd->br_ptrace_tracer == NULL);
+	VERIFY(lwpd->br_ptrace_accord == NULL);
+
+	lwp->lwp_brand = NULL;
+	kmem_free(lwpd, sizeof (struct lx_lwp_data));
+}
+/*
+ * Allocate the brand data for a new LWP: the lx_lwp_data itself, its lx_pid
+ * record and its CPU affinity mask. A pid is also allocated when the process
+ * already has LWPs (p_lwpcnt > 0).
+ *
+ * Returns the new lx_lwp_data, or NULL if the pid allocation fails. The
+ * caller must not hold p_lock.
+ */
+void *
+lx_lwpdata_alloc(proc_t *p)
+{
+	lx_lwp_data_t *lwpd;
+	struct lx_pid *lpidp;
+	cpuset_t *affmask;
+	pid_t newpid = 0;
+	struct pid *pidp = NULL;
+
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * LWPs beyond the first will require a pid to be allocated to emulate
+	 * Linux's goofy thread model. While this allocation may be
+	 * unnecessary when a single-lwp process undergoes branding, it cannot
+	 * be performed during b_initlwp due to p_lock being held.
+	 */
+	if (p->p_lwpcnt > 0) {
+		if ((newpid = pid_allocate(p, 0, 0)) < 0) {
+			return (NULL);
+		}
+		pidp = pid_find(newpid);
+	}
+
+	lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP);
+	lpidp = kmem_zalloc(sizeof (struct lx_pid), KM_SLEEP);
+	affmask = cpuset_alloc(KM_SLEEP);
+
+	lpidp->lxp_lpid = newpid;
+	lpidp->lxp_pidp = pidp;
+	lwpd->br_lpid = lpidp;
+	lwpd->br_affinitymask = affmask;
+
+	return (lwpd);
+}
+
+/*
+ * Free lwp brand data if an error occurred during lwp_create.
+ * Otherwise, lx_freelwp will be used to free the resources after they're
+ * associated with the lwp via lx_initlwp.
+ *
+ * lwpbd must be non-NULL and must still own both its lx_pid record (br_lpid)
+ * and its affinity mask; any pid held by the lx_pid record is released.
+ */
+void
+lx_lwpdata_free(void *lwpbd)
+{
+	lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
+	VERIFY(lwpd != NULL);
+	VERIFY(lwpd->br_lpid != NULL);
+	VERIFY(lwpd->br_affinitymask != NULL);
+
+	cpuset_free(lwpd->br_affinitymask);
+	if (lwpd->br_lpid->lxp_pidp != NULL) {
+		(void) pid_rele(lwpd->br_lpid->lxp_pidp);
+	}
+	kmem_free(lwpd->br_lpid, sizeof (*lwpd->br_lpid));
+	kmem_free(lwpd, sizeof (*lwpd));
+}
+/*
+ * Attach the brand data in lwpbd to a new LWP and initialize it. When the
+ * creating thread (curthread) carries lx brand data, the new LWP inherits its
+ * cgroup ID and emulated scheduling state and, unless the new LWP is the only
+ * thread on the process's thread list, its TLS and fs/gsbase values too.
+ * This also assigns the lx pid, installs the lx context ops and syscall
+ * interposer, and notifies the zone's cgroup vfs when lxzd_cgroup is set.
+ *
+ * The caller must hold p_lock, and the LWP must not yet have brand data.
+ */
+void
+lx_initlwp(klwp_t *lwp, void *lwpbd)
+{
+	lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
+	lx_lwp_data_t *plwpd = ttolxlwp(curthread);
+	kthread_t *tp = lwptot(lwp);
+	proc_t *p = lwptoproc(lwp);
+	lx_zone_data_t *lxzdata;
+	vfs_t *cgrp;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+	VERIFY(lwp->lwp_brand == NULL);
+
+	lwpd->br_exitwhy = CLD_EXITED;
+	lwpd->br_lwp = lwp;
+	lwpd->br_clear_ctidp = NULL;
+	lwpd->br_set_ctidp = NULL;
+	lwpd->br_signal = 0;
+	lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
+	cpuset_all(lwpd->br_affinitymask);
+
+	/*
+	 * The first thread in a process has ppid set to the parent
+	 * process's pid, and ptid set to -1. Subsequent threads in the
+	 * process have their ppid set to the pid of the thread that
+	 * created them, and their ptid to that thread's tid.
+	 */
+	if (tp->t_next == tp) {
+		lwpd->br_ppid = tp->t_procp->p_ppid;
+		lwpd->br_ptid = -1;
+	} else if (plwpd != NULL) {
+		bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls));
+		lwpd->br_ppid = plwpd->br_pid;
+		lwpd->br_ptid = curthread->t_tid;
+		/* The child inherits the fs/gsbase values from the parent */
+		lwpd->br_lx_fsbase = plwpd->br_lx_fsbase;
+		lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase;
+		lwpd->br_lx_gsbase = plwpd->br_lx_gsbase;
+		lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase;
+	} else {
+		/*
+		 * Oddball case: the parent thread isn't a Linux process.
+		 */
+		lwpd->br_ppid = 0;
+		lwpd->br_ptid = -1;
+	}
+	lwp->lwp_brand = lwpd;
+
+	/*
+	 * During lx_lwpdata_alloc, we must decide whether or not to
+	 * allocate a new pid to associate with the lwp. Since p_lock is not
+	 * held at that point, the only time we can guarantee a new pid isn't
+	 * needed is when p_lwpcnt == 0. This is because other lwps won't be
+	 * present to race with us with regards to pid allocation.
+	 *
+	 * This means that in all other cases (where p_lwpcnt > 0), we expect
+	 * that lx_lwpdata_alloc will allocate a pid for us to use here, even
+	 * if it is unneeded. If this process is undergoing an exec, for
+	 * example, the single existing lwp will not need a new pid when it is
+	 * rebranded. In that case, lx_pid_assign will free the unneeded pid.
+	 */
+	VERIFY(lwpd->br_lpid->lxp_pidp != NULL || p->p_lwpcnt == 0);
+
+	lx_pid_assign(tp, lwpd->br_lpid);
+	lwpd->br_tgid = lwpd->br_pid;
+	/*
+	 * Having performed the lx pid assignment, the lpid reference is no
+	 * longer needed. The underlying data will be freed during lx_freelwp.
+	 */
+	lwpd->br_lpid = NULL;
+
+	ctxop_install(lwptot(lwp), &lx_ctxop_template, lwp);
+
+	/*
+	 * Install branded system call hooks for this LWP:
+	 */
+	lwp->lwp_brand_syscall = lx_syscall_enter;
+
+	/*
+	 * The new LWP inherits the parent LWP cgroup ID.
+	 */
+	if (plwpd != NULL) {
+		lwpd->br_cgroupid = plwpd->br_cgroupid;
+	}
+	/*
+	 * The new LWP inherits the parent LWP emulated scheduling info.
+	 */
+	if (plwpd != NULL) {
+		lwpd->br_schd_class = plwpd->br_schd_class;
+		lwpd->br_schd_pri = plwpd->br_schd_pri;
+		lwpd->br_schd_flags = plwpd->br_schd_flags;
+		lwpd->br_schd_runtime = plwpd->br_schd_runtime;
+		lwpd->br_schd_deadline = plwpd->br_schd_deadline;
+		lwpd->br_schd_period = plwpd->br_schd_period;
+	}
+	lxzdata = ztolxzd(p->p_zone);
+	mutex_enter(&lxzdata->lxzd_lock);
+	cgrp = lxzdata->lxzd_cgroup;
+	if (cgrp != NULL) {
+		VFS_HOLD(cgrp);
+		mutex_exit(&lxzdata->lxzd_lock);
+		ASSERT(lx_cgrp_initlwp != NULL);
+		(*lx_cgrp_initlwp)(cgrp, lwpd->br_cgroupid, lwptot(lwp)->t_tid,
+		    lwpd->br_pid);
+		VFS_RELE(cgrp);
+	} else {
+		mutex_exit(&lxzdata->lxzd_lock);
+	}
+}
+/*
+ * Follow-up to lx_initlwp that treats curthread as the parent of the new LWP
+ * and, when curthread carries lx brand data, hands both to
+ * lx_ptrace_inherit_tracer.
+ */
+void
+lx_initlwp_post(klwp_t *lwp)
+{
+	lx_lwp_data_t *plwpd = ttolxlwp(curthread);
+	/*
+	 * If the parent LWP has a ptrace(2) tracer, the new LWP may
+	 * need to inherit that same tracer.
+	 */
+	if (plwpd != NULL) {
+		lx_ptrace_inherit_tracer(plwpd, lwptolxlwp(lwp));
+	}
+}
+
+/*
+ * Copy the fork-relevant brand state from srclwp to dstlwp: parentage, TLS,
+ * the alternate stack description, the BR_CPU_BOUND flag and (through
+ * lx_affinity_forklwp) the CPU affinity. Any pending clone group request on
+ * the source is consumed here.
+ *
+ * There is no need to have any locking for either the source or
+ * destination struct lx_lwp_data structs. This is always run in the
+ * thread context of the source thread, and the destination thread is
+ * always newly created and not referred to from anywhere else.
+ */
+void
+lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp)
+{
+	struct lx_lwp_data *src = srclwp->lwp_brand;
+	struct lx_lwp_data *dst = dstlwp->lwp_brand;
+
+	dst->br_ppid = src->br_pid;
+	dst->br_ptid = lwptot(srclwp)->t_tid;
+	bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls));
+
+	switch (src->br_stack_mode) {
+	case LX_STACK_MODE_BRAND:
+	case LX_STACK_MODE_NATIVE:
+		/*
+		 * The parent LWP has an alternate stack installed.
+		 * The child LWP should have the same stack base and extent.
+		 */
+		dst->br_stack_mode = src->br_stack_mode;
+		dst->br_ntv_stack = src->br_ntv_stack;
+		dst->br_ntv_stack_current = src->br_ntv_stack_current;
+		break;
+
+	default:
+		/*
+		 * Otherwise, clear the stack data for this LWP.
+		 */
+		dst->br_stack_mode = LX_STACK_MODE_PREINIT;
+		dst->br_ntv_stack = 0;
+		dst->br_ntv_stack_current = 0;
+	}
+
+	/*
+	 * copy only these flags
+	 */
+	dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND;
+	dst->br_scall_args = NULL;
+	lx_affinity_forklwp(srclwp, dstlwp);
+
+	/*
+	 * Flag so child doesn't ptrace-stop on syscall exit.
+	 */
+	dst->br_ptrace_flags |= LX_PTF_NOSTOP;
+
+	if (src->br_clone_grp_flags != 0) {
+		lx_clone_grp_enter(src->br_clone_grp_flags, lwptoproc(srclwp),
+		    lwptoproc(dstlwp));
+		/* clone group no longer pending on this thread */
+		src->br_clone_grp_flags = 0;
+	}
+}
+
+/*
+ * When switching a Linux process off the CPU, clear its GDT entries.
+ * Every TLS slot from GDT_TLSMIN onward (LX_TLSNUM of them) is pointed at
+ * null_udesc; on amd64 the segment registers are reset first.
+ */
+/* ARGSUSED */
+static void
+lx_save(void *arg)
+{
+	int i;
+
+#if defined(__amd64)
+	reset_sregs();
+#endif
+	for (i = 0; i < LX_TLSNUM; i++)
+		gdt_update_usegd(GDT_TLSMIN + i, &null_udesc);
+}
+
+/*
+ * When switching a Linux process on the CPU, set its GDT entries.
+ *
+ * For 64-bit code we don't have to worry about explicitly setting the
+ * %fsbase via wrmsr(MSR_AMD_FSBASE) here. Instead, that should happen
+ * automatically in update_sregs if we are executing in user-land. If this
+ * is the case then pcb_rupdate should be set.
+ */ +static void +lx_restore(void *arg) +{ + klwp_t *t = (klwp_t *)arg; + struct lx_lwp_data *lwpd = lwptolxlwp(t); + user_desc_t *tls; + int i; + + ASSERT(lwpd); + + tls = lwpd->br_tls; + for (i = 0; i < LX_TLSNUM; i++) + gdt_update_usegd(GDT_TLSMIN + i, &tls[i]); +} + +void +lx_set_gdt(int entry, user_desc_t *descrp) +{ + + gdt_update_usegd(entry, descrp); +} + +void +lx_clear_gdt(int entry) +{ + gdt_update_usegd(entry, &null_udesc); +} + +longlong_t +lx_nosys() +{ + return (set_errno(ENOSYS)); +} + +/* + * Brand-specific routine to check if given non-Solaris standard segment + * register values should be modified to other values. + */ +/*ARGSUSED*/ +greg_t +lx_fixsegreg(greg_t sr, model_t datamodel) +{ + uint16_t idx = SELTOIDX(sr); + + ASSERT(sr == (sr & 0xffff)); + + /* + * If the segment selector is a valid TLS selector, just return it. + */ + if (!SELISLDT(sr) && idx >= GDT_TLSMIN && idx <= GDT_TLSMAX) + return (sr | SEL_UPL); + + /* + * Force the SR into the LDT in ring 3 for 32-bit processes. + * + * 64-bit processes get the null GDT selector since they are not + * allowed to have a private LDT. + */ +#if defined(__amd64) + return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0); +#elif defined(__i386) + datamodel = datamodel; /* datamodel currently unused for 32-bit */ + return (sr | SEL_TI_LDT | SEL_UPL); +#endif /* __amd64 */ +} + +/* + * Brand-specific function to convert the fsbase as pulled from the register + * into a native fsbase suitable for locating the ulwp_t from the kernel. + */ +uintptr_t +lx_fsbase(klwp_t *lwp, uintptr_t fsbase) +{ + lx_lwp_data_t *lwpd = lwp->lwp_brand; + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND || + lwpd->br_ntv_fsbase == (uintptr_t)NULL) { + return (fsbase); + } + + return (lwpd->br_ntv_fsbase); +} + +/* + * These two functions simulate winfo and post_sigcld for the lx brand. The + * difference is delivering a designated signal as opposed to always SIGCLD. 
+ */
+static void
+lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat)
+{
+	ASSERT(MUTEX_HELD(&pidlock));
+	bzero(ip, sizeof (k_siginfo_t));
+	ip->si_signo = ltos_signo[dat->l_signal];
+	ip->si_code = pp->p_wcode;
+	ip->si_pid = pp->p_pid;
+	ip->si_ctid = PRCTID(pp);
+	ip->si_zoneid = pp->p_zone->zone_id;
+	ip->si_status = pp->p_wdata;
+	/*
+	 * These siginfo values are converted to USER_HZ in the user-land
+	 * brand signal code.
+	 */
+	ip->si_stime = pp->p_stime;
+	ip->si_utime = pp->p_utime;
+}
+/*
+ * Fill in sqp via lx_winfo and queue it to the parent of cp. The caller must
+ * hold pidlock and supply a non-NULL sqp; the parent's p_lock is taken here
+ * around the delivery.
+ */
+static void
+lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat)
+{
+	proc_t *pp = cp->p_parent;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+	mutex_enter(&pp->p_lock);
+	/*
+	 * Since Linux doesn't queue SIGCHLD, or any other non RT
+	 * signals, we just blindly deliver whatever signal we can.
+	 */
+	ASSERT(sqp != NULL);
+	lx_winfo(cp, &sqp->sq_info, dat);
+	sigaddqa(pp, NULL, sqp);
+	sqp = NULL;
+	mutex_exit(&pp->p_lock);
+}
+
+
+/*
+ * Brand specific code for exiting and sending a signal to the parent, as
+ * opposed to sigcld().
+ *
+ * Must be called with pidlock held. For the exit-type wait codes the child
+ * is added to the parent's newstate list and any waiters are woken. The
+ * child is then either freed (parent has SNOWAIT set or ignores SIGCLD, and
+ * CLDWAITPID is not set) or its designated exit signal is posted (unless
+ * CLDNOSIGCHLD is set or no signal was designated). A sqp that was not
+ * consumed by the posting is freed before returning.
+ */
+void
+lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp)
+{
+	proc_t *pp = cp->p_parent;
+	lx_proc_data_t *lx_brand_data = ptolxproc(cp);
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	switch (cp->p_wcode) {
+	case CLD_EXITED:
+	case CLD_DUMPED:
+	case CLD_KILLED:
+		ASSERT(cp->p_stat == SZOMB);
+		/*
+		 * The broadcast on p_srwchan_cv is a kludge to
+		 * wakeup a possible thread in uadmin(A_SHUTDOWN).
+		 */
+		cv_broadcast(&cp->p_srwchan_cv);
+
+		/*
+		 * Add to newstate list of the parent
+		 */
+		add_ns(pp, cp);
+
+		cv_broadcast(&pp->p_cv);
+		if ((pp->p_flag & SNOWAIT) ||
+		    PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) {
+			if (!(cp->p_pidflag & CLDWAITPID))
+				freeproc(cp);
+		} else if (!(cp->p_pidflag & CLDNOSIGCHLD) &&
+		    lx_brand_data->l_signal != 0) {
+			lx_post_exit_sig(cp, sqp, lx_brand_data);
+			sqp = NULL;
+		}
+		break;
+
+	case CLD_STOPPED:
+	case CLD_CONTINUED:
+	case CLD_TRAPPED:
+		panic("Should not be called in this case");
+	}
+
+	if (sqp)
+		siginfofree(sqp);
+}
+
+/*
+ * Filters based on arguments that have been passed in by a separate syscall
+ * using the B_STORE_ARGS mechanism. If the __WALL flag is set, no filter is
+ * applied; otherwise we look at the difference between a clone and non-clone
+ * process.
+ * The definition of a clone process in Linux is a thread that does not deliver
+ * SIGCHLD to its parent. The option __WCLONE indicates to wait only on clone
+ * processes. Without that option, a process should only wait on normal
+ * children. The following table shows the cases.
+ *
+ *                   default    __WCLONE
+ *   no SIGCHLD      -          X
+ *   SIGCHLD         X          -
+ *
+ * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on
+ * process exit.
+ *
+ * B_TRUE is also returned, without examining the child, whenever the calling
+ * LWP does not have br_waitid_emulate set.
+ *
+ * More information on wait in lx brands can be found at
+ * usr/src/lib/brand/lx/lx_brand/common/wait.c.
+ */
+/* ARGSUSED */
+boolean_t
+lx_wait_filter(proc_t *pp, proc_t *cp)
+{
+	lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+	int flags = lwpd->br_waitid_flags;
+	boolean_t ret;
+
+	if (!lwpd->br_waitid_emulate) {
+		return (B_TRUE);
+	}
+
+	mutex_enter(&cp->p_lock);
+	if (flags & LX_WALL) {
+		ret = B_TRUE;
+	} else {
+		lx_proc_data_t *pd = ptolxproc(cp);
+		boolean_t is_sigchld = B_TRUE;
+		boolean_t match_wclone = B_FALSE;
+
+		/*
+		 * When calling clone, an alternate signal can be chosen to
+		 * deliver to the parent when the child exits. A child with
+		 * no lx proc data is treated as delivering SIGCHLD.
+		 */
+		if (pd != NULL && pd->l_signal != stol_signo[SIGCHLD]) {
+			is_sigchld = B_FALSE;
+		}
+		if ((flags & LX_WCLONE) != 0) {
+			match_wclone = B_TRUE;
+		}
+
+		ret = (match_wclone ^ is_sigchld) ? B_TRUE : B_FALSE;
+	}
+	mutex_exit(&cp->p_lock);
+
+	return (ret);
+}
+/*
+ * Translate the loopback interface name in place: "lo" becomes "lo0" for
+ * LX_IF_TONATIVE, and "lo0" becomes "lo" for any other action. All other
+ * names are left untouched.
+ */
+void
+lx_ifname_convert(char *ifname, lx_if_action_t act)
+{
+	if (act == LX_IF_TONATIVE) {
+		if (strncmp(ifname, "lo", IFNAMSIZ) == 0)
+			(void) strlcpy(ifname, "lo0", IFNAMSIZ);
+	} else {
+		if (strncmp(ifname, "lo0", IFNAMSIZ) == 0)
+			(void) strlcpy(ifname, "lo", IFNAMSIZ);
+	}
+}
+/*
+ * Convert interface flags in place between the Linux and native layouts.
+ * Only the flags named in the mask below are carried over unchanged; the
+ * multicast flag is remapped between IFF_MULTICAST and the Linux bit 0x1000.
+ * Every other bit is dropped.
+ */
+void
+lx_ifflags_convert(uint64_t *flags, lx_if_action_t act)
+{
+	uint64_t buf;
+
+	buf = *flags & (IFF_UP | IFF_BROADCAST | IFF_DEBUG |
+	    IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS |
+	    IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI);
+
+	/* Linux has different shift for multicast flag */
+	if (act == LX_IF_TONATIVE) {
+		if (*flags & 0x1000)
+			buf |= IFF_MULTICAST;
+	} else {
+		if (*flags & IFF_MULTICAST)
+			buf |= 0x1000;
+	}
+	*flags = buf;
+}
+
+/*
+ * Convert an IPv6 address into the numbers used by /proc/net/if_inet6.
+ * Addresses matching none of the recognized classes yield 0.
+ */
+unsigned int
+lx_ipv6_scope_convert(const in6_addr_t *addr)
+{
+	if (IN6_IS_ADDR_V4COMPAT(addr)) {
+		return (LX_IPV6_ADDR_COMPATv4);
+	} else if (IN6_ARE_ADDR_EQUAL(addr, &ipv6_loopback)) {
+		return (LX_IPV6_ADDR_LOOPBACK);
+	} else if (IN6_IS_ADDR_LINKLOCAL(addr)) {
+		return (LX_IPV6_ADDR_LINKLOCAL);
+	} else if (IN6_IS_ADDR_SITELOCAL(addr)) {
+		return (LX_IPV6_ADDR_SITELOCAL);
+	} else {
+		return (0x0000U);
+	}
+}
+
+/*
+ * Convert a native link-layer sockaddr_dl into a Linux-style sockaddr: the
+ * family is set to the LX_ARPHRD_* value matching sdl_type (LX_ARPHRD_VOID
+ * for unrecognized types), and as much of the link-layer address as fits in
+ * sa_data is copied. The number of bytes copied is returned through size.
+ */
+void
+lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size)
+{
+	int copy_size = MIN(src->sdl_alen, sizeof (dst->sa_data));
+
+	switch (src->sdl_type) {
+	case DL_ETHER:
+		dst->sa_family = LX_ARPHRD_ETHER;
+		break;
+	case DL_LOOP:
+		dst->sa_family = LX_ARPHRD_LOOPBACK;
+		break;
+	default:
+		dst->sa_family = LX_ARPHRD_VOID;
+	}
+
+	bcopy(LLADDR(src), dst->sa_data, copy_size);
+	*size = copy_size;
+}
+
+/*
+ * Brand hook to convert native kernel 
siginfo signal number, errno, code, pid
+ * and si_status to Linux values. Similar to the stol_ksiginfo function but
+ * this one converts in-place, converts the pid, and does not copyout.
+ *
+ * The pid is only rewritten when it names the current zone's init process or
+ * its zsched process, both of which are reported as pid 1.
+ */
+void
+lx_sigfd_translate(k_siginfo_t *infop)
+{
+	zone_t *zone = curproc->p_zone;
+
+	infop->si_signo = lx_stol_signo(infop->si_signo, LX_SIGKILL);
+	infop->si_status = lx_stol_status(infop->si_status, LX_SIGKILL);
+	infop->si_code = lx_stol_sigcode(infop->si_code);
+	infop->si_errno = lx_errno(infop->si_errno, EINVAL);
+
+	/* Map zsched and zone init to pid 1 */
+	if (infop->si_pid == zone->zone_proc_initpid ||
+	    infop->si_pid == zone->zone_zsched->p_pid) {
+		infop->si_pid = 1;
+	}
+}
+/*
+ * Build a Linux lx_siginfo_t from the native k_siginfo_t sip and copy it out
+ * to the user address ulxsip. Which optional fields are filled in depends on
+ * the translated signal number: band/fd for LX_SIGPOLL, pid/status/times for
+ * LX_SIGCHLD, the fault address for LX_SIGILL/BUS/FPE/SEGV, and pid/uid for
+ * everything else.
+ *
+ * Returns 0 on success, or the result of set_errno(EFAULT) if the copyout
+ * fails.
+ *
+ * NOTE(review): the fallback arguments here are the native SIGCLD and SIGKILL,
+ * whereas lx_sigfd_translate passes LX_SIGKILL to the same helpers; confirm
+ * which numbering those fallbacks are expected to use.
+ */
+int
+stol_ksiginfo_copyout(k_siginfo_t *sip, void *ulxsip)
+{
+	lx_siginfo_t lsi;
+
+	bzero(&lsi, sizeof (lsi));
+	lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
+	lsi.lsi_code = lx_stol_sigcode(sip->si_code);
+	lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
+
+	switch (lsi.lsi_signo) {
+	case LX_SIGPOLL:
+		lsi.lsi_band = sip->si_band;
+		lsi.lsi_fd = sip->si_fd;
+		break;
+
+	case LX_SIGCHLD:
+		lsi.lsi_pid = sip->si_pid;
+		if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
+			lsi.lsi_status = sip->si_status;
+		} else {
+			lsi.lsi_status = lx_stol_status(sip->si_status,
+			    SIGKILL);
+		}
+		lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
+		lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
+		break;
+
+	case LX_SIGILL:
+	case LX_SIGBUS:
+	case LX_SIGFPE:
+	case LX_SIGSEGV:
+		lsi.lsi_addr = sip->si_addr;
+		break;
+
+	default:
+		lsi.lsi_pid = sip->si_pid;
+		lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
+	}
+
+	if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
+		return (set_errno(EFAULT));
+	}
+
+	return (0);
+}
+/*
+ * Counterpart of stol_ksiginfo_copyout that fills in an lx_siginfo32_t; it
+ * is only built when _SYSCALL32_IMPL is defined. The field selection is the
+ * same, except that the fault address is narrowed to a caddr32_t.
+ */
+#if defined(_SYSCALL32_IMPL)
+int
+stol_ksiginfo32_copyout(k_siginfo_t *sip, void *ulxsip)
+{
+	lx_siginfo32_t lsi;
+
+	bzero(&lsi, sizeof (lsi));
+	lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD);
+	lsi.lsi_code = lx_stol_sigcode(sip->si_code);
+	lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL);
+
+	switch (lsi.lsi_signo) {
+	case LX_SIGPOLL:
+		lsi.lsi_band = sip->si_band;
+		lsi.lsi_fd = sip->si_fd;
+		break;
+
+	case LX_SIGCHLD:
+		lsi.lsi_pid = sip->si_pid;
+		if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) {
+			lsi.lsi_status = sip->si_status;
+		} else {
+			lsi.lsi_status = lx_stol_status(sip->si_status,
+			    SIGKILL);
+		}
+		lsi.lsi_utime = HZ_TO_LX_USERHZ(sip->si_utime);
+		lsi.lsi_stime = HZ_TO_LX_USERHZ(sip->si_stime);
+		break;
+
+	case LX_SIGILL:
+	case LX_SIGBUS:
+	case LX_SIGFPE:
+	case LX_SIGSEGV:
+		lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr;
+		break;
+
+	default:
+		lsi.lsi_pid = sip->si_pid;
+		lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid);
+	}
+
+	if (copyout(&lsi, ulxsip, sizeof (lsi)) != 0) {
+		return (set_errno(EFAULT));
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Given an LX LWP, determine where user register state is stored.
+ *
+ * Returns LX_REG_LOC_LWP when the state can be accessed through the LWP
+ * itself, LX_REG_LOC_UCP when it lives in the ucontext_t recorded in
+ * br_ptrace_stopucp (in which case *ucp is pointed at it and ucp must be
+ * non-NULL), and LX_REG_LOC_UNAVAIL otherwise. When for_write is set, only
+ * LX_STACK_MODE_BRAND yields LX_REG_LOC_LWP.
+ */
+lx_regs_location_t
+lx_regs_location(lx_lwp_data_t *lwpd, void **ucp, boolean_t for_write)
+{
+	switch (lwpd->br_stack_mode) {
+	case LX_STACK_MODE_BRAND:
+		/*
+		 * The LWP was stopped with the brand stack and register state
+		 * loaded, e.g. during a syscall emulated within the kernel.
+		 */
+		return (LX_REG_LOC_LWP);
+
+	case LX_STACK_MODE_PREINIT:
+		if (for_write) {
+			/* setting registers not allowed in this state */
+			break;
+		}
+		if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED ||
+		    lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) {
+			/* The LWP was stopped by tracing on exec. */
+			return (LX_REG_LOC_LWP);
+		}
+		break;
+
+	case LX_STACK_MODE_NATIVE:
+		if (for_write) {
+			/* setting registers not allowed in this state */
+			break;
+		}
+		if (lwpd->br_ptrace_whystop == PR_BRAND) {
+			/* Called while ptrace-event-stopped by lx_exec. */
+			if (lwpd->br_ptrace_whatstop == LX_PR_EVENT) {
+				return (LX_REG_LOC_LWP);
+			}
+
+			/* Called while ptrace-event-stopped after clone. */
+			if (lwpd->br_ptrace_whatstop == LX_PR_SIGNALLED &&
+			    lwpd->br_ptrace_stopsig == LX_SIGSTOP &&
+			    (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
+				return (LX_REG_LOC_LWP);
+			}
+
+			/*
+			 * Called to obtain syscall exit for other cases
+			 * (e.g. pseudo return from rt_sigreturn).
+			 */
+			if (lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT &&
+			    (lwpd->br_ptrace_flags & LX_PTF_STOPPED)) {
+				return (LX_REG_LOC_LWP);
+			}
+		}
+		break;
+	default:
+		break;
+	}
+
+	if (lwpd->br_ptrace_stopucp != (uintptr_t)NULL) {
+		/*
+		 * The LWP was stopped in the usermode emulation library
+		 * but a ucontext_t for the preserved brand stack and
+		 * register state was provided. Return the register state
+		 * from that ucontext_t.
+		 */
+		VERIFY(ucp != NULL);
+		*ucp = (void *)lwpd->br_ptrace_stopucp;
+		return (LX_REG_LOC_UCP);
+	}
+
+	return (LX_REG_LOC_UNAVAIL);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_pid.c b/usr/src/uts/common/brand/lx/os/lx_pid.c
new file mode 100644
index 0000000000..fd82a84a22
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_pid.c
@@ -0,0 +1,498 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/var.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/zone.h> +#include <sys/lx_brand.h> + +#define LINUX_PROC_FACTOR 8 /* factor down the hash table by this */ +static int hash_len = 4; /* desired average hash chain length */ +static int hash_size; /* no of buckets in the hash table */ + +static struct lx_pid **stol_pid_hash; +static struct lx_pid **ltos_pid_hash; + +#define LTOS_HASH(pid) ((pid) & (hash_size - 1)) +#define STOL_HASH(pid, tid) (((pid) + (tid)) & (hash_size - 1)) + +static kmutex_t hash_lock; + +static void +lx_pid_insert_hash(struct lx_pid *lpidp) +{ + int shash = STOL_HASH(lpidp->lxp_spid, lpidp->lxp_stid); + int lhash = LTOS_HASH(lpidp->lxp_lpid); + + ASSERT(MUTEX_HELD(&hash_lock)); + + lpidp->lxp_stol_next = stol_pid_hash[shash]; + stol_pid_hash[shash] = lpidp; + + lpidp->lxp_ltos_next = ltos_pid_hash[lhash]; + ltos_pid_hash[lhash] = lpidp; +} + +static struct lx_pid * +lx_pid_remove_hash(pid_t pid, id_t tid) +{ + struct lx_pid **hpp; + struct lx_pid *lpidp = NULL; + + ASSERT(MUTEX_HELD(&hash_lock)); + + hpp = &stol_pid_hash[STOL_HASH(pid, tid)]; + while (*hpp) { + if ((*hpp)->lxp_spid == pid && (*hpp)->lxp_stid == tid) { + lpidp = *hpp; + *hpp = (*hpp)->lxp_stol_next; + break; + } + hpp = &(*hpp)->lxp_stol_next; + } + + /* + * when called during error recovery the pid may already + * be released + */ + if (lpidp == NULL) + return (NULL); + + hpp = <os_pid_hash[LTOS_HASH(lpidp->lxp_lpid)]; + while (*hpp) { + if (*hpp == lpidp) { + *hpp = lpidp->lxp_ltos_next; + break; + } + hpp = &(*hpp)->lxp_ltos_next; + } + + return (lpidp); +} + +/* + * given a solaris pid/tid pair, create a linux pid + */ +void +lx_pid_assign(kthread_t *t, struct lx_pid *lpidp) +{ + proc_t *p = ttoproc(t); + lx_lwp_data_t *lwpd = ttolxlwp(t); + 
pid_t spid = p->p_pid; + id_t stid = t->t_tid; + + /* + * When lx_initlwp is called from lx_setbrand, p_lwpcnt will already be + * equal to 1. Since lx_initlwp is being called against an lwp that + * already exists, an additional pid allocation is not necessary. + * + * We check for this by testing br_ppid == 0. + */ + if (p->p_lwpcnt > 0 && lwpd->br_ppid != 0) { + /* + * Assign allocated pid to any thread other than the first. + * The lpid and pidp fields should be populated. + */ + VERIFY(lpidp->lxp_pidp != NULL); + VERIFY(lpidp->lxp_lpid != 0); + } else { + /* + * There are cases where a pid is speculatively allocated but + * is not needed. We are obligated to free it here. + */ + if (lpidp->lxp_pidp != NULL) { + (void) pid_rele(lpidp->lxp_pidp); + } + lpidp->lxp_pidp = NULL; + lpidp->lxp_lpid = spid; + } + + lpidp->lxp_spid = spid; + lpidp->lxp_stid = stid; + lpidp->lxp_start = t->t_start; + lpidp->lxp_procp = p; + + /* + * Now place the pid into the Linux-SunOS and SunOS-Linux conversion + * hash tables. + */ + mutex_enter(&hash_lock); + lx_pid_insert_hash(lpidp); + mutex_exit(&hash_lock); + + lwpd->br_pid = lpidp->lxp_lpid; +} + +/* + * If we are exec()ing the process, this thread's tid is about to be reset + * to 1. Make sure the Linux PID bookkeeping reflects that change. + */ +void +lx_pid_reassign(kthread_t *t) +{ + proc_t *p = ttoproc(t); + struct pid *old_pidp; + struct lx_pid *lpidp; + + ASSERT(p->p_lwpcnt == 1); + + mutex_enter(&hash_lock); + + /* + * Clean up all the traces of this thread's 'fake' Linux PID. + */ + lpidp = lx_pid_remove_hash(p->p_pid, t->t_tid); + ASSERT(lpidp != NULL); + old_pidp = lpidp->lxp_pidp; + lpidp->lxp_pidp = NULL; + + /* + * Now register this thread as (pid, 1). 
+ */ + lpidp->lxp_lpid = p->p_pid; + lpidp->lxp_spid = p->p_pid; + lpidp->lxp_stid = 1; + lx_pid_insert_hash(lpidp); + + mutex_exit(&hash_lock); + + if (old_pidp) + (void) pid_rele(old_pidp); +} + +/* + * release a solaris pid/tid pair + */ +void +lx_pid_rele(pid_t pid, id_t tid) +{ + struct lx_pid *lpidp; + + mutex_enter(&hash_lock); + lpidp = lx_pid_remove_hash(pid, tid); + mutex_exit(&hash_lock); + + if (lpidp) { + if (lpidp->lxp_pidp) + (void) pid_rele(lpidp->lxp_pidp); + + kmem_free(lpidp, sizeof (*lpidp)); + } +} + +/* + * given a linux pid, return the solaris pid/tid pair + */ +int +lx_lpid_to_spair(pid_t lpid, pid_t *spid, id_t *stid) +{ + struct lx_pid *hp; + + if (lpid == 1) { + pid_t initpid; + + /* + * We are trying to look up the Linux init process for the + * current zone, which we pretend has pid 1. + */ + if ((initpid = curzone->zone_proc_initpid) == -1) { + /* + * We could not find the init process for this zone. + */ + return (-1); + } + + if (spid != NULL) + *spid = initpid; + if (stid != NULL) + *stid = 1; + + return (0); + } + + mutex_enter(&hash_lock); + for (hp = ltos_pid_hash[LTOS_HASH(lpid)]; hp != NULL; + hp = hp->lxp_ltos_next) { + if (hp->lxp_lpid == lpid) { + if (spid) + *spid = hp->lxp_spid; + if (stid) + *stid = hp->lxp_stid; + break; + } + } + mutex_exit(&hash_lock); + if (hp != NULL) + return (0); + + /* + * We didn't find this pid in our translation table. + * But this still could be the pid of a native process + * running in the current zone so check for that here. + * + * Note that prfind() only searches for processes in the current zone. + */ + mutex_enter(&pidlock); + if (prfind(lpid) != NULL) { + mutex_exit(&pidlock); + if (spid) + *spid = lpid; + if (stid) + *stid = 0; + return (0); + } + mutex_exit(&pidlock); + + return (-1); +} + +/* + * Given a Linux pid, locate the proc_t and optionally acquire P_PR_LOCK. + * Returns 0 on success with p_lock held for the proc_t in question. 
+ */ +int +lx_lpid_lock(pid_t lpid, zone_t *zone, lx_pid_flag_t flag, proc_t **pp, + kthread_t **tp) +{ + proc_t *p; + kthread_t *t; + id_t tid = 0; + + ASSERT(MUTEX_NOT_HELD(&pidlock)); + ASSERT(pp != NULL); + ASSERT(zone != NULL && zone->zone_brand == &lx_brand); + +retry: + p = NULL; + if (lpid == 1) { + pid_t initpid; + + /* + * Look up the init process for the zone. + */ + if ((initpid = zone->zone_proc_initpid) <= 0) { + return (-1); + } + mutex_enter(&pidlock); + p = prfind_zone(initpid, zone->zone_id); + tid = 0; + } else { + struct lx_pid *hp; + + mutex_enter(&pidlock); + mutex_enter(&hash_lock); + for (hp = ltos_pid_hash[LTOS_HASH(lpid)]; hp != NULL; + hp = hp->lxp_ltos_next) { + if (hp->lxp_lpid == lpid) { + tid = hp->lxp_stid; + p = hp->lxp_procp; + break; + } + } + mutex_exit(&hash_lock); + /* + * If the pid wasn't listed in the ltos hash, it may correspond + * to an native process in the zone. + */ + if (p == NULL) { + p = prfind_zone(lpid, zone->zone_id); + tid = 0; + } + } + + if (p == NULL) { + mutex_exit(&pidlock); + return (-1); + } + + /* + * Bail on processes belonging to the system, those which are not yet + * complete and zombies (unless explicitly allowed via the flags). + */ + if (p->p_stat == SIDL || (p->p_flag & SSYS) != 0 || + (p->p_stat == SZOMB && (flag & LXP_ZOMBOK) == 0)) { + mutex_exit(&pidlock); + return (-1); + } + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (flag & LXP_PRLOCK) { + /* + * It would be convenient to call sprtrylock_proc() for this + * task. Unfortunately, its behavior of filtering zombies is + * excessive for some lx_proc use cases. Instead, when the + * provided flags do not indicate that zombies are allowed, + * exiting processes are filtered out (as would be performed by + * sprtrylock_proc). 
+ */ + if ((p->p_flag & (SEXITING|SEXITLWPS)) != 0 && + (flag & LXP_ZOMBOK) == 0) { + mutex_exit(&p->p_lock); + return (-1); + } + if (p->p_proc_flag & P_PR_LOCK) { + sprwaitlock_proc(p); + goto retry; + } else { + p->p_proc_flag |= P_PR_LOCK; + } + } + + if (tid == 0) { + t = p->p_tlist; + } else { + lwpdir_t *ld; + + ld = lwp_hash_lookup(p, tid); + if (ld == NULL) { + if (flag & LXP_PRLOCK) { + sprunprlock(p); + } + mutex_exit(&p->p_lock); + return (-1); + } + t = ld->ld_entry->le_thread; + } + *pp = p; + if (tp != NULL) { + *tp = t; + } + return (0); +} + + +/* + * Given an lwp, return the Linux pid of its parent. If the caller + * wants them, we return the SunOS (pid, tid) as well. + */ +pid_t +lx_lwp_ppid(klwp_t *lwp, pid_t *ppidp, id_t *ptidp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + const pid_t zoneinit = p->p_zone->zone_proc_initpid; + const pid_t ppid = p->p_ppid; + + /* + * Report a ppid of 1 for processes which are children to either init + * or a process outside the zone. + */ + if (ppid == zoneinit || (p->p_flag & SZONETOP) != 0) { + goto ppid_is_zinit; + } + + /* + * Our native concept of a 'parent pid' matches Linux in two cases: + * + * - TGID and PID are equal: This is either the first thread in the + * process or one created with CLONE_THREAD. + * + * - The brand lwp value for PPID is 0: This is either the child of a + * differently-branded process or was created with the CLONE_PARENT. + */ + if (p->p_pid == lwpd->br_tgid || lwpd->br_ppid == 0) { + if (ppidp != NULL) + *ppidp = ppid; + if (ptidp != NULL) + *ptidp = -1; + return (ppid); + } + + /* + * In all other cases, we are looking for the parent of this specific + * thread, which in Linux refers to the thread that clone(2)d it. We + * stashed that thread's PID away when this thread was created. 
+ */ + mutex_enter(&hash_lock); + for (struct lx_pid *hp = ltos_pid_hash[LTOS_HASH(lwpd->br_ppid)]; + hp != NULL; hp = hp->lxp_ltos_next) { + if (lwpd->br_ppid == hp->lxp_lpid) { + /* + * The PID matches, but there are a couple cases when + * the translation is not suitable: + * + * - The cached start time is too young, indicating + * that the thread exited and the PID was reused by + * another process. + * - The parent is zoneinit + * + * In both cases, a result of ppid=1 is yielded. + */ + if (hp->lxp_start > lwptot(lwp)->t_start || + lwpd->br_ppid == zoneinit) { + break; + } + + /* Good match, yield the result */ + if (ppidp != NULL) + *ppidp = hp->lxp_spid; + if (ptidp != NULL) + *ptidp = hp->lxp_stid; + mutex_exit(&hash_lock); + return (lwpd->br_ppid); + } + } + mutex_exit(&hash_lock); + /* + * If no match is found in the Linux->SunOS translation hash, fall back + * to assuming the zone init process as the parent. + */ + +ppid_is_zinit: + if (ppidp != NULL) + *ppidp = 1; + if (ptidp != NULL) + *ptidp = -1; + return (1); +} + +void +lx_pid_init(void) +{ + hash_size = 1 << highbit(v.v_proc / (hash_len * LINUX_PROC_FACTOR)); + + stol_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size, + KM_SLEEP); + ltos_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size, + KM_SLEEP); + + mutex_init(&hash_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +lx_pid_fini(void) +{ + kmem_free(stol_pid_hash, sizeof (struct lx_pid *) * hash_size); + kmem_free(ltos_pid_hash, sizeof (struct lx_pid *) * hash_size); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_ptrace.c b/usr/src/uts/common/brand/lx/os/lx_ptrace.c new file mode 100644 index 0000000000..07757ab0aa --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_ptrace.c @@ -0,0 +1,2710 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Emulation of the Linux ptrace(2) interface. + * + * OVERVIEW + * + * The Linux process model is somewhat different from the illumos native + * model. One critical difference is that each Linux thread has a unique + * identifier in the pid namespace. The lx brand assigns a pid to each LWP + * within the emulated process, giving the pid of the process itself to the + * first LWP. + * + * The Linux ptrace(2) interface allows for any LWP in a branded process to + * exert control over any other LWP within the same zone. Control is exerted + * by the use of the ptrace(2) system call itself, which accepts a number of + * request codes. Feedback on traced events is primarily received by the + * tracer through SIGCLD and the emulated waitpid(2) and waitid(2) system + * calls. Many of the possible ptrace(2) requests will only succeed if the + * target LWP is in a "ptrace-stop" condition. + * + * HISTORY + * + * The brand support for ptrace(2) was originally built on top of the rich + * support for debugging and tracing provided through the illumos /proc + * interfaces, mounted at /native/proc within the zone. The native legacy + * ptrace(3C) functionality was used as a starting point, but was generally + * insufficient for complete and precise emulation. The extant legacy + * interface, and indeed our native SIGCLD and waitid(2) facilities, are + * focused on _process_ level concerns -- the Linux interface has been + * extended to be aware of LWPs as well. 
+ * + * In order to allow us to focus on providing more complete and accurate + * emulation without extensive and undesirable changes to the native + * facilities, this second generation ptrace(2) emulation is mostly separate + * from any other tracing or debugging framework in the system. + * + * ATTACHING TRACERS TO TRACEES + * + * There are several ways that a child LWP may become traced by a tracer. + * To determine which attach method caused a tracee to become attached, one + * may inspect the "br_ptrace_attach" member of the LWP-specific brand data + * with the debugger. + * + * The first attach methods to consider are the attaching ptrace(2) requests: + * + * PTRACE_TRACEME + * + * If an LWP makes a PTRACE_TRACEME call, it will be attached as a tracee + * to its parent LWP (br_ppid). Using PTRACE_TRACEME does _not_ cause the + * tracee to be held in a stop condition. It is common practice for + * consumers to raise(SIGSTOP) immediately afterward. + * + * PTRACE_ATTACH + * + * An LWP may attempt to trace any other LWP in this, or another, process. + * We currently allow any attach where the process containing the tracer + * LWP has permission to write to /proc for the process containing the + * intended tracee. This action also sends a SIGSTOP to the newly attached + * tracee. + * + * The second class of attach methods are the clone(2)/fork(2) inheritance + * options that may be set on a tracee with PTRACE_SETOPTIONS: + * + * PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK and PTRACE_O_TRACECLONE + * + * If these options have been set on a tracee, then a fork(2), vfork(2) or + * clone(2) respectively will cause the newly created LWP to be traced by + * the same tracer. The same set of ptrace(2) options will also be set on + * the new child. + * + * The third class of attach method is the PTRACE_CLONE flag to clone(2). + * This flag induces the same inheritance as PTRACE_O_TRACECLONE, but is + * passed by the tracee as an argument to clone(2).
+ * + * DETACHING TRACEES + * + * Tracees can be detached by the tracer with the PTRACE_DETACH request. + * This request is only valid when the tracee is in a ptrace(2) stop + * condition, and is itself a restarting action. + * + * If the tracer exits without detaching all of its tracees, then all of the + * tracees are automatically detached and restarted. If a tracee was in + * "signal-delivery-stop" at the time the tracer exited, the signal will be + * released to the child unless it is a SIGSTOP. We drop this instance of + * SIGSTOP in order to prevent the child from becoming stopped by job + * control. + * + * ACCORD ALLOCATION AND MANAGEMENT + * + * The "lx_ptrace_accord_t" object tracks the agreement between a tracer LWP + * and zero or more tracee LWPs. It is explicitly illegal for a tracee to + * trace its tracer, and we block this in PTRACE_ATTACH/PTRACE_TRACEME. + * + * An LWP starts out without an accord. If a child of that LWP calls + * ptrace(2) with the PTRACE_TRACEME subcommand, or if the LWP itself uses + * PTRACE_ATTACH, an accord will be allocated and stored on that LWP. The + * accord structure is not released from that LWP until it arrives in + * lx_exitlwp(), as called by lwp_exit(). A new accord will not be + * allocated, even if one does not exist, once an LWP arrives in lx_exitlwp() + * and sets the LX_PTF_EXITING flag. An LWP will have at most one accord + * structure throughout its entire lifecycle; once it has one, it has the + * same one until death. + * + * The accord is reference counted (lxpa_refcnt), starting at a count of one + * at creation to represent the link from the tracer LWP to its accord. The + * accord is not freed until the reference count falls to zero. + * + * To make mutual exclusion between a detaching tracer and various notifying + * tracees simpler, the tracer will hold "pidlock" while it clears the + * accord members that point back to the tracer LWP and CV. 
+ * + * SIGNALS AND JOB CONTROL + * + * Various actions, either directly ptrace(2) related or commonly associated + * with tracing, cause process- or thread-directed SIGSTOP signals to be sent + * to tracees (a "signal-delivery-stop"). These signals, and indeed any signal + * other than SIGKILL, can be suppressed by the tracer when using a restarting + * request (including PTRACE_DETACH) on a child. The signal may also be + * replaced with a different signal. + * + * If a SIGSTOP (or other stopping signal) is not suppressed by the tracer, + * it will induce the regular illumos native job control stop of the entire + * traced process. This is at least passingly similar to the Linux "group + * stop" ptrace(2) condition. + * + * SYSTEM CALL TRACING + * + * The ptrace(2) interface enables the tracer to hold the tracee on entry and + * exit from system calls. When a stopped tracee is restarted through the + * PTRACE_SYSCALL request, the LX_PTF_SYSCALL flag is set until the next + * system call boundary. Whether this is a "syscall-entry-stop" or + * "syscall-exit-stop", the tracee is held and the tracer is notified via + * SIGCLD/waitpid(2) in the usual way. The LX_PTF_SYSCALL flag is + * cleared after each stop; for ongoing system call tracing the tracee must + * be continuously restarted with PTRACE_SYSCALL. + * + * SPECIAL CASES FOR STOP EVENTS + * + * The strace command is one of the primary consumers of ptrace. In order for + * strace to properly understand what is actually happening when it receives a + * signal associated with a stop event, these signals must match Linux behavior + * exactly or the strace consumer will get out of sync and report incorrect + * state. There are a couple of special cases we have to handle to provide + * proper interaction of the syscall-entry-stop, syscall-exit-stop, and + * signal-delivery-stop events: + * 1) The child process of a clone/fork does not emit a syscall-exit-stop event.
+ * 2) A signal that arrives between syscall-enter-stop & syscall-exit-stop must + * not immediately emit signal-delivery-stop. This event must be emitted + * after the syscall is interrupted and syscall-exit-stop has been emitted. + * + * EVENT STOPS + * + * Various events (particularly FORK, VFORK, CLONE, EXEC and EXIT) are + * enabled by the tracer through PTRACE_SETOPTIONS. Once enabled, the tracee + * will be stopped at the nominated points of interest and the tracer + * notified. The tracer may request additional information about the event, + * such as the pid of new LWPs and processes, via PTRACE_GETEVENTMSG. + * + * LOCK ORDERING RULES + * + * It is not safe, in general, to hold p_lock for two different processes at + * the same time. This constraint is the primary reason for the existence + * (and complexity) of the ptrace(2) accord mechanism. + * + * In order to facilitate looking up accords by the "pid" of a tracer LWP, + * p_lock for the tracer process may be held while entering the accord mutex + * (lxpa_lock). This mutex protects the accord flags and reference count. + * The reference count is manipulated through lx_ptrace_accord_hold() and + * lx_ptrace_accord_rele(). + * + * DO NOT interact with the accord mutex (lxpa_lock) directly. The + * lx_ptrace_accord_enter() and lx_ptrace_accord_exit() functions do various + * book-keeping and lock ordering enforcement and MUST be used. + * + * It is NOT legal to take ANY p_lock while holding the accord mutex + * (lxpa_lock). If the lxpa_tracees_lock is to be held concurrently with + * lxpa_lock, lxpa_lock MUST be taken first and dropped before taking p_lock + * of any processes from the tracee list. + * + * It is NOT legal to take a tracee p_lock and then attempt to enter the + * accord mutex (or tracee list mutex) of its tracer. When running as the + * tracee LWP, the tracee's hold will prevent the accord from being freed. 
+ * Use of the LX_PTF_STOPPING or LX_PTF_CLONING flag in the LWP-specific brand + * data prevents an exiting tracer from altering the tracee until the tracee + * has come to an orderly stop, without requiring the tracee to hold its own + * p_lock the entire time it is stopping. + * + * It is not safe, in general, to enter "pidlock" while holding the p_lock of + * any process. It is similarly illegal to hold any accord locks (lxpa_lock + * or lxpa_tracees_lock) while attempting to enter "pidlock". As "pidlock" is a + * global mutex, it should be held for the shortest possible time. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/procfs.h> +#include <sys/cmn_err.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/wait.h> +#include <sys/prsystm.h> +#include <sys/note.h> + +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <sys/lx_misc.h> +#include <lx_syscall.h> +#include <lx_signum.h> + + +typedef enum lx_ptrace_cont_flags_t { + LX_PTC_NONE = 0x00, + LX_PTC_SYSCALL = 0x01, + LX_PTC_SINGLESTEP = 0x02 +} lx_ptrace_cont_flags_t; + + +extern int lx_user_regs_copyin(lx_lwp_data_t *, void *); +extern int lx_user_regs_copyout(lx_lwp_data_t *, void *); +extern int lx_ptrace_peekuser(lx_lwp_data_t *, uintptr_t, void *); +extern int lx_ptrace_pokeuser(lx_lwp_data_t *, uintptr_t, void *); +extern int lx_user_fpregs_copyin(lx_lwp_data_t *, void *); +extern int lx_user_fpregs_copyout(lx_lwp_data_t *, void *); +extern int lx_user_fpxregs_copyin(lx_lwp_data_t *, void *); +extern int lx_user_fpxregs_copyout(lx_lwp_data_t *, void *); + +/* + * Macros for checking the state of an LWP via "br_ptrace_flags": + */ +#define LX_PTRACE_BUSY \ + (LX_PTF_EXITING | LX_PTF_STOPPING | LX_PTF_CLONING) + +#define VISIBLE(a) (((a)->br_ptrace_flags & LX_PTF_EXITING) == 0) +#define TRACEE_BUSY(a) (((a)->br_ptrace_flags & LX_PTRACE_BUSY) != 0) + +#define ACCORD_HELD(a)
MUTEX_HELD(&(a)->lxpa_lock) + +#define LX_PID_TO_INIT(x) ((x) == curproc->p_zone->zone_proc_initpid ? \ + 1 : (x)) +#define LX_INIT_TO_PID(x) ((x) == 1 ? \ + curproc->p_zone->zone_proc_initpid : (x)) + +static kcondvar_t lx_ptrace_busy_cv; +static kmem_cache_t *lx_ptrace_accord_cache; + +/* + * Enter the accord mutex. + */ +static void +lx_ptrace_accord_enter(lx_ptrace_accord_t *accord) +{ + VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock)); + + mutex_enter(&accord->lxpa_lock); +} + +/* + * Exit the accord mutex. If the reference count has dropped to zero, + * free the accord. + */ +static void +lx_ptrace_accord_exit(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + if (accord->lxpa_refcnt > 0) { + mutex_exit(&accord->lxpa_lock); + return; + } + + /* + * When the reference count drops to zero we must free the accord. + */ + VERIFY(accord->lxpa_tracer == NULL); + VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock)); + VERIFY(list_is_empty(&accord->lxpa_tracees)); + VERIFY(accord->lxpa_flags & LX_ACC_TOMBSTONE); + + mutex_destroy(&accord->lxpa_lock); + mutex_destroy(&accord->lxpa_tracees_lock); + + kmem_cache_free(lx_ptrace_accord_cache, accord); +} + +/* + * Drop our reference to this accord. If this drops the reference count + * to zero, the next lx_ptrace_accord_exit() will free the accord. + */ +static void +lx_ptrace_accord_rele(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + VERIFY(accord->lxpa_refcnt > 0); + accord->lxpa_refcnt--; +} + +/* + * Place an additional hold on an accord. + */ +static void +lx_ptrace_accord_hold(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + accord->lxpa_refcnt++; +} + +/* + * Fetch the accord for this LWP. If one has not yet been created, and the + * process is not exiting, allocate it now. Must be called with p_lock held + * for the process containing the target LWP. + * + * If successful, we return holding the accord lock (lxpa_lock). 
+ */ +static int +lx_ptrace_accord_get_locked(klwp_t *lwp, lx_ptrace_accord_t **accordp, + boolean_t allocate_one) +{ + lx_ptrace_accord_t *lxpa; + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * If this LWP does not have an accord, we wish to allocate + * and install one. + */ + if ((lxpa = lwpd->br_ptrace_accord) == NULL) { + if (!allocate_one || !VISIBLE(lwpd)) { + /* + * Either we do not wish to allocate an accord, or this + * LWP has already begun exiting from a ptrace + * perspective. + */ + *accordp = NULL; + return (ESRCH); + } + + lxpa = kmem_cache_alloc(lx_ptrace_accord_cache, KM_SLEEP); + bzero(lxpa, sizeof (*lxpa)); + + /* + * The initial reference count is 1 because we are referencing + * it in from the soon-to-be tracer LWP. + */ + lxpa->lxpa_refcnt = 1; + mutex_init(&lxpa->lxpa_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&lxpa->lxpa_tracees_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&lxpa->lxpa_tracees, sizeof (lx_lwp_data_t), + offsetof(lx_lwp_data_t, br_ptrace_linkage)); + lxpa->lxpa_cvp = &p->p_cv; + + lxpa->lxpa_tracer = lwpd; + lwpd->br_ptrace_accord = lxpa; + } + + /* + * Lock the accord before returning it to the caller. + */ + lx_ptrace_accord_enter(lxpa); + + /* + * There should be at least one active reference to this accord, + * otherwise it should have been freed. + */ + VERIFY(lxpa->lxpa_refcnt > 0); + + *accordp = lxpa; + return (0); +} + +/* + * Accords belong to the tracer LWP. Get the accord for this tracer or return + * an error if it was not possible. To prevent deadlocks, the caller MUST NOT + * hold p_lock on its own or any other process. + * + * If successful, we return holding the accord lock (lxpa_lock). 
+ */ +static int +lx_ptrace_accord_get_by_pid(pid_t lxpid, lx_ptrace_accord_t **accordp) +{ + int ret = ESRCH; + proc_t *aproc; + kthread_t *athr; + klwp_t *alwp; + lx_lwp_data_t *alwpd; + + VERIFY(MUTEX_NOT_HELD(&curproc->p_lock)); + + /* + * Locate the process containing the tracer LWP based on its Linux pid + * and lock it. + */ + if (lx_lpid_lock(lxpid, curzone, LXP_PRLOCK, &aproc, &athr) != 0) { + return (ESRCH); + } + + /* + * Locate the tracer LWP itself and ensure that it is visible to + * ptrace(2). + */ + if ((alwp = ttolwp(athr)) == NULL || + (alwpd = lwptolxlwp(alwp)) == NULL || + !VISIBLE(alwpd)) { + sprunlock(aproc); + return (ESRCH); + } + + /* + * We should not fetch our own accord this way. + */ + if (athr == curthread) { + sprunlock(aproc); + return (EPERM); + } + + /* + * Fetch (or allocate) the accord owned by this tracer LWP: + */ + ret = lx_ptrace_accord_get_locked(alwp, accordp, B_TRUE); + + /* + * Unlock the process and return. + */ + sprunlock(aproc); + return (ret); +} + +/* + * Get (or allocate) the ptrace(2) accord for the current LWP, acting as a + * tracer. The caller MUST NOT currently hold p_lock on the process containing + * this LWP. + * + * If successful, we return holding the accord lock (lxpa_lock). + */ +static int +lx_ptrace_accord_get(lx_ptrace_accord_t **accordp, boolean_t allocate_one) +{ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + int ret; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * Lock the tracer (this LWP). + */ + mutex_enter(&p->p_lock); + + /* + * Fetch (or allocate) the accord for this LWP: + */ + ret = lx_ptrace_accord_get_locked(lwp, accordp, allocate_one); + + mutex_exit(&p->p_lock); + + return (ret); +} + +/* + * Restart an LWP if it is in "ptrace-stop". This function may induce sleep, + * so the caller MUST NOT hold any mutexes other than p_lock for the process + * containing the LWP. 
+ */ +static void +lx_ptrace_restart_lwp(klwp_t *lwp) +{ + kthread_t *rt = lwptot(lwp); + proc_t *rproc = lwptoproc(lwp); + lx_lwp_data_t *rlwpd = lwptolxlwp(lwp); + + VERIFY(rt != curthread); + VERIFY(MUTEX_HELD(&rproc->p_lock)); + + /* + * Exclude potential meddling from procfs. + */ + prbarrier(rproc); + + /* + * Check that the LWP is still in "ptrace-stop" and, if so, restart it. + */ + thread_lock(rt); + if (BSTOPPED(rt) && rt->t_whystop == PR_BRAND) { + rt->t_schedflag |= TS_BSTART; + setrun_locked(rt); + + /* + * Clear stop reason. + */ + rlwpd->br_ptrace_whystop = 0; + rlwpd->br_ptrace_whatstop = 0; + rlwpd->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND); + } + thread_unlock(rt); +} + +static void +lx_ptrace_winfo(lx_lwp_data_t *remote, k_siginfo_t *ip, boolean_t waitflag, + pid_t *event_ppid, pid_t *event_pid) +{ + int signo; + + /* + * Populate our k_siginfo_t with data about this "ptrace-stop" + * condition: + */ + bzero(ip, sizeof (*ip)); + ip->si_signo = SIGCLD; + ip->si_pid = LX_PID_TO_INIT(remote->br_pid); + ip->si_code = CLD_TRAPPED; + + switch (remote->br_ptrace_whatstop) { + case LX_PR_SYSENTRY: + case LX_PR_SYSEXIT: + ip->si_status = SIGTRAP; + if (remote->br_ptrace_options & LX_PTRACE_O_TRACESYSGOOD) { + ip->si_status |= 0x80; + } + break; + + case LX_PR_SIGNALLED: + signo = remote->br_ptrace_stopsig; + if (signo < 1 || signo >= LX_NSIG) { + /* + * If this signal number is not valid, pretend it + * was a SIGTRAP. + */ + ip->si_status = SIGTRAP; + } else { + ip->si_status = ltos_signo[signo]; + } + break; + + case LX_PR_EVENT: + ip->si_status = SIGTRAP | remote->br_ptrace_event; + /* + * Record the Linux pid of both this LWP and the create + * event we are dispatching. We will use this information + * to unblock any subsequent ptrace(2) events that depend + * on this one. 
+ */ + if (event_ppid != NULL) + *event_ppid = remote->br_pid; + if (event_pid != NULL) + *event_pid = (pid_t)remote->br_ptrace_eventmsg; + break; + + default: + cmn_err(CE_PANIC, "unxpected stop subreason: %d", + remote->br_ptrace_whatstop); + } + + /* + * If WNOWAIT was specified, do not mark the event as posted + * so that it may be re-fetched on another call to waitid(). + */ + if (waitflag) + remote->br_ptrace_flags &= ~(LX_PTF_CLDPEND | LX_PTF_WAITPEND); +} + +/* + * Receive notification from stop() of a PR_BRAND stop. + */ +void +lx_stop_notify(proc_t *p, klwp_t *lwp, ushort_t why, ushort_t what) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + lx_ptrace_accord_t *accord; + klwp_t *plwp = NULL; + proc_t *pp = NULL; + lx_lwp_data_t *parent; + boolean_t cldpend = B_TRUE; + boolean_t cldpost = B_FALSE; + sigqueue_t *sqp = NULL; + + /* + * We currently only care about LX-specific stop reasons. + */ + if (why != PR_BRAND) + return; + + switch (what) { + case LX_PR_SYSENTRY: + case LX_PR_SYSEXIT: + case LX_PR_SIGNALLED: + case LX_PR_EVENT: + break; + default: + cmn_err(CE_PANIC, "unexpected subreason for PR_BRAND" + " stop: %d", (int)what); + } + + /* + * We should be holding the lock on our containing process. The + * STOPPING flag should have been set by lx_ptrace_stop() for all + * PR_BRAND stops. + */ + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(lwpd->br_ptrace_flags & LX_PTF_STOPPING); + VERIFY((accord = lwpd->br_ptrace_tracer) != NULL); + + /* + * We must drop our process lock to take "pidlock". The + * LX_PTF_STOPPING flag protects us from an exiting or detaching tracer. + */ + mutex_exit(&p->p_lock); + + /* + * Allocate before we enter any mutexes. + */ + sqp = kmem_zalloc(sizeof (*sqp), KM_SLEEP); + + /* + * We take pidlock now, which excludes all callers of waitid() and + * prevents an exiting tracer from clearing critical accord members. 
+ */ + mutex_enter(&pidlock); + mutex_enter(&p->p_lock); + + /* + * Get the ptrace(2) "parent" process, to which we may send + * a SIGCLD signal later. + */ + if ((parent = accord->lxpa_tracer) != NULL && + (plwp = parent->br_lwp) != NULL) { + pp = lwptoproc(plwp); + } + + /* + * Our tracer should not have been modified in our absence; the + * LX_PTF_STOPPING flag prevents it. + */ + VERIFY(lwpd->br_ptrace_tracer == accord); + + /* + * Stash data for this stop condition in the LWP data while we hold + * both pidlock and our p_lock. + */ + lwpd->br_ptrace_whystop = why; + lwpd->br_ptrace_whatstop = what; + lwpd->br_ptrace_flags |= LX_PTF_WAITPEND; + + /* + * If this event does not depend on an event from the parent LWP, + * populate the siginfo_t for the event pending on this tracee LWP. + */ + if (!(lwpd->br_ptrace_flags & LX_PTF_PARENT_WAIT) && pp != NULL) { + cldpost = B_TRUE; + lx_ptrace_winfo(lwpd, &sqp->sq_info, B_FALSE, NULL, NULL); + } + + /* + * Drop our p_lock so that we may lock the tracer. + */ + mutex_exit(&p->p_lock); + if (cldpost && pp != NULL) { + /* + * Post the SIGCLD to the tracer. + */ + mutex_enter(&pp->p_lock); + if (!sigismember(&pp->p_sig, SIGCLD)) { + sigaddqa(pp, plwp->lwp_thread, sqp); + cldpend = B_FALSE; + sqp = NULL; + } + mutex_exit(&pp->p_lock); + } + + /* + * We re-take our process lock now. The lock will be held until + * the thread is actually marked stopped, so we will not race with + * lx_ptrace_lock_if_stopped() or lx_waitid_helper(). + */ + mutex_enter(&p->p_lock); + + /* + * We clear the STOPPING flag; stop() continues to hold our p_lock + * until our thread stop state is visible. + */ + lwpd->br_ptrace_flags &= ~LX_PTF_STOPPING; + lwpd->br_ptrace_flags |= LX_PTF_STOPPED; + if (cldpend) { + /* + * No SIGCLD was queued for this stop by the code above: + * the event is waiting on the parent LWP, there is no + * tracer process, or a SIGCLD was already pending on the + * tracer. Record that in LX_PTF_CLDPEND. + * NOTE(review): presumably the flag lets the SIGCLD be + * posted later; confirm against its consumers.
+ */ + lwpd->br_ptrace_flags |= LX_PTF_CLDPEND; + } + + /* + * If lx_ptrace_exit_tracer(), or a detach operation, is trying to + * detach our tracer, it will be sleeping on this CV until + * LX_PTF_STOPPING is clear. Wake it now. + */ + cv_broadcast(&lx_ptrace_busy_cv); + + /* + * While still holding pidlock, we attempt to wake our tracer from a + * potential waitid() slumber. + */ + if (accord->lxpa_cvp != NULL) { + cv_broadcast(accord->lxpa_cvp); + } + + /* + * We release pidlock and return as we were called: with our p_lock + * held. + */ + mutex_exit(&pidlock); + + if (sqp != NULL) { + kmem_free(sqp, sizeof (*sqp)); + } +} + +/* + * For any restarting action (e.g. PTRACE_CONT, PTRACE_SYSCALL or + * PTRACE_DETACH) to be allowed, the tracee LWP must be in "ptrace-stop". This + * check must ONLY be run on tracees of the current LWP. If the check is + * successful, we return with the tracee p_lock held. + * + * In the case of PTRACE_DETACH, we can return with the tracee locked even if + * it is not in "ptrace-stop". This can happen for various reasons, such as if + * the remote process is already job-stopped in the kernel. We must still be + * able to detach from this process. We return ENOENT in this case. + */ +static int +lx_ptrace_lock_if_stopped(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote, + boolean_t detaching) +{ + klwp_t *rlwp = remote->br_lwp; + proc_t *rproc = lwptoproc(rlwp); + kthread_t *rt = lwptot(rlwp); + + /* + * We must never check that we, ourselves, are stopped. We must also + * have the accord tracee list locked while we lock our tracees. + */ + VERIFY(curthread != rt); + VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock)); + VERIFY(accord->lxpa_tracer == ttolxlwp(curthread)); + + /* + * Lock the process containing the tracee LWP. + */ + mutex_enter(&rproc->p_lock); + if (!VISIBLE(remote)) { + /* + * The tracee LWP is currently detaching itself as it exits. + * It is no longer visible to ptrace(2). 
+ */ + mutex_exit(&rproc->p_lock); + return (ESRCH); + } + + /* + * We must only check whether tracees of the current LWP are stopped. + * We check this condition after confirming visibility as an exiting + * tracee may no longer be completely consistent. + */ + VERIFY(remote->br_ptrace_tracer == accord); + + if (!(remote->br_ptrace_flags & LX_PTF_STOPPED)) { + if (detaching) { + /* + * The tracee is not in "ptrace-stop", but we still + * return with the locked process. This is indicated + * by ENOENT. + */ + return (ENOENT); + } + + /* + * The tracee is not in "ptrace-stop", so we release the + * process. + */ + mutex_exit(&rproc->p_lock); + return (ESRCH); + } + + /* + * The tracee is stopped. We return holding its process lock so that + * the caller may manipulate it. + */ + return (0); +} + +static int +lx_ptrace_setoptions(lx_lwp_data_t *remote, uintptr_t options) +{ + /* + * Check for valid options. + */ + if ((options & ~LX_PTRACE_O_ALL) != 0) { + return (EINVAL); + } + + /* + * Set ptrace options on the target LWP. 
+ */ + remote->br_ptrace_options = (lx_ptrace_options_t)options; + + return (0); +} + +static int +lx_ptrace_geteventmsg(lx_lwp_data_t *remote, void *umsgp) +{ + int error; + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + uint32_t tmp = remote->br_ptrace_eventmsg; + + error = copyout(&tmp, umsgp, sizeof (uint32_t)); + } else +#endif + { + error = copyout(&remote->br_ptrace_eventmsg, umsgp, + sizeof (ulong_t)); + } + + return (error); +} + +static int +lx_ptrace_getsiginfo(lx_lwp_data_t *remote, void *usiginfo) +{ + klwp_t *lwp = remote->br_lwp; + int lx_sig; + + lx_sig = lx_stol_signo(lwp->lwp_cursig, 0); + if (lx_sig < 1 || lwp->lwp_curinfo == NULL) { + return (EINVAL); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + if (stol_ksiginfo32_copyout(&lwp->lwp_curinfo->sq_info, + usiginfo) != 0) { + return (EFAULT); + } + } else +#endif + { + if (stol_ksiginfo_copyout(&lwp->lwp_curinfo->sq_info, + usiginfo) != 0) { + return (EFAULT); + } + } + + return (0); +} + + +/* + * Implements the PTRACE_CONT subcommand of the Linux ptrace(2) interface. + */ +static int +lx_ptrace_cont(lx_lwp_data_t *remote, lx_ptrace_cont_flags_t flags, int signo) +{ + klwp_t *lwp = remote->br_lwp; + + if (flags & LX_PTC_SINGLESTEP) { + /* + * We do not currently support single-stepping. + */ + lx_unsupported("PTRACE_SINGLESTEP not currently implemented"); + return (EINVAL); + } + + /* + * The tracer may choose to suppress the delivery of a signal, or + * select an alternative signal for delivery. If this is an + * appropriate ptrace(2) "signal-delivery-stop", br_ptrace_stopsig + * will be used as the new signal number. + * + * As with so many other aspects of the Linux ptrace(2) interface, this + * may fail silently if the state machine is not aligned correctly. 
+ */ + remote->br_ptrace_stopsig = signo; + remote->br_ptrace_donesig = 0; + + /* + * Handle the syscall-stop flag if this is a PTRACE_SYSCALL restart: + */ + if (flags & LX_PTC_SYSCALL) { + remote->br_ptrace_flags |= LX_PTF_SYSCALL; + } else { + remote->br_ptrace_flags &= ~LX_PTF_SYSCALL; + } + + lx_ptrace_restart_lwp(lwp); + + return (0); +} + +/* + * Implements the PTRACE_DETACH subcommand of the Linux ptrace(2) interface. + * + * The tracee LWP "remote", which must be linked on the tracee list of + * "accord", is detached and (optionally) set runnable. + * + * The caller must hold accord->lxpa_tracees_lock and the p_lock of the + * process containing the tracee; both are asserted below. "signo" is stored + * as the tracee's br_ptrace_stopsig, and "restart" selects whether the LWP + * is restarted via lx_ptrace_restart_lwp(). + */ +static void +lx_ptrace_detach(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote, int signo, + boolean_t restart) +{ + klwp_t *rlwp = remote->br_lwp; + + /* + * The tracee LWP may have been in "ptrace-stop" (restart is true if + * that was the case). We now hold the tracee's p_lock. + * Detach the LWP from the accord and set it running. + */ + VERIFY(!TRACEE_BUSY(remote)); + VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock)); + /* + * NOTE(review): this mask is superseded by the unconditional clear of + * br_ptrace_flags a few lines below. + */ + remote->br_ptrace_flags &= ~(LX_PTF_SYSCALL | LX_PTF_INHERIT); + VERIFY(list_link_active(&remote->br_ptrace_linkage)); + list_remove(&accord->lxpa_tracees, remote); + + remote->br_ptrace_attach = LX_PTA_NONE; + remote->br_ptrace_tracer = NULL; + remote->br_ptrace_flags = 0; + + /* + * Decrement traced-lwp count for the process. + */ + ASSERT(MUTEX_HELD(&rlwp->lwp_procp->p_lock)); + VERIFY(ptolxproc(rlwp->lwp_procp)->l_ptrace-- >= 1); + + /* + * The tracer may, as described in lx_ptrace_cont(), choose to suppress + * or modify the delivered signal. + */ + remote->br_ptrace_stopsig = signo; + remote->br_ptrace_donesig = 0; + + if (restart) { + lx_ptrace_restart_lwp(rlwp); + } +} + +/* + * This routine implements the PTRACE_ATTACH operation of the Linux ptrace(2) + * interface. + * + * This LWP is requesting to be attached as a tracer to another LWP -- the + * tracee.
If a ptrace accord to track the list of tracees has not yet been + * allocated, one will be allocated and attached to this LWP now. + * + * The "br_ptrace_tracer" on the tracee LWP is set to this accord, and the + * tracee LWP is then added to the "lxpa_tracees" list in the accord. We drop + * locks between these two phases; the only consumer of trace events from this + * accord is this LWP, which obviously cannot be running waitpid(2) at the same + * time as this call to ptrace(2). + */ +static int +lx_ptrace_attach(pid_t lx_pid) +{ + int error = ESRCH; + /* + * Our (Tracer) LWP: + */ + lx_ptrace_accord_t *accord; + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + /* + * Remote (Tracee) LWP: + */ + proc_t *rproc; + kthread_t *rthr; + klwp_t *rlwp; + lx_lwp_data_t *rlwpd; + + if (lwpd->br_pid == lx_pid) { + /* + * We cannot trace ourselves. + */ + return (EPERM); + } + + /* + * Ensure that we have an accord and obtain a lock on it. This + * routine should not fail because the LWP cannot make ptrace(2) system + * calls after it has begun exiting. + */ + VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING); + VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0); + + /* + * Place speculative hold in case the attach is successful. + */ + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + + /* + * Locate the process containing the tracee LWP based on its Linux pid + * and lock it. + */ + if (lx_lpid_lock(lx_pid, curzone, LXP_PRLOCK, &rproc, &rthr) != 0) { + /* + * We could not find the target process. + */ + goto errout; + } + + /* + * Locate the tracee LWP. + */ + if ((rlwp = ttolwp(rthr)) == NULL || + (rlwpd = lwptolxlwp(rlwp)) == NULL || + !VISIBLE(rlwpd)) { + /* + * The LWP could not be found, was not branded, or is not + * visible to ptrace(2) at this time. + */ + goto unlock_errout; + } + + /* + * We now hold the lock on the tracee. Attempt to install ourselves + * as the tracer. 
+ */ + if (curproc != rproc && priv_proc_cred_perm(curproc->p_cred, rproc, + NULL, VWRITE) != 0) { + /* + * This process does not have permission to trace the remote + * process. + */ + error = EPERM; + } else if (rlwpd->br_ptrace_tracer != NULL) { + /* + * This LWP is already being traced. + */ + VERIFY(list_link_active(&rlwpd->br_ptrace_linkage)); + VERIFY(rlwpd->br_ptrace_attach != LX_PTA_NONE); + error = EPERM; + } else { + lx_proc_data_t *rprocd = ptolxproc(rproc); + + /* + * Bond the tracee to the accord. + */ + VERIFY0(rlwpd->br_ptrace_flags & LX_PTF_EXITING); + VERIFY(rlwpd->br_ptrace_attach == LX_PTA_NONE); + rlwpd->br_ptrace_attach = LX_PTA_ATTACH; + rlwpd->br_ptrace_tracer = accord; + + /* Don't emit ptrace syscall-stop-exit event on kernel exit. */ + rlwpd->br_ptrace_flags |= LX_PTF_NOSTOP; + + /* + * We had no tracer, and are thus not in the tracees list. + * It is safe to take the tracee list lock while we insert + * ourselves. + */ + mutex_enter(&accord->lxpa_tracees_lock); + VERIFY(!list_link_active(&rlwpd->br_ptrace_linkage)); + list_insert_tail(&accord->lxpa_tracees, rlwpd); + /* + * Bump traced-lwp count for the remote process. + */ + rprocd->l_ptrace++; + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Send a thread-directed SIGSTOP. + */ + sigtoproc(rproc, rthr, SIGSTOP); + + + error = 0; + } + +unlock_errout: + /* + * Unlock the process containing the tracee LWP and the accord. + */ + sprunlock(rproc); + +errout: + if (error != 0) { + /* + * The attach was not successful. Remove our speculative + * hold. 
+ */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + } + + return (error); +} + +int +lx_ptrace_set_clone_inherit(int option, boolean_t inherit_flag) +{ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + switch (option) { + case LX_PTRACE_O_TRACEFORK: + case LX_PTRACE_O_TRACEVFORK: + case LX_PTRACE_O_TRACECLONE: + break; + + default: + return (EINVAL); + } + + mutex_enter(&p->p_lock); + + lwpd->br_ptrace_clone_option = option; + + if (inherit_flag) { + lwpd->br_ptrace_flags |= LX_PTF_INHERIT; + } else { + lwpd->br_ptrace_flags &= ~LX_PTF_INHERIT; + } + + mutex_exit(&p->p_lock); + return (0); +} + +/* + * If the parent LWP is being traced, we want to attach ourselves to the + * same accord. + */ +void +lx_ptrace_inherit_tracer(lx_lwp_data_t *src, lx_lwp_data_t *dst) +{ + proc_t *srcp = lwptoproc(src->br_lwp); + proc_t *dstp = lwptoproc(dst->br_lwp); + lx_ptrace_accord_t *accord; + boolean_t is_fork = B_FALSE; + + VERIFY(MUTEX_HELD(&dstp->p_lock)); + if (srcp != dstp) { + /* + * In the case of being called via forklwp, some lock shuffling + * is required. The destination p_lock must be dropped to + * avoid deadlocks when locking the source and manipulating + * ptrace accord resources. + */ + is_fork = B_TRUE; + sprlock_proc(dstp); + mutex_exit(&dstp->p_lock); + mutex_enter(&srcp->p_lock); + } + + if ((accord = src->br_ptrace_tracer) == NULL) { + /* + * The source LWP does not have a tracer to inherit. + */ + goto out; + } + + /* + * There are two conditions to check when determining if the new + * child should inherit the same tracer (and tracing options) as its + * parent. Either condition is sufficient to trigger inheritance. 
+ */ + dst->br_ptrace_attach = LX_PTA_NONE; + if ((src->br_ptrace_options & src->br_ptrace_clone_option) != 0) { + /* + * Condition 1: + * The clone(2), fork(2) and vfork(2) emulated system calls + * populate "br_ptrace_clone_option" with the specific + * ptrace(2) SETOPTIONS option that applies to this + * operation. If the relevant option has been enabled by the + * tracer then we inherit. + */ + dst->br_ptrace_attach |= LX_PTA_INHERIT_OPTIONS; + + } else if ((src->br_ptrace_flags & LX_PTF_INHERIT) != 0) { + /* + * Condition 2: + * If the caller opted in to inheritance with the + * PTRACE_CLONE flag to clone(2), the LX_PTF_INHERIT flag + * will be set and we inherit. + */ + dst->br_ptrace_attach |= LX_PTA_INHERIT_CLONE; + } + + /* + * These values only apply for the duration of a single clone(2), et + * al, system call. + */ + src->br_ptrace_flags &= ~LX_PTF_INHERIT; + src->br_ptrace_clone_option = 0; + + if (dst->br_ptrace_attach == LX_PTA_NONE) { + /* + * No condition triggered inheritance. + */ + goto out; + } + + /* + * Set the LX_PTF_CLONING flag to prevent us from being detached + * while our p_lock is dropped. + */ + src->br_ptrace_flags |= LX_PTF_CLONING; + mutex_exit(&srcp->p_lock); + + /* + * Hold the accord for the new LWP. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + + /* + * Install the tracer and copy the current PTRACE_SETOPTIONS options. + */ + dst->br_ptrace_tracer = accord; + dst->br_ptrace_options = src->br_ptrace_options; + + /* + * This flag prevents waitid() from seeing events for the new child + * until the parent is able to post the relevant ptrace event to + * the tracer. 
+ */ + dst->br_ptrace_flags |= LX_PTF_PARENT_WAIT; + + mutex_enter(&accord->lxpa_tracees_lock); + VERIFY(list_link_active(&src->br_ptrace_linkage)); + VERIFY(!list_link_active(&dst->br_ptrace_linkage)); + list_insert_tail(&accord->lxpa_tracees, dst); + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Relock our process and clear our busy flag. + */ + mutex_enter(&srcp->p_lock); + src->br_ptrace_flags &= ~LX_PTF_CLONING; + + /* + * Bump traced-lwp count for the process. + */ + ptolxproc(dstp)->l_ptrace++; + + /* + * If lx_ptrace_exit_tracer(), or a detach operation, is trying to + * detach our tracer, it will be sleeping on this CV until + * LX_PTF_CLONING is clear. Wake it now. + */ + cv_broadcast(&lx_ptrace_busy_cv); + +out: + if (is_fork) { + mutex_exit(&srcp->p_lock); + mutex_enter(&dstp->p_lock); + sprunprlock(dstp); + } +} + +static int +lx_ptrace_traceme(void) +{ + int error; + boolean_t did_attach = B_FALSE; + /* + * Our (Tracee) LWP: + */ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + /* + * Remote (Tracer) LWP: + */ + lx_ptrace_accord_t *accord; + + /* + * We are intending to be the tracee. Fetch (or allocate) the accord + * for our parent LWP. + */ + if ((error = lx_ptrace_accord_get_by_pid(lx_lwp_ppid(lwp, NULL, + NULL), &accord)) != 0) { + /* + * Could not determine the Linux pid of the parent LWP, or + * could not get the accord for that LWP. + */ + return (error); + } + + /* + * We now hold the accord lock. + */ + if (accord->lxpa_flags & LX_ACC_TOMBSTONE) { + /* + * The accord is marked for death; give up now. + */ + lx_ptrace_accord_exit(accord); + return (ESRCH); + } + + /* + * Bump the reference count so that the accord is not freed. We need + * to drop the accord lock before we take our own p_lock. + */ + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + + /* + * We now lock _our_ process and determine if we can install our parent + * as our tracer. 
+ */ + mutex_enter(&p->p_lock); + if (lwpd->br_ptrace_tracer != NULL) { + /* + * This LWP is already being traced. + */ + VERIFY(lwpd->br_ptrace_attach != LX_PTA_NONE); + error = EPERM; + } else { + /* + * Bond ourselves to the accord. We already bumped the accord + * reference count. + */ + VERIFY(lwpd->br_ptrace_attach == LX_PTA_NONE); + lwpd->br_ptrace_attach = LX_PTA_TRACEME; + lwpd->br_ptrace_tracer = accord; + did_attach = B_TRUE; + error = 0; + + /* + * Speculatively bump l_ptrace now before dropping p_lock. + * It will be reverted if the tracee attachment fails. + */ + ptolxproc(p)->l_ptrace++; + } + mutex_exit(&p->p_lock); + + /* + * Lock the accord tracee list and add this LWP. Once we are in the + * tracee list, it is the responsibility of the tracer to detach us. + */ + if (error == 0) { + lx_ptrace_accord_enter(accord); + mutex_enter(&accord->lxpa_tracees_lock); + + if (!(accord->lxpa_flags & LX_ACC_TOMBSTONE)) { + /* + * Put ourselves in the tracee list for this accord. + */ + VERIFY(!list_link_active(&lwpd->br_ptrace_linkage)); + list_insert_tail(&accord->lxpa_tracees, lwpd); + mutex_exit(&accord->lxpa_tracees_lock); + lx_ptrace_accord_exit(accord); + + return (0); + } + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * The accord has been marked for death. We must + * untrace ourselves. + */ + error = ESRCH; + lx_ptrace_accord_exit(accord); + + /* + * Undo speculative increment of ptracer count. + */ + mutex_enter(&p->p_lock); + ptolxproc(p)->l_ptrace--; + mutex_exit(&p->p_lock); + } + + /* + * Our optimism was unjustified: We were unable to attach. We need to + * lock the process containing this LWP again in order to remove the + * tracer. 
+ */ + VERIFY(error != 0); + mutex_enter(&p->p_lock); + if (did_attach) { + /* + * Verify that things were as we left them: + */ + VERIFY(!list_link_active(&lwpd->br_ptrace_linkage)); + VERIFY(lwpd->br_ptrace_tracer == accord); + + lwpd->br_ptrace_attach = LX_PTA_NONE; + lwpd->br_ptrace_tracer = NULL; + } + mutex_exit(&p->p_lock); + + /* + * Remove our speculative hold on the accord, possibly causing it to be + * freed in the process. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + + return (error); +} + +static boolean_t +lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what) +{ + boolean_t reset_nostop = B_FALSE; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Mark this LWP as stopping and call stop() to enter "ptrace-stop". + */ + VERIFY0(lwpd->br_ptrace_flags & LX_PTF_STOPPING); + lwpd->br_ptrace_flags |= LX_PTF_STOPPING; + + if (lwpd->br_lwp->lwp_nostop == 1 && + lwpd->br_ptrace_event == LX_PTRACE_EVENT_EXEC) { + /* We need to clear this to get the signal delivered. */ + lwpd->br_lwp->lwp_nostop = 0; + reset_nostop = B_TRUE; + } + + stop(PR_BRAND, what); + + if (reset_nostop) { + VERIFY(lwpd->br_lwp->lwp_nostop == 0); + lwpd->br_lwp->lwp_nostop = 1; + } + + /* + * We are back from "ptrace-stop" with our process lock held. 
+ */ + lwpd->br_ptrace_flags &= ~(LX_PTF_STOPPING | LX_PTF_STOPPED | + LX_PTF_CLDPEND); + lwpd->br_ptrace_stopucp = (uintptr_t)NULL; + cv_broadcast(&lx_ptrace_busy_cv); + mutex_exit(&p->p_lock); + + return (B_TRUE); +} + +int +lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg, + uintptr_t ucp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + mutex_enter(&p->p_lock); + if (lwpd->br_ptrace_tracer == NULL) { + mutex_exit(&p->p_lock); + return (ESRCH); + } + + if (!child) { + /* + * Only the first event posted by a new process is to be held + * until the matching parent event is dispatched, and only if + * it is a "child" event. This is not a child event, so we + * clear the wait flag. + */ + lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT; + + } else if (option == LX_PTRACE_O_TRACEVFORK) { + /* + * For a child, we have to handle vfork as a special case. In + * lx_ptrace_inherit_tracer() we set LX_PTF_PARENT_WAIT to + * force events to be delayed until the parent posts its event. + * This flag is cleared in lx_waitid_helper() to enforce a + * "happens after" relationship. However, this obviously cannot + * work for the vfork case. Thus, we clear our flag now so that + * we can deliver the signal in lx_stop_notify(), if necessary. + */ + lwpd->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT; + } + + if (!(lwpd->br_ptrace_options & option)) { + if (option == LX_PTRACE_O_TRACEEXEC) { + /* + * Without PTRACE_O_TRACEEXEC, the Linux kernel will + * send SIGTRAP to the process. + */ + sigtoproc(p, t, SIGTRAP); + mutex_exit(&p->p_lock); + return (0); + } + + /* + * The flag for this trace event is not enabled, so we will not + * stop. + */ + mutex_exit(&p->p_lock); + return (ESRCH); + } + + if (child) { + switch (option) { + case LX_PTRACE_O_TRACECLONE: + case LX_PTRACE_O_TRACEFORK: + case LX_PTRACE_O_TRACEVFORK: + /* + * Send the child LWP a directed SIGSTOP. 
+ */ + sigtoproc(p, t, SIGSTOP); + mutex_exit(&p->p_lock); + return (0); + default: + goto nostop; + } + } + + lwpd->br_ptrace_eventmsg = msg; + + switch (option) { + case LX_PTRACE_O_TRACECLONE: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_CLONE; + break; + case LX_PTRACE_O_TRACEEXEC: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXEC; + lwpd->br_ptrace_eventmsg = 0; + break; + case LX_PTRACE_O_TRACEEXIT: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXIT; + break; + case LX_PTRACE_O_TRACEFORK: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_FORK; + break; + case LX_PTRACE_O_TRACEVFORK: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK; + break; + case LX_PTRACE_O_TRACEVFORKDONE: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK_DONE; + lwpd->br_ptrace_eventmsg = 0; + break; + default: + goto nostop; + } + + /* + * Userland may have passed in a ucontext_t pointer for + * PTRACE_GETREGS/PTRACE_SETREGS usage while stopped. + */ + lwpd->br_ptrace_stopucp = ucp; + + /* + * p_lock for the process containing the tracee will be dropped by + * lx_ptrace_stop_common(). + */ + return (lx_ptrace_stop_common(p, lwpd, LX_PR_EVENT) ? 0 : ESRCH); + +nostop: + lwpd->br_ptrace_event = 0; + lwpd->br_ptrace_eventmsg = 0; + mutex_exit(&p->p_lock); + return (ESRCH); +} + +boolean_t +lx_ptrace_stop(ushort_t what) +{ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + VERIFY(what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT || + what == LX_PR_SIGNALLED); + + /* + * If we do not have an accord, bail out early. + */ + if (lwpd->br_ptrace_tracer == NULL) + return (B_FALSE); + + /* + * Lock this process and re-check the condition. + */ + mutex_enter(&p->p_lock); + + /* + * The child after a fork/clone doesn't emit syscall-exit-stop event. 
+ */ + if (what == LX_PR_SYSEXIT && (lwpd->br_ptrace_flags & LX_PTF_NOSTOP)) { + lwpd->br_ptrace_flags &= ~LX_PTF_NOSTOP; + mutex_exit(&p->p_lock); + return (B_FALSE); + } + + if (lwpd->br_ptrace_tracer == NULL) { + VERIFY0(lwpd->br_ptrace_flags & LX_PTF_SYSCALL); + mutex_exit(&p->p_lock); + return (B_FALSE); + } + + if (what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT) { + if (what == LX_PR_SYSENTRY) { + lwpd->br_ptrace_flags |= LX_PTF_INSYSCALL; + } else { + lwpd->br_ptrace_flags &= ~LX_PTF_INSYSCALL; + } + + /* + * This is a syscall-entry-stop or syscall-exit-stop point. + */ + if (!(lwpd->br_ptrace_flags & LX_PTF_SYSCALL)) { + /* + * A system call stop has not been requested. + */ + mutex_exit(&p->p_lock); + return (B_FALSE); + } + + /* + * The PTRACE_SYSCALL restart command applies only to the next + * system call entry or exit. The tracer must restart us with + * PTRACE_SYSCALL while we are in ptrace-stop for us to fire + * again at the next system call boundary. + */ + lwpd->br_ptrace_flags &= ~LX_PTF_SYSCALL; + } + + /* + * p_lock for the process containing the tracee will be dropped by + * lx_ptrace_stop_common(). + */ + return (lx_ptrace_stop_common(p, lwpd, what)); +} + +/* + * In addition to performing the ptrace sig_stop handling, this function is + * also used to block signals from being delivered. + * + * Return 0 if issig_forreal() should continue on, -1 if issig_forreal() should + * recheck after we've made changes, or 1 if issig_forreal() should stop checking + * signals. + */ +int +lx_ptrace_issig_stop(proc_t *p, klwp_t *lwp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + int lx_sig; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + if (ptolxproc(p)->l_block_all_signals != 0) + return (1); + + /* + * In very rare circumstances, a process which is almost completely + * through proc_exit() may incur issig checks in the current thread via + * clean-up actions.
The process will still be branded, but the thread + * will have already been stripped of any LX-specific data on its way + * to the grave. Bail early if the brand data is missing. + */ + if (lwpd == NULL) { + return (0); + } + + /* + * If we do not have an accord, bail out now. Additionally, if there + * is no valid signal then we have no reason to stop. + */ + if (lwpd->br_ptrace_tracer == NULL || lwp->lwp_cursig == SIGKILL || + (lwp->lwp_cursig == 0 || lwp->lwp_cursig > NSIG) || + (lx_sig = stol_signo[lwp->lwp_cursig]) < 1) { + if (lwp->lwp_cursig == 0) { + /* + * If this lwp has no current signal, it means that any + * signal ignorance enabled by br_ptrace_donesig has + * already taken place (the signal was consumed). + * By clearing donesig, we declare desire to ignore no + * signals for accurate ptracing. + */ + lwpd->br_ptrace_donesig = 0; + } + return (0); + } + + /* + * We can't deliver the signal-delivery-stop condition while we're + * between the syscall-enter-stop and syscall-exit-stop conditions. + * We must first let the signal interrupt the in-progress syscall, let + * it emit syscall-exit-stop with the interrupted result, then we'll + * come back here to emit signal-delivery-stop. + */ + if (lwpd->br_ptrace_flags & LX_PTF_INSYSCALL) { + return (0); + } + + /* + * We stash the signal on the LWP where our waitid_helper will find it + * and enter the ptrace "signal-delivery-stop" condition. + */ + lwpd->br_ptrace_stopsig = lx_sig; + lwpd->br_ptrace_donesig = 0; + (void) lx_ptrace_stop_common(p, lwpd, LX_PR_SIGNALLED); + mutex_enter(&p->p_lock); + + /* + * When we return, the signal may have been altered or suppressed. + */ + if (lwpd->br_ptrace_stopsig != lx_sig) { + int native_sig; + lx_sig = lwpd->br_ptrace_stopsig; + + if (lx_sig >= LX_NSIG) { + lx_sig = 0; + } + + /* + * Translate signal from Linux signal number back to + * an illumos native signal. 
+ */ + if (lx_sig >= LX_NSIG || lx_sig < 0 || (native_sig = + ltos_signo[lx_sig]) < 1) { + /* + * The signal is not deliverable. + */ + lwp->lwp_cursig = 0; + lwp->lwp_extsig = 0; + if (lwp->lwp_curinfo) { + siginfofree(lwp->lwp_curinfo); + lwp->lwp_curinfo = NULL; + } + } else { + /* + * Alter the currently dispatching signal. + */ + if (native_sig == SIGKILL) { + /* + * We mark ourselves the victim and request + * a restart of signal processing. + */ + p->p_flag |= SKILLED; + p->p_flag &= ~SEXTKILLED; + return (-1); + } + lwp->lwp_cursig = native_sig; + lwp->lwp_extsig = 0; + if (lwp->lwp_curinfo != NULL) { + lwp->lwp_curinfo->sq_info.si_signo = native_sig; + } + } + } + + lwpd->br_ptrace_donesig = lwp->lwp_cursig; + lwpd->br_ptrace_stopsig = 0; + return (0); +} + +boolean_t +lx_ptrace_sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + lx_proc_data_t *lxpd = ptolxproc(p); + + /* + * Ignored signals and ptrace: + * + * When a process is being ptraced by another, special care is needed + * while handling signals. Since the tracer is interested in all + * signals sent to the tracee, an effort must be made to initially + * bypass signal ignorance logic. This allows the signal to be placed + * in the tracee's sigqueue to be inspected and potentially altered by + * the tracer. + * + * A critical detail in this procedure is how a signal is handled after + * the tracer has completed processing for the event. If the signal would + * have been ignored, were it not for the initial ptrace override, then + * lx_ptrace_sig_ignorable() must report B_TRUE when the tracee is + * restarted and resumes signal processing. This is done by recording + * the most recent tracee signal consumed by ptrace in the LWP's + * br_ptrace_donesig field, which is compared against "sig" below. + */ + + if (lxpd->l_ptrace != 0 && lx_stol_signo(sig, 0) != 0) { + /* + * This process is being ptraced. Bypass signal ignorance for + * anything that maps to a valid Linux signal...
+ */ + if (lwp != NULL && lwptolxlwp(lwp)->br_ptrace_donesig == sig) { + /* + * ...Unless it is a signal which has already been + * processed by the tracer. + */ + return (B_TRUE); + } + return (B_FALSE); + } + return (B_TRUE); +} + +static void +lx_ptrace_exit_tracer(proc_t *p, lx_lwp_data_t *lwpd, + lx_ptrace_accord_t *accord) +{ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + lx_ptrace_accord_enter(accord); + /* + * Mark this accord for death. This means no new tracees can be + * attached to this accord. + */ + VERIFY0(accord->lxpa_flags & LX_ACC_TOMBSTONE); + accord->lxpa_flags |= LX_ACC_TOMBSTONE; + lx_ptrace_accord_exit(accord); + + /* + * Walk the list of tracees, detaching them and setting them runnable + * if they are stopped. + */ + for (;;) { + klwp_t *rlwp; + proc_t *rproc; + lx_lwp_data_t *remote; + kmutex_t *rmp; + + mutex_enter(&accord->lxpa_tracees_lock); + if (list_is_empty(&accord->lxpa_tracees)) { + mutex_exit(&accord->lxpa_tracees_lock); + break; + } + + /* + * Fetch the first tracee LWP in the list and lock the process + * which contains it. + */ + remote = list_head(&accord->lxpa_tracees); + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + /* + * The p_lock mutex persists beyond the life of the process + * itself. We save the address, here, to prevent the need to + * dereference the proc_t after awaking from sleep. + */ + rmp = &rproc->p_lock; + mutex_enter(rmp); + + if (TRACEE_BUSY(remote)) { + /* + * This LWP is currently detaching itself on exit, or + * mid-way through stop(). We must wait for this + * action to be completed. While we wait on the CV, we + * must drop the accord tracee list lock. + */ + mutex_exit(&accord->lxpa_tracees_lock); + cv_wait(&lx_ptrace_busy_cv, rmp); + + /* + * While we were waiting, some state may have changed. + * Restart the walk to be sure we don't miss anything. + */ + mutex_exit(rmp); + continue; + } + + /* + * We now hold p_lock on the process. Remove the tracee from + * the list. 
+ */ + VERIFY(list_link_active(&remote->br_ptrace_linkage)); + list_remove(&accord->lxpa_tracees, remote); + + /* + * Unlink the accord and clear our trace flags. + */ + remote->br_ptrace_attach = LX_PTA_NONE; + remote->br_ptrace_tracer = NULL; + remote->br_ptrace_flags = 0; + + /* + * Let go of the list lock before we restart the LWP. We must + * not hold any locks other than the process p_lock when + * we call lx_ptrace_restart_lwp() as it will thread_lock + * the tracee. + */ + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Decrement traced-lwp count for the remote process. + * The post-decrement happens inside the VERIFY() expression, so + * it relies on VERIFY() (unlike ASSERT()) being evaluated in + * non-DEBUG builds as well. + */ + VERIFY(ptolxproc(rproc)->l_ptrace-- >= 1); + + /* + * Ensure that the LWP is not stopped on our account. + */ + lx_ptrace_restart_lwp(rlwp); + + /* + * Unlock the former tracee. + */ + mutex_exit(rmp); + + /* + * Drop the hold this tracee had on the accord. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + } + + mutex_enter(&p->p_lock); + lwpd->br_ptrace_accord = NULL; + mutex_exit(&p->p_lock); + + /* + * Clean up and release our hold on the accord. If we completely + * detached all tracee LWPs, this will free the accord. Otherwise, it + * will be freed when they complete their cleanup. + * + * We hold "pidlock" while clearing these members for easy exclusion of + * waitid(), etc. + */ + mutex_enter(&pidlock); + lx_ptrace_accord_enter(accord); + accord->lxpa_cvp = NULL; + accord->lxpa_tracer = NULL; + mutex_exit(&pidlock); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); +} + +static void +lx_ptrace_exit_tracee(proc_t *p, lx_lwp_data_t *lwpd) +{ + lx_ptrace_accord_t *accord; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Be careful in the face of detaching and attaching tracers. + * lwpd->br_ptrace_tracer is modified only when p->p_lock is held.
Lock + * ordering says that accord->lxpa_tracees_lock must be taken prior to + * p->p_lock, so we have to get a reference to the accord and hold it + * across dropping p->p_lock. + * + * In the face of a tracer going away and a new one coming in, we may + * take a lap. + */ +again: + if ((accord = lwpd->br_ptrace_tracer) == NULL) { + return; + } + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + mutex_exit(&p->p_lock); + + /* + * We are the tracee LWP. Lock the accord tracee list and then our + * containing process. + */ + mutex_enter(&accord->lxpa_tracees_lock); + mutex_enter(&p->p_lock); + + /* + * Be sure that the accord currently associated with the lwp is the one + * for which we are holding lxpa_tracees_lock. + */ + if (lwpd->br_ptrace_tracer != accord) { + mutex_exit(&p->p_lock); + mutex_exit(&accord->lxpa_tracees_lock); + + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + + mutex_enter(&p->p_lock); + + goto again; + } + + /* + * Remove our reference to the accord. We will release our hold + * later. + */ + lwpd->br_ptrace_attach = LX_PTA_NONE; + lwpd->br_ptrace_tracer = NULL; + + /* + * Remove this LWP from the accord tracee list: + */ + VERIFY(list_link_active(&lwpd->br_ptrace_linkage)); + list_remove(&accord->lxpa_tracees, lwpd); + + /* + * Wake up any tracers waiting for us to detach from the accord. + */ + cv_broadcast(&lx_ptrace_busy_cv); + + /* + * Decrement traced-lwp count for the process. + */ + VERIFY(ptolxproc(p)->l_ptrace-- >= 1); + + mutex_exit(&p->p_lock); + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Grab "pidlock" and wake the tracer if it is blocked in waitid(). + */ + mutex_enter(&pidlock); + if (accord->lxpa_cvp != NULL) { + cv_broadcast(accord->lxpa_cvp); + } + mutex_exit(&pidlock); + + /* + * Release the holds on the accord. One is the hold taken earlier in + * this function and the other is lwpd's hold. 
+ */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + + mutex_enter(&p->p_lock); +} + +/* + * This routine is called from lx_exitlwp() when an LWP is ready to exit. If + * this LWP is being traced, it will be detached from the tracer's accord. The + * routine will also detach any LWPs being traced by this LWP. + */ +void +lx_ptrace_exit(proc_t *p, klwp_t *lwp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + lx_ptrace_accord_t *accord; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Mark our LWP as exiting from a ptrace perspective. This will + * prevent a new accord from being allocated if one does not exist + * already, and will make us invisible to PTRACE_ATTACH/PTRACE_TRACEME. + */ + VERIFY0(lwpd->br_ptrace_flags & LX_PTF_EXITING); + lwpd->br_ptrace_flags |= LX_PTF_EXITING; + + if (lwpd->br_ptrace_tracer != NULL) { + /* + * We are traced by another LWP and must detach ourselves. + */ + lx_ptrace_exit_tracee(p, lwpd); + VERIFY(MUTEX_HELD(&p->p_lock)); + } + + if ((accord = lwpd->br_ptrace_accord) != NULL) { + /* + * We have been tracing other LWPs, and must detach from + * them and clean up our accord. + */ + mutex_exit(&p->p_lock); + lx_ptrace_exit_tracer(p, lwpd, accord); + mutex_enter(&p->p_lock); + } +} + +/* + * Called when a SIGCLD signal is dispatched so that we may enqueue another. + * Return 0 if we enqueued a signal, or -1 if not. + */ +int +lx_sigcld_repost(proc_t *pp, sigqueue_t *sqp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + lx_ptrace_accord_t *accord; + lx_lwp_data_t *remote; + klwp_t *rlwp; + proc_t *rproc; + boolean_t found = B_FALSE; + + VERIFY(MUTEX_HELD(&pidlock)); + VERIFY(MUTEX_NOT_HELD(&pp->p_lock)); + VERIFY(lwptoproc(lwp) == pp); + + mutex_enter(&pp->p_lock); + if ((accord = lwpd->br_ptrace_accord) == NULL) { + /* + * This LWP is not a tracer LWP, so there will be no + * SIGCLD. 
+ */ + mutex_exit(&pp->p_lock); + return (-1); + } + mutex_exit(&pp->p_lock); + + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + /* + * Check if this LWP is in "ptrace-stop". If in the correct + * stop condition, lock the process containing the tracee LWP. + */ + if (lx_ptrace_lock_if_stopped(accord, remote, B_FALSE) != 0) { + continue; + } + + if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) { + /* + * This event depends on waitid() clearing out the + * event of another LWP. Skip it for now. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + if (!(remote->br_ptrace_flags & LX_PTF_CLDPEND)) { + /* + * No SIGCLD is required for this LWP. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) || + remote->br_ptrace_whystop == 0 || + remote->br_ptrace_whatstop == 0) { + /* + * No (new) stop reason to post for this LWP. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + /* + * We found a process of interest. Leave the process + * containing the tracee LWP locked and break out of the loop. + */ + found = B_TRUE; + break; + } + mutex_exit(&accord->lxpa_tracees_lock); + + if (!found) { + return (-1); + } + + /* + * Generate siginfo for this tracee LWP. + */ + lx_ptrace_winfo(remote, &sqp->sq_info, B_FALSE, NULL, NULL); + remote->br_ptrace_flags &= ~LX_PTF_CLDPEND; + mutex_exit(&rproc->p_lock); + + mutex_enter(&pp->p_lock); + if (sigismember(&pp->p_sig, SIGCLD)) { + mutex_exit(&pp->p_lock); + + mutex_enter(&rproc->p_lock); + remote->br_ptrace_flags |= LX_PTF_CLDPEND; + mutex_exit(&rproc->p_lock); + + return (-1); + } + sigaddqa(pp, curthread, sqp); + mutex_exit(&pp->p_lock); + + return (0); +} + +/* + * Consume the next available ptrace(2) event queued against the accord for + * this LWP. 
The event will be emitted as if through waitid(), and converted + * by lx_waitpid() and friends before the return to usermode. + */ +int +lx_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip, int options, + boolean_t *brand_wants_wait, int *rval) +{ + lx_ptrace_accord_t *accord; + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *local = lwptolxlwp(lwp); + lx_lwp_data_t *remote; + boolean_t found = B_FALSE; + klwp_t *rlwp = NULL; + proc_t *rproc = NULL; + pid_t event_pid = 0, event_ppid = 0; + boolean_t waitflag = !(options & WNOWAIT); + boolean_t target_found = B_FALSE; + + VERIFY(MUTEX_HELD(&pidlock)); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * By default, we do not expect waitid() to block on our account. + */ + *brand_wants_wait = B_FALSE; + + if (!local->br_waitid_emulate) { + /* + * This waitid() call is not expecting emulated results. + */ + return (-1); + } + + switch (idtype) { + case P_ALL: + case P_PID: + case P_PGID: + break; + default: + /* + * This idtype has no power here. + */ + return (-1); + } + + if (lx_ptrace_accord_get(&accord, B_FALSE) != 0) { + /* + * This LWP does not have an accord; it cannot be tracing. + */ + return (-1); + } + + /* + * We do not need an additional hold on the accord as it belongs to + * the running, tracer, LWP. + */ + lx_ptrace_accord_exit(accord); + + mutex_enter(&accord->lxpa_tracees_lock); + if (list_is_empty(&accord->lxpa_tracees)) { + /* + * Though it has an accord, there are currently no tracees in + * the list for this LWP. + */ + mutex_exit(&accord->lxpa_tracees_lock); + return (-1); + } + + /* + * Walk the list of tracees and determine if any of them have events to + * report. + */ + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + /* + * We check to see if this LWP matches an id we are waiting for. 
+ */ + switch (idtype) { + case P_ALL: + break; + case P_PID: + if (remote->br_pid != id) + continue; + break; + case P_PGID: + if (rproc->p_pgrp != id) + continue; + break; + default: + cmn_err(CE_PANIC, "unexpected idtype: %d", idtype); + } + + /* This tracee matches the provided idtype and id. */ + target_found = B_TRUE; + + /* + * Check if this LWP is in "ptrace-stop". If in the correct + * stop condition, lock the process containing the tracee LWP. + */ + if (lx_ptrace_lock_if_stopped(accord, remote, B_FALSE) != 0) { + continue; + } + + if (remote->br_ptrace_flags & LX_PTF_PARENT_WAIT) { + /* + * This event depends on waitid() clearing out the + * event of another LWP. Skip it for now. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + if (!(remote->br_ptrace_flags & LX_PTF_WAITPEND) || + remote->br_ptrace_whystop == 0 || + remote->br_ptrace_whatstop == 0) { + /* + * No (new) stop reason to post for this LWP. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + /* + * We found a process of interest. Leave the process + * containing the tracee LWP locked and break out of the loop. + */ + found = B_TRUE; + break; + } + mutex_exit(&accord->lxpa_tracees_lock); + + if (!found) { + /* + * There were no events of interest, but we have tracees. + * If any of the tracees matched the specified criteria, signal + * to waitid() that it should block if the provided flags allow + * for it. + */ + if (target_found) { + *brand_wants_wait = B_TRUE; + } + + return (-1); + } + + /* + * Populate the signal information. + */ + lx_ptrace_winfo(remote, ip, waitflag, &event_ppid, &event_pid); + + /* + * Unlock the tracee. + */ + mutex_exit(&rproc->p_lock); + + if (event_pid != 0 && event_ppid != 0) { + /* + * We need to do another pass around the tracee list and + * unblock any events that have a "happens after" relationship + * with this event.
+ */ + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + mutex_enter(&rproc->p_lock); + + if (remote->br_pid != event_pid || + remote->br_ppid != event_ppid) { + mutex_exit(&rproc->p_lock); + continue; + } + + remote->br_ptrace_flags &= ~LX_PTF_PARENT_WAIT; + + mutex_exit(&rproc->p_lock); + } + mutex_exit(&accord->lxpa_tracees_lock); + } + + /* + * If we are consuming this wait state, we remove the SIGCLD from + * the queue and post another. + */ + if (waitflag) { + mutex_exit(&pidlock); + sigcld_delete(ip); + sigcld_repost(); + mutex_enter(&pidlock); + } + + *rval = 0; + return (0); +} + +static int +lx_ptrace_peek(lx_lwp_data_t *lwpd, uintptr_t addr, void *data) +{ + proc_t *p = lwptoproc(lwpd->br_lwp); + long buf; + int error = 0, size = sizeof (buf); + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + size = sizeof (uint32_t); + } +#endif + if ((addr & (size - 1)) != 0) { + /* unaligned access */ + return (EINVAL); + } + + mutex_exit(&p->p_lock); + error = uread(p, &buf, size, addr); + mutex_enter(&p->p_lock); + + if (error != 0) { + return (EIO); + } + if (copyout(&buf, data, size) != 0) { + return (EFAULT); + } + + return (0); +} + +static int +lx_ptrace_poke(lx_lwp_data_t *lwpd, uintptr_t addr, uintptr_t data) +{ + proc_t *p = lwptoproc(lwpd->br_lwp); + int error = 0, size = sizeof (data); + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + size = sizeof (uint32_t); + } +#endif + if ((addr & (size - 1)) != 0) { + /* unaligned access */ + return (EINVAL); + } + + mutex_exit(&p->p_lock); + error = uwrite(p, &data, size, addr); + mutex_enter(&p->p_lock); + + if (error != 0) { + return (EIO); + } + return (0); +} + +static int +lx_ptrace_kill(lx_lwp_data_t *lwpd) +{ + sigtoproc(lwptoproc(lwpd->br_lwp), NULL, SIGKILL); + + return (0); +} + +static int 
+lx_ptrace_kernel(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data) +{ + lx_lwp_data_t *local = ttolxlwp(curthread); + lx_ptrace_accord_t *accord; + lx_lwp_data_t *remote; + klwp_t *rlwp; + proc_t *rproc; + int error; + boolean_t found = B_FALSE, restart = B_TRUE; + + /* + * PTRACE_TRACEME and PTRACE_ATTACH operations induce the tracing of + * one LWP by another. The target LWP must not be traced already. + */ + switch (ptrace_op) { + case LX_PTRACE_TRACEME: + return (lx_ptrace_traceme()); + + case LX_PTRACE_ATTACH: + return (lx_ptrace_attach(lxpid)); + } + + /* + * Ensure that we have an accord and obtain a lock on it. This routine + * should not fail because the LWP cannot make ptrace(2) system calls + * after it has begun exiting. + */ + VERIFY0(local->br_ptrace_flags & LX_PTF_EXITING); + VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0); + + /* + * The accord belongs to this (the tracer) LWP, and we have a hold on + * it. We drop the lock so that we can take other locks. + */ + lx_ptrace_accord_exit(accord); + + /* + * Does the tracee list contain the pid in question? + */ +retry: + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + if (remote->br_pid == lxpid) { + found = B_TRUE; + break; + } + } + if (!found) { + /* + * The requested pid does not appear in the tracee list. + */ + mutex_exit(&accord->lxpa_tracees_lock); + return (ESRCH); + } + + if (ptrace_op == LX_PTRACE_DETACH) { + /* + * We're detaching, make sure in-syscall flag is off so that + * signal will stop the process directly. + */ + remote->br_ptrace_flags &= ~LX_PTF_INSYSCALL; + } + + /* + * Attempt to lock the target LWP. + */ + if ((error = lx_ptrace_lock_if_stopped(accord, remote, + (ptrace_op == LX_PTRACE_DETACH))) != 0) { + /* + * The LWP was not in "ptrace-stop". For detach, ENOENT + * indicates that the LWP was not in "ptrace-stop", but is + * still locked. 
+ */ + if (ptrace_op == LX_PTRACE_DETACH && error == ENOENT) { + /* + * We're detaching, but the process was not in + * ptrace_stop, so we don't want to try to restart it. + */ + restart = B_FALSE; + } else { + mutex_exit(&accord->lxpa_tracees_lock); + return (error); + } + } + + /* + * The target LWP is in "ptrace-stop". We have the containing process + * locked. + */ + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + if (ptrace_op == LX_PTRACE_DETACH) { + if (TRACEE_BUSY(remote)) { + kmutex_t *rmp; + + /* + * There is a tricky race condition we have to watch + * out for here (for example, if a tracee is in the + * kernel in the middle of a syscall). When the tracee + * is leaving the kernel, it will set LX_PTF_STOPPING. + * In lx_stop_notify() the tracee has to drop its + * p_lock, take pidlock, then reacquire p_lock, before + * it will clear LX_PTF_STOPPING and set LX_PTF_STOPPED. + * During that window, if this tracer is trying to + * detach, we have to make sure the tracee is restarted. + * We handle this case in the same way we handle + * the tracer exiting in lx_ptrace_exit_tracer(). + */ + rmp = &rproc->p_lock; + mutex_exit(&accord->lxpa_tracees_lock); + (void) cv_wait_sig(&lx_ptrace_busy_cv, rmp); + + /* + * While we were waiting, state will have changed, so + * retry. + */ + mutex_exit(rmp); + goto retry; + } + + lx_ptrace_detach(accord, remote, (int)data, restart); + /* + * Drop the lock on both the tracee process and the tracee list. + */ + mutex_exit(&rproc->p_lock); + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Release a hold from the accord. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + + return (0); + } + + /* + * The tracees lock is not needed for any of the other operations. + * Drop it so further actions can avoid deadlock. 
+ */ + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Process the ptrace(2) request: + */ + switch (ptrace_op) { + case LX_PTRACE_CONT: + error = lx_ptrace_cont(remote, LX_PTC_NONE, (int)data); + break; + + case LX_PTRACE_SYSCALL: + error = lx_ptrace_cont(remote, LX_PTC_SYSCALL, (int)data); + break; + + case LX_PTRACE_SINGLESTEP: + error = lx_ptrace_cont(remote, LX_PTC_SINGLESTEP, (int)data); + break; + + case LX_PTRACE_SETOPTIONS: + error = lx_ptrace_setoptions(remote, data); + break; + + case LX_PTRACE_GETEVENTMSG: + error = lx_ptrace_geteventmsg(remote, (void *)data); + break; + + case LX_PTRACE_GETREGS: + error = lx_user_regs_copyout(remote, (void *)data); + break; + + case LX_PTRACE_SETREGS: + error = lx_user_regs_copyin(remote, (void *)data); + break; + + case LX_PTRACE_GETSIGINFO: + error = lx_ptrace_getsiginfo(remote, (void *)data); + break; + + case LX_PTRACE_PEEKTEXT: + case LX_PTRACE_PEEKDATA: + error = lx_ptrace_peek(remote, addr, (void *)data); + break; + + case LX_PTRACE_POKETEXT: + case LX_PTRACE_POKEDATA: + error = lx_ptrace_poke(remote, addr, data); + break; + + case LX_PTRACE_PEEKUSER: + error = lx_ptrace_peekuser(remote, addr, (void *)data); + break; + + case LX_PTRACE_POKEUSER: + error = lx_ptrace_pokeuser(remote, addr, (void *)data); + break; + + case LX_PTRACE_GETFPREGS: + error = lx_user_fpregs_copyout(remote, (void *)data); + break; + + case LX_PTRACE_SETFPREGS: + error = lx_user_fpregs_copyin(remote, (void *)data); + break; + + case LX_PTRACE_GETFPXREGS: + error = lx_user_fpxregs_copyout(remote, (void *)data); + break; + + case LX_PTRACE_SETFPXREGS: + error = lx_user_fpxregs_copyin(remote, (void *)data); + break; + + case LX_PTRACE_KILL: + error = lx_ptrace_kill(remote); + break; + + default: + error = EINVAL; + } + + /* + * Drop the lock on the tracee process. The tracee list lock was + * already dropped above, before the request was processed. 
+ */ + mutex_exit(&rproc->p_lock); + + return (error); +} + +int +lx_ptrace(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data) +{ + int error; + + error = lx_ptrace_kernel(ptrace_op, LX_INIT_TO_PID(lxpid), addr, data); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +void +lx_ptrace_init(void) +{ + cv_init(&lx_ptrace_busy_cv, NULL, CV_DEFAULT, NULL); + + lx_ptrace_accord_cache = kmem_cache_create("lx_ptrace_accord", + sizeof (lx_ptrace_accord_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +lx_ptrace_fini(void) +{ + cv_destroy(&lx_ptrace_busy_cv); + + kmem_cache_destroy(lx_ptrace_accord_cache); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_signal.c b/usr/src/uts/common/brand/lx/os/lx_signal.c new file mode 100644 index 0000000000..53e0cecc14 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_signal.c @@ -0,0 +1,50 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/signal.h> +#include <sys/sunddi.h> +#include <lx_signum.h> + +void +lx_ltos_sigset(lx_sigset_t *lsigp, k_sigset_t *ssigp) +{ + int lx_sig, sig; + + sigemptyset(ssigp); + for (lx_sig = 1; lx_sig <= LX_NSIG; lx_sig++) { + if (lx_sigismember(lsigp, lx_sig) && + ((sig = ltos_signo[lx_sig]) > 0)) + sigaddset(ssigp, sig); + } + + /* Emulate sigutok() restrictions */ + ssigp->__sigbits[0] &= (FILLSET0 & ~CANTMASK0); + ssigp->__sigbits[1] &= (FILLSET1 & ~CANTMASK1); + ssigp->__sigbits[2] &= (FILLSET2 & ~CANTMASK2); +} + +void +lx_stol_sigset(k_sigset_t *ssigp, lx_sigset_t *lsigp) +{ + int sig, lx_sig; + + bzero(lsigp, sizeof (lx_sigset_t)); + for (sig = 1; sig < NSIG; sig++) { + if (sigismember(ssigp, sig) && + ((lx_sig = stol_signo[sig]) > 0)) + lx_sigaddset(lsigp, lx_sig); + } +} diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c new file mode 100644 index 0000000000..f349cfec1f --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c @@ -0,0 +1,1229 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/privregs.h> +#include <sys/brand.h> +#include <sys/machbrand.h> +#include <sys/sdt.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <sys/lx_misc.h> +#include <lx_errno.h> + + +/* + * Flags for sysent entries: + */ +#define LX_SYS_NOSYS_REASON 0x07 +#define LX_SYS_EBPARG6 0x08 + +/* + * Flags that denote the specific reason we do not have a particular system + * call. These reasons are only valid if the function is NULL. + */ +#define NOSYS_USERMODE 0 +#define NOSYS_NULL 1 +#define NOSYS_NONE 2 +#define NOSYS_NO_EQUIV 3 +#define NOSYS_KERNEL 4 +#define NOSYS_UNDOC 5 +#define NOSYS_OBSOLETE 6 +#define NOSYS_MAX NOSYS_OBSOLETE + +#if NOSYS_MAX > LX_SYS_NOSYS_REASON +#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON +#endif + +/* + * Strings describing the reason we do not emulate a particular system call + * in the kernel. 
+ */ +static char *nosys_reasons[] = { + NULL, /* NOSYS_USERMODE means this call is emulated in usermode */ + "Not done yet", + "No such Linux system call", + "No equivalent illumos functionality", + "Reads/modifies Linux kernel state", + "Undocumented and/or rarely used system call", + "Unsupported, obsolete system call" +}; + + +#if defined(_LP64) +/* + * System call handler table and entry count for Linux x86_64 (amd64): + */ +lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +int lx_nsysent64; +#endif +/* + * System call handler table and entry count for Linux x86 (i386): + */ +lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +int lx_nsysent32; + +#if defined(_LP64) +struct lx_vsyscall +{ + uintptr_t lv_addr; + uintptr_t lv_scnum; +} lx_vsyscalls[] = { + { LX_VSYS_gettimeofday, LX_SYS_gettimeofday }, + { LX_VSYS_time, LX_SYS_time }, + { LX_VSYS_getcpu, LX_SYS_getcpu }, + { (uintptr_t)NULL, (uintptr_t)NULL } +}; +#endif + +#if defined(__amd64) +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) +{ + struct regs *rp = lwptoregs(lwp); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + /* + * Note: Syscall argument passing is different from function + * call argument passing on amd64. For function calls, the + * fourth arg is passed via %rcx, but for system calls the 4th + * arg is passed via %r10. This is because in amd64, the + * syscall instruction puts the lower 32 bits of %rflags in + * %r11 and puts the %rip value to %rcx. + * + * Appendix A of the amd64 ABI (Linux conventions) states that + * syscalls are limited to 6 args and no arg is passed on the + * stack. + */ + args[0] = rp->r_rdi; + args[1] = rp->r_rsi; + args[2] = rp->r_rdx; + args[3] = rp->r_r10; + args[4] = rp->r_r8; + args[5] = rp->r_r9; + } else { + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. 
+ */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + uint32_t args32[6]; + + if (copyin((void *)rp->r_rbx, &args32, + sizeof (args32)) != 0) { + /* + * Clear the argument vector so that the + * trace probe does not expose kernel + * memory. + */ + bzero(args, 6 * sizeof (uintptr_t)); + return (set_errno(EFAULT)); + } + + args[0] = args32[0]; + args[1] = args32[1]; + args[2] = args32[2]; + args[3] = args32[3]; + args[4] = args32[4]; + args[5] = args32[5]; + } else { + args[0] = rp->r_rbx; + args[1] = rp->r_rcx; + args[2] = rp->r_rdx; + args[3] = rp->r_rsi; + args[4] = rp->r_rdi; + args[5] = rp->r_rbp; + } + } + + return (0); +} + +#else /* !__amd64 */ + +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) +{ + struct regs *rp = lwptoregs(lwp); + + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. + */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + if (copyin((void *)rp->r_ebx, args, 6 * sizeof (uintptr_t)) != + 0) { + /* + * Clear the argument vector so that the trace probe + * does not expose kernel memory. + */ + bzero(args, 6 * sizeof (uintptr_t)); + return (set_errno(EFAULT)); + } + } else { + args[0] = rp->r_ebx; + args[1] = rp->r_ecx; + args[2] = rp->r_edx; + args[3] = rp->r_esi; + args[4] = rp->r_edi; + args[5] = rp->r_ebp; + } + + return (0); +} +#endif + +void +lx_syscall_return(klwp_t *lwp, int syscall_num, long ret) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + int error = lwp->lwp_errno; + + if (error != EINTR) { + /* + * If this system call was not interrupted, clear the system + * call restart flag before lx_setcontext() can pass it to + * usermode. 
+ */ + lwpd->br_syscall_restart = B_FALSE; + } + + if (error != 0) { + /* + * Convert from illumos to Linux errno: + */ + ret = -lx_errno(error, EINVAL); + } + + /* + * 32-bit Linux system calls return via %eax; 64-bit calls return via + * %rax. + */ + rp->r_r0 = ret; + + /* + * Hold for the ptrace(2) "syscall-exit-stop" condition if required by + * PTRACE_SYSCALL. Note that the register state may be modified by + * tracer. + */ + (void) lx_ptrace_stop(LX_PR_SYSEXIT); + + /* + * Emit audit record, if necessary. + */ + lx_audit_syscall_exit(syscall_num, ret); + + /* + * Fire the DTrace "lx-syscall:::return" probe: + */ + lx_trace_sysreturn(syscall_num, ret); + + /* + * Clear errno for next time. We do not clear "br_syscall_restart" or + * "br_syscall_num" as they are potentially used by "lx_savecontext()" + * in the signal delivery path. + */ + lwp->lwp_errno = 0; + + lx_check_strict_failure(lwpd); + + /* + * We want complete control of the registers on return from this + * emulated Linux system call: + */ + lwp->lwp_eosys = JUSTRETURN; +} + +static void +lx_syscall_unsup_msg(lx_sysent_t *s, int syscall_num, int unsup_reason) +{ + char buf[100]; + + if (s == NULL) { + (void) snprintf(buf, sizeof (buf), "NOSYS (%d): out of bounds", + syscall_num); + } else { + VERIFY(unsup_reason < (sizeof (nosys_reasons) / + sizeof (*nosys_reasons))); + + if (s->sy_name == NULL) { + (void) snprintf(buf, sizeof (buf), "NOSYS (%d): %s", + syscall_num, nosys_reasons[unsup_reason]); + } else { + (void) snprintf(buf, sizeof (buf), "NOSYS (%s): %s", + s->sy_name, nosys_reasons[unsup_reason]); + } + } + + lx_unsupported(buf); +} + +/* + * This function is used to override the processing of arguments and + * invocation of a handler for emulated system calls, installed on each + * branded LWP as "lwp_brand_syscall". If this system call should use the + * native path, we return 1. 
If we handled this system call (and have made + * arrangements with respect to post-return usermode register state) we + * return 0. + */ +int +lx_syscall_enter(void) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + int syscall_num; + int error; + long ret = 0; + lx_sysent_t *s; + uintptr_t args[6]; + unsigned int unsup_reason; + + /* + * If we got here, we should have an LWP-specific brand data + * structure. + */ + VERIFY(lwpd != NULL); + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) { + /* + * The lwp is not in BRAND execution mode, so we return + * to the regular native system call path. + */ + DTRACE_PROBE(brand__lx__syscall__hook__skip); + return (1); + } + + /* + * Clear the restartable system call flag. This flag will be set + * in the system call handler if the call is a candidate for + * a restart. It will be saved by lx_setcontext() in the event + * that we take a signal, and used in the signal handling path + * to restart the system call iff SA_RESTART was set for this + * signal. Save the system call number so that we can store it + * in the saved context if required. + */ + lwpd->br_syscall_restart = B_FALSE; + lwpd->br_syscall_num = (int)rp->r_r0; + + /* + * Hold for the ptrace(2) "syscall-entry-stop" condition if traced by + * PTRACE_SYSCALL. The system call number and arguments may be + * modified by the tracer. + */ + (void) lx_ptrace_stop(LX_PR_SYSENTRY); + + /* + * Check that the system call number is within the bounds we expect. 
+ */ + syscall_num = lwpd->br_syscall_num; + if (syscall_num < 0 || syscall_num > LX_MAX_SYSCALL(lwp)) { + lx_syscall_unsup_msg(NULL, syscall_num, 0); + + (void) set_errno(ENOTSUP); + lx_syscall_return(lwp, syscall_num, -1); + return (0); + } + +#if defined(_LP64) + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { + s = &lx_sysent64[syscall_num]; + } else +#endif + { + s = &lx_sysent32[syscall_num]; + } + + /* + * Process the arguments for this system call and fire the DTrace + * "lx-syscall:::entry" probe: + */ + error = lx_emulate_args(lwp, s, args); + lx_trace_sysenter(syscall_num, args); + lwpd->br_syscall_args[0] = args[0]; + lwpd->br_syscall_args[1] = args[1]; + lwpd->br_syscall_args[2] = args[2]; + lwpd->br_syscall_args[3] = args[3]; + if (error != 0) { + /* + * Could not read and process the arguments. Return the error + * to the process. + */ + (void) set_errno(error); + lx_syscall_return(lwp, syscall_num, -1); + return (0); + } + + if (s->sy_callc != NULL) { + /* + * Call the in-kernel handler for this Linux system call: + */ + lwpd->br_eosys = NORMALRETURN; + ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4], + args[5]); + if (lwpd->br_eosys == NORMALRETURN) { + lx_syscall_return(lwp, syscall_num, ret); + } + return (0); + } + + /* + * There is no in-kernel handler. + */ + switch (unsup_reason = (s->sy_flags & LX_SYS_NOSYS_REASON)) { + case NOSYS_USERMODE: + /* + * Pass to the usermode emulation routine. + */ +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_emulate_user32(lwp, syscall_num, args); + } else +#endif + { + lx_emulate_user(lwp, syscall_num, args); + } + return (0); + + default: + /* + * We are not emulating this system call at all. + */ + lx_syscall_unsup_msg(s, syscall_num, unsup_reason); + + (void) set_errno(ENOTSUP); + lx_syscall_return(lwp, syscall_num, -1); + return (0); + } +} + +#if defined(_LP64) +/* + * Emulate vsyscall support. 
+ * + * Linux magically maps a single page into the address space of each process, + * allowing them to make 'vsyscalls'. Originally designed to counteract the + * perceived overhead of regular system calls, vsyscalls were implemented as + * code residing in userspace which could be called directly. The userspace + * implementations of these vsyscalls have since been replaced by + * instructions which vector into the normal syscall path. + * + * Implementing vsyscalls on illumos is complicated by the fact that the + * required static address region resides inside the kernel address space. + * Rather than mapping a user-accessible page into the KAS, a different + * approach is taken. The vsyscall gate is emulated by interposing on + * pagefaults in trap(). An attempt to execute a known vsyscall address will + * result in emulating the appropriate system call rather than inducing a + * SIGSEGV. + */ +void +lx_vsyscall_enter(proc_t *p, klwp_t *lwp, int scnum) +{ + struct regs *rp = lwptoregs(lwp); + uintptr_t raddr; + + /* + * Fetch the return address from the process stack. + */ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + if (copyin((void *)rp->r_rsp, &raddr, sizeof (raddr)) != 0) { +#if DEBUG + printf("lx_vsyscall_call: bad brand stack at vsyscall " + "cmd=%s, pid=%d, sp=0x%p\n", PTOU(p)->u_comm, + p->p_pid, (void *)rp->r_rsp); +#endif + + /* + * The process jumped to the vsyscall address without a + * correctly configured stack. Terminate the process. + */ + exit(CLD_KILLED, SIGSEGV); + return; + } + + DTRACE_PROBE1(brand__lx__vsyscall, int, scnum); + + /* Simulate vectoring into the syscall */ + rp->r_rax = scnum; + rp->r_rip = raddr; + rp->r_rsp += sizeof (uintptr_t); + + (void) lx_syscall_enter(); +} + +boolean_t +lx_vsyscall_iscall(klwp_t *lwp, uintptr_t addr, int *scnum) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + int i; + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) { + /* + * We only handle vsyscalls when running Linux code. 
+ */ + return (B_FALSE); + } + + if (addr < LX_VSYSCALL_ADDR || + addr >= (LX_VSYSCALL_ADDR + LX_VSYSCALL_SIZE)) { + /* + * Ignore faults outside the vsyscall page. + */ + return (B_FALSE); + } + + for (i = 0; lx_vsyscalls[i].lv_addr != (uintptr_t)NULL; i++) { + if (addr == lx_vsyscalls[i].lv_addr) { + /* + * This is a valid vsyscall address. + */ + *scnum = lx_vsyscalls[i].lv_scnum; + return (B_TRUE); + } + } + + lx_unsupported("bad vsyscall access"); + return (B_FALSE); +} +#endif + +/* + * Linux defines system call numbers for 32-bit x86 in the file: + * arch/x86/syscalls/syscall_32.tbl + */ +lx_sysent_t lx_sysent32[] = { + {"nosys", NULL, NOSYS_NONE, 0}, /* 0 */ + {"exit", NULL, 0, 1}, /* 1 */ + {"fork", NULL, 0, 0}, /* 2 */ + {"read", lx_read, 0, 3}, /* 3 */ + {"write", lx_write, 0, 3}, /* 4 */ + {"open", lx_open, 0, 3}, /* 5 */ + {"close", lx_close, 0, 1}, /* 6 */ + {"waitpid", lx_waitpid, 0, 3}, /* 7 */ + {"creat", lx_creat, 0, 2}, /* 8 */ + {"link", lx_link, 0, 2}, /* 9 */ + {"unlink", lx_unlink, 0, 1}, /* 10 */ + {"execve", NULL, 0, 3}, /* 11 */ + {"chdir", lx_chdir, 0, 1}, /* 12 */ + {"time", lx_time, 0, 1}, /* 13 */ + {"mknod", NULL, 0, 3}, /* 14 */ + {"chmod", lx_chmod, 0, 2}, /* 15 */ + {"lchown16", lx_lchown16, 0, 3}, /* 16 */ + {"break", NULL, NOSYS_OBSOLETE, 0}, /* 17 */ + {"stat", NULL, NOSYS_OBSOLETE, 0}, /* 18 */ + {"lseek", lx_lseek32, 0, 3}, /* 19 */ + {"getpid", lx_getpid, 0, 0}, /* 20 */ + {"mount", lx_mount, 0, 5}, /* 21 */ + {"umount", lx_umount, 0, 1}, /* 22 */ + {"setuid16", lx_setuid16, 0, 1}, /* 23 */ + {"getuid16", lx_getuid16, 0, 0}, /* 24 */ + {"stime", lx_stime, 0, 1}, /* 25 */ + {"ptrace", lx_ptrace, 0, 4}, /* 26 */ + {"alarm", lx_alarm, 0, 1}, /* 27 */ + {"fstat", NULL, NOSYS_OBSOLETE, 0}, /* 28 */ + {"pause", lx_pause, 0, 0}, /* 29 */ + {"utime", NULL, 0, 2}, /* 30 */ + {"stty", NULL, NOSYS_OBSOLETE, 0}, /* 31 */ + {"gtty", NULL, NOSYS_OBSOLETE, 0}, /* 32 */ + {"access", lx_access, 0, 2}, /* 33 */ + {"nice", lx_nice, 0, 1}, /* 34 
*/ + {"ftime", NULL, NOSYS_OBSOLETE, 0}, /* 35 */ + {"sync", lx_sync, 0, 0}, /* 36 */ + {"kill", lx_kill, 0, 2}, /* 37 */ + {"rename", lx_rename, 0, 2}, /* 38 */ + {"mkdir", lx_mkdir, 0, 2}, /* 39 */ + {"rmdir", NULL, 0, 1}, /* 40 */ + {"dup", lx_dup, 0, 1}, /* 41 */ + {"pipe", lx_pipe, 0, 1}, /* 42 */ + {"times", lx_times, 0, 1}, /* 43 */ + {"prof", NULL, NOSYS_OBSOLETE, 0}, /* 44 */ + {"brk", lx_brk, 0, 1}, /* 45 */ + {"setgid16", lx_setgid16, 0, 1}, /* 46 */ + {"getgid16", lx_getgid16, 0, 0}, /* 47 */ + {"signal", NULL, 0, 2}, /* 48 */ + {"geteuid16", lx_geteuid16, 0, 0}, /* 49 */ + {"getegid16", lx_getegid16, 0, 0}, /* 50 */ + {"acct", lx_acct, 0, 1}, /* 51 */ + {"umount2", lx_umount2, 0, 2}, /* 52 */ + {"lock", NULL, NOSYS_OBSOLETE, 0}, /* 53 */ + {"ioctl", lx_ioctl, 0, 3}, /* 54 */ + {"fcntl", lx_fcntl, 0, 3}, /* 55 */ + {"mpx", NULL, NOSYS_OBSOLETE, 0}, /* 56 */ + {"setpgid", lx_setpgid, 0, 2}, /* 57 */ + {"ulimit", NULL, NOSYS_OBSOLETE, 0}, /* 58 */ + {"olduname", NULL, NOSYS_OBSOLETE, 0}, /* 59 */ + {"umask", lx_umask, 0, 1}, /* 60 */ + {"chroot", lx_chroot, 0, 1}, /* 61 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 62 */ + {"dup2", lx_dup2, 0, 2}, /* 63 */ + {"getppid", lx_getppid, 0, 0}, /* 64 */ + {"getpgrp", lx_getpgrp, 0, 0}, /* 65 */ + {"setsid", lx_setsid, 0, 0}, /* 66 */ + {"sigaction", NULL, 0, 3}, /* 67 */ + {"sgetmask", NULL, NOSYS_OBSOLETE, 0}, /* 68 */ + {"ssetmask", NULL, NOSYS_OBSOLETE, 0}, /* 69 */ + {"setreuid16", lx_setreuid16, 0, 2}, /* 70 */ + {"setregid16", lx_setregid16, 0, 2}, /* 71 */ + {"sigsuspend", NULL, 0, 1}, /* 72 */ + {"sigpending", NULL, 0, 1}, /* 73 */ + {"sethostname", lx_sethostname, 0, 2}, /* 74 */ + {"setrlimit", lx_setrlimit, 0, 2}, /* 75 */ + {"getrlimit", lx_oldgetrlimit, 0, 2}, /* 76 */ + {"getrusage", lx_getrusage, 0, 2}, /* 77 */ + {"gettimeofday", lx_gettimeofday, 0, 2}, /* 78 */ + {"settimeofday", NULL, 0, 2}, /* 79 */ + {"getgroups16", NULL, 0, 2}, /* 80 */ + {"setgroups16", NULL, 0, 2}, /* 81 */ + {"select", 
NULL, NOSYS_OBSOLETE, 0}, /* 82 */ + {"symlink", lx_symlink, 0, 2}, /* 83 */ + {"oldlstat", NULL, NOSYS_OBSOLETE, 0}, /* 84 */ + {"readlink", lx_readlink, 0, 3}, /* 85 */ + {"uselib", NULL, NOSYS_KERNEL, 0}, /* 86 */ + {"swapon", lx_swapon, 0, 2}, /* 87 */ + {"reboot", lx_reboot, 0, 4}, /* 88 */ + {"readdir", NULL, 0, 3}, /* 89 */ + {"mmap", lx_mmap, 0, 6}, /* 90 */ + {"munmap", lx_munmap, 0, 2}, /* 91 */ + {"truncate", NULL, 0, 2}, /* 92 */ + {"ftruncate", NULL, 0, 2}, /* 93 */ + {"fchmod", lx_fchmod, 0, 2}, /* 94 */ + {"fchown16", lx_fchown16, 0, 3}, /* 95 */ + {"getpriority", lx_getpriority, 0, 2}, /* 96 */ + {"setpriority", lx_setpriority, 0, 3}, /* 97 */ + {"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */ + {"statfs", NULL, 0, 2}, /* 99 */ + {"fstatfs", NULL, 0, 2}, /* 100 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 101 */ + {"socketcall", lx_socketcall, 0, 2}, /* 102 */ + {"syslog", lx_syslog, 0, 3}, /* 103 */ + {"setitimer", NULL, 0, 3}, /* 104 */ + {"getitimer", lx_getitimer, 0, 2}, /* 105 */ + {"stat", lx_stat32, 0, 2}, /* 106 */ + {"lstat", lx_lstat32, 0, 2}, /* 107 */ + {"fstat", lx_fstat32, 0, 2}, /* 108 */ + {"uname", NULL, NOSYS_OBSOLETE, 0}, /* 109 */ + {"oldiopl", NULL, NOSYS_NO_EQUIV, 0}, /* 110 */ + {"vhangup", lx_vhangup, 0, 0}, /* 111 */ + {"idle", NULL, NOSYS_NO_EQUIV, 0}, /* 112 */ + {"vm86old", NULL, NOSYS_OBSOLETE, 0}, /* 113 */ + {"wait4", lx_wait4, 0, 4}, /* 114 */ + {"swapoff", lx_swapoff, 0, 1}, /* 115 */ + {"sysinfo", lx_sysinfo32, 0, 1}, /* 116 */ + {"ipc", NULL, 0, 5}, /* 117 */ + {"fsync", NULL, 0, 1}, /* 118 */ + {"sigreturn", NULL, 0, 1}, /* 119 */ + {"clone", NULL, 0, 5}, /* 120 */ + {"setdomainname", lx_setdomainname, 0, 2}, /* 121 */ + {"uname", lx_uname, 0, 1}, /* 122 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 123 */ + {"adjtimex", NULL, 0, 1}, /* 124 */ + {"mprotect", lx_mprotect, 0, 3}, /* 125 */ + {"sigprocmask", NULL, 0, 3}, /* 126 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 127 */ + {"init_module", NULL, NOSYS_KERNEL, 
0}, /* 128 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 129 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 130 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 131 */ + {"getpgid", lx_getpgid, 0, 1}, /* 132 */ + {"fchdir", lx_fchdir, 0, 1}, /* 133 */ + {"bdflush", NULL, NOSYS_KERNEL, 0}, /* 134 */ + {"sysfs", NULL, 0, 3}, /* 135 */ + {"personality", lx_personality, 0, 1}, /* 136 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 137 */ + {"setfsuid16", lx_setfsuid16, 0, 1}, /* 138 */ + {"setfsgid16", lx_setfsgid16, 0, 1}, /* 139 */ + {"llseek", lx_llseek, 0, 5}, /* 140 */ + {"getdents", lx_getdents_32, 0, 3}, /* 141 */ + {"select", lx_select, 0, 5}, /* 142 */ + {"flock", lx_flock, 0, 2}, /* 143 */ + {"msync", lx_msync, 0, 3}, /* 144 */ + {"readv", lx_readv, 0, 3}, /* 145 */ + {"writev", lx_writev, 0, 3}, /* 146 */ + {"getsid", lx_getsid, 0, 1}, /* 147 */ + {"fdatasync", NULL, 0, 1}, /* 148 */ + {"sysctl", NULL, 0, 1}, /* 149 */ + {"mlock", lx_mlock, 0, 2}, /* 150 */ + {"munlock", lx_munlock, 0, 2}, /* 151 */ + {"mlockall", lx_mlockall, 0, 1}, /* 152 */ + {"munlockall", lx_munlockall, 0, 0}, /* 153 */ + {"sched_setparam", lx_sched_setparam, 0, 2}, /* 154 */ + {"sched_getparam", lx_sched_getparam, 0, 2}, /* 155 */ + {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 156 */ + {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 157 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 158 */ + {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 159 */ + {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 160 */ + {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 161 */ + {"nanosleep", lx_nanosleep, 0, 2}, /* 162 */ + {"mremap", lx_mremap, 0, 5}, /* 163 */ + {"setresuid16", lx_setresuid16, 0, 3}, /* 164 */ + {"getresuid16", lx_getresuid16, 0, 3}, /* 165 */ + {"vm86", NULL, NOSYS_NO_EQUIV, 0}, /* 166 */ + {"query_module", NULL, 0, 5}, /* 167 */ + {"poll", lx_poll, 0, 3}, /* 168 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 169 */ 
+ {"setresgid16", lx_setresgid16, 0, 3}, /* 170 */ + {"getresgid16", lx_getresgid16, 0, 3}, /* 171 */ + {"prctl", lx_prctl, 0, 5}, /* 172 */ + {"rt_sigreturn", NULL, 0, 0}, /* 173 */ + {"rt_sigaction", NULL, 0, 4}, /* 174 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 175 */ + {"rt_sigpending", NULL, 0, 2}, /* 176 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 177 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 178 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 179 */ + {"pread64", lx_pread32, 0, 5}, /* 180 */ + {"pwrite64", lx_pwrite32, 0, 5}, /* 181 */ + {"chown16", lx_chown16, 0, 3}, /* 182 */ + {"getcwd", lx_getcwd, 0, 2}, /* 183 */ + {"capget", NULL, 0, 2}, /* 184 */ + {"capset", NULL, 0, 2}, /* 185 */ + {"sigaltstack", NULL, 0, 2}, /* 186 */ + {"sendfile", NULL, 0, 4}, /* 187 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 188 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 189 */ + {"vfork", NULL, 0, 0}, /* 190 */ + {"getrlimit", lx_getrlimit, 0, 2}, /* 191 */ + {"mmap2", lx_mmap2, LX_SYS_EBPARG6, 6}, /* 192 */ + {"truncate64", NULL, 0, 3}, /* 193 */ + {"ftruncate64", NULL, 0, 3}, /* 194 */ + {"stat64", lx_stat64, 0, 2}, /* 195 */ + {"lstat64", lx_lstat64, 0, 2}, /* 196 */ + {"fstat64", lx_fstat64, 0, 2}, /* 197 */ + {"lchown", lx_lchown, 0, 3}, /* 198 */ + {"getuid", lx_getuid, 0, 0}, /* 199 */ + {"getgid", lx_getgid, 0, 0}, /* 200 */ + {"geteuid", lx_geteuid, 0, 0}, /* 201 */ + {"getegid", lx_getegid, 0, 0}, /* 202 */ + {"setreuid", lx_setreuid, 0, 2}, /* 203 */ + {"setregid", lx_setregid, 0, 2}, /* 204 */ + {"getgroups", NULL, 0, 2}, /* 205 */ + {"setgroups", NULL, 0, 2}, /* 206 */ + {"fchown", lx_fchown, 0, 3}, /* 207 */ + {"setresuid", lx_setresuid, 0, 3}, /* 208 */ + {"getresuid", lx_getresuid, 0, 3}, /* 209 */ + {"setresgid", lx_setresgid, 0, 3}, /* 210 */ + {"getresgid", lx_getresgid, 0, 3}, /* 211 */ + {"chown", lx_chown, 0, 3}, /* 212 */ + {"setuid", lx_setuid, 0, 1}, /* 213 */ + {"setgid", lx_setgid, 0, 1}, /* 214 */ + {"setfsuid", lx_setfsuid, 0, 1}, /* 215 */ + {"setfsgid", 
lx_setfsgid, 0, 1}, /* 216 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 217 */ + {"mincore", lx_mincore, 0, 3}, /* 218 */ + {"madvise", lx_madvise, 0, 3}, /* 219 */ + {"getdents64", lx_getdents64, 0, 3}, /* 220 */ + {"fcntl64", lx_fcntl64, 0, 3}, /* 221 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 222 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 223 */ + {"gettid", lx_gettid, 0, 0}, /* 224 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 225 */ + {"setxattr", lx_setxattr, 0, 5}, /* 226 */ + {"lsetxattr", lx_lsetxattr, 0, 5}, /* 227 */ + {"fsetxattr", lx_fsetxattr, 0, 5}, /* 228 */ + {"getxattr", lx_getxattr, 0, 4}, /* 229 */ + {"lgetxattr", lx_lgetxattr, 0, 4}, /* 230 */ + {"fgetxattr", lx_fgetxattr, 0, 4}, /* 231 */ + {"listxattr", lx_listxattr, 0, 3}, /* 232 */ + {"llistxattr", lx_llistxattr, 0, 3}, /* 233 */ + {"flistxattr", lx_flistxattr, 0, 3}, /* 234 */ + {"removexattr", lx_removexattr, 0, 2}, /* 235 */ + {"lremovexattr", lx_lremovexattr, 0, 2}, /* 236 */ + {"fremovexattr", lx_fremovexattr, 0, 2}, /* 237 */ + {"tkill", lx_tkill, 0, 2}, /* 238 */ + {"sendfile64", NULL, 0, 4}, /* 239 */ + {"futex", lx_futex, LX_SYS_EBPARG6, 6}, /* 240 */ + {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 241 */ + {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 242 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 243 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 244 */ + {"io_setup", lx_io_setup, 0, 2}, /* 245 */ + {"io_destroy", lx_io_destroy, 0, 1}, /* 246 */ + {"io_getevents", lx_io_getevents, 0, 5}, /* 247 */ + {"io_submit", lx_io_submit, 0, 3}, /* 248 */ + {"io_cancel", lx_io_cancel, 0, 3}, /* 249 */ + {"fadvise64", lx_fadvise64_32, 0, 5}, /* 250 */ + {"nosys", NULL, 0, 0}, /* 251 */ + {"group_exit", NULL, 0, 1}, /* 252 */ + {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 253 */ + {"epoll_create", lx_epoll_create, 0, 1}, /* 254 */ + {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 255 */ + {"epoll_wait", lx_epoll_wait, 0, 4}, /* 256 */ + {"remap_file_pages", 
NULL, NOSYS_NO_EQUIV, 0}, /* 257 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 258 */ + {"timer_create", lx_timer_create, 0, 3}, /* 259 */ + {"timer_settime", NULL, 0, 4}, /* 260 */ + {"timer_gettime", NULL, 0, 2}, /* 261 */ + {"timer_getoverrun", NULL, 0, 1}, /* 262 */ + {"timer_delete", NULL, 0, 1}, /* 263 */ + {"clock_settime", lx_clock_settime, 0, 2}, /* 264 */ + {"clock_gettime", lx_clock_gettime, 0, 2}, /* 265 */ + {"clock_getres", lx_clock_getres, 0, 2}, /* 266 */ + {"clock_nanosleep", NULL, 0, 4}, /* 267 */ + {"statfs64", NULL, 0, 2}, /* 268 */ + {"fstatfs64", NULL, 0, 2}, /* 269 */ + {"tgkill", lx_tgkill, 0, 3}, /* 270 */ + +/* + * The following system calls only exist in kernel 2.6 and greater: + */ + {"utimes", NULL, 0, 2}, /* 271 */ + {"fadvise64_64", lx_fadvise64_64, LX_SYS_EBPARG6, 6}, /* 272 */ + {"vserver", NULL, NOSYS_NULL, 0}, /* 273 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 274 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 275 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 276 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 277 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 278 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 279 */ + {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 280 */ + {"mq_notify", NULL, NOSYS_NULL, 0}, /* 281 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 282 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 283 */ + {"waitid", lx_waitid, 0, 4}, /* 284 */ + {"sys_setaltroot", NULL, NOSYS_NULL, 0}, /* 285 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 286 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 287 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 288 */ + {"ioprio_set", lx_ioprio_set, 0, 3}, /* 289 */ + {"ioprio_get", lx_ioprio_get, 0, 2}, /* 290 */ + {"inotify_init", NULL, 0, 0}, /* 291 */ + {"inotify_add_watch", NULL, 0, 3}, /* 292 */ + {"inotify_rm_watch", NULL, 0, 2}, /* 293 */ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 294 */ + {"openat", lx_openat, 0, 4}, /* 295 */ + {"mkdirat", lx_mkdirat, 0, 3}, /* 296 */ + {"mknodat", NULL, 0, 
4}, /* 297 */ + {"fchownat", lx_fchownat, 0, 5}, /* 298 */ + {"futimesat", NULL, 0, 3}, /* 299 */ + {"fstatat64", lx_fstatat64, 0, 4}, /* 300 */ + {"unlinkat", lx_unlinkat, 0, 3}, /* 301 */ + {"renameat", lx_renameat, 0, 4}, /* 302 */ + {"linkat", lx_linkat, 0, 5}, /* 303 */ + {"symlinkat", lx_symlinkat, 0, 3}, /* 304 */ + {"readlinkat", lx_readlinkat, 0, 4}, /* 305 */ + {"fchmodat", lx_fchmodat, 0, 3}, /* 306 */ + {"faccessat", lx_faccessat, 0, 4}, /* 307 */ + {"pselect6", lx_pselect, LX_SYS_EBPARG6, 6}, /* 308 */ + {"ppoll", lx_ppoll, 0, 5}, /* 309 */ + {"unshare", lx_unshare, 0, 1}, /* 310 */ + {"set_robust_list", lx_set_robust_list, 0, 2}, /* 311 */ + {"get_robust_list", lx_get_robust_list, 0, 3}, /* 312 */ + {"splice", lx_splice, LX_SYS_EBPARG6, 6}, /* 313 */ + {"sync_file_range", lx_sync_file_range, 0, 4}, /* 314 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 315 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, /* 316 */ + {"move_pages", NULL, NOSYS_NULL, 0}, /* 317 */ + {"getcpu", lx_getcpu, 0, 3}, /* 318 */ + {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 319 */ + {"utimensat", NULL, 0, 4}, /* 320 */ + {"signalfd", NULL, 0, 3}, /* 321 */ + {"timerfd_create", NULL, 0, 2}, /* 322 */ + {"eventfd", lx_eventfd, 0, 1}, /* 323 */ + {"fallocate", lx_fallocate32, LX_SYS_EBPARG6, 6}, /* 324 */ + {"timerfd_settime", NULL, 0, 4}, /* 325 */ + {"timerfd_gettime", NULL, 0, 2}, /* 326 */ + {"signalfd4", NULL, 0, 4}, /* 327 */ + {"eventfd2", lx_eventfd2, 0, 2}, /* 328 */ + {"epoll_create1", lx_epoll_create1, 0, 1}, /* 329 */ + {"dup3", lx_dup3, 0, 3}, /* 330 */ + {"pipe2", lx_pipe2, 0, 2}, /* 331 */ + {"inotify_init1", NULL, 0, 1}, /* 332 */ + {"preadv", lx_preadv32, 0, 5}, /* 333 */ + {"pwritev", lx_pwritev32, 0, 5}, /* 334 */ + {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 335 */ + {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 336 */ + {"recvmmsg", lx_recvmmsg, 0, 5}, /* 337 */ + {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 338 */ + {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 339 */ + {"prlimit64", 
lx_prlimit64, 0, 4}, /* 340 */ + {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 341 */ + {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 342 */ + {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 343 */ + {"syncfs", lx_syncfs, 0, 1}, /* 344 */ + {"sendmmsg", lx_sendmmsg, 0, 4}, /* 345 */ + {"setns", NULL, NOSYS_NULL, 0}, /* 346 */ + {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 347 */ + {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 348 */ + {"kcmp", NULL, NOSYS_NULL, 0}, /* 349 */ + {"finit_module", NULL, NOSYS_NULL, 0}, /* 350 */ + {"sched_setattr", lx_sched_setattr, 0, 3}, /* 351 */ + {"sched_getattr", lx_sched_getattr, 0, 4}, /* 352 */ + {"renameat2", NULL, NOSYS_NULL, 0}, /* 353 */ + {"seccomp", NULL, NOSYS_NULL, 0}, /* 354 */ + {"getrandom", lx_getrandom, 0, 3}, /* 355 */ + {"memfd_create", NULL, NOSYS_NULL, 0}, /* 356 */ + {"bpf", NULL, NOSYS_NULL, 0}, /* 357 */ + {"execveat", NULL, NOSYS_NULL, 0}, /* 358 */ +}; + +#if defined(_LP64) +/* + * Linux defines system call numbers for 64-bit x86 in the file: + * arch/x86/syscalls/syscall_64.tbl + */ +lx_sysent_t lx_sysent64[] = { + {"read", lx_read, 0, 3}, /* 0 */ + {"write", lx_write, 0, 3}, /* 1 */ + {"open", lx_open, 0, 3}, /* 2 */ + {"close", lx_close, 0, 1}, /* 3 */ + {"stat", lx_stat64, 0, 2}, /* 4 */ + {"fstat", lx_fstat64, 0, 2}, /* 5 */ + {"lstat", lx_lstat64, 0, 2}, /* 6 */ + {"poll", lx_poll, 0, 3}, /* 7 */ + {"lseek", lx_lseek64, 0, 3}, /* 8 */ + {"mmap", lx_mmap, 0, 6}, /* 9 */ + {"mprotect", lx_mprotect, 0, 3}, /* 10 */ + {"munmap", lx_munmap, 0, 2}, /* 11 */ + {"brk", lx_brk, 0, 1}, /* 12 */ + {"rt_sigaction", NULL, 0, 4}, /* 13 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 14 */ + {"rt_sigreturn", NULL, 0, 0}, /* 15 */ + {"ioctl", lx_ioctl, 0, 3}, /* 16 */ + {"pread64", lx_pread, 0, 4}, /* 17 */ + {"pwrite64", lx_pwrite, 0, 4}, /* 18 */ + {"readv", lx_readv, 0, 3}, /* 19 */ + {"writev", lx_writev, 0, 3}, /* 20 */ + {"access", lx_access, 0, 2}, /* 21 */ + {"pipe", lx_pipe, 0, 1}, /* 22 */ + {"select", lx_select, 
0, 5}, /* 23 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 24 */ + {"mremap", lx_mremap, 0, 5}, /* 25 */ + {"msync", lx_msync, 0, 3}, /* 26 */ + {"mincore", lx_mincore, 0, 3}, /* 27 */ + {"madvise", lx_madvise, 0, 3}, /* 28 */ + {"shmget", NULL, 0, 3}, /* 29 */ + {"shmat", NULL, 0, 4}, /* 30 */ + {"shmctl", NULL, 0, 3}, /* 31 */ + {"dup", lx_dup, 0, 1}, /* 32 */ + {"dup2", lx_dup2, 0, 2}, /* 33 */ + {"pause", lx_pause, 0, 0}, /* 34 */ + {"nanosleep", lx_nanosleep, 0, 2}, /* 35 */ + {"getitimer", lx_getitimer, 0, 2}, /* 36 */ + {"alarm", lx_alarm, 0, 1}, /* 37 */ + {"setitimer", NULL, 0, 3}, /* 38 */ + {"getpid", lx_getpid, 0, 0}, /* 39 */ + {"sendfile", NULL, 0, 4}, /* 40 */ + {"socket", lx_socket, 0, 3}, /* 41 */ + {"connect", lx_connect, 0, 3}, /* 42 */ + {"accept", lx_accept, 0, 3}, /* 43 */ + {"sendto", lx_sendto, 0, 6}, /* 44 */ + {"recvfrom", lx_recvfrom, 0, 6}, /* 45 */ + {"sendmsg", lx_sendmsg, 0, 3}, /* 46 */ + {"recvmsg", lx_recvmsg, 0, 3}, /* 47 */ + {"shutdown", lx_shutdown, 0, 2}, /* 48 */ + {"bind", lx_bind, 0, 3}, /* 49 */ + {"listen", lx_listen, 0, 2}, /* 50 */ + {"getsockname", lx_getsockname, 0, 3}, /* 51 */ + {"getpeername", lx_getpeername, 0, 3}, /* 52 */ + {"socketpair", lx_socketpair, 0, 4}, /* 53 */ + {"setsockopt", lx_setsockopt, 0, 5}, /* 54 */ + {"getsockopt", lx_getsockopt, 0, 5}, /* 55 */ + {"clone", NULL, 0, 5}, /* 56 */ + {"fork", NULL, 0, 0}, /* 57 */ + {"vfork", NULL, 0, 0}, /* 58 */ + {"execve", NULL, 0, 3}, /* 59 */ + {"exit", NULL, 0, 1}, /* 60 */ + {"wait4", lx_wait4, 0, 4}, /* 61 */ + {"kill", lx_kill, 0, 2}, /* 62 */ + {"uname", lx_uname, 0, 1}, /* 63 */ + {"semget", NULL, 0, 3}, /* 64 */ + {"semop", NULL, 0, 3}, /* 65 */ + {"semctl", NULL, 0, 4}, /* 66 */ + {"shmdt", NULL, 0, 1}, /* 67 */ + {"msgget", NULL, 0, 2}, /* 68 */ + {"msgsnd", NULL, 0, 4}, /* 69 */ + {"msgrcv", NULL, 0, 5}, /* 70 */ + {"msgctl", NULL, 0, 3}, /* 71 */ + {"fcntl", lx_fcntl64, 0, 3}, /* 72 */ + {"flock", lx_flock, 0, 2}, /* 73 */ + {"fsync", NULL, 0, 
1}, /* 74 */ + {"fdatasync", NULL, 0, 1}, /* 75 */ + {"truncate", NULL, 0, 2}, /* 76 */ + {"ftruncate", NULL, 0, 2}, /* 77 */ + {"getdents", lx_getdents_64, 0, 3}, /* 78 */ + {"getcwd", lx_getcwd, 0, 2}, /* 79 */ + {"chdir", lx_chdir, 0, 1}, /* 80 */ + {"fchdir", lx_fchdir, 0, 1}, /* 81 */ + {"rename", lx_rename, 0, 2}, /* 82 */ + {"mkdir", lx_mkdir, 0, 2}, /* 83 */ + {"rmdir", NULL, 0, 1}, /* 84 */ + {"creat", lx_creat, 0, 2}, /* 85 */ + {"link", lx_link, 0, 2}, /* 86 */ + {"unlink", lx_unlink, 0, 1}, /* 87 */ + {"symlink", lx_symlink, 0, 2}, /* 88 */ + {"readlink", lx_readlink, 0, 3}, /* 89 */ + {"chmod", lx_chmod, 0, 2}, /* 90 */ + {"fchmod", lx_fchmod, 0, 2}, /* 91 */ + {"chown", lx_chown, 0, 3}, /* 92 */ + {"fchown", lx_fchown, 0, 3}, /* 93 */ + {"lchown", lx_lchown, 0, 3}, /* 94 */ + {"umask", lx_umask, 0, 1}, /* 95 */ + {"gettimeofday", lx_gettimeofday, 0, 2}, /* 96 */ + {"getrlimit", lx_getrlimit, 0, 2}, /* 97 */ + {"getrusage", lx_getrusage, 0, 2}, /* 98 */ + {"sysinfo", lx_sysinfo64, 0, 1}, /* 99 */ + {"times", lx_times, 0, 1}, /* 100 */ + {"ptrace", lx_ptrace, 0, 4}, /* 101 */ + {"getuid", lx_getuid, 0, 0}, /* 102 */ + {"syslog", lx_syslog, 0, 3}, /* 103 */ + {"getgid", lx_getgid, 0, 0}, /* 104 */ + {"setuid", lx_setuid, 0, 1}, /* 105 */ + {"setgid", lx_setgid, 0, 1}, /* 106 */ + {"geteuid", lx_geteuid, 0, 0}, /* 107 */ + {"getegid", lx_getegid, 0, 0}, /* 108 */ + {"setpgid", lx_setpgid, 0, 2}, /* 109 */ + {"getppid", lx_getppid, 0, 0}, /* 110 */ + {"getpgrp", lx_getpgrp, 0, 0}, /* 111 */ + {"setsid", lx_setsid, 0, 0}, /* 112 */ + {"setreuid", lx_setreuid, 0, 2}, /* 113 */ + {"setregid", lx_setregid, 0, 2}, /* 114 */ + {"getgroups", NULL, 0, 2}, /* 115 */ + {"setgroups", NULL, 0, 2}, /* 116 */ + {"setresuid", lx_setresuid, 0, 3}, /* 117 */ + {"getresuid", lx_getresuid, 0, 3}, /* 118 */ + {"setresgid", lx_setresgid, 0, 3}, /* 119 */ + {"getresgid", lx_getresgid, 0, 3}, /* 120 */ + {"getpgid", lx_getpgid, 0, 1}, /* 121 */ + {"setfsuid", lx_setfsuid, 0, 1}, 
/* 122 */ + {"setfsgid", lx_setfsgid, 0, 1}, /* 123 */ + {"getsid", lx_getsid, 0, 1}, /* 124 */ + {"capget", NULL, 0, 2}, /* 125 */ + {"capset", NULL, 0, 2}, /* 126 */ + {"rt_sigpending", NULL, 0, 2}, /* 127 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 128 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 129 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 130 */ + {"sigaltstack", NULL, 0, 2}, /* 131 */ + {"utime", NULL, 0, 2}, /* 132 */ + {"mknod", NULL, 0, 3}, /* 133 */ + {"uselib", NULL, NOSYS_KERNEL, 0}, /* 134 */ + {"personality", lx_personality, 0, 1}, /* 135 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 136 */ + {"statfs", NULL, 0, 2}, /* 137 */ + {"fstatfs", NULL, 0, 2}, /* 138 */ + {"sysfs", NULL, 0, 3}, /* 139 */ + {"getpriority", lx_getpriority, 0, 2}, /* 140 */ + {"setpriority", lx_setpriority, 0, 3}, /* 141 */ + {"sched_setparam", lx_sched_setparam, 0, 2}, /* 142 */ + {"sched_getparam", lx_sched_getparam, 0, 2}, /* 143 */ + {"sched_setscheduler", lx_sched_setscheduler, 0, 3}, /* 144 */ + {"sched_getscheduler", lx_sched_getscheduler, 0, 1}, /* 145 */ + {"sched_get_priority_max", lx_sched_get_priority_max, 0, 1}, /* 146 */ + {"sched_get_priority_min", lx_sched_get_priority_min, 0, 1}, /* 147 */ + {"sched_rr_get_interval", lx_sched_rr_get_interval, 0, 2}, /* 148 */ + {"mlock", lx_mlock, 0, 2}, /* 149 */ + {"munlock", lx_munlock, 0, 2}, /* 150 */ + {"mlockall", lx_mlockall, 0, 1}, /* 151 */ + {"munlockall", lx_munlockall, 0, 0}, /* 152 */ + {"vhangup", lx_vhangup, 0, 0}, /* 153 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 154 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 155 */ + {"sysctl", NULL, 0, 1}, /* 156 */ + {"prctl", lx_prctl, 0, 5}, /* 157 */ + {"arch_prctl", lx_arch_prctl, 0, 2}, /* 158 */ + {"adjtimex", NULL, 0, 1}, /* 159 */ + {"setrlimit", lx_setrlimit, 0, 2}, /* 160 */ + {"chroot", lx_chroot, 0, 1}, /* 161 */ + {"sync", lx_sync, 0, 0}, /* 162 */ + {"acct", lx_acct, 0, 1}, /* 163 */ + {"settimeofday", NULL, 0, 2}, /* 164 */ + {"mount", lx_mount, 0, 5}, /* 165 */ + 
{"umount2", lx_umount2, 0, 2}, /* 166 */ + {"swapon", lx_swapon, 0, 2}, /* 167 */ + {"swapoff", lx_swapoff, 0, 1}, /* 168 */ + {"reboot", lx_reboot, 0, 4}, /* 169 */ + {"sethostname", lx_sethostname, 0, 2}, /* 170 */ + {"setdomainname", lx_setdomainname, 0, 2}, /* 171 */ + {"iopl", NULL, NOSYS_NO_EQUIV, 0}, /* 172 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 173 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 174 */ + {"init_module", NULL, NOSYS_KERNEL, 0}, /* 175 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 176 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 177 */ + {"query_module", NULL, 0, 5}, /* 178 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 179 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 180 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 181 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 182 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 183 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 184 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 185 */ + {"gettid", lx_gettid, 0, 0}, /* 186 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 187 */ + {"setxattr", lx_setxattr, 0, 5}, /* 188 */ + {"lsetxattr", lx_lsetxattr, 0, 5}, /* 189 */ + {"fsetxattr", lx_fsetxattr, 0, 5}, /* 190 */ + {"getxattr", lx_getxattr, 0, 4}, /* 191 */ + {"lgetxattr", lx_lgetxattr, 0, 4}, /* 192 */ + {"fgetxattr", lx_fgetxattr, 0, 4}, /* 193 */ + {"listxattr", lx_listxattr, 0, 3}, /* 194 */ + {"llistxattr", lx_llistxattr, 0, 3}, /* 195 */ + {"flistxattr", lx_flistxattr, 0, 3}, /* 196 */ + {"removexattr", lx_removexattr, 0, 2}, /* 197 */ + {"lremovexattr", lx_lremovexattr, 0, 2}, /* 198 */ + {"fremovexattr", lx_fremovexattr, 0, 2}, /* 199 */ + {"tkill", lx_tkill, 0, 2}, /* 200 */ + {"time", lx_time, 0, 1}, /* 201 */ + {"futex", lx_futex, 0, 6}, /* 202 */ + {"sched_setaffinity", lx_sched_setaffinity, 0, 3}, /* 203 */ + {"sched_getaffinity", lx_sched_getaffinity, 0, 3}, /* 204 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 205 */ + {"io_setup", lx_io_setup, 0, 2}, /* 206 */ + 
{"io_destroy", lx_io_destroy, 0, 1}, /* 207 */ + {"io_getevents", lx_io_getevents, 0, 5}, /* 208 */ + {"io_submit", lx_io_submit, 0, 3}, /* 209 */ + {"io_cancel", lx_io_cancel, 0, 3}, /* 210 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 211 */ + {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 212 */ + {"epoll_create", lx_epoll_create, 0, 1}, /* 213 */ + {"epoll_ctl_old", NULL, NOSYS_NULL, 0}, /* 214 */ + {"epoll_wait_old", NULL, NOSYS_NULL, 0}, /* 215 */ + {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 216 */ + {"getdents64", lx_getdents64, 0, 3}, /* 217 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 218 */ + {"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */ + {"semtimedop", NULL, 0, 4}, /* 220 */ + {"fadvise64", lx_fadvise64, 0, 4}, /* 221 */ + {"timer_create", lx_timer_create, 0, 3}, /* 222 */ + {"timer_settime", NULL, 0, 4}, /* 223 */ + {"timer_gettime", NULL, 0, 2}, /* 224 */ + {"timer_getoverrun", NULL, 0, 1}, /* 225 */ + {"timer_delete", NULL, 0, 1}, /* 226 */ + {"clock_settime", lx_clock_settime, 0, 2}, /* 227 */ + {"clock_gettime", lx_clock_gettime, 0, 2}, /* 228 */ + {"clock_getres", lx_clock_getres, 0, 2}, /* 229 */ + {"clock_nanosleep", NULL, 0, 4}, /* 230 */ + {"exit_group", NULL, 0, 1}, /* 231 */ + {"epoll_wait", lx_epoll_wait, 0, 4}, /* 232 */ + {"epoll_ctl", lx_epoll_ctl, 0, 4}, /* 233 */ + {"tgkill", lx_tgkill, 0, 3}, /* 234 */ + {"utimes", NULL, 0, 2}, /* 235 */ + {"vserver", NULL, NOSYS_NULL, 0}, /* 236 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 237 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 238 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 239 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 240 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 241 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 242 */ + {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 243 */ + {"mq_notify", NULL, NOSYS_NULL, 0}, /* 244 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 245 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 246 */ + {"waitid", lx_waitid, 0, 4}, /* 
247 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 248 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 249 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 250 */ + {"ioprio_set", lx_ioprio_set, 0, 3}, /* 251 */ + {"ioprio_get", lx_ioprio_get, 0, 2}, /* 252 */ + {"inotify_init", NULL, 0, 0}, /* 253 */ + {"inotify_add_watch", NULL, 0, 3}, /* 254 */ + {"inotify_rm_watch", NULL, 0, 2}, /* 255 */ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 256 */ + {"openat", lx_openat, 0, 4}, /* 257 */ + {"mkdirat", lx_mkdirat, 0, 3}, /* 258 */ + {"mknodat", NULL, 0, 4}, /* 259 */ + {"fchownat", lx_fchownat, 0, 5}, /* 260 */ + {"futimesat", NULL, 0, 3}, /* 261 */ + {"fstatat64", lx_fstatat64, 0, 4}, /* 262 */ + {"unlinkat", lx_unlinkat, 0, 3}, /* 263 */ + {"renameat", lx_renameat, 0, 4}, /* 264 */ + {"linkat", lx_linkat, 0, 5}, /* 265 */ + {"symlinkat", lx_symlinkat, 0, 3}, /* 266 */ + {"readlinkat", lx_readlinkat, 0, 4}, /* 267 */ + {"fchmodat", lx_fchmodat, 0, 3}, /* 268 */ + {"faccessat", lx_faccessat, 0, 4}, /* 269 */ + {"pselect6", lx_pselect, 0, 6}, /* 270 */ + {"ppoll", lx_ppoll, 0, 5}, /* 271 */ + {"unshare", lx_unshare, 0, 1}, /* 272 */ + {"set_robust_list", lx_set_robust_list, 0, 2}, /* 273 */ + {"get_robust_list", lx_get_robust_list, 0, 3}, /* 274 */ + {"splice", lx_splice, 0, 6}, /* 275 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 276 */ + {"sync_file_range", lx_sync_file_range, 0, 4}, /* 277 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, /* 278 */ + {"move_pages", NULL, NOSYS_NULL, 0}, /* 279 */ + {"utimensat", NULL, 0, 4}, /* 280 */ + {"epoll_pwait", lx_epoll_pwait, 0, 5}, /* 281 */ + {"signalfd", NULL, 0, 3}, /* 282 */ + {"timerfd_create", NULL, 0, 2}, /* 283 */ + {"eventfd", lx_eventfd, 0, 1}, /* 284 */ + {"fallocate", lx_fallocate, 0, 4}, /* 285 */ + {"timerfd_settime", NULL, 0, 4}, /* 286 */ + {"timerfd_gettime", NULL, 0, 2}, /* 287 */ + {"accept4", lx_accept4, 0, 4}, /* 288 */ + {"signalfd4", NULL, 0, 4}, /* 289 */ + {"eventfd2", lx_eventfd2, 0, 2}, /* 290 */ + {"epoll_create1", 
lx_epoll_create1, 0, 1}, /* 291 */ + {"dup3", lx_dup3, 0, 3}, /* 292 */ + {"pipe2", lx_pipe2, 0, 2}, /* 293 */ + {"inotify_init1", NULL, 0, 1}, /* 294 */ + {"preadv", lx_preadv, 0, 4}, /* 295 */ + {"pwritev", lx_pwritev, 0, 4}, /* 296 */ + {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 297 */ + {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 298 */ + {"recvmmsg", lx_recvmmsg, 0, 5}, /* 299 */ + {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 300 */ + {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 301 */ + {"prlimit64", lx_prlimit64, 0, 4}, /* 302 */ + {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 303 */ + {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 304 */ + {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 305 */ + {"syncfs", lx_syncfs, 0, 1}, /* 306 */ + {"sendmmsg", lx_sendmmsg, 0, 4}, /* 307 */ + {"setns", NULL, NOSYS_NULL, 0}, /* 308 */ + {"getcpu", lx_getcpu, 0, 3}, /* 309 */ + {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 310 */ + {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 311 */ + {"kcmp", NULL, NOSYS_NULL, 0}, /* 312 */ + {"finit_module", NULL, NOSYS_NULL, 0}, /* 313 */ + {"sched_setattr", lx_sched_setattr, 0, 3}, /* 314 */ + {"sched_getattr", lx_sched_getattr, 0, 4}, /* 315 */ + {"renameat2", NULL, NOSYS_NULL, 0}, /* 316 */ + {"seccomp", NULL, NOSYS_NULL, 0}, /* 317 */ + {"getrandom", lx_getrandom, 0, 3}, /* 318 */ + {"memfd_create", NULL, NOSYS_NULL, 0}, /* 319 */ + {"kexec_file_load", NULL, NOSYS_NULL, 0}, /* 320 */ + {"bpf", NULL, NOSYS_NULL, 0}, /* 321 */ + {"execveat", NULL, NOSYS_NULL, 0}, /* 322 */ + + /* XXX TBD gap then x32 syscalls from 512 - 544 */ +}; +#endif diff --git a/usr/src/uts/common/brand/lx/procfs/lx_proc.h b/usr/src/uts/common/brand/lx/procfs/lx_proc.h new file mode 100644 index 0000000000..f4d18fffc1 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_proc.h @@ -0,0 +1,394 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. + */ + +#ifndef _LX_PROC_H +#define _LX_PROC_H + +#ifdef _LXPROC_NATIVE_H +#error Attempted to include branded lx_proc.h after native lxproc.h +#endif + +#define _LXPROC_BRANDED_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/nvpair.h> +#include <vm/as.h> +#include <vm/anon.h> + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define 
VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). + */ +typedef enum lxpr_nodetype { + LXPR_INVALID, /* nodes start at 1 */ + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_AUXV, /* /proc/<pid>/auxv */ + LXPR_PID_CGROUP, /* /proc/<pid>/cgroup */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_COMM, /* /proc/<pid>/comm */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_GIDMAP, /* /proc/<pid>/gid_map */ + LXPR_PID_LIMITS, /* /proc/<pid>/limits */ + LXPR_PID_LOGINUID, /* /proc/<pid>/loginuid */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_MOUNTINFO, /* /proc/<pid>/mountinfo */ + LXPR_PID_MOUNTS, /* /proc/<pid>/mounts */ + LXPR_PID_OOM_SCR_ADJ, /* /proc/<pid>/oom_score_adj */ + LXPR_PID_PERSONALITY, /* /proc/<pid>/personality */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_TASKDIR, /* /proc/<pid>/task */ + LXPR_PID_TASK_IDDIR, /* /proc/<pid>/task/<tid> */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_PID_FDINFODIR, /* /proc/<pid>/fdinfo */ + LXPR_PID_FDINFO_FD, /* /proc/<pid>/fdinfo/nn */ + LXPR_PID_UIDMAP, /* 
/proc/<pid>/uid_map */ + LXPR_PID_TID_AUXV, /* /proc/<pid>/task/<tid>/auxv */ + LXPR_PID_TID_CGROUP, /* /proc/<pid>/task/<tid>/cgroup */ + LXPR_PID_TID_CMDLINE, /* /proc/<pid>/task/<tid>/cmdline */ + LXPR_PID_TID_COMM, /* /proc/<pid>/task/<tid>/comm */ + LXPR_PID_TID_CPU, /* /proc/<pid>/task/<tid>/cpu */ + LXPR_PID_TID_CURDIR, /* /proc/<pid>/task/<tid>/cwd */ + LXPR_PID_TID_ENV, /* /proc/<pid>/task/<tid>/environ */ + LXPR_PID_TID_EXE, /* /proc/<pid>/task/<tid>/exe */ + LXPR_PID_TID_GIDMAP, /* /proc/<pid>/task/<tid>/gid_map */ + LXPR_PID_TID_LIMITS, /* /proc/<pid>/task/<tid>/limits */ + LXPR_PID_TID_LOGINUID, /* /proc/<pid>/task/<tid>/loginuid */ + LXPR_PID_TID_MAPS, /* /proc/<pid>/task/<tid>/maps */ + LXPR_PID_TID_MEM, /* /proc/<pid>/task/<tid>/mem */ + LXPR_PID_TID_MOUNTINFO, /* /proc/<pid>/task/<tid>/mountinfo */ + LXPR_PID_TID_OOM_SCR_ADJ, /* /proc/<pid>/task/<tid>/oom_score_adj */ + LXPR_PID_TID_PERSONALITY, /* /proc/<pid>/task/<tid>/personality */ + LXPR_PID_TID_ROOTDIR, /* /proc/<pid>/task/<tid>/root */ + LXPR_PID_TID_STAT, /* /proc/<pid>/task/<tid>/stat */ + LXPR_PID_TID_STATM, /* /proc/<pid>/task/<tid>/statm */ + LXPR_PID_TID_STATUS, /* /proc/<pid>/task/<tid>/status */ + LXPR_PID_TID_FDDIR, /* /proc/<pid>/task/<tid>/fd */ + LXPR_PID_TID_FD_FD, /* /proc/<pid>/task/<tid>/fd/nn */ + LXPR_PID_TID_FDINFODIR, /* /proc/<pid>/task/<tid>/fdinfo */ + LXPR_PID_TID_FDINFO_FD, /* /proc/<pid>/task/<tid>/fdinfo/nn */ + LXPR_PID_TID_UIDMAP, /* /proc/<pid>/task/<tid>/uid_map */ + LXPR_CGROUPS, /* /proc/cgroups */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DISKSTATS, /* /proc/diskstats */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MODULES, /* /proc/modules */ + 
LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IF_INET6, /* /proc/net/if_inet6 */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_IPV6_ROUTE, /* /proc/net/ipv6_route */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_TCP6, /* /proc/net/tcp6 */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UDP6, /* /proc/net/udp6 */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_SWAPS, /* /proc/swaps */ + LXPR_SYSDIR, /* /proc/sys/ */ + LXPR_SYS_FSDIR, /* /proc/sys/fs/ */ + LXPR_SYS_FS_AIO_MAX_NR, /* /proc/sys/fs/aio-max-nr */ + LXPR_SYS_FS_AIO_NR, /* /proc/sys/fs/aio-nr */ + LXPR_SYS_FS_FILEMAX, /* /proc/sys/fs/file-max */ + LXPR_SYS_FS_FILENR, /* /proc/sys/fs/file-nr */ + LXPR_SYS_FS_INOTIFYDIR, /* /proc/sys/fs/inotify */ + LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS, /* inotify/max_queued_events */ + LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES, /* inotify/max_user_instances */ + LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES, /* inotify/max_user_watches */ + LXPR_SYS_FS_PIPE_MAX, /* /proc/sys/fs/pipe-max-size */ + LXPR_SYS_KERNELDIR, /* /proc/sys/kernel/ */ + LXPR_SYS_KERNEL_CAPLCAP, /* /proc/sys/kernel/cap_last_cap */ + LXPR_SYS_KERNEL_COREPATT, /* /proc/sys/kernel/core_pattern */ + LXPR_SYS_KERNEL_HOSTNAME, /* /proc/sys/kernel/hostname */ + LXPR_SYS_KERNEL_MSGMAX, /* /proc/sys/kernel/msgmax */ 
+ LXPR_SYS_KERNEL_MSGMNB, /* /proc/sys/kernel/msgmnb */ + LXPR_SYS_KERNEL_MSGMNI, /* /proc/sys/kernel/msgmni */ + LXPR_SYS_KERNEL_NGROUPS_MAX, /* /proc/sys/kernel/ngroups_max */ + LXPR_SYS_KERNEL_OSREL, /* /proc/sys/kernel/osrelease */ + LXPR_SYS_KERNEL_PID_MAX, /* /proc/sys/kernel/pid_max */ + LXPR_SYS_KERNEL_RANDDIR, /* /proc/sys/kernel/random */ + LXPR_SYS_KERNEL_RAND_BOOTID, /* /proc/sys/kernel/random/boot_id */ + LXPR_SYS_KERNEL_RAND_ENTAVL, /* /proc/sys/kernel/random/entropy_avail */ + LXPR_SYS_KERNEL_RAND_UUID, /* /proc/sys/kernel/random/uuid */ + LXPR_SYS_KERNEL_SEM, /* /proc/sys/kernel/sem */ + LXPR_SYS_KERNEL_SHMALL, /* /proc/sys/kernel/shmall */ + LXPR_SYS_KERNEL_SHMMAX, /* /proc/sys/kernel/shmmax */ + LXPR_SYS_KERNEL_SHMMNI, /* /proc/sys/kernel/shmmni */ + LXPR_SYS_KERNEL_THREADS_MAX, /* /proc/sys/kernel/threads-max */ + LXPR_SYS_NETDIR, /* /proc/sys/net */ + LXPR_SYS_NET_COREDIR, /* /proc/sys/net/core */ + LXPR_SYS_NET_CORE_SOMAXCON, /* /proc/sys/net/core/somaxconn */ + LXPR_SYS_NET_IPV4DIR, /* /proc/sys/net/ipv4 */ + LXPR_SYS_NET_IPV4_ICMP_EIB, /* .../icmp_echo_ignore_broadcasts */ + LXPR_SYS_NET_IPV4_IP_FORWARD, /* .../net/ipv4/ip_forward */ + LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, /* .../net/ipv4/ip_local_port_range */ + /* .../tcp_allowed_congestion_control */ + LXPR_SYS_NET_IPV4_TCP_CC_ALLOW, + /* .../tcp_available_congestion_control */ + LXPR_SYS_NET_IPV4_TCP_CC_AVAIL, + /* .../tcp_congestion_control */ + LXPR_SYS_NET_IPV4_TCP_CC_CURR, + LXPR_SYS_NET_IPV4_TCP_FIN_TO, /* /proc/sys/net/ipv4/tcp_fin_timeout */ + LXPR_SYS_NET_IPV4_TCP_KA_INT, /* .../net/ipv4/tcp_keepalive_intvl */ + LXPR_SYS_NET_IPV4_TCP_KA_TIM, /* .../net/ipv4/tcp_keepalive_time */ + LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL, /* .../net/ipv4/tcp_max_syn_backlog */ + LXPR_SYS_NET_IPV4_TCP_RETRY2, /* /proc/sys/net/ipv4/tcp_retries2 */ + LXPR_SYS_NET_IPV4_TCP_RMEM, /* /proc/sys/net/ipv4/tcp_rmem */ + LXPR_SYS_NET_IPV4_TCP_SACK, /* /proc/sys/net/ipv4/tcp_sack */ + LXPR_SYS_NET_IPV4_TCP_WINSCALE, /* 
.../net/ipv4/tcp_window_scaling */ + LXPR_SYS_NET_IPV4_TCP_WMEM, /* /proc/sys/net/ipv4/tcp_wmem */ + LXPR_SYS_VMDIR, /* /proc/sys/vm */ + LXPR_SYS_VM_DIRTY_BG_BYTES, /* .../vm/dirty_background_bytes */ + LXPR_SYS_VM_DIRTY_BG_RATIO, /* .../vm/dirty_background_ratio */ + LXPR_SYS_VM_DIRTY_BYTES, /* /proc/sys/vm/dirty_bytes */ + LXPR_SYS_VM_DIRTY_EXP_CS, /* .../vm/dirty_expire_centisecs */ + LXPR_SYS_VM_DIRTY_RATIO, /* /proc/sys/vm/dirty_ratio */ + LXPR_SYS_VM_DIRTYTIME_EXP_SEC, /* .../vm/dirtytime_expire_seconds */ + LXPR_SYS_VM_DIRTY_WB_CS, /* .../vm/dirty_writeback_centisecs */ + LXPR_SYS_VM_MAX_MAP_CNT, /* /proc/sys/vm/max_map_count */ + LXPR_SYS_VM_MINFR_KB, /* /proc/sys/vm/min_free_kbytes */ + LXPR_SYS_VM_NHUGEP, /* /proc/sys/vm/nr_hugepages */ + LXPR_SYS_VM_OVERCOMMIT_MEM, /* /proc/sys/vm/overcommit_memory */ + LXPR_SYS_VM_SWAPPINESS, /* /proc/sys/vm/swappiness */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_VMSTAT, /* /proc/vmstat */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * Linux sector size for /proc/diskstats + */ +#define LXPR_SECTOR_SIZE 512 + +/* + * external dirent characteristics + */ +typedef struct { + lxpr_nodetype_t d_type; + char *d_name; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of 
proc referred to */ + uint_t lxpr_desc; /* addl. descriptor (fd or tid) */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern boolean_t lxpr_is_writable(lxpr_nodetype_t); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); +extern vnode_t *lxpr_lookup_fdnode(vnode_t *, const char *); +extern int lxpr_readlink_fdnode(lxpr_node_t *, char *, size_t); +extern vnode_t *lxpr_lookup_fdinfonode(vnode_t *, const char *); +extern int lxpr_open_flags_convert(offset_t, uint32_t); + +typedef struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +} lxpr_uiobuf_t; + +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern boolean_t lxpr_uiobuf_nonblock(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +extern int lxpr_core_path_l2s(const char *, char *, size_t); +extern int lxpr_core_path_s2l(const char *, char *, size_t); + +typedef enum 
lxpr_zombok { + NO_ZOMB = 0, + ZOMB_OK +} zombok_t; + +extern proc_t *lxpr_lock(lxpr_node_t *, zombok_t); +extern proc_t *lxpr_lock_pid(lxpr_node_t *, pid_t, zombok_t, kthread_t **); +extern void lxpr_unlock(proc_t *); +extern netstack_t *lxpr_netstack(lxpr_node_t *); +extern void lxpr_fixpid(zone_t *, proc_t *, pid_t *, pid_t *); +extern file_t *lxpr_getf(proc_t *, uint_t, short *); +extern void lxpr_releasef(proc_t *, uint_t); + +#ifdef __cplusplus +} +#endif + +#ifndef islower +#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z')) +#endif +#ifndef toupper +#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x)) +#endif + +#endif /* _LX_PROC_H */ diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c new file mode 100644 index 0000000000..967d594913 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c @@ -0,0 +1,1065 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 
+ */ + +/* + * lxprsubr.c: Various functions for the /lxproc vnodeops. + */ + +#include <sys/varargs.h> + +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> +#include <sys/brand.h> +#include <sys/fcntl.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> + +#include "lx_proc.h" + +#define LXPRCACHE_NAME "lxbpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +int lx_pr_bufsize = 4000; /* bytes of staging buffer allocated per lxpr_uiobuf */ + +struct lxpr_zfs_ds { + list_node_t ds_link; + char ds_name[MAXPATHLEN]; + uint64_t ds_cookie; +}; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lx_pr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; /* data area directly follows the header */ + uiobuf->buffsize = bufsize; /* remembered so the free size matches */ + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); /* nothing may be left unflushed */ + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; /* only the uio offset moves; buffer state is untouched */ +} + +boolean_t +lxpr_uiobuf_nonblock(struct lxpr_uiobuf *uiobuf) +{ + if ((uiobuf->uiop->uio_fmode & FNONBLOCK) != 0) + return (B_TRUE); + return (B_FALSE); +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); /* an earlier error must not be overwritten */ + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; /* bytes currently staged */ + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if
(beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +/* + * Format text into the uiobuf. Output up to TYPBUFFSIZE - 1 bytes is built + * in a stack buffer; anything longer is formatted a second time into a + * temporary heap buffer of exactly the required size. Nothing is written + * once an error has been recorded or the uio has no space left. + */ +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + /* Try using stack allocated buffer */ + va_start(args, fmt); + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + va_end(args); + if (len < TYPBUFFSIZE) { + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * The va_list is restarted rather than reused, since it has already + * been handed to vsnprintf() once. We know we allocated the correct + * amount of space so no check on the return value. + */ + va_start(args, fmt); + (void) vsnprintf(buffer, len + 1, fmt, args); + va_end(args); + lxpr_uiobuf_write(uiobuf, buffer, len); + kmem_free(buffer, len + 1); +} + +/* + * Lookup process, potentially constrained by pid associated with lxpr_node and + * return with p_lock and P_PR_LOCK held.
+ */ +proc_t * +lxpr_lock_pid(lxpr_node_t *lxpnp, pid_t pid, zombok_t zombie_ok, + kthread_t **tp) +{ + zone_t *zone = LXPTOZ(lxpnp); + proc_t *p; + kthread_t *t; + lx_pid_flag_t flags = LXP_PRLOCK; + + ASSERT(!MUTEX_HELD(&pidlock)); + + /* Consider zsched to be invisible to LX */ + if (pid == zone->zone_zsched->p_pid) { + return (NULL); + } + if (zombie_ok == ZOMB_OK) { + flags |= LXP_ZOMBOK; + } + +retry: + if (lx_lpid_lock(pid, zone, flags, &p, &t) != 0) { + return (NULL); + } + + /* + * Make sure that thread lookups (where non-main LX threads are + * assigned a pid not equal to the encompassing parent) match the pid + * of the encompassing directory. This must be performed carefully for + * the Linux pid 1 as it will not equal the native pid despite the + * process matching. + * + * This is necessary to constrain paths such as /proc/<pid>/task/<tid>. + */ + if (lxpnp->lxpr_pid != 0 && lxpnp->lxpr_pid != pid && + !(pid == 1 && lxpnp->lxpr_pid == zone->zone_proc_initpid)) { + klwp_t *lwp; + lx_lwp_data_t *lwpd; + + /* + * Only LWPs of branded processes will be accessible this way. + * The threads of native processes lack pid assignments which + * LX uses to emulate Linux's weird thread/process model. + */ + if ((lwp = ttolwp(t)) == NULL || + (lwpd = lwptolxlwp(lwp)) == NULL || + lwpd->br_pid != pid) { + sprunlock(p); + return (NULL); + } + } + + if (zombie_ok == NO_ZOMB && + ((p->p_flag & SEXITING) || p->p_stat == SZOMB)) { + sprunlock(p); + return (NULL); + } + + /* + * Accessing a process which is undergoing exec(2) is somewhat risky. + * In particular, the p_exec field is updated outside p_lock. To avoid + * this mess, access is denied when P_PR_EXEC is set unless the caller + * happens to be the process itself. This allows actions such as + * re-exec()-ing /proc/<pid>/exe to make forward progress. + * + * All other callers must block until the flag is cleared.
+ */ + if ((p->p_proc_flag & P_PR_EXEC) != 0) { + if (p != curproc) { + kmutex_t *mp; + + /* + * Drop PR_LOCK and wait for the exec() to ping the CV + * once it has completed. Afterward, the pid is looked + * up again in case the process exited for some reason. + */ + mp = &p->p_lock; + sprunprlock(p); + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + goto retry; + } + } + + if (tp != NULL) { + *tp = t; + } + return (p); +} + +netstack_t * +lxpr_netstack(lxpr_node_t *lxpnp) +{ + return (netstack_hold_if_active(LXPTOZ(lxpnp)->zone_netstack)); +} + +/* + * Lookup process from pid associated with lxpr_node and return with p_lock and + * P_PR_LOCK held. + */ +proc_t * +lxpr_lock(lxpr_node_t *lxpnp, zombok_t zombie_ok) +{ + return (lxpr_lock_pid(lxpnp, lxpnp->lxpr_pid, zombie_ok, NULL)); +} + +/* + * Report the pid and parent pid of p as they should appear inside the LX + * zone: the zone's init process is presented as pid 1 with parent 0, and a + * process whose parent is init or zsched, or which has no valid parent + * inside the zone, is presented with parent 1. The pid is stored through + * pidp; ppidp may be NULL when the parent pid is not wanted. + */ +void +lxpr_fixpid(zone_t *zone, proc_t *p, pid_t *pidp, pid_t *ppidp) +{ + pid_t pid; + pid_t ppid; + + ASSERT(p != NULL); + ASSERT(pidp != NULL); + ASSERT(zone->zone_brand == &lx_brand); + + /* Read from p only after it has been asserted to be non-NULL. */ + pid = p->p_pid; + ppid = p->p_ppid; + ASSERT(pid != zone->zone_zsched->p_pid); + + if (pid == zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + if (ppid == zone->zone_proc_initpid) { + /* + * Convert ppid to the Linux default of 1 if our parent + * is the zone's init process + */ + ppid = 1; + } else if (ppid == zone->zone_zsched->p_pid || + (p->p_flag & SZONETOP) != 0) { + /* + * Additionally, if the process has no valid parent + * inside the zone (or its parent is zsched), lie and + * claim init as the parent.
+ */ + ppid = 1; + } + } + + *pidp = pid; + if (ppidp != NULL) { + *ppidp = ppid; + } +} + +/* + * lxpr_unlock() + * + * Unlock locked process: clears P_PR_LOCK and drops p_lock, both of which + * the caller must hold on entry. + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); /* wake one waiter on this process's slot CV */ + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); +} + +file_t * +lxpr_getf(proc_t *p, uint_t fd, short *flag) +{ + uf_entry_t *ufp; + uf_info_t *fip; + file_t *fp; + + ASSERT(MUTEX_HELD(&p->p_lock) && (p->p_proc_flag & P_PR_LOCK)); + + fip = P_FINFO(p); + + if (fd >= fip->fi_nfiles) + return (NULL); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. + */ + mutex_exit(&p->p_lock); + mutex_enter(&fip->fi_lock); + UF_ENTER(ufp, fip, fd); + if ((fp = ufp->uf_file) != NULL && fp->f_count > 0) { + if (flag != NULL) + *flag = ufp->uf_flag; + ufp->uf_refcnt++; /* dropped again by lxpr_releasef() */ + } else { + fp = NULL; + } + UF_EXIT(ufp); + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); /* return with p_lock held, as on entry */ + + return (fp); +} + +void +lxpr_releasef(proc_t *p, uint_t fd) +{ + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(MUTEX_HELD(&p->p_lock) && (p->p_proc_flag & P_PR_LOCK)); + + fip = P_FINFO(p); + + mutex_exit(&p->p_lock); + mutex_enter(&fip->fi_lock); + UF_ENTER(ufp, fip, fd); + ASSERT3U(ufp->uf_refcnt, >, 0); + ufp->uf_refcnt--; /* undo the reference taken by lxpr_getf() */ + UF_EXIT(ufp); + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); /* return with p_lock held, as on entry */ +} + + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; /* back-pointer from the vnode to its lxpr_node */
+ + return (0); +} + +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); /* frees the vnode allocated by the constructor */ +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node. Every value produced is + * offset from maxpid; desc is only consulted for LXPR_PID_TASK_IDDIR and + * LXPR_PID_FD_FD nodes. + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int desc) +{ + switch (type) { + case LXPR_PIDDIR: + return (maxpid + pid + 1); + case LXPR_PID_TASK_IDDIR: + return (maxpid + (desc * 10)); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + desc); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * The node comes from lxpr_node_cache, whose constructor has already + * attached a vnode to it. A hold is placed on the parent vnode dp. + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int desc) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_inactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + lxpnp->lxpr_desc = desc; + VN_HOLD(dp); /* dropped in lxpr_freenode() */ + if (p != NULL) { + lxpr_node_t *dlxpnp = VTOLXP(dp); + + lxpnp->lxpr_pid = p->p_pid; + /* Propagate the tid whenever possible.
*/ + if (desc == 0 && dlxpnp->lxpr_desc != 0) { + lxpnp->lxpr_desc = dlxpnp->lxpr_desc; + } + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, desc); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + if (lxpr_is_writable(type)) { + /* These two have different modes; handled later. */ + if (type != LXPR_PID_FD_FD && type != LXPR_PID_TID_FD_FD) { + vp->v_type = VREG; + lxpnp->lxpr_mode = 0644; /* owner read-write, everyone else read-only */ + return (lxpnp); + } + } + + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transitions from SRUN to SZOMB + * while p_lock is held. Aside from this, the only other + * p_stat transition that we need to be aware about is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + lxpnp->lxpr_realvp = NULL; + } else { + ASSERT(MUTEX_HELD(&p->p_lock)); + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything !
*/ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + lxpnp->lxpr_realvp = NULL; + } else { + ASSERT(MUTEX_HELD(&p->p_lock)); + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; /* may be NULL, in which case no hold is taken */ + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_TASKDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_TASK_IDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_FD_FD: + case LXPR_PID_TID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + vp->v_type = VLNK; + break; + + case LXPR_PID_FDINFO_FD: + case LXPR_PID_TID_FDINFO_FD: + ASSERT(p != NULL); + lxpnp->lxpr_mode = 0400; /* read by owner only */ + vp->v_type = VREG; + break; + + case LXPR_PID_FDDIR: + case LXPR_PID_TID_FDDIR: + case LXPR_PID_FDINFODIR: + case LXPR_PID_TID_FDINFODIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; /* read-search by owner, search only by others */ + break; + + case LXPR_NETDIR: + case LXPR_SYSDIR: + case LXPR_SYS_FSDIR: + case LXPR_SYS_FS_INOTIFYDIR: + case LXPR_SYS_KERNELDIR: + case LXPR_SYS_KERNEL_RANDDIR: + case LXPR_SYS_NETDIR: + case LXPR_SYS_NET_COREDIR: + case LXPR_SYS_NET_IPV4DIR: +
case LXPR_SYS_VMDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_AUXV: + case LXPR_PID_PERSONALITY: + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). + */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} + +/* + * Parse a directory entry name as a file descriptor number. The name must + * consist solely of decimal digits and the value must fit in a non-negative + * int. Returns the descriptor, or -1 if the name is not acceptable. + */ +static int +lxpr_parse_fdnode_num(const char *name) +{ + char *endptr = NULL; + long num; + int fd; + + if (ddi_strtol(name, &endptr, 10, &num) != 0) { + return (-1); + } else if (name[0] < '0' || name[0] > '9' || *endptr != '\0') { + /* + * ddi_strtol allows leading spaces and trailing garbage + * We do not tolerate such foolishness. + */ + return (-1); + } + + /* + * Reject values which do not survive the narrowing to int, so that + * where long is wider than int a name such as "4294967296" is not + * silently treated as descriptor 0. + */ + fd = (int)num; + if (fd < 0 || (long)fd != num) { + return (-1); + } + return (fd); +} + +/* + * Attempt to locate vnode for /proc/<pid>/fd/<#>. + */ +vnode_t * +lxpr_lookup_fdnode(vnode_t *dvp, const char *name) +{ + lxpr_node_t *lxdp = VTOLXP(dvp); + lxpr_node_t *lxfp; + int fd; + proc_t *p; + vnode_t *vp = NULL; + file_t *fp; + + ASSERT(lxdp->lxpr_type == LXPR_PID_FDDIR || + lxdp->lxpr_type == LXPR_PID_TID_FDDIR); + + if ((fd = lxpr_parse_fdnode_num(name)) == -1) + return (NULL); + + /* Lock the owner process */ + if ((p = lxpr_lock(lxdp, NO_ZOMB)) == NULL) + return (NULL); + + /* Not applicable to processes which are system-owned.
*/ + if (p->p_as == &kas) { + lxpr_unlock(p); + return (NULL); + } + + lxfp = lxpr_getnode(dvp, LXPR_PID_FD_FD, p, fd); + + if ((fp = lxpr_getf(p, fd, NULL)) != NULL) { + vp = fp->f_vnode; + VN_HOLD(vp); + lxpr_releasef(p, fd); + } + + if (vp == NULL) { + lxpr_unlock(p); + lxpr_freenode(lxfp); + return (NULL); + } + + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxfp->lxpr_realvp = vp; + vp = LXPTOV(lxfp); + + /* + * For certain entries (sockets, pipes, etc), Linux expects a + * bogus-named symlink. If that's the case, report the type as + * VNON to bypass link-following elsewhere in the vfs system. + * + * See lxpr_readlink for more details. + */ + if (lxpr_readlink_fdnode(lxfp, NULL, 0) == 0) + vp->v_type = VNON; + + lxpr_unlock(p); + ASSERT(vp != NULL); + return (vp); +} + +/* + * Attempt to create Linux-proc-style fake symlink contents for supported + * /proc/<pid>/fd/<#> entries. + * + * Only sockets and FIFOs are supported. Returns 0 if the node's real vnode + * is of a supported type and its attributes could be fetched, -1 otherwise. + * When bp is not NULL the link text is formatted into it (at most len + * bytes); passing NULL merely tests whether a fake link applies. + */ +int +lxpr_readlink_fdnode(lxpr_node_t *lxpnp, char *bp, size_t len) +{ + const char *format; + vnode_t *rvp = lxpnp->lxpr_realvp; + vattr_t attr; + + switch (rvp->v_type) { + case VSOCK: + format = "socket:[%lu]"; + break; + case VFIFO: + format = "pipe:[%lu]"; + break; + default: + return (-1); + } + + /* + * Fetch the inode of the underlying vnode. va_mask tells the + * underlying file system which attributes are wanted, so it must be + * initialised before the call. + */ + attr.va_mask = AT_NODEID; + if (VOP_GETATTR(rvp, &attr, 0, CRED(), NULL) != 0) + return (-1); + + if (bp != NULL) + (void) snprintf(bp, len, format, (ino_t)attr.va_nodeid); + return (0); +} + +/* + * Attempt to locate vnode for /proc/<pid>/fdinfo/<#>.
+ */ +vnode_t * +lxpr_lookup_fdinfonode(vnode_t *dvp, const char *name) +{ + lxpr_node_t *lxdp = VTOLXP(dvp); + lxpr_node_t *lxfp; + proc_t *p; + int fd; + + ASSERT(lxdp->lxpr_type == LXPR_PID_FDINFODIR); /* NOTE(review): LXPR_PID_TID_FDINFODIR is not accepted here, unlike the TID variant in lxpr_lookup_fdnode() -- confirm */ + + if ((fd = lxpr_parse_fdnode_num(name)) == -1) + return (NULL); + + /* Lock the owner process */ + if ((p = lxpr_lock(lxdp, NO_ZOMB)) == NULL) + return (NULL); + + /* Not applicable to processes which are system-owned. */ + if (p->p_as == &kas) { + lxpr_unlock(p); + return (NULL); + } + + lxfp = lxpr_getnode(dvp, LXPR_PID_FDINFO_FD, p, fd); /* the node is created without checking here that fd is open */ + + lxpr_unlock(p); + ASSERT(LXPTOV(lxfp) != NULL); + return (LXPTOV(lxfp)); +} + +/* + * Translate native file flags to Linux open flags. + * + * f_flag supplies the access mode and the open-file flags; uf_flag is only + * consulted for FD_CLOEXEC. Flags without a translation below are dropped. + */ +int +lxpr_open_flags_convert(offset_t uf_flag, uint32_t f_flag) +{ + int flags = 0; + + switch (f_flag & (FREAD | FWRITE)) { + case FREAD: + flags = LX_O_RDONLY; + break; + case FWRITE: + flags = LX_O_WRONLY; + break; + case FREAD | FWRITE: + flags = LX_O_RDWR; + break; + } + + if (f_flag & FNDELAY) + flags |= LX_O_NDELAY; + if (f_flag & FAPPEND) + flags |= LX_O_APPEND; + if (f_flag & FSYNC) + flags |= LX_O_SYNC; + if (f_flag & FNONBLOCK) + flags |= LX_O_NONBLOCK; + + if (f_flag & FCREAT) + flags |= LX_O_CREAT; + if (f_flag & FTRUNC) + flags |= LX_O_TRUNC; + if (f_flag & FEXCL) + flags |= LX_O_EXCL; + if (f_flag & FASYNC) + flags |= LX_O_ASYNC; + if (f_flag & FOFFMAX) + flags |= LX_O_LARGEFILE; + if (f_flag & FNOCTTY) + flags |= LX_O_NOCTTY; + if (f_flag & FNOFOLLOW) + flags |= LX_O_NOFOLLOW; + + if (f_flag & FDIRECT) + flags |= LX_O_DIRECT; + if (f_flag & __FLXPATH) + flags |= LX_O_PATH; + + if (uf_flag & FD_CLOEXEC) + flags |= LX_O_CLOEXEC; /* the only flag taken from uf_flag */ + + return (flags); +} + +/* + * Translate a Linux core_pattern path to a native illumos one, by replacing + * the appropriate % escape sequences. + * + * Any % escape sequences that are not recognised are double-escaped so that + * they will be inserted literally into the path (to mimic Linux).
+ * + * Returns 0 on success, with outp NUL-terminated, or EINVAL if outsz is + * zero or a translated escape sequence would not fit in outp. + */ +int +lxpr_core_path_l2s(const char *inp, char *outp, size_t outsz) +{ + int i = 0, j = 0; + char x; + + /* outsz - 1 below would wrap around for an empty buffer. */ + if (outsz == 0) + return (EINVAL); + + while (j < outsz - 1) { + x = inp[i++]; + if (x == '\0') + break; + if (x != '%') { + outp[j++] = x; + continue; + } + + x = inp[i++]; + if (x == '\0') + break; + + /* Make sure we have enough space in the output buffer. */ + if (j + 2 >= outsz - 1) + return (EINVAL); + + switch (x) { + case 'E': + if (j + 4 >= outsz - 1) + return (EINVAL); + outp[j++] = '%'; + outp[j++] = 'd'; + outp[j++] = '%'; + outp[j++] = 'f'; + break; + case 'e': + outp[j++] = '%'; + outp[j++] = 'f'; + break; + case 'p': + case 'g': + case 'u': + case 't': + case '%': + outp[j++] = '%'; + outp[j++] = x; + break; + case 'h': + outp[j++] = '%'; + outp[j++] = 'n'; + break; + default: + /* No translation, make it literal. */ + if (j + 3 >= outsz - 1) + return (EINVAL); + outp[j++] = '%'; + outp[j++] = '%'; + outp[j++] = x; + break; + } + } + + outp[j] = '\0'; + return (0); +} + +/* + * Translate an illumos core pattern path back to Linux format. + * + * Returns 0 on success, with outp NUL-terminated, or EINVAL if outsz is + * zero or a translated escape sequence would not fit in outp. + */ +int +lxpr_core_path_s2l(const char *inp, char *outp, size_t outsz) +{ + int i = 0, j = 0; + char x; + + /* outsz - 1 below would wrap around for an empty buffer. */ + if (outsz == 0) + return (EINVAL); + + while (j < outsz - 1) { + x = inp[i++]; + if (x == '\0') + break; + if (x != '%') { + outp[j++] = x; + continue; + } + + x = inp[i++]; + if (x == '\0') + break; + + /* Make sure we have enough space in the output buffer. */ + if (j + 2 >= outsz - 1) + return (EINVAL); + + switch (x) { + case 'd': + /* No Linux equivalent unless it's %d%f. */ + if (inp[i] == '%' && inp[i + 1] == 'f') { + i += 2; + outp[j++] = '%'; + outp[j++] = 'E'; + } + break; + case 'f': + outp[j++] = '%'; + outp[j++] = 'e'; + break; + case 'p': + case 'P': + case 'g': + case 'u': + case 't': + case '%': + outp[j++] = '%'; + outp[j++] = (x == 'P' ? 'p' : x); + break; + case 'n': + outp[j++] = '%'; + outp[j++] = 'h'; + break; + default: + /* No translation.
*/ + break; + } + } + + outp[j] = '\0'; + return (0); +} diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c new file mode 100644 index 0000000000..b4dc5091c2 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c @@ -0,0 +1,377 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +/* + * lxprvfsops.c: vfs operations for /lxprocfs. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> + +#include "lx_proc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_proc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx brand procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module; nothing is torn down if this fails + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; /* remembered for lxpr_mount(), lxpr_statvfs() and _fini() */ + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); /* NOTE(review): lxpr_mount_lock stays initialised on this path -- confirm no mutex_destroy() is needed */ + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); /* NOTE(review): as above, the mutex is not destroyed here -- confirm */ + } + + /* + * Assign a unique "device" number (used by stat(2)).
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialise cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +/* + * Mount lx_proc on mvp. The caller must pass the mount policy check, mvp + * must be a directory and the calling process must be in an LX zone. + * Everything acquired here (the lxpr_mnt_t and its LDI ident) is given + * back again on each failure path. + */ +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + /* + * Mounting lx_proc is not allowed outside an LX zone. + */ + if (zone->zone_brand != &lx_brand) { + return (ENOTSUP); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + /* Give back the ident obtained above before bailing out. */ + ldi_ident_release(li); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * Hold a zone reference for access to the lxzd structure.
+ */ + zone_hold(lxpr_mnt->lxprm_zone = zone); /* dropped by zone_rele() in lxpr_unmount() */ + + /* + * Allocate the first vnode and arbitrarily set the parent vnode to the + * mounted over directory + */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; /* read back by lxpr_unmount() and lxpr_root() */ + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); /* vnode of the mount's root node */ + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point.
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); /* the returned count is not used afterwards */ + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); /* pairs with zone_hold() in lxpr_mount() */ + + ldi_ident_release(lxpr_mnt->lxprm_li); /* pairs with ldi_ident_from_mod() in lxpr_mount() */ + + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); /* the root vnode is returned held */ + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; /* v.v_proc less nproc; reported below as the free file count */ + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + + /* We know f_fstr is 32 chars */ + (void) strcpy(sp->f_fstr, "/proc"); + (void) strcpy(&sp->f_fstr[6], "/proc"); /* second string, placed just past the first one's NUL at index 5 */ + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c new file mode 100644 index 0000000000..6d4d2357eb --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c @@ -0,0 +1,8553 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and
Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. + */ + +/* + * lx_proc -- a Linux-compatible /proc for the LX brand + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/queue.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <lx_signum.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <lx_auxv.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/fcntl.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> +#include <sys/param.h> +#include <sys/utsname.h> +#include <sys/rctl.h> +#include <sys/kstat.h> +#include <sys/lx_misc.h> +#include <sys/lx_types.h> +#include <sys/lx_userhz.h> +#include <sys/brand.h> +#include <sys/cred_impl.h> +#include <sys/tihdr.h> +#include <sys/corectl.h> +#include <sys/rctl_impl.h> +#include <inet/cc.h> +#include <inet/ip.h> +#include <inet/ip_ire.h> +#include <inet/ip6.h> 
+#include <inet/ip_if.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/udp_impl.h> +#include <inet/ipclassifier.h> +#include <sys/socketvar.h> +#include <fs/sockfs/socktpi.h> +#include <sys/random.h> +#include <sys/procfs.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); +extern int prreadcmdline(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); +extern int prreadbuf(proc_t *, uintptr_t, uint8_t *, size_t, size_t *); + +#include "lx_proc.h" + +extern pgcnt_t swapfs_minfree; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_create(struct vnode *, char *, struct vattr *, enum vcexcl, + int, struct vnode **, struct cred *, int, caller_context_t *, vsecattr_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_write(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_space(vnode_t *, int, flock64_t *, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_setattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_poll(vnode_t 
*, short, int, short *, pollhead_t **, + caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static int lxpr_doaccess(lxpr_node_t *, boolean_t, int, int, cred_t *, + caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fdinfodir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sysdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_fsdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_fs_inotifydir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_kerneldir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_kdir_randdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_netdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_net_coredir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_net_ipv4dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_vmdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_taskdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_task_tid_dir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fdinfodir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sysdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_fsdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_kerneldir(lxpr_node_t *, uio_t *, int *); +static int 
lxpr_readdir_sys_kdir_randdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_netdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_net_coredir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_vmdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_taskdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_task_tid_dir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cgroups(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_devices(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_diskstats(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fdinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_filesystems(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_swaps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_vmstat(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_auxv(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_cgroup(lxpr_node_t *, lxpr_uiobuf_t *); +static void 
lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_env(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_id_map(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_limits(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_loginuid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_mountinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_oom_scr_adj(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_personality(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_tid_comm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_tid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_tid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_if_inet6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ipv6_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void 
lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_aiomax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_aionr(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_filemax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_filenr(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_pipe_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_caplcap(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_corepatt(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_hostname(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_msgmax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_msgmnb(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_msgmni(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_osrel(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_pid_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_rand_entavl(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_rand_uuid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_sem(lxpr_node_t *, lxpr_uiobuf_t *); +static void 
lxpr_read_sys_kernel_shmall(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_shmmax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_shmmni(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_threads_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_core_somaxc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_icmp_eib(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_ip_forward(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_cc_allow(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_cc_avail(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_cc_curr(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_retry2(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_rwmem(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_dirty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_minfr_kb(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_nhpages(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_vm_swappiness(lxpr_node_t *, lxpr_uiobuf_t *); + +static int lxpr_write_pid_tid_comm(lxpr_node_t *, uio_t *, 
cred_t *, + caller_context_t *); +static int lxpr_write_pid_loginuid(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); +static int lxpr_write_sys_fs_pipe_max(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); +static int lxpr_write_sys_net_core_somaxc(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); +static int lxpr_write_sys_net_ipv4_icmp_eib(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_cc_curr(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_retry2(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_rwmem(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *, uio_t *, + cred_t *, caller_context_t *); +static int lxpr_write_sys_kernel_corepatt(lxpr_node_t *, uio_t *, cred_t *, + caller_context_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t)) + +extern rctl_hndl_t rc_process_semmsl; +extern rctl_hndl_t rc_process_semopm; +extern rctl_hndl_t rc_zone_semmni; +extern rctl_hndl_t rc_process_msgmnb; + +extern rctl_hndl_t rc_zone_msgmni; +extern rctl_hndl_t 
rc_zone_shmmax; +extern rctl_hndl_t rc_zone_shmmni; + +/* From uts/common/crypto/io/swrand.c */ +extern swrand_stats_t swrand_stats; + +#define ONEGB 1073741824ULL +#define FOURGB 4294967295ULL + +/* + * The maximum length of the concatenation of env vector strings we + * will return to the user via the branded procfs. + */ +int lxpr_maxenvvlen = 4096; + +/* + * The lx /proc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_WRITE, { .vop_read = lxpr_write }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_CREATE, { .vop_create = lxpr_create }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_SPACE, { .vop_space = lxpr_space }, + VOPNAME_SETATTR, { .vop_setattr = lxpr_setattr }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + VOPNAME_POLL, { .vop_poll = lxpr_poll }, + NULL, NULL +}; + + +/* + * file contents of an lx /proc directory. 
+ */
+static lxpr_dirent_t lx_procdir[] = {	/* { node type, entry name } */
+	{ LXPR_CGROUPS,		"cgroups" },
+	{ LXPR_CMDLINE,		"cmdline" },
+	{ LXPR_CPUINFO,		"cpuinfo" },
+	{ LXPR_DEVICES,		"devices" },
+	{ LXPR_DISKSTATS,	"diskstats" },
+	{ LXPR_DMA,		"dma" },
+	{ LXPR_FILESYSTEMS,	"filesystems" },
+	{ LXPR_INTERRUPTS,	"interrupts" },
+	{ LXPR_IOPORTS,		"ioports" },
+	{ LXPR_KCORE,		"kcore" },
+	{ LXPR_KMSG,		"kmsg" },
+	{ LXPR_LOADAVG,		"loadavg" },
+	{ LXPR_MEMINFO,		"meminfo" },
+	{ LXPR_MODULES,		"modules" },
+	{ LXPR_MOUNTS,		"mounts" },
+	{ LXPR_NETDIR,		"net" },
+	{ LXPR_PARTITIONS,	"partitions" },
+	{ LXPR_SELF,		"self" },
+	{ LXPR_STAT,		"stat" },
+	{ LXPR_SWAPS,		"swaps" },
+	{ LXPR_SYSDIR,		"sys" },
+	{ LXPR_UPTIME,		"uptime" },
+	{ LXPR_VERSION,		"version" },
+	{ LXPR_VMSTAT,		"vmstat" }
+};
+
+#define	PROCDIRFILES	(sizeof (lx_procdir) / sizeof (lx_procdir[0]))
+
+/*
+ * Contents of an lx /proc/<pid> directory.
+ *
+ * Each entry pairs a node type with the name of the entry; PIDDIRFILES is
+ * the number of entries in the table.
+ */
+static lxpr_dirent_t piddir[] = {
+	{ LXPR_PID_AUXV,	"auxv" },
+	{ LXPR_PID_CGROUP,	"cgroup" },
+	{ LXPR_PID_CMDLINE,	"cmdline" },
+	{ LXPR_PID_COMM,	"comm" },
+	{ LXPR_PID_CPU,		"cpu" },
+	{ LXPR_PID_CURDIR,	"cwd" },
+	{ LXPR_PID_ENV,		"environ" },
+	{ LXPR_PID_EXE,		"exe" },
+	{ LXPR_PID_GIDMAP,	"gid_map" },
+	{ LXPR_PID_LIMITS,	"limits" },
+	{ LXPR_PID_LOGINUID,	"loginuid" },
+	{ LXPR_PID_MAPS,	"maps" },
+	{ LXPR_PID_MEM,		"mem" },
+	{ LXPR_PID_MOUNTINFO,	"mountinfo" },
+	{ LXPR_PID_MOUNTS,	"mounts" },
+	{ LXPR_PID_OOM_SCR_ADJ,	"oom_score_adj" },
+	{ LXPR_PID_PERSONALITY,	"personality" },
+	{ LXPR_PID_ROOTDIR,	"root" },
+	{ LXPR_PID_STAT,	"stat" },
+	{ LXPR_PID_STATM,	"statm" },
+	{ LXPR_PID_STATUS,	"status" },
+	{ LXPR_PID_TASKDIR,	"task" },
+	{ LXPR_PID_FDDIR,	"fd" },
+	{ LXPR_PID_FDINFODIR,	"fdinfo" },
+	{ LXPR_PID_UIDMAP,	"uid_map" }
+};
+
+#define	PIDDIRFILES	(sizeof (piddir) / sizeof (piddir[0]))
+
+/*
+ * Contents of an lx /proc/<pid>/task/<tid> directory.
+ */
+/*
+ * Note that only some entries use thread-specific LXPR_PID_TID_* node types
+ * (auxv, comm, oom_score_adj, stat, status); the remaining entries reuse the
+ * per-process LXPR_PID_* types that also appear in piddir.
+ */
+static lxpr_dirent_t tiddir[] = {
+	{ LXPR_PID_TID_AUXV,	"auxv" },
+	{ LXPR_PID_CGROUP,	"cgroup" },
+	{ LXPR_PID_CMDLINE,	"cmdline" },
+	{ LXPR_PID_TID_COMM,	"comm" },
+	{ LXPR_PID_CPU,		"cpu" },
+	{ LXPR_PID_CURDIR,	"cwd" },
+	{ LXPR_PID_ENV,		"environ" },
+	{ LXPR_PID_EXE,		"exe" },
+	{ LXPR_PID_GIDMAP,	"gid_map" },
+	{ LXPR_PID_LIMITS,	"limits" },
+	{ LXPR_PID_LOGINUID,	"loginuid" },
+	{ LXPR_PID_MAPS,	"maps" },
+	{ LXPR_PID_MEM,		"mem" },
+	{ LXPR_PID_MOUNTINFO,	"mountinfo" },
+	{ LXPR_PID_TID_OOM_SCR_ADJ,	"oom_score_adj" },
+	{ LXPR_PID_PERSONALITY,	"personality" },
+	{ LXPR_PID_ROOTDIR,	"root" },
+	{ LXPR_PID_TID_STAT,	"stat" },
+	{ LXPR_PID_STATM,	"statm" },
+	{ LXPR_PID_TID_STATUS,	"status" },
+	{ LXPR_PID_FDDIR,	"fd" },
+	{ LXPR_PID_FDINFODIR,	"fdinfo" },
+	{ LXPR_PID_UIDMAP,	"uid_map" }
+};
+
+#define	TIDDIRFILES	(sizeof (tiddir) / sizeof (tiddir[0]))
+
+#define	LX_RLIM_INFINITY	0xFFFFFFFFFFFFFFFF
+
+/*
+ * True when the rctl value x (anything with an rcv_flagaction member) has
+ * both RCTL_LOCAL_MAXIMAL and RCTL_GLOBAL_INFINITE set.  The argument is
+ * parenthesized so that expressions such as *ptr can be passed safely.
+ */
+#define	RCTL_INFINITE(x) \
+	(((x).rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \
+	((x).rcv_flagaction & RCTL_GLOBAL_INFINITE))
+
+typedef struct lxpr_rlimtab {
+	char	*rlim_name;	/* limit name */
+	char	*rlim_unit;	/* limit unit */
+	char	*rlim_rctl;	/* rctl source */
+} lxpr_rlimtab_t;
+
+#define	RLIM_MAXFD	"Max open files"
+
+/*
+ * One row per resource limit: displayed name, unit, and the name of the
+ * rctl the value is taken from (NULL when there is no rctl source).
+ */
+static lxpr_rlimtab_t lxpr_rlimtab[] = {
+	{ "Max cpu time",	"seconds",	"process.max-cpu-time" },
+	{ "Max file size",	"bytes",	"process.max-file-size" },
+	{ "Max data size",	"bytes",	"process.max-data-size" },
+	{ "Max stack size",	"bytes",	"process.max-stack-size" },
+	{ "Max core file size",	"bytes",	"process.max-core-size" },
+	{ "Max resident set",	"bytes",	"zone.max-physical-memory" },
+	{ "Max processes",	"processes",	"zone.max-lwps" },
+	{ RLIM_MAXFD,		"files",	"process.max-file-descriptor" },
+	{ "Max locked memory",	"bytes",	"zone.max-locked-memory" },
+	{ "Max address space",	"bytes",	"process.max-address-space" },
+	{ "Max file locks",	"locks",	NULL },
+	{ "Max pending signals",	"signals",
+		"process.max-sigqueue-size" },
+	{
"Max msgqueue size",	"bytes",	"process.max-msg-messages" }
+};
+
+#define	LX_RLIM_TAB_LEN	(sizeof (lxpr_rlimtab) / sizeof (lxpr_rlimtab[0]))
+
+
+/*
+ * contents of lx /proc/net directory
+ *
+ * Each entry pairs a node type with the entry name; NETDIRFILES is the
+ * number of entries.
+ */
+static lxpr_dirent_t netdir[] = {
+	{ LXPR_NET_ARP,		"arp" },
+	{ LXPR_NET_DEV,		"dev" },
+	{ LXPR_NET_DEV_MCAST,	"dev_mcast" },
+	{ LXPR_NET_IF_INET6,	"if_inet6" },
+	{ LXPR_NET_IGMP,	"igmp" },
+	{ LXPR_NET_IP_MR_CACHE,	"ip_mr_cache" },
+	{ LXPR_NET_IP_MR_VIF,	"ip_mr_vif" },
+	{ LXPR_NET_IPV6_ROUTE,	"ipv6_route" },
+	{ LXPR_NET_MCFILTER,	"mcfilter" },
+	{ LXPR_NET_NETSTAT,	"netstat" },
+	{ LXPR_NET_RAW,		"raw" },
+	{ LXPR_NET_ROUTE,	"route" },
+	{ LXPR_NET_RPC,		"rpc" },
+	{ LXPR_NET_RT_CACHE,	"rt_cache" },
+	{ LXPR_NET_SOCKSTAT,	"sockstat" },
+	{ LXPR_NET_SNMP,	"snmp" },
+	{ LXPR_NET_STAT,	"stat" },
+	{ LXPR_NET_TCP,		"tcp" },
+	{ LXPR_NET_TCP6,	"tcp6" },
+	{ LXPR_NET_UDP,		"udp" },
+	{ LXPR_NET_UDP6,	"udp6" },
+	{ LXPR_NET_UNIX,	"unix" }
+};
+
+#define	NETDIRFILES	(sizeof (netdir) / sizeof (netdir[0]))
+
+/*
+ * contents of /proc/sys directory
+ *
+ * All four entries are themselves directories (fs, kernel, net, vm), each
+ * with its own table in this file.
+ */
+static lxpr_dirent_t sysdir[] = {
+	{ LXPR_SYS_FSDIR,	"fs" },
+	{ LXPR_SYS_KERNELDIR,	"kernel" },
+	{ LXPR_SYS_NETDIR,	"net" },
+	{ LXPR_SYS_VMDIR,	"vm" },
+};
+
+#define	SYSDIRFILES	(sizeof (sysdir) / sizeof (sysdir[0]))
+
+/*
+ * contents of /proc/sys/fs directory
+ *
+ * "inotify" is a subdirectory described by sys_fs_inotifydir.
+ */
+static lxpr_dirent_t sys_fsdir[] = {
+	{ LXPR_SYS_FS_AIO_MAX_NR,	"aio-max-nr" },
+	{ LXPR_SYS_FS_AIO_NR,		"aio-nr" },
+	{ LXPR_SYS_FS_FILEMAX,		"file-max" },
+	{ LXPR_SYS_FS_FILENR,		"file-nr" },
+	{ LXPR_SYS_FS_INOTIFYDIR,	"inotify" },
+	{ LXPR_SYS_FS_PIPE_MAX,		"pipe-max-size" },
+};
+
+#define	SYS_FSDIRFILES	(sizeof (sys_fsdir) / sizeof (sys_fsdir[0]))
+
+/*
+ * contents of /proc/sys/fs/inotify directory
+ */
+static lxpr_dirent_t sys_fs_inotifydir[] = {
+	{ LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS,	"max_queued_events" },
+	{ LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES,	"max_user_instances" },
+	{ LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES,		"max_user_watches" },
+};
+
+#define
SYS_FS_INOTIFYDIRFILES \
+	(sizeof (sys_fs_inotifydir) / sizeof (sys_fs_inotifydir[0]))
+
+/*
+ * contents of /proc/sys/kernel directory
+ *
+ * "random" is a subdirectory described by sys_randdir.
+ */
+static lxpr_dirent_t sys_kerneldir[] = {
+	{ LXPR_SYS_KERNEL_CAPLCAP,	"cap_last_cap" },
+	{ LXPR_SYS_KERNEL_COREPATT,	"core_pattern" },
+	{ LXPR_SYS_KERNEL_HOSTNAME,	"hostname" },
+	{ LXPR_SYS_KERNEL_MSGMAX,	"msgmax" },
+	{ LXPR_SYS_KERNEL_MSGMNB,	"msgmnb" },
+	{ LXPR_SYS_KERNEL_MSGMNI,	"msgmni" },
+	{ LXPR_SYS_KERNEL_NGROUPS_MAX,	"ngroups_max" },
+	{ LXPR_SYS_KERNEL_OSREL,	"osrelease" },
+	{ LXPR_SYS_KERNEL_PID_MAX,	"pid_max" },
+	{ LXPR_SYS_KERNEL_RANDDIR,	"random" },
+	{ LXPR_SYS_KERNEL_SEM,		"sem" },
+	{ LXPR_SYS_KERNEL_SHMALL,	"shmall" },
+	{ LXPR_SYS_KERNEL_SHMMAX,	"shmmax" },
+	{ LXPR_SYS_KERNEL_SHMMNI,	"shmmni" },
+	{ LXPR_SYS_KERNEL_THREADS_MAX,	"threads-max" },
+};
+
+#define	SYS_KERNELDIRFILES (sizeof (sys_kerneldir) / sizeof (sys_kerneldir[0]))
+
+/*
+ * contents of /proc/sys/kernel/random directory
+ */
+static lxpr_dirent_t sys_randdir[] = {
+	{ LXPR_SYS_KERNEL_RAND_BOOTID,	"boot_id" },
+	{ LXPR_SYS_KERNEL_RAND_ENTAVL,	"entropy_avail" },
+	{ LXPR_SYS_KERNEL_RAND_UUID,	"uuid" },
+};
+
+#define	SYS_RANDDIRFILES (sizeof (sys_randdir) / sizeof (sys_randdir[0]))
+
+/*
+ * contents of /proc/sys/net directory
+ *
+ * Both entries are subdirectories, described by sys_net_coredir and
+ * sys_net_ipv4dir respectively.
+ */
+static lxpr_dirent_t sys_netdir[] = {
+	{ LXPR_SYS_NET_COREDIR,		"core" },
+	{ LXPR_SYS_NET_IPV4DIR,		"ipv4" },
+};
+
+#define	SYS_NETDIRFILES (sizeof (sys_netdir) / sizeof (sys_netdir[0]))
+
+/*
+ * contents of /proc/sys/net/core directory
+ */
+static lxpr_dirent_t sys_net_coredir[] = {
+	{ LXPR_SYS_NET_CORE_SOMAXCON,	"somaxconn" },
+};
+
+#define	SYS_NET_COREDIRFILES \
+	(sizeof (sys_net_coredir) / sizeof (sys_net_coredir[0]))
+
+/*
+ * contents of /proc/sys/net/ipv4 directory
+ * See the Linux ip(7) & tcp(7) man pages for descriptions and the illumos
+ * ip(7p) & tcp(7p) man pages for the native descriptions.
+ */
+static lxpr_dirent_t sys_net_ipv4dir[] = {	/* { node type, entry name } */
+	{ LXPR_SYS_NET_IPV4_ICMP_EIB,	"icmp_echo_ignore_broadcasts" },
+	{ LXPR_SYS_NET_IPV4_IP_FORWARD, "ip_forward" },
+	{ LXPR_SYS_NET_IPV4_IP_LPORT_RANGE, "ip_local_port_range" },
+	{ LXPR_SYS_NET_IPV4_TCP_CC_ALLOW, "tcp_allowed_congestion_control" },
+	{ LXPR_SYS_NET_IPV4_TCP_CC_AVAIL, "tcp_available_congestion_control" },
+	{ LXPR_SYS_NET_IPV4_TCP_CC_CURR, "tcp_congestion_control" },
+	{ LXPR_SYS_NET_IPV4_TCP_FIN_TO,	"tcp_fin_timeout" },
+	{ LXPR_SYS_NET_IPV4_TCP_KA_INT,	"tcp_keepalive_intvl" },
+	{ LXPR_SYS_NET_IPV4_TCP_KA_TIM,	"tcp_keepalive_time" },
+	{ LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL, "tcp_max_syn_backlog" },
+	{ LXPR_SYS_NET_IPV4_TCP_RETRY2,	"tcp_retries2" },
+	{ LXPR_SYS_NET_IPV4_TCP_RMEM,	"tcp_rmem" },
+	{ LXPR_SYS_NET_IPV4_TCP_SACK,	"tcp_sack" },
+	{ LXPR_SYS_NET_IPV4_TCP_WINSCALE, "tcp_window_scaling" },
+	{ LXPR_SYS_NET_IPV4_TCP_WMEM,	"tcp_wmem" },
+};
+
+#define	SYS_NET_IPV4DIRFILES \
+	(sizeof (sys_net_ipv4dir) / sizeof (sys_net_ipv4dir[0]))
+
+/*
+ * contents of /proc/sys/vm directory
+ *
+ * Most of these node types (the dirty_* entries, overcommit_memory and
+ * swappiness) are also listed in wr_tab with a NULL write function.
+ */
+static lxpr_dirent_t sys_vmdir[] = {
+	{ LXPR_SYS_VM_DIRTY_BG_BYTES,	"dirty_background_bytes" },
+	{ LXPR_SYS_VM_DIRTY_BG_RATIO,	"dirty_background_ratio" },
+	{ LXPR_SYS_VM_DIRTY_BYTES,	"dirty_bytes" },
+	{ LXPR_SYS_VM_DIRTY_EXP_CS,	"dirty_expire_centisecs" },
+	{ LXPR_SYS_VM_DIRTY_RATIO,	"dirty_ratio" },
+	{ LXPR_SYS_VM_DIRTYTIME_EXP_SEC, "dirtytime_expire_seconds" },
+	{ LXPR_SYS_VM_DIRTY_WB_CS,	"dirty_writeback_centisecs" },
+	{ LXPR_SYS_VM_MAX_MAP_CNT,	"max_map_count" },
+	{ LXPR_SYS_VM_MINFR_KB,		"min_free_kbytes" },
+	{ LXPR_SYS_VM_NHUGEP,		"nr_hugepages" },
+	{ LXPR_SYS_VM_OVERCOMMIT_MEM,	"overcommit_memory" },
+	{ LXPR_SYS_VM_SWAPPINESS,	"swappiness" },
+};
+
+#define	SYS_VMDIRFILES (sizeof (sys_vmdir) / sizeof (sys_vmdir[0]))
+
+/*
+ * Table for standard writable files. Non-standard writable files not in this
+ * table can be handled explicitly as special cases.
+ * This table drives lxpr_is_writable, lxpr_write, and lxpr_create.
+ * Note that the entries LXPR_PID_FD_FD and LXPR_PID_TID_FD_FD exist in the
+ * table both to verify writability and to satisfy opening with O_CREAT.
+ *
+ * The table is terminated by an LXPR_INVALID entry.  Many entries have a
+ * NULL write function: they are still reported as writable by
+ * lxpr_is_writable(), which only looks at wft_type.  How lxpr_write treats
+ * a NULL handler is not visible here -- NOTE(review): presumably the write
+ * is accepted and discarded; confirm against lxpr_write().
+ */
+typedef struct wftab {
+	lxpr_nodetype_t	wft_type;	/* file entry type */
+	int		(*wft_wrf)(lxpr_node_t *, struct uio *, cred_t *,
+	    caller_context_t *);	/* write function */
+} wftab_t;
+
+static wftab_t wr_tab[] = {
+	{LXPR_PID_COMM,			lxpr_write_pid_tid_comm},
+	{LXPR_PID_FD_FD,		NULL},
+	{LXPR_PID_LOGINUID,		lxpr_write_pid_loginuid},
+	{LXPR_PID_OOM_SCR_ADJ,		NULL},
+	{LXPR_PID_TID_COMM,		lxpr_write_pid_tid_comm},
+	{LXPR_PID_TID_FD_FD,		NULL},
+	{LXPR_PID_TID_OOM_SCR_ADJ,	NULL},
+	{LXPR_SYS_FS_FILEMAX,		NULL},
+	{LXPR_SYS_KERNEL_COREPATT,	lxpr_write_sys_kernel_corepatt},
+	{LXPR_SYS_KERNEL_SHMALL,	NULL},
+	{LXPR_SYS_KERNEL_SHMMAX,	NULL},
+	{LXPR_SYS_FS_PIPE_MAX,		lxpr_write_sys_fs_pipe_max},
+	{LXPR_SYS_NET_CORE_SOMAXCON,	lxpr_write_sys_net_core_somaxc},
+	{LXPR_SYS_NET_IPV4_ICMP_EIB,	lxpr_write_sys_net_ipv4_icmp_eib},
+	{LXPR_SYS_NET_IPV4_IP_FORWARD,	NULL},
+	{LXPR_SYS_NET_IPV4_IP_LPORT_RANGE,
+	    lxpr_write_sys_net_ipv4_ip_lport_range},
+	{LXPR_SYS_NET_IPV4_TCP_CC_ALLOW, NULL},
+	{LXPR_SYS_NET_IPV4_TCP_CC_AVAIL, NULL},
+	{LXPR_SYS_NET_IPV4_TCP_CC_CURR,	lxpr_write_sys_net_ipv4_tcp_cc_curr},
+	{LXPR_SYS_NET_IPV4_TCP_FIN_TO,	lxpr_write_sys_net_ipv4_tcp_fin_to},
+	{LXPR_SYS_NET_IPV4_TCP_KA_INT,	lxpr_write_sys_net_ipv4_tcp_ka_int},
+	{LXPR_SYS_NET_IPV4_TCP_KA_TIM,	lxpr_write_sys_net_ipv4_tcp_ka_tim},
+	{LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL,
+	    lxpr_write_sys_net_ipv4_tcp_max_syn_bl},
+	{LXPR_SYS_NET_IPV4_TCP_RETRY2,	lxpr_write_sys_net_ipv4_tcp_retry2},
+	{LXPR_SYS_NET_IPV4_TCP_RMEM,	lxpr_write_sys_net_ipv4_tcp_rwmem},
+	{LXPR_SYS_NET_IPV4_TCP_SACK,	lxpr_write_sys_net_ipv4_tcp_sack},
+	{LXPR_SYS_NET_IPV4_TCP_WINSCALE, lxpr_write_sys_net_ipv4_tcp_winscale},
+	{LXPR_SYS_NET_IPV4_TCP_WMEM,	lxpr_write_sys_net_ipv4_tcp_rwmem},
+	{LXPR_SYS_VM_DIRTY_BG_BYTES,	NULL},
+ {LXPR_SYS_VM_DIRTY_BG_RATIO, NULL}, + {LXPR_SYS_VM_DIRTY_BYTES, NULL}, + {LXPR_SYS_VM_DIRTY_EXP_CS, NULL}, + {LXPR_SYS_VM_DIRTY_RATIO, NULL}, + {LXPR_SYS_VM_DIRTYTIME_EXP_SEC, NULL}, + {LXPR_SYS_VM_DIRTY_WB_CS, NULL}, + {LXPR_SYS_VM_OVERCOMMIT_MEM, NULL}, + {LXPR_SYS_VM_SWAPPINESS, NULL}, + {LXPR_INVALID, NULL} +}; + +/* + * Centralized test for the standard writable proc files. Other non-standard + * writable files might be handled separately. + */ +boolean_t +lxpr_is_writable(lxpr_nodetype_t type) +{ + int i; + + for (i = 0; wr_tab[i].wft_type != LXPR_INVALID; i++) { + if (wr_tab[i].wft_type == type) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* Restrict writes to certain files */ + if ((flag & FWRITE) && !lxpr_is_writable(type)) { + return (EPERM); + } + + /* + * If we are opening an underlying file only allow regular files, + * fifos or sockets; reject the open for anything else. + * Just do it if we are opening the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG && + rvp->v_type != VFIFO && rvp->v_type != VSOCK) { + error = EACCES; + } else { + if (type == LXPR_PID_FD_FD && rvp->v_type == VFIFO) { + /* + * This flag lets the fifo open know that + * we're using proc/fd to open a fd which we + * already have open. Otherwise, the fifo might + * reject an open if the other end has closed. + */ + flag |= FKLYR; + } + /* + * Need to hold rvp since VOP_OPEN() may release it. 
+ */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ +#ifdef DEBUG + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); +#endif /* DEBUG */ + + return (0); +} + +static void (*lxpr_read_function[])() = { + NULL, /* invalid */ + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_auxv, /* /proc/<pid>/auxv */ + lxpr_read_pid_cgroup, /* /proc/<pid>/cgroup */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_pid_tid_comm, /* /proc/<pid>/comm */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_pid_env, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_id_map, /* /proc/<pid>/gid_map */ + lxpr_read_pid_limits, /* /proc/<pid>/limits */ + lxpr_read_pid_loginuid, /* /proc/<pid>/loginuid */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_pid_mountinfo, /* /proc/<pid>/mountinfo */ + lxpr_read_mounts, /* /proc/<pid>/mounts */ + lxpr_read_pid_oom_scr_adj, /* /proc/<pid>/oom_score_adj */ + lxpr_read_pid_personality, /* /proc/<pid>/personality */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_tid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_tid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/task */ + lxpr_read_isdir, /* /proc/<pid>/task/nn */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* 
/proc/<pid>/fd/nn */ + lxpr_read_isdir, /* /proc/<pid>/fdinfo */ + lxpr_read_fdinfo, /* /proc/<pid>/fdinfo/nn */ + lxpr_read_pid_id_map, /* /proc/<pid>/uid_map */ + lxpr_read_pid_auxv, /* /proc/<pid>/task/<tid>/auxv */ + lxpr_read_pid_cgroup, /* /proc/<pid>/task/<tid>/cgroup */ + lxpr_read_pid_cmdline, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_read_pid_tid_comm, /* /proc/<pid>/task/<tid>/comm */ + lxpr_read_empty, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_read_pid_env, /* /proc/<pid>/task/<tid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/task/<tid>/exe */ + lxpr_read_pid_id_map, /* /proc/<pid>/task/<tid>/gid_map */ + lxpr_read_pid_limits, /* /proc/<pid>/task/<tid>/limits */ + lxpr_read_pid_loginuid, /* /proc/<pid>/task/<tid>/loginuid */ + lxpr_read_pid_maps, /* /proc/<pid>/task/<tid>/maps */ + lxpr_read_empty, /* /proc/<pid>/task/<tid>/mem */ + lxpr_read_pid_mountinfo, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_read_pid_oom_scr_adj, /* /proc/<pid>/task/<tid>/oom_scr_adj */ + lxpr_read_pid_personality, /* /proc/<pid>/task/<tid>/personality */ + lxpr_read_invalid, /* /proc/<pid>/task/<tid>/root */ + lxpr_read_pid_tid_stat, /* /proc/<pid>/task/<tid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/task/<tid>/statm */ + lxpr_read_pid_tid_status, /* /proc/<pid>/task/<tid>/status */ + lxpr_read_isdir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_read_fd, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_read_isdir, /* /proc/<pid>/task/<tid>/fdinfo */ + lxpr_read_fdinfo, /* /proc/<pid>/task/<tid>/fdinfo/nn */ + lxpr_read_pid_id_map, /* /proc/<pid>/task/<tid>/uid_map */ + lxpr_read_cgroups, /* /proc/cgroups */ + lxpr_read_cmdline, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_devices, /* /proc/devices */ + lxpr_read_diskstats, /* /proc/diskstats */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_filesystems, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* 
/proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_empty, /* /proc/modules */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_if_inet6, /* /proc/net/if_inet6 */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_ipv6_route, /* /proc/net/ipv6_route */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_tcp6, /* /proc/net/tcp6 */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_udp6, /* /proc/net/udp6 */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_swaps, /* /proc/swaps */ + lxpr_read_invalid, /* /proc/sys */ + lxpr_read_invalid, /* /proc/sys/fs */ + lxpr_read_sys_fs_aiomax, /* /proc/sys/fs/aio-max-nr */ + lxpr_read_sys_fs_aionr, /* /proc/sys/fs/aio-nr */ + lxpr_read_sys_fs_filemax, /* /proc/sys/fs/file-max */ + lxpr_read_sys_fs_filenr, /* /proc/sys/fs/file-nr */ + lxpr_read_invalid, /* /proc/sys/fs/inotify */ + lxpr_read_sys_fs_inotify_max_queued_events, /* max_queued_events */ + lxpr_read_sys_fs_inotify_max_user_instances, /* max_user_instances */ + 
lxpr_read_sys_fs_inotify_max_user_watches, /* max_user_watches */ + lxpr_read_sys_fs_pipe_max, /* /proc/sys/fs/pipe-max-size */ + lxpr_read_invalid, /* /proc/sys/kernel */ + lxpr_read_sys_kernel_caplcap, /* /proc/sys/kernel/cap_last_cap */ + lxpr_read_sys_kernel_corepatt, /* /proc/sys/kernel/core_pattern */ + lxpr_read_sys_kernel_hostname, /* /proc/sys/kernel/hostname */ + lxpr_read_sys_kernel_msgmax, /* /proc/sys/kernel/msgmax */ + lxpr_read_sys_kernel_msgmnb, /* /proc/sys/kernel/msgmnb */ + lxpr_read_sys_kernel_msgmni, /* /proc/sys/kernel/msgmni */ + lxpr_read_sys_kernel_ngroups_max, /* /proc/sys/kernel/ngroups_max */ + lxpr_read_sys_kernel_osrel, /* /proc/sys/kernel/osrelease */ + lxpr_read_sys_kernel_pid_max, /* /proc/sys/kernel/pid_max */ + lxpr_read_invalid, /* /proc/sys/kernel/random */ + lxpr_read_sys_kernel_rand_bootid, /* /proc/sys/kernel/random/boot_id */ + lxpr_read_sys_kernel_rand_entavl, /* .../kernel/random/entropy_avail */ + lxpr_read_sys_kernel_rand_uuid, /* .../kernel/random/uuid */ + lxpr_read_sys_kernel_sem, /* /proc/sys/kernel/sem */ + lxpr_read_sys_kernel_shmall, /* /proc/sys/kernel/shmall */ + lxpr_read_sys_kernel_shmmax, /* /proc/sys/kernel/shmmax */ + lxpr_read_sys_kernel_shmmni, /* /proc/sys/kernel/shmmni */ + lxpr_read_sys_kernel_threads_max, /* /proc/sys/kernel/threads-max */ + lxpr_read_invalid, /* /proc/sys/net */ + lxpr_read_invalid, /* /proc/sys/net/core */ + lxpr_read_sys_net_core_somaxc, /* /proc/sys/net/core/somaxconn */ + lxpr_read_invalid, /* /proc/sys/net/ipv4 */ + lxpr_read_sys_net_ipv4_icmp_eib, /* .../icmp_echo_ignore_broadcasts */ + lxpr_read_sys_net_ipv4_ip_forward, /* .../ipv4/ip_forward */ + lxpr_read_sys_net_ipv4_ip_lport_range, /* ../ipv4/ip_local_port_range */ + /* .../tcp_allowed_congestion_control */ + lxpr_read_sys_net_ipv4_tcp_cc_allow, + /* .../tcp_available_congestion_control */ + lxpr_read_sys_net_ipv4_tcp_cc_avail, + /* .../tcp_congestion_control */ + lxpr_read_sys_net_ipv4_tcp_cc_curr, + 
lxpr_read_sys_net_ipv4_tcp_fin_to, /* .../ipv4/tcp_fin_timeout */ + lxpr_read_sys_net_ipv4_tcp_ka_int, /* .../ipv4/tcp_keepalive_intvl */ + lxpr_read_sys_net_ipv4_tcp_ka_tim, /* .../ipv4/tcp_keepalive_time */ + lxpr_read_sys_net_ipv4_tcp_max_syn_bl, /* ../ipv4/tcp_max_syn_backlog */ + lxpr_read_sys_net_ipv4_tcp_retry2, /* .../ipv4/tcp_retries2 */ + lxpr_read_sys_net_ipv4_tcp_rwmem, /* .../ipv4/tcp_rmem */ + lxpr_read_sys_net_ipv4_tcp_sack, /* .../ipv4/tcp_sack */ + lxpr_read_sys_net_ipv4_tcp_winscale, /* .../ipv4/tcp_window_scaling */ + lxpr_read_sys_net_ipv4_tcp_rwmem, /* .../ipv4/tcp_wmem */ + lxpr_read_invalid, /* /proc/sys/vm */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_background_bytes */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_background_ratio */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_bytes */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_expire_centisecs */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_ratio */ + lxpr_read_sys_vm_dirty, /* .../vm/dirtytime_expire_seconds */ + lxpr_read_sys_vm_dirty, /* .../vm/dirty_writeback_centisecs */ + lxpr_read_sys_vm_max_map_cnt, /* /proc/sys/vm/max_map_count */ + lxpr_read_sys_vm_minfr_kb, /* /proc/sys/vm/min_free_kbytes */ + lxpr_read_sys_vm_nhpages, /* /proc/sys/vm/nr_hugepages */ + lxpr_read_sys_vm_overcommit_mem, /* /proc/sys/vm/overcommit_memory */ + lxpr_read_sys_vm_swappiness, /* /proc/sys/vm/swappiness */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ + lxpr_read_vmstat, /* /proc/vmstat */ +}; + +CTASSERT(ARRAY_SIZE(lxpr_read_function) == LXPR_NFILES); + +/* + * Array of lookup functions, indexed by lx /proc file type. 
+ */ +static vnode_t *(*lxpr_lookup_function[])() = { + NULL, /* invalid */ + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/auxv */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cgroup */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/comm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/gid_map */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/limits */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/loginuid */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mountinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mounts */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/oom_score_adj */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/personality */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_taskdir, /* /proc/<pid>/task */ + lxpr_lookup_task_tid_dir, /* /proc/<pid>/task/nn */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_fdinfodir, /* /proc/<pid>/fdinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fdinfo/nn */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/uid_map */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/auxv */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cgroup */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/comm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/environ */ + lxpr_lookup_not_a_dir, /* 
/proc/<pid>/task/<tid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/gid_map */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/limits */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/loginuid */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/oom_scr_adj */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/personality */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_lookup_fdinfodir, /* /proc/<pid>/task/<tid>/fdinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/fdinfo/nn */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/uid_map */ + lxpr_lookup_not_a_dir, /* /proc/cgroups */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/diskstats */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/modules */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/if_inet6 */ + lxpr_lookup_not_a_dir, /* 
/proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/ipv6_route */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp6 */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp6 */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/swaps */ + lxpr_lookup_sysdir, /* /proc/sys */ + lxpr_lookup_sys_fsdir, /* /proc/sys/fs */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/aio-max-nr */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/aio-nr */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/file-max */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/file-nr */ + lxpr_lookup_sys_fs_inotifydir, /* /proc/sys/fs/inotify */ + lxpr_lookup_not_a_dir, /* .../inotify/max_queued_events */ + lxpr_lookup_not_a_dir, /* .../inotify/max_user_instances */ + lxpr_lookup_not_a_dir, /* .../inotify/max_user_watches */ + lxpr_lookup_not_a_dir, /* /proc/sys/fs/pipe-max-size */ + lxpr_lookup_sys_kerneldir, /* /proc/sys/kernel */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/cap_last_cap */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/core_pattern */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/hostname */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmax */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmnb */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmni */ + 
lxpr_lookup_not_a_dir, /* /proc/sys/kernel/ngroups_max */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/osrelease */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/pid_max */ + lxpr_lookup_sys_kdir_randdir, /* /proc/sys/kernel/random */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/random/boot_id */ + lxpr_lookup_not_a_dir, /* .../kernel/random/entropy_avail */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/random/uuid */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/sem */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmall */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmmax */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmmni */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/threads-max */ + lxpr_lookup_sys_netdir, /* /proc/sys/net */ + lxpr_lookup_sys_net_coredir, /* /proc/sys/net/core */ + lxpr_lookup_not_a_dir, /* /proc/sys/net/core/somaxconn */ + lxpr_lookup_sys_net_ipv4dir, /* /proc/sys/net/ipv4 */ + lxpr_lookup_not_a_dir, /* .../icmp_echo_ignore_broadcasts */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/ip_forward */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/ip_local_port_range */ + /* .../tcp_allowed_congestion_control */ + lxpr_lookup_not_a_dir, + /* .../tcp_available_congestion_control */ + lxpr_lookup_not_a_dir, + /* .../tcp_congestion_control */ + lxpr_lookup_not_a_dir, + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_fin_timeout */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_keepalive_intvl */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_keepalive_time */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_max_syn_backlog */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_retries2 */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_rmem */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_sack */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_window_scaling */ + lxpr_lookup_not_a_dir, /* .../net/ipv4/tcp_wmem */ + lxpr_lookup_sys_vmdir, /* /proc/sys/vm */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_background_bytes */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_background_ratio */ + 
lxpr_lookup_not_a_dir, /* .../vm/dirty_bytes */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_expire_centisecs */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_ratio */ + lxpr_lookup_not_a_dir, /* .../vm/dirtytime_expire_seconds */ + lxpr_lookup_not_a_dir, /* .../vm/dirty_writeback_centisecs */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/max_map_count */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/min_free_kbytes */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/nr_hugepages */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/overcommit_memory */ + lxpr_lookup_not_a_dir, /* /proc/sys/vm/swappiness */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ + lxpr_lookup_not_a_dir, /* /proc/vmstat */ +}; + +CTASSERT(ARRAY_SIZE(lxpr_lookup_function) == LXPR_NFILES); + +/* + * Array of readdir functions, indexed by /proc file type. + */ +static int (*lxpr_readdir_function[])() = { + NULL, /* invalid */ + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/auxv */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cgroup */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/comm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/gid_map */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/limits */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/loginuid */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mountinfo */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mounts */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/oom_score_adj */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/personality */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* 
/proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_taskdir, /* /proc/<pid>/task */ + lxpr_readdir_task_tid_dir, /* /proc/<pid>/task/nn */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_fdinfodir, /* /proc/<pid>/fdinfo */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fdinfo/nn */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/uid_map */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/auxv */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cgroup */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/comm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/gid_map */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/limits */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/loginuid */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/oom_scr_adj */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/personality */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_readdir_fdinfodir, /* /proc/<pid>/task/<tid>/fdinfo */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/fdinfo/nn */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/uid_map */ + lxpr_readdir_not_a_dir, /*
/proc/cgroups */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/diskstats */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/modules */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/if_inet6 */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/ipv6_route */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp6 */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp6 */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/swaps */ + lxpr_readdir_sysdir, /* /proc/sys */ + 
lxpr_readdir_sys_fsdir, /* /proc/sys/fs */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/aio-max-nr */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/aio-nr */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/file-max */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/file-nr */ + lxpr_readdir_sys_fs_inotifydir, /* /proc/sys/fs/inotify */ + lxpr_readdir_not_a_dir, /* .../inotify/max_queued_events */ + lxpr_readdir_not_a_dir, /* .../inotify/max_user_instances */ + lxpr_readdir_not_a_dir, /* .../inotify/max_user_watches */ + lxpr_readdir_not_a_dir, /* /proc/sys/fs/pipe-max-size */ + lxpr_readdir_sys_kerneldir, /* /proc/sys/kernel */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/cap_last_cap */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/core_pattern */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/hostname */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmax */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmnb */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/msgmni */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/ngroups_max */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/osrelease */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/pid_max */ + lxpr_readdir_sys_kdir_randdir, /* /proc/sys/kernel/random */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/random/boot_id */ + lxpr_readdir_not_a_dir, /* .../kernel/random/entropy_avail */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/random/uuid */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/sem */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmall */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmmax */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/shmmni */ + lxpr_readdir_not_a_dir, /* /proc/sys/kernel/threads-max */ + lxpr_readdir_sys_netdir, /* /proc/sys/net */ + lxpr_readdir_sys_net_coredir, /* /proc/sys/net/core */ + lxpr_readdir_not_a_dir, /* /proc/sys/net/core/somaxconn */ + lxpr_readdir_sys_net_ipv4dir, /* /proc/sys/net/ipv4 */ + lxpr_readdir_not_a_dir, /* .../icmp_echo_ignore_broadcasts */ + lxpr_readdir_not_a_dir, /* 
.../net/ipv4/ip_forward */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/ip_local_port_range */ + /* .../tcp_allowed_congestion_control */ + lxpr_readdir_not_a_dir, + /* .../tcp_available_congestion_control */ + lxpr_readdir_not_a_dir, + /* .../tcp_congestion_control */ + lxpr_readdir_not_a_dir, + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_fin_timeout */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_keepalive_intvl */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_keepalive_time */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_max_syn_backlog */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_retries2 */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_rmem */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_sack */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_window_scaling */ + lxpr_readdir_not_a_dir, /* .../net/ipv4/tcp_wmem */ + lxpr_readdir_sys_vmdir, /* /proc/sys/vm */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_background_bytes */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_background_ratio */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_bytes */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_expire_centisecs */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_ratio */ + lxpr_readdir_not_a_dir, /* .../vm/dirtytime_expire_seconds */ + lxpr_readdir_not_a_dir, /* .../vm/dirty_writeback_centisecs */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/max_map_count */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/min_free_kbytes */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/nr_hugepages */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/overcommit_memory */ + lxpr_readdir_not_a_dir, /* /proc/sys/vm/swappiness */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ + lxpr_readdir_not_a_dir, /* /proc/vmstat */ +}; + +CTASSERT(ARRAY_SIZE(lxpr_readdir_function) == LXPR_NFILES); + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in the lx procfs is human + * readable and not binary structures there do not have to be 
different + * read variants depending on whether the reading process model is 32 or 64 bits + * (at least in general, and certainly the difference is unlikely to be enough + * to justify have different routines for 32 and 64 bit reads + */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. + */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +static void +lxpr_read_empty(lxpr_node_t *lxpnp, 
lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_auxv(): read process aux vector + */ +static void +lxpr_read_pid_auxv(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *pd; + lx_elf_data_t *edp = NULL; + int i, cnt; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_AUXV || + lxpnp->lxpr_type == LXPR_PID_TID_AUXV); + + p = lxpr_lock(lxpnp, NO_ZOMB); + + if (p == NULL) { + return; + } + if ((pd = ptolxproc(p)) == NULL) { + /* Emit a single AT_NULL record for non-branded processes */ + auxv_t buf; + + bzero(&buf, sizeof (buf)); + lxpr_unlock(p); + lxpr_uiobuf_write(uiobuf, (char *)&buf, sizeof (buf)); + return; + } else { + edp = &pd->l_elf_data; + } + + if (p->p_model == DATAMODEL_NATIVE) { + auxv_t buf[__KERN_NAUXV_IMPL]; + + /* + * Because a_type is only of size int (not long), the buffer + * contents must be zeroed first to ensure cleanliness. + */ + bzero(buf, sizeof (buf)); + for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) { + if (lx_auxv_stol(&p->p_user.u_auxv[i], + &buf[cnt], edp) == 0) { + cnt++; + } + if (p->p_user.u_auxv[i].a_type == AT_NULL) { + break; + } + } + lxpr_unlock(p); + lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0])); + } +#if defined(_SYSCALL32_IMPL) + else { + auxv32_t buf[__KERN_NAUXV_IMPL]; + + for (i = 0, cnt = 0; i < __KERN_NAUXV_IMPL; i++) { + auxv_t temp; + + if (lx_auxv_stol(&p->p_user.u_auxv[i], + &temp, edp) == 0) { + buf[cnt].a_type = (int)temp.a_type; + buf[cnt].a_un.a_val = (int)temp.a_un.a_val; + cnt++; + } + if (p->p_user.u_auxv[i].a_type == AT_NULL) { + break; + } + } + lxpr_unlock(p); + lxpr_uiobuf_write(uiobuf, (char *)buf, cnt * sizeof (buf[0])); + } +#endif /* defined(_SYSCALL32_IMPL) */ +} + +/* + * lxpr_read_pid_cgroup(): read cgroups for process + */ +static void +lxpr_read_pid_cgroup(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CGROUP || + lxpnp->lxpr_type == LXPR_PID_TID_CGROUP); + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) { 
+ lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + lxpr_unlock(p); + + /* basic stub, 3rd field will need to be populated */ + lxpr_uiobuf_printf(uiobuf, "1:name=systemd:/\n"); +} + +/* + * lxpr_read_pid_cmdline(): read argument vector from process + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + size_t asz = PRMAXARGVLEN, sz; + int r; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE || + lxpnp->lxpr_type == LXPR_PID_TID_CMDLINE); + + buf = kmem_alloc(asz, KM_SLEEP); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + kmem_free(buf, asz); + return; + } + + r = prreadcmdline(p, buf, asz, &sz); + + lxpr_unlock(p); + + if (r != 0) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + } else { + lxpr_uiobuf_write(uiobuf, buf, sz); + } + + kmem_free(buf, asz); +} + +/* + * lxpr_read_pid_tid_comm(): read command name from thread + */ +static void +lxpr_read_pid_tid_comm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + pid_t tid; + char buf[LX_PR_SET_NAME_NAMELEN], *pnm; + + VERIFY(lxpnp->lxpr_type == LXPR_PID_COMM || + lxpnp->lxpr_type == LXPR_PID_TID_COMM); + + tid = (lxpnp->lxpr_desc == 0) ? lxpnp->lxpr_pid : lxpnp->lxpr_desc; + p = lxpr_lock_pid(lxpnp, tid, ZOMB_OK, &t); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + if (t == NULL) { + lxpr_unlock(p); + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + ASSERT(MUTEX_HELD(&p->p_lock)); + + /* + * If a thread name has not been set, use the process command name. + * This also covers the /proc/{pid}/comm case. + */ + if (t->t_name == NULL) { + pnm = p->p_user.u_comm; + } else { + pnm = t->t_name; + } + + /* Truncate with NUL if the name is longer than the Linux size. 
*/ + (void) strlcpy(buf, pnm, sizeof (buf)); + + lxpr_unlock(p); + lxpr_uiobuf_printf(uiobuf, "%s\n", buf); +} + +static int +lxpr_write_pid_tid_comm(lxpr_node_t *lxpnp, struct uio *uio, struct cred *cr, + caller_context_t *ct) +{ + int error; + size_t olen; + char *buf; + proc_t *p; + kthread_t *t; + pid_t tid; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_COMM || + lxpnp->lxpr_type == LXPR_PID_TID_COMM); + + /* + * Only a thread in the process can update one of the thread names. Not + * even a process with root privileges. Linux returns EINVAL (not EPERM) + * for this case. + */ + if (lxpnp->lxpr_pid != curproc->p_pid) + return (EINVAL); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > LX_PR_SET_NAME_NAMELEN - 1) + olen = LX_PR_SET_NAME_NAMELEN - 1; + + buf = kmem_zalloc(THREAD_NAME_MAX, KM_SLEEP); + + error = uiomove(buf, olen, UIO_WRITE, uio); + if (error != 0) { + kmem_free(buf, THREAD_NAME_MAX); + return (error); + } + buf[LX_PR_SET_NAME_NAMELEN - 1] = '\0'; + + tid = (lxpnp->lxpr_desc == 0) ? lxpnp->lxpr_pid : lxpnp->lxpr_desc; + p = lxpr_lock_pid(lxpnp, tid, NO_ZOMB, &t); + if (p == NULL) { + kmem_free(buf, THREAD_NAME_MAX); + return (ENXIO); + } + if (t == NULL) { + lxpr_unlock(p); + kmem_free(buf, THREAD_NAME_MAX); + return (ENXIO); + } + + ASSERT(MUTEX_HELD(&p->p_lock)); + + /* + * See comments for thread_setname() and prctl(LX_PR_SET_NAME) handling. 
+ */ + if (t->t_name == NULL) { + t->t_name = buf; + } else { + (void) strlcpy(t->t_name, buf, THREAD_NAME_MAX); + kmem_free(buf, THREAD_NAME_MAX); + } + + if (t->t_tid == 1) { + (void) strncpy(p->p_user.u_comm, t->t_name, MAXCOMLEN + 1); + (void) strncpy(p->p_user.u_psargs, t->t_name, PSARGSZ); + } + + lxpr_unlock(p); + return (0); +} + +/* + * lxpr_read_pid_env(): read env vector from process + */ +static void +lxpr_read_pid_env(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + size_t asz = lxpr_maxenvvlen, sz; + int r; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_ENV); + + buf = kmem_alloc(asz, KM_SLEEP); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + kmem_free(buf, asz); + return; + } + + r = prreadenvv(p, buf, asz, &sz); + lxpr_unlock(p); + + if (r != 0) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + } else { + lxpr_uiobuf_write(uiobuf, buf, sz); + } + kmem_free(buf, asz); +} + +/* + * lxpr_read_pid_limits(): ulimit file + */ +static void +lxpr_read_pid_limits(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + rctl_qty_t cur[LX_RLIM_TAB_LEN], max[LX_RLIM_TAB_LEN]; + int i; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LIMITS || + lxpnp->lxpr_type == LXPR_PID_TID_LIMITS); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + return; + } + + for (i = 0; i < LX_RLIM_TAB_LEN; i++) { + char *kname = lxpr_rlimtab[i].rlim_rctl; + rctl_val_t nval, *oval = NULL; + rctl_hndl_t hndl; + + /* default to unlimited for resources without an analog */ + cur[i] = RLIM_INFINITY; + max[i] = RLIM_INFINITY; + if (kname == NULL || (hndl = rctl_hndl_lookup(kname)) == -1) { + continue; + } + while (rctl_local_get(hndl, oval, &nval, p) == 0) { + oval = &nval; + switch (nval.rcv_privilege) { + case RCPRIV_BASIC: + if (!RCTL_INFINITE(nval)) + cur[i] = nval.rcv_value; + break; + case RCPRIV_PRIVILEGED: + if (!RCTL_INFINITE(nval)) + max[i] = nval.rcv_value; + break; + } + } + /* + * If "Max open files" is still set to RLIM_INFINITY, make it + * match the max 
value so that we do not output "unlimited". + */ + if (strcmp(lxpr_rlimtab[i].rlim_name, RLIM_MAXFD) == 0 && + cur[i] == RLIM_INFINITY) { + cur[i] = max[i]; + } + + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, "%-25s %-20s %-20s %-10s\n", + "Limit", "Soft Limit", "Hard Limit", "Units"); + for (i = 0; i < LX_RLIM_TAB_LEN; i++) { + lxpr_uiobuf_printf(uiobuf, "%-25s", lxpr_rlimtab[i].rlim_name); + if (cur[i] == RLIM_INFINITY || cur[i] == LX_RLIM_INFINITY) { + lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited"); + } else { + lxpr_uiobuf_printf(uiobuf, " %-20lu", cur[i]); + } + if (max[i] == RLIM_INFINITY || max[i] == LX_RLIM_INFINITY) { + lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited"); + } else { + lxpr_uiobuf_printf(uiobuf, " %-20lu", max[i]); + } + lxpr_uiobuf_printf(uiobuf, " %-10s\n", + lxpr_rlimtab[i].rlim_unit); + } +} +/* + * lxpr_read_pid_id_map(): gid_map and uid_map file + */ +static void +lxpr_read_pid_id_map(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_GIDMAP || + lxpnp->lxpr_type == LXPR_PID_UIDMAP); + + lxpr_uiobuf_printf(uiobuf, "%10u %10u %10u\n", 0, 0, MAXUID); +} + +/* + * lxpr_read_pid_loginuid(): loginuid file + */ +static void +lxpr_read_pid_loginuid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *pd; + uid_t lu = 0; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID || + lxpnp->lxpr_type == LXPR_PID_TID_LOGINUID); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENXIO); + return; + } + + if ((pd = ptolxproc(p)) != NULL) { + lu = pd->l_loginuid; + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, "%d", lu); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *lxpd; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + uintptr_t saddr; + uintptr_t eaddr; + int type; + char prot[5]; + 
uintptr_t offset; + vnode_t *vp; + char *name_override; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS || + lxpnp->lxpr_type == LXPR_PID_TID_MAPS); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + return; + } + + as = p->p_as; + lxpd = ptolxproc(p); + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = (uintptr_t)seg->s_base; + pbuf->eaddr = pbuf->saddr + seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = SEGOP_GETOFFSET(seg, (caddr_t)pbuf->saddr); + + pbuf->name_override = NULL; + if (lxpd != NULL) { + if (pbuf->saddr == lxpd->l_vdso) { + pbuf->name_override = "[vdso]"; + } else if (pbuf->saddr == p->p_user.u_commpagep) { + pbuf->name_override = "[vvar]"; + } + } + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = 
kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + ino_t inode = 0; + + *buf = '\0'; + if (pbuf->name_override != NULL) { + (void) strncpy(buf, pbuf->name_override, buflen); + } else if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (p->p_model == DATAMODEL_LP64) { + lxpr_uiobuf_printf(uiobuf, + "%08llx-%08llx %s %08llx %02x:%02x %llu%s%s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, *buf != '\0' ? " " : "", buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02x:%02x %llu%s%s\n", + (uint32_t)pbuf->saddr, (uint32_t)pbuf->eaddr, + pbuf->prot, (uint32_t)pbuf->offset, maj, min, + inode, *buf != '\0' ? " " : "", buf); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * Make mount entry look more like Linux. Non-zero return to skip it. 
+ */ +static int +lxpr_clean_mntent(char **mntpt, char **fstype, char **resource) +{ + if (strcmp(*mntpt, "/var/ld") == 0 || + strcmp(*fstype, "objfs") == 0 || + strcmp(*fstype, "mntfs") == 0 || + strcmp(*fstype, "ctfs") == 0 || + strncmp(*mntpt, "/native/", 8) == 0) { + return (1); + } + + if (strcmp(*fstype, "tmpfs") == 0) { + *resource = "tmpfs"; + } else if (strcmp(*fstype, "lx_proc") == 0) { + *resource = *fstype = "proc"; + } else if (strcmp(*fstype, "lx_sysfs") == 0) { + *resource = *fstype = "sysfs"; + } else if (strcmp(*fstype, "lx_devfs") == 0) { + *resource = *fstype = "devtmpfs"; + } else if (strcmp(*fstype, "lx_cgroup") == 0) { + *resource = *fstype = "cgroup"; + } else if (strcmp(*fstype, "lxautofs") == 0) { + *fstype = "autofs"; + } + + return (0); +} + + +typedef struct lxpr_mount_entry { + list_node_t lme_link; + uint_t lme_id; + uint_t lme_parent_id; + refstr_t *lme_mntpt; + refstr_t *lme_resource; + uint_t lme_mntopts_len; + char *lme_mntopts; + uint_t lme_flag; + int lme_fstype; + dev_t lme_dev; + boolean_t lme_force; +} lxpr_mount_entry_t; + +static int lxpr_zfs_fstype = -1; + +#define LXPR_ROOT_MOUNT_ID 15 +#define LXPR_MNT_OPT_CHUNK 128 + +/* List of native, non-Linux mount options we should omit. 
*/
+static const char *lx_invalid_mnt_opts[] = {
+	"xattr",
+	NULL
+};
+
+/* First see if we should omit this option */
+static boolean_t
+lxpr_skip_mntopt(const char *s)
+{
+	uint_t i;
+
+	for (i = 0; lx_invalid_mnt_opts[i] != NULL; i++) {
+		if (strcmp(s, lx_invalid_mnt_opts[i]) == 0)
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Append the string 's' to the mount option string held in 'lme', growing
+ * the buffer in LXPR_MNT_OPT_CHUNK steps when it is too small.
+ *
+ * The required size is computed up front and the buffer is grown before
+ * anything is appended.  Calling strlcat() first and growing only after it
+ * reports truncation is not safe here: a truncating strlcat() still copies
+ * as much of 's' as fits, so appending 's' again after growing would leave
+ * a stray prefix of 's' in front of the full copy.
+ */
+static void
+lxpr_append_mntopt(lxpr_mount_entry_t *lme, char *s)
+{
+	/* bytes needed for the combined string including its terminator */
+	size_t need = strlen(lme->lme_mntopts) + strlen(s) + 1;
+
+	if (need > lme->lme_mntopts_len) {
+		uint_t tlen = lme->lme_mntopts_len;
+		char *t;
+
+		/* expand option string, a whole chunk at a time */
+		while (tlen < need)
+			tlen += LXPR_MNT_OPT_CHUNK;
+
+		t = kmem_alloc(tlen, KM_SLEEP);
+		(void) strlcpy(t, lme->lme_mntopts, tlen);
+		kmem_free(lme->lme_mntopts, lme->lme_mntopts_len);
+		lme->lme_mntopts_len = tlen;
+		lme->lme_mntopts = t;
+	}
+
+	(void) strlcat(lme->lme_mntopts, s, lme->lme_mntopts_len);
+}
+
+/*
+ * Perform the somewhat complicated work of getting the mount options string
+ * for the mount.
+ */
+static void
+lxpr_get_mntopts(vfs_t *vfsp, lxpr_mount_entry_t *lme)
+{
+	uint_t i;
+	mntopt_t *mop;
+	boolean_t have_nosuid = B_FALSE, have_nodev = B_FALSE;
+
+	lme->lme_mntopts_len = LXPR_MNT_OPT_CHUNK;
+	lme->lme_mntopts = kmem_alloc(lme->lme_mntopts_len, KM_SLEEP);
+	lme->lme_mntopts[0] = '\0';
+
+	/* Always show rw/ro option */
+	lxpr_append_mntopt(lme,
+	    (lme->lme_flag & VFS_RDONLY) == 0 ?
"rw" : "ro"); + + for (i = 0; i < vfsp->vfs_mntopts.mo_count; i++) { + mop = &vfsp->vfs_mntopts.mo_list[i]; + if ((mop->mo_flags & MO_NODISPLAY) || !(mop->mo_flags & MO_SET)) + continue; + + if (strcmp(mop->mo_name, "ro") == 0 || + strcmp(mop->mo_name, "rw") == 0) + continue; + + if (strcmp(mop->mo_name, "nosuid") == 0) + have_nosuid = B_TRUE; + /* sigh, either option string is used */ + if (strcmp(mop->mo_name, "nodev") == 0 || + strcmp(mop->mo_name, "nodevices") == 0) + have_nodev = B_TRUE; + + if (!lxpr_skip_mntopt(mop->mo_name)) { + lxpr_append_mntopt(lme, ","); + lxpr_append_mntopt(lme, mop->mo_name); + if (mop->mo_arg != NULL) { + lxpr_append_mntopt(lme, "="); + lxpr_append_mntopt(lme, mop->mo_arg); + } + } + } + + /* + * Sometimes nosuid is an explicit string, other times it's a flag. + * The same is true for nodevices. + */ + if (!have_nosuid && (lme->lme_flag & VFS_NOSETUID)) { + lxpr_append_mntopt(lme, ",nosuid"); + } + if (!have_nodev && (lme->lme_flag & VFS_NODEVICES)) { + lxpr_append_mntopt(lme, ",nodevices"); + } +} + +static list_t * +lxpr_enumerate_mounts(zone_t *zone) +{ + vfs_t *vfsp, *rvfsp, *vfslist; + lx_zone_data_t *lxzd = ztolxzd(zone); + list_t *result; + lxpr_mount_entry_t *lme; + lx_virt_disk_t *vd; + uint_t root_id, mount_id; + char tmppath[MAXPATHLEN]; + + result = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(result, sizeof (lxpr_mount_entry_t), + offsetof(lxpr_mount_entry_t, lme_link)); + /* use an arbitrary start value for the root mount_id */ + root_id = 15; + mount_id = root_id + 1; + + ASSERT(zone != global_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + vfs_list_read_lock(); + vfsp = vfslist = zone->zone_vfslist; + + /* + * If the zone has a root entry, it will be the first in the list. + * Conjure one up if needed. 
+ */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + rvfsp = zone->zone_rootvp->v_vfsp; + } else { + rvfsp = vfslist; + vfsp = vfslist->vfs_zone_next; + } + + lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP); + lme->lme_id = root_id; + lme->lme_parent_id = 0; + lme->lme_mntpt = refstr_alloc(zone->zone_rootpath); + lme->lme_flag = rvfsp->vfs_flag; + lme->lme_fstype = rvfsp->vfs_fstype; + lme->lme_force = B_TRUE; + lxpr_get_mntopts(rvfsp, lme); + + lme->lme_resource = NULL; + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZFS_DS && + vd->lxvd_real_dev == rvfsp->vfs_dev) { + (void) snprintf(tmppath, sizeof (tmppath), + "%sdev/%s", zone->zone_rootpath, vd->lxvd_name); + lme->lme_resource = refstr_alloc(tmppath); + lme->lme_dev = vd->lxvd_emul_dev; + break; + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + if (lme->lme_resource == NULL) { + lme->lme_resource = refstr_alloc(zone->zone_rootpath); + lme->lme_dev = rvfsp->vfs_dev; + } + list_insert_head(result, lme); + + do { + if (vfsp == NULL) { + break; + } + /* Skip mounts we shouldn't show */ + if ((vfsp->vfs_flag & VFS_NOMNTTAB) != 0) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP); + lme->lme_id = mount_id++; + lme->lme_parent_id = root_id; + lme->lme_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_mntpt); + lme->lme_flag = vfsp->vfs_flag; + lme->lme_fstype = vfsp->vfs_fstype; + lme->lme_force = B_FALSE; + lxpr_get_mntopts(vfsp, lme); + + lme->lme_resource = NULL; + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_type == LXVD_ZFS_DS && + vd->lxvd_real_dev == vfsp->vfs_dev) { + char vdev[MAXPATHLEN]; + + (void) snprintf(vdev, sizeof (vdev), + "%sdev/%s", + zone->zone_rootpath, vd->lxvd_name); + lme->lme_resource = refstr_alloc(vdev); + lme->lme_dev = vd->lxvd_emul_dev; + break; + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + if 
(lme->lme_resource == NULL) { + lme->lme_resource = vfsp->vfs_resource; + refstr_hold(vfsp->vfs_resource); + lme->lme_dev = vfsp->vfs_dev; + } + list_insert_tail(result, lme); + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* Add a single dummy entry for /native/usr */ + lme = kmem_alloc(sizeof (lxpr_mount_entry_t), KM_SLEEP); + lme->lme_id = mount_id++; + lme->lme_parent_id = root_id; + lme->lme_flag = VFS_RDONLY; + lme->lme_dev = makedevice(0, 1); + (void) snprintf(tmppath, sizeof (tmppath), + "%snative/usr", zone->zone_rootpath); + lme->lme_mntpt = refstr_alloc(tmppath); + lme->lme_resource = lme->lme_mntpt; + lme->lme_mntopts_len = 3; + lme->lme_mntopts = kmem_alloc(lme->lme_mntopts_len, KM_SLEEP); + (void) strlcpy(lme->lme_mntopts, "ro", lme->lme_mntopts_len); + refstr_hold(lme->lme_mntpt); + if (lxpr_zfs_fstype == -1) { + vfssw_t *zfssw = vfs_getvfssw("zfs"); + VERIFY(zfssw != NULL); + lxpr_zfs_fstype = ((uintptr_t)zfssw - (uintptr_t)vfssw) / + sizeof (vfssw[0]); + VERIFY(&vfssw[lxpr_zfs_fstype] == zfssw); + } + lme->lme_fstype = lxpr_zfs_fstype; + lme->lme_force = B_TRUE; + list_insert_tail(result, lme); + + return (result); +} + +static uint_t +lxpr_get_mountid(zone_t *zone, vfs_t *match_vfsp) +{ + lx_zone_data_t *lxzd = ztolxzd(zone); + vfs_t *vfsp, *vfslist; + uint_t mount_id; + + if (match_vfsp == NULL) + return (0); + + /* Mount IDs start at 15 for the root, see lxpr_enumerate_mounts() */ + mount_id = 15; + + ASSERT(zone != global_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (zone->zone_rootvp->v_vfsp == match_vfsp) + return (mount_id); + + vfs_list_read_lock(); + + vfsp = vfslist = zone->zone_vfslist; + + do { + if (vfsp == zone->zone_rootvp->v_vfsp) + continue; + + if (vfsp == NULL) + break; + + /* Skip mounts we shouldn't show */ + if ((vfsp->vfs_flag & VFS_NOMNTTAB) != 0) { + vfsp = vfsp->vfs_zone_next; + continue; + } + + mount_id++; + + if (vfsp == match_vfsp) { + 
vfs_list_unlock(); + return (mount_id); + } + + vfsp = vfsp->vfs_zone_next; + } while (vfsp != vfslist); + + vfs_list_unlock(); + + return (0); +} + +/* + * lxpr_read_pid_mountinfo(): information about process mount points. + */ +static void +lxpr_read_pid_mountinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + list_t *mounts; + lxpr_mount_entry_t *lme; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MOUNTINFO || + lxpnp->lxpr_type == LXPR_PID_TID_MOUNTINFO); + + mounts = lxpr_enumerate_mounts(zone); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + lme = (lxpr_mount_entry_t *)list_remove_head(mounts); + while (lme != NULL) { + char *resource, *mntpt, *fstype, *rwflag; + vnode_t *vp; + int error; + + mntpt = (char *)refstr_value(lme->lme_mntpt); + resource = (char *)refstr_value(lme->lme_resource); + + if (mntpt == NULL || mntpt[0] == '\0') { + goto nextp; + } + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + goto nextp; + } else if ((vp->v_flag & VROOT) == 0 && !lme->lme_force) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : mntpt; + } + } else { + resource = "none"; + } + + /* Make things look more like Linux. */ + fstype = vfssw[lme->lme_fstype].vsw_name; + if (lxpr_clean_mntent(&mntpt, &fstype, &resource) != 0 && + !lme->lme_force) { + goto nextp; + } + rwflag = ((lme->lme_flag & VFS_RDONLY) == 0) ? "rw" : "ro"; + + /* + * XXX parent ID is not tracked correctly here. Currently we + * always assume the parent ID is the root ID. 
+ */ + lxpr_uiobuf_printf(uiobuf, + "%d %d %d:%d / %s %s - %s %s %s\n", + lme->lme_id, lme->lme_parent_id, + getmajor(lme->lme_dev), getminor(lme->lme_dev), + mntpt, rwflag, fstype, resource, lme->lme_mntopts); + +nextp: + refstr_rele(lme->lme_mntpt); + refstr_rele(lme->lme_resource); + kmem_free(lme->lme_mntopts, lme->lme_mntopts_len); + kmem_free(lme, sizeof (lxpr_mount_entry_t)); + lme = (lxpr_mount_entry_t *)list_remove_head(mounts); + } + + list_destroy(mounts); + kmem_free(mounts, sizeof (list_t)); +} + +/* + * lxpr_read_pid_oom_scr_adj(): read oom_score_adj for process + */ +static void +lxpr_read_pid_oom_scr_adj(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_OOM_SCR_ADJ || + lxpnp->lxpr_type == LXPR_PID_TID_OOM_SCR_ADJ); + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + lxpr_unlock(p); + + /* always 0 */ + lxpr_uiobuf_printf(uiobuf, "0\n"); +} + +/* + * lxpr_read_pid_personality(): read personality for process + */ +static void +lxpr_read_pid_personality(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + lx_proc_data_t *lxpd; + unsigned int personality; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_PERSONALITY); + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + if ((lxpd = ptolxproc(p)) != NULL) { + personality = lxpd->l_personality; + } else { + /* Report native processes as having the SunOS personality */ + personality = LX_PER_SUNOS; + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, "%08x\n", personality); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize, rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM || + lxpnp->lxpr_type == LXPR_PID_TID_STATM); + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + 
return; + } + + as = p->p_as; + mutex_exit(&p->p_lock); + if (as != &kas) { + AS_LOCK_ENTER(as, RW_READER); + vsize = btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + } else { + vsize = 0; + rss = 0; + } + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * Determine number of LWPs visible in the process. In particular we want to + * ignore aio in-kernel threads. + */ +static uint_t +lxpr_count_tasks(proc_t *p) +{ + uint_t cnt = 0; + kthread_t *t; + + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + return (0); + } + + if (p->p_brand != &lx_brand || (t = p->p_tlist) == NULL) { + cnt = p->p_lwpcnt; + } else { + do { + lx_lwp_data_t *lwpd = ttolxlwp(t); + /* Don't count aio kernel worker threads */ + if ((t->t_proc_flag & TP_KTHREAD) != 0 && + lwpd != NULL && + (lwpd->br_lwp_flags & BR_AIO_LWP) == 0) { + cnt++; + } + + t = t->t_forw; + } while (t != p->p_tlist); + } + + return (cnt); +} + +/* + * pid/tid common code to read status file + */ +static void +lxpr_read_status_common(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, + uint_t lookup_id) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + struct as *as; + char *status; + pid_t pid, ppid; + pid_t tid = (lookup_id == 0) ? lxpnp->lxpr_pid : lookup_id; + k_sigset_t current, ignore, handle; + int i, lx_sig, lwpcnt, ngroups; + char buf_comm[MAXCOMLEN + 1]; + rlim64_t fdlim; + size_t vsize = 0, nlocked = 0, rss = 0, stksize = 0; + boolean_t printsz = B_FALSE; + + + p = lxpr_lock_pid(lxpnp, tid, ZOMB_OK, &t); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + /* Translate the pid (e.g. 
initpid to 1) */ + lxpr_fixpid(LXPTOZ(lxpnp), p, &pid, &ppid); + + if (t != NULL) { + thread_lock(t); + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + if (lookup_id != 0) { + /* we can't find this specific thread */ + lxpr_uiobuf_seterr(uiobuf, EINVAL); + lxpr_unlock(p); + return; + } + + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + (void) strlcpy(buf_comm, up->u_comm, sizeof (buf_comm)); + fdlim = p->p_fno_ctl; + lwpcnt = lxpr_count_tasks(p); + + /* + * Gather memory information + */ + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & (SSYS | SEXITING)) && + (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + nlocked = p->p_locked_mem; + stksize = p->p_stksize; + printsz = B_TRUE; + } + + /* + * Gather signal information + */ + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + for (i = 1; i < NSIG; i++) { + lx_sig = stol_signo[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + 
"FDSize:\t%d\n" + "Groups:\t", + buf_comm, + status, + pid, /* thread group id - same as pid */ + (lookup_id == 0) ? pid : lxpnp->lxpr_desc, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + fdlim); + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + if (printsz) { + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + btok(nlocked), + ptok(rss), + 0l, + btok(stksize), + ptok(rss), + 0l); + } + lxpr_uiobuf_printf(uiobuf, "\nThreads:\t%u\n", lwpcnt); + lxpr_uiobuf_printf(uiobuf, + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0]); + /* Report only the full bounding set for now */ + lxpr_uiobuf_printf(uiobuf, + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n" + "CapBnd:\t%016llx\n", + 0, 0, 0, 0x1fffffffffLL); +} + +/* + * lxpr_read_pid_tid_status(): status file + */ +static void +lxpr_read_pid_tid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS || + lxpnp->lxpr_type == LXPR_PID_TID_STATUS); + + lxpr_read_status_common(lxpnp, uiobuf, lxpnp->lxpr_desc); +} + +/* + * Same logic as the lx devfs lxd_pts_devt_translator. 
+ */ +static dev_t +lxpr_xlate_pts_dev(dev_t dev) +{ + minor_t min = getminor(dev); + int lx_maj, lx_min; + + lx_maj = LX_PTS_MAJOR_MIN + (min / LX_MAXMIN); + lx_min = min % LX_MAXMIN; + + return (LX_MAKEDEVICE(lx_maj, lx_min)); +} + +/* + * pid/tid common code to read stat file + */ +static void +lxpr_read_pid_tid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + zone_t *zone; + char stat; + pid_t pid, ppid, pgpid, spid, tid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri, lwpcnt; + caddr_t wchan, stackbase; + processorid_t cpu; + clock_t utime, stime, cutime, cstime, ticks, boottime; + char buf_comm[MAXCOMLEN + 1]; + rlim64_t vmem_ctl; + int exit_signal = -1; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT || + lxpnp->lxpr_type == LXPR_PID_TID_STAT); + + zone = LXPTOZ(lxpnp); + tid = (lxpnp->lxpr_desc == 0) ? lxpnp->lxpr_pid : lxpnp->lxpr_desc; + p = lxpr_lock_pid(lxpnp, tid, ZOMB_OK, &t); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + /* Set Linux defaults if we're the zone's init process */ + pid = p->p_pid; + lxpr_fixpid(zone, p, &pid, &ppid); + if (pid == 1) { + /* init process */ + pgpid = 0; + psgid = (gid_t)-1; + spid = 0; + psdev = 0; + } else { + pgpid = p->p_pgrp; + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = lxpr_xlate_pts_dev(p->p_sessp->s_dev); + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + stackbase = 0; + } else { + /* from prgetstackbase() */ + stackbase = p->p_usrstack - p->p_stksize; + } + + utime = stime = 0; + if (t != NULL) { + klwp_t *lwp = ttolwp(t); + hrtime_t utm = 0, stm = 0; + + /* + * For field 38 (the exit signal), some apps explicitly use + * this field in a check to 
distinguish processes from threads, + * and assume only processes have a valid signal in this field! + */ + if (t->t_tid == 1) { + lx_proc_data_t *lxpd = ptolxproc(p); + + if (lxpd != NULL) { + exit_signal = lxpd->l_signal; + } else { + exit_signal = SIGCHLD; + } + } + + thread_lock(t); + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; + break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; + break; + case TS_ZOMB: + stat = 'Z'; + break; + case TS_STOPPED: + stat = 'T'; + break; + default: + stat = '!'; + break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + + if (lwp != NULL) { + struct mstate *ms = &lwp->lwp_mstate; + + utm = ms->ms_acct[LMS_USER]; + stm = ms->ms_acct[LMS_SYSTEM]; + + /* convert unscaled high-res time to nanoseconds */ + scalehrtime(&utm); + scalehrtime(&stm); + } + + thread_unlock(t); + + /* Linux /proc expects these values in ticks */ + utime = (clock_t)NSEC_TO_TICK(utm); + stime = (clock_t)NSEC_TO_TICK(stm); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + if (as != &kas) { + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + } else { + vsize = 0; + rss = 0; + } + mutex_enter(&p->p_lock); + + if (tid == p->p_pid) { + /* process */ + utime = p->p_utime; + stime = p->p_stime; + } else { + /* tid: utime & stime for the thread set in block above */ + /* EMPTY */ + } + cutime = p->p_cutime; + cstime = p->p_cstime; + lwpcnt = lxpr_count_tasks(p); + vmem_ctl = p->p_vmem_ctl; + (void) strlcpy(buf_comm, p->p_user.u_comm, sizeof (buf_comm)); + ticks = p->p_user.u_ticks; /* lbolt at process start */ + /* adjust ticks to account for zone boot time */ + boottime = zone->zone_zsched->p_user.u_ticks; + ticks -= boottime; + lxpr_unlock(p); + + /* Adjust hz for relevant fields */ + utime = HZ_TO_LX_USERHZ(utime); + stime 
= HZ_TO_LX_USERHZ(stime); + cutime = HZ_TO_LX_USERHZ(cutime); + cstime = HZ_TO_LX_USERHZ(cstime); + ticks = HZ_TO_LX_USERHZ(ticks); + + lxpr_uiobuf_printf(uiobuf, + "%d " /* 1 */ + "(%s) %c %d %d %d %d %d " /* 2-8 */ + "%lu %lu %lu %lu %lu " /* 9-13 */ + "%lu %lu %ld %ld " /* 14-17 */ + "%d %d %d " /* 18-20 */ + "%lu " /* 21 */ + "%lu " /* 22 */ + "%lu %ld %llu " /* 23-25 */ + "%lu %lu %llu " /* 26-28 */ + "%lu %lu " /* 29-30 */ + "%lu %lu %lu %lu " /* 31-34 */ + "%lu " /* 35 */ + "%lu %lu " /* 36-37 */ + "%d " /* 38 */ + "%d" /* 39 */ + "\n", + tid, /* 1 */ + buf_comm, stat, ppid, pgpid, spid, psdev, psgid, /* 2-8 */ + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + utime, stime, cutime, cstime, /* 14-17 */ + pri, nice, lwpcnt, /* 18-20 */ + 0l, /* itrealvalue (time before next SIGALRM) 21 */ + ticks, /* 22 */ + vsize, rss, vmem_ctl, /* 23-25 */ + 0l, 0l, stackbase, /* startcode, endcode, startstack 26-28 */ + 0l, 0l, /* kstkesp, kstkeip 29-30 */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch 31-34 */ + wchan, /* 35 */ + 0l, 0l, /* nswap,cnswap 36-37 */ + exit_signal, /* exit_signal 38 */ + cpu /* 39 */); +} + +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +struct lxpr_ifstat { + uint64_t rx_bytes; + uint64_t rx_packets; + uint64_t rx_errors; + uint64_t rx_drop; + uint64_t tx_bytes; + uint64_t tx_packets; + uint64_t tx_errors; + uint64_t tx_drop; + uint64_t collisions; + uint64_t rx_multicast; +}; + +static void * +lxpr_kstat_read(kstat_t *kn, boolean_t byname, size_t *size, int *num, + zoneid_t zoneid) +{ + kstat_t *kp; + int i, nrec = 0; + size_t bufsize; + void *buf = NULL; + + if (byname == B_TRUE) { + kp = kstat_hold_byname(kn->ks_module, kn->ks_instance, + kn->ks_name, zoneid); + } else { + kp = kstat_hold_bykid(kn->ks_kid, zoneid); + } + if (kp == NULL) { + return (NULL); + } + if (kp->ks_flags & KSTAT_FLAG_INVALID) { + kstat_rele(kp); + return (NULL); + } + + bufsize = kp->ks_data_size + 1; 
+ kstat_rele(kp); + + /* + * The kstat in question is released so that kmem_alloc(KM_SLEEP) is + * performed without it held. After the alloc, the kstat is reacquired + * and its size is checked again. If the buffer is no longer large + * enough, the alloc and check are repeated up to three times. + */ + for (i = 0; i < 2; i++) { + buf = kmem_alloc(bufsize, KM_SLEEP); + + /* Check if bufsize still appropriate */ + if (byname == B_TRUE) { + kp = kstat_hold_byname(kn->ks_module, kn->ks_instance, + kn->ks_name, zoneid); + } else { + kp = kstat_hold_bykid(kn->ks_kid, zoneid); + } + if (kp == NULL || kp->ks_flags & KSTAT_FLAG_INVALID) { + if (kp != NULL) { + kstat_rele(kp); + } + kmem_free(buf, bufsize); + return (NULL); + } + KSTAT_ENTER(kp); + (void) KSTAT_UPDATE(kp, KSTAT_READ); + if (bufsize < kp->ks_data_size) { + kmem_free(buf, bufsize); + buf = NULL; + bufsize = kp->ks_data_size + 1; + KSTAT_EXIT(kp); + kstat_rele(kp); + continue; + } else { + if (KSTAT_SNAPSHOT(kp, buf, KSTAT_READ) != 0) { + kmem_free(buf, bufsize); + buf = NULL; + } + nrec = kp->ks_ndata; + KSTAT_EXIT(kp); + kstat_rele(kp); + break; + } + } + + if (buf != NULL) { + *size = bufsize; + *num = nrec; + } + return (buf); +} + +static int +lxpr_kstat_ifstat(kstat_t *kn, struct lxpr_ifstat *ifs, zoneid_t zoneid) +{ + kstat_named_t *kp; + int i, num; + size_t size; + + /* + * Search by name instead of by kid since there's a small window to + * race against kstats being added/removed. 
+ */ + bzero(ifs, sizeof (*ifs)); + kp = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num, zoneid); + if (kp == NULL) + return (-1); + for (i = 0; i < num; i++) { + if (strncmp(kp[i].name, "rbytes64", KSTAT_STRLEN) == 0) + ifs->rx_bytes = kp[i].value.ui64; + else if (strncmp(kp[i].name, "ipackets64", KSTAT_STRLEN) == 0) + ifs->rx_packets = kp[i].value.ui64; + else if (strncmp(kp[i].name, "ierrors", KSTAT_STRLEN) == 0) + ifs->rx_errors = kp[i].value.ui32; + else if (strncmp(kp[i].name, "norcvbuf", KSTAT_STRLEN) == 0) + ifs->rx_drop = kp[i].value.ui32; + else if (strncmp(kp[i].name, "multircv", KSTAT_STRLEN) == 0) + ifs->rx_multicast = kp[i].value.ui32; + else if (strncmp(kp[i].name, "obytes64", KSTAT_STRLEN) == 0) + ifs->tx_bytes = kp[i].value.ui64; + else if (strncmp(kp[i].name, "opackets64", KSTAT_STRLEN) == 0) + ifs->tx_packets = kp[i].value.ui64; + else if (strncmp(kp[i].name, "oerrors", KSTAT_STRLEN) == 0) + ifs->tx_errors = kp[i].value.ui32; + else if (strncmp(kp[i].name, "noxmtbuf", KSTAT_STRLEN) == 0) + ifs->tx_drop = kp[i].value.ui32; + else if (strncmp(kp[i].name, "collisions", KSTAT_STRLEN) == 0) + ifs->collisions = kp[i].value.ui32; + } + kmem_free(kp, size); + return (0); +} + +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + kstat_t *ksr; + kstat_t ks0; + int i, nidx; + size_t sidx; + struct lxpr_ifstat ifs; + zoneid_t zoneid = LXPTOZ(lxpnp)->zone_id; + + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx, zoneid); + if (ksr == NULL) + return; + + for (i = 1; i < nidx; i++) { + if (strncmp(ksr[i].ks_module, "link", KSTAT_STRLEN) == 0 || + strncmp(ksr[i].ks_module, "lo", KSTAT_STRLEN) == 0) { + if (lxpr_kstat_ifstat(&ksr[i], &ifs, zoneid) != 0) + continue; + + 
/* Overwriting the name is ok in the local snapshot */ + lx_ifname_convert(ksr[i].ks_name, LX_IF_FROMNATIVE); + lxpr_uiobuf_printf(uiobuf, "%6s: %7llu %7llu %4lu " + "%4lu %4u %5u %10u %9lu %8llu %7llu %4lu %4lu %4u " + "%5lu %7u %10u\n", + ksr[i].ks_name, + ifs.rx_bytes, ifs.rx_packets, + ifs.rx_errors, ifs.rx_drop, + 0, 0, 0, ifs.rx_multicast, + ifs.tx_bytes, ifs.tx_packets, + ifs.tx_errors, ifs.tx_drop, + 0, ifs.collisions, 0, 0); + } + } + + kmem_free(ksr, sidx); +} + +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_inet6_out(const in6_addr_t *addr, char buf[33]) +{ + const uint8_t *ip = addr->s6_addr; + char digits[] = "0123456789abcdef"; + int i; + for (i = 0; i < 16; i++) { + buf[2 * i] = digits[ip[i] >> 4]; + buf[2 * i + 1] = digits[ip[i] & 0xf]; + } + buf[32] = '\0'; +} + +static void +lxpr_read_net_if_inet6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + ill_t *ill; + ipif_t *ipif; + ill_walk_context_t ctx; + char ifname[LIFNAMSIZ], ip6out[33]; + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ill = ILL_START_WALK_V6(&ctx, ipst); + + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + uint_t index = ill->ill_phyint->phyint_ifindex; + int plen = ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); + unsigned int scope = lx_ipv6_scope_convert( + &ipif->ipif_v6lcl_addr); + /* Always report PERMANENT flag */ + int flag = 0x80; + + (void) snprintf(ifname, LIFNAMSIZ, "%s", ill->ill_name); + lx_ifname_convert(ifname, LX_IF_FROMNATIVE); + lxpr_inet6_out(&ipif->ipif_v6lcl_addr, ip6out); + + lxpr_uiobuf_printf(uiobuf, "%32s %02x %02x %02x %02x" + " %8s\n", ip6out, index, plen, scope, flag, ifname); + } + } + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); +} + +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, 
lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_format_route_ipv6(ire_t *ire, lxpr_uiobuf_t *uiobuf) +{ + uint32_t flags; + char name[IFNAMSIZ]; + char ipv6addr[33]; + + lxpr_inet6_out(&ire->ire_addr_v6, ipv6addr); + lxpr_uiobuf_printf(uiobuf, "%s %02x ", ipv6addr, + ip_mask_to_plen_v6(&ire->ire_mask_v6)); + + /* punt on this for now */ + lxpr_uiobuf_printf(uiobuf, "%s %02x ", + "00000000000000000000000000000000", 0); + + lxpr_inet6_out(&ire->ire_gateway_addr_v6, ipv6addr); + lxpr_uiobuf_printf(uiobuf, "%s", ipv6addr); + + flags = ire->ire_flags & + (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED); + /* Linux's RTF_LOCAL equivalent */ + if (ire->ire_metrics.iulp_local) + flags |= 0x80000000; + + if (ire->ire_ill != NULL) { + ill_get_name(ire->ire_ill, name, sizeof (name)); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '\0'; + } + + lxpr_uiobuf_printf(uiobuf, " %08x %08x %08x %08x %8s\n", + 0, /* metric */ + ire->ire_refcnt, + 0, + flags, + name); +} + +static void +lxpr_read_net_ipv6_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + /* + * LX branded zones are expected to have exclusive IP stack, hence + * using ALL_ZONES as the zoneid filter. 
+ */ + ire_walk_v6(&lxpr_format_route_ipv6, uiobuf, ALL_ZONES, ipst); + + netstack_rele(ns); +} + +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +#define LXPR_SKIP_ROUTE(type) \ + (((IRE_IF_CLONE | IRE_BROADCAST | IRE_MULTICAST | \ + IRE_NOROUTE | IRE_LOOPBACK | IRE_LOCAL) & type) != 0) + +static void +lxpr_format_route_ipv4(ire_t *ire, lxpr_uiobuf_t *uiobuf) +{ + uint32_t flags; + char name[IFNAMSIZ]; + ill_t *ill; + ire_t *nire; + ipif_t *ipif; + ipaddr_t gateway; + + if (LXPR_SKIP_ROUTE(ire->ire_type) || ire->ire_testhidden != 0) + return; + + /* These route flags have direct Linux equivalents */ + flags = ire->ire_flags & + (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED); + + /* + * Search for a suitable IRE for naming purposes. + * On Linux, the default route is typically associated with the + * interface used to access gateway. The default IRE on illumos + * typically lacks an ill reference but its parent might have one. + */ + nire = ire; + do { + ill = nire->ire_ill; + nire = nire->ire_dep_parent; + } while (ill == NULL && nire != NULL); + if (ill != NULL) { + ill_get_name(ill, name, sizeof (name)); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '*'; + name[1] = '\0'; + } + + /* + * Linux suppresses the gateway address for directly connected + * interface networks. To emulate this behavior, we walk all addresses + * of a given route interface. If one matches the gateway, it is + * displayed as NULL. 
+ */ + gateway = ire->ire_gateway_addr; + if ((ill = ire->ire_ill) != NULL) { + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (ipif->ipif_lcl_addr == gateway) { + gateway = 0; + break; + } + } + } + + lxpr_uiobuf_printf(uiobuf, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u\n", + name, + ire->ire_addr, + gateway, + flags, 0, 0, + 0, /* priority */ + ire->ire_mask, + 0, 0, /* mss, window */ + ire->ire_metrics.iulp_rtt); +} + +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + + lxpr_uiobuf_printf(uiobuf, "Iface\tDestination\tGateway \tFlags\t" + "RefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n"); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + /* + * LX branded zones are expected to have exclusive IP stack, hence + * using ALL_ZONES as the zoneid filter. + */ + ire_walk_v4(&lxpr_format_route_ipv4, uiobuf, ALL_ZONES, ipst); + + netstack_rele(ns); +} + +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +typedef struct lxpr_snmp_table { + const char *lst_proto; + const char **lst_fields; +} lxpr_snmp_table_t; + +static const char *lxpr_snmp_ip_fields[] = { + "forwarding", "defaultTTL", "inReceives", "inHdrErrors", + "inAddrErrors", "forwDatagrams", "inUnknownProtos", "inDiscards", + "inDelivers", "outRequests", "outDiscards", "outNoRoutes", + "reasmTimeout", "reasmReqds", "reasmOKs", "reasmFails", "fragOKs", + "fragFails", "fragCreates", + NULL +}; + +static const char *lxpr_snmp_icmp_fields[] = { + "inMsgs", "inErrors", "inCsumErrors", "inDestUnreachs", "inTimeExcds", + "inParmProbs", "inSrcQuenchs", "inRedirects", "inEchos", "inEchoReps", + "inTimestamps", "inTimestampReps", "inAddrMasks", "inAddrMaskReps", + 
"outMsgs", "outErrors", "outDestUnreachs", "outTimeExcds", + "outParmProbs", "outSrcQuenchs", "outRedirects", "outEchos", + "outEchoReps", "outTimestamps", "outTimestampReps", "outAddrMasks", + "outAddrMaskReps", + NULL +}; + +static const char *lxpr_snmp_tcp_fields[] = { + "rtoAlgorithm", "rtoMin", "rtoMax", "maxConn", "activeOpens", + "passiveOpens", "attemptFails", "estabResets", "currEstab", "inSegs", + "outSegs", "retransSegs", "inErrs", "outRsts", "inCsumErrors", + NULL +}; + +static const char *lxpr_snmp_udp_fields[] = { + "inDatagrams", "noPorts", "inErrors", "outDatagrams", "rcvbufErrors", + "sndbufErrors", "inCsumErrors", + NULL +}; + +static lxpr_snmp_table_t lxpr_snmp_ip = { "ip", lxpr_snmp_ip_fields }; +static lxpr_snmp_table_t lxpr_snmp_icmp = { "icmp", lxpr_snmp_icmp_fields }; +static lxpr_snmp_table_t lxpr_snmp_tcp = { "tcp", lxpr_snmp_tcp_fields }; +static lxpr_snmp_table_t lxpr_snmp_udp = { "udp", lxpr_snmp_udp_fields }; + +static lxpr_snmp_table_t *lxpr_net_snmptab[] = { + &lxpr_snmp_ip, + &lxpr_snmp_icmp, + &lxpr_snmp_tcp, + &lxpr_snmp_udp, + NULL +}; + +static void +lxpr_kstat_print_tab(lxpr_uiobuf_t *uiobuf, lxpr_snmp_table_t *table, + kstat_t *kn, zoneid_t zoneid) +{ + kstat_named_t *klist; + char upname[KSTAT_STRLEN], upfield[KSTAT_STRLEN]; + int i, j, num; + size_t size; + + klist = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num, + zoneid); + if (klist == NULL) + return; + + /* Print the header line, fields capitalized */ + (void) strncpy(upname, table->lst_proto, KSTAT_STRLEN); + upname[0] = toupper(upname[0]); + lxpr_uiobuf_printf(uiobuf, "%s:", upname); + for (i = 0; table->lst_fields[i] != NULL; i++) { + (void) strncpy(upfield, table->lst_fields[i], KSTAT_STRLEN); + upfield[0] = toupper(upfield[0]); + lxpr_uiobuf_printf(uiobuf, " %s", upfield); + } + lxpr_uiobuf_printf(uiobuf, "\n%s:", upname); + + /* Then loop back through to print the value line. 
*/ + for (i = 0; table->lst_fields[i] != NULL; i++) { + kstat_named_t *kpoint = NULL; + for (j = 0; j < num; j++) { + if (strncmp(klist[j].name, table->lst_fields[i], + KSTAT_STRLEN) == 0) { + kpoint = &klist[j]; + break; + } + } + if (kpoint == NULL) { + /* Output 0 for unknown fields */ + lxpr_uiobuf_printf(uiobuf, " 0"); + } else { + switch (kpoint->data_type) { + case KSTAT_DATA_INT32: + lxpr_uiobuf_printf(uiobuf, " %d", + kpoint->value.i32); + break; + case KSTAT_DATA_UINT32: + lxpr_uiobuf_printf(uiobuf, " %u", + kpoint->value.ui32); + break; + case KSTAT_DATA_INT64: + lxpr_uiobuf_printf(uiobuf, " %ld", + kpoint->value.l); + break; + case KSTAT_DATA_UINT64: + lxpr_uiobuf_printf(uiobuf, " %lu", + kpoint->value.ul); + break; + } + } + } + lxpr_uiobuf_printf(uiobuf, "\n"); + kmem_free(klist, size); +} + +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + kstat_t *ksr; + kstat_t ks0; + lxpr_snmp_table_t **table = lxpr_net_snmptab; + int i, t, nidx; + size_t sidx; + zoneid_t zoneid = LXPTOZ(lxpnp)->zone_id; + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx, zoneid); + if (ksr == NULL) + return; + + for (t = 0; table[t] != NULL; t++) { + for (i = 0; i < nidx; i++) { + if (strncmp(ksr[i].ks_class, "mib2", KSTAT_STRLEN) != 0) + continue; + if (strncmp(ksr[i].ks_name, table[t]->lst_proto, + KSTAT_STRLEN) == 0) { + lxpr_kstat_print_tab(uiobuf, table[t], &ksr[i], + zoneid); + break; + } + } + } + kmem_free(ksr, sidx); +} + +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static int +lxpr_convert_tcp_state(int st) +{ + /* + * Derived from the enum located in the Linux kernel sources: + * include/net/tcp_states.h + */ + switch (st) { + case TCPS_ESTABLISHED: + return (1); + case TCPS_SYN_SENT: + return (2); + case TCPS_SYN_RCVD: + return (3); + case TCPS_FIN_WAIT_1: + return (4); + case TCPS_FIN_WAIT_2: + return (5); + case TCPS_TIME_WAIT: + return (6); + case TCPS_CLOSED: + 
return (7); + case TCPS_CLOSE_WAIT: + return (8); + case TCPS_LAST_ACK: + return (9); + case TCPS_LISTEN: + return (10); + case TCPS_CLOSING: + return (11); + default: + /* No translation for TCPS_IDLE, TCPS_BOUND or anything else */ + return (0); + } +} + +static void +lxpr_format_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, ushort_t ipver) +{ + int i, sl = 0; + connf_t *connfp; + conn_t *connp; + netstack_t *ns; + ip_stack_t *ipst; + int sonode_shift; + + ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION); + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, " sl local_address rem_address " + "st tx_queue rx_queue tr tm->when retrnsmt uid timeout " + "inode\n"); + } else { + lxpr_uiobuf_printf(uiobuf, " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt " + "uid timeout inode\n"); + } + /* + * Due to differences between the Linux and illumos TCP + * implementations, some data will be omitted from the output here. + * + * Valid fields: + * - local_address + * - remote_address + * - st + * - tx_queue + * - rx_queue + * - uid + * - inode + * + * Omitted/invalid fields + * - tr + * - tm->when + * - retrnsmt + * - timeout + */ + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + sonode_shift = highbit(sizeof (sonode_t)); + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { + tcp_t *tcp; + ino_t inode; + sonode_t *so = (sonode_t *)connp->conn_upper_handle; + if (connp->conn_ipversion != ipver) + continue; + tcp = connp->conn_tcp; + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, + "%4d: %08X:%04X %08X:%04X ", + ++sl, + connp->conn_laddr_v4, + ntohs(connp->conn_lport), + connp->conn_faddr_v4, + ntohs(connp->conn_fport)); + } else { + lxpr_uiobuf_printf(uiobuf, "%4d: " + "%08X%08X%08X%08X:%04X " + "%08X%08X%08X%08X:%04X ", + ++sl, + 
connp->conn_laddr_v6.s6_addr32[0], + connp->conn_laddr_v6.s6_addr32[1], + connp->conn_laddr_v6.s6_addr32[2], + connp->conn_laddr_v6.s6_addr32[3], + ntohs(connp->conn_lport), + connp->conn_faddr_v6.s6_addr32[0], + connp->conn_faddr_v6.s6_addr32[1], + connp->conn_faddr_v6.s6_addr32[2], + connp->conn_faddr_v6.s6_addr32[3], + ntohs(connp->conn_fport)); + } + + /* + * We cannot use VOP_GETATTR here to fetch the + * simulated inode for the socket via the + * so->so_vnode. This is because there is a (very + * tight) race for when the v_vfsp is set on the + * sonode's vnode. However, all we really want here is + * the inode number, which we can compute using the + * same algorithm as socket_vop_getattr. + */ + inode = ((ino_t)so >> sonode_shift) & 0xFFFF; + + lxpr_uiobuf_printf(uiobuf, + "%02X %08X:%08X %02X:%08X %08X " + "%5u %8d %lu %d %p %u %u %u %u %d\n", + lxpr_convert_tcp_state(tcp->tcp_state), + tcp->tcp_rcv_cnt, tcp->tcp_unsent, /* rx/tx queue */ + 0, 0, /* tr, when */ + 0, /* per-connection rexmits aren't tracked today */ + connp->conn_cred->cr_uid, + 0, /* timeout */ + /* inode + more */ + inode, 0, NULL, 0, 0, 0, 0, 0); + } + } + netstack_rele(ns); +} + +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_tcp(lxpnp, uiobuf, IPV4_VERSION); +} + +static void +lxpr_read_net_tcp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_tcp(lxpnp, uiobuf, IPV6_VERSION); +} + +static void +lxpr_format_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, ushort_t ipver) +{ + int i, sl = 0; + connf_t *connfp; + conn_t *connp; + netstack_t *ns; + ip_stack_t *ipst; + int sonode_shift; + + ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION); + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, " sl local_address rem_address" + " st tx_queue rx_queue tr tm->when retrnsmt uid" + " timeout inode ref pointer drops\n"); + } else { + lxpr_uiobuf_printf(uiobuf, " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr 
tm->when retrnsmt " + "uid timeout inode ref pointer drops\n"); + } + /* + * Due to differences between the Linux and illumos UDP + * implementations, some data will be omitted from the output here. + * + * Valid fields: + * - local_address + * - remote_address + * - st: limited + * - uid + * + * Omitted/invalid fields + * - tx_queue + * - rx_queue + * - tr + * - tm->when + * - retrnsmt + * - timeout + * - inode + */ + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + sonode_shift = highbit(sizeof (sonode_t)); + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_UDPCONN)) != NULL) { + udp_t *udp; + ino_t inode; + int state = 0; + sonode_t *so = (sonode_t *)connp->conn_upper_handle; + if (connp->conn_ipversion != ipver) + continue; + udp = connp->conn_udp; + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, + "%4d: %08X:%04X %08X:%04X ", + ++sl, + connp->conn_laddr_v4, + ntohs(connp->conn_lport), + connp->conn_faddr_v4, + ntohs(connp->conn_fport)); + } else { + lxpr_uiobuf_printf(uiobuf, "%4d: " + "%08X%08X%08X%08X:%04X " + "%08X%08X%08X%08X:%04X ", + ++sl, + connp->conn_laddr_v6.s6_addr32[0], + connp->conn_laddr_v6.s6_addr32[1], + connp->conn_laddr_v6.s6_addr32[2], + connp->conn_laddr_v6.s6_addr32[3], + ntohs(connp->conn_lport), + connp->conn_faddr_v6.s6_addr32[0], + connp->conn_faddr_v6.s6_addr32[1], + connp->conn_faddr_v6.s6_addr32[2], + connp->conn_faddr_v6.s6_addr32[3], + ntohs(connp->conn_fport)); + } + + switch (udp->udp_state) { + case TS_UNBND: + case TS_IDLE: + state = 7; + break; + case TS_DATA_XFER: + state = 1; + break; + } + + /* + * We cannot use VOP_GETATTR here to fetch the + * simulated inode for the socket via the + * so->so_vnode. This is because there is a (very + * tight) race for when the v_vfsp is set on the + * sonode's vnode. 
However, all we really want here is + * the inode number, which we can compute using the + * same algorithm as socket_vop_getattr. + */ + inode = ((ino_t)so >> sonode_shift) & 0xFFFF; + + lxpr_uiobuf_printf(uiobuf, + "%02X %08X:%08X %02X:%08X %08X " + "%5u %8d %lu %d %p %d\n", + state, + 0, 0, /* rx/tx queue */ + 0, 0, /* tr, when */ + 0, /* retrans */ + connp->conn_cred->cr_uid, + 0, /* timeout */ + /* inode, ref, pointer, drops */ + inode, 0, NULL, 0); + } + } + netstack_rele(ns); +} + +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_udp(lxpnp, uiobuf, IPV4_VERSION); +} + +static void +lxpr_read_net_udp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_udp(lxpnp, uiobuf, IPV6_VERSION); +} + +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + sonode_t *so; + zoneid_t zoneid = LXPTOZ(lxpnp)->zone_id; + + lxpr_uiobuf_printf(uiobuf, "Num RefCount Protocol Flags Type " + "St Inode Path\n"); + + mutex_enter(&socklist.sl_lock); + for (so = socklist.sl_list; so != NULL; + so = _SOTOTPI(so)->sti_next_so) { + vnode_t *vp = so->so_vnode; + vattr_t attr; + sotpi_info_t *sti; + const char *name = NULL; + int status = 0; + int type = 0; + int flags = 0; + + /* Only process active sonodes in this zone */ + if (so->so_count == 0 || so->so_zoneid != zoneid) + continue; + + /* + * Grab the inode, if possible. + * This must be done before entering so_lock. 
+ */ + if (vp == NULL || + VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0) + attr.va_nodeid = 0; + + mutex_enter(&so->so_lock); + sti = _SOTOTPI(so); + + if (sti->sti_laddr_sa != NULL && + sti->sti_laddr_len > 0) { + name = sti->sti_laddr_sa->sa_data; + } else if (sti->sti_faddr_sa != NULL && + sti->sti_faddr_len > 0) { + name = sti->sti_faddr_sa->sa_data; + } + + /* + * Derived from enum values in Linux kernel source: + * include/uapi/linux/net.h + */ + if ((so->so_state & SS_ISDISCONNECTING) != 0) { + status = 4; + } else if ((so->so_state & SS_ISCONNECTING) != 0) { + status = 2; + } else if ((so->so_state & SS_ISCONNECTED) != 0) { + status = 3; + } else { + status = 1; + /* Add ACC flag for stream-type server sockets */ + if (so->so_type != SOCK_DGRAM && + sti->sti_laddr_sa != NULL) + flags |= 0x10000; + } + + /* Convert to Linux type */ + switch (so->so_type) { + case SOCK_DGRAM: + type = 2; + break; + case SOCK_SEQPACKET: + type = 5; + break; + default: + type = 1; + } + + lxpr_uiobuf_printf(uiobuf, "%p: %08X %08X %08X %04X %02X %5llu", + so, + so->so_count, + 0, /* proto, always 0 */ + flags, + type, + status, + (ino_t)attr.va_nodeid); + + /* + * Due to shortcomings in the abstract socket emulation, they + * cannot be properly represented here (as @<path>). + * + * This will be the case until they are better implemented. + */ + if (name != NULL) + lxpr_uiobuf_printf(uiobuf, " %s\n", name); + else + lxpr_uiobuf_printf(uiobuf, "\n"); + mutex_exit(&so->so_lock); + } + mutex_exit(&socklist.sl_lock); +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced, unless we're open non-blocking, in which case we return after + * 1ms. 
+ */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + timestruc_t to; + timestruc_t *tp = NULL; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (lxpr_uiobuf_nonblock(uiobuf)) { + to.tv_sec = 0; + to.tv_nsec = 1000000; /* 1msec */ + tp = &to; + } + + if (ldi_getmsg(lh, &mp, tp) == 0) { + /* + * lx procfs doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. + */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(LXPTOZ(lxpnp)); + + ASSERT(LXPTOZ(lxpnp) != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? 
+ &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + lx_zone_data_t *lxzd = ztolxzd(zone); + ulong_t total_mem, free_mem, total_swap; + boolean_t swap_disabled; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(lxzd != NULL); + swap_disabled = lxzd->lxzd_swap_disabled; + + zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem, + (pgcnt_t *)&free_mem); + total_mem = ptob(total_mem); + free_mem = ptob(free_mem); + + if (swap_disabled) { + total_swap = 0; + } else { + if (zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = ptob(k_anoninfo.ani_max); + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + mutex_exit(&zone->zone_mem_lock); + } + } + + /* + * SwapFree + * On illumos we reserve swap up front, whereas on Linux they just + * wing it and kill a random process if they run out of backing store + * for virtual memory. 
Our swap reservation doesn't translate to that + * model, so just inform the caller that no swap is being used. + * + * MemAvailable + * MemAvailable entry is available since Linux Kernel +3.14, is an + * estimate of how much memory is available for starting new + * applications, without swapping. In lxbrand we will always return the + * available free memory as an estimate of this value. + */ + lxpr_uiobuf_printf(uiobuf, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached: %8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + btok(free_mem), /* MemAvailable */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + * + * Note: we currently also use this for /proc/{pid}/mounts since we don't + * yet support mount namespaces. 
+ */
+static void
+lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	list_t *mounts;
+	lxpr_mount_entry_t *lme;
+
+	mounts = lxpr_enumerate_mounts(zone);
+
+	/*
+	 * The list is a private snapshot, so it can be drained here without
+	 * holding vfs_list_read_lock().  Every entry is consumed and freed,
+	 * whether or not it ends up producing a line of output.
+	 */
+	while ((lme = list_remove_head(mounts)) != NULL) {
+		char *mountp = (char *)refstr_value(lme->lme_mntpt);
+		char *special = (char *)refstr_value(lme->lme_resource);
+		char *fsname;
+		vnode_t *mvp;
+		boolean_t is_root;
+
+		/* An entry without a mount point cannot be reported */
+		if (mountp == NULL || mountp[0] == '\0')
+			goto release;
+
+		/* The translated path must still resolve ... */
+		mountp = ZONE_PATH_TRANSLATE(mountp, zone);
+		if (lookupname(mountp, UIO_SYSSPACE, FOLLOW, NULLVPP,
+		    &mvp) != 0)
+			goto release;
+
+		/* ... to the root of a filesystem, unless forced */
+		is_root = (mvp->v_flag & VROOT) != 0;
+		VN_RELE(mvp);
+		if (!is_root && !lme->lme_force)
+			goto release;
+
+		if (special == NULL || special[0] == '\0') {
+			special = "none";
+		} else if (special[0] == '/') {
+			/* Hide resource paths from outside the zone */
+			special = ZONE_PATH_VISIBLE(special, zone) ?
+			    ZONE_PATH_TRANSLATE(special, zone) : mountp;
+		}
+
+		/* Make things look more like Linux. */
+		fsname = vfssw[lme->lme_fstype].vsw_name;
+		if (lxpr_clean_mntent(&mountp, &fsname, &special) != 0 &&
+		    !lme->lme_force)
+			goto release;
+
+		lxpr_uiobuf_printf(uiobuf, "%s %s %s %s 0 0\n",
+		    special, mountp, fsname, lme->lme_mntopts);
+
+release:
+		refstr_rele(lme->lme_mntpt);
+		refstr_rele(lme->lme_resource);
+		kmem_free(lme->lme_mntopts, lme->lme_mntopts_len);
+		kmem_free(lme, sizeof (lxpr_mount_entry_t));
+	}
+
+	list_destroy(mounts);
+	kmem_free(mounts, sizeof (list_t));
+}
+
+/*
+ * lxpr_read_partitions():
+ *
+ * Over the years, /proc/partitions has been made considerably smaller -- to
+ * the point that it really is only major number, minor number, number of
+ * blocks (which we report as 0), and partition name.
+ * + * We support this because some things want to see it to make sense of + * /proc/diskstats, and also because "fdisk -l" and a few other things look + * here to find all disks on the system. + */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lx_zone_data_t *lxzd; + lx_virt_disk_t *vd; + + ASSERT(lxpnp->lxpr_type == LXPR_PARTITIONS); + + lxpr_uiobuf_printf(uiobuf, "major minor #blocks name\n\n"); + + lxzd = ztolxzd(LXPTOZ(lxpnp)); + if (lxzd == NULL) + return; + ASSERT(lxzd->lxzd_vdisks != NULL); + + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + lxpr_uiobuf_printf(uiobuf, "%4d %7d %10d %s\n", + getmajor(vd->lxvd_emul_dev), getminor(vd->lxvd_emul_dev), + 0, vd->lxvd_name); + vd = list_next(lxzd->lxzd_vdisks, vd); + } +} + +/* + * There aren't many actual devices inside a zone but we want to provide the + * major numbers for the pseudo devices that do exist, including our pts/ptm + * device, as well as the zvol virtual disk device. We simply hardcode the + * emulated major numbers that are used elsewhere in the code and that match + * the expected Linux major numbers. See lx devfs where some of the major + * numbers have no defined constants. + */ +static void +lxpr_read_devices(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_DEVICES); + + lxpr_uiobuf_printf(uiobuf, "Character devices:\n"); + lxpr_uiobuf_printf(uiobuf, "%3d /dev/tty\n", LX_TTY_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d /dev/console\n", LX_TTY_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d /dev/ptmx\n", LX_TTY_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d ptm\n", LX_PTM_MAJOR); + lxpr_uiobuf_printf(uiobuf, "%3d pts\n", LX_PTS_MAJOR_MIN); + + lxpr_uiobuf_printf(uiobuf, "\nBlock devices:\n"); + lxpr_uiobuf_printf(uiobuf, "%3d zvol\n", LX_MAJOR_DISK); +} + +/* + * lxpr_read_diskstats(): + * + * See the block comment above the per-device output-generating line for the + * details of the format. 
+ */ +static void +lxpr_read_diskstats(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + lx_zone_data_t *lxzd; + kstat_t kn; + int num; + zone_vfs_kstat_t *kip; + size_t size; + lx_virt_disk_t *vd; + + ASSERT(lxpnp->lxpr_type == LXPR_DISKSTATS); + + lxzd = ztolxzd(zone); + if (lxzd == NULL) + return; + ASSERT(lxzd->lxzd_vdisks != NULL); + + /* + * Use the zone_vfs kstat, which is a superset of a kstat_io_t, since + * it tracks IO at the zone level. + */ + (void) strlcpy(kn.ks_module, "zone_vfs", sizeof (kn.ks_module)); + (void) strlcpy(kn.ks_name, zone->zone_name, sizeof (kn.ks_name)); + kn.ks_instance = zone->zone_id; + + kip = (zone_vfs_kstat_t *)lxpr_kstat_read(&kn, B_TRUE, &size, &num, + zone->zone_id); + if (kip == NULL) + return; + + if (size < sizeof (kstat_io_t)) { + kmem_free(kip, size); + return; + } + + /* + * Because the zone vfs stats are tracked at the zone level we use + * the same kstat for the zone's virtual disk (the zpool) and any + * zvols that might also visible within the zone. + */ + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + /* + * /proc/diskstats is defined to have one line of output for + * each block device, with each line containing the following + * 14 fields: + * + * 1 - major number + * 2 - minor mumber + * 3 - device name + * 4 - reads completed successfully + * 5 - reads merged + * 6 - sectors read + * 7 - time spent reading (ms) + * 8 - writes completed + * 9 - writes merged + * 10 - sectors written + * 11 - time spent writing (ms) + * 12 - I/Os currently in progress + * 13 - time spent doing I/Os (ms) + * 14 - weighted time spent doing I/Os (ms) + * + * One small hiccup: we don't actually keep track of time + * spent reading vs. time spent writing -- we keep track of + * time waiting vs. time actually performing I/O. 
While we + * could divide the total time by the I/O mix (making the + * obviously wrong assumption that I/O operations all take the + * same amount of time), this has the undesirable side-effect + * of moving backwards. Instead, we report the total time + * (read + write) for all three stats (read, write, total). + * This is also a lie of sorts, but it should be more + * immediately clear to the user that reads and writes are + * each being double-counted as the other. + * + * Since certain consumers interpret the major/minor numbers to + * infer device names, some translation is required to avoid + * output which results in totally unexpected results. + */ + + lxpr_uiobuf_printf(uiobuf, "%4d %7d %s ", + getmajor(vd->lxvd_emul_dev), + getminor(vd->lxvd_emul_dev), + vd->lxvd_name); + + if (vd->lxvd_type == LXVD_ZFS_DS) { + /* + * Use the zone-wide vfs stats for any zfs datasets + * represented via virtual devices. + */ +#define KV(N) kip->zv_ ## N.value.ui64 +#define NS_PER_MS (uint64_t)(NANOSEC / MILLISEC) + lxpr_uiobuf_printf(uiobuf, + "%llu %llu %llu %llu " + "%llu %llu %llu %llu " + "%llu %llu %llu\n", + (uint64_t)KV(reads), 0LL, + KV(nread) / (uint64_t)LXPR_SECTOR_SIZE, + (KV(rtime) + KV(wtime)) / NS_PER_MS, + (uint64_t)KV(writes), 0LL, + KV(nwritten) / (uint64_t)LXPR_SECTOR_SIZE, + (KV(rtime) + KV(wtime)) / NS_PER_MS, + (uint64_t)(KV(rcnt) + KV(wcnt)), + (KV(rtime) + KV(wtime)) / NS_PER_MS, + (KV(rlentime) + KV(wlentime)) / NS_PER_MS); +#undef KV +#undef NS_PER_MS + } else { + /* + * Report nearly-zeroed statistics for other devices. + * + * Since iostat will ignore devices which report no + * succesful reads or writes, a single read of one + * sector, taking 1ms, is reported. + */ + lxpr_uiobuf_printf(uiobuf, + "1 0 1 1 0 0 0 0 0 0 0\n"); + } + + vd = list_next(lxzd->lxzd_vdisks, vd); + } + + kmem_free(kip, size); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. 
+ */
+static void
+lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp));
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	char release[LX_KERN_RELEASE_MAX];
+	char version[LX_KERN_VERSION_MAX];
+
+	/* Snapshot the zone-wide strings while holding the zone data lock. */
+	mutex_enter(&lxzd->lxzd_lock);
+	(void) strlcpy(release, lxzd->lxzd_kernel_release, sizeof (release));
+	(void) strlcpy(version, lxzd->lxzd_kernel_version, sizeof (version));
+	mutex_exit(&lxzd->lxzd_lock);
+
+	/* Use per-process overrides, if specified */
+	if (lxpd != NULL && lxpd->l_uname_release[0] != '\0') {
+		(void) strlcpy(release, lxpd->l_uname_release,
+		    sizeof (release));
+	}
+	if (lxpd != NULL && lxpd->l_uname_version[0] != '\0') {
+		(void) strlcpy(version, lxpd->l_uname_version,
+		    sizeof (version));
+	}
+
+	lxpr_uiobuf_printf(uiobuf,
+	    "%s version %s (%s version %d.%d.%d) %s\n",
+	    LX_UNAME_SYSNAME, release,
+#if defined(__GNUC__)
+	    "gcc", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__,
+#else
+	    "cc", 1, 0, 0,
+#endif
+	    version);
+}
+
+/*
+ * lxpr_read_vmstat(): emit the pgpgin/pgpgout/pswpin/pswpout counters,
+ * summed over the CPUs reachable from the current CPU's partition list.
+ */
+static void
+lxpr_read_vmstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+
+	ulong_t pgpgin_cum = 0;
+	ulong_t pgpgout_cum = 0;
+	ulong_t pgswapout_cum = 0;
+	ulong_t pgswapin_cum = 0;
+
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+	/* Calculate cumulative stats */
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		cpu_t *cur = cp;
+
+		/*
+		 * Step to the next CPU before examining this one.  A
+		 * `continue' in a do/while jumps straight to the loop test,
+		 * so advancing at the bottom of the body would never move
+		 * past a skipped CPU.
+		 */
+		cp = pools_enabled ? cur->cpu_next_part : cur->cpu_next;
+
+		/* Only count CPUs which are present and active. */
+		if ((cur->cpu_flags & CPU_EXISTS) == 0) {
+			continue;
+		}
+
+		pgpgin_cum += CPU_STATS(cur, vm.pgpgin);
+		pgpgout_cum += CPU_STATS(cur, vm.pgpgout);
+		pgswapin_cum += CPU_STATS(cur, vm.pgswapin);
+		pgswapout_cum += CPU_STATS(cur, vm.pgswapout);
+	} while (cp != cpstart);
+	mutex_exit(&cpu_lock);
+
+	/*
+	 * Needless to say, the metrics presented by vmstat are very specific
+	 * to the internals of the Linux kernel.  There is little per-zone
+	 * information which can be translated in a meaningful way to fit the
+	 * expected fields. For the time being, the output is kept sparse.
+	 */
+	lxpr_uiobuf_printf(uiobuf,
+	    "pgpgin %lu\n"
+	    "pgpgout %lu\n"
+	    "pswpin %lu\n"
+	    "pswpout %lu\n",
+	    pgpgin_cum,
+	    pgpgout_cum,
+	    pgswapin_cum,
+	    pgswapout_cum);
+}
+
+/*
+ * lxpr_read_stat(): read the contents of the "stat" file.
+ *
+ * Emits one aggregate "cpu" line, one "cpuN" line per existing CPU, and then
+ * the page/swap/intr/ctxt/btime/processes counters.  The number of columns
+ * on the cpu lines depends on the kernel release the zone emulates.
+ */
+static void
+lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	cpu_t *cp, *cpstart;
+	int pools_enabled;
+	ulong_t idle_cum = 0;
+	ulong_t sys_cum = 0;
+	ulong_t user_cum = 0;
+	ulong_t irq_cum = 0;
+	ulong_t cpu_nrunnable_cum = 0;
+	ulong_t w_io_cum = 0;
+
+	ulong_t pgpgin_cum = 0;
+	ulong_t pgpgout_cum = 0;
+	ulong_t pgswapout_cum = 0;
+	ulong_t pgswapin_cum = 0;
+	ulong_t intr_cum = 0;
+	ulong_t pswitch_cum = 0;
+	ulong_t forks_cum = 0;
+	hrtime_t msnsecs[NCMSTATES];
+	/* is the emulated release > 2.4 */
+	boolean_t newer_than24 = lx_kern_release_cmp(LXPTOZ(lxpnp), "2.4") > 0;
+	zone_t *zone = LXPTOZ(lxpnp);
+	const char *fmtstr0, *fmtstr1;
+	/* temporary variable since scalehrtime modifies data in place */
+	hrtime_t tmptime;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_STAT);
+
+	mutex_enter(&cpu_lock);
+	pools_enabled = pool_pset_enabled();
+
+	/* Calculate cumulative stats */
+	cp = cpstart = CPU->cpu_part->cp_cpulist;
+	do {
+		cpu_t *cur = cp;
+		int i;
+
+		/*
+		 * Step to the next CPU before examining this one.  A
+		 * `continue' in a do/while jumps straight to the loop test,
+		 * so advancing at the bottom of the body would never move
+		 * past a skipped CPU.
+		 */
+		cp = pools_enabled ? cur->cpu_next_part : cur->cpu_next;
+
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.
+		 */
+		if ((cur->cpu_flags & CPU_EXISTS) == 0) {
+			continue;
+		}
+
+		get_cpu_mstate(cur, msnsecs);
+
+		idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]);
+		sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]);
+		user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]);
+
+		pgpgin_cum += CPU_STATS(cur, vm.pgpgin);
+		pgpgout_cum += CPU_STATS(cur, vm.pgpgout);
+		pgswapin_cum += CPU_STATS(cur, vm.pgswapin);
+		pgswapout_cum += CPU_STATS(cur, vm.pgswapout);
+
+
+		if (newer_than24) {
+			cpu_nrunnable_cum += cur->cpu_disp->disp_nrunnable;
+			w_io_cum += CPU_STATS(cur, sys.iowait);
+			for (i = 0; i < NCMSTATES; i++) {
+				tmptime = cur->cpu_intracct[i];
+				scalehrtime(&tmptime);
+				irq_cum += NSEC_TO_TICK(tmptime);
+			}
+		}
+
+		for (i = 0; i < PIL_MAX; i++)
+			intr_cum += CPU_STATS(cur, sys.intr[i]);
+
+		pswitch_cum += CPU_STATS(cur, sys.pswitch);
+		forks_cum += CPU_STATS(cur, sys.sysfork);
+		forks_cum += CPU_STATS(cur, sys.sysvfork);
+	} while (cp != cpstart);
+
+	if (lx_kern_release_cmp(zone, "2.6.33") >= 0) {
+		fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0 0 0 0\n";
+		fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0 0 0 0\n";
+	} else if (lx_kern_release_cmp(zone, "2.6.24") >= 0) {
+		fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0 0 0\n";
+		fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0 0 0\n";
+	} else if (lx_kern_release_cmp(zone, "2.6.11") >= 0) {
+		fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0 0\n";
+		fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0 0\n";
+	} else if (lx_kern_release_cmp(zone, "2.5.41") >= 0) {
+		fmtstr0 = "cpu %lu 0 %lu %lu 0 %lu 0\n";
+		fmtstr1 = "cpu%d %lu 0 %lu %lu 0 %lu 0\n";
+	} else {
+		/* Note: we pass an unused param to these fmt strings */
+		fmtstr0 = "cpu %lu 0 %lu %lu\n";
+		fmtstr1 = "cpu%d %lu 0 %lu %lu\n";
+	}
+
+	/* Adjust hz */
+	user_cum = HZ_TO_LX_USERHZ(user_cum);
+	sys_cum = HZ_TO_LX_USERHZ(sys_cum);
+	idle_cum = HZ_TO_LX_USERHZ(idle_cum);
+	irq_cum = HZ_TO_LX_USERHZ(irq_cum);
+
+	lxpr_uiobuf_printf(uiobuf, fmtstr0,
+	    user_cum, sys_cum, idle_cum, irq_cum);
+
+	/*
+	 * Do per processor stats.  The walk above stops once cp is back at
+	 * cpstart, so this second pass starts from the same CPU.
+	 */
+	do {
+		cpu_t *cur = cp;
+		int i;
+
+		ulong_t idle_ticks;
+		ulong_t sys_ticks;
+		ulong_t user_ticks;
+		ulong_t irq_ticks = 0;
+
+		/* Advance first; see the cumulative loop above. */
+		cp = pools_enabled ? cur->cpu_next_part : cur->cpu_next;
+
+		/*
+		 * Don't count CPUs that aren't even in the system
+		 * or aren't up yet.
+		 */
+		if ((cur->cpu_flags & CPU_EXISTS) == 0) {
+			continue;
+		}
+
+		get_cpu_mstate(cur, msnsecs);
+
+		idle_ticks = HZ_TO_LX_USERHZ(NSEC_TO_TICK(msnsecs[CMS_IDLE]));
+		sys_ticks = HZ_TO_LX_USERHZ(NSEC_TO_TICK(msnsecs[CMS_SYSTEM]));
+		user_ticks = HZ_TO_LX_USERHZ(NSEC_TO_TICK(msnsecs[CMS_USER]));
+
+		for (i = 0; i < NCMSTATES; i++) {
+			tmptime = cur->cpu_intracct[i];
+			scalehrtime(&tmptime);
+			irq_ticks += NSEC_TO_TICK(tmptime);
+		}
+		irq_ticks = HZ_TO_LX_USERHZ(irq_ticks);
+
+		/*
+		 * The first argument is the CPU id for the "cpu%d" label; it
+		 * is an identifier, not a tick count, so it must not go
+		 * through the hz conversion.
+		 */
+		lxpr_uiobuf_printf(uiobuf, fmtstr1, cur->cpu_id,
+		    user_ticks, sys_ticks, idle_ticks, irq_ticks);
+	} while (cp != cpstart);
+
+	mutex_exit(&cpu_lock);
+
+	if (newer_than24) {
+		lxpr_uiobuf_printf(uiobuf,
+		    "page %lu %lu\n"
+		    "swap %lu %lu\n"
+		    "intr %lu\n"
+		    "ctxt %lu\n"
+		    "btime %lu\n"
+		    "processes %lu\n"
+		    "procs_running %lu\n"
+		    "procs_blocked %lu\n",
+		    pgpgin_cum, pgpgout_cum,
+		    pgswapin_cum, pgswapout_cum,
+		    intr_cum,
+		    pswitch_cum,
+		    zone->zone_boot_time,
+		    forks_cum,
+		    cpu_nrunnable_cum,
+		    w_io_cum);
+	} else {
+		lxpr_uiobuf_printf(uiobuf,
+		    "page %lu %lu\n"
+		    "swap %lu %lu\n"
+		    "intr %lu\n"
+		    "ctxt %lu\n"
+		    "btime %lu\n"
+		    "processes %lu\n",
+		    pgpgin_cum, pgpgout_cum,
+		    pgswapin_cum, pgswapout_cum,
+		    intr_cum,
+		    pswitch_cum,
+		    zone->zone_boot_time,
+		    forks_cum);
+	}
+}
+
+/*
+ * lxpr_read_swaps():
+ *
+ * We don't support swap files or partitions, but some programs like to look
+ * here just to check we have some swap on the system, so we lie and show
+ * our entire swap cap as one swap partition. See lxpr_read_meminfo for an
+ * explanation on why we report 0 used swap.
+ *
+ * The zone's lxzd_swap_disabled boolean controls whether or not we pretend
+ * swap space is configured.
+ *
+ * It is important to use formatting identical to the Linux implementation
+ * so that consumers do not break. See swap_show() in mm/swapfile.c.
+ */
+static void
+lxpr_read_swaps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	boolean_t swap_enabled;
+	lx_zone_data_t *lxzd = ztolxzd(zone);
+
+	ASSERT(zone->zone_brand == &lx_brand);
+	ASSERT(lxzd != NULL);
+	swap_enabled = !lxzd->lxzd_swap_disabled;
+
+	/* The header line is emitted whether or not swap is shown. */
+	lxpr_uiobuf_printf(uiobuf,
+	    "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+
+	if (swap_enabled) {
+		uint64_t totswap, usedswap;
+
+		/*
+		 * A cap of UINT64_MAX is handled separately: the size is then
+		 * derived from k_anoninfo.ani_max pages instead of the cap.
+		 */
+		if (zone->zone_max_swap_ctl == UINT64_MAX) {
+			totswap = (k_anoninfo.ani_max * PAGESIZE) >> 10;
+		} else {
+			mutex_enter(&zone->zone_mem_lock);
+			/* Uses units of 1 kb (2^10). */
+			totswap = zone->zone_max_swap_ctl >> 10;
+			mutex_exit(&zone->zone_mem_lock);
+		}
+		usedswap = 0;
+
+		lxpr_uiobuf_printf(uiobuf, "%-40s%s\t%llu\t%llu\t%d\n",
+		    "/dev/swap", "partition", totswap, usedswap, -1);
+	}
+}
+
+/*
+ * Emit the LX_AIO_MAX_NR constant for the LXPR_SYS_FS_AIO_MAX_NR node.
+ */
+static void
+lxpr_read_sys_fs_aiomax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_AIO_MAX_NR);
+	lxpr_uiobuf_printf(uiobuf, "%llu\n", LX_AIO_MAX_NR);
+}
+
+/*
+ * Emit the zone's lxzd_aio_nr counter for the LXPR_SYS_FS_AIO_NR node.  The
+ * value is sampled under lxzd_lock and printed after the lock is dropped.
+ */
+static void
+lxpr_read_sys_fs_aionr(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	lx_zone_data_t *lxzd = ztolxzd(zone);
+	uint64_t curr;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_AIO_NR);
+	ASSERT(zone->zone_brand == &lx_brand);
+	ASSERT(lxzd != NULL);
+
+	mutex_enter(&lxzd->lxzd_lock);
+	curr = (uint64_t)(lxzd->lxzd_aio_nr);
+	mutex_exit(&lxzd->lxzd_lock);
+	lxpr_uiobuf_printf(uiobuf, "%llu\n", curr);
+}
+
+/*
+ * lxpr_read_sys_fs_filemax():
+ *
+ * The zone's total number of open files is not fixed or tunable, but we can
+ * provide a number by taking:
+ * (zone's proc limit) * (process.max-file-descriptor rctl privileged limit).
+ * The privileged rctl limit is the same as rlim_fd_max.
+ */
+static void
+lxpr_read_sys_fs_filemax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	uint64_t max_fh, proc_lim;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_FILEMAX);
+	/* A process cap of INT_MAX is replaced by maxpid. */
+	proc_lim = (uint64_t)(zone->zone_nprocs_ctl == INT_MAX ?
+	    maxpid : zone->zone_nprocs_ctl);
+	max_fh = proc_lim * (uint64_t)rlim_fd_max;
+	lxpr_uiobuf_printf(uiobuf, "%llu\n", max_fh);
+}
+
+/*
+ * lxpr_read_sys_fs_filenr():
+ *
+ * Contains 3 numbers: current number of allocated file handles (open files),
+ * number of free file handles, and max. number of file handles (same value as
+ * we use in lxpr_read_sys_fs_filemax). Note that since Linux 2.6 the "free"
+ * value is always 0, so we just do the same here. We don't keep track of the
+ * number of files in use within a zone, so we approximate that value by
+ * looking at the current "fi_nfiles" value for each process in the zone.
+ */
+static void
+lxpr_read_sys_fs_filenr(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	uint64_t max_fh, proc_lim, curr_files = 0;
+	int i;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_FILENR);
+	/* Same computation as lxpr_read_sys_fs_filemax(). */
+	proc_lim = (uint64_t)(zone->zone_nprocs_ctl == INT_MAX ?
+	    maxpid : zone->zone_nprocs_ctl);
+	max_fh = proc_lim * (uint64_t)rlim_fd_max;
+
+	/* Walk the process table slots from 1 up to v.v_proc. */
+	for (i = 1; i < v.v_proc; i++) {
+		uint_t nfiles;
+		proc_t *p;
+		uf_info_t *fip;
+
+		/* pidlock is taken and dropped once per slot. */
+		mutex_enter(&pidlock);
+
+		/*
+		 * Skip empty slots, processes still in SIDL, pid 0,
+		 * processes of other zones, this zone's zsched, and
+		 * processes the caller may not inspect.
+		 */
+		if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL ||
+		    p->p_pid == 0 || p->p_zone != zone ||
+		    p == zone->zone_zsched ||
+		    secpolicy_basic_procinfo(CRED(), p, curproc) != 0) {
+			mutex_exit(&pidlock);
+			continue;
+		}
+
+		/* fi_nfiles is sampled under fi_lock, inside pidlock. */
+		fip = P_FINFO(p);
+		mutex_enter(&fip->fi_lock);
+		nfiles = fip->fi_nfiles;
+		mutex_exit(&fip->fi_lock);
+
+		mutex_exit(&pidlock);
+
+		curr_files += nfiles;
+	}
+
+	lxpr_uiobuf_printf(uiobuf, "%llu\t0\t%llu\n", curr_files, max_fh);
+}
+
+/*
+ * inotify tunables exported via /proc.
+ */
+extern int inotify_maxevents;
+extern int inotify_maxinstances;
+extern int inotify_maxwatches;
+
+/* Emit the inotify_maxevents tunable. */
+static void
+lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *lxpnp,
+    lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxevents);
+}
+
+/* Emit the inotify_maxinstances tunable. */
+static void
+lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *lxpnp,
+    lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxinstances);
+}
+
+/* Emit the inotify_maxwatches tunable. */
+static void
+lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *lxpnp,
+    lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxwatches);
+}
+
+/* Emit the zone's lxzd_pipe_max_sz, sampled under lxzd_lock. */
+static void
+lxpr_read_sys_fs_pipe_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp));
+	uint_t pipe_max;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_PIPE_MAX);
+	ASSERT(lxzd != NULL);
+
+	mutex_enter(&lxzd->lxzd_lock);
+	pipe_max = lxzd->lxzd_pipe_max_sz;
+	mutex_exit(&lxzd->lxzd_lock);
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", pipe_max);
+}
+
+/* Emit the LX_CAP_MAX_VALID constant. */
+static void
+lxpr_read_sys_kernel_caplcap(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_CAPLCAP);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", LX_CAP_MAX_VALID);
+}
+
+/*
+ * Emit the zone's default core file path after passing it through
+ * lxpr_core_path_s2l().  An empty line is emitted when per-process core
+ * paths are disabled, no path is set, or the conversion fails.
+ */
+static void
+lxpr_read_sys_kernel_corepatt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	struct core_globals *cg;
+	refstr_t *rp;
+	corectl_path_t *ccp;
+	char tr[MAXPATHLEN];
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT);
+
+	cg = zone_getspecific(core_zone_key, zone);
+	ASSERT(cg != NULL);
+
+	/* If core dumps are disabled, return an empty string. */
+	if ((cg->core_options & CC_PROCESS_PATH) == 0) {
+		lxpr_uiobuf_printf(uiobuf, "\n");
+		return;
+	}
+
+	/*
+	 * Take a hold on the path string under ccp_mtx so it can be used
+	 * after the mutex is dropped; every exit path below releases it.
+	 */
+	ccp = cg->core_default_path;
+	mutex_enter(&ccp->ccp_mtx);
+	if ((rp = ccp->ccp_path) != NULL)
+		refstr_hold(rp);
+	mutex_exit(&ccp->ccp_mtx);
+
+	if (rp == NULL) {
+		lxpr_uiobuf_printf(uiobuf, "\n");
+		return;
+	}
+
+	bzero(tr, sizeof (tr));
+	if (lxpr_core_path_s2l(refstr_value(rp), tr, sizeof (tr)) != 0) {
+		refstr_rele(rp);
+		lxpr_uiobuf_printf(uiobuf, "\n");
+		return;
+	}
+
+	refstr_rele(rp);
+	lxpr_uiobuf_printf(uiobuf, "%s\n", tr);
+}
+
+/* Emit the value returned by uts_nodename(). */
+static void
+lxpr_read_sys_kernel_hostname(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_HOSTNAME);
+	lxpr_uiobuf_printf(uiobuf, "%s\n", uts_nodename());
+}
+
+static void
+lxpr_read_sys_kernel_msgmax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	/*
+	 * We don't have an rctl for this. See our definition for LX_MSGMAX
+	 * in the user-level emulation library. Once that code moves into
+	 * the kernel, we can use a common definition. This matches the
+	 * value on Linux.
+	 */
+	uint_t val = 8192;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMAX);
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", val);
+}
+
+/*
+ * Emit the enforced rc_process_msgmnb value for the calling process.  The
+ * rctl is read with p_lock held and the result is printed as a uint_t.
+ */
+static void
+lxpr_read_sys_kernel_msgmnb(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	rctl_qty_t val;
+	proc_t *pp = curproc;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMNB);
+
+	mutex_enter(&pp->p_lock);
+	val = rctl_enforced_value(rc_process_msgmnb, pp->p_rctls, pp);
+	mutex_exit(&pp->p_lock);
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val);
+}
+
+/*
+ * Emit the enforced rc_zone_msgmni value from the zone's rctl set, read
+ * with curproc's p_lock held and printed as a uint_t.
+ */
+static void
+lxpr_read_sys_kernel_msgmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	rctl_qty_t val;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMNI);
+
+	mutex_enter(&curproc->p_lock);
+	val = rctl_enforced_value(rc_zone_msgmni,
+	    LXPTOZ(lxpnp)->zone_rctls, curproc);
+	mutex_exit(&curproc->p_lock);
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val);
+}
+
+/* Emit the ngroups_max tunable. */
+static void
+lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_NGROUPS_MAX);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", ngroups_max);
+}
+
+/*
+ * Emit a copy of the zone's lxzd_kernel_version string, taken under
+ * lxzd_lock.
+ * NOTE(review): the node is named OSREL but the string copied is the
+ * kernel *version* field, not lxzd_kernel_release -- confirm this is
+ * intended.
+ */
+static void
+lxpr_read_sys_kernel_osrel(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	zone_t *zone = LXPTOZ(lxpnp);
+	lx_zone_data_t *lxzd = ztolxzd(zone);
+	char version[LX_KERN_VERSION_MAX];
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_OSREL);
+	ASSERT(zone->zone_brand == &lx_brand);
+	ASSERT(lxzd != NULL);
+
+	mutex_enter(&lxzd->lxzd_lock);
+	(void) strlcpy(version, lxzd->lxzd_kernel_version, sizeof (version));
+	mutex_exit(&lxzd->lxzd_lock);
+	lxpr_uiobuf_printf(uiobuf, "%s\n", version);
+}
+
+/* Emit the maxpid tunable. */
+static void
+lxpr_read_sys_kernel_pid_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_PID_MAX);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", maxpid);
+}
+
+/*
+ * Format 16 random bytes into `uuid' (at most `size' bytes) as lower-case
+ * hex in 8-4-4-4-12 grouping.  random_get_pseudo_bytes() is used as a
+ * fallback when random_get_bytes() reports failure.
+ */
+static void
+lxpr_gen_uuid(char *uuid, size_t size)
+{
+	uint8_t r[16];
+	if (random_get_bytes(r, sizeof (r)) != 0) {
+		(void) random_get_pseudo_bytes(r, sizeof (r));
+	}
+	/* Set UUID version to 4 (random) */
+	r[6] = 0x40 | (r[6] & 0x0f);
+	/* Set UUID variant to 1 */
+	r[8] = 0x80 | (r[8] & 0x3f);
+
+	(void) snprintf(uuid, size,
+	    "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x"
+	    "-%02x%02x%02x%02x%02x%02x",
+	    r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7], r[8],
+	    r[9], r[10], r[11], r[12], r[13], r[14], r[15]);
+}
+
+static void
+lxpr_read_sys_kernel_rand_bootid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	/*
+	 * This file isn't documented on the Linux proc(5) man page but
+	 * according to the blog of the author of systemd/journald (the
+	 * consumer), he says:
+	 *	boot_id: A random ID that is regenerated on each boot. As such it
+	 *	can be used to identify the local machine's current boot. It's
+	 *	universally available on any recent Linux kernel. It's a good and
+	 *	safe choice if you need to identify a specific boot on a specific
+	 *	booted kernel.
+	 *
+	 * On Linux the format appears to resemble a uuid so stick with that.
+	 */
+	zone_t *zone = LXPTOZ(lxpnp);
+	lx_zone_data_t *lxzd = ztolxzd(zone);
+	char bootid[UUID_PRINTABLE_STRING_LENGTH];
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RAND_BOOTID);
+	ASSERT(zone->zone_brand == &lx_brand);
+	ASSERT(lxzd != NULL);
+
+	/*
+	 * The id is generated on the first read (while lxzd_bootid is still
+	 * empty) and the stored value is reused afterwards.
+	 */
+	mutex_enter(&lxzd->lxzd_lock);
+	if (lxzd->lxzd_bootid[0] == '\0') {
+		lxpr_gen_uuid(lxzd->lxzd_bootid, sizeof (lxzd->lxzd_bootid));
+	}
+	(void) strlcpy(bootid, lxzd->lxzd_bootid, sizeof (bootid));
+	mutex_exit(&lxzd->lxzd_lock);
+
+	lxpr_uiobuf_printf(uiobuf, "%s\n", bootid);
+}
+
+/*
+ * The amount of entropy available (in bits).
+ */
+static void
+lxpr_read_sys_kernel_rand_entavl(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RAND_ENTAVL);
+	ASSERT(LXPTOZ(lxpnp)->zone_brand == &lx_brand);
+
+	lxpr_uiobuf_printf(uiobuf, "%d\n", swrand_stats.ss_entEst);
+}
+
+static void
+lxpr_read_sys_kernel_rand_uuid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	/*
+	 * Each read from this read-only file should return a new
+	 * random 128-bit UUID string in the standard UUID format.
+	 */
+	zone_t *zone = LXPTOZ(lxpnp);
+	char uuid[UUID_PRINTABLE_STRING_LENGTH];
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RAND_UUID);
+	ASSERT(zone->zone_brand == &lx_brand);
+
+	lxpr_gen_uuid(uuid, sizeof (uuid));
+
+	lxpr_uiobuf_printf(uiobuf, "%s\n", uuid);
+}
+
+/*
+ * Emit the four semaphore limits.  semmsl and semopm come from the calling
+ * process's rctls, semmni from the zone's rctls, and semmns is derived as
+ * semmsl * semmni, saturating at ULLONG_MAX.
+ */
+static void
+lxpr_read_sys_kernel_sem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	proc_t *pp = curproc;
+	zone_t *zone = LXPTOZ(lxpnp);
+	rctl_qty_t vmsl, vopm, vmni, vmns;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SEM);
+
+	mutex_enter(&pp->p_lock);
+	vmsl = rctl_enforced_value(rc_process_semmsl, pp->p_rctls, pp);
+	vopm = rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
+	vmni = rctl_enforced_value(rc_zone_semmni, zone->zone_rctls, pp);
+	mutex_exit(&pp->p_lock);
+
+	/*
+	 * Detect overflow before multiplying.  Comparing the wrapped product
+	 * against its factors misses some overflows, and wrongly saturates
+	 * when one factor is zero and the other is not.
+	 */
+	if (vmsl != 0 && vmni > ULLONG_MAX / vmsl) {
+		vmns = ULLONG_MAX;
+	} else {
+		vmns = vmsl * vmni;
+	}
+	/*
+	 * Format: semmsl semmns semopm semmni
+	 *  - semmsl: Limit semaphores in a semaphore set.
+	 *  - semmns: Limit semaphores in all semaphore sets
+	 *  - semopm: Limit operations in a single semop call
+	 *  - semmni: Limit number of semaphore sets
+	 */
+	lxpr_uiobuf_printf(uiobuf, "%llu\t%llu\t%llu\t%llu\n",
+	    vmsl, vmns, vopm, vmni);
+}
+
+/*
+ * Emit the zone's enforced rc_zone_shmmax value converted to pages with
+ * btop() and printed as a uint_t.
+ */
+static void
+lxpr_read_sys_kernel_shmall(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	rctl_qty_t val;
+	zone_t *zone = LXPTOZ(lxpnp);
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMALL);
+
+	mutex_enter(&curproc->p_lock);
+	val = rctl_enforced_value(rc_zone_shmmax, zone->zone_rctls, curproc);
+	mutex_exit(&curproc->p_lock);
+
+	/* value is in pages */
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)btop(val));
+}
+
+/*
+ * Emit the zone's enforced rc_zone_shmmax value, clamped to FOURGB and
+ * printed as a uint_t.
+ */
+static void
+lxpr_read_sys_kernel_shmmax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	rctl_qty_t val;
+	zone_t *zone = LXPTOZ(lxpnp);
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMAX);
+
+	mutex_enter(&curproc->p_lock);
+	val = rctl_enforced_value(rc_zone_shmmax, zone->zone_rctls, curproc);
+	mutex_exit(&curproc->p_lock);
+
+	if (val > FOURGB)
+		val = FOURGB;
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val);
+}
+
+/*
+ * Emit the zone's enforced rc_zone_shmmni value, clamped to FOURGB and
+ * printed as a uint_t.
+ */
+static void
+lxpr_read_sys_kernel_shmmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	rctl_qty_t val;
+	zone_t *zone = LXPTOZ(lxpnp);
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMNI);
+
+	mutex_enter(&curproc->p_lock);
+	val = rctl_enforced_value(rc_zone_shmmni, zone->zone_rctls, curproc);
+	mutex_exit(&curproc->p_lock);
+
+	if (val > FOURGB)
+		val = FOURGB;
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val);
+}
+
+/* Emit the zone's zone_nlwps_ctl value. */
+static void
+lxpr_read_sys_kernel_threads_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_THREADS_MAX);
+	lxpr_uiobuf_printf(uiobuf, "%d\n", LXPTOZ(lxpnp)->zone_nlwps_ctl);
+}
+
+/*
+ * Emit the netstack's tcps_conn_req_max_q, or the SOMAXCONN constant when
+ * no netstack can be obtained for the node.
+ */
+static void
+lxpr_read_sys_net_core_somaxc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_printf(uiobuf, "%d\n", SOMAXCONN);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_conn_req_max_q);
+	netstack_rele(ns);
+}
+
+/*
+ * icmp_echo_ignore_broadcasts
+ * integer; 0 or 1
+ *
+ * illumos: ndd /dev/ip ip_respond_to_echo_broadcast
+ * From the tunable guide: Control whether IPv4 responds to broadcast ICMPv4
+ * echo request. default: 1 (enabled)
+ * Not in ip(7p) man page.
+ *
+ * Note that the Linux setting is the inverse of the illumos value.
+ */
+static void
+lxpr_read_sys_net_ipv4_icmp_eib(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	ip_stack_t *ipst;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_ICMP_EIB);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	ipst = ns->netstack_ip;
+	lxpr_uiobuf_printf(uiobuf, "%d\n", !ipst->ips_ip_g_resp_to_echo_bcast);
+	netstack_rele(ns);
+}
+
+/*
+ * ip_forward
+ * integer; default: 0
+ *
+ * illumos: ndd /dev/ip ip_forwarding
+ * default: 0 (disabled)
+ * Forwarding is described in the ip(7p) man page. We do not support forwarding
+ * in lx at this time, thus we do not support Linux-ABI methods for
+ * enabling/disabling forwarding, and this is always 0.
+ */
+static void
+lxpr_read_sys_net_ipv4_ip_forward(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_FORWARD);
+	lxpr_uiobuf_printf(uiobuf, "0\n");
+}
+
+/*
+ * ip_local_port_range
+ *
+ * The low & high port number range.
+ * integers; default: 32768 61000
+ *
+ * illumos: tcp_smallest_anon_port & tcp_largest_anon_port
+ * Not in tcp(7p) man page.
+ */
+static void
+lxpr_read_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	lxpr_uiobuf_printf(uiobuf, "%d\t%d\n",
+	    tcps->tcps_smallest_anon_port, tcps->tcps_largest_anon_port);
+	netstack_rele(ns);
+}
+
+/*
+ * Emit the list of allowed congestion control algorithms by delegating to
+ * lxpr_read_sys_net_ipv4_tcp_cc_avail().
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_cc_allow(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	/*
+	 * For now the set of allowed algos is the same as those available.
+	 * The callee returns void, so it is called as a plain statement: ISO C
+	 * does not permit `return' with an expression in a void function.
+	 */
+	lxpr_read_sys_net_ipv4_tcp_cc_avail(lxpnp, uiobuf);
+}
+
+/*
+ * cc_walk_algos() callback: print one algorithm name, followed by a space
+ * if another entry follows it in the list or a newline if it is the last.
+ * `cd' is the lxpr_uiobuf_t to print into.  Always returns 0.
+ */
+static int
+lxpr_uiobuf_printf_ccname(void *cd, struct cc_algo *algo)
+{
+	lxpr_uiobuf_t *uiobuf = cd;
+	lxpr_uiobuf_printf(uiobuf, "%s", algo->name);
+	lxpr_uiobuf_printf(uiobuf,
+	    STAILQ_NEXT(algo, entries) != NULL ? " " : "\n");
+	return (0);
+}
+
+/*
+ * Emit the names of the congestion control algorithms visited by
+ * cc_walk_algos(), on a single line.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_cc_avail(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	(void) cc_walk_algos(lxpr_uiobuf_printf_ccname, uiobuf);
+}
+
+/*
+ * Emit the name of the netstack's default congestion control algorithm
+ * (tcps_default_cc_algo).
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_cc_curr(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_CC_CURR);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	lxpr_uiobuf_printf(uiobuf, "%s\n",
+	    tcps->tcps_default_cc_algo->name);
+	netstack_rele(ns);
+}
+
+/*
+ * tcp_fin_timeout
+ *
+ * This specifies how many seconds to wait for a final FIN packet before the
+ * socket is forcibly closed. This is strictly a violation of the TCP
+ * specification, but required to prevent denial-of-service attacks.
+ * integer; default: 60;
+ *
+ * illumos: tcp_fin_wait_2_flush_interval
+ * Not in tcp(7p) man page but see comment in uts/common/inet/tcp/tcp_input.c
+ * in the tcp_input_data() function on the use of tcp_fin_wait_2_flush_interval.
+ * The value is in milliseconds.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	/* Convert the millisecond tunable to the seconds Linux reports. */
+	lxpr_uiobuf_printf(uiobuf, "%d\n",
+	    tcps->tcps_fin_wait_2_flush_interval / 1000);
+	netstack_rele(ns);
+}
+
+/*
+ * tcp_keepalive_intvl
+ *
+ * The number of seconds between TCP keep-alive probes. default: 75
+ * Linux retries tcp_keepalive_probes (9) times before timing out.
+ *
+ * illumos:
+ * We have tcp_ka_rinterval but there is no corresponding tcps_* tunable for
+ * this. The closest is tcps_keepalive_abort_interval which specifies the
+ * time threshold for aborting a TCP connection in milliseconds. Linux retries
+ * 9 times (giving a total of 11.25 minutes) so we emulate this by dividing out
+ * tcps_keepalive_abort_interval by 9.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	/* Milliseconds to seconds first, then spread over the 9 probes. */
+	lxpr_uiobuf_printf(uiobuf, "%d\n",
+	    (tcps->tcps_keepalive_abort_interval / 1000) / 9);
+	netstack_rele(ns);
+}
+
+/*
+ * tcp_keepalive_time
+ *
+ * The number of seconds a connection needs to be idle before TCP begins
+ * sending out keep-alive probes. The default value is 7200 seconds (2 hours).
+ *
+ * illumos: tcp_keepalive_interval
+ * The interval for sending out the first probe in milliseconds.  The default is
+ * two hours.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	/* Convert the millisecond tunable to the seconds Linux reports. */
+	lxpr_uiobuf_printf(uiobuf, "%d\n",
+	    (tcps->tcps_keepalive_interval / 1000));
+	netstack_rele(ns);
+}
+
+/*
+ * tcp_max_syn_backlog
+ *
+ * The number of half-open connections that can be kept by the backlog queue.
+ * See the Linux tcp(7) man page.
+ *
+ * illumos: tcp_conn_req_max_q0
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_conn_req_max_q0);
+	netstack_rele(ns);
+}
+
+/*
+ * tcp_retries2
+ *
+ * Controls number of TCP retries for data packets. Often tuned down for HA
+ * configurations. RFC 1122 recommends at least 100 seconds for the timeout,
+ * which, for Linux, corresponds to a value of ~8. Oracle suggests a value of
+ * 3 for a RAC configuration, as do various HA tuning guides.
+ * integer; Ubuntu 16.04 default: 15
+ *
+ * illumos: There are 4 ndd parameters that are related to this:
+ *	tcp_rexmit_interval_initial:	1000
+ *	tcp_rexmit_interval_min:	400
+ *	tcp_rexmit_interval_max:	60000
+ *	tcp_rexmit_interval_extra:	0
+ * Not in tcp(7p) man page.
+ *
+ * From the tunables guide:
+ * tcp_rexmit_interval_initial is the initial retransmission timeout (RTO) for
+ * a TCP connection in milliseconds (ms).
+ * The interval_min value is the minimum RTO in ms.
+ * The interval_max value is the maximum RTO in ms.
+ * The extra value is an extra time (in ms) to add in to the RTO.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_retry2(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+	uint_t i, retry, rx_min, rx_max;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RETRY2);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	rx_min = tcps->tcps_rexmit_interval_min;
+	rx_max = tcps->tcps_rexmit_interval_max;
+	netstack_rele(ns);
+
+	/*
+	 * Count how many times the minimum interval must be doubled to reach
+	 * the maximum.  Two cases would otherwise keep the loop from ever
+	 * finishing: a zero minimum (doubling zero stays zero) and a value
+	 * whose next doubling would wrap around uint_t.
+	 */
+	retry = 0;
+	i = rx_min;
+	while (i != 0 && i < rx_max) {
+		retry++;
+		if (i > UINT_MAX / 2) {
+			/* The true doubled value already exceeds rx_max. */
+			break;
+		}
+		i *= 2;
+	}
+
+	lxpr_uiobuf_printf(uiobuf, "%u\n", retry);
+}
+
+/*
+ * tcp_rmem and tcp_wmem
+ *
+ * Display the minimum, default, and maximum TCP receive/transmit window sizes,
+ * in bytes. See the Linux tcp(7) man page.
+ *
+ * In illumos this roughly corresponds to: tcp_recv_hiwat or tcp_xmit_hiwat,
+ * and tcp_max_buf.
+ * tcp_recv_hiwat is the default TCP receive window size
+ * tcp_xmit_hiwat is the default TCP send window size
+ * tcp_max_buf is the maximum TCP send and receive buffer size
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_rwmem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+	uint_t min;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ||
+	    lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WMEM);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+
+	/* Linux defaults to a page */
+	min = MIN((lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ?
+	    tcps->tcps_recv_hiwat : tcps->tcps_xmit_hiwat), PAGESIZE);
+
+	lxpr_uiobuf_printf(uiobuf, "%d\t%d\t%d\n",
+	    min,
+	    (lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ?
+	    tcps->tcps_recv_hiwat : tcps->tcps_xmit_hiwat),
+	    tcps->tcps_max_buf);
+	netstack_rele(ns);
+}
+
+/*
+ * tcp_sack
+ *
+ * Enable RFC 2018 TCP Selective Acknowledgements.  Boolean, default: enabled
+ *
+ * illumos: tcp_sack_permitted
+ * tcp_sack_permitted 0 == disabled, 1 == no initiate but accept,
+ * 2 == initiate and accept. default is 2.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	/* Collapse the three-valued illumos setting to the Linux boolean. */
+	lxpr_uiobuf_printf(uiobuf, "%d\n",
+	    (tcps->tcps_sack_permitted == 0 ? 0 : 1));
+	netstack_rele(ns);
+}
+
+/*
+ * tcp_window_scaling
+ *
+ * RFC 1323 TCP window scaling. This feature allows the use of a large window
+ * (> 64K) on a TCP connection. Boolean; default: enabled
+ *
+ * illumos: tcp_wscale_always
+ * If tcp_wscale_always is set to 1, the window scale option will always be
+ * set when connecting to a remote system. If tcp_wscale_always is 0, the
+ * window scale option will be set only if the user has requested a send or
+ * receive window larger than 64K. The default value is 1.
+ */
+static void
+lxpr_read_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	netstack_t *ns;
+	tcp_stack_t *tcps;
+
+	ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE);
+
+	ns = lxpr_netstack(lxpnp);
+	if (ns == NULL) {
+		lxpr_uiobuf_seterr(uiobuf, ENXIO);
+		return;
+	}
+
+	tcps = ns->netstack_tcp;
+	lxpr_uiobuf_printf(uiobuf, "%d\n", tcps->tcps_wscale_always);
+	netstack_rele(ns);
+}
+
+/*
+ * The /proc/sys/vm/dirty* files are (poorly) documented in the Linux
+ * source file Documentation/sysctl/vm.txt. These are various VM tunables
+ * that we'll never support, but that a few misguided apps want to inspect and
+ * modify. We simply hardcode some default values and we'll lie about write
+ * success to these files.
+ */ +static void +lxpr_read_sys_vm_dirty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + uint_t val; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_BG_BYTES || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_BG_RATIO || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_BYTES || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_EXP_CS || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_RATIO || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTYTIME_EXP_SEC || + lxpnp->lxpr_type == LXPR_SYS_VM_DIRTY_WB_CS); + + switch (lxpnp->lxpr_type) { + case LXPR_SYS_VM_DIRTY_BG_RATIO: + val = 10; + break; + case LXPR_SYS_VM_DIRTY_EXP_CS: + val = 3000; + break; + case LXPR_SYS_VM_DIRTY_RATIO: + val = 20; + break; + case LXPR_SYS_VM_DIRTYTIME_EXP_SEC: + val = 43200; + break; + case LXPR_SYS_VM_DIRTY_WB_CS: + val = 500; + break; + default: + val = 0; + break; + } + + lxpr_uiobuf_printf(uiobuf, "%u\n", val); +} + +static void +lxpr_read_sys_vm_max_map_cnt(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MAX_MAP_CNT); + /* We don't limit mappings, just say we have a large limit. */ + lxpr_uiobuf_printf(uiobuf, "%d\n", 16777215); +} + +static void +lxpr_read_sys_vm_minfr_kb(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_MINFR_KB); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +static void +lxpr_read_sys_vm_nhpages(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_NHUGEP); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +static void +lxpr_read_sys_vm_overcommit_mem(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_OVERCOMMIT_MEM); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +static void +lxpr_read_sys_vm_swappiness(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VM_SWAPPINESS); + lxpr_uiobuf_printf(uiobuf, "%d\n", 0); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. 
+ * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +/* + * Report a list of each cgroup subsystem supported by our emulated cgroup fs. + * This needs to exist for systemd to run but for now we don't report any + * cgroup subsystems as being installed. The commented example below shows + * how to print a subsystem entry. 
 */
static void
lxpr_read_cgroups(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
{
	/* Only the column header line is emitted. */
	lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n",
	    "#subsys_name", "hierarchy", "num_cgroups", "enabled");

	/*
	 * lxpr_uiobuf_printf(uiobuf, "%s\t%s\t%s\t%s\n",
	 * "cpu,cpuacct", "2", "1", "1");
	 */
}

/*
 * Report the zone boot arguments.
 *
 * Prints the zone_bootargs string of the zone that owns this node, followed
 * by a newline.
 */
static void
lxpr_read_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
{
	zone_t *zone = LXPTOZ(lxpnp);
	lxpr_uiobuf_printf(uiobuf, "%s\n", zone->zone_bootargs);
}


/*
 * Identifies which captured CPUID register a feature flag is tested against.
 * lxpr_read_cpuinfo() uses these values to index its per-CPU cpuid_res[]
 * array, which is sized by LXCS_REG_MAX.
 */
typedef enum {
	LXCS_ALWAYS = 0,	/* slot is always set to 1 */
	LXCS_CPUID1_ECX,	/* leaf 1, %ecx */
	LXCS_CPUID1_EDX,	/* leaf 1, %edx */
	LXCS_CPUID7_EBX,	/* leaf 7, %ebx */
	LXCS_CPUID7_ECX,	/* leaf 7, %ecx */
	LXCS_CPUID7_EDX,	/* leaf 7, %edx */
	LXCS_CPUIDD1_EAX,	/* leaf 0xd with %ecx = 1, %eax */
	LXCS_CPUIDX1_ECX,	/* leaf 0x80000001, %ecx */
	LXCS_CPUIDX1_EDX,	/* leaf 0x80000001, %edx */
	LXCS_REG_MAX		/* number of slots; not a real source */
} lx_cpuinfo_source_t;

/*
 * One entry of the "flags" line: lxcm_name is printed when the register
 * selected by lxcm_source has any bit of lxcm_flag set.
 */
typedef struct {
	lx_cpuinfo_source_t lxcm_source;
	uint32_t lxcm_flag;
	const char *lxcm_name;
} lx_cpuinfo_mapping_t;

/*
 * This listing is derived from the X86_FEATURE flags data in the Linux kernel.
 * Some entries are missing detection routines. They remain in the list,
 * although commented out, to preserve proper order should they be fixed later.
+ */ +lx_cpuinfo_mapping_t lx_cpuinfo_mappings[] = { + /* CPUID EDX: */ + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_FPU, "fpu" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_VME, "vme" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_DE, "de" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PSE, "pse" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_TSC, "tsc" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MSR, "msr" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PAE, "pae" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MCE, "mce" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_CX8, "cx8" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_APIC, "apic" }, + /* reserved */ + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SEP, "sep" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MTRR, "mtrr" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PGE, "pge" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MCA, "mca" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_CMOV, "cmov" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PAT, "pat" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PSE36, "pse36" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PSN, "pn" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_CLFSH, "clflush" }, + /* reserved */ + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_DS, "dts" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_ACPI, "acpi" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_MMX, "mmx" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_FXSR, "fxsr" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SSE, "sse" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SSE2, "sse2" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_SS, "ss" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_HTT, "ht" }, + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_TM, "tm" }, + /* reserved */ + { LXCS_CPUID1_EDX, CPUID_INTC_EDX_PBE, "pbe" }, + + /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ +#if defined(__amd64) + { LXCS_ALWAYS, 1, "syscall" }, +#endif + /* Present in the Linux listing but not in recent AMD docs: "mp" */ + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_NX, "nx" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_MMXamd, "mmxext" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_FFXSR, "fxsr_opt" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_1GPG, "pdpe1gb" 
}, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_TSCP, "rdtscp" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_LM, "lm" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_3DNowx, "3dnowext" }, + { LXCS_CPUIDX1_EDX, CPUID_AMD_EDX_3DNow, "3dnow" }, + + /* CPUID ECX: */ + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSE3, "pni" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_PCLMULQDQ, "pclmulqdq" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_DTES64, "dtes64" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_MON, "monitor" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_DSCPL, "ds_cpl" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_VMX, "vmx" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SMX, "smx" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_EST, "est" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_TM2, "tm2" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSSE3, "ssse3" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_CID, "cid" }, + { LXCS_CPUID1_ECX, 0x00000800, "sdbg" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_FMA, "fma" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_CX16, "cx16" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_ETPRD, "xtpr" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_PDCM, "pdcm" }, + /* reserved */ + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_PCID, "pcid" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_DCA, "dca" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSE4_1, "sse4_1" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_SSE4_2, "sse4_2" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_X2APIC, "x2apic" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_MOVBE, "movbe" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_POPCNT, "popcnt" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_TSCDL, "tsc_deadline_timer" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_AES, "aes" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_XSAVE, "xsave" }, + /* osxsave */ + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_AVX, "avx" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_F16C, "f16c" }, + { LXCS_CPUID1_ECX, CPUID_INTC_ECX_RDRAND, "rdrand" }, + /* not used */ + + /* + * Other features, Linux-defined mapping + * This range is used for feature bits which conflict or are synthesized + * Skipped: + * "recovery", + * "longrun", + * 
"lrti", + * "cxmmx", + * "k6_mtrr", + * "cyrix_arr", + * "centaur_mcr", + * "constant_tsc", + * "up", + * "arch_perfmon", + * "pebs", + * "bts", + * "rep_good", + * "nopl", + * "xtopology", + * "tsc_reliable", + * "nonstop_tsc", + * "extd_apicid", + * "amd_dcm", + * "aperfmperf", + * "eagerfpu", + * "nonstop_tsc_s3", + * + * "hypervisor", + * "rng", + * "rng_en", + * "ace", + * "ace_en", + * "ace2", + * "ace2_en", + * "phe", + * "phe_en", + * "pmm", + * "pmm_en", + */ + + /* + * More extended AMD flags: CPUID level 0x80000001, ecx, word 6 + */ + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_AHF64, "lahf_lm" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_CMP_LGCY, "cmp_legacy" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_SVM, "svm" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_EAS, "extapic" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_CR8D, "cr8_legacy" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_LZCNT, "abm" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_SSE4A, "sse4a" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_MAS, "misalignsse" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_3DNP, "3dnowprefetch" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_OSVW, "osvw" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_IBS, "ibs" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_XOP, "xop" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_SKINIT, "skinit" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_WDT, "wdt" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_LWP, "lwp" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_FMA4, "fma4" }, + { LXCS_CPUIDX1_ECX, 0x00020000, "tce" }, + + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_NIDMSR, "nodeid_msr" }, + + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_TBM, "tbm" }, + { LXCS_CPUIDX1_ECX, CPUID_AMD_ECX_TOPOEXT, "topoext" }, + { LXCS_CPUIDX1_ECX, 0x00800000, "perfctr_core" }, + { LXCS_CPUIDX1_ECX, 0x01000000, "perfctr_nb" }, + { LXCS_CPUIDX1_ECX, 0x02000000, "bpext" }, + { LXCS_CPUIDX1_ECX, 0x04000000, "perfctr_l2" }, + { LXCS_CPUIDX1_ECX, 0x08000000, "mwaitx" }, + + /* + * Aux flags and virt bits. 
+ * Skipped: + * "cpb", + * "epb", + * "hw_pstate", + * "proc_feedback", + * "intel_pt", + * "tpr_shadow", + * "vnmi", + * "flexpriority", + * "ept", + * "vpid", + * "vmmcall", + */ + + /* + * Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 + */ + { LXCS_CPUID7_EBX, 0x00000001, "fsgsbase" }, + { LXCS_CPUID7_EBX, 0x00000002, "tsc_adjust" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_BMI1, "bmi1" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_HLE, "hle" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX2, "avx2" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_SMEP, "smep" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_BMI2, "bmi2" }, + { LXCS_CPUID7_EBX, 0x00000200, "erms" }, + { LXCS_CPUID7_EBX, 0x00000400, "invpcid" }, + { LXCS_CPUID7_EBX, 0x00000800, "rtm" }, + { LXCS_CPUID7_EBX, 0x00001000, "cqm" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_MPX, "mpx" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512F, "avx512f" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512DQ, "avx512dq" }, + + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_RDSEED, "rdseed" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_ADX, "adx" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_SMAP, "smap" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512IFMA, "avx512ifma" }, + + { LXCS_CPUID7_EBX, 0x00400000, "pcommit" }, + { LXCS_CPUID7_EBX, 0x00800000, "clflushopt" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_CLWB, "clwb" }, + + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512PF, "avx512pf" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512ER, "avx512er" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512CD, "avx512cd" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_SHA, "sha_ni" }, + + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512BW, "avx512bw" }, + { LXCS_CPUID7_EBX, CPUID_INTC_EBX_7_0_AVX512VL, "avx512vl" }, + + /* + * Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) + */ + { LXCS_CPUID7_ECX, CPUID_INTC_ECX_7_0_AVX512VBMI, "avx512vbmi" }, + { LXCS_CPUID7_ECX, CPUID_INTC_ECX_7_0_AVX512VPOPCDQ, + "avx512_vpopcntdq" }, + + /* + * 
Intel-defined CPU features, CPUID level 0x00000007:0 (edx) + */ + { LXCS_CPUID7_EDX, CPUID_INTC_EDX_7_0_AVX5124NNIW, "avx512_4nniw" }, + { LXCS_CPUID7_EDX, CPUID_INTC_EDX_7_0_AVX5124FMAPS, "avx512_4fmaps" }, + + /* + * Extended state features, CPUID level 0x0000000d:1 (eax) + */ + { LXCS_CPUIDD1_EAX, CPUID_INTC_EAX_D_1_XSAVEOPT, "xsaveopt" }, + { LXCS_CPUIDD1_EAX, CPUID_INTC_EAX_D_1_XSAVEC, "xsavec" }, + { LXCS_CPUIDD1_EAX, 0x00000004, "xgetbv1" }, + { LXCS_CPUIDD1_EAX, CPUID_INTC_EAX_D_1_XSAVES, "xsaves" }, + + /* + * Skipped: + * "cqm_llc", + * "cqm_occup_llc", + * "clzero", + */ + + /* + * Thermal and Power Management Leaf, CPUID level 0x00000006 (eax) + * Skipped: + * "dtherm", + * "ida", + * "arat", + * "pln", + * "pts", + * "hwp", + * "hwp_notify", + * "hwp_act_window", + * "hwp_epp", + * "hwp_pkg_req", + */ + + /* + * AMD SVM Feature Identification, CPUID level 0x8000000a (edx) + * Skipped: + * "npt", + * "lbrv", + * "svm_lock", + * "nrip_save", + * "tsc_scale", + * "vmcb_clean", + * "flushbyasid", + * "decodeassists", + * "pausefilter", + * "pfthreshold", + */ +}; + +#define LX_CPUINFO_MAPPING_MAX \ + (sizeof (lx_cpuinfo_mappings) / sizeof (lx_cpuinfo_mappings[0])) + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + cpu_t *cp, *cpstart; + int pools_enabled; + char brandstr[CPU_IDSTRLEN]; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + struct cpuid_regs cpr; + uint32_t maxeax, xmaxeax, cpuid_res[LXCS_REG_MAX] = { 0 }; + + cpr.cp_eax = 0; + maxeax = cpuid_insn(cp, &cpr); + cpr.cp_eax = 0x80000000; + xmaxeax = cpuid_insn(cp, &cpr); + + cpuid_res[LXCS_ALWAYS] = 1; + if (maxeax >= 1) { + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + cpuid_res[LXCS_CPUID1_ECX] = cpr.cp_ecx; + cpuid_res[LXCS_CPUID1_EDX] = cpr.cp_edx; + } + if (maxeax >= 7) { + cpr.cp_eax = 7; + (void) cpuid_insn(cp, &cpr); + 
cpuid_res[LXCS_CPUID7_EBX] = cpr.cp_ebx; + cpuid_res[LXCS_CPUID7_ECX] = cpr.cp_ecx; + cpuid_res[LXCS_CPUID7_EDX] = cpr.cp_edx; + } + if (maxeax >= 0xd) { + cpr.cp_eax = 0xd; + cpr.cp_ecx = 1; + (void) cpuid_insn(cp, &cpr); + cpuid_res[LXCS_CPUIDD1_EAX] = cpr.cp_eax; + } + if (xmaxeax >= 0x80000001) { + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + cpuid_res[LXCS_CPUIDX1_ECX] = cpr.cp_ecx; + cpuid_res[LXCS_CPUIDX1_EDX] = cpr.cp_edx; + } + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + /* Print CPUID feature flags */ + for (i = 0; i < LX_CPUINFO_MAPPING_MAX; i++) { + lx_cpuinfo_mapping_t *lxm = &lx_cpuinfo_mappings[i]; + + ASSERT(lxm->lxcm_source < LXCS_REG_MAX); + if (cpuid_res[lxm->lxcm_source] & lxm->lxcm_flag) { + lxpr_uiobuf_printf(uiobuf, " %s", + lxm->lxcm_name); + } + } + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +static void +lxpr_read_fdinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + proc_t *p; + file_t *fp; + vnode_t *vp; + offset_t off; + short uf_flag; + int fd; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDINFO_FD); + + p = lxpr_lock(lxpnp, NO_ZOMB); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + if ((p->p_flag & SSYS) || p->p_as == &kas) { + lxpr_uiobuf_seterr(uiobuf, EFAULT); + lxpr_unlock(p); + return; + } + + fd = lxpnp->lxpr_desc; + + fp = lxpr_getf(p, fd, &uf_flag); + if (fp == NULL) { + lxpr_uiobuf_seterr(uiobuf, ENOENT); + lxpr_unlock(p); + return; + } + vp = fp->f_vnode; + + /* + * Check that the offset value in the underlying file_t is plausible + * and reset to 0 if not. + */ + if (fp->f_offset == -1) { + off = 0; + } else { + off = fp->f_offset; + if (VOP_SEEK(vp, 0, &off, NULL) != 0) + off = 0; + } + + lxpr_uiobuf_printf(uiobuf, "pos:\t%ld\n", off); + lxpr_uiobuf_printf(uiobuf, "flags:\t0%o\n", + lxpr_open_flags_convert(uf_flag, + fp->f_flag2 << 16 | fp->f_flag)); + lxpr_uiobuf_printf(uiobuf, "mnt_id:\t%u\n", + lxpr_get_mountid(zone, vp->v_vfsp)); + + /* Could show additional fields based on vp->v_type */ + + lxpr_releasef(p, fd); + lxpr_unlock(p); +} + +/* + * Report a list of file systems loaded in the kernel. 
 * We only report the ones
 * which we support and which may be checked by various components to see if
 * they are loaded.
 */
static void
lxpr_read_filesystems(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
{
	/* The list is fixed; every entry is flagged "nodev". */
	lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "autofs");
	lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "cgroup");
	lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "nfs");
	lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "proc");
	lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "sysfs");
	lxpr_uiobuf_printf(uiobuf, "%s\t%s\n", "nodev", "tmpfs");
}

/*
 * Calculate the number of links in the task dir. Some code (e.g. chromium)
 * depends on this value being accurate.
 *
 * Returns 0 if the process cannot be locked; otherwise the task count
 * reported by lxpr_count_tasks() plus two for "." and "..".
 */
static uint_t
lxpr_count_taskdir(lxpr_node_t *lxpnp)
{
	proc_t *p;
	uint_t cnt;

	ASSERT(lxpnp->lxpr_type == LXPR_PID_TASKDIR);

	p = lxpr_lock(lxpnp, ZOMB_OK);
	if (p == NULL)
		return (0);

	cnt = lxpr_count_tasks(p);

	lxpr_unlock(p);

	/* Add the fixed entries ("." & "..") */
	cnt += 2;
	return (cnt);
}

/*
 * lxpr_getattr(): Vnode operation for VOP_GETATTR()
 *
 * When the node is backed by a real vnode and ATTR_REAL is requested, the
 * attributes come from that vnode (subject to its access check). Otherwise
 * a synthetic vattr_t is built from the lx /proc node itself and then
 * adjusted per node type. Returns 0 or the error from the underlying vnode.
 */
static int
lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	register lxpr_node_t *lxpnp = VTOLXP(vp);
	lxpr_nodetype_t type = lxpnp->lxpr_type;
	extern uint_t nproc;
	int error;

	/*
	 * Return attributes of underlying vnode if ATTR_REAL
	 *
	 * but keep fd files with the symlink permissions
	 */
	if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) {
		vnode_t *rvp = lxpnp->lxpr_realvp;

		/*
		 * withhold attribute information unless the caller passes
		 * the access check on the underlying vnode
		 */
		if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) {
			return (error);
		}

		/*
		 * now its attributes
		 */
		if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) {
			return (error);
		}

		/*
		 * if it's a file in lx /proc/pid/fd/xx then set its
		 * mode and keep it looking like a symlink, fifo or socket
		 */
		if (type == LXPR_PID_FD_FD) {
			vap->va_mode = lxpnp->lxpr_mode;
			vap->va_type = lxpnp->lxpr_realvp->v_type;
			vap->va_size = 0;
			vap->va_nlink = 1;
		}
		return (0);
	}

	/* Default attributes, that may be overridden below */
	bzero(vap, sizeof (*vap));
	vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time;
	vap->va_nlink = 1;
	vap->va_type = vp->v_type;
	vap->va_mode = lxpnp->lxpr_mode;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_blksize = DEV_BSIZE;
	vap->va_uid = lxpnp->lxpr_uid;
	vap->va_gid = lxpnp->lxpr_gid;
	vap->va_nodeid = lxpnp->lxpr_ino;

	/* Directories report a link count and size derived from entry counts */
	switch (type) {
	case LXPR_PROCDIR:
		vap->va_nlink = nproc + 2 + PROCDIRFILES;
		vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE;
		break;
	case LXPR_PIDDIR:
		vap->va_nlink = PIDDIRFILES;
		vap->va_size = PIDDIRFILES * LXPR_SDSIZE;
		break;
	case LXPR_PID_TASKDIR:
		vap->va_nlink = lxpr_count_taskdir(lxpnp);
		vap->va_size = vap->va_nlink * LXPR_SDSIZE;
		break;
	case LXPR_PID_TASK_IDDIR:
		vap->va_nlink = TIDDIRFILES;
		vap->va_size = TIDDIRFILES * LXPR_SDSIZE;
		break;
	case LXPR_SELF:
		/* "self" is owned by the real uid/gid of the calling process */
		vap->va_uid = crgetruid(curproc->p_cred);
		vap->va_gid = crgetrgid(curproc->p_cred);
		break;
	case LXPR_PID_FD_FD:
	case LXPR_PID_TID_FD_FD:
		/*
		 * Restore VLNK type for lstat-type activity.
		 * See lxpr_readlink for more details.
		 */
		if ((flags & FOLLOW) == 0)
			vap->va_type = VLNK;
		/* FALLTHROUGH */
	case LXPR_PID_FDINFO_FD:
	case LXPR_PID_TID_FDINFO_FD:
		/* Linux leaves the file size for these as 0 */
		break;
	default:
		break;
	}

	/* va_size is still 0 from the bzero() unless a case above set it */
	vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size);
	return (0);
}

/*
 * lxpr_access(): Vnode operation for VOP_ACCESS()
 *
 * Thin wrapper that runs the full (non-shallow) check in lxpr_doaccess().
 */
static int
lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	return (lxpr_doaccess(VTOLXP(vp), B_FALSE, mode, flags, cr, ct));
}

/*
 * This makes up the bulk of the logic for lxpr_access.
An extra parameter + * ('shallow') is present to differentiate checks that must pass muster against + * an underlying resource (lxpr_realvp) and those that are only concerned with + * permission to the process. + */ +static int +lxpr_doaccess(lxpr_node_t *lxpnp, boolean_t shallow, int mode, int flags, + cred_t *cr, caller_context_t *ct) +{ + lxpr_nodetype_t type = lxpnp->lxpr_type; + boolean_t allow_pid_access = B_FALSE; + int shift = 0; + proc_t *tp; + + /* + * lx /proc is primarily a read only file system + * We handle LXPR_SYSDIR as a special case. At least 'systemd' expects + * access() to report /proc/sys is writable, but we can't do that in + * lxpr_is_writable since it breaks other code paths that check if they + * can write there. + */ + if ((mode & VWRITE) && !lxpr_is_writable(type)) { + if (type != LXPR_SYSDIR) + return (EROFS); + } + + if (type == LXPR_PIDDIR) { + return (0); + } + if (lxpnp->lxpr_pid != 0) { + if ((tp = lxpr_lock(lxpnp, ZOMB_OK)) == NULL) { + return (ENOENT); + } + if (tp == curproc || secpolicy_proc_access(cr) == 0 || + priv_proc_cred_perm(cr, tp, NULL, mode) == 0) { + allow_pid_access = B_TRUE; + } + lxpr_unlock(tp); + switch (type) { + case LXPR_PID_CGROUP: + case LXPR_PID_CMDLINE: + case LXPR_PID_COMM: + case LXPR_PID_LIMITS: + case LXPR_PID_LOGINUID: + case LXPR_PID_MOUNTINFO: + case LXPR_PID_MOUNTS: + case LXPR_PID_OOM_SCR_ADJ: + case LXPR_PID_STAT: + case LXPR_PID_STATM: + case LXPR_PID_STATUS: + case LXPR_PID_TASKDIR: + case LXPR_PID_TASK_IDDIR: + case LXPR_PID_TID_CGROUP: + case LXPR_PID_TID_CMDLINE: + case LXPR_PID_TID_COMM: + case LXPR_PID_TID_LIMITS: + case LXPR_PID_TID_LOGINUID: + case LXPR_PID_TID_MOUNTINFO: + case LXPR_PID_TID_OOM_SCR_ADJ: + case LXPR_PID_TID_STAT: + case LXPR_PID_TID_STATM: + case LXPR_PID_TID_STATUS: + /* + * These entries are accessible to any process on the + * system which wishes to query them. 
+ */ + break; + default: + /* + * All other entries under the pid/tid hierarchy + * require proper authorization to be accessed. + */ + if (!allow_pid_access) { + return (EACCES); + } + break; + } + } + + /* + * If this entry has an underlying vnode, rely upon its access checks. + * Skip this if a shallow check has been requested. + */ + if (lxpnp->lxpr_realvp != NULL && !shallow) { + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* + * Allow access to those (root) possessing the correct privilege or + * already authorized against a pid-specific resource. + */ + if (allow_pid_access || secpolicy_proc_access(cr) == 0) { + return (0); + } + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. 
+ */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp), ZOMB_OK); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's task ID's. + */ +static vnode_t * +lxpr_lookup_taskdir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + proc_t *p; + uint_t tid; + int c; + kthread_t *t; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASKDIR); + + /* + * convert the string rendition of the filename to a thread ID + */ + tid = 0; + while ((c = *comp++) != '\0') { + int otid; + if (c < '0' || c > '9') + return (NULL); + + otid = tid; + tid = 10 * tid + c - '0'; + /* integer overflow */ + if (tid / 10 != otid) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock_pid(dlxpnp, tid, NO_ZOMB, &t); + if (p == NULL) + return (NULL); + + /* + * Bail if this is a system process. 
+ */ + if (p->p_as == &kas) { + lxpr_unlock(p); + return (NULL); + } + + if (p->p_brand != &lx_brand) { + /* + * Only the main thread is visible for non-branded processes. + */ + t = p->p_tlist; + if (tid != p->p_pid || t == NULL) { + t = NULL; + } + } else if (t != NULL) { + /* + * Disallow any access to aio in-kernel worker threads. + * To prevent a potential race while looking at the lwp data + * for an exiting thread, we clear the TP_KTHREAD bit in + * lx_cleanlwp() while the p_lock is held. + */ + if ((t->t_proc_flag & TP_KTHREAD) != 0) { + lx_lwp_data_t *lwpd; + + VERIFY((lwpd = ttolxlwp(t)) != NULL); + if ((lwpd->br_lwp_flags & BR_AIO_LWP) != 0) { + lxpr_unlock(p); + return (NULL); + } + } + } + + if (t == NULL) { + lxpr_unlock(p); + return (NULL); + } + + /* + * Allocate and fill in a new lx /proc taskid node. + * Instead of the last arg being a fd, it is a tid. + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_TASK_IDDIR, p, tid); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + lxpr_unlock(p); + return (dp); +} + +/* + * Lookup one of the process's task ID's. + */ +static vnode_t * +lxpr_lookup_task_tid_dir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + proc_t *p; + kthread_t *t; + int i; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASK_IDDIR); + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock_pid(dlxpnp, dlxpnp->lxpr_desc, NO_ZOMB, &t); + if (p == NULL) + return (NULL); + + /* + * Bail if this is a system process. + */ + if (p->p_as == &kas) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in the new lx /proc taskid dir node + */ + for (i = 0; i < TIDDIRFILES; i++) { + if (strcmp(tiddir[i].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, tiddir[i].d_type, p, + dlxpnp->lxpr_desc); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + lxpr_unlock(p); + return (dp); + } + } + + lxpr_unlock(p); + return (NULL); +} + +/* + * Lookup one of the process's open files. 
+ */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PID_FDDIR || + VTOLXP(dp)->lxpr_type == LXPR_PID_TID_FDDIR); + + return (lxpr_lookup_fdnode(dp, comp)); +} + +static vnode_t * +lxpr_lookup_fdinfodir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PID_FDINFODIR || + VTOLXP(dp)->lxpr_type == LXPR_PID_TID_FDINFODIR); + + return (lxpr_lookup_fdinfonode(dp, comp)); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are we doing pid lookups. + * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + vnode_t *vp; + proc_t *p; + kthread_t *t; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock_pid(VTOLXP(dp), pid, ZOMB_OK, &t); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * Allocate and populate a new LX /proc node. + * + * Directory entries for non-main threads can be looked up as + * /proc/<tid> despite the fact that they do not appear in the + * readdir output. Record the lookup pid (tid) so that later + * operations can be aware of this context. 
+ */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, pid); + + lxpr_unlock(p); + vp = LXPTOV(lxpnp); + ASSERT(vp != NULL); + + return (vp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lx_procdir, PROCDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sysdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYSDIR); + return (lxpr_lookup_common(dp, comp, NULL, sysdir, SYSDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_kerneldir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNELDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_kerneldir, + SYS_KERNELDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_kdir_randdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNEL_RANDDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_randdir, + SYS_RANDDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NETDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_netdir, + SYS_NETDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_net_coredir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_COREDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_net_coredir, + SYS_NET_COREDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_net_ipv4dir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_NET_IPV4DIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_net_ipv4dir, + SYS_NET_IPV4DIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_vmdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_VMDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_vmdir, + SYS_VMDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_fsdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FSDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_fsdir, + SYS_FSDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_fs_inotifydir(vnode_t *dp, char 
*comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FS_INOTIFYDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_fs_inotifydir, + SYS_FS_INOTIFYDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! 
*/ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[type](lxpnp, uiop, eofp)); +} + +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. 
+ */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zone_t *zone; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zone = LXPTOZ(lxpnp); + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are betwen calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. 
+ */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lx_procdir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + pid_t pid, raw_pid; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, the + * zsched process for the zone, and anything the security + * policy doesn't allow us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || p->p_zone != zone || + p == zone->zone_zsched || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + + /* Translate the pid (e.g. initpid to 1) */ + lxpr_fixpid(LXPTOZ(lxpnp), p, &pid, NULL); + raw_pid = p->p_pid; + + ASSERT(p->p_stat != 0); + + mutex_exit(&pidlock); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, raw_pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * that that requested then we can't do it this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. 
But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + int err; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + if ((p = lxpr_lock(lxpnp, ZOMB_OK)) == NULL) { + return (ENOENT); + } + err = lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES); + lxpr_unlock(p); + return (err); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_taskdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error, ceof, tiddirsize, tasknum; + proc_t *p; + kthread_t *t; + boolean_t branded; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_TASKDIR); + + oresid = uiop->uio_resid; + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) { + return (ENOENT); + } + + /* + * Just emit static entries for system processes and zombies. 
+ */ + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + lxpr_unlock(p); + return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0)); + } + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its threads. + */ + tiddirsize = p->p_lwpcnt; + branded = (p->p_brand == &lx_brand); + mutex_exit(&p->p_lock); + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + if ((t = p->p_tlist) == NULL) { + if (eofp != NULL) + *eofp = 1; + goto out; + } + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until all thread's have + * been returned. + */ + for (tasknum = 0; (uresid = uiop->uio_resid) > 0; tasknum++) { + int i, reclen, len; + uint_t emul_tid; + lx_lwp_data_t *lwpd; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the thread list + */ + i = (uoffset / LXPR_SDSIZE) - 2; + if (i < 0 || i >= tiddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (i != tasknum) + goto next; + + if (!branded) { + /* + * Emulating the goofy linux task model is impossible + * to do for native processes. We can compromise by + * presenting only the main thread to the consumer. + */ + emul_tid = p->p_pid; + } else { + if ((lwpd = ttolxlwp(t)) == NULL) { + goto next; + } + /* Don't show aio kernel worker threads */ + if ((t->t_proc_flag & TP_KTHREAD) != 0 && + (lwpd->br_lwp_flags & BR_AIO_LWP) != 0) { + goto next; + } + emul_tid = lwpd->br_pid; + /* + * Convert pid to Linux default of 1 if we're the + * zone's init. 
+ */ + if (emul_tid == LXPTOZ(lxpnp)->zone_proc_initpid) + emul_tid = 1; + } + + dirent->d_ino = lxpr_inode(LXPR_PID_TASK_IDDIR, p->p_pid, + emul_tid); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", emul_tid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + + if ((t = t->t_forw) == p->p_tlist || !branded) { + if (eofp != NULL) + *eofp = 1; + goto out; + } + } + + if (eofp != NULL) + *eofp = 0; + +out: + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + +static int +lxpr_readdir_task_tid_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + kthread_t *t; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_TASK_IDDIR); + + /* Confirm that process and thread are still present */ + p = lxpr_lock_pid(lxpnp, lxpnp->lxpr_desc, NO_ZOMB, &t); + if (p == NULL) { + return (ENOENT); + } + lxpr_unlock(p); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, tiddir, TIDDIRFILES)); +} + +static int +lxpr_readdir_fdlist(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_nodetype_t inodetype) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + 
int error, ceof, fddirsize; + proc_t *p; + uf_info_t *fip; + + ASSERT( + (inodetype == LXPR_PID_FD_FD && ( + lxpnp->lxpr_type == LXPR_PID_FDDIR || + lxpnp->lxpr_type == LXPR_PID_TID_FDDIR)) || + (inodetype == LXPR_PID_FDINFO_FD && ( + lxpnp->lxpr_type == LXPR_PID_FDINFODIR || + lxpnp->lxpr_type == LXPR_PID_TID_FDINFODIR))); + + oresid = uiop->uio_resid; + + p = lxpr_lock(lxpnp, ZOMB_OK); + if (p == NULL) + return (ENOENT); + + /* + * For exiting/exited processes or those belonging to the system, only + * emit the fixed entries. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + lxpr_unlock(p); + return (lxpr_readdir_common(lxpnp, uiop, eofp, 0, 0)); + } + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(inodetype, p->p_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (lxpr_readdir_fdlist(lxpnp, uiop, eofp, LXPR_PID_FD_FD)); +} + +static int +lxpr_readdir_fdinfodir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (lxpr_readdir_fdlist(lxpnp, uiop, eofp, LXPR_PID_FDINFO_FD)); +} + +static int +lxpr_readdir_sysdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYSDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sysdir, SYSDIRFILES)); +} + +static int +lxpr_readdir_sys_fsdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FSDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fsdir, + SYS_FSDIRFILES)); +} + +static int +lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFYDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, 
sys_fs_inotifydir, + SYS_FS_INOTIFYDIRFILES)); +} + +static int +lxpr_readdir_sys_kerneldir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNELDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_kerneldir, + SYS_KERNELDIRFILES)); +} + +static int +lxpr_readdir_sys_kdir_randdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_RANDDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_randdir, + SYS_RANDDIRFILES)); +} + +static int +lxpr_readdir_sys_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_netdir, + SYS_NETDIRFILES)); +} + +static int +lxpr_readdir_sys_net_coredir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_COREDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_coredir, + SYS_NET_COREDIRFILES)); +} + +static int +lxpr_readdir_sys_net_ipv4dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4DIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_net_ipv4dir, + SYS_NET_IPV4DIRFILES)); +} + +static int +lxpr_readdir_sys_vmdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_VMDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_vmdir, + SYS_VMDIRFILES)); +} + +#define isdigit(c) ((c) >= '0' && (c) <= '9') +#define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') + +/* + * Obtain a numeric value from the null-terminated input string. + * We don't have strtok in the kernel, so tokenize this ourselves and + * validate the input. 
+ */ +static int +lxpr_tokenize_num(char *str, long *pv, char **ep) +{ + char *pstart, *pc, c, *endptr; + long v; + + for (pc = str; isspace(*pc); pc++) + ; + + for (pstart = pc; isdigit(*pc); pc++) + ; + if (pc == pstart || (!isspace(*pc) && *pc != '\0')) + return (EINVAL); + c = *pc; + *pc = '\0'; + + if (ddi_strtol(pstart, &endptr, 10, &v) != 0) { + *pc = c; + return (EINVAL); + } + if (*endptr != '\0') { + *pc = c; + return (EINVAL); + } + + if (pv != NULL) + *pv = v; + if (ep != NULL) + *ep = ++pc; + + return (0); +} + +static int +lxpr_write_tcp_property(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct, char *prop, + int (*xlate)(char *, int)) +{ + int error; + int res = 0; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + netstack_t *ns; + mod_prop_info_t *ptbl = NULL; + mod_prop_info_t *pinfo = NULL; + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (val[olen - 1] == '\n') + val[olen - 1] = '\0'; + + if (val[0] == '\0') /* no input */ + return (EINVAL); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return (EINVAL); + + if (xlate != NULL && xlate(val, sizeof (val)) != 0) { + netstack_rele(ns); + return (EINVAL); + } + + ptbl = ns->netstack_tcp->tcps_propinfo_tbl; + pinfo = mod_prop_lookup(ptbl, prop, MOD_PROTO_TCP); + if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, val, 0) != 0) + res = EINVAL; + + netstack_rele(ns); + return (res); +} + +static int +lxpr_write_sys_net_core_somaxc(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_CORE_SOMAXCON); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_conn_req_max_q", NULL)); +} + +static int +lxpr_xlate_sec2ms(char *val, int size) 
+{ + long sec; + char *ep; + + if (lxpr_tokenize_num(val, &sec, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + if (snprintf(val, size, "%ld", sec * 1000) >= size) + return (EINVAL); + return (0); +} + +static int +lxpr_xlate_ka_intvl(char *val, int size) +{ + long sec; + char *ep; + + if (lxpr_tokenize_num(val, &sec, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + if (snprintf(val, size, "%ld", sec * 1000 * 9) >= size) + return (EINVAL); + return (0); +} + +/* + * Approximately translate the input count value into a reasonable + * _rexmit_interval_max timeout. + */ +static int +lxpr_xlate_retry2(char *val, int size) +{ + long cnt; + char *ep; + uint_t i, rx_max; + + if (lxpr_tokenize_num(val, &cnt, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + + /* + * The _rexmit_interval_max is limited to 2 hours, so a count of 15 + * or more will exceed that due to exponential backoff. + */ + if (cnt > 15) + cnt = 15; + + rx_max = 400; /* Start with default _rexmit_interval_min in ms */ + for (i = 0; i < cnt; i++) + rx_max *= 2; + + /* + * The _rexmit_interval_max is limited to 2 hours, so if we went over + * the limit, just use 2 hours (in ms). + */ + if (rx_max > (7200 * 1000)) + rx_max = 7200 * 1000; + + if (snprintf(val, size, "%u", rx_max) >= size) + return (EINVAL); + return (0); +} + +static int +lxpr_xlate_sack(char *val, int size) +{ + long flag; + char *ep; + + if (lxpr_tokenize_num(val, &flag, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + if (flag != 0 && flag != 1) + return (EINVAL); + /* see comment on lxpr_read_sys_net_ipv4_tcp_sack */ + if (snprintf(val, size, "%d", (flag == 0 ? 0 : 2)) >= size) + return (EINVAL); + return (0); +} + +/* + * We're updating a property on the ip stack so we can't reuse + * lxpr_write_tcp_property. 
+ */ +static int +lxpr_write_sys_net_ipv4_icmp_eib(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + int error; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + long flag; + char *ep; + netstack_t *ns; + ip_stack_t *ipst; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_ICMP_EIB); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (val[olen - 1] == '\n') + val[olen - 1] = '\0'; + + if (val[0] == '\0') /* no input */ + return (EINVAL); + + if (lxpr_tokenize_num(val, &flag, &ep) != 0) + return (EINVAL); + + if (*ep != '\0' || (flag != 0 && flag != 1)) + return (EINVAL); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return (EINVAL); + + ipst = ns->netstack_ip; + ipst->ips_ip_g_resp_to_echo_bcast = !flag; + + netstack_rele(ns); + return (0); +} + +/* + * We expect two port numbers on a line as input for the range, and we have to + * set two properties on the netstack_tcp, so we can't reuse + * lxpr_write_tcp_property. 
+ */ +static int +lxpr_write_sys_net_ipv4_ip_lport_range(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + int res; + size_t olen; + char vals[32]; /* big enough for a line w/ 2 16-bit numeric strings */ + char *ep; + long low, high; + netstack_t *ns; + tcp_stack_t *tcps; + mod_prop_info_t *ptbl = NULL; + mod_prop_info_t *pinfo = NULL; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_IP_LPORT_RANGE); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (vals) - 1) + return (EINVAL); + + bzero(vals, sizeof (vals)); + res = uiomove(vals, olen, UIO_WRITE, uio); + if (res != 0) + return (res); + + if (lxpr_tokenize_num(vals, &low, &ep) != 0) + return (EINVAL); + + if (lxpr_tokenize_num(ep, &high, &ep) != 0) + return (EINVAL); + + if (*ep != '\0') { + /* make sure no other tokens on the line */ + *ep++ = '\0'; + for (; isspace(*ep); ep++) + ; + if (*ep != '\0') + return (EINVAL); + } + + if (low > high || high > 65535) + return (EINVAL); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return (EINVAL); + + tcps = ns->netstack_tcp; + if (low < tcps->tcps_smallest_nonpriv_port) { + netstack_rele(ns); + return (EINVAL); + } + + ptbl = ns->netstack_tcp->tcps_propinfo_tbl; + + (void) snprintf(vals, sizeof (vals), "%ld", low); + pinfo = mod_prop_lookup(ptbl, "smallest_anon_port", MOD_PROTO_TCP); + if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + + (void) snprintf(vals, sizeof (vals), "%ld", high); + pinfo = mod_prop_lookup(ptbl, "largest_anon_port", MOD_PROTO_TCP); + if (pinfo == NULL || pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + + netstack_rele(ns); + return (res); +} + +/* + * We expect three numbers on a line as input for the range, and we have to + * set two properties on the netstack_tcp, so we can't reuse + * lxpr_write_tcp_property. + * + * See the Linux tcp(7) man page. 
+ */ +static int +lxpr_write_sys_net_ipv4_tcp_rwmem(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + int res; + size_t olen; + char vals[80]; /* big enough for a line w/ 3 numeric strings */ + char *ep; + long min, def, max, min_limit; + netstack_t *ns; + tcp_stack_t *tcps; + mod_prop_info_t *ptbl; + mod_prop_info_t *pinfo; + char *attr; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM || + lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WMEM); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (vals) - 1) + return (EINVAL); + + bzero(vals, sizeof (vals)); + res = uiomove(vals, olen, UIO_WRITE, uio); + if (res != 0) + return (res); + + if (lxpr_tokenize_num(vals, &min, &ep) != 0) + return (EINVAL); + + if (lxpr_tokenize_num(ep, &def, &ep) != 0) + return (EINVAL); + + if (lxpr_tokenize_num(ep, &max, &ep) != 0) + return (EINVAL); + + if (*ep != '\0') { + /* make sure no other tokens on the line */ + *ep++ = '\0'; + for (; isspace(*ep); ep++) + ; + if (*ep != '\0') + return (EINVAL); + } + + /* + * Ensure the numbers are valid, low to high. + * Valid ranges from the tunable's guide. + */ + min_limit = (lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ? + 2048 : 4096); + if (min > def || def > max || min < min_limit || + def > ONEGB || max < 8192) + return (EINVAL); + + ns = lxpr_netstack(lxpnp); + if (ns == NULL) + return (EINVAL); + + tcps = ns->netstack_tcp; + + /* recv_hiwat and xmit_hiwat are aliased to recv_buf and send_buf. */ + attr = (lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RMEM ? 
+ "recv_buf" : "send_buf"); + + (void) snprintf(vals, sizeof (vals), "%ld", def); + ptbl = ns->netstack_tcp->tcps_propinfo_tbl; + pinfo = mod_prop_lookup(ptbl, attr, MOD_PROTO_TCP); + if (pinfo == NULL || + pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + + /* + * Don't reduce max for one side (recv or xmit) since that impacts the + * other. + */ + if (res == 0 && max > tcps->tcps_max_buf) { + (void) snprintf(vals, sizeof (vals), "%ld", max); + pinfo = mod_prop_lookup(ptbl, "max_buf", MOD_PROTO_TCP); + if (pinfo == NULL || + pinfo->mpi_setf(ns, cr, pinfo, NULL, vals, 0) != 0) + res = EINVAL; + } + + netstack_rele(ns); + return (res); +} + +static int +lxpr_write_sys_net_ipv4_tcp_cc_curr(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_CC_CURR); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "congestion_control", NULL)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_fin_to(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_FIN_TO); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_fin_wait_2_flush_interval", lxpr_xlate_sec2ms)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_ka_int(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_INT); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_keepalive_abort_interval", lxpr_xlate_ka_intvl)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_ka_tim(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_KA_TIM); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_keepalive_interval", lxpr_xlate_sec2ms)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_max_syn_bl(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + 
ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_MAX_SYN_BL); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_conn_req_max_q0", NULL)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_retry2(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_RETRY2); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, + "_rexmit_interval_max", lxpr_xlate_retry2)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_sack(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_SACK); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "sack", + lxpr_xlate_sack)); +} + +static int +lxpr_write_sys_net_ipv4_tcp_winscale(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_NET_IPV4_TCP_WINSCALE); + return (lxpr_write_tcp_property(lxpnp, uio, cr, ct, "_wscale_always", + NULL)); +} + +static int +lxpr_write_sys_fs_pipe_max(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + int error; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + char *ep; + long u; + size_t size; + lx_zone_data_t *lxzd = ztolxzd(LXPTOZ(lxpnp)); + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_PIPE_MAX); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (lxpr_tokenize_num(val, &u, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + + /* + * Bound to PAGESIZE <= input <= lx_pipe_max_limit, then round to the + * nearest page. Linux is a little more picky, rounding to the nearest + * power-of-two pages. Such strengthened behavior can be added later + * if needed. 
+ */ + size = (size_t)u; + size = P2ROUNDUP(MIN(MAX(PAGESIZE, size), lx_pipe_max_limit), PAGESIZE); + + ASSERT(size <= lx_pipe_max_limit); + + mutex_enter(&lxzd->lxzd_lock); + lxzd->lxzd_pipe_max_sz = size; + mutex_exit(&lxzd->lxzd_lock); + + return (0); +} + +static int +lxpr_write_sys_kernel_corepatt(lxpr_node_t *lxpnp, struct uio *uio, + struct cred *cr, caller_context_t *ct) +{ + zone_t *zone = LXPTOZ(lxpnp); + struct core_globals *cg; + refstr_t *rp, *nrp; + corectl_path_t *ccp; + char val[MAXPATHLEN]; + char valtr[MAXPATHLEN]; + size_t olen; + int error; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_COREPATT); + + cg = zone_getspecific(core_zone_key, zone); + ASSERT(cg != NULL); + + if (secpolicy_coreadm(cr) != 0) + return (EPERM); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (val[olen - 1] == '\n') + val[olen - 1] = '\0'; + + if (val[0] == '|') + return (EINVAL); + + if ((error = lxpr_core_path_l2s(val, valtr, sizeof (valtr))) != 0) + return (error); + + nrp = refstr_alloc(valtr); + + ccp = cg->core_default_path; + mutex_enter(&ccp->ccp_mtx); + rp = ccp->ccp_path; + refstr_hold((ccp->ccp_path = nrp)); + cg->core_options |= CC_PROCESS_PATH; + mutex_exit(&ccp->ccp_mtx); + + if (rp != NULL) + refstr_rele(rp); + + return (0); +} + +static int +lxpr_write_pid_loginuid(lxpr_node_t *lxpnp, struct uio *uio, struct cred *cr, + caller_context_t *ct) +{ + int error; + size_t olen; + char val[16]; /* big enough for a uint numeric string */ + char *ep; + long u; + proc_t *p; + lx_proc_data_t *pd; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LOGINUID); + + if (uio->uio_loffset != 0) + return (EINVAL); + + if (uio->uio_resid == 0) + return (0); + + olen = uio->uio_resid; + if (olen > sizeof (val) - 1) + return (EINVAL); + + bzero(val, 
sizeof (val)); + error = uiomove(val, olen, UIO_WRITE, uio); + if (error != 0) + return (error); + + if (lxpr_tokenize_num(val, &u, &ep) != 0) + return (EINVAL); + if (*ep != '\0') + return (EINVAL); + + if ((p = lxpr_lock(lxpnp, NO_ZOMB)) == NULL) + return (ENXIO); + + if ((pd = ptolxproc(p)) != NULL) { + pd->l_loginuid = (uid_t)u; + } + lxpr_unlock(p); + + return (0); +} + +static int +lxpr_readlink_exe(lxpr_node_t *lxpnp, char *buf, size_t size, cred_t *cr) +{ + size_t dlen = DIRENT64_RECLEN(MAXPATHLEN); + dirent64_t *dp; + vnode_t *dirvp; + int error = ENOENT; + char *dbuf; + proc_t *p; + size_t len; + + p = lxpr_lock(lxpnp, NO_ZOMB); + + if (p == NULL) + return (error); + + dirvp = p->p_execdir; + if (dirvp == NULL) { + lxpr_unlock(p); + return (error); + } + + VN_HOLD(dirvp); + lxpr_unlock(p); + + /* Look up the parent directory path */ + if ((error = vnodetopath(NULL, dirvp, buf, size, cr)) != 0) { + VN_RELE(dirvp); + return (error); + } + + len = strlen(buf); + + dbuf = kmem_alloc(dlen, KM_SLEEP); + + /* + * Walk the parent directory to find the vnode for p->p_exec, in order + * to derive its path. + */ + if ((error = dirfindvp(NULL, dirvp, lxpnp->lxpr_realvp, + cr, dbuf, dlen, &dp)) == 0 && + strlen(dp->d_name) + len + 1 < size) { + buf[len] = '/'; + (void) strcpy(buf + len + 1, dp->d_name); + } else { + error = ENOENT; + } + VN_RELE(dirvp); + kmem_free(dbuf, dlen); + return (error); +} + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char *bp; + size_t buflen, klen; + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* + * Linux does something very "clever" for /proc/<pid>/fd/<num> entries. + * Open FDs are represented as symlinks, the link contents + * corresponding to the open resource. 
For plain files or devices, + * this isn't absurd since one can dereference the symlink to query + * the underlying resource. For sockets or pipes, it becomes ugly in a + * hurry. To maintain this human-readable output, those FD symlinks + * point to bogus targets such as "socket:[<inodenum>]". This requires + * circumventing vfs since the stat/lstat behavior on those FD entries + * will be unusual. (A stat must retrieve information about the open + * socket or pipe. It cannot fail because the link contents point to + * an absent file.) + * + * To accomplish this, lxpr_getnode returns an vnode typed VNON for FD + * entries. This bypasses code paths which would normally + * short-circuit on symlinks and allows us to emulate the vfs behavior + * expected by /proc consumers. + */ + if (vp->v_type != VLNK && lxpnp->lxpr_type != LXPR_PID_FD_FD) + return (EINVAL); + + buflen = klen = MAXPATHLEN + 1; + bp = kmem_alloc(klen, KM_SLEEP); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + error = lxpr_doaccess(lxpnp, B_TRUE, VREAD, 0, cr, ct); + if (error != 0) + goto out; + + error = vnodetopath(NULL, rvp, bp, buflen, cr); + + /* + * Special handling for /proc/<pid>/exe where the vnode path is + * not cached. + */ + if (error != 0 && lxpnp->lxpr_type == LXPR_PID_EXE) + error = lxpr_readlink_exe(lxpnp, bp, buflen, cr); + + if (error != 0) { + /* + * Special handling possible for /proc/<pid>/fd/<num> + * Generate <type>:[<inode>] links, if allowed. + */ + if (lxpnp->lxpr_type != LXPR_PID_FD_FD || + lxpr_readlink_fdnode(lxpnp, bp, buflen) != 0) { + goto out; + } + } + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* Translate the pid (e.g. initpid to 1) */ + lxpr_fixpid(LXPTOZ(lxpnp), curproc, &pid, NULL); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. 
+ */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + error = EACCES; + goto out; + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + error = EINVAL; + goto out; + } + } + + /* copy the link data to user space */ + error = uiomove(bp, strlen(bp), UIO_READ, uiop); + +out: + kmem_free(bp, klen); + return (error); +} + + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. + */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} + +/* Pollhead for fake POLLET support below */ +static struct pollhead lxpr_pollhead; + +static int +lxpr_poll(vnode_t *vp, short ev, int anyyet, short *reventsp, + pollhead_t **phpp, caller_context_t *ct) +{ + *reventsp = 0; + if (ev & POLLIN) + *reventsp |= POLLIN; + if (ev & POLLRDNORM) + *reventsp |= 
POLLRDNORM; + if (ev & POLLRDBAND) + *reventsp |= POLLRDBAND; + if (ev & POLLOUT) + *reventsp |= POLLOUT; + if (ev & POLLWRBAND) + *reventsp |= POLLWRBAND; + + /* + * Newer versions of systemd will monitor /proc/self/mountinfo with + * edge-triggered epoll (via libmount). If adding said resource to an + * epoll descriptor fails, as would be the expectation for a call to + * fs_poll when POLLET is present, then systemd will abort and the zone + * will fail to properly boot. Until proper pollwakeup() support is + * wired into lx_proc, valid POLLET support must be faked. + * + * While the only known (at this time) lx_proc resource where POLLET + * support is mandatory is LXPR_PID_MOUNTINFO, we cast a wide net to + * avoid other unexpected trouble. Normal devpoll caching (emitting a + * pollhead when (*reventsp == 0 && !anyyet)) is not enabled. + */ + if ((ev & POLLET) != 0) { + *phpp = &lxpr_pollhead; + } + return (0); +} + +static int +lxpr_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int i; + + for (i = 0; wr_tab[i].wft_type != LXPR_INVALID; i++) { + if (wr_tab[i].wft_type == type) { + if (wr_tab[i].wft_wrf != NULL) { + return (wr_tab[i].wft_wrf(lxpnp, uiop, cr, ct)); + } + break; + } + } + + /* pretend we wrote the whole thing */ + uiop->uio_offset += uiop->uio_resid; + uiop->uio_resid = 0; + return (0); +} + +/* Needed for writable files which are first "truncated" */ +static int +lxpr_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, + cred_t *cred, caller_context_t *ct) +{ + int error; + + if (cmd != F_FREESP) + return (EINVAL); + if ((error = lxpr_access(vp, VWRITE, 0, cred, ct)) != 0) + return (error); + + return (0); +} + +/* + * Needed for writable files which are first "truncated". We only support + * truncation. 
+ */ +static int +lxpr_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (vap->va_mask != AT_SIZE) + return (EINVAL); + if ((error = lxpr_access(vp, VWRITE, 0, cr, ct)) != 0) + return (error); + + return (0); +} + +/* + * We need to allow open with O_CREAT for the writable files. + */ +static int +lxpr_create(vnode_t *dvp, char *nm, vattr_t *vap, enum vcexcl exclusive, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + lxpr_node_t *lxpnp = VTOLXP(dvp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *vp = NULL; + int error; + + ASSERT(type < LXPR_NFILES); + + /* + * restrict create permission to owner or root + */ + if ((error = lxpr_access(dvp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + if (*nm == '\0') + return (EPERM); + + if (dvp->v_type != VDIR) + return (EPERM); + + if (exclusive == EXCL) + return (EEXIST); + + /* + * No writable files in top-level proc dir. We check this to avoid + * getting a non-proc node via "..". + */ + if (type != LXPR_PROCDIR && + lxpr_lookup(dvp, nm, &vp, NULL, 0, NULL, cr, ct, NULL, NULL) == 0) { + lxpr_nodetype_t ftype = VTOLXP(vp)->lxpr_type; + if (!lxpr_is_writable(ftype)) { + VN_RELE(vp); + vp = NULL; + } + } + + if (vp != NULL) { + ASSERT(vp->v_type != VDIR); + + /* confirm permissions against existing file */ + if ((error = lxpr_access(vp, mode, 0, cr, ct)) != 0) { + VN_RELE(vp); + return (error); + } + + *vpp = vp; + return (0); + } + + /* + * Linux proc does not allow creation of addition, non-subsystem + * specific files inside the hierarchy. ENOENT is tossed when such + * actions are attempted. 
+ */ + return (ENOENT); +} diff --git a/usr/src/uts/common/brand/lx/sys/lx_acl.h b/usr/src/uts/common/brand/lx/sys/lx_acl.h new file mode 100644 index 0000000000..1e5ab26407 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_acl.h @@ -0,0 +1,45 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017 Joyent, Inc. + */ + +#ifndef _LX_ACL_H +#define _LX_ACL_H + +#include <sys/vnode.h> +#include <sys/uio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Both fall under the 'system.' namespace */ +#define LX_XATTR_POSIX_ACL_ACCESS "posix_acl_access" +#define LX_XATTR_POSIX_ACL_DEFAULT "posix_acl_default" + +enum lx_acl_type { + LX_ACL_ACCESS, + LX_ACL_DEFAULT +}; + +extern int lx_acl_setxattr(vnode_t *, enum lx_acl_type, void *, size_t); +extern int lx_acl_getxattr(vnode_t *, enum lx_acl_type, void *, size_t, + ssize_t *); +extern int lx_acl_removexattr(vnode_t *, enum lx_acl_type); +extern int lx_acl_listxattr(vnode_t *, uio_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_ACL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_audit.h b/usr/src/uts/common/brand/lx/sys/lx_audit.h new file mode 100644 index 0000000000..76686dd9ec --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_audit.h @@ -0,0 +1,38 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2018 Joyent, Inc. All rights reserved. + */ + +#ifndef _LX_AUDIT_H +#define _LX_AUDIT_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern void lx_audit_init(int (*)(void *, uint_t, const char *, uint_t)); +extern void lx_audit_cleanup(void); +extern void lx_audit_stop_worker(void *, void (*)(void *, boolean_t)); +extern int lx_audit_append_rule(void *, uint_t); +extern int lx_audit_delete_rule(void *, uint_t); +extern void lx_audit_list_rules(void *, + void (*)(void *, void *, uint_t, void *, uint_t)); +extern void lx_audit_get_feature(void *, void (*)(void *, void *, uint_t)); +extern void lx_audit_get(void *, void (*)(void *, void *, uint_t)); +extern int lx_audit_set(void *, void *, uint_t, void (*cb)(void *, boolean_t)); +extern void lx_audit_emit_user_msg(uint_t, uint_t, char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUDIT_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs.h b/usr/src/uts/common/brand/lx/sys/lx_autofs.h new file mode 100644 index 0000000000..17b19895f4 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs.h @@ -0,0 +1,511 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LX_AUTOFS_H +#define _LX_AUTOFS_H + +/* + * The lxautofs filesystem and driver exist to emulate the Linux autofs + * filesystem and /dev/autofs device (this code emulates both). The + * purpose is to provide support for the Linux "automount" automounter. + * + * The device ioctls map fairly closely to the filesystem ioctls. The device + * ioctls have superseded the filesystem ioctls and the automounter will + * use the device ioctls if the device exists. + * + * The device ioctls are used by the automounter to perform recovery + * in cases where the automounter is restarted while mounts are present. It + * also allows for better management operations when a filesystem is mounted + * on top of an autofs mountpoint, as in the case of an NFS direct mount on + * top of an autofs mount. + * + * + * +++ Linux automounter background. + * + * Linux has two automounters: "amd" (not used in any popular, modern distro) + * and "automount". + * + * "automount" is the normal Linux automounter. It utilizes a kernel + * filesystem (autofs) and device (/dev/autofs) to provide its functionality. + * Basically, it mounts the autofs filesystem at any automounter controlled + * mountpoint. This filesystem then intercepts and redirects lookup operations + * to the userland automounter process via a pipe. The pipe to the automounter + * is established via a mount option when the autofs filesystem is mounted or + * via the setpipefd ioctl if the automounter restarts. 
When the automounter + * receives a request via this pipe, it does lookups (or unmounts) to whatever + * backing store it's configured to use, does mkdir operations on the autofs + * filesystem, mounts remote NFS filesystems on any directories it manages or + * just created, and signals the autofs device via an ioctl to let it know + * that the lookup (or expire) can continue. Other management operations (such + * as querying expiration for unmounting) are performed using the autofs device. + * + * + * +++ Linux autofs documentation. + * + * Within the Linux src tree, see the file: + * Documentation/filesystems/autofs4-mount-control.txt + * This documents some of the autofs behavior and the device driver ioctls. + * + * The following URL (https://lwn.net/Articles/606960/) documents autofs in + * general. This patch was targeted for Documentation/filesystems/autofs4.txt, + * but seems to have never integrated into the Linux src tree. + * + * + * +++ Linux autofs (and automount daemon) notes + * + * Since we're mimicking the behavior of the Linux autofs filesystem and + * device, we document some of the observed behavior here. + * + * There are multiple versions of the autofs filesystem kernel API protocol + * and modern implementations of the user-land automount daemon would depend + * on v5, although the filesystem API has been superseded by the driver ioctl + * API, which is roughly similar. + * + * We'll describe the filesystem ioctls first, since support for those was + * implemented first. The device ioctls roughly correspond to the filesystem + * ioctls and were implemented last, but the automounter will use those + * ioctls, instead of the filesystem ioctls, when the device is present. + * + * Our original autofs implementation was developed in the mid-2000s around the + * v2 protocol, but that is currently obsolete. Our current implementation is + * based around the v5 protocol API. There was no autofs device support at that + * time. 
+ * + * The automounter supports 3 different, mutually exclusive, mount options for + * each mountpoint: + * - indirect (this was all you got with the v2 support) + * - direct + * - offset + * + * An 'indirect' mountpoint is managed with dynamic mounts below that + * mountpoint. For example, if '/home' were an indirect autofs mount, then + * accessing a username under /home would traverse the 'lookup' code described + * below, cause a local subdirectory to be created, and a mount, usually NFS, + * onto that username subdirectory. + * + * A 'direct' mountpoint is an autofs mountpoint which will trigger the + * mounting of another filesystem overtop that mountpoint when accessed. + * + * An 'offset' mountpoint behaves like a 'direct' mountpoint but it is + * created dynamically by the automounter underneath an 'indirect' mountpoint. + * For example, if '/net' were an indirect autofs mountpoint and the host + * 'jurassic' exported two NFS filesystems; '/var/crash' and '/var/core', then + * accessing '/net/jurassic' would trigger the automounter to create two + * subdirectories; '/net/jurassic/var/crash' and '/net/jurassic/var/core'. The + * automounter would then mount an autofs offset mount onto each one of these + * directories. Accessing either of those directories would then trigger + * automounter to perform another mount on top, as is done with a 'direct' + * mount. + * + * General behavior + * + * A) Autofs allows root owned, non-automounter processes to create + * directories in the autofs filesystem. The autofs filesystem treats the + * automounter's process group as special, but it doesn't prevent root + * processes outside of the automounter's process group from creating new + * directories in the autofs filesystem. + * + * B) Autofs doesn't allow creation of any non-directory entries in the + * autofs filesystem. No entity can create files (e.g. /bin/touch or + * VOP_CREATE/VOP_SYMLINK/etc.)
The only entries that can exist within + * the autofs filesystem are directories. + * + * C) Autofs only intercepts vop lookup operations. Notably, it does _not_ + * intercept and re-direct vop readdir operations. This means that the + * observed behavior of the Linux automounter can be considerably different + * from that of the illumos automounter. Specifically, on illumos if an autofs + * mountpoint is mounted _without_ the -nobrowse option then if a user does + * an ls operation (which translates into a vop readdir operation) then the + * automounter will intercept that operation and list all the possible + * directories and mountpoints without actually mounting any filesystems. + * Essentially, all automounter managed mountpoints on Linux will behave + * like "-nobrowse" mountpoints on illumos. Here's an example to illustrate + * this. If /ws was mounted on illumos without the -nobrowse option and an + * auto_ws yp map was setup as the backing store for this mountpoint, then an + * "ls /ws" would list all the keys in the map as valid directories, but an + * "ls /ws" on Linux would list an empty directory. + * + * D) NFS mounts are performed by the automount process. When the automount + * process gets a redirected lookup request, it determines _all_ the + * possible remote mountpoints for that request, creates directory paths + * via mkdir, and mounts the remote filesystems on the newly created paths. + * This is described in the offset mount example above. Once the automounter + * completed the mounts it would signal the autofs filesystem (via an ioctl) + * that the lookup could continue. + * + * E.1) Autofs only redirects vop lookup operations for path entries that + * don't already exist in the autofs filesystem. So for the example above, + * an initial (after the start of the automounter) "ls /net/jurassic" would + * result in a request to the automounter. A subsequent "ls /net/jurassic" + * would not result in a request to the automounter.
Even if + * /net/jurassic/var/crash and /net/jurassic/var/core were manually unmounted + * after the initial "ls /net/jurassic", a subsequent "ls /net/jurassic" + * would not result in a new request to the automounter. + * + * E.2) Autofs lookup requests that are sent to the automounter only include + * the root directory path component. So for example, after starting up + * the automounter if a user were to do a "ls /net/jurassic/var/crash", the + * initial lookup request actually sent to the automounter would just be for + * "jurassic" (the same request as if the user had done "ls /net/jurassic"). + * After the initial mounting of the two offset mounts onto crash and core the + * lookup would continue and a final lookup request would be sent to the + * automounter for "crash" (but this would be on a different vfs from the + * /net vfs). + * + * E.3) The two statements above aren't entirely true. The Linux + * autofs filesystem will also redirect lookup operations for leaf + * directories that don't have a filesystem mounted on them. Using the + * example above, if a user did a "ls /net/jurassic", then manually + * unmounted /net/jurassic/var/crash, and then did an "ls + * /net/jurassic/var/crash", this would result in a request for + * "jurassic/var/crash" being sent to the automounter. The strange thing + * (a Linux bug perhaps) is that the automounter won't do anything with this + * request and the lookup will fail. + * + * F) The autofs filesystem communication protocol (what ioctls it supports + * and what data it passes to the automount process) is versioned. The + * userland automount daemon (as of version v5.0.7) expects v5 of the protocol + * (by running the AUTOFS_IOC_PROTOSUBVER ioctl), and exits if that is not + * supported. For v2-v5 the structure passed through the pipe always begins + * with a common header followed by different fields depending on the packet + * type. In addition the different versions support additional ioctls.
+ * + * v2 - basic lookup request + * v3 - adds expiring (umounting) + * v4 - adds expire multi + * v5 - adds missing indirect, expire indirect, missing direct & expire direct. + * Defines a new protocol structure layout. + * The v5 'missing indirect' and 'missing direct' ioctls are analogous to + * the v2 'missing' ioctl. These ioctls are used to initiate a mount via + * a lookup. The 'expire' ioctls are used by the automounter to query if + * it is possible to unmount the filesystem. 'direct' and 'indirect' + * refer to the mount option type that the automounter performed and + * correlate to an automounter direct or indirect map mountpoint. + * + * G) The automounter periodically issues an 'expire' ioctl to autofs to + * obtain the name of a mountpoint which the automounter can unmount. + * Unmounting is discussed in more detail below. + * + * H) The device ioctls roughly correspond to the filesystem ioctls, but + * instead of being tied to an autofs mountpoint vnode, they can be called any + * time. The argument structure uses either a path or an autofs pipe file + * descriptor to indicate what is being operated on. + * + * +++ lxautofs notes + * + * 1) In general, the lxautofs filesystem tries to mimic the behavior of the + * Linux autofs filesystem with the following exceptions: + * + * 1.1) We don't bother to implement the E.3 functionality listed above + * since it doesn't appear to be of any use. + * + * 1.2) We only fully implement v2 and v5 of the autofs protocol. + * + * 2) In general, the approach taken for lxautofs is to keep it as simple + * as possible and to minimize its memory usage. To do this all information + * about the contents of the lxautofs filesystem is mirrored in the + * underlying filesystem that lxautofs is mounted on and most vop operations + * are simply passed onto this underlying filesystem. This means we don't + * have to implement most of the complex operations that a full filesystem + * normally has to implement.
It also means that most of our filesystem state + * (wrt the contents of the filesystem) doesn't actually have to be stored + * in memory, we can simply go to the underlying filesystem to get it when + * it's requested. For the purposes of discussion, we'll call the underlying + * filesystem the "backing store." + * + * The backing store is actually a directory called ".lxautofs" which is created + * in the directory where the lxautofs filesystem is mounted. When the + * lxautofs filesystem is unmounted this backing store directory is deleted. + * If this directory exists at mount time (perhaps the system crashed while a + * previous lxautofs instance was mounted at the same location) it will be + * deleted. There are a few implications of using a backing store worth + * mentioning. + * + * 2.1) lxautofs can't be mounted on a read only filesystem. If this + * proves to be a problem we can probably move the location of the + * backing store. + * + * 2.2) If the backing store filesystem runs out of space then the + * automounter process won't be able to create more directories and mount + * new filesystems. Of course, strange failures usually happen when + * filesystems run out of space. + * + * 3) Why aren't we using gfs? gfs has two different usage models. + * + * 3.1) I'm my own filesystem but i'm using gfs to help with managing + * readdir operations. + * + * 3.2) I'm a gfs filesystem and gfs is managing all my vnodes + * + * We're not using the 3.1 interfaces because we don't implement readdir + * ourselves. We pass all readdir operations onto the backing store + * filesystem and utilize its readdir implementation. + * + * We're not using the 3.2 interfaces because they are really designed for + * in memory filesystems where all of the filesystem state is stored in + * memory. They don't lend themselves to filesystems where part of the + * state is in memory and part of the state is on disk. 
+ * + * For more information on gfs take a look at the block comments in the + * top of gfs.c + * + * 4) Unmounting + * + * The automounter has a timeout associated with each mount. It informs autofs + * of this timeout using the LX_AUTOFS_DEV_IOC_TIMEOUT_CMD ioctl after autofs + * has been mounted on the mountpoint. + * + * After the automounter has mounted something associated with the mountpoint + * then periodically (<timeout>/4 seconds) the automounter will issue the + * LX_AUTOFS_DEV_IOC_EXPIRE_CMD ioctl on the autofs mount. autofs is expected + * to respond with an underlying mountpoint entry which is a candidate for + * unmounting. The automounter will attempt to unmount the filesystem + * (which may fail if it is busy, since this is obviously racy) and then + * acknowledge the expire ioctl. The successful acknowledgement is independent + * of the success of unmounting the underlying filesystem. + * + * Unmount handling varies based on which type of mount the autofs was mounted + * with (indirect, direct or offset). + * + * To support 'indirect' mount expiration, the autofs vfs keeps track of the + * filesystems mounted immediately under the autofs mountpoint (in + * lav_mnt_list) after a lookup has completed successfully. Upon receipt of the + * LX_AUTOFS_DEV_IOC_EXPIRE_CMD ioctl, autofs removes the first element from + * the list, attempts to check if it is busy and if not, returns that mountpoint + * over the fifo (if busy the entry is added to the end of the list). When the + * ioctl is acknowledged, if the mountpoint still exists, that means the unmount + * failed and the entry is added at the back of the list. If there are no + * elements or the first one is busy, EAGAIN is returned for the 'expire' ioctl + * and the automounter will check again in <timeout>/4 seconds. + * + * For example, if /home is an autofs indirect mount, then there are typically + * many different {username}-specific NFS mounts under that /home autofs mount.
+ * autofs uses the lav_mnt_list to respond to 'expire' ioctls in a round-robin + * fashion so that the automounter can unmount user file systems that aren't in + * use. + * + * Expiring 'direct' mounts is similar, but since there is only a single mount, + * the lav_mnt_list only will have at most one entry if there is a filesystem + * mounted overtop of the autofs mount. + * + * Expiring 'offset' mounts is more complicated because there are at least + * two different autofs VFSs involved (the top-level and one for each offset + * mount underneath). The actual offset mount is handled exactly like a 'direct' + * mount. The top-level is an indirect mount and is handled in a similar way + * as described above for indirect mounts, but special handling is needed for + * each offset mount below. + * + * This can be explained using the same 'jurassic' example described earlier + * (/net is an autofs 'indirect' mount and the host 'jurassic' has two exported + * file systems; /var/crash and /var/core). If the user accesses + * /net/jurassic/var/crash then the automounter would set up the system so that + * the following mounts exist: + * - /net (the original autofs indirect mount which triggers everything) + * - /net/jurassic/var/crash (autofs offset mount) + * - /net/jurassic/var/crash (NFS mount on top of the autofs offset mount) + * - /net/jurassic/var/core (autofs offset mount) + * + * For expiration the automounter will issue the LX_AUTOFS_IOC_EXPIRE_MULTI + * ioctl on each autofs vfs for which something is mounted, so we would receive + * an expire ioctl on /net and another on /net/jurassic/var/crash. The vfs for + * /net will be tracking "jurassic", but we detect it is busy and won't do + * anything at first. The vfs for "crash" will work like a direct mount and + * acknowledge the expire ioctl to the automounter once that filesystem times + * out and is no longer busy. The automounter will then unmount the "crash" + * NFS mount.
+ * + * Once the "crash" NFS mount has been unmounted by the automounter, we're left + * with the two autofs offset mounts under jurassic. The automounter will not + * try to unmount either of those, so we have to do that. Once we get another + * expire ioctl on /net and check "jurassic", we'll see there are only autofs + * mounts under /net/jurassic. We umount those using the lx_autofs_umount_offset + * function and respond to the automounter expire ioctl with "jurassic", in the + * same way as we would for any other indirect mount. + * + * 5) Recovery + * + * If the automounter is restarted for any reason, it needs to cope with + * pre-existing autofs mounts, as well as other automount-initiated mounts (e.g. + * a direct mount on top of an autofs mountpoint). The automounter uses the + * /proc/mounts file to correlate mounts to the managed mountpoints. It then + * uses the /dev/autofs device to openmount each of the autofs devices and + * reinitialize them using the various dev ioctls (timeout, requester, etc.). + * + * In general, the automounter will closemount the mountpoint once it's done, + * but it doesn't in the case of an offset mountpoint with nothing mounted + * on top. In this case the automounter expects autofs to expire that mountpoint + * before it will closemount (so things can subsequently clean up). We handle + * this special case in the expire code path. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that the name of the actual file system is lxautofs, not lx_autofs, but + * the code uses lx_autofs to prefix the various names. This is because file + * system names are limited to 8 characters. + */ +#define LX_AUTOFS_NAME "lxautofs" + +#define LX_AUTOFS_MINORNAME "autofs" + +/* + * Mount options supported.
+ */ +#define LX_MNTOPT_FD "fd" +#define LX_MNTOPT_PGRP "pgrp" +#define LX_MNTOPT_MINPROTO "minproto" +#define LX_MNTOPT_MAXPROTO "maxproto" +#define LX_MNTOPT_INDIRECT "indirect" +#define LX_MNTOPT_DIRECT "direct" +#define LX_MNTOPT_OFFSET "offset" + +/* + * Version/subversion of the Linux kernel automount protocol we support. + * + * We fully support v2 and v5. We'll return ENOTSUP for all of the ioctls we + * don't yet handle. + */ +#define LX_AUTOFS_PROTO_VERS5 5 +#define LX_AUTOFS_PROTO_SUBVERSION 2 +#define LX_AUTOFS_PROTO_VERS2 2 + +/* packet types */ +typedef enum laph_ptype { + LX_AUTOFS_PTYPE_MISSING, /* 0 */ + LX_AUTOFS_PTYPE_EXPIRE, /* 1 */ + LX_AUTOFS_PTYPE_EXPIRE_MULTI, /* 2 */ + LX_AUTOFS_PTYPE_MISSING_INDIR, /* 3 */ + LX_AUTOFS_PTYPE_EXPIRE_INDIR, /* 4 */ + LX_AUTOFS_PTYPE_MISSING_DIRECT, /* 5 */ + LX_AUTOFS_PTYPE_EXPIRE_DIRECT /* 6 */ +} laph_ptype_t; + +/* + * Common header for all versions of the protocol. + */ +typedef struct lx_autofs_pkt_hdr { + int laph_protover; /* protocol version number */ + laph_ptype_t laph_type; + int laph_id; /* every pkt must have a unique id */ +} lx_autofs_pkt_hdr_t; + +/* + * Command structure sent to automount process from lxautofs via a pipe. + * This structure is the same for v2-v4 of the automount protocol + * (the communication pipe is established at mount time). 
+ */ +typedef struct lx_autofs_v2_pkt { + lx_autofs_pkt_hdr_t lap_hdr; + int lap_name_len; /* don't include newline or NULL */ + char lap_name[256]; /* path component to lookup */ +} lx_autofs_v2_pkt_t; + +/* v4 multi-expire */ +typedef struct lx_autofs_v4_exp_pkt { + lx_autofs_pkt_hdr_t lape_hdr; + int lape_len; + char lape_name[MAXNAMELEN]; +} lx_autofs_v4_exp_pkt_t; + +/* v5 */ +typedef struct lx_autofs_v5_pkt { + lx_autofs_pkt_hdr_t lap_hdr; + uint32_t lap_dev; + uint64_t lap_ino; + uint32_t lap_uid; + uint32_t lap_gid; + uint32_t lap_pid; + uint32_t lap_tgid; + uint32_t lap_name_len; + char lap_name[256]; +} lx_autofs_v5_pkt_t; + +union lx_autofs_pkt { + lx_autofs_v2_pkt_t lap_v2; + lx_autofs_v5_pkt_t lap_v5; +}; + +#define lap_protover lap_v2.lap_hdr.laph_protover +#define lap_type lap_v2.lap_hdr.laph_type +#define lap_id lap_v2.lap_hdr.laph_id + +/* + * Ioctls fully supported (v2 protocol). + */ +#define LX_AUTOFS_IOC_READY 0x00009360 /* arg: int */ +#define LX_AUTOFS_IOC_FAIL 0x00009361 /* arg: int */ +#define LX_AUTOFS_IOC_CATATONIC 0x00009362 /* arg: <none> */ + +/* + * Ioctls supported (v3/v4 protocol). + */ +#define LX_AUTOFS_IOC_PROTOVER 0x80049363 /* arg: int */ +#define LX_AUTOFS_IOC_SETTIMEOUT 0xc0089364 /* arg: ulong_t */ + +/* + * Ioctls not supported (v3/v4 protocol). + */ + /* arg: lx_autofs_v3_exp_pkt_t * */ +#define LX_AUTOFS_IOC_EXPIRE 0x81109365 + +/* + * Ioctls supported (v5 protocol). 
+ */ +#define LX_AUTOFS_IOC_PROTOSUBVER 0x80049367 /* arg: int */ +#define LX_AUTOFS_IOC_ASKUMOUNT 0x80049370 /* arg: int */ +#define LX_AUTOFS_IOC_EXPIRE_MULTI 0x40049366 /* arg: int */ +#define LX_AUTOFS_IOC_EXPIRE_INDIRECT LX_AUTOFS_IOC_EXPIRE_MULTI +#define LX_AUTOFS_IOC_EXPIRE_DIRECT LX_AUTOFS_IOC_EXPIRE_MULTI + +/* + * autofs device ioctls + */ +#define LX_AUTOFS_DEV_IOC_VERSION_CMD 0xc0189371 +#define LX_AUTOFS_DEV_IOC_PROTOVER_CMD 0xc0189372 +#define LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD 0xc0189373 +#define LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD 0xc0189374 +#define LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD 0xc0189375 +#define LX_AUTOFS_DEV_IOC_READY_CMD 0xc0189376 +#define LX_AUTOFS_DEV_IOC_FAIL_CMD 0xc0189377 +#define LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD 0xc0189378 +#define LX_AUTOFS_DEV_IOC_CATATONIC_CMD 0xc0189379 +#define LX_AUTOFS_DEV_IOC_TIMEOUT_CMD 0xc018937a +#define LX_AUTOFS_DEV_IOC_REQUESTER_CMD 0xc018937b +#define LX_AUTOFS_DEV_IOC_EXPIRE_CMD 0xc018937c +#define LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD 0xc018937d +#define LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD 0xc018937e + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h new file mode 100644 index 0000000000..39ea96d1fe --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _LX_AUTOFS_IMPL_H +#define _LX_AUTOFS_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/file.h> +#include <sys/id_space.h> +#include <sys/modhash.h> +#include <sys/vnode.h> + +#include <sys/lx_autofs.h> + +/* + * Space key. + * Used to persist data across lx_autofs filesystem module unloads. + */ +#define LX_AUTOFS_SPACE_KEY_UDEV LX_AUTOFS_NAME "_udev" + +/* + * Name of the backing store directory. + */ +#define LX_AUTOFS_BS_DIR "." LX_AUTOFS_NAME + +#define LX_AUTOFS_VFS_ID_HASH_SIZE 15 +#define LX_AUTOFS_VFS_PATH_HASH_SIZE 15 +#define LX_AUTOFS_VFS_VN_HASH_SIZE 15 + +enum lx_autofs_mnttype { LXAMT_NONE, LXAMT_INDIR, LXAMT_DIRECT, LXAMT_OFFSET }; + +typedef struct lx_autofs_mntent { + list_node_t lxafme_lst; + uint64_t lxafme_ts; /* time stamp */ + uint_t lxafme_len; + char *lxafme_path; +} lx_autofs_mntent_t; + +/* + * VFS data object. + */ +typedef struct lx_autofs_vfs { + /* Info about the underlying filesystem and backing store. */ + vnode_t *lav_mvp; + char *lav_bs_name; + vnode_t *lav_bs_vp; + + /* Info about the automounter process managing this filesystem. */ + int lav_fd; + pid_t lav_pgrp; + file_t *lav_fifo_wr; + file_t *lav_fifo_rd; + + /* The mount's dev and ino values for v5 protocol msg */ + uint64_t lav_dev; + u_longlong_t lav_ino; + + /* options from the mount */ + enum lx_autofs_mnttype lav_mnttype; + int lav_min_proto; + + /* + * ioctl-set timeout value. The automounter will perform an expire + * ioctl every timeout/4 seconds. We use this to expire a mount once + * it is inactive for the full timeout. 
+ */ + ulong_t lav_timeout; + + /* ioctl-set catatonic value (prevents future mounts). */ + boolean_t lav_catatonic; + + /* Mount initiator's uid/gid for recovery handling. */ + uid_t lav_uid; + gid_t lav_gid; + + /* Each automount request needs a unique id. */ + id_space_t *lav_ids; + + /* All remaining structure members are protected by lav_lock. */ + kmutex_t lav_lock; + /* openmount counter */ + int lav_openmnt_cnt; + + + /* Hashes to keep track of outstanding automounter requests. */ + mod_hash_t *lav_path_hash; + mod_hash_t *lav_id_hash; + + /* We need to keep track of all our vnodes. */ + vnode_t *lav_root; + mod_hash_t *lav_vn_hash; + + /* list of current mounts */ + list_t lav_mnt_list; +} lx_autofs_vfs_t; + +enum lx_autofs_callres { LXACR_NONE, LXACR_READY, LXACR_FAIL }; + +/* + * Structure to keep track of automounter requests sent to user-land. + */ +typedef struct lx_autofs_automnt_req { + /* Packet that gets sent to the automounter. */ + union lx_autofs_pkt laar_pkt; + int laar_pkt_size; + + /* Reference count. Always updated atomically. */ + uint_t laar_ref; + + /* + * Fields to keep track and sync threads waiting on a lookup. + * Fields are protected by laar_lock. + */ + kmutex_t laar_lock; + kcondvar_t laar_cv; + int laar_complete; + + enum lx_autofs_callres laar_result; +} lx_autofs_automnt_req_t; + +/* + * Generic stack structure. + */ +typedef struct stack_elem { + list_node_t se_list; + caddr_t se_ptr1; + caddr_t se_ptr2; + caddr_t se_ptr3; +} stack_elem_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_IMPL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h new file mode 100644 index 0000000000..35b1bddb03 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h @@ -0,0 +1,772 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _LX_BRAND_H +#define _LX_BRAND_H + +#ifndef _ASM +#include <sys/types.h> +#include <sys/cpuvar.h> +#include <sys/zone.h> +#include <sys/ksocket.h> +#include <sys/vfs.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/cpuvar.h> +#include <sys/lx_futex.h> +#include <sys/lx_userhz.h> +#include <sys/uuid.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define LX_BRANDNAME "lx" + +/* + * Brand uname info + */ +#define LX_UNAME_SYSNAME "Linux" +#define LX_UNAME_RELEASE_2_6 "2.6.18" +#define LX_UNAME_RELEASE_2_4 "2.4.21" +#define LX_UNAME_VERSION "BrandZ virtual linux" +#define LX_UNAME_MACHINE32 "i686" +#define LX_UNAME_MACHINE64 "x86_64" + +#define LX_LIB_PATH32 "/native/usr/lib/lx_brand.so.1" +#define LX_LIB_PATH64 "/native/usr/lib/amd64/lx_brand.so.1" + +#define LX_VDSO_PATH32 "/native/usr/lib/brand/lx/lx_vdso.so.1" +#define LX_VDSO_PATH64 "/native/usr/lib/brand/lx/amd64/lx_vdso.so.1" + +#if defined(_LP64) +#define LX_LIB_PATH LX_LIB_PATH64 +#define LX_UNAME_MACHINE LX_UNAME_MACHINE64 +#define LX_VDSO_PATH LX_VDSO_PATH64 +#else +#define LX_LIB_PATH LX_LIB_PATH32 +#define LX_UNAME_MACHINE LX_UNAME_MACHINE32 
+#define LX_VDSO_PATH LX_VDSO_PATH32 +#endif + +/* + * This must be large enough for both the 32-bit table and 64-bit table. + */ +#define LX_NSYSCALLS 358 + +/* Highest capability we know about */ +#define LX_CAP_MAX_VALID 36 + +/* sched attr flag values */ +#define LX_SCHED_FLAG_RESET_ON_FORK 0x1 +/* + * brand(2) subcommands + * + * Everything >= 128 is a brand-specific subcommand. + * > 192 is reserved for in-kernel emulated system calls. + */ +#define B_LPID_TO_SPAIR 128 +#define B_GET_CURRENT_CONTEXT 129 +#define B_EMULATION_DONE 130 +/* Some native programs use B_START_NFS_LOCKD, so don't change this. */ +#define B_START_NFS_LOCKD 131 +#define B_BLOCK_ALL_SIGS 132 +#define B_UNBLOCK_ALL_SIGS 133 +#define B_PTRACE_CLONE_BEGIN 134 +#define B_PTRACE_STOP_FOR_OPT 135 +#define B_UNSUPPORTED 136 +#define B_STORE_ARGS 137 +#define B_GETPID 138 +#define B_JUMP_TO_LINUX 139 +#define B_ALL_SIGS_BLOCKED 140 +#define B_EXIT_AS_SIG 141 +/* formerly B_HELPER_WAITID 142 */ +#define B_HELPER_CLONE 143 +#define B_HELPER_SETGROUPS 144 +#define B_HELPER_SIGQUEUE 145 +#define B_HELPER_TGSIGQUEUE 146 +#define B_SET_NATIVE_STACK 147 +/* formerly B_SIGEV_THREAD_ID 148 */ +#define B_OVERRIDE_KERN_VER 149 +#define B_PTRACE_SIG_RETURN 150 +#define B_GET_PERSONALITY 151 + +#ifndef _ASM +/* + * Support for Linux PTRACE_SETOPTIONS handling. 
+ */ +typedef enum lx_ptrace_options { + LX_PTRACE_O_TRACESYSGOOD = 0x0001, + LX_PTRACE_O_TRACEFORK = 0x0002, + LX_PTRACE_O_TRACEVFORK = 0x0004, + LX_PTRACE_O_TRACECLONE = 0x0008, + LX_PTRACE_O_TRACEEXEC = 0x0010, + LX_PTRACE_O_TRACEVFORKDONE = 0x0020, + LX_PTRACE_O_TRACEEXIT = 0x0040, + LX_PTRACE_O_TRACESECCOMP = 0x0080 +} lx_ptrace_options_t; + +#define LX_PTRACE_O_ALL \ + (LX_PTRACE_O_TRACESYSGOOD | LX_PTRACE_O_TRACEFORK | \ + LX_PTRACE_O_TRACEVFORK | LX_PTRACE_O_TRACECLONE | \ + LX_PTRACE_O_TRACEEXEC | LX_PTRACE_O_TRACEVFORKDONE | \ + LX_PTRACE_O_TRACEEXIT | LX_PTRACE_O_TRACESECCOMP) +#endif /* !_ASM */ + +/* siginfo si_status for traced events */ +#define LX_PTRACE_EVENT_FORK 0x100 +#define LX_PTRACE_EVENT_VFORK 0x200 +#define LX_PTRACE_EVENT_CLONE 0x300 +#define LX_PTRACE_EVENT_EXEC 0x400 +#define LX_PTRACE_EVENT_VFORK_DONE 0x500 +#define LX_PTRACE_EVENT_EXIT 0x600 +#define LX_PTRACE_EVENT_SECCOMP 0x700 + +/* + * Brand-private values for the "pr_what" member of lwpstatus, for use with the + * PR_BRAND stop reason. These reasons are validated in lx_stop_notify(); + * update it if you add new reasons here. + */ +#define LX_PR_SYSENTRY 1 +#define LX_PR_SYSEXIT 2 +#define LX_PR_SIGNALLED 3 +#define LX_PR_EVENT 4 + + +#define LX_VERSION_1 1 +#define LX_VERSION LX_VERSION_1 + +#define LX_ATTR_KERN_RELEASE ZONE_ATTR_BRAND_ATTRS +#define LX_ATTR_KERN_VERSION (ZONE_ATTR_BRAND_ATTRS + 1) +#define LX_ATTR_TTY_GID (ZONE_ATTR_BRAND_ATTRS + 2) + +/* + * Aux vector containing phdr of Linux executable and ehdr of interpreter + * (if any), both of which are used by lx_librtld_db to ascertain r_debug. + * We repurpose the 3rd brand-specific aux vector slot for the Linux + * AT_SYSINFO_EHDR entry (we modify the a_type in the brand library). 
+ */ +#define AT_SUN_BRAND_LX_PHDR AT_SUN_BRAND_AUX1 +#define AT_SUN_BRAND_LX_INTERP AT_SUN_BRAND_AUX2 +#define AT_SUN_BRAND_LX_CLKTCK AT_SUN_BRAND_AUX3 +#define AT_SUN_BRAND_LX_SYSINFO_EHDR AT_SUN_BRAND_AUX4 + +/* Aux vectors containing real/effective user/group IDs */ +#define AT_LX_UID 11 +#define AT_LX_EUID 12 +#define AT_LX_GID 13 +#define AT_LX_EGID 14 +/* Aux vector containing hz value */ +#define AT_CLKTCK 17 +/* Aux vector containing secure boolean */ +#define AT_SECURE 23 +/* Aux vector containing vDSO addr */ +#define AT_SYSINFO_EHDR 33 + +/* + * Usermode emulation routines are run on an alternate stack allocated by + * the brand library. Every LWP in a process will incur this overhead beyond + * the regular thread stack: + */ +#define LX_NATIVE_STACK_PAGE_COUNT 64 + +/* + * When returning in a new child process created with vfork(2) (or CLONE_VFORK) + * we discard some of the native stack to prevent corruption of the parent + * emulation state. + */ +#define LX_NATIVE_STACK_VFORK_GAP 0x3000 + +#ifndef _ASM + +extern struct brand lx_brand; + +typedef struct lx_brand_registration { + uint_t lxbr_version; /* version number */ + void *lxbr_handler; /* base address of handler */ + uint32_t lxbr_flags; /* LX_PROC_* registration flags */ +} lx_brand_registration_t; + +typedef struct lx_brand_registration32 { + uint_t lxbr_version; /* version number */ + uint32_t lxbr_handler; /* base address of handler */ + uint32_t lxbr_flags; /* LX_PROC_* registration flags */ +} lx_brand_registration32_t; + +#endif /* _ASM */ + +/* + * GDT usage + */ +#define GDT_TLSMIN (GDT_BRANDMIN) +#define GDT_TLSMAX (GDT_TLSMIN + 2) +#define LX_TLSNUM (GDT_TLSMAX - GDT_TLSMIN) + +#ifndef _ASM + +/* + * Stores information needed by the lx linker to launch the main + * lx executable. 
+ */ +typedef struct lx_elf_data64 { + uintptr_t ed_phdr; + uintptr_t ed_phent; + uintptr_t ed_phnum; + uintptr_t ed_entry; + uintptr_t ed_base; + uintptr_t ed_ldentry; +} lx_elf_data64_t; + +typedef struct lx_elf_data32 { + uint32_t ed_phdr; + uint32_t ed_phent; + uint32_t ed_phnum; + uint32_t ed_entry; + uint32_t ed_base; + uint32_t ed_ldentry; +} lx_elf_data32_t; + +#if defined(_LP64) +typedef lx_elf_data64_t lx_elf_data_t; +#else +typedef lx_elf_data32_t lx_elf_data_t; +#endif + +typedef enum lx_proc_flags { + /* flags configurable via brandsys() and members of LX_PROC_ALL */ + LX_PROC_INSTALL_MODE = 0x01, + LX_PROC_STRICT_MODE = 0x02, + /* internal flags */ + LX_PROC_CHILD_DEATHSIG = 0x04, + LX_PROC_NO_DUMP = 0x08 /* for lx_prctl LX_PR_[GS]ET_DUMPABLE */ +} lx_proc_flags_t; + +#define LX_PROC_ALL (LX_PROC_INSTALL_MODE | LX_PROC_STRICT_MODE) + +/* Maximum length for fields of LX uname */ +#define LX_SYS_UTS_LN 65 + +/* Max. length of kernel release string */ +#define LX_KERN_RELEASE_MAX LX_SYS_UTS_LN +#define LX_KERN_VERSION_MAX LX_SYS_UTS_LN + +#ifdef _KERNEL + +/* + * Entry points for cgroup integration. 
+ */ +extern void (*lx_cgrp_initlwp)(vfs_t *, uint_t, id_t, pid_t); +extern void (*lx_cgrp_freelwp)(vfs_t *, uint_t, id_t, pid_t); + +#define LX_RLFAKE_LOCKS 0 +#define LX_RLFAKE_NICE 1 +#define LX_RLFAKE_RTPRIO 2 +#define LX_RLFAKE_RTTIME 3 + +#define LX_RLFAKE_NLIMITS 4 + +#define LX_RLIM64_INFINITY (~0ULL) + +typedef struct { + uint64_t rlim_cur; + uint64_t rlim_max; +} lx_rlimit64_t; + +typedef struct { + list_node_t lx_clgrpm_link; + proc_t *lx_clgrpm_pp; +} lx_clone_grp_member_t; + +typedef struct { + kmutex_t lx_clgrp_lock; /* protects cnt & member list */ + uint_t lx_clgrp_cnt; + list_t lx_clgrp_members; +} lx_clone_grp_t; + +/* Entries in the l_clone_grps clone-group array */ +#define LX_CLGRP_FS 0 +#define LX_CLGRP_MAX 1 + +/* See explanation in lx_mem.c about lx_mremap */ +#define LX_REMAP_ANONCACHE_NENTRIES 4 +typedef struct lx_segmap { + uintptr_t lxsm_vaddr; /* virtual address of mapping */ + size_t lxsm_size; /* size of mapping in bytes */ + uint64_t lxsm_lru; /* LRU field for cache */ + uint_t lxsm_flags; /* protection and attribute flags */ +} lx_segmap_t; + +typedef struct lx_proc_data { + uintptr_t l_handler; /* address of user-space handler */ + pid_t l_ppid; /* pid of originating parent proc */ + uid_t l_loginuid; /* /proc/{pid}/loginuid */ + int64_t l_ptrace; /* count of process lwps observed by ptrace */ + lx_elf_data_t l_elf_data; /* ELF data for linux executable */ + /* signal to deliver to parent when this thread group dies */ + int l_signal; + /* native signal to deliver to process when parent dies */ + int l_parent_deathsig; + lx_proc_flags_t l_flags; + + kmutex_t l_clone_grp_lock; /* protects the following member */ + lx_clone_grp_t *l_clone_grps[LX_CLGRP_MAX]; + + lx_rlimit64_t l_fake_limits[LX_RLFAKE_NLIMITS]; + + kmutex_t l_io_ctx_lock; /* protects the following members */ + uintptr_t l_io_ctxpage; + kcondvar_t l_io_destroy_cv; + uint_t l_io_ctx_cnt; + struct lx_io_ctx **l_io_ctxs; + + /* Override zone-wide settings for uname release 
and version */ + char l_uname_release[LX_KERN_RELEASE_MAX]; + char l_uname_version[LX_KERN_VERSION_MAX]; + + /* Linux process personality */ + unsigned int l_personality; + + /* VDSO location */ + uintptr_t l_vdso; + + /* mremap anon cache */ + kmutex_t l_remap_anoncache_lock; + uint64_t l_remap_anoncache_generation; + lx_segmap_t l_remap_anoncache[LX_REMAP_ANONCACHE_NENTRIES]; + + /* Block all signals to all threads; used during vfork */ + uint_t l_block_all_signals; +} lx_proc_data_t; + +#endif /* _KERNEL */ + +/* + * Linux process personality(2) flags stored in l_personality + */ +#define LX_PER_UNAME26 0x0020000 +#define LX_PER_ADDR_NO_RANDOMIZE 0x0040000 +#define LX_PER_FDPIC_FUNCPTRS 0x0080000 +#define LX_PER_MMAP_PAGE_ZERO 0x0100000 +#define LX_PER_ADDR_COMPAT_LAYOUT 0x0200000 +#define LX_PER_READ_IMPLIES_EXEC 0x0400000 +#define LX_PER_ADDR_LIMIT_32BIT 0x0800000 +#define LX_PER_SHORT_INODE 0x1000000 +#define LX_PER_WHOLE_SECONDS 0x2000000 +#define LX_PER_STICKY_TIMEOUTS 0x4000000 +#define LX_PER_ADDR_LIMIT_3GB 0x8000000 + +#define LX_PER_LINUX 0x00 +#define LX_PER_SUNOS (0x06 | LX_PER_STICKY_TIMEOUTS) +#define LX_PER_MASK 0xff + +/* max. number of aio control blocks (see lx_io_setup) allowed across zone */ +#define LX_AIO_MAX_NR 65536 + +/* + * A data type big enough to bitmap all possible Linux cpus. + * The bitmap size is defined as 1024 cpus in the Linux 2.4 and 2.6 man pages + * for sched_setaffinity() and sched_getaffinity(). 
+ */ +#define LX_NCPU (1024) +#define LX_AFF_ULONGS (LX_NCPU / (8 * sizeof (ulong_t))) +typedef ulong_t lx_affmask_t[LX_AFF_ULONGS]; + +/* + * Flag values for uc_brand_data[0] in the ucontext_t: + */ +#define LX_UC_STACK_NATIVE 0x00001 +#define LX_UC_STACK_BRAND 0x00002 +#define LX_UC_RESTORE_NATIVE_SP 0x00010 +#define LX_UC_FRAME_IS_SYSCALL 0x00100 +#define LX_UC_RESTART_SYSCALL 0x01000 +#define LX_UC_IGNORE_LINK 0x10000 + +#ifdef _KERNEL + +typedef struct lx_lwp_data lx_lwp_data_t; + +/* + * Flag values for "lxpa_flags" on a ptrace(2) accord. + */ +typedef enum lx_accord_flags { + LX_ACC_TOMBSTONE = 0x01 +} lx_accord_flags_t; + +/* + * Flags values for "br_ptrace_flags" in the LWP-specific data. + */ +typedef enum lx_ptrace_flags { + LX_PTF_SYSCALL = 0x01, /* handling syscall or a trap */ + LX_PTF_EXITING = 0x02, + LX_PTF_STOPPING = 0x04, + LX_PTF_INHERIT = 0x08, + LX_PTF_STOPPED = 0x10, + LX_PTF_PARENT_WAIT = 0x20, + LX_PTF_CLDPEND = 0x40, + LX_PTF_CLONING = 0x80, + LX_PTF_WAITPEND = 0x100, + LX_PTF_NOSTOP = 0x200, /* disable syscall stop event */ + LX_PTF_INSYSCALL = 0x400 /* between syscall enter & exit */ +} lx_ptrace_flags_t; + +/* + * A ptrace(2) accord represents the relationship between a tracer LWP and the + * set of LWPs that it is tracing: the tracees. This data structure belongs + * primarily to the tracer, but is reference counted so that it may be freed by + * whoever references it last. + */ +typedef struct lx_ptrace_accord { + kmutex_t lxpa_lock; + uint_t lxpa_refcnt; + lx_accord_flags_t lxpa_flags; + + /* + * The tracer must hold "pidlock" while clearing these fields for + * exclusion of waitid(), etc. + */ + lx_lwp_data_t *lxpa_tracer; + kcondvar_t *lxpa_cvp; + + /* + * The "lxpa_tracees_lock" mutex protects the tracee list. + */ + kmutex_t lxpa_tracees_lock; + list_t lxpa_tracees; +} lx_ptrace_accord_t; + +/* + * These values are stored in the per-LWP data for a tracee when it is attached + * to a tracer. 
They record the method that was used to attach. + */ +typedef enum lx_ptrace_attach { + LX_PTA_NONE = 0x00, /* not attached */ + LX_PTA_ATTACH = 0x01, /* due to tracer using PTRACE_ATTACH */ + LX_PTA_TRACEME = 0x02, /* due to child using PTRACE_TRACEME */ + LX_PTA_INHERIT_CLONE = 0x04, /* due to PTRACE_CLONE clone(2) flag */ + LX_PTA_INHERIT_OPTIONS = 0x08 /* due to PTRACE_SETOPTIONS options */ +} lx_ptrace_attach_t; + +typedef enum lx_stack_mode { + LX_STACK_MODE_PREINIT = 0, + LX_STACK_MODE_INIT, + LX_STACK_MODE_NATIVE, + LX_STACK_MODE_BRAND +} lx_stack_mode_t; + +struct lx_pid { + pid_t lxp_spid; /* the SunOS pid and ... */ + id_t lxp_stid; /* ... tid pair */ + pid_t lxp_lpid; /* the corresponding linux pid */ + time_t lxp_start; /* birthday of this pid */ + struct pid *lxp_pidp; /* allocated pid struct */ + proc_t *lxp_procp; /* proc_t corresponding to lxp_spid */ + struct lx_pid *lxp_stol_next; /* link in stol hash table */ + struct lx_pid *lxp_ltos_next; /* link in ltos hash table */ +}; + +/* + * lx-specific data in the klwp_t + */ +struct lx_lwp_data { + uint_t br_lwp_flags; /* misc. 
flags */ + klwp_t *br_lwp; /* back pointer to container lwp */ + int br_signal; /* signal to send to parent when */ + /* clone()'ed child terminates */ + int br_exitwhy; /* reason for thread (process) exit */ + int br_exitwhat; /* exit code / killing signal */ + cpuset_t *br_affinitymask; /* bitmask of CPU sched affinities */ + struct user_desc br_tls[LX_TLSNUM]; + /* descriptors used by libc for TLS */ + ulong_t br_lx_fsbase; /* lx fsbase for 64-bit thread ptr */ + ulong_t br_ntv_fsbase; /* native fsbase 64-bit thread ptr */ + ulong_t br_lx_gsbase; /* lx user-land gsbase */ + ulong_t br_ntv_gsbase; /* native user-land gsbase */ + pid_t br_pid; /* converted pid for this thread */ + pid_t br_tgid; /* thread group ID for this thread */ + pid_t br_ppid; /* parent pid for this thread */ + id_t br_ptid; /* parent tid for this thread */ + void *br_clear_ctidp; /* clone thread id ptr */ + void *br_set_ctidp; /* clone thread id ptr */ + void *br_robust_list; /* robust lock list, if any */ + + /* first 4 syscall args - used for auditing */ + uintptr_t br_syscall_args[4]; + + /* + * The following struct is used by some system calls to pass extra + * flags into the kernel without impinging on the namespace for + * illumos. 
+ */ + void *br_scall_args; + int br_args_size; /* size in bytes of br_scall_args */ + + boolean_t br_waitid_emulate; + int br_waitid_flags; + + lx_ptrace_flags_t br_ptrace_flags; /* ptrace flags for this LWP */ + lx_ptrace_options_t br_ptrace_options; /* PTRACE_SETOPTIONS options */ + lx_ptrace_options_t br_ptrace_clone_option; /* current clone(2) type */ + + lx_ptrace_attach_t br_ptrace_attach; /* how did we get attached */ + lx_ptrace_accord_t *br_ptrace_accord; /* accord for this tracer LWP */ + lx_ptrace_accord_t *br_ptrace_tracer; /* accord tracing this LWP */ + list_node_t br_ptrace_linkage; /* linkage for lxpa_tracees list */ + + ushort_t br_ptrace_whystop; /* stop reason, 0 for no stop */ + ushort_t br_ptrace_whatstop; /* stop sub-reason */ + + int32_t br_ptrace_stopsig; /* stop signal, 0 for no signal */ + /* + * Track the last (native) signal number processed by a ptrace. + * This allows the tracee to properly handle ignored signals after + * the tracer has been notified and the tracee restarted. + */ + int32_t br_ptrace_donesig; + uintptr_t br_ptrace_stopucp; /* usermode ucontext_t pointer */ + + uint_t br_ptrace_event; + ulong_t br_ptrace_eventmsg; + + int br_syscall_num; /* current system call number */ + boolean_t br_syscall_restart; /* should restart on EINTR */ + + /* + * Store the LX_STACK_MODE for this LWP, and the current extent of the + * native (emulation) stack. This is similar, in principle, to the + * sigaltstack mechanism for signal handling. We also use this mode + * flag to determine how to process system calls from this LWP. + */ + lx_stack_mode_t br_stack_mode; + uintptr_t br_ntv_stack; + uintptr_t br_ntv_stack_current; + + /* + * If strict mode is enabled (via LX_STRICT in the environment), any + * call to lx_unsupported() will set this boolean to B_TRUE. This will + * cause us to drop SIGSYS on the LWP as it attempts to return to + * usermode. 
+ */ + boolean_t br_strict_failure; + + /* + * Some syscalls emulated in-kernel still call back out to the + * userspace emulation for certain functions. When that is the case, + * the syscall_return logic must be bypassed at the end of the + * in-kernel syscall code. The NORMALRETURN and JUSTRETURN constants + * are used to choose the behavior. + */ + char br_eosys; + + /* + * Hold a pre-allocated lx_pid structure to be used during lx_initlwp. + */ + struct lx_pid *br_lpid; + + /* + * ID of the cgroup this thread belongs to. + */ + uint_t br_cgroupid; + + /* + * When the zone is running under FSS (which is the common case) then + * we cannot change scheduling class, so we emulate that. By default + * Linux uses LX_SCHED_OTHER (which is 0) and that only supports a + * priority of 0, so no special initialization is needed. + */ + int br_schd_class; /* emulated scheduling class */ + int br_schd_pri; /* emulated scheduling priority */ + uint64_t br_schd_flags; /* emulated [sg]et_attr flags */ + uint64_t br_schd_runtime; /* emulated DEADLINE */ + uint64_t br_schd_deadline; /* emulated DEADLINE */ + uint64_t br_schd_period; /* emulated DEADLINE */ + + fwaiter_t br_fwaiter; /* futex upon which we're waiting */ + uint_t br_clone_grp_flags; /* pending clone group */ +}; + +/* + * Upper limit on br_args_size, low because this value can persist until + * overridden with another value, and the size is given from userland. + */ +#define LX_BR_ARGS_SIZE_MAX (1024) + +typedef enum lx_audit_enbl { + LXAE_DISABLED, + LXAE_ENABLED, + LXAE_LOCKED +} lx_audit_enbl_t; + +/* + * brand specific data + * + * We currently only support a single cgroup mount in an lx zone so we only have + * one ptr (lxzd_cgroup) but this could be changed to a list if cgroups is ever + * enhanced to support different mounts with different subsystem controllers. 
+ */ +typedef struct lx_zone_data { + kmutex_t lxzd_lock; /* protects all members */ + char lxzd_kernel_release[LX_KERN_RELEASE_MAX]; + char lxzd_kernel_version[LX_KERN_VERSION_MAX]; + ksocket_t lxzd_ioctl_sock; + char lxzd_bootid[UUID_PRINTABLE_STRING_LENGTH]; /* procfs boot_id */ + gid_t lxzd_ttygrp; /* tty gid for pty chown */ + vfs_t *lxzd_cgroup; /* cgroup for this zone */ + pid_t lxzd_lockd_pid; /* pid of NFS lockd */ + list_t *lxzd_vdisks; /* virtual disks (zvols) */ + dev_t lxzd_zfs_dev; /* major num for zfs */ + uint_t lxzd_aio_nr; /* see lx_aio.c */ + uint_t lxzd_pipe_max_sz; /* pipe-max-size sysctl val */ + boolean_t lxzd_swap_disabled; /* no fake swap in zone? */ + lx_audit_enbl_t lxzd_audit_enabled; /* auditing? */ + struct lx_audit_state *lxzd_audit_state; /* zone's audit state */ +} lx_zone_data_t; + +/* LWP br_lwp_flags values */ +#define BR_CPU_BOUND 0x0001 +#define BR_AIO_LWP 0x0002 /* aio kernel worker thread */ + +#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t)) +#define lwptolxlwp(l) ((struct lx_lwp_data *)lwptolwpbrand(l)) +#define ttolxproc(t) \ + (((t)->t_procp->p_brand == &lx_brand) ? \ + (struct lx_proc_data *)(t)->t_procp->p_brand_data : NULL) +#define ptolxproc(p) \ + (((p)->p_brand == &lx_brand) ? \ + (struct lx_proc_data *)(p)->p_brand_data : NULL) +#define ztolxzd(z) \ + (((z)->zone_brand == &lx_brand) ? \ + (lx_zone_data_t *)(z)->zone_brand_data : NULL) + +/* Macro for converting to system call arguments. 
*/ +#define LX_ARGS(scall) ((struct lx_##scall##_args *)\ + (ttolxlwp(curthread)->br_scall_args)) + +typedef enum lx_virt_disk_type { + LXVD_NONE, + LXVD_ZFS_DS, + LXVD_ZVOL +} lx_virt_disk_type_t; + +typedef struct lx_virt_disk { + list_node_t lxvd_link; + char lxvd_name[MAXNAMELEN]; + lx_virt_disk_type_t lxvd_type; + dev_t lxvd_emul_dev; + dev_t lxvd_real_dev; + uint64_t lxvd_volsize; + uint64_t lxvd_blksize; + char lxvd_real_name[MAXPATHLEN]; +} lx_virt_disk_t; + +/* + * Determine the upper bound on the system call number: + */ +#if defined(_LP64) +#define LX_MAX_SYSCALL(lwp) \ + ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ? \ + lx_nsysent64 : lx_nsysent32) +#else +#define LX_MAX_SYSCALL(lwp) lx_nsysent32 +#endif + +extern int lx_kern_release_cmp(zone_t *, const char *); + +extern void lx_lwp_set_native_stack_current(lx_lwp_data_t *, uintptr_t); +extern void lx_divert(klwp_t *, uintptr_t); +extern int lx_runexe(klwp_t *, void *); +extern void lx_switch_to_native(klwp_t *); + +extern int lx_syscall_enter(void); +extern void lx_syscall_return(klwp_t *, int, long); + +extern void lx_trace_sysenter(int, uintptr_t *); +extern void lx_trace_sysreturn(int, long); + +extern void lx_emulate_user(klwp_t *, int, uintptr_t *); + +extern void lx_audit_ld(); +extern void lx_audit_unld(); +extern void lx_audit_fini(zone_t *); +extern void lx_audit_syscall_exit(int, long); + +#if defined(_SYSCALL32_IMPL) +extern void lx_emulate_user32(klwp_t *, int, uintptr_t *); +#endif + +extern int lx_debug; +#define lx_print if (lx_debug) printf + +/* + * Flags for lx_lpid_lock() + */ +typedef enum { + LXP_PRLOCK = 0x1, /* acquire PR_LOCK as part of locking */ + LXP_ZOMBOK = 0x2 /* allow locking of zombies */ +} lx_pid_flag_t; + +extern void lx_pid_assign(kthread_t *, struct lx_pid *); +extern void lx_pid_reassign(kthread_t *); +extern void lx_pid_rele(pid_t, id_t); +extern pid_t lx_lpid_to_spair(pid_t, pid_t *, id_t *); +extern int lx_lpid_lock(pid_t, zone_t *, lx_pid_flag_t, proc_t **, + 
kthread_t **); +extern pid_t lx_lwp_ppid(klwp_t *, pid_t *, id_t *); +extern void lx_pid_init(void); +extern void lx_pid_fini(void); +extern void lx_acct_out(vnode_t *, int); + +extern uint_t lx_pipe_max_limit; +extern uint_t lx_pipe_max_default; + +/* + * In-Kernel Linux System Call Description. + */ +typedef struct lx_sysent { + char *sy_name; + long (*sy_callc)(); + char sy_flags; + char sy_narg; +} lx_sysent_t; + +#if defined(_LP64) +extern lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +extern int lx_nsysent64; +#endif +extern lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +extern int lx_nsysent32; + +#endif /* _KERNEL */ +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_BRAND_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_fcntl.h b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h new file mode 100644 index 0000000000..f82c6b867d --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h @@ -0,0 +1,161 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_LX_FCNTL_H +#define _SYS_LX_FCNTL_H + +#include <sys/vnode.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Lx open/fcntl flags + */ +#define LX_O_RDONLY 00 +#define LX_O_WRONLY 01 +#define LX_O_RDWR 02 +#define LX_O_ACCMODE (LX_O_RDONLY | LX_O_WRONLY | LX_O_RDWR) +#define LX_O_CREAT 0100 +#define LX_O_EXCL 0200 +#define LX_O_NOCTTY 0400 +#define LX_O_TRUNC 01000 +#define LX_O_APPEND 02000 +#define LX_O_NONBLOCK 04000 +#define LX_O_NDELAY LX_O_NONBLOCK +#define LX_O_SYNC 010000 +#define LX_O_FSYNC LX_O_SYNC +#define LX_O_ASYNC 020000 +#define LX_O_DIRECT 040000 +#define LX_O_LARGEFILE 0100000 +#define LX_O_DIRECTORY 0200000 +#define LX_O_NOFOLLOW 0400000 +#define LX_O_CLOEXEC 02000000 +#define LX_O_PATH 010000000 + +#define LX_F_DUPFD 0 +#define LX_F_GETFD 1 +#define LX_F_SETFD 2 +#define LX_F_GETFL 3 +#define LX_F_SETFL 4 +#define LX_F_GETLK 5 +#define LX_F_SETLK 6 +#define LX_F_SETLKW 7 +#define LX_F_SETOWN 8 +#define LX_F_GETOWN 9 +#define LX_F_SETSIG 10 +#define LX_F_GETSIG 11 + +#define LX_F_GETLK64 12 +#define LX_F_SETLK64 13 +#define LX_F_SETLKW64 14 + +#define LX_F_SETLEASE 1024 +#define LX_F_GETLEASE 1025 +#define LX_F_NOTIFY 1026 +#define LX_F_CANCELLK 1029 +#define LX_F_DUPFD_CLOEXEC 1030 +#define LX_F_SETPIPE_SZ 1031 +#define LX_F_GETPIPE_SZ 1032 + +#define LX_F_RDLCK 0 +#define LX_F_WRLCK 1 +#define LX_F_UNLCK 2 + +/* Test for emulated O_PATH setting in file_t flags */ +#define LX_IS_O_PATH(f) (((f)->f_flag & (FREAD|FWRITE)) == 0) + +extern int lx_vp_at(int, char *, vnode_t **, int); + +/* + * Lx flock codes. + */ +#define LX_NAME_MAX 255 +#define LX_LOCK_SH 1 /* shared */ +#define LX_LOCK_EX 2 /* exclusive */ +#define LX_LOCK_NB 4 /* non-blocking */ +#define LX_LOCK_UN 8 /* unlock */ + +/* + * On Linux the constants AT_REMOVEDIR and AT_EACCESS have the same value. + * AT_REMOVEDIR is used only by unlinkat and AT_EACCESS is used only by + * faccessat. 
+ */ +#define LX_AT_FDCWD (-100) +#define LX_AT_SYMLINK_NOFOLLOW 0x100 +#define LX_AT_REMOVEDIR 0x200 +#define LX_AT_EACCESS 0x200 +#define LX_AT_SYMLINK_FOLLOW 0x400 +#define LX_AT_NO_AUTOMOUNT 0x800 +#define LX_AT_EMPTY_PATH 0x1000 + +typedef struct lx_flock { + short l_type; + short l_whence; + long l_start; + long l_len; + int l_pid; +} lx_flock_t; + +typedef struct lx_flock64 { + short l_type; + short l_whence; + long long l_start; + long long l_len; + int l_pid; +} lx_flock64_t; + +#if defined(_KERNEL) + +/* + * 64-bit kernel view of 32-bit usermode structs. + */ +#pragma pack(4) +typedef struct lx_flock32 { + int16_t l_type; + int16_t l_whence; + int32_t l_start; + int32_t l_len; + int32_t l_pid; +} lx_flock32_t; + +typedef struct lx_flock64_32 { + int16_t l_type; + int16_t l_whence; + int64_t l_start; + int64_t l_len; + int32_t l_pid; +} lx_flock64_32_t; +#pragma pack() + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_FCNTL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_futex.h b/usr/src/uts/common/brand/lx/sys/lx_futex.h new file mode 100644 index 0000000000..7eba389218 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_futex.h @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2017, Joyent, Inc. + */ + +#ifndef _SYS_LX_FUTEX_H +#define _SYS_LX_FUTEX_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define FUTEX_WAIT 0 +#define FUTEX_WAKE 1 +#define FUTEX_FD 2 +#define FUTEX_REQUEUE 3 +#define FUTEX_CMP_REQUEUE 4 +#define FUTEX_WAKE_OP 5 +#define FUTEX_LOCK_PI 6 +#define FUTEX_UNLOCK_PI 7 +#define FUTEX_TRYLOCK_PI 8 +#define FUTEX_WAIT_BITSET 9 +#define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_CMP_REQUEUE_PI 12 +#define FUTEX_MAX_CMD FUTEX_CMP_REQUEUE_PI + +/* + * Flags that can be OR'd into a futex operation. + */ +#define FUTEX_CMD_MASK 0x007f +#define FUTEX_PRIVATE_FLAG 0x0080 +#define FUTEX_CLOCK_REALTIME 0x0100 + +#define FUTEX_BITSET_MATCH_ANY 0xffffffff +/* + * FUTEX_WAKE_OP operations + */ +#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ +#define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */ +#define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */ +#define FUTEX_OP_ANDN 3 /* *(int *)UADDR2 &= ~OPARG; */ +#define FUTEX_OP_XOR 4 /* *(int *)UADDR2 ^= OPARG; */ + +/* + * FUTEX_WAKE_OP comparison operations + */ +#define FUTEX_OP_CMP_EQ 0 /* if (oldval == CMPARG) wake */ +#define FUTEX_OP_CMP_NE 1 /* if (oldval != CMPARG) wake */ +#define FUTEX_OP_CMP_LT 2 /* if (oldval < CMPARG) wake */ +#define FUTEX_OP_CMP_LE 3 /* if (oldval <= CMPARG) wake */ +#define FUTEX_OP_CMP_GT 4 /* if (oldval > CMPARG) wake */ +#define FUTEX_OP_CMP_GE 5 /* if (oldval >= CMPARG) wake */ + +/* + * The encoding of the FUTEX_WAKE_OP operation in 32 bits: + * + * +--+-- - --+-- - --+-- - --+-- - --+ + * |S |OP |CMP |OPARG |CMPARG | + * +--+-- - --+-- - --+-- - --+-- - 
--+ + * |31|30 - 28|27 - 24|23 - 12|11 - 0| + * + * The S bit denotes that the OPARG should be (1 << OPARG) instead of OPARG. + * (Yes, this whole thing is entirely absurd -- see the block comment in + * lx_futex.c for an explanation of this nonsense.) Macros to extract the + * various components from the operation, given the above encoding: + */ +#define FUTEX_OP_OP(x) (((x) >> 28) & 7) +#define FUTEX_OP_CMP(x) (((x) >> 24) & 15) +#define FUTEX_OP_OPARG(x) (((x) >> 31) ? (1 << (((x) << 8) >> 20)) : \ + ((((x) << 8) >> 20))) +#define FUTEX_OP_CMPARG(x) (((x) << 20) >> 20) + +#ifdef _KERNEL + +/* + * This structure is used to track all the threads currently waiting on a + * futex. There is one fwaiter_t for each blocked thread. We store all + * fwaiter_t's in a hash structure, indexed by the memid_t of the integer + * containing the futex's value. + * + * At the moment, all fwaiter_t's for a single futex are simply dumped into + * the hash bucket. If futex contention ever becomes a hot path, we can + * chain a single futex's waiters together. + */ +typedef struct fwaiter { + memid_t fw_memid; /* memid of the user-space futex */ + kcondvar_t fw_cv; /* cond var */ + struct fwaiter *fw_next; /* hash queue */ + struct fwaiter *fw_prev; /* hash queue */ + uint32_t fw_bits; /* bits waiting on */ + pid_t fw_tid; /* for PI futexes; the waiter's tid */ + int fw_opri; /* for PI futexes; original pri. */ + boolean_t fw_pri_up; /* for PI futexes; pri. 
increased */ + volatile int fw_woken; +} fwaiter_t; + +#define FUTEX_WAITERS 0x80000000 +#define FUTEX_OWNER_DIED 0x40000000 +#define FUTEX_TID_MASK 0x3fffffff + +#define FUTEX_ROBUST_LOCK_PI 1 +#define FUTEX_ROBUST_LIST_LIMIT 2048 + +extern long lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, + uintptr_t addr2, int val2); +extern void lx_futex_init(void); +extern int lx_futex_fini(void); +extern long lx_set_robust_list(void *listp, size_t len); +extern long lx_get_robust_list(pid_t pid, void **listp, size_t *lenp); +extern void lx_futex_robust_exit(uintptr_t addr, uint32_t tid); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_FUTEX_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_impl.h b/usr/src/uts/common/brand/lx/sys/lx_impl.h new file mode 100644 index 0000000000..03b9d43038 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_impl.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _LX_IMPL_H +#define _LX_IMPL_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (lx_systrace_f)(ulong_t, ulong_t, ulong_t, ulong_t, ulong_t, + ulong_t, ulong_t); + + +extern lx_systrace_f *lx_systrace_entry_ptr; +extern lx_systrace_f *lx_systrace_return_ptr; + +extern void lx_brand_systrace_enable(void); +extern void lx_brand_systrace_disable(void); + +extern void lx_unsupported(char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_IMPL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_ldt.h b/usr/src/uts/common/brand/lx/sys/lx_ldt.h new file mode 100644 index 0000000000..08d4d78efb --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_ldt.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2018 Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_LINUX_LDT_H +#define _SYS_LINUX_LDT_H + +#include <sys/segments.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct ldt_info { + uint_t entry_number; + uint_t base_addr; + uint_t limit; + uint_t seg_32bit:1, + contents:2, + read_exec_only:1, + limit_in_pages:1, + seg_not_present:1, + useable:1; +}; + +#define LDT_INFO_EMPTY(info) \ + ((info)->base_addr == 0 && (info)->limit == 0 && \ + (info)->contents == 0 && (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && (info)->useable == 0) + +#if defined(__amd64) +#define SETMODE(desc) (desc)->usd_long = SDP_SHORT; +#else +#define SETMODE(desc) +#endif + +#define LDT_INFO_TO_DESC(info, desc) { \ + USEGD_SETBASE(desc, (info)->base_addr); \ + USEGD_SETLIMIT(desc, (info)->limit); \ + (desc)->usd_type = ((info)->contents << 2) | \ + ((info)->read_exec_only ^ 1) << 1 | SDT_S | SDT_A; \ + (desc)->usd_dpl = SEL_UPL; \ + (desc)->usd_p = (info)->seg_not_present ^ 1; \ + (desc)->usd_def32 = (info)->seg_32bit; \ + (desc)->usd_gran = (info)->limit_in_pages; \ + (desc)->usd_avl = (info)->useable; \ + SETMODE(desc); \ +} + +#define DESC_TO_LDT_INFO(desc, info) { \ + bzero((info), sizeof (*(info))); \ + (info)->base_addr = USEGD_GETBASE(desc); \ + (info)->limit = USEGD_GETLIMIT(desc); \ + (info)->seg_not_present = (desc)->usd_p ^ 1; \ + (info)->contents = ((desc)->usd_type >> 2) & 3; \ + (info)->read_exec_only = (((desc)->usd_type >> 1) & 1) ^ 1; \ + (info)->seg_32bit = (desc)->usd_def32; \ + (info)->limit_in_pages = (desc)->usd_gran; \ + (info)->useable = (desc)->usd_avl; \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_LDT_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h new file mode 100644 index 0000000000..e81a1597f3 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h @@ -0,0 +1,135 @@ +/* + * This file and its contents are supplied under the terms of the 
+ * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _SYS__LX_MISC_H +#define _SYS__LX_MISC_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <inet/ip.h> +#include <inet/ip6.h> +#include <sys/siginfo.h> +#include <sys/lx_brand.h> + +#ifdef _KERNEL + +extern void lx_setrval(klwp_t *, int, int); +extern void lx_exec(); +extern void lx_exitlwp(klwp_t *); +extern void lx_freelwp(klwp_t *); +extern void *lx_lwpdata_alloc(proc_t *); +extern void lx_lwpdata_free(void *); +extern void lx_initlwp(klwp_t *, void *); +extern void lx_initlwp_post(klwp_t *); +extern void lx_forklwp(klwp_t *, klwp_t *); + +extern void lx_affinity_forklwp(klwp_t *, klwp_t *); + +extern void lx_set_gdt(int, user_desc_t *); +extern void lx_clear_gdt(int); + +extern longlong_t lx_nosys(); + +extern void lx_clone_grp_create(uint_t); +extern void lx_clone_grp_enter(uint_t, proc_t *, proc_t *); +extern void lx_clone_grp_exit(proc_t *, boolean_t); +extern boolean_t lx_clone_grp_member(lx_proc_data_t *, uint_t); +extern int lx_clone_grp_walk(lx_proc_data_t *, uint_t, + int (*)(proc_t *, void *), void *); + +extern greg_t lx_fixsegreg(greg_t, model_t); +extern uintptr_t lx_fsbase(klwp_t *, uintptr_t); +extern void lx_exit_with_sig(proc_t *, sigqueue_t *); +extern boolean_t lx_wait_filter(proc_t *, proc_t *); +extern void lx_sigfd_translate(k_siginfo_t *); +extern int stol_ksiginfo_copyout(k_siginfo_t *, void *); + +extern int ltos_at_flag(int, int, boolean_t); +#if defined(_SYSCALL32_IMPL) +extern int stol_ksiginfo32_copyout(k_siginfo_t *, void *); +#endif + +typedef enum lx_regs_location { + LX_REG_LOC_UNAVAIL, + LX_REG_LOC_LWP, + LX_REG_LOC_UCP +} 
lx_regs_location_t; + +extern lx_regs_location_t lx_regs_location(lx_lwp_data_t *, void **, boolean_t); + + +typedef enum lx_if_action { + LX_IF_FROMNATIVE, + LX_IF_TONATIVE +} lx_if_action_t; + +/* Linux ARP protocol hardware identifiers */ +#define LX_ARPHRD_ETHER 1 /* Ethernet */ +#define LX_ARPHRD_LOOPBACK 772 /* Loopback */ +#define LX_ARPHRD_VOID 0xffff /* Unknown */ + +/* IPv6 address scope values used in /proc/net/if_inet6 */ +#define LX_IPV6_ADDR_LOOPBACK 0x0010U +#define LX_IPV6_ADDR_LINKLOCAL 0x0020U +#define LX_IPV6_ADDR_SITELOCAL 0x0040U +#define LX_IPV6_ADDR_COMPATv4 0x0080U + +/* Maximum length of a thread name, including the NUL terminator */ +#define LX_PR_SET_NAME_NAMELEN 16 + +extern void lx_ifname_convert(char *, lx_if_action_t); +extern void lx_ifflags_convert(uint64_t *, lx_if_action_t); +extern unsigned int lx_ipv6_scope_convert(const in6_addr_t *); +extern void lx_stol_hwaddr(const struct sockaddr_dl *, struct sockaddr *, + int *); + +extern boolean_t lx_ptrace_stop(ushort_t); +extern void lx_stop_notify(proc_t *, klwp_t *, ushort_t, ushort_t); +extern void lx_ptrace_init(void); +extern void lx_ptrace_fini(void); +extern int lx_waitid_helper(idtype_t, id_t, k_siginfo_t *, int, boolean_t *, + int *); +extern void lx_ptrace_exit(proc_t *, klwp_t *); +extern void lx_ptrace_inherit_tracer(lx_lwp_data_t *, lx_lwp_data_t *); +extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t, uintptr_t); +extern int lx_ptrace_set_clone_inherit(int, boolean_t); +extern int lx_sigcld_repost(proc_t *, sigqueue_t *); +extern int lx_ptrace_issig_stop(proc_t *, klwp_t *); +extern boolean_t lx_ptrace_sig_ignorable(proc_t *, klwp_t *, int); + +extern int lx_helper_clone(int64_t *, int, void *, void *, void *); +extern int lx_helper_setgroups(int, gid_t *); +extern int lx_helper_rt_sigqueueinfo(pid_t, int, siginfo_t *); +extern int lx_helper_rt_tgsigqueueinfo(pid_t, pid_t, int, siginfo_t *); + +extern boolean_t lx_vsyscall_iscall(klwp_t *, uintptr_t, int *); 
+extern void lx_vsyscall_enter(proc_t *, klwp_t *, int); + +extern void lx_check_strict_failure(lx_lwp_data_t *); + +extern boolean_t lx_is_eventfd(file_t *); + +extern int lx_read_common(file_t *, uio_t *, size_t *, boolean_t); +extern int lx_write_common(file_t *, uio_t *, size_t *, boolean_t); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS__LX_MISC_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_ptm.h b/usr/src/uts/common/brand/lx/sys/lx_ptm.h new file mode 100644 index 0000000000..74bbc939a3 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_ptm.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_PTM_LINUX_H +#define _SYS_PTM_LINUX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LX_PTM_DRV "lx_ptm" +#define LX_PTM_MINOR_NODE "lx_ptmajor" + +#define LX_PTM_DEV_TO_PTS(dev) (getminor(dev) - 1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PTM_LINUX_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_siginfo.h b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h new file mode 100644 index 0000000000..9f606b614f --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h @@ -0,0 +1,190 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _LX_SIGINFO_H +#define _LX_SIGINFO_H + +#include <sys/lx_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lx_siginfo_t lsi_code values + * + * LX_SI_ASYNCNL: Sent by asynch name lookup completion + * LX_SI_DETHREAD: Sent by execve() killing subsidiary threads + * LX_SI_SIGIO: Sent by queued SIGIO + * LX_SI_ASYNCIO: Sent by asynchronous I/O completion + * LX_SI_MESGQ: Sent by real time message queue state change + * LX_SI_TIMER: Sent by timer expiration + * LX_SI_QUEUE: Sent by sigqueue + * LX_SI_USER: Sent by kill, sigsend, raise, etc. + * LX_SI_KERNEL: Sent by kernel + * LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to + * illumos errors, if there is no translation available, this value + * should be used. This value should have no meaning as an si_code in + * illumos or Linux. + * + * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by + * BrandZ. + */ +#define LX_SI_CODE_NOT_EXIST (-61) +#define LX_SI_ASYNCNL (-60) +#define LX_SI_DETHREAD (-7) +#define LX_SI_TKILL (-6) +#define LX_SI_SIGIO (-5) +#define LX_SI_ASYNCIO (-4) +#define LX_SI_MESGQ (-3) +#define LX_SI_TIMER (-2) +#define LX_SI_QUEUE (-1) +#define LX_SI_USER (0) +#define LX_SI_KERNEL (0x80) + +#define LX_SI_MAX_SIZE 128 +#define LX_SI_PAD_SIZE_32 ((LX_SI_MAX_SIZE / sizeof (int)) - 3) +#define LX_SI_PAD_SIZE_64 ((LX_SI_MAX_SIZE / sizeof (int)) - 4) + +#if defined(_LP64) +/* + * Because of the odd number (3) of ints before the union, we need to account + * for the smaller padding needed on x64 due to the union being offset to an 8 + * byte boundary. 
+ */ +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_64 +#else +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_32 +#endif + +typedef struct lx_siginfo { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE]; + + struct { + pid_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid_t _pid; + lx_uid16_t _uid; + union sigval _sigval; + } _rt; + + struct { + pid_t _pid; + lx_uid16_t _uid; + int _status; + clock_t _utime; + clock_t _stime; + } _sigchld; + + struct { + void *_addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo_t; + +#if defined(_KERNEL) && defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit "lx_siginfo_t" object. + */ +#pragma pack(4) +typedef struct lx_siginfo32 { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE_32]; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + union sigval32 _sigval; + } _rt; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + int _status; + clock32_t _utime; + clock32_t _stime; + } _sigchld; + + struct { + caddr32_t _addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo32_t; +#pragma pack() +#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */ + +#define lsi_pid _sifields._kill._pid +#define lsi_uid _sifields._kill._uid +#define lsi_status _sifields._sigchld._status +#define lsi_utime _sifields._sigchld._utime +#define lsi_stime _sifields._sigchld._stime +#define lsi_value _sifields._rt._sigval +#define lsi_int _sifields._rt._sigval.sivalx_int +#define lsi_ptr _sifields._rt._sigval.sivalx_ptr +#define lsi_addr _sifields._sigfault._addr +#define lsi_band _sifields._sigpoll._band +#define lsi_fd _sifields._sigpoll._fd + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SIGINFO_H */ diff --git 
a/usr/src/uts/common/brand/lx/sys/lx_signal.h b/usr/src/uts/common/brand/lx/sys/lx_signal.h new file mode 100644 index 0000000000..552c36238b --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_signal.h @@ -0,0 +1,32 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_SIGNAL_H +#define _LX_SIGNAL_H + +#include <lx_signum.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern void lx_ltos_sigset(lx_sigset_t *, k_sigset_t *); +extern void lx_stol_sigset(k_sigset_t *, lx_sigset_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SIGNAL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_socket.h b/usr/src/uts/common/brand/lx/sys/lx_socket.h new file mode 100644 index 0000000000..99489e4d13 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_socket.h @@ -0,0 +1,444 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. + */ + +#ifndef _SYS_LX_SOCKET_H +#define _SYS_LX_SOCKET_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Linux address family definitions + * Some of these are not supported + */ +#define LX_AF_UNSPEC 0 /* Unspecified */ +#define LX_AF_UNIX 1 /* local file/pipe name */ +#define LX_AF_INET 2 /* IP protocol family */ +#define LX_AF_AX25 3 /* Amateur Radio AX.25 */ +#define LX_AF_IPX 4 /* Novell Internet Protocol */ +#define LX_AF_APPLETALK 5 /* Appletalk */ +#define LX_AF_NETROM 6 /* Amateur radio */ +#define LX_AF_BRIDGE 7 /* Multiprotocol bridge */ +#define LX_AF_ATMPVC 8 /* ATM PVCs */ +#define LX_AF_X25 9 /* X.25 */ +#define LX_AF_INET6 10 /* IPV 6 */ +#define LX_AF_ROSE 11 /* Amateur Radio X.25 */ +#define LX_AF_DECNET 12 /* DECnet */ +#define LX_AF_NETBEUI 13 /* 802.2LLC */ +#define LX_AF_SECURITY 14 /* Security callback */ +#define LX_AF_KEY 15 /* key management */ +#define LX_AF_ROUTE 16 /* Alias to emulate 4.4BSD */ +#define LX_AF_NETLINK LX_AF_ROUTE +#define LX_AF_PACKET 17 /* Packet family */ +#define LX_AF_ASH 18 /* Ash ? 
*/ +#define LX_AF_ECONET 19 /* Acorn Econet */ +#define LX_AF_ATMSVC 20 /* ATM SVCs */ +#define LX_AF_SNA 22 /* Linux SNA */ +#define LX_AF_IRDA 23 /* IRDA sockets */ +#define LX_AF_PPPOX 24 /* PPPoX sockets */ +#define LX_AF_WANPIPE 25 /* Wanpipe API sockets */ +#define LX_AF_LLC 26 +/* gap in Linux defines for 27 and 28 */ +#define LX_AF_CAN 29 +#define LX_AF_TIPC 30 +#define LX_AF_BLUETOOTH 31 /* Bluetooth sockets */ +#define LX_AF_IUCV 32 +#define LX_AF_RXRPC 33 + +/* limit of AF mappings */ +#define LX_AF_MAX LX_AF_RXRPC + +#define AF_NOTSUPPORTED -1 +#define AF_INVAL -2 + +/* + * Options for use with [gs]etsockopt at the SOL_SOCKET level. + */ +#define LX_SOL_SOCKET 1 + +#define LX_SCM_RIGHTS 1 +#define LX_SCM_CRED 2 + +#define LX_SO_DEBUG 1 +#define LX_SO_REUSEADDR 2 +#define LX_SO_TYPE 3 +#define LX_SO_ERROR 4 +#define LX_SO_DONTROUTE 5 +#define LX_SO_BROADCAST 6 +#define LX_SO_SNDBUF 7 +#define LX_SO_RCVBUF 8 +#define LX_SO_KEEPALIVE 9 +#define LX_SO_OOBINLINE 10 +#define LX_SO_NO_CHECK 11 +#define LX_SO_PRIORITY 12 +#define LX_SO_LINGER 13 +#define LX_SO_BSDCOMPAT 14 +#define LX_SO_REUSEPORT 15 +/* + * For Linux see unix(7) man page SO_PASSCRED description. For Illumos see + * socket.h(3HEAD) man page SO_RECVUCRED description. 
+ */ +#define LX_SO_PASSCRED 16 +#define LX_SO_PEERCRED 17 +#define LX_SO_RCVLOWAT 18 +#define LX_SO_SNDLOWAT 19 +#define LX_SO_RCVTIMEO 20 +#define LX_SO_SNDTIMEO 21 +/* Security levels - as per NRL IPv6 - don't actually do anything */ +#define LX_SO_SECURITY_AUTHENTICATION 22 +#define LX_SO_SECURITY_ENCRYPTION_TRANSPORT 23 +#define LX_SO_SECURITY_ENCRYPTION_NETWORK 24 +#define LX_SO_BINDTODEVICE 25 +/* Socket filtering */ +#define LX_SO_ATTACH_FILTER 26 +#define LX_SO_DETACH_FILTER 27 +#define LX_SO_PEERNAME 28 +#define LX_SO_TIMESTAMP 29 +#define LX_SCM_TIMESTAMP LX_SO_TIMESTAMP +#define LX_SO_ACCEPTCONN 30 + +#define LX_SO_PEERSEC 31 +#define LX_SO_SNDBUFFORCE 32 +#define LX_SO_RCVBUFFORCE 33 +#define LX_SO_PASSSEC 34 +#define LX_SO_TIMESTAMPNS 35 +#define LX_SCM_TIMESTAMPNS LX_SO_TIMESTAMPNS +#define LX_SO_MARK 36 +#define LX_SO_TIMESTAMPING 37 +#define LX_SCM_TIMESTAMPING LX_SO_TIMESTAMPING +#define LX_SO_PROTOCOL 38 +#define LX_SO_DOMAIN 39 +#define LX_SO_RXQ_OVFL 40 +#define LX_SO_WIFI_STATUS 41 +#define LX_SCM_WIFI_STATUS LX_SO_WIFI_STATUS +#define LX_SO_PEEK_OFF 42 +#define LX_SO_NOFCS 43 +#define LX_SO_LOCK_FILTER 44 +#define LX_SO_SELECT_ERR_QUEUE 45 +#define LX_SO_BUSY_POLL 46 +#define LX_SO_MAX_PACING_RATE 47 +#define LX_SO_BPF_EXTENSIONS 48 + +/* + * Options for use with [gs]etsockopt at the RAW level. + * IPPROTO_RAW + */ +#define LX_ICMP_FILTER 1 + +/* + * Options for use with [gs]etsockopt at the PACKET level. + * SOL_PACKET + */ +#define LX_SOL_PACKET 263 + +#define LX_PACKET_ADD_MEMBERSHIP 1 +#define LX_PACKET_DROP_MEMBERSHIP 2 +#define LX_PACKET_RECV_OUTPUT 3 +#define LX_PACKET_RX_RING 5 +#define LX_PACKET_STATISTICS 6 + +/* + * Options for use with [gs]etsockopt at the NETLINK level. 
+ * SOL_NETLINK + */ +#define LX_SOL_NETLINK 270 + +/* + * Linux socket type definitions + */ +#define LX_SOCK_STREAM 1 /* Connection-based byte streams */ +#define LX_SOCK_DGRAM 2 /* Connectionless, datagram */ +#define LX_SOCK_RAW 3 /* Raw protocol interface */ +#define LX_SOCK_RDM 4 /* Reliably-delivered message */ +#define LX_SOCK_SEQPACKET 5 /* Sequenced packet stream */ +#define LX_SOCK_PACKET 10 /* Linux specific */ +#define LX_SOCK_MAX 11 + +/* + * The Linux socket type can be or-ed with other flags (e.g. SOCK_CLOEXEC). + */ +#define LX_SOCK_TYPE_MASK 0xf + +/* + * Linux flags for socket, socketpair and accept4. These are or-ed into the + * socket type value. In the Linux net.h header these come from fcntl.h (note + * that they are in octal in the Linux header). + */ +#define LX_SOCK_CLOEXEC 0x80000 +#define LX_SOCK_NONBLOCK 0x800 + +#define SOCK_NOTSUPPORTED -1 +#define SOCK_INVAL -2 + +/* + * PF_PACKET protocol definitions. + */ +#define LX_ETH_P_802_3 0x0001 +#define LX_ETH_P_ALL 0x0003 +#define LX_ETH_P_802_2 0x0004 +#define LX_ETH_P_IP 0x0800 +#define LX_ETH_P_ARP 0x0806 +#define LX_ETH_P_IPV6 0x86DD + +/* + * IP Protocol levels. Some of these match the Illumos IPPROTO_* values. + */ +#define LX_IPPROTO_IP 0 +#define LX_IPPROTO_ICMP 1 +#define LX_IPPROTO_IGMP 2 +#define LX_IPPROTO_TCP 6 +#define LX_IPPROTO_UDP 17 +#define LX_IPPROTO_IPV6 41 +#define LX_IPPROTO_ICMPV6 58 +#define LX_IPPROTO_RAW 255 + +/* + * Options for use with [gs]etsockopt at the IP level. 
+ * IPPROTO_IP + */ +#define LX_IP_TOS 1 +#define LX_IP_TTL 2 +#define LX_IP_HDRINCL 3 +#define LX_IP_OPTIONS 4 +#define LX_IP_ROUTER_ALERT 5 +#define LX_IP_RECVOPTS 6 +#define LX_IP_RETOPTS 7 +#define LX_IP_PKTINFO 8 +#define LX_IP_PKTOPTIONS 9 +#define LX_IP_MTU_DISCOVER 10 +#define LX_IP_RECVERR 11 +#define LX_IP_RECVTTL 12 +#define LX_IP_RECVTOS 13 +#define LX_IP_MTU 14 +#define LX_IP_FREEBIND 15 +#define LX_IP_IPSEC_POLICY 16 +#define LX_IP_XFRM_POLICY 17 +#define LX_IP_PASSSEC 18 +#define LX_IP_TRANSPARENT 19 +#define LX_IP_ORIGDSTADDR 20 +#define LX_IP_MINTTL 21 +#define LX_IP_NODEFRAG 22 +/* Linux apparently leaves a gap here */ +#define LX_IP_MULTICAST_IF 32 +#define LX_IP_MULTICAST_TTL 33 +#define LX_IP_MULTICAST_LOOP 34 +#define LX_IP_ADD_MEMBERSHIP 35 +#define LX_IP_DROP_MEMBERSHIP 36 +#define LX_IP_UNBLOCK_SOURC 37 +#define LX_IP_BLOCK_SOURCE 38 +#define LX_IP_ADD_SOURCE_MEMBERSHIP 39 +#define LX_IP_DROP_SOURCE_MEMBERSHIP 40 +#define LX_IP_MSFILTER 41 +#define LX_MCAST_JOIN_GROUP 42 +#define LX_MCAST_BLOCK_SOURCE 43 +#define LX_MCAST_UNBLOCK_SOURCE 44 +#define LX_MCAST_LEAVE_GROUP 45 +#define LX_MCAST_JOIN_SOURCE_GROUP 46 +#define LX_MCAST_LEAVE_SOURCE_GROUP 47 +#define LX_MCAST_MSFILTER 48 +#define LX_IP_MULTICAST_ALL 49 +#define LX_IP_UNICAST_IF 50 + +/* + * LX_IP_MTU_DISCOVER values + */ +#define LX_IP_PMTUDISC_DONT 0 +#define LX_IP_PMTUDISC_WANT 1 +#define LX_IP_PMTUDISC_DO 2 +#define LX_IP_PMTUDISC_PROBE 3 +#define LX_IP_PMTUDISC_INTERFACE 4 +#define LX_IP_PMTUDISC_OMIT 5 + +/* + * Options for use with [gs]etsockopt at the IPv6 level. 
+ * IPPROTO_IPV6 + */ + +#define LX_IPV6_ADDRFORM 1 +#define LX_IPV6_2292PKTINFO 2 +#define LX_IPV6_2292HOPOPTS 3 +#define LX_IPV6_2292DSTOPTS 4 +#define LX_IPV6_2292RTHDR 5 +#define LX_IPV6_2292PKTOPTIONS 6 +#define LX_IPV6_CHECKSUM 7 +#define LX_IPV6_2292HOPLIMIT 8 +#define LX_IPV6_NEXTHOP 9 +#define LX_IPV6_AUTHHDR 10 +#define LX_IPV6_UNICAST_HOPS 16 +#define LX_IPV6_MULTICAST_IF 17 +#define LX_IPV6_MULTICAST_HOPS 18 +#define LX_IPV6_MULTICAST_LOOP 19 +#define LX_IPV6_JOIN_GROUP 20 +#define LX_IPV6_LEAVE_GROUP 21 +#define LX_IPV6_ROUTER_ALERT 22 +#define LX_IPV6_MTU_DISCOVER 23 +#define LX_IPV6_MTU 24 +#define LX_IPV6_RECVERR 25 +#define LX_IPV6_V6ONLY 26 +#define LX_IPV6_JOIN_ANYCAST 27 +#define LX_IPV6_LEAVE_ANYCAST 28 +#define LX_IPV6_IPSEC_POLICY 34 +#define LX_IPV6_XFRM_POLICY 35 + +#define LX_IPV6_RECVPKTINFO 49 +#define LX_IPV6_PKTINFO 50 +#define LX_IPV6_RECVHOPLIMIT 51 +#define LX_IPV6_HOPLIMIT 52 +#define LX_IPV6_RECVHOPOPTS 53 +#define LX_IPV6_HOPOPTS 54 +#define LX_IPV6_RTHDRDSTOPTS 55 +#define LX_IPV6_RECVRTHDR 56 +#define LX_IPV6_RTHDR 57 +#define LX_IPV6_RECVDSTOPTS 58 +#define LX_IPV6_DSTOPTS 59 +#define LX_IPV6_RECVTCLASS 66 +#define LX_IPV6_TCLASS 67 + +/* + * Options for use with [gs]etsockopt at the IP level. + * IPPROTO_ICMPV6 + */ + +#define LX_ICMP6_FILTER 1 + +/* + * Options for use with [gs]etsockopt at the TCP level. 
+ * IPPROTO_TCP + */ +#define LX_TCP_NODELAY 1 /* Don't delay send to coalesce packets */ +#define LX_TCP_MAXSEG 2 /* Set maximum segment size */ +#define LX_TCP_CORK 3 /* Control sending of partial frames */ +#define LX_TCP_KEEPIDLE 4 /* Start keepalives after this period */ +#define LX_TCP_KEEPINTVL 5 /* Interval between keepalives */ +#define LX_TCP_KEEPCNT 6 /* Number of keepalives before death */ +#define LX_TCP_SYNCNT 7 /* Number of SYN retransmits */ +#define LX_TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */ +#define LX_TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ +#define LX_TCP_WINDOW_CLAMP 10 /* Bound advertised window */ +#define LX_TCP_INFO 11 /* Information about this connection. */ +#define LX_TCP_QUICKACK 12 /* Block/reenable quick ACKs. */ +#define LX_TCP_CONGESTION 13 /* Congestion control algorithm */ +#define LX_TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ +#define LX_TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts on thin streams */ +#define LX_TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ +#define LX_TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ +#define LX_TCP_REPAIR 19 /* TCP socket under repair */ +#define LX_TCP_REPAIR_QUEUE 20 +#define LX_TCP_QUEUE_SEQ 21 +#define LX_TCP_REPAIR_OPTIONS 22 +#define LX_TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ +#define LX_TCP_TIMESTAMP 24 +#define LX_TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes */ + +/* + * Options for use with [gs]etsockopt at the IGMP level.
+ * IPPROTO_IGMP + */ +#define LX_IGMP_MINLEN 8 +#define LX_IGMP_MAX_HOST_REPORT_DELAY 10 +#define LX_IGMP_HOST_MEMBERSHIP_QUERY 0x11 +#define LX_IGMP_HOST_MEMBERSHIP_REPORT 0x12 +#define LX_IGMP_DVMRP 0x13 +#define LX_IGMP_PIM 0x14 +#define LX_IGMP_TRACE 0x15 +#define LX_IGMP_HOST_NEW_MEMBERSHIP_REPORT 0x16 +#define LX_IGMP_HOST_LEAVE_MESSAGE 0x17 +#define LX_IGMP_MTRACE_RESP 0x1e +#define LX_IGMP_MTRACE 0x1f + +/* + * Linux socket flags for use with recv(2)/send(2)/recvmsg(2)/sendmsg(2) + */ +#define LX_MSG_OOB 0x1 +#define LX_MSG_PEEK 0x2 +#define LX_MSG_DONTROUTE 0x4 +#define LX_MSG_CTRUNC 0x8 +#define LX_MSG_PROXY 0x10 +#define LX_MSG_TRUNC 0x20 +#define LX_MSG_DONTWAIT 0x40 +#define LX_MSG_EOR 0x80 +#define LX_MSG_WAITALL 0x100 +#define LX_MSG_FIN 0x200 +#define LX_MSG_SYN 0x400 +#define LX_MSG_CONFIRM 0x800 +#define LX_MSG_RST 0x1000 +#define LX_MSG_ERRQUEUE 0x2000 +#define LX_MSG_NOSIGNAL 0x4000 +#define LX_MSG_MORE 0x8000 +#define LX_MSG_WAITFORONE 0x10000 +#define LX_MSG_FASTOPEN 0x20000000 +#define LX_MSG_CMSG_CLOEXEC 0x40000000 + +typedef struct lx_msghdr { + void *msg_name; /* optional address */ + socklen_t msg_namelen; /* size of address */ + struct iovec *msg_iov; /* scatter/gather array */ + size_t msg_iovlen; /* # elements in msg_iov */ + void *msg_control; /* ancillary data */ + size_t msg_controllen; /* ancillary data buffer len */ + int msg_flags; /* flags on received message */ +} lx_msghdr_t; + +typedef struct lx_mmsghdr { + lx_msghdr_t msg_hdr; /* message header */ + unsigned int msg_len; /* no. 
of bytes transmitted */ +} lx_mmsghdr_t; + +#if defined(_LP64) + +typedef struct lx_msghdr32 { + caddr32_t msg_name; /* optional address */ + uint32_t msg_namelen; /* size of address */ + caddr32_t msg_iov; /* scatter/gather array */ + int32_t msg_iovlen; /* # elements in msg_iov */ + caddr32_t msg_control; /* ancillary data */ + uint32_t msg_controllen; /* ancillary data buffer len */ + int32_t msg_flags; /* flags on received message */ +} lx_msghdr32_t; + +typedef struct lx_mmsghdr32 { + lx_msghdr32_t msg_hdr; /* message header */ + unsigned int msg_len; /* no. of bytes transmitted */ +} lx_mmsghdr32_t; + +#endif + +typedef struct lx_sockaddr_in6 { + sa_family_t sin6_family; + in_port_t sin6_port; + uint32_t sin6_flowinfo; + struct in6_addr sin6_addr; + uint32_t sin6_scope_id; /* Depends on scope of sin6_addr */ + /* one 32-bit field shorter than illumos */ +} lx_sockaddr_in6_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_SOCKET_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h new file mode 100644 index 0000000000..78fbf6e0a8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h @@ -0,0 +1,341 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. + * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. + */ + +#ifndef _SYS_LINUX_SYSCALLS_H +#define _SYS_LINUX_SYSCALLS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +extern long lx_accept(); +extern long lx_accept4(); +extern long lx_access(); +extern long lx_acct(); +extern long lx_alarm(); +extern long lx_arch_prctl(); +extern long lx_bind(); +extern long lx_brk(); +extern long lx_chdir(); +extern long lx_chmod(); +extern long lx_chown(); +extern long lx_chown16(); +extern long lx_chroot(); +extern long lx_clock_getres(); +extern long lx_clock_gettime(); +extern long lx_clock_settime(); +extern long lx_close(); +extern long lx_connect(); +extern long lx_creat(); +extern long lx_dup(); +extern long lx_dup2(); +extern long lx_dup3(); +extern long lx_epoll_create(); +extern long lx_epoll_create1(); +extern long lx_epoll_ctl(); +extern long lx_epoll_pwait(); +extern long lx_epoll_wait(); +extern long lx_eventfd(); +extern long lx_eventfd2(); +extern long lx_faccessat(); +extern long lx_fadvise64(); +extern long lx_fadvise64_32(); +extern long lx_fadvise64_64(); +extern long lx_fallocate(); +extern long lx_fallocate32(); +extern long lx_fchdir(); +extern long lx_fchmod(); +extern long lx_fchmodat(); +extern long lx_fchown(); +extern long lx_fchown16(); +extern long lx_fchownat(); +extern long lx_fcntl(); +extern long lx_fcntl64(); +extern long lx_fgetxattr(); +extern long lx_flistxattr(); +extern long lx_flock(); +extern long lx_fremovexattr(); +extern long lx_fsetxattr(); +extern long lx_fstat32(); +extern long lx_fstat64(); +extern long lx_fstatat64(); +extern 
long lx_futex(); +extern long lx_get_robust_list(); +extern long lx_get_thread_area(); +extern long lx_getcpu(); +extern long lx_getcwd(); +extern long lx_getdents_32(); +extern long lx_getdents_64(); +extern long lx_getdents64(); +extern long lx_getegid(); +extern long lx_getegid16(); +extern long lx_geteuid(); +extern long lx_geteuid16(); +extern long lx_getgid(); +extern long lx_getgid16(); +extern long lx_getitimer(); +extern long lx_getpeername(); +extern long lx_getpgid(); +extern long lx_getpgrp(); +extern long lx_getsockname(); +extern long lx_getpid(); +extern long lx_getppid(); +extern long lx_getpriority(); +extern long lx_getrandom(); +extern long lx_getresgid(); +extern long lx_getresgid16(); +extern long lx_getresuid(); +extern long lx_getresuid16(); +extern long lx_getrlimit(); +extern long lx_getrusage(); +extern long lx_getsid(); +extern long lx_getsockopt(); +extern long lx_gettid(); +extern long lx_gettimeofday(); +extern long lx_getuid(); +extern long lx_getuid16(); +extern long lx_getxattr(); +extern long lx_io_cancel(); +extern long lx_io_destroy(); +extern long lx_io_getevents(); +extern long lx_io_setup(); +extern long lx_io_submit(); +extern long lx_ioctl(); +extern long lx_ioprio_get(); +extern long lx_ioprio_set(); +extern long lx_kill(); +extern long lx_lchown(); +extern long lx_lchown16(); +extern long lx_lgetxattr(); +extern long lx_link(); +extern long lx_linkat(); +extern long lx_listen(); +extern long lx_llistxattr(); +extern long lx_llseek(); +extern long lx_lremovexattr(); +extern long lx_lseek32(); +extern long lx_lseek64(); +extern long lx_lsetxattr(); +extern long lx_lstat32(); +extern long lx_lstat64(); +extern long lx_listxattr(); +extern long lx_madvise(); +extern long lx_mincore(); +extern long lx_mkdir(); +extern long lx_mkdirat(); +extern long lx_mlock(); +extern long lx_mlockall(); +extern long lx_mmap(); +extern long lx_mmap2(); +extern long lx_mremap(); +extern long lx_mprotect(); +extern long lx_modify_ldt(); +extern 
long lx_mount(); +extern long lx_msync(); +extern long lx_munlock(); +extern long lx_munlockall(); +extern long lx_munmap(); +extern long lx_nanosleep(); +extern long lx_nice(); +extern long lx_oldgetrlimit(); +extern long lx_open(); +extern long lx_openat(); +extern long lx_pause(); +extern long lx_personality(); +extern long lx_pipe(); +extern long lx_pipe2(); +extern long lx_poll(); +extern long lx_ppoll(); +extern long lx_pread(); +extern long lx_pread32(); +extern long lx_preadv(); +extern long lx_preadv32(); +extern long lx_prctl(); +extern long lx_prlimit64(); +extern long lx_pselect(); +extern long lx_ptrace(); +extern long lx_pwrite(); +extern long lx_pwrite32(); +extern long lx_pwritev(); +extern long lx_pwritev32(); +extern long lx_read(); +extern long lx_readlink(); +extern long lx_readlinkat(); +extern long lx_readv(); +extern long lx_reboot(); +extern long lx_recv(); +extern long lx_recvmsg(); +extern long lx_recvmmsg(); +extern long lx_recvfrom(); +extern long lx_rename(); +extern long lx_renameat(); +extern long lx_sched_getaffinity(); +extern long lx_sched_getparam(); +extern long lx_sched_getscheduler(); +extern long lx_sched_getattr(); +extern long lx_sched_get_priority_max(); +extern long lx_sched_get_priority_min(); +extern long lx_sched_rr_get_interval(); +extern long lx_sched_setaffinity(); +extern long lx_sched_setattr(); +extern long lx_sched_setparam(); +extern long lx_sched_setscheduler(); +extern long lx_sched_yield(); +extern long lx_select(); +extern long lx_send(); +extern long lx_sendmsg(); +extern long lx_sendmmsg(); +extern long lx_sendto(); +extern long lx_set_robust_list(); +extern long lx_set_thread_area(); +extern long lx_set_tid_address(); +extern long lx_setdomainname(); +extern long lx_setfsuid(); +extern long lx_setfsuid16(); +extern long lx_setfsgid(); +extern long lx_setfsgid16(); +extern long lx_setgid(); +extern long lx_setgid16(); +extern long lx_sethostname(); +extern long lx_setpgid(); +extern long lx_setpriority(); 
+extern long lx_setregid(); +extern long lx_setregid16(); +extern long lx_setresgid(); +extern long lx_setresgid16(); +extern long lx_setresuid(); +extern long lx_setresuid16(); +extern long lx_setreuid(); +extern long lx_setreuid16(); +extern long lx_setrlimit(); +extern long lx_setsid(); +extern long lx_setuid(); +extern long lx_setuid16(); +extern long lx_setxattr(); +extern long lx_setsockopt(); +extern long lx_symlink(); +extern long lx_symlinkat(); +extern long lx_shutdown(); +extern long lx_socket(); +extern long lx_socketcall(); +extern long lx_socketpair(); +extern long lx_splice(); +extern long lx_stat32(); +extern long lx_stat64(); +extern long lx_stime(); +extern long lx_swapoff(); +extern long lx_swapon(); +extern long lx_sync(); +extern long lx_sync_file_range(); +extern long lx_syncfs(); +extern long lx_sysinfo32(); +extern long lx_sysinfo64(); +extern long lx_syslog(); +extern long lx_removexattr(); +extern long lx_tgkill(); +extern long lx_time(); +extern long lx_times(); +extern long lx_timer_create(); +extern long lx_tkill(); +extern long lx_umask(); +extern long lx_umount(); +extern long lx_umount2(); +extern long lx_uname(); +extern long lx_unlink(); +extern long lx_unlinkat(); +extern long lx_unshare(); +extern long lx_vhangup(); +extern long lx_wait4(); +extern long lx_waitid(); +extern long lx_waitpid(); +extern long lx_write(); +extern long lx_writev(); + +#if defined(_LP64) +/* + * Linux vsyscall addresses: + */ +#define LX_VSYS_gettimeofday (uintptr_t)0xffffffffff600000 +#define LX_VSYS_time (uintptr_t)0xffffffffff600400 +#define LX_VSYS_getcpu (uintptr_t)0xffffffffff600800 + +#define LX_VSYSCALL_ADDR (uintptr_t)0xffffffffff600000 +#define LX_VSYSCALL_SIZE (uintptr_t)0x1000 +#endif + +#endif /* _KERNEL */ + +/* + * System call numbers for revectoring: + */ + +#if defined(__amd64) +#define LX_SYS_close 3 +#define LX_SYS_gettimeofday 96 +#define LX_SYS_mount 165 +#define LX_SYS_time 201 +#define LX_SYS_io_setup 206 +#define 
LX_SYS_clock_gettime 228 +#define LX_SYS_getcpu 309 + +#define LX_SYS32_close 6 +#define LX_SYS32_gettimeofday 78 +#define LX_SYS32_time 13 +#define LX_SYS32_mount 21 +#define LX_SYS32_clock_gettime 265 +#define LX_SYS32_io_setup 245 +#define LX_SYS32_getcpu 318 +#elif defined(__i386) +#define LX_SYS_close 6 +#define LX_SYS_mount 21 +#define LX_SYS_gettimeofday 78 +#define LX_SYS_time 13 +#define LX_SYS_clock_gettime 265 +#define LX_SYS_io_setup 245 +#define LX_SYS_getcpu 318 +#else +#error "Architecture not supported" +#endif /* defined(__amd64) */ + +/* + * The current code in the VDSO operates under the expectation that it will be + * mapped at a fixed offset from the comm page. This simplifies the act of + * locating said page without any other reference. The VDSO must fit within + * this offset, matching the same value as COMM_PAGE_ALIGN. + * See: uts/i86pc/sys/comm_page.h + */ +#define LX_VDSO_SIZE 0x4000 +#define LX_VDSO_ADDR_MASK ~(LX_VDSO_SIZE - 1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_SYSCALLS_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_types.h b/usr/src/uts/common/brand/lx/sys/lx_types.h new file mode 100644 index 0000000000..90363c8939 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_types.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LX_TYPES_H +#define _SYS_LX_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL + +#define SHRT_MIN (-32768) /* min value of a "short int" */ +#define SHRT_MAX 32767 /* max value of a "short int" */ +#define USHRT_MAX 65535 /* max of "unsigned short int" */ +#define INT_MIN (-2147483647-1) /* min value of an "int" */ +#define INT_MAX 2147483647 /* max value of an "int" */ +#define UINT_MAX 4294967295U /* max value of an "unsigned int" */ + +#ifndef LLONG_MAX +#define LLONG_MAX 9223372036854775807LL +#endif + +#if defined(_LP64) +#define LONG_MAX 9223372036854775807L +#define ULONG_MAX 18446744073709551615UL +#else +#define LONG_MAX 2147483647L /* max value of a 32-bit "long int" */ +#define ULONG_MAX 4294967295UL /* max value of a 32-bit "ulong int" */ +#endif + +#endif /* !_KERNEL */ + + +typedef uint64_t lx_dev_t; +typedef uint16_t lx_dev16_t; +typedef uint32_t lx_ino_t; +typedef uint64_t lx_ino64_t; +typedef uint32_t lx_uid_t; +typedef uint16_t lx_uid16_t; +typedef uint32_t lx_gid_t; +typedef uint16_t lx_gid16_t; +typedef uint32_t lx_off_t; +typedef uint64_t lx_off64_t; +typedef uint32_t lx_blksize_t; +typedef uint32_t lx_blkcnt_t; +typedef uint64_t lx_blkcnt64_t; +typedef uint32_t lx_mode_t; +typedef uint16_t lx_mode16_t; + +/* + * Linux mangles major/minor numbers into dev_t differently than SunOS. 
+ */ +#ifdef _LP64 +#define LX_MAKEDEVICE(maj, min) \ + (((min) & 0xff) | (((maj) & 0xfff) << 8) | \ + ((uint64_t)((min) & ~0xff) << 12) | ((uint64_t)((maj) & ~0xfff) << 32)) + +#define LX_GETMAJOR(lx_dev) ((((lx_dev) >> 8) & 0xfff) | \ + ((((uint64_t)(lx_dev)) >> 32) & ~0xfff)) + +#else +#define LX_MAKEDEVICE(maj, min) \ + (((min) & 0xff) | (((maj) & 0xfff) << 8) | (((min) & ~0xff) << 12)) + +#define LX_GETMAJOR(lx_dev) (((lx_dev) >> 8) & 0xfff) +#endif + +#define LX_GETMINOR(lx_dev) (((lx_dev) & 0xff) | (((lx_dev) >> 12) & ~0xff)) +/* Linux supports 20 bits for the minor, and 12 bits for the major number */ +#define LX_MAXMIN 0xfffff +#define LX_MAXMAJ 0xfff + +/* + * Certain Linux tools care deeply about major/minor number mapping. + * Map virtual disks (zfs datasets, zvols, etc) into a safe reserved range. + */ +#define LX_MAJOR_DISK 203 + +/* LX ptm driver major/minor number */ +#define LX_PTM_MAJOR 5 +#define LX_PTM_MINOR 2 + +/* LX pts driver major number range */ +#define LX_PTS_MAJOR_MIN 136 +#define LX_PTS_MAJOR_MAX 143 + +/* LX tty/cons driver major number */ +#define LX_TTY_MAJOR 5 + +#define LX_UID16_TO_UID32(uid16) \ + (((uid16) == (lx_uid16_t)-1) ? ((lx_uid_t)-1) : (lx_uid_t)(uid16)) + +#define LX_GID16_TO_GID32(gid16) \ + (((gid16) == (lx_gid16_t)-1) ? ((lx_gid_t)-1) : (lx_gid_t)(gid16)) + +/* Overflow values default to NFS nobody. */ + +#define UID16_OVERFLOW ((lx_uid16_t)65534) +#define GID16_OVERFLOW ((lx_gid16_t)65534) + +/* + * All IDs with high word non-zero are converted to default overflow values to + * avoid inadvertent truncation to zero (root) (!). + */ +#define LX_UID32_TO_UID16(uid32) \ + ((((uid32) & 0xffff0000) == 0) ? ((lx_uid16_t)(uid32)) : \ + (((uid32) == ((lx_uid_t)-1)) ? ((lx_uid16_t)-1) : UID16_OVERFLOW)) + +#define LX_GID32_TO_GID16(gid32) \ + ((((gid32) & 0xffff0000) == 0) ? ((lx_gid16_t)(gid32)) : \ + (((gid32) == ((lx_gid_t)-1)) ? 
((lx_gid16_t)-1) : GID16_OVERFLOW)) + +#define LX_32TO64(lo, hi) \ + ((uint64_t)((uint64_t)(lo) | ((uint64_t)(hi) << 32))) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_TYPES_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_userhz.h b/usr/src/uts/common/brand/lx/sys/lx_userhz.h new file mode 100644 index 0000000000..ebbda28698 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_userhz.h @@ -0,0 +1,64 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _LX_USERHZ_H +#define _LX_USERHZ_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Within the kernel, Linux implements an internal hz that they refer to as a + * "jiffy". Linux can be built with different hz, but on modern kernels + * it is frequently 250. However, Linux has a separate concept for the hz + * that is visible outside the kernel. This is called "USER_HZ" and is the + * value returned by 'sysconf(_SC_CLK_TCK)'. This is almost universally set to + * 100hz. Some (lazy) applications just hardcode 100hz instead of checking. + * To accommodate these broken applications, we always work with a USER_HZ of + * 100 and scale accordingly. See the Linux time(7) man page for a more + * detailed discussion of their behavior. See the comment in our + * uts/common/conf/param.c for a discussion of valid native hz values. + * + * There are a few interfaces which expose a clock_t to user-land and which + * need to be considered for USER_HZ adjustment. + * 1) The times(2) syscall. This is handled correctly. 
+ * 2) The waitid(2) syscall passes a siginfo_t which contains si_stime and + * si_utime. Testing waitid(2) on various Linux distributions shows that + * these fields are garbage. This aligns with the Linux waitid(2) man page, + * which describes the subset of the siginfo_t structure that is populated. + * Neither si_stime nor si_utime is listed. + * 3) A sigaction(2) handler can pass a siginfo_t. This is only documented to + * occur when the sa_flags is SA_SIGINFO. The si_stime and si_utime are + * documented to only be populated when the signal is SIGCHLD. However, + * testing on Linux seems to show that these fields are not consistent + * with the corresponding times(2) data for the process, even for the + * SIGCHLD sigaction handler case. + * 4) Some fields in /proc/stat and /proc/pid/stat. See the Linux proc man + * page for references to sysconf(_SC_CLK_TCK). + * + * Although the siginfo_t si_stime and si_utime data for cases #2 and #3 is not + * consistent on Linux, we populate these fields correctly to be on the safe + * side. + */ +extern uint_t lx_hz_scale; +#define LX_USERHZ 100 +#define HZ_TO_LX_USERHZ(x) ((x) / lx_hz_scale) + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_USERHZ_H */ diff --git a/usr/src/uts/common/brand/lx/syscall/lx_access.c b/usr/src/uts/common/brand/lx/syscall/lx_access.c new file mode 100644 index 0000000000..8cf836cd7a --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_access.c @@ -0,0 +1,223 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
+ * + * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + * + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/param.h> +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/cred_impl.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/uio.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/file.h> +#include <fs/fs_subr.h> +#include <c2/audit.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> + +/* + * Determine accessibility of file. + */ + +#define E_OK 010 /* use effective ids */ +#define R_OK 004 +#define W_OK 002 +#define X_OK 001 + +/* + * Convert Linux LX_AT_* flags to SunOS AT_* flags but skip verifying allowed + * flags have been passed. This also allows EACCESS/REMOVEDIR to be translated + * correctly since on linux they have the same value. + * + * Some code can actually pass in other bits in the flag. We may have to simply + * ignore these, as indicated by the enforce parameter. 
+ */ +int +ltos_at_flag(int lflag, int allow, boolean_t enforce) +{ + int sflag = 0; + + if ((lflag & LX_AT_EACCESS) && (allow & AT_EACCESS)) { + lflag &= ~LX_AT_EACCESS; + sflag |= AT_EACCESS; + } + + if ((lflag & LX_AT_REMOVEDIR) && (allow & AT_REMOVEDIR)) { + lflag &= ~LX_AT_REMOVEDIR; + sflag |= AT_REMOVEDIR; + } + + if ((lflag & LX_AT_SYMLINK_NOFOLLOW) && (allow & AT_SYMLINK_NOFOLLOW)) { + lflag &= ~LX_AT_SYMLINK_NOFOLLOW; + sflag |= AT_SYMLINK_NOFOLLOW; + } + + /* right now SunOS doesn't have a _FOLLOW flag, so use a fake one */ + if ((lflag & LX_AT_SYMLINK_FOLLOW) && (allow & LX_AT_SYMLINK_FOLLOW)) { + lflag &= ~LX_AT_SYMLINK_FOLLOW; + sflag |= LX_AT_SYMLINK_FOLLOW; + } + + /* If lflag is not zero then some flags did not hit the above code. */ + if (enforce && lflag) + return (-1); + + return (sflag); +} + +/* + * For illumos, access() does this: + * If the process has appropriate privileges, an implementation may indicate + * success for X_OK even if none of the execute file permission bits are set. + * + * But for Linux, access() does this: + * If the calling process is privileged (i.e., its real UID is zero), then + * an X_OK check is successful for a regular file if execute permission is + * enabled for any of the file owner, group, or other. + * + * Linux used to behave more like illumos on older kernels: + * In kernel 2.4 (and earlier) there is some strangeness in the handling + * of X_OK tests for superuser. If all categories of execute permission + * are disabled for a nondirectory file, then the only access() test that + * returns -1 is when mode is specified as just X_OK; if R_OK or W_OK is + * also specified in mode, then access() returns 0 for such files. + * + * So we need to handle the case where a privileged process is checking for + * X_OK but none of the execute bits are set on the file. We'll keep the old + * 2.4 behavior for 2.4 emulation but use the new behavior for any other + * kernel rev.
+ */ +static int +lx_common_access(char *fname, int fmode, vnode_t *startvp) +{ + vnode_t *vp; + cred_t *tmpcr; + int error; + int mode; + cred_t *cr; + int estale_retry = 0; + + if (fmode & ~(E_OK|R_OK|W_OK|X_OK)) + return (EINVAL); + + mode = ((fmode & (R_OK|W_OK|X_OK)) << 6); + + cr = CRED(); + + /* OK to use effective uid/gid, i.e., no need to crdup(CRED())? */ + if ((fmode & E_OK) != 0 || + (cr->cr_uid == cr->cr_ruid && cr->cr_gid == cr->cr_rgid)) { + tmpcr = cr; + crhold(tmpcr); + } else { + tmpcr = crdup(cr); + tmpcr->cr_uid = cr->cr_ruid; + tmpcr->cr_gid = cr->cr_rgid; + tmpcr->cr_ruid = cr->cr_uid; + tmpcr->cr_rgid = cr->cr_gid; + } + +lookup: + if ((error = lookupnameatcred(fname, UIO_USERSPACE, FOLLOW, NULLVPP, + &vp, startvp, tmpcr)) != 0) { + if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) + goto lookup; + crfree(tmpcr); + return (error); + } + + if (mode != 0) { + error = VOP_ACCESS(vp, mode, 0, tmpcr, NULL); + if (error != 0) { + if ((error == ESTALE) && + fs_need_estale_retry(estale_retry++)) { + VN_RELE(vp); + goto lookup; + } + + } else if ((fmode & X_OK) != 0 && cr->cr_ruid == 0 && + lx_kern_release_cmp(curproc->p_zone, "2.4.0") > 0) { + /* check for incorrect execute success */ + vattr_t va; + + va.va_mask = AT_MODE; + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) == 0) { + mode_t m = VTTOIF(va.va_type) | va.va_mode; + + if ((m & S_IFMT) == S_IFREG && + !(m & (S_IXUSR | S_IXGRP | S_IXOTH))) { + /* no execute bits set in the mode */ + error = EACCES; + } + } + } + } + + crfree(tmpcr); + VN_RELE(vp); + return (error); +} + +int +lx_faccessat(int atfd, char *fname, int fmode, int flag) +{ + vnode_t *startvp; + int error; + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + if ((flag = ltos_at_flag(flag, AT_EACCESS, B_FALSE)) < 0) + return (set_errno(EINVAL)); + + if (fname == NULL) + return (set_errno(EFAULT)); + if ((error = fgetstartvp(atfd, fname, &startvp)) != 0) + return (set_errno(error)); + if (AU_AUDITING() && startvp != 
NULL) + audit_setfsat_path(1); + + /* Do not allow E_OK unless AT_EACCESS flag is set */ + if ((flag & AT_EACCESS) == 0) + fmode &= ~E_OK; + + error = lx_common_access(fname, fmode, startvp); + if (startvp != NULL) + VN_RELE(startvp); + if (error) + return (set_errno(error)); + return (0); +} + +int +lx_access(char *fname, int fmode) +{ + return (lx_faccessat(LX_AT_FDCWD, fname, fmode, 0)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_aio.c b/usr/src/uts/common/brand/lx/syscall/lx_aio.c new file mode 100644 index 0000000000..6748245db8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_aio.c @@ -0,0 +1,1345 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Linux aio syscall support. + * + * The Linux story around the io_* syscalls is very confusing. The io_* syscalls + * are not exposed via glibc and in fact, glibc seems to implement its own aio + * without using the io_* syscalls at all. However, there is the libaio library + * which uses the io_* syscalls, although its implementation of the io_* + * functions (with the same names!) is different from the syscalls themselves, + * and it uses different definitions for some of the structures involved. + * + * These syscalls are documented to use an aio_context_t for the context + * parameter. On Linux this is a ulong_t. The contexts live in the kernel + * address space and are looked up using the aio_context_t parameter. 
However, + * the Linux libaio library, which is a consumer of the io_* syscalls, abuses + * the context by assuming it can be used as a pointer into memory that is + * mapped into the process. To accommodate this abomination we map a page of + * anonymous memory and expose the context to user-land as a pointer offset + * into that page. The page itself is never used by our code and our internal + * context ID is simply an integer we calculate based on the page pointer + * offset. + * + * Most applications never use aio, so we don't want an implementation that + * adds overhead to every process, but on the other hand, when an application is + * using aio, it is for performance reasons and we want to be as efficient as + * possible. In particular, we don't want to dynamically allocate resources + * in the paths that enqueue I/O. Instead, we pre-allocate the resources + * we may need when the application performs the io_setup call and keep the + * io_submit and io_getevents calls streamlined. + * + * The general approach here is inspired by the native aio support provided by + * libc in user-land. We have worker threads that pick up pending work from + * the context "lxioctx_pending" list and synchronously issue the operation in + * the control block. When the operation completes, the thread places the + * control block into the context "lxioctx_done" list for later consumption by + * io_getevents. The thread will then attempt to service another pending + * operation or wait for more work to arrive. + * + * The control blocks on the pending or done lists are referenced by an + * lx_io_elem_t struct. This simply holds a pointer to the user-land control + * block and the result of the operation. These elements are pre-allocated at + * io_setup time and stored on the context "lxioctx_free" list. + * + * io_submit pulls elements off of the free list, places them on the pending + * list and kicks a worker thread to run.
io_getevents pulls elements off of + * the done list, sets up an event to return, and places the elements back + * onto the free list. + * + * The worker threads are pre-allocated at io_setup time. These are LWP's + * that are part of the process, but never leave the kernel. The number of + * LWP's is allocated based on the nr_events argument to io_setup. Because + * this argument can theoretically be large (up to LX_AIO_MAX_NR), we want to + * pre-allocate enough threads to get good I/O concurrency, but not overdo it. + * For a small nr_events (<= lx_aio_base_workers) we pre-allocate as many + * threads as nr_events so that all of the I/O can run in parallel. Once + * we exceed lx_aio_base_workers, we scale up the number of threads by 2, until + * we hit the maximum at lx_aio_max_workers. See the code in io_setup for more + * information. + * + * Because the worker threads never leave the kernel, they are marked with the + * TP_KTHREAD bit so that /proc operations essentially ignore them. We also tag + * the brand lwp flags with the BR_AIO_LWP bit so that these threads never + * appear in the lx /proc. Aside from servicing aio submissions, the worker + * threads don't participate in most application-initiated operations. Forking + * is a special case for the workers. The Linux fork(2) and vfork(2) behavior + * always forks only a single thread; the caller. However, during cfork() the + * system attempts to quiesce all threads by calling holdlwps(). The workers + * check for SHOLDFORK and SHOLDFORK1 in their loops and suspend themselves ala + * holdlwp() if the process forks. + * + * It is hard to make any generalized statements about how the aio syscalls + * are used in production. MySQL is one of the more popular consumers of aio + * and in the default configuration it will create 10 contexts with a capacity + * of 256 I/Os (io_setup nr_events) and 1 context with a capacity of 100 I/Os. 
+ * Another application we've seen will create 8 contexts, each with a capacity + * of 128 I/Os. In practice 1-7 was the typical number of in-flight I/Os. + * + * The default configuration for MySQL uses 4 read and 4 write threads. Each + * thread has an associated context. MySQL also allocates 3 additional contexts, + * so in the default configuration it will only use 11, but the number of + * read and write threads can be tuned up to a maximum of 64. We can expand + * a process's number of contexts up to a maximum of LX_IOCTX_CNT_MAX, which + * is significantly more than we've ever seen in use. + * + * According to www.kernel.org/doc/Documentation/sysctl/fs.txt, the + * /proc/sys/fs entries for aio are: + * - aio-nr: The total of all nr_events values specified on the io_setup + * call for every active context. + * - aio-max-nr: The upper limit for aio-nr + * aio-nr is tracked as a zone-wide value. We keep aio-max-nr limited to + * LX_AIO_MAX_NR, which matches Linux and provides plenty of headroom for the + * zone. + */ + +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/time.h> +#include <sys/brand.h> +#include <sys/sysmacros.h> +#include <sys/sdt.h> +#include <sys/procfs.h> +#include <sys/eventfd.h> + +#include <sys/lx_brand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_misc.h> +#include <lx_errno.h> + +/* These constants match Linux */ +#define LX_IOCB_FLAG_RESFD 0x0001 +#define LX_IOCB_CMD_PREAD 0 +#define LX_IOCB_CMD_PWRITE 1 +#define LX_IOCB_CMD_FSYNC 2 +#define LX_IOCB_CMD_FDSYNC 3 +#define LX_IOCB_CMD_PREADX 4 +#define LX_IOCB_CMD_POLL 5 +#define LX_IOCB_CMD_NOOP 6 +#define LX_IOCB_CMD_PREADV 7 +#define LX_IOCB_CMD_PWRITEV 8 + +#define LX_KIOCB_KEY 0 + +/* + * Base and max. number of contexts/process. Note that we currently map one + * page to manage the user-level context ID, so that code must be adjusted if + * LX_IOCTX_CNT_MAX is ever enlarged. Currently, this is the limit for the + * number of 64-bit pointers in one 4k page. 
+ */ +#define LX_IOCTX_CNT_BASE 16 +#define LX_IOCTX_CNT_MAX 512 + +/* + * Max number of control block pointers, or lx_io_event_t's, to allocate on the + * stack in io_submit or io_getevents. + */ +#define MAX_ALLOC_ON_STACK 128 +#define alloca(x) __builtin_alloca(x) +extern void *__builtin_alloca(size_t); + +/* The context is an offset within the ctxpage we mapped */ +#define CTXID_TO_PTR(L, I) ((L)->l_io_ctxpage + ((I) * sizeof (uintptr_t))) +#define PTR_TO_CTXID(L, P) ((int)((uintptr_t)(P) - (L)->l_io_ctxpage) / \ + sizeof (uintptr_t)) + +typedef ulong_t lx_aio_context_t; + +uint_t lx_aio_base_workers = 16; /* num threads/context before scaling */ +uint_t lx_aio_max_workers = 32; /* upper limit on threads/context */ + +/* + * Internal representation of an aio context. + */ +typedef struct lx_io_ctx { + boolean_t lxioctx_shutdown; /* context is being destroyed */ + uint_t lxioctx_maxn; /* nr_events from io_setup */ + uint_t lxioctx_in_use; /* reference counter */ + kmutex_t lxioctx_f_lock; /* free list lock */ + uint_t lxioctx_free_cnt; /* num. elements in free list */ + list_t lxioctx_free; /* free list */ + kmutex_t lxioctx_p_lock; /* pending list lock */ + kcondvar_t lxioctx_pending_cv; /* pending list cv */ + list_t lxioctx_pending; /* pending list */ + kmutex_t lxioctx_d_lock; /* done list lock */ + kcondvar_t lxioctx_done_cv; /* done list cv */ + uint_t lxioctx_done_cnt; /* num. elements in done list */ + list_t lxioctx_done; /* done list */ +} lx_io_ctx_t; + +/* + * Linux binary definition of an I/O event. + */ +typedef struct lx_io_event { + uint64_t lxioe_data; /* data payload */ + uint64_t lxioe_object; /* object of origin */ + int64_t lxioe_res; /* result code */ + int64_t lxioe_res2; /* "secondary" result (WTF?) */ +} lx_io_event_t; + +/* + * Linux binary definition of an I/O control block. + */ +typedef struct lx_iocb { + uint64_t lxiocb_data; /* data payload */ + uint32_t lxiocb_key; /* must be LX_KIOCB_KEY (!) 
*/ + uint32_t lxiocb_reserved1; + uint16_t lxiocb_op; /* operation */ + int16_t lxiocb_reqprio; /* request priority */ + uint32_t lxiocb_fd; /* file descriptor */ + uint64_t lxiocb_buf; /* data buffer */ + uint64_t lxiocb_nbytes; /* number of bytes */ + int64_t lxiocb_offset; /* offset in file */ + uint64_t lxiocb_reserved2; + uint32_t lxiocb_flags; /* LX_IOCB_FLAG_* flags */ + uint32_t lxiocb_resfd; /* eventfd fd, if any */ +} lx_iocb_t; + +typedef struct lx_io_elem { + list_node_t lxioelem_link; + uint16_t lxioelem_op; /* operation */ + uint16_t lxioelem_flags; /* bits from lxiocb_flags */ + int lxioelem_fd; /* file descriptor */ + file_t *lxioelem_fp; /* getf() file pointer */ + int lxioelem_resfd; /* RESFD file descriptor */ + file_t *lxioelem_resfp; /* RESFD getf() file pointer */ + void *lxioelem_buf; /* data buffer */ + uint64_t lxioelem_nbytes; /* number of bytes */ + int64_t lxioelem_offset; /* offset in file */ + uint64_t lxioelem_data; + ssize_t lxioelem_res; + void *lxioelem_cbp; /* ptr to iocb in userspace */ +} lx_io_elem_t; + +/* From lx_rw.c */ +extern ssize_t lx_pread_fp(file_t *, void *, size_t, off64_t); +extern ssize_t lx_pwrite_fp(file_t *, void *, size_t, off64_t); + +/* From common/syscall/rw.c */ +extern int fdsync(int, int); +/* From common/os/grow.c */ +extern caddr_t smmap64(caddr_t, size_t, int, int, int, off_t); + +/* + * Given an aio_context ID, return our internal context pointer with an + * additional ref. count, or NULL if cp not found. 
+ */ +static lx_io_ctx_t * +lx_io_cp_hold(lx_aio_context_t cid) +{ + int id; + lx_proc_data_t *lxpd = ptolxproc(curproc); + lx_io_ctx_t *cp; + + mutex_enter(&lxpd->l_io_ctx_lock); + + if (lxpd->l_io_ctxs == NULL) { + ASSERT(lxpd->l_io_ctx_cnt == 0); + ASSERT(lxpd->l_io_ctxpage == (uintptr_t)NULL); + goto bad; + } + + id = PTR_TO_CTXID(lxpd, cid); + if (id < 0 || id >= lxpd->l_io_ctx_cnt) + goto bad; + + if ((cp = lxpd->l_io_ctxs[id]) == NULL) + goto bad; + + if (cp->lxioctx_shutdown) + goto bad; + + atomic_inc_32(&cp->lxioctx_in_use); + mutex_exit(&lxpd->l_io_ctx_lock); + return (cp); + +bad: + mutex_exit(&lxpd->l_io_ctx_lock); + return (NULL); +} + +/* + * Release a hold on the context and clean up the context if it was the last + * hold. + */ +static void +lx_io_cp_rele(lx_io_ctx_t *cp) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + lx_zone_data_t *lxzd; + int i; + lx_io_elem_t *ep; + + mutex_enter(&lxpd->l_io_ctx_lock); + ASSERT(cp->lxioctx_in_use >= 1); + if (cp->lxioctx_in_use > 1) { + atomic_dec_32(&cp->lxioctx_in_use); + /* wake all threads waiting on context rele */ + cv_broadcast(&lxpd->l_io_destroy_cv); + mutex_exit(&lxpd->l_io_ctx_lock); + return; + } + + /* + * We hold the last ref. + */ + for (i = 0; i < lxpd->l_io_ctx_cnt; i++) { + if (lxpd->l_io_ctxs[i] == cp) { + lxpd->l_io_ctxs[i] = NULL; + break; + } + } + ASSERT(i < lxpd->l_io_ctx_cnt); + /* wake all threads waiting on context destruction */ + cv_broadcast(&lxpd->l_io_destroy_cv); + ASSERT(cp->lxioctx_shutdown == B_TRUE); + + mutex_exit(&lxpd->l_io_ctx_lock); + + /* can now decrement the zone's overall aio counter */ + lxzd = ztolxzd(curproc->p_zone); + mutex_enter(&lxzd->lxzd_lock); + VERIFY(cp->lxioctx_maxn <= lxzd->lxzd_aio_nr); + lxzd->lxzd_aio_nr -= cp->lxioctx_maxn; + mutex_exit(&lxzd->lxzd_lock); + + /* + * We have the only pointer to the context now. Free all + * elements from all three queues and the context itself. 
+ */ + while ((ep = list_remove_head(&cp->lxioctx_free)) != NULL) { + kmem_free(ep, sizeof (lx_io_elem_t)); + } + + /* + * During io_submit() we use getf() to get/validate the file pointer + * for the file descriptor in each control block. We do not releasef() + * the fd, but instead pass along the fd and file pointer to the worker + * threads. In order to manage this hand-off we use clear_active_fd() + * in the syscall path and then in our thread which takes over the file + * descriptor, we use a combination of set_active_fd() and releasef(). + * Because our thread that is taking ownership of the fd has not called + * getf(), we first call set_active_fd(-1) to reserve a slot in the + * active fd array for ourselves. + */ + set_active_fd(-1); + while ((ep = list_remove_head(&cp->lxioctx_pending)) != NULL) { + set_active_fd(ep->lxioelem_fd); + releasef(ep->lxioelem_fd); + + if (ep->lxioelem_flags & LX_IOCB_FLAG_RESFD) { + set_active_fd(ep->lxioelem_resfd); + releasef(ep->lxioelem_resfd); + } + + kmem_free(ep, sizeof (lx_io_elem_t)); + } + + while ((ep = list_remove_head(&cp->lxioctx_done)) != NULL) { + kmem_free(ep, sizeof (lx_io_elem_t)); + } + + ASSERT(list_is_empty(&cp->lxioctx_free)); + list_destroy(&cp->lxioctx_free); + ASSERT(list_is_empty(&cp->lxioctx_pending)); + list_destroy(&cp->lxioctx_pending); + ASSERT(list_is_empty(&cp->lxioctx_done)); + list_destroy(&cp->lxioctx_done); + + kmem_free(cp, sizeof (lx_io_ctx_t)); +} + +/* + * Called by a worker thread to perform the operation specified in the control + * block. + * + * Linux returns a negative errno in the event "lxioelem_res" field as the + * result of a failed operation. We do the same. + */ +static void +lx_io_do_op(lx_io_elem_t *ep) +{ + int err; + int64_t res = 0; + + set_active_fd(ep->lxioelem_fd); + + ttolwp(curthread)->lwp_errno = 0; + switch (ep->lxioelem_op) { + case LX_IOCB_CMD_FSYNC: + case LX_IOCB_CMD_FDSYNC: + /* + * Note that Linux always returns EINVAL for these two + * operations. 
This is apparently because nothing in Linux + * defines the 'aio_fsync' function. Thus, it is unlikely any + * application will actually submit these. + * + * This is basically fdsync(), but we already have the fp. + */ + err = VOP_FSYNC(ep->lxioelem_fp->f_vnode, + (ep->lxioelem_op == LX_IOCB_CMD_FSYNC) ? FSYNC : FDSYNC, + ep->lxioelem_fp->f_cred, NULL); + if (err != 0) { + (void) set_errno(err); + } + + break; + + case LX_IOCB_CMD_PREAD: + res = lx_pread_fp(ep->lxioelem_fp, ep->lxioelem_buf, + ep->lxioelem_nbytes, ep->lxioelem_offset); + break; + + case LX_IOCB_CMD_PWRITE: + res = lx_pwrite_fp(ep->lxioelem_fp, ep->lxioelem_buf, + ep->lxioelem_nbytes, ep->lxioelem_offset); + break; + + default: + /* We validated the op at io_submit syscall time */ + VERIFY(0); + break; + } + if (ttolwp(curthread)->lwp_errno != 0) + res = -lx_errno(ttolwp(curthread)->lwp_errno, EINVAL); + + ep->lxioelem_res = res; + + releasef(ep->lxioelem_fd); + ep->lxioelem_fd = 0; + ep->lxioelem_fp = NULL; +} + +/* + * The operation has either completed or been cancelled. Finalize the handling + * and move the operation onto the "done" queue. + */ +static void +lx_io_finish_op(lx_io_ctx_t *cp, lx_io_elem_t *ep, boolean_t do_event) +{ + boolean_t do_resfd; + int resfd = 0; + file_t *resfp = NULL; + + if (ep->lxioelem_flags & LX_IOCB_FLAG_RESFD) { + do_resfd = B_TRUE; + resfd = ep->lxioelem_resfd; + resfp = ep->lxioelem_resfp; + } else { + do_resfd = B_FALSE; + } + + ep->lxioelem_flags = 0; + ep->lxioelem_resfd = 0; + ep->lxioelem_resfp = NULL; + + mutex_enter(&cp->lxioctx_d_lock); + list_insert_tail(&cp->lxioctx_done, ep); + cp->lxioctx_done_cnt++; + cv_signal(&cp->lxioctx_done_cv); + mutex_exit(&cp->lxioctx_d_lock); + + /* Update the eventfd if necessary */ + if (do_resfd) { + vnode_t *vp = resfp->f_vnode; + uint64_t val = 1; + + set_active_fd(resfd); + + if (do_event) { + /* + * Eventfd notifications from AIO are special in that + * they are not expected to block. 
This interface allows + * the eventfd value to reach (but not cross) the + * overflow value. + */ + (void) VOP_IOCTL(vp, EVENTFDIOC_POST, (intptr_t)&val, + FKIOCTL, resfp->f_cred, NULL, NULL); + } + + releasef(resfd); + } +} + +/* + * First check if this worker needs to quit due to shutdown or exit. Return + * true in this case. + * + * Then check if our process is forking. In this case it expects all LWPs to be + * stopped first. For the worker threads, a stop equivalent to holdlwp() is + * necessary before the fork can proceed. + * + * It is common to check p_flag outside of p_lock (see issig) and we want to + * avoid making p_lock any hotter since this is called in the worker main loops. + */ +static boolean_t +lx_io_worker_chk_status(lx_io_ctx_t *cp, boolean_t locked) +{ + if (cp->lxioctx_shutdown) + return (B_TRUE); + + if (curproc->p_flag & (SEXITLWPS | SKILLED)) { + cp->lxioctx_shutdown = B_TRUE; + return (B_TRUE); + } + + if (curproc->p_flag & (SHOLDFORK | SHOLDFORK1)) { + if (locked) + mutex_exit(&cp->lxioctx_p_lock); + + mutex_enter(&curproc->p_lock); + stop(PR_SUSPENDED, SUSPEND_NORMAL); + mutex_exit(&curproc->p_lock); + + if (locked) + mutex_enter(&cp->lxioctx_p_lock); + + if (cp->lxioctx_shutdown) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Worker thread - pull work off the pending queue, perform the operation and + * place the result on the done queue. Do this as long as work is pending, then + * wait for more. + */ +static void +lx_io_worker(void *a) +{ + lx_io_ctx_t *cp = (lx_io_ctx_t *)a; + lx_io_elem_t *ep; + + set_active_fd(-1); /* See comment in lx_io_cp_rele */ + + while (!cp->lxioctx_shutdown) { + mutex_enter(&cp->lxioctx_p_lock); + if (list_is_empty(&cp->lxioctx_pending)) { + /* + * This must be cv_wait_sig, as opposed to cv_wait, so + * that pokelwps works correctly on these threads. + * + * The worker threads have all of their signals held, + * so a cv_wait_sig return of 0 here only occurs while + * we're shutting down. 
+ */ + if (cv_wait_sig(&cp->lxioctx_pending_cv, + &cp->lxioctx_p_lock) == 0) + cp->lxioctx_shutdown = B_TRUE; + } + + if (lx_io_worker_chk_status(cp, B_TRUE)) { + mutex_exit(&cp->lxioctx_p_lock); + break; + } + + ep = list_remove_head(&cp->lxioctx_pending); + mutex_exit(&cp->lxioctx_p_lock); + + while (ep != NULL) { + lx_io_do_op(ep); + + lx_io_finish_op(cp, ep, B_TRUE); + + if (lx_io_worker_chk_status(cp, B_FALSE)) + break; + + mutex_enter(&cp->lxioctx_p_lock); + ep = list_remove_head(&cp->lxioctx_pending); + mutex_exit(&cp->lxioctx_p_lock); + } + } + + lx_io_cp_rele(cp); + + ASSERT(curthread->t_lwp != NULL); + mutex_enter(&curproc->p_lock); + lwp_exit(); +} + +/* + * LTP passes -1 for nr_events but we're limited by LX_AIO_MAX_NR anyway. + */ +long +lx_io_setup(uint_t nr_events, void *ctxp) +{ + int i, slot; + proc_t *p = curproc; + lx_proc_data_t *lxpd = ptolxproc(p); + lx_zone_data_t *lxzd = ztolxzd(p->p_zone); + lx_io_ctx_t *cp; + lx_io_elem_t *ep; + uintptr_t cid; + uint_t nworkers; + k_sigset_t hold_set; + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + uintptr32_t cid32; + + if (copyin(ctxp, &cid32, sizeof (cid32)) != 0) + return (set_errno(EFAULT)); + cid = (uintptr_t)cid32; + } else +#endif + if (copyin(ctxp, &cid, sizeof (cid)) != 0) + return (set_errno(EFAULT)); + + /* The cid in user-land must be NULL to start */ + if (cid != (uintptr_t)NULL || nr_events > LX_AIO_MAX_NR) + return (set_errno(EINVAL)); + + mutex_enter(&lxzd->lxzd_lock); + if ((nr_events + lxzd->lxzd_aio_nr) > LX_AIO_MAX_NR) { + mutex_exit(&lxzd->lxzd_lock); + return (set_errno(EAGAIN)); + } + lxzd->lxzd_aio_nr += nr_events; + mutex_exit(&lxzd->lxzd_lock); + + /* Find a free slot */ + mutex_enter(&lxpd->l_io_ctx_lock); + if (lxpd->l_io_ctxs == NULL) { + /* + * First use of aio, allocate a context array and a page + * in our address space to use for context ID handling. 
+ */ + uintptr_t ctxpage; + + ASSERT(lxpd->l_io_ctx_cnt == 0); + ASSERT(lxpd->l_io_ctxpage == (uintptr_t)NULL); + + ttolwp(curthread)->lwp_errno = 0; + ctxpage = (uintptr_t)smmap64(0, PAGESIZE, PROT_READ, + MAP_SHARED | MAP_ANON, -1, 0); + if (ttolwp(curthread)->lwp_errno != 0) { + mutex_exit(&lxpd->l_io_ctx_lock); + return (set_errno(ENOMEM)); + } + + lxpd->l_io_ctxpage = ctxpage; + lxpd->l_io_ctx_cnt = LX_IOCTX_CNT_BASE; + lxpd->l_io_ctxs = kmem_zalloc(lxpd->l_io_ctx_cnt * + sizeof (lx_io_ctx_t *), KM_SLEEP); + slot = 0; + } else { + ASSERT(lxpd->l_io_ctx_cnt > 0); + for (slot = 0; slot < lxpd->l_io_ctx_cnt; slot++) { + if (lxpd->l_io_ctxs[slot] == NULL) + break; + } + + if (slot == lxpd->l_io_ctx_cnt) { + /* Double our context array up to the max. */ + const uint_t new_cnt = lxpd->l_io_ctx_cnt * 2; + const uint_t old_size = lxpd->l_io_ctx_cnt * + sizeof (lx_io_ctx_t *); + const uint_t new_size = new_cnt * + sizeof (lx_io_ctx_t *); + struct lx_io_ctx **old_array = lxpd->l_io_ctxs; + + if (new_cnt > LX_IOCTX_CNT_MAX) { + mutex_exit(&lxpd->l_io_ctx_lock); + mutex_enter(&lxzd->lxzd_lock); + lxzd->lxzd_aio_nr -= nr_events; + mutex_exit(&lxzd->lxzd_lock); + return (set_errno(ENOMEM)); + } + + /* See big theory comment explaining context ID. 
*/ + VERIFY(PAGESIZE >= new_size); + lxpd->l_io_ctxs = kmem_zalloc(new_size, KM_SLEEP); + + bcopy(old_array, lxpd->l_io_ctxs, old_size); + kmem_free(old_array, old_size); + lxpd->l_io_ctx_cnt = new_cnt; + + /* note: 'slot' is now valid in the new array */ + } + } + + cp = kmem_zalloc(sizeof (lx_io_ctx_t), KM_SLEEP); + list_create(&cp->lxioctx_free, sizeof (lx_io_elem_t), + offsetof(lx_io_elem_t, lxioelem_link)); + list_create(&cp->lxioctx_pending, sizeof (lx_io_elem_t), + offsetof(lx_io_elem_t, lxioelem_link)); + list_create(&cp->lxioctx_done, sizeof (lx_io_elem_t), + offsetof(lx_io_elem_t, lxioelem_link)); + mutex_init(&cp->lxioctx_f_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&cp->lxioctx_p_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&cp->lxioctx_d_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&cp->lxioctx_pending_cv, NULL, CV_DEFAULT, NULL); + cv_init(&cp->lxioctx_done_cv, NULL, CV_DEFAULT, NULL); + + /* Add a hold on this context until we're done setting up */ + cp->lxioctx_in_use = 1; + lxpd->l_io_ctxs[slot] = cp; + + cid = CTXID_TO_PTR(lxpd, slot); + + mutex_exit(&lxpd->l_io_ctx_lock); + + /* + * Finish setting up the context. + * + * The context is in the l_io_ctxs array now, so it is potentially + * visible to other threads. However, we have a hold so it cannot be + * destroyed, and both lxioctx_free_cnt and lxioctx_maxn are still 0, + * so nothing can be submitted to this context yet either. + */ + + /* Setup the free list of internal control block elements */ + for (i = 0; i < nr_events; i++) { + ep = kmem_zalloc(sizeof (lx_io_elem_t), KM_SLEEP); + list_insert_head(&cp->lxioctx_free, ep); + } + + /* + * Pre-allocate the worker threads at setup time. + * + * Based on how much concurrent input we may be given, we want enough + * worker threads to get good parallelism but we also want to taper off + * and cap at our upper limit. Our zone's ZFS I/O limit may also come + * into play when we're pumping lots of I/O in parallel. 
+ * + * Note: a possible enhancement here would be to also limit the number + * of worker threads based on the zone's cpu-cap. That is, if the + * cap is low, we might not want too many worker threads. + */ + if (nr_events <= lx_aio_base_workers) { + nworkers = nr_events; + } else { + /* scale up until hit max */ + nworkers = (nr_events / 2) + (lx_aio_base_workers / 2); + if (nworkers > lx_aio_max_workers) + nworkers = lx_aio_max_workers; + } + + sigfillset(&hold_set); + for (i = 0; i < nworkers; i++) { + klwp_t *l; + kthread_t *t; + + /* + * Note that this lwp will not "stop at sys_rtt" as described + * on lwp_create. This lwp will run entirely in the kernel as + * a worker thread serving aio requests. + */ + l = lwp_create(lx_io_worker, (void *)cp, 0, p, TS_STOPPED, + minclsyspri - 1, &hold_set, curthread->t_cid, 0); + if (l == NULL) { + if (i == 0) { + /* + * Uh-oh - we can't create a single worker. + * Release our hold which will cleanup. + */ + cp->lxioctx_shutdown = B_TRUE; + mutex_enter(&lxpd->l_io_ctx_lock); + cp->lxioctx_maxn = nr_events; + mutex_exit(&lxpd->l_io_ctx_lock); + lx_io_cp_rele(cp); + return (set_errno(ENOMEM)); + } else { + /* + * No new lwp but we already have at least 1 + * worker so don't fail entire syscall. + */ + break; + } + } + + atomic_inc_32(&cp->lxioctx_in_use); + + /* + * Mark it as an in-kernel thread, an lx AIO worker LWP, and + * set it running. + */ + t = lwptot(l); + mutex_enter(&curproc->p_lock); + t->t_proc_flag = (t->t_proc_flag & ~TP_HOLDLWP) | TP_KTHREAD; + lwptolxlwp(l)->br_lwp_flags |= BR_AIO_LWP; + lwp_create_done(t); + mutex_exit(&curproc->p_lock); + } + + /* + * io_submit can occur once lxioctx_free_cnt and lxioctx_maxn are + * non-zero. + */ + mutex_enter(&lxpd->l_io_ctx_lock); + cp->lxioctx_maxn = cp->lxioctx_free_cnt = nr_events; + mutex_exit(&lxpd->l_io_ctx_lock); + /* Release our hold, worker thread refs keep ctx alive. 
*/ + lx_io_cp_rele(cp); + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + uintptr32_t cid32 = (uintptr32_t)cid; + + if (copyout(&cid32, ctxp, sizeof (cid32)) != 0) { + /* Since we did a copyin above, this shouldn't fail */ + (void) lx_io_destroy(cid); + return (set_errno(EFAULT)); + } + } else +#endif + if (copyout(&cid, ctxp, sizeof (cid)) != 0) { + /* Since we did a copyin above, this shouldn't fail */ + (void) lx_io_destroy(cid); + return (set_errno(EFAULT)); + } + + return (0); +} + +long +lx_io_submit(lx_aio_context_t cid, const long nr, uintptr_t **bpp) +{ + uint_t i = 0; + int err = 0; + const size_t sz = nr * sizeof (uintptr_t); + lx_io_ctx_t *cp; + lx_io_elem_t *ep; + lx_iocb_t **iocbpp; + + if ((cp = lx_io_cp_hold(cid)) == NULL) + return (set_errno(EINVAL)); + + if (nr == 0) { + lx_io_cp_rele(cp); + return (0); + } + + if (nr < 0 || nr > cp->lxioctx_maxn) { + lx_io_cp_rele(cp); + return (set_errno(EINVAL)); + } + + if (nr > MAX_ALLOC_ON_STACK) { + iocbpp = (lx_iocb_t **)kmem_alloc(sz, KM_NOSLEEP); + if (iocbpp == NULL) { + lx_io_cp_rele(cp); + return (set_errno(EAGAIN)); + } + } else { + iocbpp = (lx_iocb_t **)alloca(sz); + } + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + uintptr32_t *iocbpp32; + + if (copyin(bpp, iocbpp, nr * sizeof (uintptr32_t)) != 0) { + lx_io_cp_rele(cp); + err = EFAULT; + goto out; + } + + /* + * Zero-extend the 32-bit pointers to proper size. This is + * performed "in reverse" so it can be done in-place, rather + * than with an additional translation copy. 
+ */ + iocbpp32 = (uintptr32_t *)iocbpp; + i = nr; + do { + i--; + iocbpp[i] = (lx_iocb_t *)(uintptr_t)iocbpp32[i]; + } while (i != 0); + } else +#endif + if (copyin(bpp, iocbpp, nr * sizeof (uintptr_t)) != 0) { + lx_io_cp_rele(cp); + err = EFAULT; + goto out; + } + + /* We need to return an error if not able to process any of them */ + mutex_enter(&cp->lxioctx_f_lock); + if (cp->lxioctx_free_cnt == 0) { + mutex_exit(&cp->lxioctx_f_lock); + lx_io_cp_rele(cp); + err = EAGAIN; + goto out; + } + mutex_exit(&cp->lxioctx_f_lock); + + for (i = 0; i < nr; i++) { + lx_iocb_t cb; + file_t *fp, *resfp = NULL; + + if (cp->lxioctx_shutdown) + break; + + if (copyin(iocbpp[i], &cb, sizeof (lx_iocb_t)) != 0) { + err = EFAULT; + break; + } + + /* There is only one valid flag */ + if (cb.lxiocb_flags & ~LX_IOCB_FLAG_RESFD) { + err = EINVAL; + break; + } + + switch (cb.lxiocb_op) { + case LX_IOCB_CMD_FSYNC: + case LX_IOCB_CMD_FDSYNC: + case LX_IOCB_CMD_PREAD: + case LX_IOCB_CMD_PWRITE: + break; + + /* + * We don't support asynchronous preadv and pwritev (an + * asynchronous scatter/gather being a somewhat odd + * notion to begin with); we return EINVAL for that + * case, which the caller should be able to deal with. + * We also return EINVAL for LX_IOCB_CMD_NOOP or any + * unrecognized opcode. + */ + default: + err = EINVAL; + break; + } + if (err != 0) + break; + + /* Validate fd */ + if ((fp = getf(cb.lxiocb_fd)) == NULL) { + err = EBADF; + break; + } + + if (cb.lxiocb_op == LX_IOCB_CMD_PREAD && + (fp->f_flag & FREAD) == 0) { + err = EBADF; + releasef(cb.lxiocb_fd); + break; + } else if (cb.lxiocb_op == LX_IOCB_CMD_PWRITE && + (fp->f_flag & FWRITE) == 0) { + err = EBADF; + releasef(cb.lxiocb_fd); + break; + } + + /* + * A character device is a bit complicated. Linux seems to + * accept these on some devices (e.g. /dev/zero) but not + * others (e.g. /proc/self/fd/0). 
This might be related to + * the device being seek-able, but a simple seek-set to the + * current offset will succeed for us on a pty. For now we + * handle this by rejecting the device if it is a stream. + * + * If it is a pipe (VFIFO) or directory (VDIR), we error here + * as does Linux. If it is a socket (VSOCK), it's ok here but + * we will post ESPIPE when processing the I/O CB, as does + * Linux. We also error on our other types: VDOOR, VPROC, + * VPORT, VBAD. + */ + if (fp->f_vnode->v_type == VCHR) { + if (fp->f_vnode->v_stream != NULL) { + err = EINVAL; + releasef(cb.lxiocb_fd); + break; + } + } else if (fp->f_vnode->v_type != VREG && + fp->f_vnode->v_type != VBLK && + fp->f_vnode->v_type != VSOCK) { + err = EINVAL; + releasef(cb.lxiocb_fd); + break; + } + + if (cb.lxiocb_flags & LX_IOCB_FLAG_RESFD) { + if ((resfp = getf(cb.lxiocb_resfd)) == NULL || + !lx_is_eventfd(resfp)) { + err = EINVAL; + releasef(cb.lxiocb_fd); + if (resfp != NULL) + releasef(cb.lxiocb_resfd); + break; + } + } + + mutex_enter(&cp->lxioctx_f_lock); + if (cp->lxioctx_free_cnt == 0) { + mutex_exit(&cp->lxioctx_f_lock); + releasef(cb.lxiocb_fd); + if (cb.lxiocb_flags & LX_IOCB_FLAG_RESFD) { + releasef(cb.lxiocb_resfd); + } + if (i == 0) { + /* + * Another thread used all of the free entries + * after the check preceding this loop. Since + * we did nothing, we must return an error. 
+ */ + err = EAGAIN; + } + break; + } + ep = list_remove_head(&cp->lxioctx_free); + cp->lxioctx_free_cnt--; + ASSERT(ep != NULL); + mutex_exit(&cp->lxioctx_f_lock); + + ep->lxioelem_op = cb.lxiocb_op; + ep->lxioelem_fd = cb.lxiocb_fd; + ep->lxioelem_fp = fp; + ep->lxioelem_buf = (void *)(uintptr_t)cb.lxiocb_buf; + ep->lxioelem_nbytes = cb.lxiocb_nbytes; + ep->lxioelem_offset = cb.lxiocb_offset; + ep->lxioelem_data = cb.lxiocb_data; + ep->lxioelem_cbp = iocbpp[i]; + + /* Hang on to the fp but setup to hand it off to a worker */ + clear_active_fd(cb.lxiocb_fd); + + if (cb.lxiocb_flags & LX_IOCB_FLAG_RESFD) { + ep->lxioelem_flags = LX_IOCB_FLAG_RESFD; + ep->lxioelem_resfd = cb.lxiocb_resfd; + ep->lxioelem_resfp = resfp; + clear_active_fd(cb.lxiocb_resfd); + } + + mutex_enter(&cp->lxioctx_p_lock); + list_insert_tail(&cp->lxioctx_pending, ep); + cv_signal(&cp->lxioctx_pending_cv); + mutex_exit(&cp->lxioctx_p_lock); + } + + lx_io_cp_rele(cp); + +out: + if (nr > MAX_ALLOC_ON_STACK) { + kmem_free(iocbpp, sz); + } + if (i == 0 && err != 0) + return (set_errno(err)); + + return (i); +} + +long +lx_io_getevents(lx_aio_context_t cid, long min_nr, const long nr, + lx_io_event_t *events, timespec_t *timeoutp) +{ + int i; + lx_io_ctx_t *cp; + const size_t sz = nr * sizeof (lx_io_event_t); + timespec_t timeout, *tp; + lx_io_event_t *out; + + if ((cp = lx_io_cp_hold(cid)) == NULL) + return (set_errno(EINVAL)); + + if (min_nr < 0 || min_nr > cp->lxioctx_maxn || + nr < 0 || nr > cp->lxioctx_maxn) { + lx_io_cp_rele(cp); + return (set_errno(EINVAL)); + } + + if (nr == 0) { + lx_io_cp_rele(cp); + return (0); + } + + if (events == NULL) { + lx_io_cp_rele(cp); + return (set_errno(EFAULT)); + } + + if (timeoutp == NULL) { + tp = NULL; + } else { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &timeout, sizeof (timestruc_t))) { + lx_io_cp_rele(cp); + return (EFAULT); + } + } +#ifdef _SYSCALL32_IMPL + else { + timestruc32_t timeout32; + if (copyin(timeoutp, &timeout32, + 
sizeof (timestruc32_t))) { + lx_io_cp_rele(cp); + return (EFAULT); + } + timeout.tv_sec = (time_t)timeout32.tv_sec; + timeout.tv_nsec = timeout32.tv_nsec; + } +#endif + + if (itimerspecfix(&timeout)) { + lx_io_cp_rele(cp); + return (EINVAL); + } + + tp = &timeout; + if (timeout.tv_sec == 0 && timeout.tv_nsec == 0) { + /* + * A timeout of 0:0 is like a poll; we return however + * many events are ready, irrespective of the passed + * min_nr. + */ + min_nr = 0; + } else { + timestruc_t now; + + /* + * We're given a relative time; add it to the current + * time to derive an absolute time. + */ + gethrestime(&now); + timespecadd(tp, &now); + } + } + + out = kmem_zalloc(sz, KM_SLEEP); + + /* + * A min_nr of 0 is like a poll even if given a NULL timeout; we return + * however many events are ready. + */ + if (min_nr > 0) { + mutex_enter(&cp->lxioctx_d_lock); + while (!cp->lxioctx_shutdown && cp->lxioctx_done_cnt < min_nr) { + int r; + + r = cv_waituntil_sig(&cp->lxioctx_done_cv, + &cp->lxioctx_d_lock, tp, timechanged); + if (r < 0) { + /* timeout */ + mutex_exit(&cp->lxioctx_d_lock); + lx_io_cp_rele(cp); + kmem_free(out, sz); + return (0); + } else if (r == 0) { + /* interrupted */ + mutex_exit(&cp->lxioctx_d_lock); + lx_io_cp_rele(cp); + kmem_free(out, sz); + return (set_errno(EINTR)); + } + + /* + * Signalled that something was queued up. Check if + * there are now enough or if we have to wait for more. + */ + } + ASSERT(cp->lxioctx_done_cnt >= min_nr || cp->lxioctx_shutdown); + mutex_exit(&cp->lxioctx_d_lock); + } + + /* + * For each done control block, move it into the Linux event we return. + * As we're doing this, we also moving it from the done list to the + * free list. 
+ */ + for (i = 0; i < nr && !cp->lxioctx_shutdown; i++) { + lx_io_event_t *lxe; + lx_io_elem_t *ep; + + lxe = &out[i]; + + mutex_enter(&cp->lxioctx_d_lock); + if (cp->lxioctx_done_cnt == 0) { + mutex_exit(&cp->lxioctx_d_lock); + break; + } + + ep = list_remove_head(&cp->lxioctx_done); + cp->lxioctx_done_cnt--; + mutex_exit(&cp->lxioctx_d_lock); + + lxe->lxioe_data = ep->lxioelem_data; + lxe->lxioe_object = (uint64_t)(uintptr_t)ep->lxioelem_cbp; + lxe->lxioe_res = ep->lxioelem_res; + lxe->lxioe_res2 = 0; + + /* Put it back on the free list */ + ep->lxioelem_cbp = NULL; + ep->lxioelem_data = 0; + ep->lxioelem_res = 0; + mutex_enter(&cp->lxioctx_f_lock); + list_insert_head(&cp->lxioctx_free, ep); + cp->lxioctx_free_cnt++; + mutex_exit(&cp->lxioctx_f_lock); + } + + lx_io_cp_rele(cp); + + /* + * Note: Linux seems to push the events back into the queue if the + * copyout fails. Since this error is due to an application bug, it + * seems unlikely we need to worry about it, but we can revisit this + * if it is ever seen to be an issue. + */ + if (i > 0 && copyout(out, events, i * sizeof (lx_io_event_t)) != 0) { + kmem_free(out, sz); + return (set_errno(EFAULT)); + } + + kmem_free(out, sz); + return (i); +} + +/* + * Linux never returns 0 from io_cancel. A successful cancellation will return + * EINPROGRESS and the result for the cancelled operation will be available via + * a normal io_getevents call. The third parameter (the "result") to this + * syscall is unused. Note that currently the Linux man pages are incorrect + * about this behavior. Also note that in Linux, only the USB driver currently + * support aio cancellation, so callers will almost always get EINVAL when they + * attempt to cancel an IO on Linux. 
+ */ +/*ARGSUSED*/ +long +lx_io_cancel(lx_aio_context_t cid, lx_iocb_t *iocbp, lx_io_event_t *result) +{ + lx_io_ctx_t *cp; + lx_io_elem_t *ep; + uint32_t buf; + + /* + * The Linux io_cancel copies in a field from the iocb in order to + * locate the matching kernel-internal structure. To appease the LTP + * test case which exercises this, a similar copy is performed here. + * The value read into 'buf' is not otherwise used; only the EFAULT + * for an unreadable iocb pointer matters. + */ + if (copyin(iocbp, &buf, sizeof (buf)) != 0) { + return (set_errno(EFAULT)); + } + + if ((cp = lx_io_cp_hold(cid)) == NULL) + return (set_errno(EINVAL)); + + /* + * Try to pull the CB off the pending list. If it is not found there + * the call fails with EAGAIN below. + */ + mutex_enter(&cp->lxioctx_p_lock); + ep = list_head(&cp->lxioctx_pending); + while (ep != NULL) { + if (ep->lxioelem_cbp == iocbp) { + list_remove(&cp->lxioctx_pending, ep); + break; + } + ep = list_next(&cp->lxioctx_pending, ep); + } + mutex_exit(&cp->lxioctx_p_lock); + + if (ep == NULL) { + lx_io_cp_rele(cp); + return (set_errno(EAGAIN)); + } + + set_active_fd(-1); /* See comment in lx_io_cp_rele */ + set_active_fd(ep->lxioelem_fd); + releasef(ep->lxioelem_fd); + ep->lxioelem_fd = 0; + ep->lxioelem_fp = NULL; + ep->lxioelem_res = -lx_errno(EINTR, EINTR); + + lx_io_finish_op(cp, ep, B_FALSE); + lx_io_cp_rele(cp); + + return (set_errno(EINPROGRESS)); +} + +long +lx_io_destroy(lx_aio_context_t cid) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + lx_io_ctx_t *cp; + int cnt = 0; + + if ((cp = lx_io_cp_hold(cid)) == NULL) + return (set_errno(EINVAL)); + + mutex_enter(&lxpd->l_io_ctx_lock); + cp->lxioctx_shutdown = B_TRUE; + + /* + * Wait for the worker threads and any blocked io_getevents threads to + * exit. We have a hold and our rele will clean up after all other holds + * are released. + */ + ASSERT(cp->lxioctx_in_use >= 1); + while (cp->lxioctx_in_use > 1) { + DTRACE_PROBE2(lx__io__destroy, lx_io_ctx_t *, cp, int, cnt); + cv_broadcast(&cp->lxioctx_pending_cv); + cv_broadcast(&cp->lxioctx_done_cv); + + /* + * Each worker has a hold. We want to let those threads finish + * up and exit.
+ */ + cv_wait(&lxpd->l_io_destroy_cv, &lxpd->l_io_ctx_lock); + cnt++; + } + + mutex_exit(&lxpd->l_io_ctx_lock); + lx_io_cp_rele(cp); + return (0); +} + +/* + * Called at proc fork to clear contexts from child. We don't bother to unmap + * l_io_ctxpage since the vast majority of processes will immediately exec and + * cause an unmapping. If the child does not exec, there will simply be a + * single shared page in its address space, so no additional anonymous memory + * is consumed. + */ +void +lx_io_clear(lx_proc_data_t *cpd) +{ + cpd->l_io_ctxs = NULL; + cpd->l_io_ctx_cnt = 0; + cpd->l_io_ctxpage = (uintptr_t)NULL; +} + +/* + * Called via lx_proc_exit to clean up any existing io context array. All + * worker threads should have already exited by this point, so all contexts + * should already be deleted. + */ +void +lx_io_cleanup(proc_t *p) +{ + lx_proc_data_t *lxpd; + int i; + + mutex_enter(&p->p_lock); + VERIFY((lxpd = ptolxproc(p)) != NULL); + mutex_exit(&p->p_lock); + + mutex_enter(&lxpd->l_io_ctx_lock); + if (lxpd->l_io_ctxs == NULL) { + ASSERT(lxpd->l_io_ctx_cnt == 0); + mutex_exit(&lxpd->l_io_ctx_lock); + return; + } + + ASSERT(lxpd->l_io_ctx_cnt > 0); + for (i = 0; i < lxpd->l_io_ctx_cnt; i++) { + ASSERT(lxpd->l_io_ctxs[i] == NULL); + } + + kmem_free(lxpd->l_io_ctxs, lxpd->l_io_ctx_cnt * sizeof (lx_io_ctx_t *)); + lxpd->l_io_ctxs = NULL; + lxpd->l_io_ctx_cnt = 0; + mutex_exit(&lxpd->l_io_ctx_lock); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_brk.c b/usr/src/uts/common/brand/lx/syscall/lx_brk.c new file mode 100644 index 0000000000..d46e442759 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_brk.c @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/errno.h> + +/* From usr/src/uts/common/os/grow.c */ +extern intptr_t brk(caddr_t); + +long +lx_brk(caddr_t nva) +{ + if (nva != 0) { + (void) brk(nva); + + /* + * Despite claims to the contrary in the man page, when Linux + * brk(2) fails, errno is left unchanged. The result of the + * native brk() is therefore discarded and any errno it set is + * cleared; the caller only sees the (possibly unchanged) + * break returned below. + */ + ttolwp(curthread)->lwp_errno = 0; + } + + /* + * When ASLR was integrated, our internal brk(2) was updated to emit + * the current brk when arg0 == 0. Using the function yields an + * equivalent result to manually calculating the brk, but also + * serializes with changes to the process AS. + */ + return ((long)brk((caddr_t)0)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chmod.c b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c new file mode 100644 index 0000000000..7783b97cb0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL.
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/thread.h> +#include <sys/klwp.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> + +long +lx_vn_chmod(vnode_t *vp, int mode) +{ + vattr_t vattr; + + vattr.va_mode = mode & MODEMASK; + vattr.va_mask = AT_MODE; + + if (vn_is_readonly(vp)) { + return (EROFS); + } + return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)); +} + +static long +lx_fchmodat_wrapper(int fd, char *path, int mode) +{ + long error; + vnode_t *vp; + + if ((error = lx_vp_at(fd, path, &vp, 0)) != 0) { + lx_proc_data_t *pd = ttolxproc(curthread); + + /* + * If the process is in "install mode", return success + * if the operation failed due to an absent file. + */ + if (error == ENOENT && + (pd->l_flags & LX_PROC_INSTALL_MODE)) { + return (0); + } + return (set_errno(error)); + } + + error = lx_vn_chmod(vp, mode); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fchmodat(int fd, char *path, int mode) +{ + return (lx_fchmodat_wrapper(fd, path, mode)); +} + +long +lx_fchmod(int fd, int mode) +{ + file_t *fp; + vnode_t *vp; + long error; + + /* + * In order to do proper O_PATH handling, lx_fchmod cannot leverage + * lx_fchmodat with a NULL path since the desired behavior differs. + * An O_PATH descriptor is refused with EBADF; otherwise a hold is + * taken on the vnode so it remains valid after the releasef().
+ */ + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + if (LX_IS_O_PATH(fp)) { + releasef(fd); + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + + error = lx_vn_chmod(vp, mode); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_chmod(char *path, int mode) +{ + return (lx_fchmodat_wrapper(LX_AT_FDCWD, path, mode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chown.c b/usr/src/uts/common/brand/lx/syscall/lx_chown.c new file mode 100644 index 0000000000..830fba0a73 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_chown.c @@ -0,0 +1,180 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc.
+ */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/zone.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_types.h> + +long +lx_vn_chown(vnode_t *vp, uid_t uid, gid_t gid) +{ + vattr_t vattr; + zone_t *zone = crgetzone(CRED()); + + if ((uid != (uid_t)-1 && !VALID_UID(uid, zone)) || + (gid != (gid_t)-1 && !VALID_GID(gid, zone))) { + return (EINVAL); + } + vattr.va_uid = uid; + vattr.va_gid = gid; + vattr.va_mask = 0; + if (vattr.va_uid != -1) + vattr.va_mask |= AT_UID; + if (vattr.va_gid != -1) + vattr.va_mask |= AT_GID; + + if (vn_is_readonly(vp)) { + return (EROFS); + } + return (VOP_SETATTR(vp, &vattr, 0, CRED(), NULL)); +} + +long +lx_fchownat_wrapper(int fd, char *path, uid_t uid, gid_t gid, int native_flag) +{ + long error; + vnode_t *vp; + + if ((error = lx_vp_at(fd, path, &vp, native_flag)) != 0) { + lx_proc_data_t *pd = ttolxproc(curthread); + + /* + * If the process is in "install mode", return success + * if the operation failed due to an absent file. + */ + if (error == ENOENT && + (pd->l_flags & LX_PROC_INSTALL_MODE)) { + return (0); + } + return (set_errno(error)); + } + + error = lx_vn_chown(vp, uid, gid); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fchown_wrapper(int fd, uid_t uid, gid_t gid) +{ + file_t *fp; + vnode_t *vp; + long error; + + /* + * In order to do proper O_PATH handling, lx_fchown cannot leverage + * lx_fchownat with a NULL path since the desired behavior differs. 
+ */ + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + if (LX_IS_O_PATH(fp)) { + releasef(fd); + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + + error = lx_vn_chown(vp, uid, gid); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag) +{ + int native_flag = 0; + + if (flag & LX_AT_EMPTY_PATH) { + char c; + + /* + * According to fchownat(2), when AT_EMPTY_PATH is set: "if + * path is an empty string, operate on the file referred to by + * fd". We pass NULL in place of the empty string, which + * causes fchownat() to operate on the fd we passed without an + * additional lookup. + */ + if (copyin(path, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } + if (c == '\0') { + path = NULL; + } + + flag &= ~LX_AT_EMPTY_PATH; + } + if (flag & LX_AT_SYMLINK_NOFOLLOW) { + flag &= ~LX_AT_SYMLINK_NOFOLLOW; + native_flag |= AT_SYMLINK_NOFOLLOW; + } + if (flag != 0) { + return (set_errno(EINVAL)); + } + + return (lx_fchownat_wrapper(fd, path, uid, gid, native_flag)); +} + +long +lx_fchown(int fd, uid_t uid, gid_t gid) +{ + return (lx_fchown_wrapper(fd, uid, gid)); +} + +long +lx_lchown(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, + AT_SYMLINK_NOFOLLOW)); +} + +long +lx_chown(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, 0)); +} + +long +lx_fchown16(int fd, lx_uid16_t uid, lx_gid16_t gid) +{ + return (lx_fchown_wrapper(fd, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid))); +} + +long +lx_lchown16(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid), AT_SYMLINK_NOFOLLOW)); +} + +long +lx_chown16(char *path, lx_uid16_t uid, lx_gid16_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid), 0)); +} 
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c new file mode 100644 index 0000000000..4e00e90b1a --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c @@ -0,0 +1,513 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +/* + * [This comment omits the 'LX_' prefix on the clone flag names.] + * + * The vast majority of clone calls result in the creation of a new process or + * a new thread. Both of these map easily from Linux to our native code. For + * these calls, the user-level brand library uses a brand call to hook into the + * lx_helper_clone function for the required in-kernel support. 
+ * + * A fork will typically provide these clone flags: + * CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID + * + * A new thread will use our SHARED_AS macro which has the flags: + * CLONE_FILES | CLONE_FS | CLONE_SIGHAND | CLONE_THREAD | CLONE_VM + * + * In rare cases an application will attempt to use a subset of the SHARED_AS + * flags in order to implement some sharing between two processes without using + * a true thread. Because we do not have native support for this concept, the + * lx brand implements the notion of a 'clone-group'. This is a set of + * processes which share a subset of the allowed SHARED_AS flags. The lx brand + * syscalls implement the appropriate sharing for each flag. A clone-group is + * only instantiated in the rare case that a subset of the SHARED_AS flags are + * used with clone. + * + * The following set of flags could theoretically be supported, although most + * are not implemented at this time. The user-level brand library will validate + * that a supported subset of the flags are being used, or error if not. We + * also re-validate in the kernel. + * + * CLONE_FILES: share the file descriptor table + * CLONE_FS: share the filesystem information (root of the filesystem, the + * CWD, and the umask) + * CLONE_SIGHAND: share the table of signal handlers + * CLONE_THREAD: share the thread group + * CLONE_VM: share the address space + * + * At this time, only those flags defined in CLONE_GRP_SUBSET (CLONE_FS) are + * implemented. + * + * When a clone-group is in use, the lx_proc_data_t`l_clone_grps array will + * hold groups of processes sharing the attributes relevant to the clone flag. + * Each supported flag can have an associated group list in the array. + * + * On the first clone, a new lx_clone_grp_t struct will be created. This struct + * holds a pointer to each process in the group. A reference to that group is + * held in the appropriate slot in l_clone_grps. 
The struct is created for + * the parent process by lx_clone_grp_create() and then the child process will + * associate itself with the group(s) using lx_clone_grp_enter(). + * + * Each syscall acting upon attributes relevant to a clone-group must include + * logic to do so properly. The syscalls will use lx_clone_grp_member() to + * determine if clone-group handling is required, and use lx_clone_grp_walk() + * to walk the list of processes in the group and apply the provided callback + * to each process. + * + * The following example illustrates how a common clone group would be used, + * as processes clone with the same set of CLONE_* flags. + * A clones B with CLONE_FS + * B clones C with CLONE_FS + * When A clones B, a new clone group is created and saved in the LX_CLGRP_FS + * slot in the l_clone_grps array on both A and B. When B clones, since a group + * already exists, C is added to the group and the group is saved in the + * LX_CLGRP_FS slot on C. + * + * The following example illustrates how two common clone groups would be used, + * as processes clone with the same set of CLONE_* flags. + * A clones B with CLONE_FS|CLONE_THREAD + * A new clone group is created and saved in the LX_CLGRP_FS slot in the + * l_clone_grps array on both A and B. A second clone group is created and + * saved in the LX_CLGRP_THREAD slot on both A and B (note that LX_CLGRP_THREAD + * is not implemented at this time). + * + * The following example illustrates how different clone groups would be used, + * as processes clone with different sets of CLONE_* flags. + * A clones B with CLONE_FS + * B clones C with CLONE_THREAD + * C clones D with CLONE_FS + * In this example, only A&B and C&D should share their FS information. B&C + * have to be in two clone groups. When A clones, a new clone group is created + * and saved in the LX_CLGRP_FS slot in the l_clone_grps array on both A and B. 
+ * When B clones, a new clone group is created and saved in the LX_CLGRP_THREAD + * slot on both B and C (note that LX_CLGRP_THREAD is not implemented at this + * time). When C clones, a new clone group is created and saved in the + * LX_CLGRP_FS slot on both C and D. + * + * When a process exits, it removes itself from any groups to which it belongs. + * When the last process exits a group, it is cleaned up. + * + * If clone-groups were commonly used, this implementation would be inefficient + * and unwieldy, but since they are so rare a straightforward list-based + * approach is adequate. + * + * During group creation, the l_clone_grp_lock is first taken to ensure only + * one group is created; otherwise, only the group's lx_clgrp_lock protects the + * list. + * + * Note: Despite the locking, there is still a subtle race that can occur in + * this code. This occurs if a process has two threads and one of them is about + * to execute a clone-group aware syscall (e.g. chdir), while the other thread + * is forking to create a new clone-group. In theory the child process could be + * created, but not yet in the group. The syscall in the first thread could + * thus miss the new process. For example, the first thread might chdir the + * parent, but since the child process was already created, but not yet in the + * clone-group, it would not be chdir-ed. + */ + + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_ldt.h> +#include <sys/lx_misc.h> +#include <lx_signum.h> +#include <lx_syscall.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> + +/* + * We currently only support a single clone-group (CLONE_FS) but the design + * allows for future expansion by expanding the lx_proc_data_t`l_clone_grps + * array.
+ */ +static int +lx_clone_flag2grp(uint_t flag) +{ + if (flag & LX_CLONE_FS) + return (LX_CLGRP_FS); + + return (-1); +} + +/* + * Return the clone-group index for a group flag that is set in *fp, or -1 + * once none remain. + * + * Note: this function has the side effect of clearing the flag it reports, + * which is what lets callers loop until -1 is returned. + */ +static int +lx_clone_flags_iter(uint_t *fp) +{ + if (*fp & LX_CLONE_FS) { + *fp &= ~LX_CLONE_FS; + return (LX_CLGRP_FS); + } + + return (-1); +} + +/* + * Set up the current process in the proper clone-group(s) and record the + * clone-group flags on the lwp so that we can join the child process to the + * group during lx_forklwp(). + */ +void +lx_clone_grp_create(uint_t flags) +{ + int offset; + lx_proc_data_t *plproc = ttolxproc(curthread); + lx_lwp_data_t *ldp = (lx_lwp_data_t *)ttolwp(curthread)->lwp_brand; + lx_clone_grp_t **cgps; + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + + if (!LX_IS_CLONE_GRP(flags)) + return; + + ldp->br_clone_grp_flags = flags & LX_CLONE_GRP_SUBSET; + + cgps = plproc->l_clone_grps; + /* + * We take the top-level mutex during create to ensure we only create + * one group per flag. + */ + mutex_enter(&plproc->l_clone_grp_lock); + while ((offset = lx_clone_flags_iter(&flags)) != -1) { + cgp = cgps[offset]; + + /* + * If we already have a clone-group list for this flag then + * nothing to do. + */ + if (cgp != NULL) + continue; + + /* + * Create a new clone-group. If it ever becomes an issue, we + * could preallocate this memory before taking + * l_clone_grp_lock. The group starts out with the current + * process as its only member.
+ */ + cgp = kmem_alloc(sizeof (lx_clone_grp_t), KM_SLEEP); + mutex_init(&cgp->lx_clgrp_lock, NULL, MUTEX_DEFAULT, NULL); + cgp->lx_clgrp_cnt = 1; + list_create(&cgp->lx_clgrp_members, + sizeof (lx_clone_grp_member_t), + offsetof(lx_clone_grp_member_t, lx_clgrpm_link)); + + mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP); + mp->lx_clgrpm_pp = curproc; + list_insert_tail(&cgp->lx_clgrp_members, mp); + + /* Attach group to our proc */ + plproc->l_clone_grps[offset] = cgp; + } + mutex_exit(&plproc->l_clone_grp_lock); +} + +/* + * Add the child process to the proper parent clone-group(s). + * + * Called from lx_forklwp, thus there is no need to have any locking for the + * destination proc. This is always run in the thread context of the source + * thread, and the destination thread is always newly created and not referred + * to from anywhere else. The source process should have already created the + * clone group(s) that we need to place the child into via lx_clone_grp_create. + */ +void +lx_clone_grp_enter(uint_t flags, proc_t *srcp, proc_t *dstp) +{ + int offset; + lx_proc_data_t *plproc = ptolxproc(srcp); + lx_proc_data_t *clproc = ptolxproc(dstp); + lx_clone_grp_t **cgps; + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + + cgps = plproc->l_clone_grps; + while ((offset = lx_clone_flags_iter(&flags)) != -1) { + cgp = cgps[offset]; + + /* + * Parent should already have a clone-group list for this flag. + * The child joins that group. + */ + VERIFY(cgp != NULL); + + mp = kmem_zalloc(sizeof (lx_clone_grp_member_t), KM_SLEEP); + mp->lx_clgrpm_pp = dstp; + + mutex_enter(&cgp->lx_clgrp_lock); + list_insert_tail(&cgp->lx_clgrp_members, mp); + cgp->lx_clgrp_cnt++; + clproc->l_clone_grps[offset] = cgp; + mutex_exit(&cgp->lx_clgrp_lock); + } +} + +/* + * The process is exiting or we're exec-ing a native app. In the unlikely event + * it is in a clone-group, remove it from the group and perform any necessary + * cleanup.
Normally we're called from lx_proc_exit(), so we know we're the + * last lwp in the process, but we can also be called from lx_clearbrand() when + * exec-ing a native application. In this case we know the lwp(s) are stopped + * (It is possible to have multiple lwps if we branded the process but the + * exec failed. Those lwps were just branded as part of the exec, and will + * be de-branded). + */ +void +lx_clone_grp_exit(proc_t *p, boolean_t lwps_ok) +{ + int i; + lx_proc_data_t *plproc = ptolxproc(p); + lx_clone_grp_t **cgps; + + ASSERT(!MUTEX_HELD(&p->p_lock)); + ASSERT(plproc != NULL); + + if (!lwps_ok) + VERIFY(p->p_lwpcnt <= 1); + + cgps = plproc->l_clone_grps; + for (i = 0; i < LX_CLGRP_MAX; i++) { + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + boolean_t found; + + cgp = cgps[i]; + if (cgp == NULL) + continue; + + /* + * The rare case when this process belongs to a clone-group. + */ + + mutex_enter(&cgp->lx_clgrp_lock); + + /* First remove ourselves from the group. */ + found = B_FALSE; + mp = list_head(&cgp->lx_clgrp_members); + while (mp != NULL) { + if (mp->lx_clgrpm_pp == p) { + found = B_TRUE; + list_remove(&cgp->lx_clgrp_members, mp); + kmem_free(mp, sizeof (lx_clone_grp_member_t)); + ASSERT(cgp->lx_clgrp_cnt > 0); + cgp->lx_clgrp_cnt--; + plproc->l_clone_grps[i] = NULL; + break; + } + mp = list_next(&cgp->lx_clgrp_members, mp); + } + VERIFY(found); + + if (cgp->lx_clgrp_cnt > 0) { + mutex_exit(&cgp->lx_clgrp_lock); + continue; + } + + /* + * cgp->lx_clgrp_cnt == 0 + * + * We're the sole remaining member; finish cleanup now. + */ + ASSERT(plproc->l_clone_grps[i] == NULL); + mutex_exit(&cgp->lx_clgrp_lock); + + /* Delete the group since there are no more references to it.
*/ + VERIFY(list_is_empty(&cgp->lx_clgrp_members)); + + list_destroy(&cgp->lx_clgrp_members); + mutex_destroy(&cgp->lx_clgrp_lock); + kmem_free(cgp, sizeof (lx_clone_grp_t)); + } +} + +/* + * Return true in the rare case that the process is a member of a clone group + * with the specific flag set. A slot in the array is only set when a group + * is attached (lx_clone_grp_create, lx_clone_grp_enter) and only cleared + * again by lx_clone_grp_exit, so we don't need to take l_clone_grp_lock. + */ +boolean_t +lx_clone_grp_member(lx_proc_data_t *dp, uint_t flag) +{ + int offset; + + if ((offset = lx_clone_flag2grp(flag)) == -1) + return (B_FALSE); + + if (dp->l_clone_grps[offset] != NULL) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Walk all of the processes in the clone-group list and apply the callback + * to each. Because we're holding the group list lock (lx_clgrp_lock) none of + * the processes can exit, but that is the only locking guarantee made by this + * function itself. + * + * Returns 0 if every callback returned 0, otherwise the first non-zero + * callback result; the remaining members are still visited. + */ +int +lx_clone_grp_walk(lx_proc_data_t *dp, uint_t flag, int (*cb)(proc_t *, void *), + void *arg) +{ + int offset; + lx_clone_grp_t *cgp; + lx_clone_grp_member_t *mp; + int res, rv = 0; + + + ASSERT(dp != NULL); + /* We should not be called unless we belong to a group */ + VERIFY((offset = lx_clone_flag2grp(flag)) != -1); + VERIFY(dp->l_clone_grps[offset] != NULL); + + cgp = dp->l_clone_grps[offset]; + mutex_enter(&cgp->lx_clgrp_lock); + + mp = list_head(&cgp->lx_clgrp_members); + while (mp != NULL) { + res = cb(mp->lx_clgrpm_pp, arg); + /* return the first error we see, but try all procs */ + if (res != 0 && rv == 0) + rv = res; + mp = list_next(&cgp->lx_clgrp_members, mp); + } + + mutex_exit(&cgp->lx_clgrp_lock); + + return (rv); +} + + +/* + * Our lwp has already been created at this point, so this routine is + * responsible for setting up all the state needed to track this as a + * linux cloned thread.
+ */ +/* ARGSUSED */ +int +lx_helper_clone(int64_t *rval, int flags, void *ptidp, void *tls, void *ctidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + struct lx_proc_data *lproc = ttolxproc(curthread); + struct ldt_info info; + struct user_desc descr; + int tls_index; + int entry = -1; + int signo; + + signo = flags & LX_CSIGNAL; + if (signo < 0 || signo > LX_NSIG) + return (set_errno(EINVAL)); + + if (!(flags & LX_CLONE_THREAD)) { + lproc->l_signal = signo; + } else { + if (flags & LX_CLONE_SETTLS) { + if (get_udatamodel() == DATAMODEL_ILP32) { + if (copyin((caddr_t)tls, &info, sizeof (info))) + return (set_errno(EFAULT)); + + if (LDT_INFO_EMPTY(&info)) + return (set_errno(EINVAL)); + + entry = info.entry_number; + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + tls_index = entry - GDT_TLSMIN; + + /* + * Convert the user-space structure into a real + * x86 descriptor and copy it into this LWP's + * TLS array. We also load it into the GDT; + * 'entry' is left set so that the failure + * paths below can lx_clear_gdt() it again. + */ + LDT_INFO_TO_DESC(&info, &descr); + bcopy(&descr, &lwpd->br_tls[tls_index], + sizeof (descr)); + lx_set_gdt(entry, &lwpd->br_tls[tls_index]); + } else { + /* + * Set the Linux %fsbase for this LWP. We will + * restore it the next time we return to Linux + * via setcontext()/lx_restorecontext(). + */ + lwpd->br_lx_fsbase = (uintptr_t)tls; + } + } + + lwpd->br_clear_ctidp = + (flags & LX_CLONE_CHILD_CLEARTID) ? ctidp : NULL; + + if (signo && !
(flags & LX_CLONE_DETACH)) + lwpd->br_signal = signo; + else + lwpd->br_signal = 0; + + if (flags & LX_CLONE_THREAD) + lwpd->br_tgid = curthread->t_procp->p_pid; + + if (flags & LX_CLONE_PARENT) + lwpd->br_ppid = 0; + + if ((flags & LX_CLONE_CHILD_SETTID) && (ctidp != NULL) && + (suword32(ctidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + if ((flags & LX_CLONE_PARENT_SETTID) && (ptidp != NULL) && + (suword32(ptidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + } + + *rval = lwpd->br_pid; + return (0); +} + +long +lx_set_tid_address(int *tidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + long rv; + + lwpd->br_clear_ctidp = tidp; + + if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) { + rv = 1; + } else { + rv = lwpd->br_pid; + } + + return (rv); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_close.c b/usr/src/uts/common/brand/lx/syscall/lx_close.c new file mode 100644 index 0000000000..5d1a1605c1 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_close.c @@ -0,0 +1,30 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc.
+ */ + +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/brand.h> + +#include <sys/lx_brand.h> +#include <sys/lx_syscalls.h> + + +extern int close(int); + +long +lx_close(int fdes) +{ + return (close(fdes)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_cpu.c b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c new file mode 100644 index 0000000000..b0a92394dc --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c @@ -0,0 +1,36 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> + +/* + * We support neither the second argument (NUMA node), nor the third (obsolete + * pre-2.6.24 caching functionality which was ultimately broken). + */ +/* ARGSUSED1 */ +long +lx_getcpu(unsigned int *cpu, uintptr_t p2, uintptr_t p3) +{ + unsigned int curcpu = curthread->t_cpu->cpu_id; + + if (copyout(&curcpu, cpu, sizeof (curcpu)) != 0) + return (set_errno(EFAULT)); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_dup.c b/usr/src/uts/common/brand/lx/syscall/lx_dup.c new file mode 100644 index 0000000000..d0f513753c --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_dup.c @@ -0,0 +1,53 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> + +/* From usr/src/uts/common/syscall/fcntl.c */ +extern int fcntl(int, int, intptr_t); + +long +lx_dup(int fd) +{ + return (fcntl(fd, F_DUPFD, 0)); +} + +long +lx_dup2(int oldfd, int newfd) +{ + return (fcntl(oldfd, F_DUP2FD, newfd)); +} + +long +lx_dup3(int oldfd, int newfd, int flags) +{ + int rc; + + /* The only valid flag is O_CLOEXEC. */ + if (flags & ~LX_O_CLOEXEC) + return (set_errno(EINVAL)); + + /* + * dup3() must fail with EINVAL when both fds are the same, whatever + * the flags. Per the original note here only F_DUP2FD_CLOEXEC + * reports that natively, so the check is made explicitly. + */ + if (oldfd == newfd) + return (set_errno(EINVAL)); + + rc = fcntl(oldfd, (flags == 0) ? F_DUP2FD : F_DUP2FD_CLOEXEC, newfd); + return (rc); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_epoll.c b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c new file mode 100644 index 0000000000..47688dad6a --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_epoll.c @@ -0,0 +1,303 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/zone.h>
+#include <sys/brand.h>
+#include <sys/epoll.h>
+#include <sys/devpoll.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/vnode.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_types.h>
+#include <sys/lx_signal.h>
+
+static major_t devpoll_major = 0;	/* recorded by lx_epoll_create1() */
+/*
+ * Return B_TRUE when fp refers to a character device whose major number
+ * equals devpoll_major (the value recorded by lx_epoll_create1()), and
+ * B_FALSE otherwise.
+ */
+static boolean_t
+lx_epoll_isvalid(file_t *fp)
+{
+	vnode_t *vp = fp->f_vnode;
+
+	if (vp->v_type == VCHR && getmajor(vp->v_rdev) == devpoll_major)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+/*
+ * Linux epoll_create1(2).  Allocates a file_t and descriptor, looks up and
+ * opens the poll pseudo-device, issues the DP_EPOLLCOMPAT ioctl on it, and
+ * then installs the vnode in the new descriptor.  EPOLL_CLOEXEC is the only
+ * flag accepted; any other bit yields EINVAL.
+ *
+ * Returns the new descriptor, or sets errno: EMFILE if falloc() fails,
+ * ENOENT if the device cannot be found by name, or the error reported by
+ * VOP_OPEN/VOP_IOCTL.
+ */
+long
+lx_epoll_create1(int flags)
+{
+	int err, fd, rv;
+	int fmode = FREAD | FWRITE;
+	boolean_t cloexec = B_FALSE;
+	vnode_t *vp = NULL;
+	file_t *fp = NULL;
+
+	if (flags & EPOLL_CLOEXEC) {
+		cloexec = B_TRUE;
+		flags &= ~EPOLL_CLOEXEC;
+	}
+	if (flags != 0) {
+		/* No other flags accepted at this time */
+		return (set_errno(EINVAL));
+	}
+
+	if (falloc((vnode_t *)NULL, fmode, &fp, &fd) != 0) {
+		err = EMFILE;
+		goto error;
+	}
+	if (ldi_vp_from_name("/devices/pseudo/poll@0:poll", &vp) != 0) {
+		err = ENOENT;
+		goto error;
+	}
+	if ((err = VOP_OPEN(&vp, fmode | FKLYR, CRED(), NULL)) != 0) {
+		goto error;
+	}
+	err = VOP_IOCTL(vp, DP_EPOLLCOMPAT, 0, fmode, CRED(), &rv, NULL);
+	if (err != 0) {
+		(void) VOP_CLOSE(vp, fmode, 0, 0, CRED(), NULL);
+		goto error;
+	}
+
+	devpoll_major = getmajor(vp->v_rdev);	/* for lx_epoll_isvalid() */
+
+	fp->f_vnode = vp;
+	mutex_exit(&fp->f_tlock);
+	setf(fd, fp);
+	if (cloexec) {
+		f_setfd(fd, FD_CLOEXEC);
+	}
+	return (fd);
+
+error:	/* undo whichever of the file_t/fd and vnode hold were obtained */
+	if (fp != NULL) {
+		setf(fd, NULL);
+		unfalloc(fp);
+	}
+	if (vp != NULL) {
+		VN_RELE(vp);
+	}
+	return (set_errno(err));
+}
+/*
+ * Linux epoll_create(2).  The size argument is only checked for being
+ * positive (EINVAL otherwise); the work is done by lx_epoll_create1(0).
+ */
+long
+lx_epoll_create(int size)
+{
+	if (size <= 0) {
+		return (set_errno(EINVAL));
+	}
+
+	return (lx_epoll_create1(0));
+}
+
+
+/*
+ * Match values from libc implementation
+ *
+ * EPOLLIGNORED are event bits that lx_epoll_ctl() drops.  EPOLLSWIZZLED are
+ * event bits that lx_epoll_ctl() strips and then re-adds as their POLL*
+ * counterparts.  EPOLL_TIMEOUT_CLAMP maps any timeout below -1 to -1.
+ */
+#define	EPOLLIGNORED	(EPOLLMSG | EPOLLWAKEUP)
+#define	EPOLLSWIZZLED	\
+	(EPOLLRDHUP | EPOLLONESHOT | EPOLLET | EPOLLWRBAND | EPOLLWRNORM)
+#define	EPOLL_TIMEOUT_CLAMP(t)	(((t) < -1) ? -1 : (t))
+/*
+ * Linux epoll_ctl(2).  Translates the operation into one or two
+ * dvpoll_epollfd_t entries and writes them to the epoll descriptor with
+ * VOP_WRITE:
+ *   EPOLL_CTL_DEL - a single POLLREMOVE entry for pfd
+ *   EPOLL_CTL_MOD - a POLLREMOVE entry followed by a fresh add entry
+ *   EPOLL_CTL_ADD - a single add entry built from the user's epoll_event
+ *
+ * Returns 0 on success.  Sets EFAULT if the event cannot be copied in,
+ * EINVAL for an unknown op or a descriptor that is not an epoll descriptor,
+ * EBADF for an invalid fd, and otherwise the VOP_WRITE error mapped by the
+ * switch at the end of the function.
+ */
+long
+lx_epoll_ctl(int fd, int op, int pfd, void *event)
+{
+	epoll_event_t epevent;
+	dvpoll_epollfd_t dpevent[2];	/* two entries are needed for MOD */
+	file_t *fp;
+	iovec_t aiov;
+	uio_t auio;
+	uint32_t events, ev = 0;
+	int error = 0, i = 0;	/* i indexes the last dpevent entry in use */
+
+	dpevent[i].dpep_pollfd.fd = pfd;
+	switch (op) {
+	case EPOLL_CTL_DEL:
+		dpevent[i].dpep_pollfd.events = POLLREMOVE;
+		break;
+
+	case EPOLL_CTL_MOD:
+		/*
+		 * In the modify case, we pass down two events: one to
+		 * remove the event and another to add it back.
+		 */
+		dpevent[i++].dpep_pollfd.events = POLLREMOVE;
+		dpevent[i].dpep_pollfd.fd = pfd;
+		/* FALLTHROUGH */
+
+	case EPOLL_CTL_ADD:
+		if (copyin(event, &epevent, sizeof (epevent)) != 0)
+			return (set_errno(EFAULT));
+
+		/*
+		 * Mask off the events that we ignore, and then swizzle the
+		 * events for which our values differ from their epoll(7)
+		 * equivalents.
+		 */
+		events = epevent.events;
+		ev = events & ~(EPOLLIGNORED | EPOLLSWIZZLED);
+
+		if (events & EPOLLRDHUP)
+			ev |= POLLRDHUP;
+		if (events & EPOLLET)
+			ev |= POLLET;
+		if (events & EPOLLONESHOT)
+			ev |= POLLONESHOT;
+		if (events & EPOLLWRNORM)
+			ev |= POLLWRNORM;
+		if (events & EPOLLWRBAND)
+			ev |= POLLWRBAND;
+
+		dpevent[i].dpep_data = epevent.data.u64;
+		dpevent[i].dpep_pollfd.events = ev;
+		break;
+
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	} else if (!lx_epoll_isvalid(fp)) {
+		releasef(fd);
+		return (set_errno(EINVAL));
+	}
+
+	aiov.iov_base = (void *)dpevent;
+	aiov.iov_len = sizeof (dvpoll_epollfd_t) * (i + 1);
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_resid = aiov.iov_len;
+	auio.uio_segflg = UIO_SYSSPACE;	/* dpevent lives on the kernel stack */
+	auio.uio_loffset = 0;
+	auio.uio_fmode = fp->f_flag;
+
+	error = VOP_WRITE(fp->f_vnode, &auio, 1, fp->f_cred, NULL);
+
+	releasef(fd);
+
+	switch (error) {
+	case 0:
+		return (0);
+
+	case EBADF:
+	case EEXIST:
+	case EINVAL:
+	case ENOENT:
+	case ENOMEM:
+	case ENOSPC:
+	case EPERM:
+		/*
+		 * Legal errors should pass straight through.
+		 */
+		return (set_errno(error));
+
+	case ELOOP:
+		/*
+		 * In the case of descriptor loops, /dev/poll emits a more
+		 * descriptive error than Linux epoll consumers would expect.
+		 */
+		return (set_errno(EINVAL));
+
+	default:
+		/*
+		 * While devpoll itself should not emit unexpected errors, it
+		 * is possible that a VOP_POLL handler might. There is little
+		 * choice but to map these unexpected errors to something which
+		 * is valid for epoll_ctl.
+		 */
+		return (set_errno(ENOMEM));
+	}
+}
+/*
+ * Linux epoll_wait(2).  Validates maxevents (> 0, else EINVAL) and the epoll
+ * descriptor (EBADF / EINVAL), then issues a DP_POLL ioctl with the user's
+ * events buffer as dp_fds and the timeout clamped by EPOLL_TIMEOUT_CLAMP.
+ * Returns the value the ioctl stores in rv, or sets errno to the ioctl error.
+ */
+long
+lx_epoll_wait(int fd, void *events, int maxevents, int timeout)
+{
+	struct dvpoll arg;
+	file_t *fp;
+	int rv = 0, error, flag;
+
+	if (maxevents <= 0) {
+		return (set_errno(EINVAL));
+	}
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	} else if (!lx_epoll_isvalid(fp)) {
+		releasef(fd);
+		return (set_errno(EINVAL));
+	}
+
+	arg.dp_nfds = maxevents;
+	arg.dp_timeout = EPOLL_TIMEOUT_CLAMP(timeout);
+	arg.dp_fds = (pollfd_t *)events;
+	flag = fp->f_flag | DATAMODEL_NATIVE | FKIOCTL;	/* arg is a kernel address */
+	error = VOP_IOCTL(fp->f_vnode, DP_POLL, (uintptr_t)&arg, flag,
+	    fp->f_cred, &rv, NULL);
+
+	releasef(fd);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (rv);
+}
+/*
+ * Linux epoll_pwait(2).  Same as lx_epoll_wait(), except that a non-NULL
+ * sigmask is copied in (EFAULT on failure), converted with lx_ltos_sigset()
+ * and handed to the DP_PPOLL ioctl through dp_setp; a NULL sigmask passes a
+ * NULL dp_setp.
+ */
+long
+lx_epoll_pwait(int fd, void *events, int maxevents, int timeout, void *sigmask)
+{
+	struct dvpoll arg;
+	file_t *fp;
+	int rv = 0, error, flag;
+	k_sigset_t ksig;
+
+	if (maxevents <= 0) {
+		return (set_errno(EINVAL));
+	}
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	} else if (!lx_epoll_isvalid(fp)) {
+		releasef(fd);
+		return (set_errno(EINVAL));
+	}
+	if (sigmask != NULL) {
+		lx_sigset_t lsig;
+
+		if (copyin(sigmask, &lsig, sizeof (lsig)) != 0) {
+			releasef(fd);
+			return (set_errno(EFAULT));
+		}
+		lx_ltos_sigset(&lsig, &ksig);
+		arg.dp_setp = (sigset_t *)&ksig;
+	} else {
+		arg.dp_setp = NULL;
+	}
+
+	arg.dp_nfds = maxevents;
+	arg.dp_timeout = EPOLL_TIMEOUT_CLAMP(timeout);
+	arg.dp_fds = (pollfd_t *)events;
+	flag = fp->f_flag | DATAMODEL_NATIVE | FKIOCTL;	/* arg is a kernel address */
+	error = VOP_IOCTL(fp->f_vnode, DP_PPOLL, (uintptr_t)&arg, flag,
+	    fp->f_cred, &rv, NULL);
+
+	releasef(fd);
+	if (error != 0) {
+		return (set_errno(error));
+	}
+	return (rv);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_eventfd.c b/usr/src/uts/common/brand/lx/syscall/lx_eventfd.c
new file mode 100644
index 0000000000..21205aa18a
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_eventfd.c
@@ -0,0 +1,126 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/vnode.h>
+#include <sys/eventfd.h>
+
+static major_t eventfd_major = 0;	/* recorded by lx_eventfd2() */
+
+/*
+ * io_submit uses this to validate control block eventfd descriptors
+ *
+ * Returns B_TRUE when fp is a character device whose major number equals
+ * eventfd_major, B_FALSE otherwise.
+ */
+boolean_t
+lx_is_eventfd(file_t *fp)
+{
+	vnode_t *vp = fp->f_vnode;
+
+	if (vp->v_type == VCHR && getmajor(vp->v_rdev) == eventfd_major)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+/*
+ * Linux eventfd2(2).  Allocates a file_t and descriptor, opens /dev/eventfd,
+ * optionally switches it to semaphore mode (EFD_SEMAPHORE), writes a non-zero
+ * initial value as a 64-bit counter, and installs the vnode in the new
+ * descriptor.  EFD_NONBLOCK adds FNONBLOCK to the open mode and EFD_CLOEXEC
+ * sets FD_CLOEXEC on the descriptor.
+ *
+ * Returns the new descriptor, or sets errno: EINVAL for unknown flags,
+ * EMFILE if falloc() fails, ENOTSUP if the device is unavailable, or the
+ * error from VOP_OPEN/VOP_IOCTL/VOP_WRITE.
+ */
+long
+lx_eventfd2(uint_t initval, int flags)
+{
+	int err, fd;
+	int fmode = FREAD | FWRITE;
+	vnode_t *vp = NULL;
+	file_t *fp = NULL;
+
+	if (flags & ~(EFD_NONBLOCK | EFD_CLOEXEC | EFD_SEMAPHORE))
+		return (set_errno(EINVAL));
+
+	if (flags & EFD_NONBLOCK)
+		fmode |= FNONBLOCK;
+
+	if (falloc((vnode_t *)NULL, fmode, &fp, &fd) != 0)
+		return (set_errno(EMFILE));
+
+	if (ldi_vp_from_name("/dev/eventfd", &vp) != 0) {
+		/*
+		 * If /dev/eventfd is not available then it is less jarring to
+		 * Linux programs to tell them that the system call is not
+		 * supported instead of reporting an error (ENOENT) they are
+		 * not expecting.
+		 */
+		err = ENOTSUP;
+		goto error;
+	}
+	if ((err = VOP_OPEN(&vp, fmode | FKLYR, CRED(), NULL)) != 0) {
+		VN_RELE(vp);
+		vp = NULL;	/* keeps the error path from closing an unopened vnode */
+		goto error;
+	}
+
+	if (flags & EFD_SEMAPHORE) {
+		int rv;
+
+		if ((err = VOP_IOCTL(vp, EVENTFDIOC_SEMAPHORE, 0, fmode, CRED(),
+		    &rv, NULL)) != 0)
+			goto error;
+	}
+
+	if (initval != 0) {
+		uint64_t val = initval;
+		struct uio auio;
+		struct iovec aiov;
+
+		/* write initial value */
+		aiov.iov_base = (caddr_t)&val;
+		aiov.iov_len = sizeof (val);
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_loffset = 0;
+		auio.uio_offset = 0;
+		auio.uio_resid = sizeof (val);
+		auio.uio_segflg = UIO_SYSSPACE;	/* val is a kernel stack variable */
+		auio.uio_fmode = FWRITE;
+
+		if ((err = VOP_WRITE(vp, &auio, FWRITE, CRED(), NULL)) != 0)
+			goto error;
+	}
+
+	eventfd_major = getmajor(vp->v_rdev);	/* for lx_is_eventfd() */
+
+	fp->f_vnode = vp;
+	mutex_exit(&fp->f_tlock);
+	setf(fd, fp);
+	if (flags & EFD_CLOEXEC) {
+		f_setfd(fd, FD_CLOEXEC);
+	}
+	return (fd);
+
+error:	/* a non-NULL vp here has been opened successfully */
+	if (fp != NULL) {
+		setf(fd, NULL);
+		unfalloc(fp);
+	}
+	if (vp != NULL) {
+		(void) VOP_CLOSE(vp, fmode, 0, 0, CRED(), NULL);
+		VN_RELE(vp);
+	}
+	return (set_errno(err));
+}
+/*
+ * Linux eventfd(2): lx_eventfd2() with no flags.
+ */
+long
+lx_eventfd(uint_t val)
+{
+	return (lx_eventfd2(val, 0));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fadvise.c b/usr/src/uts/common/brand/lx/syscall/lx_fadvise.c
new file mode 100644
index 0000000000..61f9b936f2
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_fadvise.c
@@ -0,0 +1,103 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */ + +#include <sys/fcntl.h> +#include <sys/lx_misc.h> + +/* + * Based on illumos posix_fadvise which does nothing. The only difference is + * that on Linux an fd refering to a pipe or FIFO returns EINVAL. The Linux + * POSIX_FADV_* values are the same as the illumos values. See how the 32-bit + * glibc calls fadvise64; the offeset is a 64-bit value, but the length is not. + * fadvise64_64 passes both the offset and length as 64-bit values. The 64-bit + * fadvise64 caller always passes 64-bit values for the offset and length. + */ + +/* + * This is the fadvise64 function used by 64-bit callers, and by 32-bit callers + * after they have adjusted their arguments. + */ +/* ARGSUSED */ +int +lx_fadvise64(int fd, off64_t offset, off64_t len, int advice) +{ + file_t *fp; + boolean_t is_fifo; + + switch (advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + case POSIX_FADV_WILLNEED: + case POSIX_FADV_DONTNEED: + case POSIX_FADV_NOREUSE: + break; + default: + return (set_errno(EINVAL)); + } + + if (len < 0) + return (set_errno(EINVAL)); + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + is_fifo = (fp->f_vnode->v_type == VFIFO); + releasef(fd); + + if (is_fifo) + return (set_errno(ESPIPE)); + + return (0); +} + +/* + * This is the fadvise64 function used by 32-bit callers. Linux passes the + * 64-bit offset by concatenating consecutive arguments. We must perform the + * same conversion here. + */ +long +lx_fadvise64_32(int fd, uint32_t off_lo, uint32_t off_hi, int32_t len, + int advice) +{ + off64_t offset; + + offset = off_hi; + offset = offset << 32; + offset |= off_lo; + + return (lx_fadvise64(fd, offset, (off64_t)len, advice)); +} + +/* + * This function is only used by 32-bit callers. Linux passes the 64-bit offset + * and length by concatenating consecutive arguments. We must perform the same + * conversion here. 
+ */ +long +lx_fadvise64_64(int fd, uint32_t off_lo, uint32_t off_hi, uint32_t len_lo, + uint32_t len_hi, int advice) +{ + off64_t offset; + off64_t len; + + offset = off_hi; + offset = offset << 32; + offset |= off_lo; + len = len_hi; + len = len << 32; + len |= len_lo; + + return (lx_fadvise64(fd, offset, len, advice)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c new file mode 100644 index 0000000000..338e4399fe --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_fallocate.c @@ -0,0 +1,251 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */
+
+#include <sys/systm.h>
+#include <sys/zone.h>
+#include <sys/types.h>
+#include <sys/filio.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <sys/nbmlock.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+#include <sys/sdt.h>
+
+extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
+
+#define	LX_FALLOC_FL_KEEP_SIZE		0x01
+#define	LX_FALLOC_FL_PUNCH_HOLE		0x02
+#define	LX_FALLOC_FL_NO_HIDE_STALE	0x04
+#define	LX_FALLOC_FL_COLLAPSE_RANGE	0x08
+#define	LX_FALLOC_FL_ZERO_RANGE		0x10
+
+#define	LX_FALLOC_VALID	(LX_FALLOC_FL_KEEP_SIZE | LX_FALLOC_FL_PUNCH_HOLE | \
+	LX_FALLOC_FL_NO_HIDE_STALE | LX_FALLOC_FL_COLLAPSE_RANGE | \
+	LX_FALLOC_FL_ZERO_RANGE)
+
+#define	LX_FALLOC_UNSUPP	(LX_FALLOC_FL_NO_HIDE_STALE | \
+	LX_FALLOC_FL_COLLAPSE_RANGE)
+/*
+ * Linux fallocate(2), implemented on top of VOP_SPACE(F_FREESP).
+ *
+ * mode handling, as coded below:
+ *   LX_FALLOC_FL_KEEP_SIZE alone  - validated, then nothing is done
+ *   0                             - extend the file to offset + len when it
+ *                                   is currently shorter
+ *   LX_FALLOC_FL_PUNCH_HOLE       - requires KEEP_SIZE; range is passed to
+ *                                   F_FREESP, clipped to the current size
+ *   LX_FALLOC_FL_ZERO_RANGE       - range is passed to F_FREESP, clipped to
+ *                                   the current size only when KEEP_SIZE is set
+ *   any LX_FALLOC_UNSUPP bit      - EOPNOTSUPP
+ *
+ * Returns 0 on success.  Errors set here are EFBIG, EOPNOTSUPP, EBADF,
+ * EINVAL and EACCES (conflicting mandatory lock); errors from VOP_GETATTR,
+ * flock_check and VOP_SPACE are passed through.
+ *
+ * NOTE(review): tot = offset + len is computed before offset and len are
+ * checked for being negative - confirm overflow of that sum is acceptable.
+ */
+long
+lx_fallocate(int fd, int mode, off_t offset, off_t len)
+{
+	int error = 0;
+	file_t *fp;
+	vnode_t *vp;
+	int64_t tot;
+	struct flock64 bf;
+	vattr_t vattr;
+	u_offset_t f_offset;
+	boolean_t in_crit = B_FALSE;	/* set once nbl_start_crit() is held */
+
+	/*
+	 * Error checking is in a specific order to make LTP happy.
+	 */
+
+	tot = offset + len;
+	if (tot > (LLONG_MAX / (int64_t)1024))
+		return (set_errno(EFBIG));
+
+	if (mode & LX_FALLOC_UNSUPP)
+		return (set_errno(EOPNOTSUPP));
+
+	if ((fp = getf(fd)) == NULL)
+		return (set_errno(EBADF));
+
+	if ((fp->f_flag & FWRITE) == 0) {
+		error = EBADF;
+		goto done;
+	}
+
+	vp = fp->f_vnode;
+	if (vp->v_type != VREG) {
+		error = EINVAL;
+		goto done;
+	}
+
+	if (offset < 0 || len <= 0) {
+		error = EINVAL;
+		goto done;
+	}
+
+	if (tot < 0LL) {
+		error = EFBIG;
+		goto done;
+	}
+
+	if ((mode & ~LX_FALLOC_VALID) != 0) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * If this is the only flag then we don't actually do any work.
+	 */
+	if (mode == LX_FALLOC_FL_KEEP_SIZE)
+		goto done;
+
+	bzero(&bf, sizeof (bf));
+
+	vattr.va_mask = AT_SIZE;
+	if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
+		goto done;
+
+	if (mode == 0) {
+		/* Nothing to do if not extending the file */
+		if (vattr.va_size >= tot)
+			goto done;
+
+		/* Extend the file. */
+		bf.l_start = (off64_t)tot;
+		bf.l_len = (off64_t)0;
+
+	} else if (mode & LX_FALLOC_FL_PUNCH_HOLE) {
+		/*
+		 * Deallocate space in the file.
+		 */
+		if ((mode & LX_FALLOC_FL_KEEP_SIZE) == 0) {
+			/* this flag is required with punch hole */
+			error = EINVAL;
+			goto done;
+		}
+
+		if (mode &
+		    ~(LX_FALLOC_FL_PUNCH_HOLE | LX_FALLOC_FL_KEEP_SIZE)) {
+			error = EINVAL;
+			goto done;
+		}
+
+		/* Make sure we don't extend since keep_size is set. */
+		if (vattr.va_size < tot) {
+			if (offset > vattr.va_size)
+				goto done;
+			len = (off_t)vattr.va_size - offset;
+		}
+
+		bf.l_start = (off64_t)offset;
+		bf.l_len = (off64_t)len;
+
+	} else if (mode & LX_FALLOC_FL_ZERO_RANGE) {
+		/*
+		 * Zero out the space in the file.
+		 */
+		if (mode &
+		    ~(LX_FALLOC_FL_ZERO_RANGE | LX_FALLOC_FL_KEEP_SIZE)) {
+			error = EINVAL;
+			goto done;
+		}
+
+		/* Make sure we don't extend when keep_size is set. */
+		if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) {
+			if (offset > vattr.va_size)
+				goto done;
+			len = vattr.va_size - offset;
+		}
+
+		bf.l_start = (off64_t)offset;
+		bf.l_len = (off64_t)len;
+	} else {
+		/* We should have already handled all flags */
+		VERIFY(0);
+	}
+
+	/*
+	 * Check for locks in the range.
+	 */
+	f_offset = fp->f_offset;
+	error = flock_check(vp, &bf, f_offset, MAXOFF_T);
+	if (error != 0)
+		goto done;
+
+	/*
+	 * Check for conflicting non-blocking mandatory locks.
+	 * We need to get the size again under nbl_start_crit.
+	 */
+	if (nbl_need_check(vp)) {
+		u_offset_t begin;
+		ssize_t length;
+
+		nbl_start_crit(vp, RW_READER);
+		in_crit = B_TRUE;
+		vattr.va_mask = AT_SIZE;
+		if ((error = VOP_GETATTR(vp, &vattr, 0, CRED(), NULL)) != 0)
+			goto done;
+
+		/*
+		 * Make sure we don't extend when keep_size is set.
+		 */
+		if (mode & LX_FALLOC_FL_KEEP_SIZE && vattr.va_size < tot) {
+			ASSERT(mode & (LX_FALLOC_FL_PUNCH_HOLE |
+			    LX_FALLOC_FL_ZERO_RANGE));
+
+			/*
+			 * If the size grew we can short-circuit the rest of
+			 * the work, otherwise adjust bf for the vop_space
+			 * call.
+			 */
+			if (offset >= vattr.va_size)
+				goto done;
+			len = vattr.va_size - offset;
+			bf.l_len = (off64_t)len;
+		}
+
+		if (offset > vattr.va_size) {
+			begin = vattr.va_size;
+			length = offset - vattr.va_size;
+		} else {
+			begin = offset;
+			length = vattr.va_size - offset;
+		}
+
+		if (nbl_conflict(vp, NBL_WRITE, begin, length, 0, NULL)) {
+			error = EACCES;
+			goto done;
+		}
+	}
+
+	error = VOP_SPACE(vp, F_FREESP, &bf, 0, f_offset, fp->f_cred, NULL);
+
+done:	/* common exit: leave the nbl critical region, drop the fd hold */
+	if (in_crit)
+		nbl_end_crit(vp);
+
+	releasef(fd);
+	if (error != 0)
+		return (set_errno(error));
+
+	return (0);
+}
+/*
+ * fallocate entry point for 32-bit callers: reassembles the split 64-bit
+ * offset and length and calls lx_fallocate().
+ */
+long
+lx_fallocate32(int fd, int mode, uint32_t offl, uint32_t offh, uint32_t lenl,
+    uint32_t lenh)
+{
+	int64_t offset = 0, len = 0;
+
+	/*
+	 * From 32-bit callers, Linux passes the 64-bit offset and len by
+	 * concatenating consecutive arguments. We must perform the same
+	 * conversion here.
+	 */
+	offset = offh;
+	offset = offset << 32;
+	offset |= offl;
+	len = lenh;
+	len = len << 32;
+	len |= lenl;
+
+	return (lx_fallocate(fd, mode, (off_t)offset, (off_t)len));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c
new file mode 100644
index 0000000000..a5406c0a4f
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c
@@ -0,0 +1,701 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/cmn_err.h> +#include <sys/pathname.h> +#include <sys/policy.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> +#include <sys/lx_socket.h> +#include <sys/brand.h> +#include <sys/fs/fifonode.h> +#include <sys/strsubr.h> +#include <sys/stream.h> +#include <sys/flock.h> + +extern int fcntl(int, int, intptr_t); +extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); +extern int lx_pipe_setsz(stdata_t *, uint_t, boolean_t); + + +int +lx_vp_at(int fd, char *upath, vnode_t **vpp, int flag) +{ + vnode_t *startvp; + int error; + + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + + if ((error = fgetstartvp(fd, upath, &startvp)) != 0) { + return (error); + } + + if (upath != NULL) { + uio_seg_t seg = UIO_USERSPACE; + + error = lookupnameat(upath, seg, + (flag == AT_SYMLINK_NOFOLLOW) ? 
NO_FOLLOW : FOLLOW, + NULLVPP, vpp, startvp); + if (startvp != NULL) { + VN_RELE(startvp); + } + return (error); + } else { + /* VN_HOLD was established in fgetstartvp */ + *vpp = startvp; + VERIFY(*vpp); + return (0); + } +} + +#define LTOS_FLOCK(l, s) \ +{ \ + s->l_type = ltos_type(l->l_type); \ + s->l_whence = l->l_whence; \ + s->l_start = l->l_start; \ + s->l_len = l->l_len; \ + s->l_sysid = 0; /* not defined in linux */ \ + s->l_pid = (pid_t)l->l_pid; \ +} + +#define STOL_FLOCK(s, l) \ +{ \ + l->l_type = stol_type(s->l_type); \ + l->l_whence = s->l_whence; \ + l->l_start = s->l_start; \ + l->l_len = s->l_len; \ + l->l_pid = (int)s->l_pid; \ +} + +static short +ltos_type(short l_type) +{ + switch (l_type) { + case LX_F_RDLCK: + return (F_RDLCK); + case LX_F_WRLCK: + return (F_WRLCK); + case LX_F_UNLCK: + return (F_UNLCK); + default: + return (-1); + } +} + +static short +stol_type(short l_type) +{ + switch (l_type) { + case F_RDLCK: + return (LX_F_RDLCK); + case F_WRLCK: + return (LX_F_WRLCK); + case F_UNLCK: + return (LX_F_UNLCK); + default: + /* can't ever happen */ + return (0); + } +} + +static void +ltos_flock(struct lx_flock *l, struct flock64 *s) +{ + LTOS_FLOCK(l, s) +} + +static void +stol_flock(struct flock64 *s, struct lx_flock *l) +{ + STOL_FLOCK(s, l) +} + +static void +ltos_flock64(struct lx_flock64_32 *l, struct flock64 *s) +{ + LTOS_FLOCK(l, s) +} + +static void +stol_flock64(struct flock64 *s, struct lx_flock64_32 *l) +{ + STOL_FLOCK(s, l) +} + +static int +lx_fcntl_getfl(int fd) +{ + int retval; + int rc; + + retval = fcntl(fd, F_GETFL, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + if ((retval & O_ACCMODE) == O_RDONLY) + rc = LX_O_RDONLY; + else if ((retval & O_ACCMODE) == O_WRONLY) + rc = LX_O_WRONLY; + else + rc = LX_O_RDWR; + /* O_NDELAY != O_NONBLOCK, so we need to check for both */ + if (retval & O_NDELAY) + rc |= LX_O_NDELAY; + if (retval & O_NONBLOCK) + rc |= LX_O_NONBLOCK; + if (retval & 
O_APPEND) + rc |= LX_O_APPEND; + if (retval & O_SYNC) + rc |= LX_O_SYNC; + if (retval & O_LARGEFILE) + rc |= LX_O_LARGEFILE; + if (retval & FASYNC) + rc |= LX_O_ASYNC; + + return (rc); +} + +#define LX_SETFL_MASK (O_NONBLOCK | O_APPEND | O_SYNC | FASYNC); + +static int +lx_fcntl_setfl(int fd, ulong_t arg) +{ + int flags; + + /* + * When performing fcntl(F_SETFL), only certain flags are + * allowed to be manipulated. A mask is used to preserve + * other flags, such as those which are specified during + * open(2). The mask on Linux excludes O_LARGEFILE from + * being manipulated, whereas illumos expects the flag to + * be set. In order to properly preserve the O_LARGEFILE + * (FOFFMAX) state, we must first query for it via + * fcntl(F_GETFL) so that the value can be carried + * through. + */ + flags = fcntl(fd, F_GETFL, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + flags &= ~LX_SETFL_MASK; + + /* LX_O_NDELAY == LX_O_NONBLOCK, so we only check for one */ + if (arg & LX_O_NDELAY) + flags |= O_NONBLOCK; + if (arg & LX_O_APPEND) + flags |= O_APPEND; + if (arg & LX_O_SYNC) + flags |= O_SYNC; + if (arg & LX_O_ASYNC) + flags |= FASYNC; + + return (fcntl(fd, F_SETFL, flags)); +} + + +static int +lx_fcntl_pipesz(int fd, int cmd, ulong_t arg) +{ + file_t *fp; + vnode_t *vp; + stdata_t *str; + int err = 0, res = 0; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + if (vp->v_type != VFIFO || vp->v_op != fifo_vnodeops) { + err = EBADF; + goto out; + } + VERIFY((str = vp->v_stream) != NULL); + + if (cmd == LX_F_SETPIPE_SZ) { + err = lx_pipe_setsz(str, (uint_t)arg, B_FALSE); + } else if (cmd == LX_F_GETPIPE_SZ) { + size_t val; + + err = strqget(RD(str->sd_wrq), QHIWAT, 0, &val); + res = val; + } else { + /* NOTREACHED */ + ASSERT(0); + } + +out: + releasef(fd); + if (err != 0) { + return (set_errno(err)); + } + return (res); +} + +static int +lx_fcntl_common(int fd, int cmd, ulong_t arg) +{ + int rc 
= 0; + pid_t pid; + int error; + int rv; + int32_t flag; + file_t *fp; + + /* + * We depend on the call to fcntl to set the errno if necessary. + */ + ttolwp(curthread)->lwp_errno = 0; + + switch (cmd) { + case LX_F_SETSIG: + case LX_F_GETSIG: + case LX_F_SETLEASE: + case LX_F_GETLEASE: + case LX_F_NOTIFY: + case LX_F_CANCELLK: + { + char buf[80]; + + (void) snprintf(buf, sizeof (buf), + "unsupported fcntl command: %d", cmd); + lx_unsupported(buf); + } + return (set_errno(ENOTSUP)); + + case LX_F_DUPFD: + rc = fcntl(fd, F_DUPFD, arg); + break; + + case LX_F_DUPFD_CLOEXEC: + rc = fcntl(fd, F_DUPFD_CLOEXEC, arg); + break; + + case LX_F_GETFD: + rc = fcntl(fd, F_GETFD, 0); + break; + + case LX_F_SETFD: + rc = fcntl(fd, F_SETFD, arg); + break; + + case LX_F_GETFL: + rc = lx_fcntl_getfl(fd); + break; + + case LX_F_SETFL: + rc = lx_fcntl_setfl(fd, arg); + break; + + case LX_F_SETOWN: + pid = (pid_t)arg; + if (pid == 1) { + /* Setown for the init process uses the real pid. */ + pid = curzone->zone_proc_initpid; + } + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + rv = 0; + + flag = fp->f_flag | get_udatamodel() | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, FIOSETOWN, (intptr_t)&pid, + flag, CRED(), &rv, NULL); + releasef(fd); + if (error != 0) { + /* + * On illumos F_SETOWN is only defined for sockets, but + * some apps hardcode to do this fcntl on other devices + * (e.g. /dev/tty) to setup signal handling. If the + * app is only setting itself to be the signal + * handler, we pretend to succeed. 
+ */ + if (error != EINVAL || + curthread->t_procp->p_pid != pid) { + return (set_errno(error)); + } + } + + rc = 0; + break; + + case LX_F_GETOWN: + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + rv = 0; + + flag = fp->f_flag | get_udatamodel() | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, FIOGETOWN, (intptr_t)&pid, + flag, CRED(), &rv, NULL); + releasef(fd); + if (error != 0) + return (set_errno(error)); + + if (pid == curzone->zone_proc_initpid) { + /* Getown for the init process returns 1. */ + pid = 1; + } + + rc = pid; + break; + + case LX_F_SETPIPE_SZ: + case LX_F_GETPIPE_SZ: + rc = lx_fcntl_pipesz(fd, cmd, arg); + break; + + default: + return (set_errno(EINVAL)); + } + + return (rc); +} + +static int +lx_fcntl_lock_cmd_to_s(int lx_cmd) +{ + switch (lx_cmd) { + case LX_F_GETLK: + return (F_GETLK); + case LX_F_SETLK: + return (F_SETLK); + case LX_F_SETLKW: + return (F_SETLKW); + case LX_F_GETLK64: + return (F_GETLK64); + case LX_F_SETLK64: + return (F_SETLK64); + case LX_F_SETLKW64: + return (F_SETLKW64); + default: + VERIFY(0); + /*NOTREACHED*/ + return (0); + } +} + +/* + * This is a pain but we can't re-use the fcntl code for locking since it does + * its own copyin/copyout for the flock struct. Since we have to convert the + * struct we have to do our own copyin/out. Thus we replicate the fcntl code for + * these 3 cmds. Luckily it's not much. 
+ *
+ * fd is the descriptor to lock, lx_cmd one of the six Linux locking commands
+ * and arg the user address of the Linux flock structure.  Returns 0 on
+ * success; otherwise errno is set (EBADF, EFAULT, EINVAL, EOVERFLOW, or the
+ * error from flock_check/VOP_FRLOCK).
+ *
+ * NOTE(review): the F_SETLKW path requests a syscall restart on EINTR but
+ * the F_SETLKW64 path does not - confirm whether that difference is
+ * intended.
+ * NOTE(review): the 64-bit GETLK result is range-checked against maxoffset,
+ * which is MAXOFF32_T for ILP32 callers, the only callers allowed on that
+ * path - confirm that limit is the one wanted for the 64-bit structure.
+ */
+static int
+lx_fcntl_lock(int fd, int lx_cmd, void *arg)
+{
+	int cmd;
+	int error = 0;
+	file_t *fp;
+	vnode_t *vp;
+	int flag;
+	offset_t maxoffset;
+	u_offset_t offset;
+	model_t datamodel;
+	lx_flock_t lxflk;
+	lx_flock64_32_t lxflk64;
+	struct flock64 bf;
+
+	if ((fp = getf(fd)) == NULL)
+		return (set_errno(EBADF));
+
+	maxoffset = MAXOFF_T;
+	datamodel = DATAMODEL_NATIVE;
+#if defined(_SYSCALL32_IMPL)
+	if ((datamodel = get_udatamodel()) == DATAMODEL_ILP32)
+		maxoffset = MAXOFF32_T;
+#endif
+	vp = fp->f_vnode;
+	flag = fp->f_flag;
+	offset = fp->f_offset;
+
+	cmd = lx_fcntl_lock_cmd_to_s(lx_cmd);
+
+	switch (cmd) {
+	case F_GETLK:
+	case F_SETLK:
+	case F_SETLKW:
+		if (datamodel == DATAMODEL_NATIVE) {
+			if (copyin(arg, &lxflk, sizeof (lx_flock_t)) != 0) {
+				error = EFAULT;
+				break;
+			}
+		}
+#if defined(_SYSCALL32_IMPL)
+		else {
+			lx_flock32_t lxflk32;
+
+			if (copyin(arg, &lxflk32, sizeof (lxflk32)) != 0) {
+				error = EFAULT;
+				break;
+			}
+
+			lxflk.l_type = lxflk32.l_type;
+			lxflk.l_whence = lxflk32.l_whence;
+			lxflk.l_start = (off64_t)lxflk32.l_start;
+			lxflk.l_len = (off64_t)lxflk32.l_len;
+			lxflk.l_pid = lxflk32.l_pid;
+		}
+#endif /* _SYSCALL32_IMPL */
+
+		ltos_flock(&lxflk, &bf);
+
+		if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0)
+			break;
+
+		if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL,
+		    fp->f_cred, NULL)) != 0) {
+			if (cmd == F_SETLKW && error == EINTR) {
+				ttolxlwp(curthread)->br_syscall_restart =
+				    B_TRUE;
+			}
+			break;
+		}
+
+		if (cmd != F_GETLK)
+			break;
+
+		/*
+		 * The command is GETLK, return result.
+		 */
+		stol_flock(&bf, &lxflk);
+
+		/*
+		 * If no lock is found, only the type field is changed.
+		 */
+		if (lxflk.l_type == LX_F_UNLCK) {
+			/* l_type always first entry, always a short */
+			if (copyout(&lxflk.l_type, &((lx_flock_t *)arg)->l_type,
+			    sizeof (lxflk.l_type)))
+				error = EFAULT;
+			break;
+		}
+
+		if (bf.l_start > maxoffset || bf.l_len > maxoffset) {
+			error = EOVERFLOW;
+			break;
+		}
+
+		if (datamodel == DATAMODEL_NATIVE) {
+			if (copyout(&lxflk, arg, sizeof (lxflk)) != 0) {
+				error = EFAULT;
+				break;
+			}
+		}
+#if defined(_SYSCALL32_IMPL)
+		else {
+			lx_flock32_t lxflk32;
+
+			if (bf.l_start > MAXOFF32_T || bf.l_len > MAXOFF32_T) {
+				error = EOVERFLOW;
+				break;
+			}
+
+			lxflk32.l_type = lxflk.l_type;
+			lxflk32.l_whence = lxflk.l_whence;
+			lxflk32.l_start = lxflk.l_start;
+			lxflk32.l_len = lxflk.l_len;
+			lxflk32.l_pid = lxflk.l_pid;
+
+			if (copyout(&lxflk32, arg, sizeof (lxflk32)) != 0) {
+				error = EFAULT;
+				break;
+			}
+		}
+#endif /* _SYSCALL32_IMPL */
+		break;
+
+	case F_GETLK64:
+	case F_SETLK64:
+	case F_SETLKW64:
+		/*
+		 * Large File support is only used for ILP32 apps.
+		 */
+		if (datamodel != DATAMODEL_ILP32) {
+			error = EINVAL;
+			break;
+		}
+
+		if (cmd == F_GETLK64)	/* continue with the plain commands */
+			cmd = F_GETLK;
+		else if (cmd == F_SETLK64)
+			cmd = F_SETLK;
+		else if (cmd == F_SETLKW64)
+			cmd = F_SETLKW;
+
+		if (copyin(arg, &lxflk64, sizeof (lxflk64)) != 0) {
+			error = EFAULT;
+			break;
+		}
+
+		ltos_flock64(&lxflk64, &bf);
+
+		if ((error = flock_check(vp, &bf, offset, MAXOFFSET_T)) != 0)
+			break;
+
+		if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL,
+		    fp->f_cred, NULL)) != 0)
+			break;
+
+		if (cmd != F_GETLK)
+			break;
+
+		/*
+		 * The command is GETLK, return result.
+		 */
+		stol_flock64(&bf, &lxflk64);
+
+		/*
+		 * If no lock is found, only the type field is changed.
+		 */
+		if (lxflk64.l_type == LX_F_UNLCK) {
+			/* l_type always first entry, always a short */
+			if (copyout(&lxflk64.l_type,
+			    &((lx_flock64_t *)arg)->l_type,
+			    sizeof (lxflk64.l_type)))
+				error = EFAULT;
+			break;
+		}
+
+		if (bf.l_start > maxoffset || bf.l_len > maxoffset) {
+			error = EOVERFLOW;
+			break;
+		}
+
+		if (copyout(&lxflk64, arg, sizeof (lxflk64)) != 0) {
+			error = EFAULT;
+			break;
+		}
+		break;
+	}
+
+	releasef(fd);	/* every break above lands here, so the hold is dropped */
+	if (error)
+		return (set_errno(error));
+
+	return (0);
+}
+/*
+ * Linux fcntl(2) entry point.  The 64-bit locking commands are rejected with
+ * EINVAL, the plain locking commands go to lx_fcntl_lock(), and everything
+ * else goes to lx_fcntl_common().
+ */
+long
+lx_fcntl(int fd, int cmd, intptr_t arg)
+{
+	switch (cmd) {
+	case LX_F_GETLK64:
+	case LX_F_SETLK64:
+	case LX_F_SETLKW64:
+		/* The 64-bit fcntl commands must go through fcntl64(). */
+		return (set_errno(EINVAL));
+
+	case LX_F_GETLK:
+	case LX_F_SETLK:
+	case LX_F_SETLKW:
+		return (lx_fcntl_lock(fd, cmd, (void *)arg));
+
+	default:
+		return (lx_fcntl_common(fd, cmd, arg));
+	}
+}
+/*
+ * Linux fcntl64(2) entry point.  All six locking commands go to
+ * lx_fcntl_lock(); everything else goes to lx_fcntl_common().
+ */
+long
+lx_fcntl64(int fd, int cmd, intptr_t arg)
+{
+	switch (cmd) {
+	case LX_F_GETLK:
+	case LX_F_SETLK:
+	case LX_F_SETLKW:
+	case LX_F_GETLK64:
+	case LX_F_SETLKW64:
+	case LX_F_SETLK64:
+		return (lx_fcntl_lock(fd, cmd, (void *)arg));
+
+	default:
+		return (lx_fcntl_common(fd, cmd, (ulong_t)arg));
+	}
+}
+
+/*
+ * Apply or remove an advisory lock on the entire file. F_FLOCK and F_FLOCKW
+ * are OFD-style locks. For more information, see the comment on ofdlock().
+ *
+ * LX_LOCK_NB selects the non-waiting F_FLOCK command; without it F_FLOCKW is
+ * used.  The remaining bits of op must be exactly one of LX_LOCK_UN,
+ * LX_LOCK_SH or LX_LOCK_EX, otherwise EINVAL is set.
+ *
+ * NOTE(review): when ofdlock() fails, errno is set but the raw error value
+ * is returned rather than the result of set_errno() as on the other failure
+ * paths - confirm callers only rely on errno.
+ */
+long
+lx_flock(int fd, int op)
+{
+	int cmd;
+	int error;
+	flock64_t bf;
+	file_t *fp;
+
+	if (op & LX_LOCK_NB) {
+		cmd = F_FLOCK;
+		op &= ~LX_LOCK_NB;
+	} else {
+		cmd = F_FLOCKW;
+	}
+
+	switch (op) {
+	case LX_LOCK_UN:
+		bf.l_type = F_UNLCK;
+		break;
+	case LX_LOCK_SH:
+		bf.l_type = F_RDLCK;
+		break;
+	case LX_LOCK_EX:
+		bf.l_type = F_WRLCK;
+		break;
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	bf.l_whence = 0;
+	bf.l_start = 0;
+	bf.l_len = 0;
+	bf.l_sysid = 0;
+	bf.l_pid = 0;
+
+	if ((fp = getf(fd)) == NULL)
+		return (set_errno(EBADF));
+
+	/*
+	 * See the locking comment in fcntl.c. In summary, the *_frlock
+	 * functions in the various file systems basically do some validation,
+	 * then funnel everything through the fs_frlock function. For OFD-style
+	 * locks, fs_frlock will do nothing. Once control returns here, we call
+	 * the ofdlock function to do the actual locking.
+	 */
+	error = VOP_FRLOCK(fp->f_vnode, cmd, &bf, fp->f_flag, fp->f_offset,
+	    NULL, fp->f_cred, NULL);
+	if (error != 0) {
+		releasef(fd);
+		return (set_errno(error));
+	}
+	error = ofdlock(fp, cmd, &bf, fp->f_flag, fp->f_offset);
+	if (error != 0) {
+		if (cmd == F_FLOCKW && error == EINTR)
+			ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
+		(void) set_errno(error);
+	}
+	releasef(fd);
+	return (error);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
new file mode 100644
index 0000000000..b6244223f9
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
@@ -0,0 +1,1728 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2020 Joyent, Inc.
+ * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/debug.h> +#include <vm/as.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <vm/page.h> +#include <sys/priv.h> +#include <sys/mman.h> +#include <sys/timer.h> +#include <sys/condvar.h> +#include <sys/inttypes.h> +#include <sys/cmn_err.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_futex.h> +#include <sys/lx_impl.h> +#include <sys/sdt.h> + +/* + * Futexes are a Linux-specific implementation of inter-process mutexes. + * They are designed to use shared memory for simple, uncontested + * operations, and rely on the kernel to resolve any contention issues. + * + * Most of the information in this section comes from the paper "Futexes + * Are Tricky", by Ulrich Drepper. This paper is currently available at: + * http://people.redhat.com/~drepper/futex.pdf. + * + * A futex is itself a 4-byte integer, which must be 4-byte aligned. The + * value of this integer is expected to be modified using user-level atomic + * operations. For the original, simple futexes, the futex(4) design itself did + * not impose any semantic constraints on the value stored in the futex; it is + * up to the application to define its own protocol. For the newer, + * priority-inheritance (PI) futexes, the value is 0 or the TID of the holder, + * as defined in futex(2). + * + * When the application decides that kernel intervention is required, it + * will use the futex(2) system call. Originally there were 5 different + * operations that could be performed on a futex, using this system call, but + * that has subsequently been extended. Since this interface has evolved over + * time, there are several different prototypes available to the user.
+ * Fortunately, there is only a single kernel-level interface: + * + * long sys_futex(void *futex1, int cmd, int val1, + * struct timespec *timeout, void *futex2, int val2) + * + * The kernel-level operations that may be performed on a simple futex are: + * + * FUTEX_WAIT + * + * Atomically verify that futex1 contains the value val1. If it + * doesn't, return EWOULDBLOCK. If it does contain the expected + * value, the thread will sleep until somebody performs a FUTEX_WAKE + * on the futex. The caller may also specify a timeout, indicating + * the maximum time the thread should sleep. If the timer expires, + * the call returns ETIMEDOUT. If the thread is awoken with a signal, + * the call returns EINTR. Otherwise, the call returns 0. + * + * FUTEX_WAKE + * + * Wake up val1 processes that are waiting on futex1. The call + * returns the number of blocked threads that were woken up. + * + * FUTEX_WAIT_BITSET/FUTEX_WAKE_BITSET + * + * Similar to FUTEX_WAIT/FUTEX_WAKE, but each takes an additional argument + * denoting a bit vector, with wakers only waking waiters that match + * in one or more bits. These semantics are dubious enough, but the + * interface has an inconsistency that is glaring even by the + * embarrassingly low standards that Linux sets for itself: the timeout + * argument to FUTEX_WAIT_BITSET is absolute, not relative as it is for + * FUTEX_WAIT. And as if that weren't enough unnecessary complexity, + * the caller may specify this absolute timeout to be against either + * CLOCK_MONOTONIC or CLOCK_REALTIME -- but only for FUTEX_WAIT_BITSET, + * of course! + * + * FUTEX_WAKE_OP + * + * The implementation of a condition variable in terms of futexes + * actually uses two futexes: one to assure sequential access and one to + * represent the condition variable.
This implementation gives rise to a + * particular performance problem whereby a thread is awoken on the futex + * that represents the condition variable only to have to (potentially) + * immediately wait on the futex that protects the condition variable. + * (Do not confuse the futex that serves to protect the condition variable + * with the pthread_mutex_t associated with pthread_cond_t -- which + * represents a third futex.) To (over)solve this problem, FUTEX_WAKE_OP + * was invented, which performs an atomic compare-and-exchange on a + * second address in a specified fashion (that is, with a specified + * operation). Here are the possible operations (OPARG is defined + * to be a 12-bit value embedded in the operation): + * + * - FUTEX_OP_SET: Sets the value at the second address to OPARG + * - FUTEX_OP_ADD: Adds OPARG to the value + * - FUTEX_OP_OR: OR's the value with OPARG + * - FUTEX_OP_ANDN: ANDs the value with the bitwise complement of OPARG + * - FUTEX_OP_XOR: XOR's the value with OPARG + * + * After this compare-and-exchange on the second address, a FUTEX_WAKE is + * performed on the first address and -- if the compare-and-exchange + * matches a specified result based on a specified comparison operation -- + * a FUTEX_WAKE is performed on the second address.
Here are the possible + * comparison operations: + * + * - FUTEX_OP_CMP_EQ: If old value is CMPARG, wake + * - FUTEX_OP_CMP_NE: If old value is not equal to CMPARG, wake + * - FUTEX_OP_CMP_LT: If old value is less than CMPARG, wake + * - FUTEX_OP_CMP_LE: If old value is less than or equal to CMPARG, wake + * - FUTEX_OP_CMP_GT: If old value is greater than CMPARG, wake + * - FUTEX_OP_CMP_GE: If old value is greater than or equal to CMPARG, wake + * + * As a practical matter, the only way that this is used (or, some might + * argue, is usable) is by the implementation of pthread_cond_signal(), + * which uses FUTEX_WAKE_OP to -- in a single system call -- unlock the + * futex that protects the condition variable and wake the futex that + * represents the condition variable. The second wake-up is conditional + * because the futex that protects the condition variable (rather than the + * one that represents it) may or may not have waiters. Given that this + * is the use case, FUTEX_WAKE_OP is falsely generic: despite allowing for + * five different kinds of operations and six different kinds of + * comparison operations, in practice only one is used. (Namely, setting + * to 0 and waking if the old value is greater than 1 -- which denotes + * that waiters are present and the wakeup should be performed.) Moreover, + * because FUTEX_WAKE_OP does not (and cannot) optimize anything in the + * case that the pthread_mutex_t associated with the pthread_cond_t is + * held at the time of a pthread_cond_signal(), this entire mechanism is + * essentially for naught in this case.
As one can imagine (and can + * verify on just about any source base that uses pthread_cond_signal()), + * it is overwhelmingly the common case that the lock associated with the + * pthread_cond_t is held at the time of pthread_cond_signal(), assuring + * that the problem that all of this complexity was designed to solve + * isn't, in fact, solved because the signalled thread simply wakes up + * only to block again on the held mutex. Cue a slow clap! + * + * FUTEX_CMP_REQUEUE + * + * If the value stored in futex1 matches that passed in in val2, wake + * up val1 processes that are waiting on futex1. Otherwise, return + * EAGAIN. + * + * If there are more than val1 threads waiting on the futex, remove + * the remaining threads from this futex, and requeue them on futex2. + * The caller can limit the number of threads being requeued by + * encoding an integral numerical value in the position usually used + * for the timeout pointer. + * + * The call returns the number of blocked threads that were woken up + * or requeued. + * + * FUTEX_REQUEUE + * + * Identical to FUTEX_CMP_REQUEUE except that it does not use val2. + * This command has been declared broken and obsolete, but we still + * need to support it. + * + * FUTEX_FD + * + * Return a file descriptor, which can be used to refer to the futex. + * This operation was broken by design, and was blessedly removed in + * Linux 2.6.26 ("because it was inherently racy"); it should go without + * saying that we don't support this operation. + * + * The kernel-level operations that may be performed on a PI futex are: + * + * FUTEX_LOCK_PI + * + * Called after a user-land attempt to acquire the lock using an atomic + * instruction failed because the futex had a nonzero value (the current + * holder's TID). Once enqueued, the thread sleeps until FUTEX_UNLOCK_PI + * is called on the futex, or the timeout expires. 
The timeout argument to + * FUTEX_LOCK_PI is absolute, unlike FUTEX_WAIT, and cannot be modified + * as with FUTEX_WAIT_BITSET! + * + * FUTEX_TRYLOCK_PI + * + * Similar to FUTEX_LOCK_PI but can be used for error recovery as + * described in futex(2). + * + * FUTEX_UNLOCK_PI + * + * Called when user-land cannot atomically release the lock because + * there are waiting threads. This will wake the highest priority waiting + * thread. + * + * FUTEX_CMP_REQUEUE_PI + * + * Not implemented at this time. + * + * FUTEX_WAIT_REQUEUE_PI + * + * Not implemented at this time. + * + * Priority Inheritance + * + * Our general approach to priority inheritance recognizes the fact that the + * application is almost certainly not a real-time process running on dedicated + * hardware. The zone is most likely running in a multi-tenant environment under + * FSS, in spite of whatever scheduling class the Linux application thinks it is + * using. Thus, we make our best effort to handle priority inheritance. When a + * thread must block on a PI futex, it may increase the scheduling priority of + * the futex holder to match the blocking thread. The futex holder's original + * priority will be restored when it unlocks the futex. + * + * This approach does not always handle transitive priority inheritance. For + * example, three threads at Low, Medium and High priority: + * L holds futex X + * M holds futex Y and became enqueued on X (M bumped L's priority to M) + * H enqueues on Y and bumps priority of M to H, but never bumps L's priority + * (which is currently M) up to H + * In reality this scenario is both uncommon and likely still executes + * reasonably well under a multi-tenant, FSS scenario. Also note that if H + * enqueued on Y before M enqueues on X, then L will have its priority raised + * to H when M enqueues on X. + * + * PI Futex Cleanup + * + * Futex cleanup can occur when a thread exits unexpectedly while holding one + * or more futexes. 
Normally this is done via a "robust" futex and cleanup of a + * robust PI futex works in the same way as a non-PI robust futex (see + * lx_futex_robust_exit). On Linux, in the case of a non-robust PI futex, + * cleanup can still occur because the futex is associated with a real-time + * mutex inside the kernel (see the futex(2) man page for more details). For lx + * we are not using anything similar. When a thread exits, lx_futex_robust_exit + * will be called, but we would have to iterate every hash bucket, and every + * futex in the chain, to look for futexes held by the exiting thread. This + * would be very expensive and would occur whether or not the thread held any + * futexes. Thus, at this time we don't set the FUTEX_OWNER_DIED bit on + * non-robust PI futexes held by a thread when it exits while holding futexes. + * In practice this does not seem to be a serious limitation since user-level + * code generally appears to use robust futexes, but this may need to be + * revisited if it is observed to be an issue. + */ + +/* + * The structure of the robust_list, as set with the set_robust_list() system + * call. See lx_futex_robust_exit(), below, for details.
+ */
+typedef struct futex_robust_list {
+	uintptr_t	frl_head;	/* list of robust locks held */
+	uint64_t	frl_offset;	/* offset of lock word within a lock */
+	uintptr_t	frl_pending;	/* pending operation */
+} futex_robust_list_t;
+
+#if defined(_SYSCALL32_IMPL)
+
+/*
+ * Variant of futex_robust_list with 32-bit fields, packed to 4-byte
+ * alignment; only built when _SYSCALL32_IMPL is defined.
+ */
+#pragma pack(4)
+typedef struct futex_robust_list32 {
+	uint32_t	frl_head;	/* list of robust locks held */
+	uint32_t	frl_offset;	/* offset of lock word within a lock */
+	uint32_t	frl_pending;	/* pending operation */
+} futex_robust_list32_t;
+#pragma pack()
+
+#endif
+
+/*
+ * Copy (s -> d) and compare the two words of a memid.
+ * NOTE(review): MEMID_COPY expands to a bare brace block, not do/while(0),
+ * so it is not safe as the body of an unbraced if/else -- confirm call sites.
+ */
+#define	MEMID_COPY(s, d) \
+	{ (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; }
+#define	MEMID_EQUAL(s, d) \
+	((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1])
+
+/*
+ * Because collisions on this hash table can be a source of negative
+ * scalability, we make it pretty large: 4,096 entries -- 64K. If this
+ * size is found to be insufficient, the size should be made dynamic.
+ * (Making it dynamic will be delicate because the per-chain locking will
+ * necessitate memory retiring or similar; see the 2008 ACM Queue article
+ * "Real-world concurrency" for details on this technique.)
+ *
+ * HASH_FUNC sums each of the two memid words shifted right by 3, by
+ * 3 + HASH_SHIFT_SZ and by 3 + 2 * HASH_SHIFT_SZ bits, then masks the sum
+ * down to a bucket index in [0, HASH_SIZE).
+ */
+#define	HASH_SHIFT_SZ	12
+#define	HASH_SIZE	(1 << HASH_SHIFT_SZ)
+#define	HASH_FUNC(id) \
+	((((uintptr_t)((id)->val[1]) >> 3) + \
+	((uintptr_t)((id)->val[1]) >> (3 + HASH_SHIFT_SZ)) + \
+	((uintptr_t)((id)->val[1]) >> (3 + 2 * HASH_SHIFT_SZ)) + \
+	((uintptr_t)((id)->val[0]) >> 3) + \
+	((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) + \
+	((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) & \
+	(HASH_SIZE - 1))
+
+/*
+ * A small, invalid value we can compare against to find the highest scheduling
+ * priority.
+ */
+#define	BELOW_MINPRI	INT_MIN
+
+/*
+ * Arbitrary limit on the number of CAS failures allowed in tight looping
+ * contexts before a back-off retry occurs.
+ */
+#define	CAS_LOOP_LIMIT	100
+
+/*
+ * We place the per-chain lock next to the pointer to the chain itself.
+ * When compared to an array of orthogonal locks, this reduces false sharing + * (though adjacent entries can still be falsely shared -- just not as many), + * while having the additional bonus of increasing locality. + */ +typedef struct futex_hash { + kmutex_t fh_lock; + fwaiter_t *fh_waiters; +} futex_hash_t; + +static futex_hash_t futex_hash[HASH_SIZE]; + +static void +futex_hashin(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash[index].fh_lock)); + + fwp->fw_prev = NULL; + fwp->fw_next = futex_hash[index].fh_waiters; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp; + futex_hash[index].fh_waiters = fwp; +} + +static void +futex_hashout(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash[index].fh_lock)); + + if (fwp->fw_prev) + fwp->fw_prev->fw_next = fwp->fw_next; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp->fw_prev; + if (futex_hash[index].fh_waiters == fwp) + futex_hash[index].fh_waiters = fwp->fw_next; + + fwp->fw_prev = NULL; + fwp->fw_next = NULL; +} + +/* + * Go to sleep until somebody does a WAKE operation on this futex, we get a + * signal, or the timeout expires. + */ +static int +futex_wait(memid_t *memid, caddr_t addr, + int val, timespec_t *timeout, uint32_t bits, boolean_t hrtime) +{ + kthread_t *t = curthread; + lx_lwp_data_t *lwpd = ttolxlwp(t); + fwaiter_t *fwp = &lwpd->br_fwaiter; + int err, ret; + int32_t curval; + int index; + + /* + * The LMS_USER_LOCK micro state becomes valid if we sleep; otherwise + * our time will accrue against LMS_SYSTEM. Use of this micro state + * is modelled on lwp_mutex_timedlock(), a native analogue of + * futex_wait(). 
+ */ + (void) new_mstate(t, LMS_USER_LOCK); + + fwp->fw_woken = 0; + fwp->fw_bits = bits; + fwp->fw_tid = 0; + + MEMID_COPY(memid, &fwp->fw_memid); + cv_init(&fwp->fw_cv, NULL, CV_DEFAULT, NULL); + + index = HASH_FUNC(&fwp->fw_memid); + mutex_enter(&futex_hash[index].fh_lock); + + if (fuword32(addr, (uint32_t *)&curval)) { + err = set_errno(EFAULT); + goto out; + } + if (curval != val) { + err = set_errno(EWOULDBLOCK); + goto out; + } + + /* + * We can't have hrtime and a timeout of 0. See below about + * CLOCK_REALTIME. + * On Linux this is is an invalid state anyway, so we'll short cut + * this early to avoid a panic from passing a null pointer to ts2hrt(). + */ + if (hrtime && timeout == NULL) { + err = set_errno(EINVAL); + goto out; + } + + futex_hashin(fwp); + + err = 0; + while ((fwp->fw_woken == 0) && (err == 0)) { + /* + * If hrtime is set, we interpret timeout to be absolute and + * CLOCK_MONOTONIC-based; otherwise we treat it as absolute + * and CLOCK_REALTIME-based. (Strictly speaking -- or at least + * in as much as the term "strictly" means anything in the + * semantic shambles that is Linux -- FUTEX_WAIT defines its + * timeout to be CLOCK_MONOTONIC-based but limited by system + * clock interval; we treat these semantics as effectively + * CLOCK_REALTIME.) + */ + if (hrtime) { + ret = cv_timedwait_sig_hrtime(&fwp->fw_cv, + &futex_hash[index].fh_lock, ts2hrt(timeout)); + } else { + ret = cv_waituntil_sig(&fwp->fw_cv, + &futex_hash[index].fh_lock, timeout, timechanged); + } + + if (ret < 0) { + err = set_errno(ETIMEDOUT); + } else if (ret == 0) { + /* + * According to signal(7), a futex(2) call with the + * FUTEX_WAIT operation is restartable. + */ + ttolxlwp(t)->br_syscall_restart = B_TRUE; + err = set_errno(EINTR); + } + } + + /* + * The futex is normally hashed out in wakeup. If we timed out or + * got a signal, we need to hash it out here instead. 
+ */ + if (fwp->fw_woken == 0) + futex_hashout(fwp); + +out: + mutex_exit(&futex_hash[index].fh_lock); + + return (err); +} + +/* + * Wake up to wake_threads threads that are blocked on the futex at memid. + */ +static int +futex_wake(memid_t *memid, int wake_threads, uint32_t mask) +{ + fwaiter_t *fwp, *next; + int index; + int ret = 0; + + index = HASH_FUNC(memid); + + mutex_enter(&futex_hash[index].fh_lock); + + for (fwp = futex_hash[index].fh_waiters; + fwp != NULL && ret < wake_threads; fwp = next) { + next = fwp->fw_next; + if (MEMID_EQUAL(&fwp->fw_memid, memid)) { + if (fwp->fw_tid != 0) { + /* + * A PI waiter. It is invalid to mix PI and + * non-PI usage on the same futex. + */ + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(EINVAL)); + } + + if ((fwp->fw_bits & mask)) { + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + ret++; + } + } + } + + mutex_exit(&futex_hash[index].fh_lock); + + return (ret); +} + +static int +futex_wake_op_execute(int32_t *addr, int32_t val3) +{ + int32_t op = FUTEX_OP_OP(val3); + int32_t cmp = FUTEX_OP_CMP(val3); + int32_t cmparg = FUTEX_OP_CMPARG(val3); + int32_t oparg, oldval, newval; + label_t ljb; + int rval; + uint_t loops = 0; + + if ((uintptr_t)addr >= KERNELBASE) + return (-EFAULT); + + if (on_fault(&ljb)) + return (-EFAULT); + + oparg = FUTEX_OP_OPARG(val3); + + do { + /* + * Bail out (for a later retry) if the CAS operation repeatedly + * fails to set the new value. 
+ */ + if (loops++ > CAS_LOOP_LIMIT) { + no_fault(); + return (-EAGAIN); + } + + oldval = *addr; + newval = oparg; + + switch (op) { + case FUTEX_OP_SET: + break; + + case FUTEX_OP_ADD: + newval += oparg; + break; + + case FUTEX_OP_OR: + newval |= oparg; + break; + + case FUTEX_OP_ANDN: + newval &= ~oparg; + break; + + case FUTEX_OP_XOR: + newval ^= oparg; + break; + + default: + no_fault(); + return (-EINVAL); + } + } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval); + + no_fault(); + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + rval = (oldval == cmparg); + break; + + case FUTEX_OP_CMP_NE: + rval = (oldval != cmparg); + break; + + case FUTEX_OP_CMP_LT: + rval = (oldval < cmparg); + break; + + case FUTEX_OP_CMP_LE: + rval = (oldval <= cmparg); + break; + + case FUTEX_OP_CMP_GT: + rval = (oldval > cmparg); + break; + + case FUTEX_OP_CMP_GE: + rval = (oldval >= cmparg); + break; + + default: + return (-EINVAL); + } + + return (rval); +} + +static int +futex_wake_op(memid_t *memid, caddr_t addr2, memid_t *memid2, + int wake_threads, int wake_threads2, int val3) +{ + kmutex_t *l1, *l2; + int ret = 0, ret2 = 0, wake; + fwaiter_t *fwp, *next; + int index1, index2; + + index1 = HASH_FUNC(memid); + index2 = HASH_FUNC(memid2); + + if (index1 == index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = NULL; + } else if (index1 < index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = &futex_hash[index2].fh_lock; + } else { + l1 = &futex_hash[index2].fh_lock; + l2 = &futex_hash[index1].fh_lock; + } + +retry: + mutex_enter(l1); + if (l2 != NULL) + mutex_enter(l2); + + /* LINTED: alignment */ + if ((wake = futex_wake_op_execute((int32_t *)addr2, val3)) < 0) { + /* + * If the futex op fails on a looping CAS attempt, drop the + * involved mutexes to allow others to run, and try again. 
+ */ + if (wake == -EAGAIN) { + if (l2 != NULL) + mutex_exit(l2); + mutex_exit(l1); + goto retry; + } + + (void) set_errno(-wake); /* convert back to positive errno */ + ret = -1; + goto out; + } + + for (fwp = futex_hash[index1].fh_waiters; fwp != NULL; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid)) + continue; + + if (fwp->fw_tid != 0) { + /* + * A PI waiter. It is invalid to mix PI and non-PI + * usage on the same futex. + */ + (void) set_errno(EINVAL); + ret = -1; + goto out; + } + + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + if (++ret >= wake_threads) { + break; + } + } + + if (!wake) + goto out; + + for (fwp = futex_hash[index2].fh_waiters; fwp != NULL; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid2)) + continue; + + if (fwp->fw_tid != 0) { + /* + * A PI waiter. It is invalid to mix PI and non-PI + * usage on the same futex. + */ + (void) set_errno(EINVAL); + ret = -1; + goto out; + } + + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + if (++ret2 >= wake_threads2) { + break; + } + } + + ret += ret2; +out: + if (l2 != NULL) + mutex_exit(l2); + mutex_exit(l1); + + return (ret); +} + +/* + * Wake up to wake_threads waiting on the futex at memid. If there are + * more than that many threads waiting, requeue the remaining threads on + * the futex at requeue_memid. + */ +static int +futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads, + ulong_t requeue_threads, caddr_t addr, int *cmpval) +{ + fwaiter_t *fwp, *next; + int index1, index2; + int ret = 0; + int32_t curval; + kmutex_t *l1, *l2; + + /* + * To ensure that we don't miss a wakeup if the value of cmpval + * changes, we need to grab locks on both the original and new hash + * buckets. To avoid deadlock, we always grab the lower-indexed + * lock first. 
+ */ + index1 = HASH_FUNC(memid); + index2 = HASH_FUNC(requeue_memid); + + if (index1 == index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = NULL; + } else if (index1 < index2) { + l1 = &futex_hash[index1].fh_lock; + l2 = &futex_hash[index2].fh_lock; + } else { + l1 = &futex_hash[index2].fh_lock; + l2 = &futex_hash[index1].fh_lock; + } + + mutex_enter(l1); + if (l2 != NULL) + mutex_enter(l2); + + if (cmpval != NULL) { + if (fuword32(addr, (uint32_t *)&curval)) { + ret = -EFAULT; + goto out; + } + if (curval != *cmpval) { + ret = -EAGAIN; + goto out; + } + } + + for (fwp = futex_hash[index1].fh_waiters; fwp != NULL; fwp = next) { + next = fwp->fw_next; + if (!MEMID_EQUAL(&fwp->fw_memid, memid)) + continue; + + futex_hashout(fwp); + if (ret++ < wake_threads) { + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + } else { + MEMID_COPY(requeue_memid, &fwp->fw_memid); + futex_hashin(fwp); + + if ((ret - wake_threads) >= requeue_threads) + break; + } + } + +out: + if (l2 != NULL) + mutex_exit(l2); + mutex_exit(l1); + + if (ret < 0) + return (set_errno(-ret)); + return (ret); +} + +/* + * Copy in the timeout provided by the application and convert it to an + * absolute timeout. Sadly, this is complicated by the different timeout + * semantics of FUTEX_WAIT vs. FUTEX_WAIT_BITSET vs. FUTEX_LOCK_PI. (Yes, you + * read that correctly; all three of these have different timeout semantics; + * see the block comment at the top of the file for commentary on this + * inanity.) This function doesn't attempt to clean up all of these + * differences, however; we will only copy the timer value in, perform some + * basic sanity checking, and (if it's an operation operating on a relative + * time, which is to say FUTEX_WAIT) adjust it to be absolute. All other + * nuances (namely, the resolution and clock of the timeout) are left up to + * the caller. 
+ */ +static int +get_timeout(void *lx_timeout, timestruc_t *timeout, int cmd) +{ + timestruc_t now; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(lx_timeout, timeout, sizeof (timestruc_t))) + return (EFAULT); + } +#ifdef _SYSCALL32_IMPL + else { + timestruc32_t timeout32; + if (copyin(lx_timeout, &timeout32, sizeof (timestruc32_t))) + return (EFAULT); + timeout->tv_sec = (time_t)timeout32.tv_sec; + timeout->tv_nsec = timeout32.tv_nsec; + } +#endif + if (itimerspecfix(timeout)) + return (EINVAL); + + if (cmd == FUTEX_WAIT) { + /* + * We've been given a relative time; add it to the current + * time to derive an absolute time. + */ + gethrestime(&now); + timespecadd(timeout, &now); + } + + return (0); +} + +/* + * Attempt to take the futex. If currently held, enqueue (sleep) on the futex + * until a thread performs futex_unlock_pi, we get a signal, or the timeout + * expires. If 'is_trylock' is true and the futex is currently held, return + * EAGAIN immediately. + */ +static int +futex_lock_pi(memid_t *memid, uint32_t *addr, timespec_t *timeout, + boolean_t is_trylock) +{ + kthread_t *t = curthread; + lx_lwp_data_t *lwpd = ttolxlwp(t); + fwaiter_t *fwp = &lwpd->br_fwaiter; + fwaiter_t *f_fwp; + int fpri, mypri; + int err; + int index; + /* volatile to silence gcc clobber warning for longjmp */ + volatile pid_t mytid; + pid_t ftid; /* current futex holder tid */ + proc_t *fproc = NULL; /* current futex holder proc */ + kthread_t *fthrd; /* current futex holder thread */ + volatile uint32_t oldval; + volatile uint_t loops = 0; + + if ((uintptr_t)addr >= KERNELBASE) + return (set_errno(EFAULT)); + + mytid = (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid); + + /* + * Have to take mutex first to prevent the following race with unlock: + * a) T1 sees a tid in the futex and atomically sets FUTEX_WAITERS. + * b) T2 calls unlock, sees there are waiters, but since nothing is in + * the queue yet, it simply returns with the futex now containing 0. 
+ * c) T1 proceeds to enqueue itself. + * At this point nothing will ever wake T1. + */ + index = HASH_FUNC(memid); +retry: + mutex_enter(&futex_hash[index].fh_lock); + + /* It would be very unusual to actually loop here. */ + oldval = 0; + /* CONSTCOND */ + while (1) { + uint32_t curval; + label_t ljb; + + /* + * Make a round trip through the lock if too many CAS failures + * occur, indicative of userspace tomfoolery. + */ + if (loops++ > CAS_LOOP_LIMIT) { + mutex_exit(&futex_hash[index].fh_lock); + goto retry; + } + + if (on_fault(&ljb)) { + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(EFAULT)); + } + + /* + * We optimistically try to set our tid on the off chance that + * the futex was released after we initiated the syscall. That + * may work but it is the unlikely path and is usually just our + * way of getting the current value. This also handles the + * retry in the case when the futex only has the high bits set. + */ + curval = atomic_cas_32(addr, oldval, mytid); + if (oldval == curval) { + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (0); + } + + oldval = curval; + ftid = oldval & FUTEX_TID_MASK; + /* high bits were only ones set, so we retry to set our tid */ + if (ftid == 0) { + no_fault(); + continue; + } + + if (ftid == mytid) { + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(EDEADLK)); + } + + /* The futex is currently held by another thread. */ + if (is_trylock) { + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(EAGAIN)); + } + + curval = atomic_cas_32(addr, oldval, oldval | FUTEX_WAITERS); + no_fault(); + if (curval == oldval) { + /* + * We set the WAITERS bit so now we can enqueue our + * thread on the mutex. This is the typical path. + */ + oldval |= FUTEX_WAITERS; + break; + } + + /* + * The rare case when a change snuck into the window between + * first getting the futex value and updating it; retry. 
+ */ + oldval = 0; + } + + /* + * Determine if the current futex holder's priority needs to inherit + * our priority (only if it should be increased). + * + * If a non-branded proc is sharing this futex(!?) then we don't + * interact with it. This seems like it would only occur maliciously. + * That proc will never be able to call futex(2) to unlock the futex. + * We just return ESRCH for this invalid case. + * + * Otherwise, get the holder's priority and if necessary, bump it up to + * our level. + */ + mutex_enter(&curproc->p_lock); + (void) CL_DOPRIO(curthread, kcred, 0, &mypri); + mutex_exit(&curproc->p_lock); + + if (lx_lpid_lock(ftid, curzone, 0, &fproc, &fthrd) != 0) { + label_t ljb; + + if (on_fault(&ljb) == 0) { + (void) atomic_cas_32(addr, oldval, + oldval | FUTEX_OWNER_DIED); + } + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(ESRCH)); + } + if (!PROC_IS_BRANDED(fproc)) { + mutex_exit(&fproc->p_lock); + mutex_exit(&futex_hash[index].fh_lock); + return (set_errno(ESRCH)); + } + + ASSERT(MUTEX_HELD(&fproc->p_lock)); + (void) CL_DOPRIO(fthrd, kcred, 0, &fpri); + + f_fwp = &lwptolxlwp(ttolwp(fthrd))->br_fwaiter; + if (mypri > fpri) { + /* Save holder's current pri if not already bumped up */ + if (!f_fwp->fw_pri_up) + f_fwp->fw_opri = fpri; + f_fwp->fw_pri_up = B_TRUE; + DTRACE_PROBE2(futex__lck__pri, int, mypri, int, fpri); + CL_DOPRIO(fthrd, kcred, mypri - fpri, &fpri); + } + + /* + * If we haven't already been bumped by some other thread then + * record our pri at time of enqueue. + */ + if (!fwp->fw_pri_up) { + fwp->fw_opri = mypri; + } + mutex_exit(&fproc->p_lock); + + /* + * Enqueue our thread on the mutex. This is similar to futex_wait(). + * See futex_wait() for LMS_USER_LOCK state description. 
+ */ + (void) new_mstate(t, LMS_USER_LOCK); + + fwp->fw_woken = 0; + fwp->fw_bits = 0; + fwp->fw_tid = mytid; + MEMID_COPY(memid, &fwp->fw_memid); + cv_init(&fwp->fw_cv, NULL, CV_DEFAULT, NULL); + + futex_hashin(fwp); + + err = 0; + while (fwp->fw_woken == 0 && err == 0) { + int ret; + + ret = cv_waituntil_sig(&fwp->fw_cv, &futex_hash[index].fh_lock, + timeout, timechanged); + if (ret < 0) { + err = set_errno(ETIMEDOUT); + } else if (ret == 0) { + /* EINTR is not valid for futex_lock_pi */ + err = set_errno(EAGAIN); + } + } + + /* + * The futex is normally hashed out in futex_unlock_pi. If we timed out + * or got a signal, we need to hash it out here instead. + */ + if (fwp->fw_woken == 0) + futex_hashout(fwp); + + mutex_exit(&futex_hash[index].fh_lock); + return (err); +} + +/* + * This must be a separate function to prevent compiler complaints about + * clobbering variables via longjmp (on_fault). When setting the new owner we + * must preserve the current WAITERS and OWNER_DIED bits. + */ +static int +futex_unlock_pi_waiter(fwaiter_t *fnd_fwp, uint32_t *addr, uint32_t curval) +{ + label_t ljb; + pid_t tid; + + if (on_fault(&ljb)) { + return (EFAULT); + } + + /* No waiter on this futex; again, not normal, but not an error. */ + if (fnd_fwp == NULL) { + int res = 0; + if (atomic_cas_32(addr, curval, + 0 | (curval & FUTEX_OWNER_DIED)) != curval) + res = EINVAL; + no_fault(); + return (res); + } + + tid = fnd_fwp->fw_tid | (curval & (FUTEX_WAITERS | FUTEX_OWNER_DIED)); + if (atomic_cas_32(addr, curval, tid) != curval) { + /* + * The value was changed behind our back, return an error and + * don't dequeue the waiter. + */ + no_fault(); + return (EINVAL); + } + + no_fault(); + + futex_hashout(fnd_fwp); + fnd_fwp->fw_woken = 1; + cv_signal(&fnd_fwp->fw_cv); + + return (0); +} + +/* + * Paired with futex_lock_pi; wake up highest priority thread that is blocked + * on the futex at memid. 
A non-zero 'clean_tid' argument is used for a PI + * futex during robust or trylock cleanup when the calling thread may not own + * the futex. During cleanup we check that the futex contains the expected + * tid to avoid cleanup races. + */ +static int +futex_unlock_pi(memid_t *memid, uint32_t *addr, pid_t clean_tid) +{ + kthread_t *t = curthread; + lx_lwp_data_t *lwpd = ttolxlwp(t); + fwaiter_t *fwp, *fnd_fwp; + uint32_t curval; + pid_t mytid; + pid_t holder_tid; + int index; + int hipri; + int res; + + if ((uintptr_t)addr >= KERNELBASE) + return (EFAULT); + + mytid = (lwpd->br_pid == curzone->zone_proc_initpid ? 1 : lwpd->br_pid); + + /* See comment in futex_lock_pi for why we take the mutex first. */ + index = HASH_FUNC(memid); + mutex_enter(&futex_hash[index].fh_lock); + + if (fuword32(addr, &curval)) { + mutex_exit(&futex_hash[index].fh_lock); + return (EFAULT); + } + + holder_tid = curval & FUTEX_TID_MASK; + if (clean_tid == 0) { + /* Not cleaning up so we must hold the futex */ + if (holder_tid != mytid) { + mutex_exit(&futex_hash[index].fh_lock); + return (EPERM); + } + } else { + /* + * We're doing cleanup but we want to check if another thread + * already did the cleanup due to a race before we took the + * futex_hash.fh_lock. + * + * There are two posible cases here: + * 1) During robust cleanup we already cleared the dead tid + * from the futex and set the FUTEX_OWNER_DIED bit. + * 2) During trylock cleanup we want to be sure the tid we + * saw in the futex before we took the futex_hash lock + * is still there and that we did not race with another + * trylock also doing cleanup. + */ + DTRACE_PROBE2(futex__unl__clean, int, curval, int, clean_tid); + if ((curval & FUTEX_OWNER_DIED) != 0) { + if (holder_tid != 0) { + mutex_exit(&futex_hash[index].fh_lock); + return (0); + } + } else if (holder_tid != clean_tid) { + mutex_exit(&futex_hash[index].fh_lock); + return (0); + } + } + + /* + * If necessary, restore our old priority. 
Since we only ever bump up + * the priority, our incr should be negative, but we allow for the + * case where the priority was lowered in some other way while we held + * the futex. Also, we only reset our priority on a true unlock, not + * when cleaning up, as indicated by clean_tid. + */ + if (clean_tid == 0) { + fwp = &lwpd->br_fwaiter; + if (fwp->fw_pri_up) { + int curpri; + int incr; + + mutex_enter(&curproc->p_lock); + CL_DOPRIO(curthread, kcred, 0, &curpri); + DTRACE_PROBE2(futex__unl__pri, int, fwp->fw_opri, + int, curpri); + incr = fwp->fw_opri - curpri; + if (incr < 0) { + CL_DOPRIO(curthread, kcred, incr, &curpri); + } + mutex_exit(&curproc->p_lock); + fwp->fw_pri_up = B_FALSE; + } + } + + /* + * Normally an application wouldn't make the syscall if the WAITERS + * bit is not set, but we also come through here on robust and trylock + * cleanup. Preserve the OWNER_DIED bit even though there are no + * waiters and we're just clearing the tid. + */ + if ((curval & FUTEX_WAITERS) == 0) { + res = 0; + label_t fjb; + + if (on_fault(&fjb)) { + mutex_exit(&futex_hash[index].fh_lock); + return (EFAULT); + } + if (atomic_cas_32(addr, curval, + 0 | (curval & FUTEX_OWNER_DIED)) != curval) { + res = EINVAL; + } + + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (res); + } + + /* Find the highest priority waiter. */ + hipri = BELOW_MINPRI; + fnd_fwp = NULL; + for (fwp = futex_hash[index].fh_waiters; fwp != NULL; + fwp = fwp->fw_next) { + if (MEMID_EQUAL(&fwp->fw_memid, memid)) { + if (fwp->fw_tid == 0) { + /* + * A non-PI waiter. It is invalid to mix PI and + * non-PI usage on the same futex. + */ + no_fault(); + mutex_exit(&futex_hash[index].fh_lock); + return (EINVAL); + } + /* + * Because futex_hashin inserts at the head of the list + * we want to find the oldest entry with the highest + * priority (hence >=). 
+ */ + if (fwp->fw_opri >= hipri) { + fnd_fwp = fwp; + hipri = fwp->fw_opri; + } + } + } + + res = futex_unlock_pi_waiter(fnd_fwp, addr, curval); + mutex_exit(&futex_hash[index].fh_lock); + return (res); +} + +/* + * Handle the case where the futex holder is gone and try to recover. Trylock + * will never enqueue on the futex and must return EAGAIN if it is held by + * a live process. + */ +static int +futex_trylock_pi(memid_t *memid, uint32_t *addr) +{ + uint32_t curval; + pid_t ftid; /* current futex holder tid */ + proc_t *fproc = NULL; /* current futex holder proc */ + kthread_t *fthrd; /* current futex holder thread */ + + if ((uintptr_t)addr >= KERNELBASE) + return (set_errno(EFAULT)); + + if (fuword32(addr, &curval)) + return (set_errno(EFAULT)); + + /* The futex is free, use the normal flow. */ + if (curval == 0) + return (futex_lock_pi(memid, addr, NULL, B_TRUE)); + + /* Determine if the current futex holder is still alive. */ + ftid = curval & FUTEX_TID_MASK; + if (lx_lpid_lock(ftid, curzone, 0, &fproc, &fthrd) == 0) { + mutex_exit(&fproc->p_lock); + } else { + /* + * The current holder is gone. Unlock then take the lock. + * Ignore any error that may result from two threads racing to + * cleanup. 
+ */ + (void) futex_unlock_pi(memid, addr, ftid); + } + return (futex_lock_pi(memid, addr, NULL, B_TRUE)); +} + +long +lx_futex(uintptr_t addr, int op, int val, uintptr_t lx_timeout, + uintptr_t addr2, int val3) +{ + struct as *as = curproc->p_as; + memid_t memid, memid2; + timestruc_t timeout; + timestruc_t *tptr = NULL; + int val2 = 0; + int rval = 0; + int cmd = op & FUTEX_CMD_MASK; + int private = op & FUTEX_PRIVATE_FLAG; + char dmsg[32]; + + /* the futex word must be aligned on a 4-byte (int) boundary */ + if (addr & 0x3) + return (set_errno(EINVAL)); + + /* Sanity check the futex command */ + if (cmd < 0 || cmd > FUTEX_MAX_CMD) + return (set_errno(EINVAL)); + + if (cmd == FUTEX_FD) { + /* + * FUTEX_FD was sentenced to death for grievous crimes of + * semantics against humanity; it has been ripped out of Linux + * and will never be supported by us. + */ + (void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd); + lx_unsupported(dmsg); + return (set_errno(ENOSYS)); + } + + switch (cmd) { + case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_CMP_REQUEUE_PI: + /* + * These are operations that we don't currently support, but + * may well need to in the future. For now, callers need to + * deal with these being missing -- but if and as that changes, + * they may well need to be implemented. + */ + (void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd); + lx_unsupported(dmsg); + return (set_errno(ENOSYS)); + } + + if ((op & FUTEX_CLOCK_REALTIME) && cmd != FUTEX_WAIT_BITSET) { + /* + * Linux only allows FUTEX_CLOCK_REALTIME to be set on the + * FUTEX_WAIT_BITSET and FUTEX_WAIT_REQUEUE_PI commands. + * FUTEX_WAIT_REQUEUE_PI has already been rejected above, so + * only FUTEX_WAIT_BITSET may carry the flag past this point. + */ + return (set_errno(ENOSYS)); + } + + /* Copy in the timeout structure from userspace.
*/ + if ((cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_LOCK_PI) && lx_timeout != (uintptr_t)NULL) { + rval = get_timeout((timespec_t *)lx_timeout, &timeout, cmd); + + if (rval != 0) + return (set_errno(rval)); + tptr = &timeout; + } + + switch (cmd) { + case FUTEX_REQUEUE: + case FUTEX_CMP_REQUEUE: + case FUTEX_WAKE_OP: + /* + * lx_timeout is nominally a pointer to a userspace address. + * For several commands, however, it actually contains + * an additional integer parameter. This is horrible, and + * the people who did this to us should be sorry. + */ + val2 = (int)lx_timeout; + } + + /* + * Translate the process-specific, user-space futex virtual + * address(es) to a universal memid. If the private bit is set, we + * can just use our address space (as) plus the virtual address, + * saving quite a bit of effort. + */ + if (private) { + memid.val[0] = (uintptr_t)as; + memid.val[1] = (uintptr_t)addr; + } else { + rval = as_getmemid(as, (void *)addr, &memid); + if (rval != 0) + return (set_errno(rval)); + } + + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_WAKE_OP) { + if (addr2 & 0x3) + return (set_errno(EINVAL)); + + if (private) { + memid2.val[0] = (uintptr_t)as; + memid2.val[1] = (uintptr_t)addr2; + } else { + rval = as_getmemid(as, (void *)addr2, &memid2); + if (rval) + return (set_errno(rval)); + } + } + + switch (cmd) { + case FUTEX_WAIT: + rval = futex_wait(&memid, (void *)addr, val, + tptr, FUTEX_BITSET_MATCH_ANY, B_FALSE); + break; + + case FUTEX_WAIT_BITSET: + rval = futex_wait(&memid, (void *)addr, val, tptr, val3, + (tptr == NULL || (op & FUTEX_CLOCK_REALTIME) != 0) ?
+ B_FALSE : B_TRUE); + break; + + case FUTEX_WAKE: + rval = futex_wake(&memid, val, FUTEX_BITSET_MATCH_ANY); + break; + + case FUTEX_WAKE_BITSET: + rval = futex_wake(&memid, val, val3); + break; + + case FUTEX_WAKE_OP: + rval = futex_wake_op(&memid, (void *)addr2, &memid2, + val, val2, val3); + break; + + case FUTEX_CMP_REQUEUE: + rval = futex_requeue(&memid, &memid2, val, + val2, (void *)addr2, &val3); + + break; + + case FUTEX_REQUEUE: + /* + * Per Linux futex(2), FUTEX_REQUEUE is the same as + * FUTEX_CMP_REQUEUE, except val3 is ignored. futex_requeue() + * will elide the val3 check if cmpval (the last argument) is + * NULL. + */ + rval = futex_requeue(&memid, &memid2, val, + val2, (void *)addr2, NULL); + + break; + + case FUTEX_LOCK_PI: + rval = futex_lock_pi(&memid, (uint32_t *)addr, tptr, B_FALSE); + break; + + case FUTEX_TRYLOCK_PI: + rval = futex_trylock_pi(&memid, (uint32_t *)addr); + break; + + case FUTEX_UNLOCK_PI: + rval = futex_unlock_pi(&memid, (uint32_t *)addr, 0); + if (rval != 0) + (void) set_errno(rval); + break; + } + + return (rval); +} + +/* + * Wake the next waiter if the thread holding the futex has exited without + * releasing the futex. + */ +static void +futex_robust_wake(memid_t *memid, uint32_t tid) +{ + fwaiter_t *fwp; + int index; + + index = HASH_FUNC(memid); + + mutex_enter(&futex_hash[index].fh_lock); + + for (fwp = futex_hash[index].fh_waiters; fwp != NULL; + fwp = fwp->fw_next) { + if (MEMID_EQUAL(&fwp->fw_memid, memid)) + break; + } + + if (fwp != NULL) { + if (fwp->fw_tid != 0) { + /* + * This is a PI futex and there is a waiter; unlock the + * futex in cleanup mode. Ignore errors, which are very + * unlikely, but could happen if the futex was in an + * unexpected state due to some other cleanup, such as + * might happen with a concurrent trylock call. + * + * NOTE(review): memid->val[1] is handed to + * futex_unlock_pi() as the user address. That matches + * the private memid built in lx_futex(), but the memid + * here comes from as_getmemid() in + * lx_futex_robust_drop() -- confirm val[1] is the + * address in that case too.
+ */ + mutex_exit(&futex_hash[index].fh_lock); + (void) futex_unlock_pi(memid, + (uint32_t *)(uintptr_t)memid->val[1], tid); + return; + } + + /* non-PI futex, just wake the first matching waiter */ + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + } + + mutex_exit(&futex_hash[index].fh_lock); +} + +/* + * Does the dirty work of actually dropping a held robust lock in the event + * of the untimely death of the owner; see lx_futex_robust_exit(), below. + * The futex word is read with fuword32_noerr(), so this runs under the + * on_fault() handler installed by lx_futex_robust_exit(). The lock is only + * marked FUTEX_OWNER_DIED if its TID field still equals tid. + */ +static void +lx_futex_robust_drop(uintptr_t addr, uint32_t tid) +{ + memid_t memid; + uint32_t oldval, newval; + + VERIFY(addr + sizeof (uint32_t) < KERNELBASE); + + do { + fuword32_noerr((void *)addr, &oldval); + + if ((oldval & FUTEX_TID_MASK) != tid) + return; + + newval = (oldval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval); + + /* + * We have now denoted that this lock's owner is dead; we need to + * wake any waiters. + */ + if (as_getmemid(curproc->p_as, (void *)addr, &memid) != 0) + return; + + futex_robust_wake(&memid, tid); +} + +/* + * Called when a thread is exiting. The role of the kernel is very clearly + * spelled out in the Linux design document entitled robust-futex-ABI.txt: + * we must (carefully!) iterate over the list of held locks pointed to by + * the robust list head; for each lock, we'll check to see if the calling + * (exiting) thread is the owner, and if so, denote that the lock is dead + * and wake any waiters. (The "pending" field of the head points to a lock + * that is in transition; it should be dropped if held.) If there are any + * errors through here at all (including memory operations), we abort the + * entire operation.
+ */ +void +lx_futex_robust_exit(uintptr_t addr, uint32_t tid) +{ + futex_robust_list_t list; + uintptr_t entry, next; + model_t model = get_udatamodel(); + int length = 0; + label_t ljb; + + if (on_fault(&ljb)) + return; + + if (addr + sizeof (futex_robust_list_t) >= KERNELBASE) + goto out; + + if (model == DATAMODEL_NATIVE) { + copyin_noerr((void *)addr, &list, sizeof (list)); + } +#if defined(_SYSCALL32_IMPL) + else { + futex_robust_list32_t list32; + + copyin_noerr((void *)addr, &list32, sizeof (list32)); + list.frl_head = list32.frl_head; + list.frl_offset = list32.frl_offset; + list.frl_pending = list32.frl_pending; + } +#endif + + /* + * Strip off the PI bit, if any. + */ + entry = list.frl_head & ~FUTEX_ROBUST_LOCK_PI; + + while (entry != addr && length++ < FUTEX_ROBUST_LIST_LIMIT) { + if (entry + list.frl_offset + sizeof (uint32_t) >= KERNELBASE) + goto out; + + if (model == DATAMODEL_NATIVE) { + fulword_noerr((void *)entry, &next); + } +#if defined(_SYSCALL32_IMPL) + else { + uint32_t next32; + fuword32_noerr((void *)entry, &next32); + next = next32; + } +#endif + + /* + * Drop the robust mutex -- but only if our pending lock didn't + * somehow sneak on there. + */ + if (entry != list.frl_pending) + lx_futex_robust_drop(entry + list.frl_offset, tid); + + /* + * Strip the PI bit from the link to the next entry with the + * same mask used for the list head; FUTEX_LOCK_PI is a futex + * command code, not a pointer tag. + */ + entry = next & ~FUTEX_ROBUST_LOCK_PI; + } + + /* + * Finally, drop the pending lock if there is one.
+ */ + if (list.frl_pending != (uint32_t)(uintptr_t)NULL && list.frl_pending + + list.frl_offset + sizeof (uint32_t) < KERNELBASE) + lx_futex_robust_drop(list.frl_pending + list.frl_offset, tid); + +out: + no_fault(); +} + +long +lx_set_robust_list(void *listp, size_t len) +{ + proc_t *p = curproc; + klwp_t *lwp = ttolwp(curthread); + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (len != sizeof (futex_robust_list_t)) + return (set_errno(EINVAL)); + } +#if defined(_SYSCALL32_IMPL) + else { + if (len != sizeof (futex_robust_list32_t)) + return (set_errno(EINVAL)); + } +#endif + + /* + * To ensure that we are serialized with respect to any racing call + * to lx_get_robust_list(), we lock ourselves to set the value. (Note + * that sprlock_proc() is called with p_lock held and that + * sprunlock() drops p_lock.) + */ + mutex_enter(&p->p_lock); + sprlock_proc(p); + lwpd->br_robust_list = listp; + sprunlock(p); + + return (0); +} + +long +lx_get_robust_list(pid_t pid, void **listp, size_t *lenp) +{ + model_t model = get_udatamodel(); + proc_t *rproc; + kthread_t *rthr; + klwp_t *rlwp; + lx_lwp_data_t *rlwpd; + void *list; + int err = 0; + + if (pid == 0) { + /* + * A pid of 0 denotes the current thread; we lock the current + * process even though it isn't strictly necessary (we can't + * race with set_robust_list() because a thread may only set + * its robust list on itself). + */ + rproc = curproc; + rlwpd = lwptolxlwp(ttolwp(curthread)); + mutex_enter(&curproc->p_lock); + sprlock_proc(rproc); + } else { + if (lx_lpid_lock(pid, curzone, LXP_PRLOCK, &rproc, + &rthr) != 0) { + return (set_errno(ESRCH)); + } + + if (rproc->p_model != model || + (rlwp = ttolwp(rthr)) == NULL || + (rlwpd = lwptolxlwp(rlwp)) == NULL) { + /* + * The target process does not match our data model, or + * we couldn't find the LWP, or the target process is + * not branded.
+ */ + err = ESRCH; + goto out; + } + } + + if (curproc != rproc && + priv_proc_cred_perm(curproc->p_cred, rproc, NULL, VREAD) != 0) { + /* + * We don't have permission to examine the target; the + * process is still unlocked at 'out' before we fail. + */ + err = EPERM; + goto out; + } + + list = rlwpd->br_robust_list; + +out: + sprunlock(rproc); + + if (err != 0) + return (set_errno(err)); + + if (model == DATAMODEL_NATIVE) { + if (sulword(listp, (uintptr_t)list) != 0) + return (set_errno(EFAULT)); + + if (sulword(lenp, sizeof (futex_robust_list_t)) != 0) + return (set_errno(EFAULT)); + } +#if defined(_SYSCALL32_IMPL) + else { + if (suword32(listp, (uint32_t)(uintptr_t)list) != 0) + return (set_errno(EFAULT)); + + if (suword32(lenp, sizeof (futex_robust_list32_t)) != 0) + return (set_errno(EFAULT)); + } +#endif + + return (0); +} + +void +lx_futex_init(void) +{ + int i; + + for (i = 0; i < HASH_SIZE; i++) + mutex_init(&futex_hash[i].fh_lock, NULL, MUTEX_DEFAULT, NULL); +} + +int +lx_futex_fini(void) +{ + int i, err; + + err = 0; + for (i = 0; (err == 0) && (i < HASH_SIZE); i++) { + mutex_enter(&futex_hash[i].fh_lock); + if (futex_hash[i].fh_waiters != NULL) + err = EBUSY; + mutex_exit(&futex_hash[i].fh_lock); + } + return (err); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c new file mode 100644 index 0000000000..275a781fa0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getcwd.c @@ -0,0 +1,52 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc.
+ */ + +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/pathname.h> + +/* + * lx_getcwd() - Linux syscall semantics differ slightly: the return value is + * the length of the pathname copied out, including the terminating NUL byte. + */ +long +lx_getcwd(char *buf, int size) +{ + char path[MAXPATHLEN + 1]; + vnode_t *cwd; + int err; + int nbytes; + + /* Take a hold on the current directory vnode under p_lock. */ + mutex_enter(&curproc->p_lock); + cwd = PTOU(curproc)->u_cdir; + VN_HOLD(cwd); + mutex_exit(&curproc->p_lock); + + /* The hold is only needed for the duration of the path lookup. */ + err = vnodetopath(NULL, cwd, path, sizeof (path), CRED()); + VN_RELE(cwd); + if (err != 0) + return (set_errno(err)); + + /* The caller's buffer must have room for the path and its NUL. */ + nbytes = strlen(path) + 1; + if (size < nbytes) + return (set_errno(ERANGE)); + + if (copyout(path, buf, nbytes) != 0) + return (set_errno(EFAULT)); + + return (nbytes); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getdents.c b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c new file mode 100644 index 0000000000..5bde892aea --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c @@ -0,0 +1,416 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2017 Joyent, Inc.
+ */ + +#include <sys/systm.h> +#include <sys/filio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/inttypes.h> +#include <sys/vnode.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/sunddi.h> + +#include <sys/lx_types.h> +#include <sys/lx_misc.h> + +#define LX_NAMEMAX 256 + +#define LX_GETDENTS_MAX_BUFSZ 65536 + +/* + * See the comment in our lx_sysfs VFS code for a detailed explanation around + * the handling of 'd_type' here. + */ +#define LX_DT_UNKNOWN 0 +#define LX_DT_FIFO 1 +#define LX_DT_CHR 2 +#define LX_DT_DIR 4 +#define LX_DT_BLK 6 +#define LX_DT_REG 8 +#define LX_DT_LNK 10 +#define LX_DT_SOCK 12 + +/* + * Set by lx_sysfs when it loads. lx_sysfs depends on the lx_brand module, + * so our module has to load first and define the variables that lx_sysfs will + * set when it loads. + */ +int lx_sysfs_vfs_type; +int (*lx_sysfs_vtype)(ino_t); + +/* + * Because the Linux dirent has an extra field (d_type), it's possible that + * each entry will be 8 bytes larger (and aligned to 8 bytes) due to padding. + * To prevent overrun during translation, the illumos-native buffer is sized + * pessimistically. + */ +#define LTOS_GETDENTS_BUFSZ(bufsz, datasz) \ + (((bufsz) / (((datasz) + 15) & ~7)) * sizeof (struct dirent)) + +/* + * Linux d_type offset is at (d_reclen - 1). See the Linux getdents(2) man page. + * This macro assumes d_reclen is already set correctly. The argument and the + * whole expansion are parenthesized so the macro is safe with any pointer + * expression; the result is still usable as an assignment target. + */ +#define LX_DTYPE(l) (*(((char *)(l)) + ((l)->d_reclen - 1))) + +/* + * Record must be long enough to house d_name string, null terminator and + * d_type field. It's then padded to nearest 8-byte boundary + */ +#define LX_RECLEN(l, t) \ + ((offsetof(t, d_name) + 2 + (l) + 7) & ~7) + +/* + * Bytes after d_name string until d_reclen should be zeroed.
+ * Includes zero-terminating d_name + */ +#define LX_ZEROLEN(l, t) \ + (LX_RECLEN(l, t) - \ + ((offsetof(t, d_name) + (l)))) + +/* + * The output format of getdents differs if the caller is 32 or 64 bit. The + * d_type member below is only a placeholder: the byte actually used for + * d_type is the last one of the record (see LX_DTYPE). + */ +struct lx_dirent_32 { + uint32_t d_ino; + int32_t d_off; + ushort_t d_reclen; + char d_name[1]; + uchar_t d_type; +}; + +struct lx_dirent_64 { + uint64_t d_ino; + int64_t d_off; + ushort_t d_reclen; + char d_name[1]; + uchar_t d_type; +}; + +static long +lx_getdents_common(int fd, caddr_t uptr, size_t count, + unsigned int lx_size, int (*outcb)(caddr_t, caddr_t, int, boolean_t)) +{ + vnode_t *vp; + boolean_t is_sysfs = B_FALSE; + file_t *fp; + struct uio auio; + struct iovec aiov; + int error, at_eof; + int sbufsz, lbufsz, bufsz; + void *lbuf, *sbuf; + size_t outb = 0; + + if (count < lx_size) { + return (set_errno(EINVAL)); + } + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + if (vp->v_type != VDIR) { + releasef(fd); + return (set_errno(ENOTDIR)); + } + if (!(fp->f_flag & FREAD)) { + releasef(fd); + return (set_errno(EBADF)); + } + + if (vp->v_vfsp->vfs_fstype == lx_sysfs_vfs_type) { + is_sysfs = B_TRUE; + } + + if (count > LX_GETDENTS_MAX_BUFSZ) { + /* + * If the target buffer passed to us is huge, keep the + * translation buffers moderate in size. Iteration will be + * used to fill the request. + */ + lbufsz = LX_GETDENTS_MAX_BUFSZ; + sbufsz = LTOS_GETDENTS_BUFSZ(LX_GETDENTS_MAX_BUFSZ, lx_size); + } else if (count < (lx_size + MAXPATHLEN)) { + /* + * If the target buffer is tiny, allocate a Linux-format buffer + * big enough to hold at least one max-length row while keeping + * the illumos-format buffer pessimistic in size. + * + * Assuming the buffer is truly tiny, it's likely that the + * result will not fit and an EINVAL will be tossed.
+ */ + lbufsz = (lx_size + MAXPATHLEN); + sbufsz = MAX((LTOS_GETDENTS_BUFSZ(count, lx_size)), + sizeof (struct dirent)); + } else { + lbufsz = count; + sbufsz = LTOS_GETDENTS_BUFSZ(count, lx_size); + } + bufsz = sbufsz; + lbuf = kmem_alloc(lbufsz, KM_SLEEP); + sbuf = kmem_alloc(sbufsz, KM_SLEEP); + + aiov.iov_base = sbuf; + aiov.iov_len = sbufsz; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = sbufsz; + auio.uio_fmode = 0; + auio.uio_extflg = UIO_COPY_CACHED; + + /* + * Since we use a conservative buffer allocation for the differing + * struct sizing and Linux places fewer limits on getdents buffers in + * general, there's a chance we'll undershoot on the record count. + * When this happens, we can simply repeat the READDIR operation until + * the available records are exhausted or we've filled the user buffer. + */ + do { + int res; + + (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL); + error = VOP_READDIR(vp, &auio, fp->f_cred, &at_eof, NULL, 0); + VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL); + if (error != 0 || auio.uio_resid == sbufsz) { + break; + } + res = outcb(sbuf, lbuf, bufsz - auio.uio_resid, is_sysfs); + VERIFY(res <= lbufsz); + if (res == 0) { + /* no records to copy out from this batch */ + break; + } else if (res > count) { + /* + * For very small buffer sizes, it's possible that a + * single record is too large due to a long filename. + */ + error = EINVAL; + break; + } + + VERIFY(outb + res <= count); + if (copyout(lbuf, (void *)(uptr + outb), res) != 0) { + error = EFAULT; + break; + } + outb += res; + + /* + * We undershot the request buffer. + * Reset for another READDIR, taking care not to overshoot.
+ */ + bufsz = MIN(sbufsz, LTOS_GETDENTS_BUFSZ(count - outb, lx_size)); + auio.uio_resid = bufsz; + aiov.iov_len = bufsz; + aiov.iov_base = sbuf; + + /* + * Continued progress is allowed only if EOF has not been + * reached and there is enough remaining buffer space to hold + * an entry with a max-length filename. + */ + } while (at_eof == 0 && (count - outb) >= (lx_size + MAXPATHLEN)); + + kmem_free(lbuf, lbufsz); + kmem_free(sbuf, sbufsz); + + if (error) { + releasef(fd); + return (set_errno(error)); + } + + fp->f_offset = auio.uio_loffset; + releasef(fd); + return (outb); +} + +static int +lx_get_sysfs_dtype(ino_t ino) +{ + vtype_t vt; + + vt = lx_sysfs_vtype(ino); + + switch (vt) { + case VREG: return (LX_DT_REG); + case VDIR: return (LX_DT_DIR); + case VBLK: return (LX_DT_BLK); + case VCHR: return (LX_DT_CHR); + case VLNK: return (LX_DT_LNK); + case VFIFO: return (LX_DT_FIFO); + case VSOCK: return (LX_DT_SOCK); + default: return (LX_DT_UNKNOWN); + } +} + +static int +lx_getdents_format32(caddr_t sbuf, caddr_t lbuf, int len, boolean_t is_sysfs) +{ + struct dirent *sd; + struct lx_dirent_32 *ld; + int namelen; + int size = 0; + + while (len > 0) { + /* LINTED: alignment */ + sd = (struct dirent *)sbuf; + /* LINTED: alignment */ + ld = (struct lx_dirent_32 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN(namelen, + struct lx_dirent_32); + /* Zero the terminator, any alignment padding and d_type (LX_DT_UNKNOWN) */ + bzero(ld->d_name + namelen, + LX_ZEROLEN(namelen, struct lx_dirent_32)); + + if (is_sysfs) { + LX_DTYPE(ld) = lx_get_sysfs_dtype(ld->d_ino); + } + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + +static int +lx_getdents_format64(caddr_t sbuf, caddr_t lbuf, int len, boolean_t is_sysfs) +{ + struct dirent *sd; + struct lx_dirent_64
*ld; + int namelen; + int size = 0; + + while (len > 0) { + /* LINTED: alignment */ + sd = (struct dirent *)sbuf; + /* LINTED: alignment */ + ld = (struct lx_dirent_64 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN(namelen, + struct lx_dirent_64); + /* Zero the terminator, any alignment padding and d_type (LX_DT_UNKNOWN) */ + bzero(ld->d_name + namelen, + LX_ZEROLEN(namelen, struct lx_dirent_64)); + + if (is_sysfs) { + LX_DTYPE(ld) = lx_get_sysfs_dtype(ld->d_ino); + } + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + +long +lx_getdents_32(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent_32), lx_getdents_format32)); +} + +long +lx_getdents_64(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent_64), lx_getdents_format64)); +} + +struct lx_dirent64 { + uint64_t d_ino; + int64_t d_off; + ushort_t d_reclen; + uchar_t d_type; + char d_name[1]; +}; + +#define LX_RECLEN64(namelen) \ + ((offsetof(struct lx_dirent64, d_name) + 1 + (namelen) + 7) & ~7) + +#define LX_ZEROLEN64(namelen) \ + (LX_RECLEN64(namelen) - \ + ((offsetof(struct lx_dirent64, d_name) + (namelen)))) + +static int +lx_getdents64_format(caddr_t sbuf, caddr_t lbuf, int len, boolean_t is_sysfs) +{ + struct dirent *sd; + struct lx_dirent64 *ld; + int namelen; + int size = 0; + + while (len > 0) { + /* LINTED: alignment */ + sd = (struct dirent *)sbuf; + /* LINTED: alignment */ + ld = (struct lx_dirent64 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + ld->d_type = LX_DT_UNKNOWN; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN64(namelen); + /*
Zero out the terminator and any alignment padding */ + bzero(ld->d_name + namelen, LX_ZEROLEN64(namelen)); + + if (is_sysfs) { + ld->d_type = lx_get_sysfs_dtype(ld->d_ino); + } + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + + +long +lx_getdents64(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent64), lx_getdents64_format)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c new file mode 100644 index 0000000000..0ebd93304e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2017 Joyent, Inc.
+ */ + +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* + * Return the pid as seen by Linux (br_tgid); the zone's init process is + * always reported as pid 1. + */ +long +lx_getpid(void) +{ + lx_lwp_data_t *lwpd; + + if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) + return (1); + + lwpd = ttolxlwp(curthread); + VERIFY(lwpd != NULL); + + return (lwpd->br_tgid); +} + +/* + * Return the parent pid, as computed by lx_lwp_ppid() for the current LWP. + */ +long +lx_getppid(void) +{ + return (lx_lwp_ppid(ttolwp(curthread), NULL, NULL)); +} + +/* + * Return the thread id (br_pid); 1 is reported when it matches the zone's + * init pid. + */ +long +lx_gettid(void) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + if (lwpd->br_pid == curzone->zone_proc_initpid) + return (1); + + return (lwpd->br_pid); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c new file mode 100644 index 0000000000..acc4073483 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c @@ -0,0 +1,33 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* + * From "uts/common/syscall/getrandom.c": + */ +extern int getrandom(void *, size_t, int); + +long +lx_getrandom(void *bufp, size_t buflen, int flags) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + /* + * Per signal(7), getrandom(2) is restartable, so mark this system + * call as such before handing off to the native getrandom().
+ */ + lwpd->br_syscall_restart = B_TRUE; + + return (getrandom(bufp, buflen, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_id.c b/usr/src/uts/common/brand/lx/syscall/lx_id.c new file mode 100644 index 0000000000..67f0fc9e5e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_id.c @@ -0,0 +1,509 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc.
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/zone.h> +#include <sys/cred.h> +#include <sys/cred_impl.h> +#include <sys/policy.h> +#include <sys/lx_types.h> + +#define LX_NGROUPS_MAX 32 + +/* From usr/src/uts/common/syscall/gid.c & uid.c */ +extern int setgid(gid_t); +extern int setregid(gid_t, gid_t); +extern int setreuid(uid_t, uid_t); +extern int setuid(uid_t); + +/* From usr/src/uts/common/syscall/groups.c */ +extern int setgroups(int, gid_t *); + +long +lx_getegid(void) +{ + return (crgetgid(CRED())); +} + +long +lx_getegid16(void) +{ + return ((int)LX_GID32_TO_GID16(crgetgid(CRED()))); +} + +long +lx_geteuid(void) +{ + return (crgetuid(CRED())); +} + +long +lx_geteuid16(void) +{ + return ((int)LX_UID32_TO_UID16(crgetuid(CRED()))); +} + +long +lx_getgid(void) +{ + return (crgetrgid(CRED())); +} + +long +lx_getgid16(void) +{ + return ((int)LX_GID32_TO_GID16(crgetrgid(CRED()))); +} + +long +lx_getuid(void) +{ + return (crgetruid(CRED())); +} + +long +lx_getuid16(void) +{ + return ((int)LX_UID32_TO_UID16(crgetruid(CRED()))); +} + +long +lx_setgid(gid_t gid) +{ + return (setgid(gid)); +} + +long +lx_setgid16(lx_gid16_t gid) +{ + return (setgid(LX_GID16_TO_GID32(gid))); +} + +long +lx_setregid(gid_t rgid, gid_t egid) +{ + return (setregid(rgid, egid)); +} + +long +lx_setregid16(lx_gid16_t rgid, lx_gid16_t egid) +{ + return (setregid(LX_UID16_TO_UID32(rgid), LX_UID16_TO_UID32(egid))); +} + +long +lx_setreuid(uid_t ruid, uid_t euid) +{ + return (setreuid(ruid, euid)); +} + +long +lx_setreuid16(lx_uid16_t ruid, lx_uid16_t euid) +{ + return (setreuid(LX_UID16_TO_UID32(ruid), LX_UID16_TO_UID32(euid))); +} + +long +lx_setuid(uid_t uid) +{ + return (setuid(uid)); +} + +long +lx_setuid16(lx_uid16_t uid) +{ + return (setuid(LX_UID16_TO_UID32(uid))); +} + +/* + * This function is based on setreuid in common/syscall/uid.c and exists + * because illumos does not have a way to explicitly set the saved uid (suid) + * from any other 
system call. + */ +long +lx_setresuid(lx_uid_t ruid, lx_uid_t euid, lx_uid_t suid) +{ + proc_t *p; + int error = 0; + int do_nocd = 0; + int uidchge = 0; + uid_t oldruid = ruid; + cred_t *cr, *newcr; + zoneid_t zoneid = getzoneid(); + + if ((ruid != -1 && (ruid > MAXUID)) || + (euid != -1 && (euid > MAXUID)) || + (suid != -1 && (suid > MAXUID))) { + error = EINVAL; + goto done; + } + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + + p = ttoproc(curthread); + +retry: + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if (ruid != -1 && + ruid != cr->cr_ruid && ruid != cr->cr_uid && + ruid != cr->cr_suid && secpolicy_allow_setid(cr, ruid, B_FALSE)) { + error = EPERM; + } else if (euid != -1 && + euid != cr->cr_ruid && euid != cr->cr_uid && + euid != cr->cr_suid && secpolicy_allow_setid(cr, euid, B_FALSE)) { + error = EPERM; + } else if (suid != -1 && + suid != cr->cr_ruid && suid != cr->cr_uid && + suid != cr->cr_suid && secpolicy_allow_setid(cr, suid, B_FALSE)) { + error = EPERM; + } else { + if (!uidchge && ruid != -1 && cr->cr_ruid != ruid) { + /* + * The ruid of the process is going to change. In order + * to avoid a race condition involving the + * process count associated with the newly given ruid, + * we increment the count before assigning the + * credential to the process. + * To do that, we'll have to take pidlock, so we first + * release p_crlock. + */ + mutex_exit(&p->p_crlock); + uidchge = 1; + mutex_enter(&pidlock); + upcount_inc(ruid, zoneid); + mutex_exit(&pidlock); + /* + * As we released p_crlock we can't rely on the cr + * we read. So retry the whole thing. + */ + goto retry; + } + crhold(cr); + crcopy_to(cr, newcr); + p->p_cred = newcr; + + if (euid != -1) + newcr->cr_uid = euid; + if (suid != -1) + newcr->cr_suid = suid; + if (ruid != -1) { + oldruid = newcr->cr_ruid; + newcr->cr_ruid = ruid; + ASSERT(ruid != oldruid ? 
uidchge : 1); + } + + /* + * A process that gives up its privilege + * must be marked to produce no core dump. + */ + if ((cr->cr_uid != newcr->cr_uid || + cr->cr_ruid != newcr->cr_ruid || + cr->cr_suid != newcr->cr_suid)) + do_nocd = 1; + + crfree(cr); + } + mutex_exit(&p->p_crlock); + + /* + * We decrement the number of processes associated with the oldruid + * to match the increment above, even if the ruid of the process + * did not change or an error occurred (oldruid == uid). + */ + if (uidchge) { + ASSERT(oldruid != -1 && ruid != -1); + mutex_enter(&pidlock); + upcount_dec(oldruid, zoneid); + mutex_exit(&pidlock); + } + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + goto done; + } + crfree(newcr); +done: + if (error) + return (set_errno(error)); + else + return (0); +} + +long +lx_setresuid16(lx_uid16_t ruid16, lx_uid16_t euid16, lx_uid16_t suid16) +{ + long rval; + + rval = lx_setresuid( + LX_UID16_TO_UID32(ruid16), + LX_UID16_TO_UID32(euid16), + LX_UID16_TO_UID32(suid16)); + + return (rval); +} + +/* + * This function is based on setregid in common/syscall/gid.c + */ +long +lx_setresgid(lx_gid_t rgid, lx_gid_t egid, lx_gid_t sgid) +{ + proc_t *p; + int error = 0; + int do_nocd = 0; + cred_t *cr, *newcr; + + if ((rgid != -1 && (rgid > MAXUID)) || + (egid != -1 && (egid > MAXUID)) || + (sgid != -1 && (sgid > MAXUID))) { + error = EINVAL; + goto done; + } + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. 
+ */ + newcr = cralloc(); + + p = ttoproc(curthread); + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if (rgid != -1 && + rgid != cr->cr_rgid && rgid != cr->cr_gid && + rgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) { + error = EPERM; + } else if (egid != -1 && + egid != cr->cr_rgid && egid != cr->cr_gid && + egid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) { + error = EPERM; + } else if (sgid != -1 && + sgid != cr->cr_rgid && sgid != cr->cr_gid && + sgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) { + error = EPERM; + } else { + crhold(cr); + crcopy_to(cr, newcr); + p->p_cred = newcr; + + if (egid != -1) + newcr->cr_gid = egid; + if (sgid != -1) + newcr->cr_sgid = sgid; + if (rgid != -1) + newcr->cr_rgid = rgid; + + /* + * A process that gives up its privilege + * must be marked to produce no core dump. + */ + if ((cr->cr_gid != newcr->cr_gid || + cr->cr_rgid != newcr->cr_rgid || + cr->cr_sgid != newcr->cr_sgid)) + do_nocd = 1; + + crfree(cr); + } + mutex_exit(&p->p_crlock); + + if (error == 0) { + if (do_nocd) { + mutex_enter(&p->p_lock); + p->p_flag |= SNOCD; + mutex_exit(&p->p_lock); + } + crset(p, newcr); /* broadcast to process threads */ + goto done; + } + crfree(newcr); +done: + if (error) + return (set_errno(error)); + else + return (0); +} + +long +lx_setresgid16(lx_gid16_t rgid16, lx_gid16_t egid16, lx_gid16_t sgid16) +{ + long rval; + + rval = lx_setresgid( + LX_GID16_TO_GID32(rgid16), + LX_GID16_TO_GID32(egid16), + LX_GID16_TO_GID32(sgid16)); + + return (rval); +} + +/* + * Linux defines NGROUPS_MAX to be 32, but on illumos it is only 16. We employ + * the terrible hack below so that tests may proceed, if only on DEBUG kernels. 
+ */ +int +lx_helper_setgroups(int ngroups, gid_t *grouplist) +{ +#ifdef DEBUG + if (ngroups > ngroups_max && ngroups <= LX_NGROUPS_MAX) + ngroups = ngroups_max; +#endif /* DEBUG */ + + return (setgroups(ngroups, grouplist)); +} + +long +lx_getresuid(lx_uid_t *ruid, lx_uid_t *euid, lx_uid_t *suid) +{ + lx_uid_t lx_ruid, lx_euid, lx_suid; + cred_t *cr = CRED(); + + lx_ruid = (lx_uid_t)crgetruid(cr); + lx_euid = (lx_uid_t)crgetuid(cr); + lx_suid = (lx_uid_t)crgetsuid(cr); + + if (copyout(&lx_ruid, (void *)ruid, sizeof (lx_uid_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_euid, (void *)euid, sizeof (lx_uid_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_suid, (void *)suid, sizeof (lx_uid_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_getresuid16(lx_uid16_t *ruid16, lx_uid16_t *euid16, lx_uid16_t *suid16) +{ + lx_uid16_t lx_ruid16, lx_euid16, lx_suid16; + cred_t *cr = CRED(); + + lx_ruid16 = LX_UID32_TO_UID16((lx_uid_t)crgetruid(cr)); + lx_euid16 = LX_UID32_TO_UID16((lx_uid_t)crgetuid(cr)); + lx_suid16 = LX_UID32_TO_UID16((lx_uid_t)crgetsuid(cr)); + + if (copyout(&lx_ruid16, (void *)ruid16, sizeof (lx_uid16_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_euid16, (void *)euid16, sizeof (lx_uid16_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_suid16, (void *)suid16, sizeof (lx_uid16_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_getresgid(lx_gid_t *rgid, lx_gid_t *egid, lx_gid_t *sgid) +{ + lx_gid_t lx_rgid, lx_egid, lx_sgid; + cred_t *cr = CRED(); + + lx_rgid = (lx_gid_t)crgetrgid(cr); + lx_egid = (lx_gid_t)crgetgid(cr); + lx_sgid = (lx_gid_t)crgetsgid(cr); + + if (copyout(&lx_rgid, (void *)rgid, sizeof (lx_gid_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_egid, (void *)egid, sizeof (lx_gid_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_sgid, (void *)sgid, sizeof (lx_gid_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +long 
+lx_getresgid16(lx_gid16_t *rgid16, lx_gid16_t *egid16, lx_gid16_t *sgid16) +{ + lx_gid16_t lx_rgid16, lx_egid16, lx_sgid16; + cred_t *cr = CRED(); + + lx_rgid16 = LX_GID32_TO_GID16((lx_gid_t)crgetrgid(cr)); + lx_egid16 = LX_GID32_TO_GID16((lx_gid_t)crgetgid(cr)); + lx_sgid16 = LX_GID32_TO_GID16((lx_gid_t)crgetsgid(cr)); + + if (copyout(&lx_rgid16, (void *)rgid16, sizeof (lx_gid16_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_egid16, (void *)egid16, sizeof (lx_gid16_t)) != 0) + return (set_errno(EFAULT)); + if (copyout(&lx_sgid16, (void *)sgid16, sizeof (lx_gid16_t)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* + * The lx brand cannot support the setfs[ug]id16/setfs[ug]id calls as that + * would require significant rework of the illumos privilege mechanisms, so + * instead return the current effective [ug]id. + * + * In Linux, fsids track effective IDs, so returning the effective IDs works + * as a substitute; returning the current value also denotes failure of the + * call if the caller had specified something different. We don't need to + * worry about setting error codes because the Linux calls don't set any. + */ +/*ARGSUSED*/ +long +lx_setfsuid16(uid_t fsuid16) +{ + return ((int)LX_UID32_TO_UID16(crgetuid(CRED()))); +} + +/*ARGSUSED*/ +long +lx_setfsgid16(gid_t fsgid16) +{ + return ((int)LX_GID32_TO_GID16(crgetgid(CRED()))); +} + +/*ARGSUSED*/ +long +lx_setfsuid(uid_t fsuid) +{ + return (crgetuid(CRED())); +} + +/*ARGSUSED*/ +long +lx_setfsgid(gid_t fsgid) +{ + return (crgetgid(CRED())); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c new file mode 100644 index 0000000000..f745a90e41 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c @@ -0,0 +1,1901 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/termio.h> +#include <sys/termios.h> +#include <sys/ptyvar.h> +#include <net/if.h> +#include <net/if_dl.h> +#include <sys/sockio.h> +#include <sys/stropts.h> +#include <sys/ptms.h> +#include <sys/cred.h> +#include <sys/cred_impl.h> +#include <sys/sysmacros.h> +#include <sys/lx_misc.h> +#include <sys/lx_ptm.h> +#include <sys/brand.h> +#include <sys/sunddi.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/session.h> +#include <sys/kmem.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/if_arp.h> +#include <sys/ioccom.h> +#include <sys/dtrace.h> +#include <sys/ethernet.h> +#include <sys/dlpi.h> +#include <sys/lx_autofs.h> +#include <sys/netstack.h> +#include <inet/ip.h> +#include <inet/ip_if.h> +#include <sys/dkio.h> +#include <sys/sdt.h> + +/* + * Linux ioctl types + */ +#define LX_IOC_TYPE_HD 0x03 +#define LX_IOC_TYPE_BLK 0x12 +#define LX_IOC_TYPE_FD 0x54 +#define LX_IOC_TYPE_DTRACE 0x68 +#define LX_IOC_TYPE_SOCK 0x89 +#define LX_IOC_TYPE_AUTOFS 0x93 + +/* + * Supported ioctls + */ +#define LX_HDIO_GETGEO 0x0301 +#define LX_BLKGETSIZE 0x1260 +#define LX_BLKSSZGET 0x1268 +#define LX_BLKGETSIZE64 0x80081272 +#define LX_TCGETS 0x5401 +#define LX_TCSETS 0x5402 +#define LX_TCSETSW 0x5403 +#define LX_TCSETSF 0x5404 +#define LX_TCGETA 0x5405 +#define LX_TCSETA 0x5406 +#define LX_TCSETAW 0x5407 +#define LX_TCSETAF 0x5408 +#define LX_TCSBRK 0x5409 +#define LX_TCXONC 0x540a +#define LX_TCFLSH 0x540b 
+#define LX_TIOCEXCL 0x540c +#define LX_TIOCNXCL 0x540d +#define LX_TIOCSCTTY 0x540e +#define LX_TIOCGPGRP 0x540f +#define LX_TIOCSPGRP 0x5410 +#define LX_TIOCOUTQ 0x5411 +#define LX_TIOCSTI 0x5412 +#define LX_TIOCGWINSZ 0x5413 +#define LX_TIOCSWINSZ 0x5414 +#define LX_TIOCMGET 0x5415 +#define LX_TIOCMBIS 0x5416 +#define LX_TIOCMBIC 0x5417 +#define LX_TIOCMSET 0x5418 +#define LX_TIOCGSOFTCAR 0x5419 +#define LX_TIOCSSOFTCAR 0x541a +#define LX_FIONREAD 0x541b +#define LX_TIOCPKT 0x5420 +#define LX_FIONBIO 0x5421 +#define LX_TIOCNOTTY 0x5422 +#define LX_TIOCSETD 0x5423 +#define LX_TIOCGETD 0x5424 +#define LX_TCSBRKP 0x5425 +#define LX_TIOCGSID 0x5429 +#define LX_TIOCGPTN 0x80045430 +#define LX_TIOCSPTLCK 0x40045431 +#define LX_FIONCLEX 0x5450 +#define LX_FIOCLEX 0x5451 +#define LX_FIOASYNC 0x5452 +#define LX_FIOSETOWN 0x8901 +#define LX_SIOCSPGRP 0x8902 +#define LX_FIOGETOWN 0x8903 +#define LX_SIOCGPGRP 0x8904 +#define LX_SIOCATMARK 0x8905 +#define LX_SIOCGSTAMP 0x8906 +#define LX_SIOCADDRT 0x890b +#define LX_SIOCDELRT 0x890c +#define LX_SIOCRTMSG 0x890d +#define LX_SIOCGIFNAME 0x8910 +#define LX_SIOCSIFLINK 0x8911 +#define LX_SIOCGIFCONF 0x8912 +#define LX_SIOCGIFFLAGS 0x8913 +#define LX_SIOCSIFFLAGS 0x8914 +#define LX_SIOCGIFADDR 0x8915 +#define LX_SIOCSIFADDR 0x8916 +#define LX_SIOCGIFDSTADDR 0x8917 +#define LX_SIOCSIFDSTADDR 0x8918 +#define LX_SIOCGIFBRDADDR 0x8919 +#define LX_SIOCSIFBRDADDR 0x891a +#define LX_SIOCGIFNETMASK 0x891b +#define LX_SIOCSIFNETMASK 0x891c +#define LX_SIOCGIFMETRIC 0x891d +#define LX_SIOCSIFMETRIC 0x891e +#define LX_SIOCGIFMEM 0x891f +#define LX_SIOCSIFMEM 0x8920 +#define LX_SIOCGIFMTU 0x8921 +#define LX_SIOCSIFMTU 0x8922 +#define LX_SIOCSIFNAME 0x8923 +#define LX_SIOCSIFHWADDR 0x8924 +#define LX_SIOCGIFENCAP 0x8925 +#define LX_SIOCSIFENCAP 0x8926 +#define LX_SIOCGIFHWADDR 0x8927 +#define LX_SIOCGIFSLAVE 0x8929 +#define LX_SIOCSIFSLAVE 0x8930 +#define LX_SIOCADDMULTI 0x8931 +#define LX_SIOCDELMULTI 0x8932 +#define LX_SIOCGIFINDEX 0x8933 
+#define LX_SIOCSIFPFLAGS 0x8934 +#define LX_SIOCGIFPFLAGS 0x8935 +#define LX_SIOCDIFADDR 0x8936 +#define LX_SIOCSIFHWBROADCAST 0x8937 +#define LX_SIOCGIFCOUNT 0x8938 +#define LX_SIOCGIFBR 0x8940 +#define LX_SIOCSIFBR 0x8941 +#define LX_SIOCGIFTXQLEN 0x8942 +#define LX_SIOCSIFTXQLEN 0x8943 +#define LX_SIOCETHTOOL 0x8946 +#define LX_SIOCGMIIPHY 0x8947 +#define LX_SIOCGMIIREG 0x8948 +#define LX_SIOCSMIIREG 0x8949 +#define LX_SIOCWANDEV 0x894a +#define LX_SIOCOUTQNSD 0x894b +#define LX_SIOCDARP 0x8953 +#define LX_SIOCGARP 0x8954 +#define LX_SIOCSARP 0x8955 +#define LX_SIOCDRARP 0x8960 +#define LX_SIOCGRARP 0x8961 +#define LX_SIOCSRARP 0x8962 +#define LX_SIOCGIFMAP 0x8970 +#define LX_SIOCSIFMAP 0x8971 +#define LX_SIOCADDDLCI 0x8980 +#define LX_SIOCDELDLCI 0x8981 +#define LX_SIOCGIFVLAN 0x8982 +#define LX_SIOCSIFVLAN 0x8983 +#define LX_SIOCBONDENSLAVE 0x8990 +#define LX_SIOCBONDRELEASE 0x8991 +#define LX_SIOCBONDSETHWADDR 0x8992 +#define LX_SIOCBONDSLAVEINFOQUERY 0x8993 +#define LX_SIOCBONDINFOQUERY 0x8994 +#define LX_SIOCBONDCHANGEACTIVE 0x8995 +#define LX_SIOCBRADDBR 0x89a0 +#define LX_SIOCBRDELBR 0x89a1 +#define LX_SIOCBRADDIF 0x89a2 +#define LX_SIOCBRDELIF 0x89a3 +#define LX_SIOCSHWTSTAMP 0x89b0 +#define LX_SIOCGHWTSTAMP 0x89b1 +#define LX_SIOCDEVPRIVATE 0x89f0 +#define LX_SIOCPROTOPRIVATE 0x89e0 + +#define FLUSER(fp) fp->f_flag | get_udatamodel() +#define FLFAKE(fp) fp->f_flag | FKIOCTL + +/* + * LX_NCC must be different from LX_NCCS since while the termio and termios + * structures may look similar they are fundamentally different sizes and + * have different members. 
+ */ +#define LX_NCC 8 +#define LX_NCCS 19 + +struct lx_termio { + unsigned short c_iflag; /* input mode flags */ + unsigned short c_oflag; /* output mode flags */ + unsigned short c_cflag; /* control mode flags */ + unsigned short c_lflag; /* local mode flags */ + unsigned char c_line; /* line discipline */ + unsigned char c_cc[LX_NCC]; /* control characters */ +}; + +struct lx_termios { + uint32_t c_iflag; /* input mode flags */ + uint32_t c_oflag; /* output mode flags */ + uint32_t c_cflag; /* control mode flags */ + uint32_t c_lflag; /* local mode flags */ + unsigned char c_line; /* line discipline */ + unsigned char c_cc[LX_NCCS]; /* control characters */ +}; + +/* + * c_cc characters which are valid for lx_termio and lx_termios + */ +#define LX_VINTR 0 +#define LX_VQUIT 1 +#define LX_VERASE 2 +#define LX_VKILL 3 +#define LX_VEOF 4 +#define LX_VTIME 5 +#define LX_VMIN 6 +#define LX_VSWTC 7 + +/* + * c_cc characters which are valid for lx_termios + */ +#define LX_VSTART 8 +#define LX_VSTOP 9 +#define LX_VSUSP 10 +#define LX_VEOL 11 +#define LX_VREPRINT 12 +#define LX_VDISCARD 13 +#define LX_VWERASE 14 +#define LX_VLNEXT 15 +#define LX_VEOL2 16 + +/* + * Defaults needed for SunOS to Linux format conversion. 
+ * See INIT_C_CC in linux-stable/include/asm-generic/termios.h + */ +#define LX_DEF_VTIME 0 +#define LX_DEF_VMIN 1 +#define LX_DEF_VEOF '\004' +#define LX_DEF_VEOL 0 + +/* VSD key for lx_cc information */ +static uint_t lx_ioctl_vsd = 0; + + +/* Terminal helpers */ + +static void +l2s_termios(struct lx_termios *l_tios, struct termios *s_tios) +{ + ASSERT((l_tios != NULL) && (s_tios != NULL)); + + bzero(s_tios, sizeof (*s_tios)); + + s_tios->c_iflag = l_tios->c_iflag; + s_tios->c_oflag = l_tios->c_oflag; + s_tios->c_cflag = l_tios->c_cflag; + s_tios->c_lflag = l_tios->c_lflag; + + if (s_tios->c_lflag & ICANON) { + s_tios->c_cc[VEOF] = l_tios->c_cc[LX_VEOF]; + s_tios->c_cc[VEOL] = l_tios->c_cc[LX_VEOL]; + } else { + s_tios->c_cc[VMIN] = l_tios->c_cc[LX_VMIN]; + s_tios->c_cc[VTIME] = l_tios->c_cc[LX_VTIME]; + } + + s_tios->c_cc[VEOL2] = l_tios->c_cc[LX_VEOL2]; + s_tios->c_cc[VERASE] = l_tios->c_cc[LX_VERASE]; + s_tios->c_cc[VKILL] = l_tios->c_cc[LX_VKILL]; + s_tios->c_cc[VREPRINT] = l_tios->c_cc[LX_VREPRINT]; + s_tios->c_cc[VLNEXT] = l_tios->c_cc[LX_VLNEXT]; + s_tios->c_cc[VWERASE] = l_tios->c_cc[LX_VWERASE]; + s_tios->c_cc[VINTR] = l_tios->c_cc[LX_VINTR]; + s_tios->c_cc[VQUIT] = l_tios->c_cc[LX_VQUIT]; + s_tios->c_cc[VSWTCH] = l_tios->c_cc[LX_VSWTC]; + s_tios->c_cc[VSTART] = l_tios->c_cc[LX_VSTART]; + s_tios->c_cc[VSTOP] = l_tios->c_cc[LX_VSTOP]; + s_tios->c_cc[VSUSP] = l_tios->c_cc[LX_VSUSP]; + s_tios->c_cc[VDISCARD] = l_tios->c_cc[LX_VDISCARD]; +} + +static void +l2s_termio(struct lx_termio *l_tio, struct termio *s_tio) +{ + ASSERT((l_tio != NULL) && (s_tio != NULL)); + + bzero(s_tio, sizeof (*s_tio)); + + s_tio->c_iflag = l_tio->c_iflag; + s_tio->c_oflag = l_tio->c_oflag; + s_tio->c_cflag = l_tio->c_cflag; + s_tio->c_lflag = l_tio->c_lflag; + + if (s_tio->c_lflag & ICANON) { + s_tio->c_cc[VEOF] = l_tio->c_cc[LX_VEOF]; + } else { + s_tio->c_cc[VMIN] = l_tio->c_cc[LX_VMIN]; + s_tio->c_cc[VTIME] = l_tio->c_cc[LX_VTIME]; + } + + s_tio->c_cc[VINTR] = 
l_tio->c_cc[LX_VINTR]; + s_tio->c_cc[VQUIT] = l_tio->c_cc[LX_VQUIT]; + s_tio->c_cc[VERASE] = l_tio->c_cc[LX_VERASE]; + s_tio->c_cc[VKILL] = l_tio->c_cc[LX_VKILL]; + s_tio->c_cc[VSWTCH] = l_tio->c_cc[LX_VSWTC]; +} + +static void +termios2lx_cc(struct lx_termios *l_tios, struct lx_cc *lio) +{ + ASSERT((l_tios != NULL) && (lio != NULL)); + + bzero(lio, sizeof (*lio)); + + lio->veof = l_tios->c_cc[LX_VEOF]; + lio->veol = l_tios->c_cc[LX_VEOL]; + lio->vmin = l_tios->c_cc[LX_VMIN]; + lio->vtime = l_tios->c_cc[LX_VTIME]; +} + +static void +termio2lx_cc(struct lx_termio *l_tio, struct lx_cc *lio) +{ + ASSERT((l_tio != NULL) && (lio != NULL)); + + bzero(lio, sizeof (*lio)); + + lio->veof = l_tio->c_cc[LX_VEOF]; + lio->veol = 0; + lio->vmin = l_tio->c_cc[LX_VMIN]; + lio->vtime = l_tio->c_cc[LX_VTIME]; +} + +static void +s2l_termios(struct termios *s_tios, struct lx_termios *l_tios) +{ + ASSERT((s_tios != NULL) && (l_tios != NULL)); + + bzero(l_tios, sizeof (*l_tios)); + + l_tios->c_iflag = s_tios->c_iflag; + l_tios->c_oflag = s_tios->c_oflag; + l_tios->c_cflag = s_tios->c_cflag; + l_tios->c_lflag = s_tios->c_lflag; + + /* + * Since use of the VMIN/VTIME and VEOF/VEOL control characters is + * mutually exclusive (determined by ICANON), SunOS aliases them in the + * c_cc field in termio/termios. Linux does not perform this aliasing, + * so it expects that the default values are present regardless of + * ICANON status. + * + * These defaults can be overridden later by any values stored via the + * lx_cc mechanism. 
+ */ + if (s_tios->c_lflag & ICANON) { + l_tios->c_cc[LX_VEOF] = s_tios->c_cc[VEOF]; + l_tios->c_cc[LX_VEOL] = s_tios->c_cc[VEOL]; + l_tios->c_cc[LX_VTIME] = LX_DEF_VTIME; + l_tios->c_cc[LX_VMIN] = LX_DEF_VMIN; + + } else { + l_tios->c_cc[LX_VMIN] = s_tios->c_cc[VMIN]; + l_tios->c_cc[LX_VTIME] = s_tios->c_cc[VTIME]; + l_tios->c_cc[LX_VEOF] = LX_DEF_VEOF; + l_tios->c_cc[LX_VEOL] = LX_DEF_VEOL; + } + + l_tios->c_cc[LX_VEOL2] = s_tios->c_cc[VEOL2]; + l_tios->c_cc[LX_VERASE] = s_tios->c_cc[VERASE]; + l_tios->c_cc[LX_VKILL] = s_tios->c_cc[VKILL]; + l_tios->c_cc[LX_VREPRINT] = s_tios->c_cc[VREPRINT]; + l_tios->c_cc[LX_VLNEXT] = s_tios->c_cc[VLNEXT]; + l_tios->c_cc[LX_VWERASE] = s_tios->c_cc[VWERASE]; + l_tios->c_cc[LX_VINTR] = s_tios->c_cc[VINTR]; + l_tios->c_cc[LX_VQUIT] = s_tios->c_cc[VQUIT]; + l_tios->c_cc[LX_VSWTC] = s_tios->c_cc[VSWTCH]; + l_tios->c_cc[LX_VSTART] = s_tios->c_cc[VSTART]; + l_tios->c_cc[LX_VSTOP] = s_tios->c_cc[VSTOP]; + l_tios->c_cc[LX_VSUSP] = s_tios->c_cc[VSUSP]; + l_tios->c_cc[LX_VDISCARD] = s_tios->c_cc[VDISCARD]; +} + +static void +s2l_termio(struct termio *s_tio, struct lx_termio *l_tio) +{ + ASSERT((s_tio != NULL) && (l_tio != NULL)); + + bzero(l_tio, sizeof (*l_tio)); + + l_tio->c_iflag = s_tio->c_iflag; + l_tio->c_oflag = s_tio->c_oflag; + l_tio->c_cflag = s_tio->c_cflag; + l_tio->c_lflag = s_tio->c_lflag; + + if (s_tio->c_lflag & ICANON) { + l_tio->c_cc[LX_VEOF] = s_tio->c_cc[VEOF]; + l_tio->c_cc[LX_VTIME] = LX_DEF_VTIME; + l_tio->c_cc[LX_VMIN] = LX_DEF_VMIN; + } else { + l_tio->c_cc[LX_VMIN] = s_tio->c_cc[VMIN]; + l_tio->c_cc[LX_VTIME] = s_tio->c_cc[VTIME]; + l_tio->c_cc[LX_VEOF] = LX_DEF_VEOF; + } + + l_tio->c_cc[LX_VINTR] = s_tio->c_cc[VINTR]; + l_tio->c_cc[LX_VQUIT] = s_tio->c_cc[VQUIT]; + l_tio->c_cc[LX_VERASE] = s_tio->c_cc[VERASE]; + l_tio->c_cc[LX_VKILL] = s_tio->c_cc[VKILL]; + l_tio->c_cc[LX_VSWTC] = s_tio->c_cc[VSWTCH]; +} + +static void +set_lx_cc(vnode_t *vp, struct lx_cc *lio) +{ + struct lx_cc *cur; + /* + * Linux expects that 
the termio/termios control characters are + * preserved more strictly than illumos supports. In order to preserve + * the illusion that the characters are maintained, they are stored as + * vnode-specific data. + */ + mutex_enter(&vp->v_vsd_lock); + cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd); + if (cur == NULL) { + cur = kmem_alloc(sizeof (struct lx_cc), KM_SLEEP); + bcopy(lio, cur, sizeof (struct lx_cc)); + (void) vsd_set(vp, lx_ioctl_vsd, cur); + } else { + bcopy(lio, cur, sizeof (struct lx_cc)); + } + mutex_exit(&vp->v_vsd_lock); +} + +static int +get_lx_cc(vnode_t *vp, struct lx_cc *lio) +{ + struct lx_cc *cur; + int rv = 1; + mutex_enter(&vp->v_vsd_lock); + cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd); + if (cur != NULL) { + bcopy(cur, lio, sizeof (*lio)); + rv = 0; + } + mutex_exit(&vp->v_vsd_lock); + return (rv); +} + +/* Socket helpers */ + +typedef struct lx_ifreq32 { + char ifr_name[IFNAMSIZ]; + union { + struct sockaddr ifru_addr; + } ifr_ifrn; +} lx_ifreq32_t; + +typedef struct lx_ifreq64 { + char ifr_name[IFNAMSIZ]; + union { + struct sockaddr ifru_addr; + /* pad this out to the Linux size */ + uint64_t ifmap[3]; + } ifr_ifrn; +} lx_ifreq64_t; + +typedef struct lx_ifconf32 { + int32_t if_len; + caddr32_t if_buf; +} lx_ifconf32_t; + +typedef struct lx_ifconf64 { + int32_t if_len; + caddr_t if_buf; +} lx_ifconf64_t; + + +/* Generic translators */ + +/* ARGSUSED */ +static int +ict_pass(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int error = 0; + int rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + return ((error != 0) ? 
set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_fionbio(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp; + int32_t iflag, flags; + int error; + + if (copyin((caddr_t)arg, &iflag, sizeof (iflag))) + return (set_errno(EFAULT)); + + mutex_enter(&fp->f_tlock); + vp = fp->f_vnode; + flags = fp->f_flag; + /* Linux sets NONBLOCK instead of FIONBIO */ + if (iflag) + flags |= FNONBLOCK; + else + flags &= ~FNONBLOCK; + /* push the flag down */ + error = VOP_SETFL(vp, fp->f_flag, flags, fp->f_cred, NULL); + fp->f_flag = flags; + mutex_exit(&fp->f_tlock); + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_fionread(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp; + struct vattr vattr; + int error = 0; + int rv; + /* + * offset is int32_t because that is what FIONREAD is defined in terms + * of. We cap at INT_MAX as in other cases for this ioctl. + */ + int32_t offset; + + vp = fp->f_vnode; + + if (vp->v_type == VREG || vp->v_type == VDIR) { + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred, NULL); + if (error != 0) + return (set_errno(error)); + offset = MIN(vattr.va_size - fp->f_offset, INT_MAX); + if (copyout(&offset, (caddr_t)arg, sizeof (offset))) + return (set_errno(EFAULT)); + } else { + error = VOP_IOCTL(vp, FIONREAD, arg, FLUSER(fp), fp->f_cred, + &rv, NULL); + if (error) + return (set_errno(error)); + } + return (0); +} + +/* + * hard disk-related translators + * + * Note that the normal disk ioctls only work for VCHR devices. See spec_ioctl + * which will return ENOTTY for a VBLK device. However, fdisk, etc. expect to + * work with block devices. + * + * We expect a zvol to be the primary block device we're interacting with and + * we use the zone's lxzd_vdisks list to handle zvols specifically. 
+ */
+
+/*
+ * Disk geometry record that ict_hdgetgeo() fills in and copies out to the
+ * caller for LX_HDIO_GETGEO.
+ *
+ * NOTE(review): 'start' is an unsigned long, so the size and layout of this
+ * structure follow the kernel's data model -- confirm what a 32-bit caller
+ * expects to receive.
+ */
+typedef struct lx_hd_geom {
+	unsigned char heads;		/* number of heads */
+	unsigned char sectors;		/* sectors per track */
+	unsigned short cylinders;	/* number of cylinders */
+	unsigned long start;		/* always set to 0 by ict_hdgetgeo() */
+} lx_hd_geom_t;
+
+/*
+ * Return the volsize and blksize for the correct virtual "disk" for the zone.
+ * Only these two values are returned in 'vdp' within this code.
+ *
+ * A virtual "disk" can be a zvol visible within the zone, but most zones are
+ * not configured with a delegated dataset necessary to make zvols visible.
+ *
+ * To make various applications happy, lx also pretends that our root filesystem
+ * (normally within the zone's dataset) lives on a virtual disk. We have a
+ * /dev/zfsds0 symlink which points at /dev/zfs. This appears in various places
+ * to give the illusion of root's disk. For example, see:
+ * /proc/partitions
+ * /sys/block/zfsds0
+ * /sys/devices/zfs/zfsds0
+ * If an application issues the various LX_HDIO_GETGEO, LX_BLKGETSIZE*, or
+ * LX_BLKSSZGET ioctls on /dev/zfs (that is, minor number 0), we want to return
+ * something sane. In this case, we return the total size (which is normally
+ * limited by a quota) of the dataset that the zone root lives on.
+ */
+/*
+ * Parameters:
+ *   lxzd - lx zone data whose lxzd_vdisks list is searched
+ *   dev  - device number of the node the ioctl was issued on
+ *   vdp  - output; zeroed, then lxvd_volsize and lxvd_blksize are filled in
+ *
+ * Returns B_TRUE when 'vdp' was filled in (minor 0, or a matching zvol entry
+ * on the list) and B_FALSE when no matching virtual disk exists, in which
+ * case 'vdp' is left untouched.
+ */
+static boolean_t
+lx_lookup_zdsk_info(lx_zone_data_t *lxzd, dev_t dev, lx_virt_disk_t *vdp)
+{
+	lx_virt_disk_t *vd;
+
+	/* Handle /dev/zfs */
+	if (getminor(dev) == 0) {
+		struct statvfs64 sv;
+
+		/* Same as the zvol case below: no stale fields in *vdp. */
+		bzero(vdp, sizeof (*vdp));
+		if (VFS_STATVFS(curzone->zone_rootvp->v_vfsp, &sv) == 0) {
+			vdp->lxvd_volsize = sv.f_blocks * sv.f_frsize;
+			vdp->lxvd_blksize = sv.f_frsize;
+		} else {
+			vdp->lxvd_volsize = 0;
+			/* always set to prevent potential divide-by-zero */
+			vdp->lxvd_blksize = 512;
+		}
+
+		return (B_TRUE);
+	}
+
+	vd = list_head(lxzd->lxzd_vdisks);
+	while (vd != NULL) {
+		if (vd->lxvd_type == LXVD_ZVOL && vd->lxvd_real_dev == dev) {
+			bzero(vdp, sizeof (*vdp));
+			vdp->lxvd_volsize = vd->lxvd_volsize;
+			vdp->lxvd_blksize = vd->lxvd_blksize;
+			return (B_TRUE);
+		}
+		vd = list_next(lxzd->lxzd_vdisks, vd);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * See zvol_ioctl() which always fails for DKIOCGGEOM. The geometry for a
+ * zvol (or really any modern disk) is made up, so we do that here as well.
+ *
+ * LX_HDIO_GETGEO: only character and block devices are accepted (EINVAL
+ * otherwise).  For a device with the same major number as the zone's zfs
+ * device, a geometry is synthesized from the virtual disk's size; an unknown
+ * or zero-sized virtual disk reports an all-zero geometry.  For any other
+ * device the native DKIOCGGEOM result is translated.
+ *
+ * The result structure is zeroed before it is filled in so that no
+ * uninitialized kernel stack bytes (structure padding) are copied out, and a
+ * caller that is not using the kernel's native data model receives the
+ * structure with a 32-bit 'start' member instead of the native layout.
+ *
+ * Returns 0 on success; otherwise the error is set via set_errno().
+ */
+/* ARGSUSED */
+static int
+ict_hdgetgeo(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	lx_hd_geom_t lx_geom;
+	lx_zone_data_t *lxzd;
+
+	if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK)
+		return (set_errno(EINVAL));
+
+	lxzd = ztolxzd(curproc->p_zone);
+	ASSERT(lxzd != NULL);
+	ASSERT(lxzd->lxzd_vdisks != NULL);
+
+	/* Clear everything, padding included, before it goes to userland. */
+	bzero(&lx_geom, sizeof (lx_geom));
+
+	if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) {
+		lx_virt_disk_t vd;
+
+		/*
+		 * A failed lookup or a zero size should only happen for a new
+		 * zvol; the geometry is then left as all zeroes.
+		 */
+		if (lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd) &&
+		    vd.lxvd_volsize != 0 && vd.lxvd_blksize != 0) {
+			const diskaddr_t blks =
+			    MAX(1, vd.lxvd_volsize / vd.lxvd_blksize);
+
+			/*
+			 * Attempt to conjure up a Cylinder-Head-Sector
+			 * geometry for the given virtual disk size.
+			 */
+			if (blks <= (63*16*65535)) {
+				/*
+				 * Use traditional BIOS-style geometry for
+				 * adequately small disks.
+				 */
+				lx_geom.sectors = 63;
+				lx_geom.heads = 16;
+				lx_geom.cylinders = MAX(1, (blks / (63 * 16)));
+			} else if (blks <= (64*32*65535)) {
+				/* 1MB per cylinder for 512-byte sectors */
+				lx_geom.sectors = 64;
+				lx_geom.heads = 32;
+				lx_geom.cylinders = (blks / (64 * 32));
+			} else {
+				/*
+				 * Max out the geometry sizing for large disks.
+				 * This may not be adequate for truly huge
+				 * volumes (maxing out at a little under 2TB
+				 * for those with a 512-byte blocksize), but it
+				 * is the best we can do with the given struct.
+				 */
+				lx_geom.sectors = 255;
+				lx_geom.heads = 255;
+				lx_geom.cylinders = MIN(65535,
+				    (blks / (255*255)));
+			}
+			lx_geom.start = 0;
+		}
+	} else {
+		int res, rv;
+		struct dk_geom geom;
+
+		res = VOP_IOCTL(fp->f_vnode, DKIOCGGEOM, (intptr_t)&geom,
+		    fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+		if (res > 0)
+			return (set_errno(res));
+
+		lx_geom.heads = geom.dkg_nhead;
+		lx_geom.sectors = geom.dkg_nsect;
+		lx_geom.cylinders = geom.dkg_ncyl;
+		lx_geom.start = 0;
+	}
+
+	if (get_udatamodel() != DATAMODEL_NATIVE) {
+		/*
+		 * The caller's 'unsigned long' is 32 bits wide, so hand back
+		 * the structure in that layout rather than the native one.
+		 */
+		struct {
+			unsigned char heads;
+			unsigned char sectors;
+			unsigned short cylinders;
+			uint32_t start;
+		} lx_geom32;
+
+		bzero(&lx_geom32, sizeof (lx_geom32));
+		lx_geom32.heads = lx_geom.heads;
+		lx_geom32.sectors = lx_geom.sectors;
+		lx_geom32.cylinders = lx_geom.cylinders;
+		lx_geom32.start = (uint32_t)lx_geom.start;
+
+		if (copyout(&lx_geom32, (caddr_t)arg, sizeof (lx_geom32)))
+			return (set_errno(EFAULT));
+		return (0);
+	}
+
+	if (copyout(&lx_geom, (caddr_t)arg, sizeof (lx_geom)))
+		return (set_errno(EFAULT));
+	return (0);
+}
+
+/*
+ * Per the Linux sd(4) man page, get the number of sectors. The linux/fs.h
+ * header says it is in 512 byte blocks.
+ */
+/*
+ * LX_BLKGETSIZE: copy out the device size as a count of 512-byte sectors.
+ *
+ * Only character and block devices are accepted (EINVAL otherwise).  For a
+ * device with the same major number as the zone's zfs device, the size comes
+ * from the zone's virtual disk information (0 if the disk is unknown); for
+ * any other device it comes from the native DKIOCGMEDIAINFO ioctl, scaled up
+ * when the logical block size is larger than 512 bytes.
+ *
+ * The result is written as the caller's 'unsigned long', whose width depends
+ * on the caller's data model: the native width for a native caller and 32
+ * bits otherwise.  A count that does not fit is reported as EFBIG rather
+ * than being truncated or written past the end of the caller's variable.
+ *
+ * Returns 0 on success; otherwise the error is set via set_errno().
+ */
+/* ARGSUSED */
+static int
+ict_blkgetsize(file_t *fp, int cmd, intptr_t arg, int lxcmd)
+{
+	diskaddr_t tot;
+	lx_zone_data_t *lxzd;
+
+	if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK)
+		return (set_errno(EINVAL));
+
+	lxzd = ztolxzd(curproc->p_zone);
+	ASSERT(lxzd != NULL);
+	ASSERT(lxzd->lxzd_vdisks != NULL);
+
+	if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) {
+		lx_virt_disk_t vd;
+
+		if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd)) {
+			/* should only happen if new zvol */
+			tot = 0;
+		} else {
+			tot = vd.lxvd_volsize / 512;
+		}
+	} else {
+		int res, rv;
+		struct dk_minfo minfo;
+
+		res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo,
+		    fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL);
+		if (res > 0)
+			return (set_errno(res));
+
+		tot = minfo.dki_capacity;
+		if (minfo.dki_lbsize > 512) {
+			uint_t bsize = minfo.dki_lbsize / 512;
+
+			tot *= bsize;
+		}
+	}
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		ulong_t nsect = (ulong_t)tot;
+
+		if ((diskaddr_t)nsect != tot)
+			return (set_errno(EFBIG));
+		if (copyout(&nsect, (caddr_t)arg, sizeof (nsect)))
+			return (set_errno(EFAULT));
+	} else {
+		uint32_t nsect32 = (uint32_t)tot;
+
+		if ((diskaddr_t)nsect32 != tot)
+			return (set_errno(EFBIG));
+		if (copyout(&nsect32, (caddr_t)arg, sizeof (nsect32)))
+			return (set_errno(EFAULT));
+	}
+	return (0);
+}
+
+/*
+ * Get the sector size (i.e. the logical block size).
+ */ +/* ARGSUSED */ +static int +ict_blkgetssize(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + uint_t bsize; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t vd; + + if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd)) { + /* should only happen if new zvol */ + bsize = 0; + } else { + bsize = (uint_t)vd.lxvd_blksize; + } + } else { + int res, rv; + struct dk_minfo minfo; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + bsize = (uint_t)minfo.dki_lbsize; + } + + if (copyout(&bsize, (caddr_t)arg, sizeof (bsize))) + return (set_errno(EFAULT)); + return (0); +} + +/* + * Get the size. The linux/fs.h header says its in bytes. 
+ */ +/* ARGSUSED */ +static int +ict_blkgetsize64(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + uint64_t tot; + lx_zone_data_t *lxzd; + + if (fp->f_vnode->v_type != VCHR && fp->f_vnode->v_type != VBLK) + return (set_errno(EINVAL)); + + lxzd = ztolxzd(curproc->p_zone); + ASSERT(lxzd != NULL); + ASSERT(lxzd->lxzd_vdisks != NULL); + + if (getmajor(fp->f_vnode->v_rdev) == getmajor(lxzd->lxzd_zfs_dev)) { + lx_virt_disk_t vd; + + if (!lx_lookup_zdsk_info(lxzd, fp->f_vnode->v_rdev, &vd)) { + /* should only happen if new zvol */ + tot = 0; + } else { + tot = vd.lxvd_volsize; + } + } else { + int res, rv; + struct dk_minfo minfo; + + res = VOP_IOCTL(fp->f_vnode, DKIOCGMEDIAINFO, (intptr_t)&minfo, + fp->f_flag | FKIOCTL, fp->f_cred, &rv, NULL); + if (res > 0) + return (set_errno(res)); + + tot = minfo.dki_capacity * minfo.dki_lbsize; + } + + if (copyout(&tot, (caddr_t)arg, sizeof (uint64_t))) + return (set_errno(EFAULT)); + return (0); +} + +/* ARGSUSED */ +/* Terminal-related translators */ + +/* ARGSUSED */ +static int +ict_tcsets(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios; + struct lx_cc lio; + int error, rv; + + ASSERT(cmd == TCSETS || cmd == TCSETSW || cmd == TCSETSF); + + if (copyin((struct lx_termios *)arg, &l_tios, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + termios2lx_cc(&l_tios, &lio); + l2s_termios(&l_tios, &s_tios); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + /* preserve lx_cc */ + set_lx_cc(fp->f_vnode, &lio); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tcseta(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termio l_tio; + struct termio s_tio; + struct lx_cc lio; + int error, rv; + + ASSERT(cmd == TCSETA || cmd == TCSETAW || cmd == TCSETAF); + + if (copyin((struct lx_termio *)arg, &l_tio, sizeof (l_tio)) != 0) + return (set_errno(EFAULT)); + l2s_termio(&l_tio, &s_tio); + 
termio2lx_cc(&l_tio, &lio); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + /* preserve lx_cc */ + set_lx_cc(fp->f_vnode, &lio); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tcgets_ptm(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios, *s_tiosd; + uint_t s_tiosl; + + /* get termios defaults */ + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&s_tiosd, + &s_tiosl) != DDI_SUCCESS) + return (EIO); + ASSERT(s_tiosl == sizeof (*s_tiosd)); + bcopy(s_tiosd, &s_tios, sizeof (s_tios)); + ddi_prop_free(s_tiosd); + + /* Now munge the data to how Linux wants it. */ + s2l_termios(&s_tios, &l_tios); + if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tcgets_native(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios; + struct lx_cc lio; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios, + FLFAKE(fp), fp->f_cred, &rv, NULL); + /* + * systemd calls isatty() on the standard input for a process in order + * to determine if it should call chown() upon it. It expects to + * receive ENOTTY when the input is not a TTY, but the native illumos + * ioctl() call returns ENXIO. Without the following translation, + * systemd services fail with 'Failed to change ownership of terminal' + */ + if (error) + return (set_errno(error == ENXIO ? ENOTTY : error)); + + /* Now munge the data to how Linux wants it. 
*/ + s2l_termios(&s_tios, &l_tios); + + /* return preserved lx_cc */ + if (get_lx_cc(fp->f_vnode, &lio) == 0) { + l_tios.c_cc[LX_VEOF] = lio.veof; + l_tios.c_cc[LX_VEOL] = lio.veol; + l_tios.c_cc[LX_VMIN] = lio.vmin; + l_tios.c_cc[LX_VTIME] = lio.vtime; + } + + if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tcgets(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + if (getmajor(fp->f_vnode->v_rdev) == ddi_name_to_major(LX_PTM_DRV)) + return (ict_tcgets_ptm(fp, cmd, arg, lxcmd)); + else + return (ict_tcgets_native(fp, cmd, arg, lxcmd)); +} + +/* ARGSUSED */ +static int +ict_tcgeta(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termio l_tio; + struct termio s_tio; + struct lx_cc lio; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + + s2l_termio(&s_tio, &l_tio); + /* return preserved lx_cc */ + if (get_lx_cc(fp->f_vnode, &lio) == 0) { + l_tio.c_cc[LX_VEOF] = lio.veof; + l_tio.c_cc[LX_VMIN] = lio.vmin; + l_tio.c_cc[LX_VTIME] = lio.vtime; + } + + if (copyout(&l_tio, (struct lx_termios *)arg, sizeof (l_tio)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tiocspgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t lpid, spid, tid; + int error, rv; + + /* Converting to the illumos pid is necessary */ + if (copyin((pid_t *)arg, &lpid, sizeof (lpid)) < 0) + return (set_errno(EFAULT)); + if (lx_lpid_to_spair(lpid, &spid, &tid) < 0) + return (set_errno(EPERM)); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spid, + fp->f_flag |FKIOCTL, fp->f_cred, &rv, NULL); + return ((error != 0) ? 
set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_tcsbrkp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int rv, error; + /* use null duration to emulate TCSBRKP */ + int dur = 0; + error = VOP_IOCTL(fp->f_vnode, TCSBRK, (intptr_t)&dur, + FLFAKE(fp), fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_tiocgpgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t spgrp; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spgrp, FLFAKE(fp), + fp->f_cred, &rv, NULL); + if (error == 0) { + if (spgrp == curproc->p_zone->zone_proc_initpid) { + spgrp = 1; + } + if (copyout(&spgrp, (caddr_t)arg, sizeof (spgrp))) { + return (set_errno(EFAULT)); + } + } + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_sptlock(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct strioctl istr; + int error, rv; + + istr.ic_cmd = UNLKPT; + istr.ic_len = 0; + istr.ic_timout = 0; + istr.ic_dp = NULL; + error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr, + fp->f_flag |FKIOCTL, fp->f_cred, &rv, NULL); + /* + * The success/fail return values are different between Linux + * and illumos. Linux expects 0 or -1. Illumos can return + * positive number on success. + */ + return ((error != 0) ? set_errno(error) : 0); +} + +/* ARGSUSED */ +static int +ict_gptn(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct strioctl istr; + cred_t *cr; + pt_own_t pto; + int error, rv; + int ptyno; + lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone); + + /* This operation is only valid for the lx_ptm device. */ + if (getmajor(fp->f_vnode->v_rdev) != ddi_name_to_major(LX_PTM_DRV)) + return (set_errno(ENOTTY)); + + cr = CRED(); + pto.pto_ruid = cr->cr_uid; + /* + * Both Linux and our native code (see grantpt() in native libc) + * prefer assigning the "tty" gid to the new pty. On Linux this is + * done by udev. 
Since we're in the kernel we cannot lookup the gid, so + * we rely on the lx_support program to initialize the value in the + * zone data at boot time. + */ + if (lxzd->lxzd_ttygrp == 0) { + pto.pto_rgid = cr->cr_gid; + } else { + pto.pto_rgid = lxzd->lxzd_ttygrp; + } + + istr.ic_cmd = OWNERPT; + istr.ic_len = sizeof (pto); + istr.ic_timout = 0; + istr.ic_dp = (char *)&pto; + error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr, + FLFAKE(fp), fp->f_cred, &rv, NULL); + + if (error) + return (set_errno((error == ENOTTY) ? error: EACCES)); + + ptyno = getminor(fp->f_vnode->v_rdev) - 1; + if (copyout(&ptyno, (caddr_t)arg, sizeof (ptyno))) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tiocgwinsz(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + + /* + * A few Linux libc's (e.g. musl) have chosen to implement isatty() + * using the TIOCGWINSZ ioctl. Some apps also do the same thing + * directly. On Linux that ioctl will return a size of 0x0 for dumb + * terminals but on illumos see the handling for TIOCGWINSZ in ptem's + * ptioc(). We fail if the winsize is all zeros. To emulate the Linux + * behavior use the native ioctl check that we do for isatty and return + * a size of 0x0 if that succeeds. 
+ */ + if (error == EINVAL) { + int err; + struct termio s_tio; + + err = VOP_IOCTL(fp->f_vnode, TCGETA, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + + if (err == 0) { + struct winsize w; + + bzero(&w, sizeof (w)); + if (copyout(&w, (struct winsize *)arg, sizeof (w)) != 0) + return (set_errno(EFAULT)); + return (0); + } + } + + if (error != 0) + return (set_errno(error)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_tiocsctty(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t ttysid, mysid; + int error, rv; + proc_t *p = curproc; + + /* getsid */ + mutex_enter(&p->p_splock); + mysid = p->p_sessp->s_sid; + mutex_exit(&p->p_splock); + + /* + * Report success if we already control the tty. + * If no one controls it, TIOCSCTTY will change that later. + */ + error = VOP_IOCTL(fp->f_vnode, TIOCGSID, (intptr_t)&ttysid, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error == 0 && ttysid == mysid) + return (0); + + /* + * Need to make sure we're a session leader, otherwise the + * TIOCSCTTY ioctl will fail. + */ + mutex_enter(&pidlock); + if (p->p_sessp->s_sidp != p->p_pidp && !pgmembers(p->p_pid)) { + mutex_exit(&pidlock); + sess_create(); + } else { + mutex_exit(&pidlock); + } + + error = VOP_IOCTL(fp->f_vnode, cmd, 0, FLUSER(fp), + fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +/* Socket-related translators */ + +/* ARGSUSED */ +static int +ict_siocatmark(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp = fp->f_vnode; + int error, rv; + /* + * Linux expects a SIOCATMARK of a UDP socket to return ENOTTY, while + * Illumos allows it. Linux prior to 2.6.39 returned EINVAL for this. 
+ */ + if (vp->v_type != VSOCK || VTOSO(vp)->so_type != SOCK_STREAM) + return (set_errno(ENOTTY)); + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + if (error) + return (set_errno(error)); + + return (0); +} + +static int +ict_if_ioctl(vnode_t *vn, int cmd, intptr_t arg, int flags, cred_t *cred) +{ + int error, rv; + lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone); + ksocket_t ks; + + ASSERT(lxzd != NULL); + + /* + * For ioctls of this type, we are strict about address family + * whereas Linux is lenient. This strictness can be avoided by using + * an internal AF_INET ksocket, which we use if the family is anything + * but AF_PACKET. + */ + if (vn->v_type == VSOCK && VTOSO(vn)->so_family == AF_PACKET) + return (VOP_IOCTL(vn, cmd, arg, flags, cred, &rv, NULL)); + + mutex_enter(&lxzd->lxzd_lock); + ks = lxzd->lxzd_ioctl_sock; + if (ks == NULL) { + /* + * Linux is not at all picky about address family when it comes + * to supporting interface-related ioctls. To mimic this + * behavior, we'll attempt those ioctls against a ksocket + * configured for that purpose. 
+ */ + (void) ksocket_socket(&lxzd->lxzd_ioctl_sock, AF_INET, + SOCK_DGRAM, 0, 0, curproc->p_zone->zone_kcred); + ks = lxzd->lxzd_ioctl_sock; + } + mutex_exit(&lxzd->lxzd_lock); + + if (ks != NULL) { + error = ksocket_ioctl(ks, cmd, arg, &rv, cred); + } else { + error = VOP_IOCTL(vn, cmd, arg, flags, cred, &rv, NULL); + } + + return (error); +} + +static int +ict_sioghwaddr(file_t *fp, struct lifreq *lreq) +{ + struct sockaddr_dl *sdl = (struct sockaddr_dl *)&lreq->lifr_addr; + struct sockaddr hwaddr; + int error, size; + + error = ict_if_ioctl(fp->f_vnode, SIOCGLIFHWADDR, (intptr_t)lreq, + FLFAKE(fp), fp->f_cred); + + if (error == EADDRNOTAVAIL && + strncmp(lreq->lifr_name, "lo", 2) == 0) { + /* Emulate success on suspected loopbacks */ + sdl->sdl_type = DL_LOOP; + sdl->sdl_alen = ETHERADDRL; + bzero(LLADDR(sdl), sdl->sdl_alen); + error = 0; + } + + if (error == 0) { + bzero(&hwaddr, sizeof (hwaddr)); + lx_stol_hwaddr(sdl, &hwaddr, &size); + bcopy(&hwaddr, &lreq->lifr_addr, + size + sizeof (sdl->sdl_family)); + } + + return (error); +} + +/* ARGSUSED */ +static int +ict_siocgifname(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct ifreq req; + int len; + char name[LIFNAMSIZ]; + netstack_t *ns; + ip_stack_t *ipst; + phyint_t *phyi; + + if (fp->f_vnode->v_type != VSOCK) { + return (set_errno(EINVAL)); + } + + len = (curproc->p_model == DATAMODEL_LP64) ? sizeof (lx_ifreq64_t) : + sizeof (lx_ifreq32_t); + if (copyin((struct ifreq *)arg, &req, len) != 0) { + return (set_errno(EFAULT)); + } + + /* + * Since Linux calls this ioctl on all sorts of sockets, perform the + * interface name lookup manually. 
+ */ + if ((ns = netstack_get_current()) == NULL) { + return (set_errno(EINVAL)); + } + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + (void *) &req.ifr_index, NULL); + if (phyi != NULL) { + (void) strncpy(name, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '\0'; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + if (strlen(name) != 0) { + /* Truncate for ifreq and copyout */ + (void) strncpy(req.ifr_name, name, IFNAMSIZ); + if (copyout(&req, (struct ifreq *)arg, len) != 0) { + return (set_errno(EFAULT)); + } + return (0); + } + + return (set_errno(EINVAL)); +} + +/* ARGSUSED */ +static int +ict_siolifreq(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct ifreq req; + struct lifreq lreq; + int error, len; + + /* Convert from Linux ifreq to illumos lifreq */ + if (curproc->p_model == DATAMODEL_LP64) + len = sizeof (lx_ifreq64_t); + else + len = sizeof (lx_ifreq32_t); + if (copyin((struct ifreq *)arg, &req, len) != 0) + return (set_errno(EFAULT)); + bzero(&lreq, sizeof (lreq)); + (void) strncpy(lreq.lifr_name, req.ifr_name, IFNAMSIZ); + bcopy(&req.ifr_ifru, &lreq.lifr_lifru, len - IFNAMSIZ); + lx_ifname_convert(lreq.lifr_name, LX_IF_TONATIVE); + + switch (cmd) { + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + case SIOCGIFMTU: + case SIOCSIFMTU: + /* + * Convert cmd from SIO*IF* to SIO*LIF*. + * This is needed since Linux allows ifreq operations on ipv6 + * sockets where illumos does not. 
+ */ + cmd = ((cmd & IOC_INOUT) | + _IOW('i', ((cmd & 0xff) + 100), struct lifreq)); + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFINDEX: + cmd = SIOCGLIFINDEX; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFFLAGS: + cmd = SIOCGLIFFLAGS; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + if (error == 0) + lx_ifflags_convert(&lreq.lifr_flags, LX_IF_FROMNATIVE); + break; + case SIOCSIFFLAGS: + cmd = SIOCSLIFFLAGS; + lx_ifflags_convert(&lreq.lifr_flags, LX_IF_TONATIVE); + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFHWADDR: + error = ict_sioghwaddr(fp, &lreq); + break; + case LX_SIOCGIFTXQLEN: + /* + * Illumos lacks the notion of txqlen. Confirm the provided + * interface is valid with SIOCGLIFINDEX and return a fake + * txqlen of 1. Loopback devices will report txqlen of 0. + */ + if (strncmp(lreq.lifr_name, "lo", 2) == 0) { + lreq.lifr_index = 0; + error = 0; + break; + } + cmd = SIOCGLIFINDEX; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + if (error == 0) { + /* lifr_index aliases to the qlen field */ + lreq.lifr_index = 1; + } + break; + case LX_SIOCSIFHWADDR: + /* + * We're not going to support SIOCSIFHWADDR, but we need to be + * able to check the result of the copyin first to see if the + * command should have returned EFAULT. 
+ */ + default: + error = EINVAL; + } + + if (error != 0) + return (set_errno(error)); + + /* Convert back to a Linux ifreq */ + lx_ifname_convert(lreq.lifr_name, LX_IF_FROMNATIVE); + bzero(&req, sizeof (req)); + (void) strncpy(req.ifr_name, lreq.lifr_name, IFNAMSIZ); + bcopy(&lreq.lifr_lifru, &req.ifr_ifru, len - IFNAMSIZ); + + if (copyout(&req, (struct lifreq *)arg, len) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* ARGSUSED */ +static int +ict_siocgifconf32(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_ifconf32_t conf; + lx_ifreq32_t *oreq; + struct ifconf sconf; + int ifcount, error, i; + size_t native_len, lx_len; + + if (copyin((lx_ifconf32_t *)arg, &conf, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + + /* + * First, figure out how many interfaces exist so that kmem allocations + * are no larger than needed. + */ + error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM, (intptr_t)&ifcount, + FLFAKE(fp), fp->f_cred); + if (error != 0) { + return (set_errno(error)); + } + + /* They want to know how many interfaces there are. */ + if (conf.if_len <= 0 || conf.if_buf == (uint32_t)(uintptr_t)NULL) { + conf.if_len = ifcount * sizeof (lx_ifreq32_t); + + if (copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + return (0); + } + + ifcount = MIN(ifcount, conf.if_len / sizeof (lx_ifreq32_t)); + + /* Get interface configuration list. */ + native_len = ifcount * sizeof (struct ifreq); + sconf.ifc_len = native_len; + sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP); + + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp), + fp->f_cred); + if (error != 0) { + kmem_free(sconf.ifc_req, native_len); + return (set_errno(error)); + } + /* Recalculate in case a nic was removed between ict_if_ioctl calls. 
*/ + ifcount = sconf.ifc_len / sizeof (struct ifreq); + + /* Convert data to Linux format & rename interfaces */ + lx_len = ifcount * sizeof (lx_ifreq32_t); + oreq = (lx_ifreq32_t *)kmem_alloc(lx_len, KM_SLEEP); + for (i = 0; i < ifcount; i++) { + /* + * struct ifreq and lx_ifreq32_t are the same size, unlike the + * 64-bit version of this function. + */ + bcopy(&sconf.ifc_req[i], oreq + i, sizeof (lx_ifreq32_t)); + lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE); + } + conf.if_len = lx_len; + kmem_free(sconf.ifc_req, native_len); + + error = 0; + if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 || + copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0) + error = set_errno(EFAULT); + + kmem_free(oreq, lx_len); + return (error); +} + +/* ARGSUSED */ +static int +ict_siocgifconf64(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_ifconf64_t conf; + lx_ifreq64_t *oreq; + struct ifconf sconf; + int ifcount, error, i; + size_t native_len, lx_len; + + if (copyin((lx_ifconf64_t *)arg, &conf, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + + /* + * First, figure out how many interfaces exist so that kmem allocations + * are no larger than needed. + */ + error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM, (intptr_t)&ifcount, + FLFAKE(fp), fp->f_cred); + if (error != 0) { + return (set_errno(error)); + } + + /* They want to know how many interfaces there are. */ + if (conf.if_len <= 0 || conf.if_buf == NULL) { + conf.if_len = ifcount * sizeof (lx_ifreq64_t); + + if (copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + return (0); + } + + ifcount = MIN(ifcount, conf.if_len / sizeof (lx_ifreq64_t)); + + /* Get interface configuration list. 
*/ + native_len = ifcount * sizeof (struct ifreq); + sconf.ifc_len = native_len; + sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP); + + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp), + fp->f_cred); + if (error != 0) { + kmem_free(sconf.ifc_req, native_len); + return (set_errno(error)); + } + /* Recalculate in case a nic was removed between ict_if_ioctl calls. */ + ifcount = sconf.ifc_len / sizeof (struct ifreq); + + /* Convert data to Linux format & rename interfaces */ + lx_len = ifcount * sizeof (lx_ifreq64_t); + oreq = (lx_ifreq64_t *)kmem_zalloc(lx_len, KM_SLEEP); + for (i = 0; i < ifcount; i++) { + /* + * struct ifreq and lx_ifreq64_t start with common elements. + * Anything after that is padding, which is zeroed with + * kmem_zalloc above. + */ + bcopy(&sconf.ifc_req[i], oreq + i, sizeof (oreq->ifr_name) + + sizeof (oreq->ifr_ifrn.ifru_addr)); + lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE); + } + conf.if_len = lx_len; + kmem_free(sconf.ifc_req, native_len); + + error = 0; + if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 || + copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0) + error = set_errno(EFAULT); + + kmem_free(oreq, lx_len); + return (error); +} + +/* ARGSUSED */ +static int +ict_siocgifconf(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + if (curproc->p_model == DATAMODEL_LP64) + return (ict_siocgifconf64(fp, cmd, arg, lxcmd)); + else + return (ict_siocgifconf32(fp, cmd, arg, lxcmd)); +} + +/* + * Unfortunately some of the autofs ioctls want to return a positive integer + * result which does not indicate an error. To minimize disruption in the + * rest of the code, we'll treat a positive return as an errno and a negative + * return as the non-error return (which we then negate). 
+ */ +/* ARGSUSED */ +static int +ict_autofs(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int res = 0; + int rv; + + res = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + if (res > 0) + return (set_errno(res)); + if (res == 0) + return (0); + return (-res); +} + +/* Structure used to define an ioctl translator. */ +typedef struct lx_ioc_cmd_translator { + int lict_lxcmd; + int lict_cmd; + int (*lict_func)(file_t *fp, int cmd, intptr_t arg, int lxcmd); +} lx_ioc_cmd_translator_t; + +#define LX_IOC_CMD_TRANSLATOR_PASS(ioc_cmd_sym) \ + { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass }, + +#define LX_IOC_CMD_TRANSLATOR_FILTER(ioc_cmd_sym, ioct_handler) \ + { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler }, + +#define LX_IOC_CMD_TRANSLATOR_CUSTOM(ioc_cmd_sym, ioct_handler) \ + { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler }, + +#define LX_IOC_CMD_TRANSLATOR_PTHRU(ioc_cmd_sym) \ + { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass }, + +#define LX_IOC_CMD_TRANSLATOR_END \ + {0, 0, NULL} + +static lx_ioc_cmd_translator_t lx_ioc_xlate_fd[] = { + LX_IOC_CMD_TRANSLATOR_FILTER(FIONBIO, ict_fionbio) + LX_IOC_CMD_TRANSLATOR_FILTER(FIONREAD, ict_fionread) + LX_IOC_CMD_TRANSLATOR_PASS(FIOASYNC) + + /* streams related */ + LX_IOC_CMD_TRANSLATOR_PASS(TCXONC) + LX_IOC_CMD_TRANSLATOR_PASS(TCFLSH) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCEXCL) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCNXCL) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCSTI) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCSWINSZ) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIS) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCMBIC) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCMSET) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCSETD) + LX_IOC_CMD_TRANSLATOR_PASS(TCSBRK) + + /* terminal related */ + LX_IOC_CMD_TRANSLATOR_PASS(TIOCGETD) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCGSID) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCNOTTY) + LX_IOC_CMD_TRANSLATOR_PASS(TIOCPKT) + + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETS, ict_tcsets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSW, ict_tcsets) + 
LX_IOC_CMD_TRANSLATOR_FILTER(TCSETSF, ict_tcsets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETA, ict_tcseta) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAW, ict_tcseta) + LX_IOC_CMD_TRANSLATOR_FILTER(TCSETAF, ict_tcseta) + LX_IOC_CMD_TRANSLATOR_FILTER(TCGETS, ict_tcgets) + LX_IOC_CMD_TRANSLATOR_FILTER(TCGETA, ict_tcgeta) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGWINSZ, ict_tiocgwinsz) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TCSBRKP, ict_tcsbrkp) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSPGRP, ict_tiocspgrp) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCGPGRP, ict_tiocgpgrp) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCSPTLCK, ict_sptlock) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCGPTN, ict_gptn) + LX_IOC_CMD_TRANSLATOR_FILTER(TIOCSCTTY, ict_tiocsctty) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_socket[] = { + LX_IOC_CMD_TRANSLATOR_PASS(FIOGETOWN) + + LX_IOC_CMD_TRANSLATOR_PASS(SIOCSPGRP) + LX_IOC_CMD_TRANSLATOR_PASS(SIOCGPGRP) + LX_IOC_CMD_TRANSLATOR_PASS(SIOCGSTAMP) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCATMARK, ict_siocatmark) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFFLAGS, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFFLAGS, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFDSTADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFDSTADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFBRDADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFBRDADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFNETMASK, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFNETMASK, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMETRIC, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMETRIC, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMTU, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMTU, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFHWADDR, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCSIFHWADDR, ict_siolifreq) + 
LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFINDEX, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFTXQLEN, ict_siolifreq) + LX_IOC_CMD_TRANSLATOR_FILTER(SIOCGIFCONF, ict_siocgifconf) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFNAME, ict_siocgifname) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_dtrace[] = { + LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADD) + LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_REMOVE) + LX_IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADDDOF) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_autofs[] = { + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_READY) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_FAIL) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_CATATONIC) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOVER) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_SETTIMEOUT) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_EXPIRE_MULTI) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_PROTOSUBVER) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_IOC_ASKUMOUNT) + + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_VERSION_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOVER_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_PROTOSUBVER_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_OPENMOUNT_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CLOSEMOUNT_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_READY_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_FAIL_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_SETPIPEFD_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_CATATONIC_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_TIMEOUT_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_REQUESTER_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_EXPIRE_CMD) + LX_IOC_CMD_TRANSLATOR_PTHRU(LX_AUTOFS_DEV_IOC_ASKUMOUNT_CMD) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_AUTOFS_DEV_IOC_ISMOUNTPOINT_CMD, + ict_autofs) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static 
lx_ioc_cmd_translator_t lx_ioc_xlate_hd[] = { + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_HDIO_GETGEO, ict_hdgetgeo) + + LX_IOC_CMD_TRANSLATOR_END +}; + +static lx_ioc_cmd_translator_t lx_ioc_xlate_blk[] = { + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE, ict_blkgetsize) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKSSZGET, ict_blkgetssize) + LX_IOC_CMD_TRANSLATOR_CUSTOM(LX_BLKGETSIZE64, ict_blkgetsize64) + + LX_IOC_CMD_TRANSLATOR_END +}; + +/* + * Linux only restarts ioctls for "slow" devices. This includes terminals, + * pipes, and sockets. If additional "slow" devices are discovered in the + * future, they can be added here as well. + */ +static boolean_t +lx_ioctl_is_slow_dev(file_t *fp) +{ + int rv; + struct termio s_tio; + vtype_t vt = fp->f_vnode->v_type; + + if (vt == VFIFO || vt == VSOCK) + return (B_TRUE); + + /* Check if it's a terminal using the isatty() approach. */ + if (vt == VCHR && VOP_IOCTL(fp->f_vnode, TCGETA, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL) == 0) + return (B_TRUE); + + return (B_FALSE); +} + +static void +lx_ioctl_vsd_free(void *data) +{ + kmem_free(data, sizeof (struct lx_cc)); +} + +void +lx_ioctl_init() +{ + vsd_create(&lx_ioctl_vsd, lx_ioctl_vsd_free); +} + +void +lx_ioctl_fini() +{ + vsd_destroy(&lx_ioctl_vsd); +} + +long +lx_ioctl(int fdes, int cmd, intptr_t arg) +{ + file_t *fp; + int res = 0, error = ENOTTY; + lx_ioc_cmd_translator_t *ict = NULL; + + if (cmd == LX_FIOCLEX || cmd == LX_FIONCLEX) { + res = f_setfd_error(fdes, (cmd == LX_FIOCLEX) ? FD_CLOEXEC : 0); + return ((res != 0) ? 
set_errno(res) : 0); + } + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + + switch ((cmd & 0xff00) >> 8) { + case LX_IOC_TYPE_FD: + ict = lx_ioc_xlate_fd; + break; + + case LX_IOC_TYPE_DTRACE: + ict = lx_ioc_xlate_dtrace; + break; + + case LX_IOC_TYPE_SOCK: + ict = lx_ioc_xlate_socket; + error = EOPNOTSUPP; + break; + + case LX_IOC_TYPE_AUTOFS: + ict = lx_ioc_xlate_autofs; + break; + + case LX_IOC_TYPE_BLK: + ict = lx_ioc_xlate_blk; + break; + + case LX_IOC_TYPE_HD: + ict = lx_ioc_xlate_hd; + break; + + default: + releasef(fdes); + return (set_errno(ENOTTY)); + } + + /* + * Today, none of the ioctls supported by the emulation possess + * overlapping cmd values. Because of that, no type interrogation of + * the fd is done before executing specific ioctl emulation. It's + * assumed that the vnode-specific logic called by the emulation + * function will reject ioctl commands not supported by the fd. + */ + VERIFY(ict != NULL); + while (ict->lict_func != NULL) { + if (ict->lict_lxcmd == cmd) + break; + ict++; + } + if (ict->lict_func == NULL) { + releasef(fdes); + return (set_errno(error)); + } + + res = ict->lict_func(fp, ict->lict_cmd, arg, ict->lict_lxcmd); + + if (ttolwp(curthread)->lwp_errno == EINTR && lx_ioctl_is_slow_dev(fp)) + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + releasef(fdes); + return (res); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c new file mode 100644 index 0000000000..13397e199e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_ioprio.c @@ -0,0 +1,66 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/lx_brand.h> + +/* 'which' values. */ +#define LX_IOPRIO_WHO_PROCESS 1 +#define LX_IOPRIO_WHO_PGRP 2 +#define LX_IOPRIO_WHO_USER 3 + +/* + * The possible values for the class. We report best effort (BE) as the class + * in use. + */ +#define LX_IOPRIO_CLASS_RT 1 +#define LX_IOPRIO_CLASS_BE 2 +#define LX_IOPRIO_CLASS_IDLE 3 + +/* + * An ioprio mask carries the class in the bits at and above + * LX_IOPRIO_CLASS_SHIFT and the priority level within that class below it. + */ +#define LX_IOPRIO_CLASS_SHIFT 13 + +/* Macro to determine the class from the input mask */ +#define LX_IOPRIO_PRIO_CLASS(m) ((m) >> LX_IOPRIO_CLASS_SHIFT) + +/* Macro to build a mask from a class and a priority level */ +#define LX_IOPRIO_PRIO_VALUE(c, d) (((c) << LX_IOPRIO_CLASS_SHIFT) | (d)) + +/* The priority level we report within the best effort class. */ +#define LX_IOPRIO_BE_NORM 4 + +/* + * Report the I/O priority of the target. No per-target priority is tracked, + * so once 'which' has been validated (EINVAL otherwise) the answer is always + * the best effort class; 'who' is ignored. + * + * The value returned is an ioprio mask rather than a bare class number: it + * is decoded by the caller with the same shift that lx_ioprio_set() applies + * to its input. Returning the bare class would decode as class 0, which + * lx_ioprio_set() itself rejects, so a value read here could not be written + * back. + */ +/* ARGSUSED */ +long +lx_ioprio_get(int which, int who) +{ + if (which < LX_IOPRIO_WHO_PROCESS || which > LX_IOPRIO_WHO_USER) + return (set_errno(EINVAL)); + + return (LX_IOPRIO_PRIO_VALUE(LX_IOPRIO_CLASS_BE, LX_IOPRIO_BE_NORM)); +} + +/* + * We allow setting any valid class, even though it's ignored. + * We ignore the 'who' parameter which means that we're not searching for + * the specified target in order to return a specific errno in the case that + * the target does not exist. + * + * Returns 0 when both 'which' and the class encoded in 'mask' are in range, + * and EINVAL otherwise. + */ +/* ARGSUSED */ +long +lx_ioprio_set(int which, int who, int mask) +{ + int class; + + if (which < LX_IOPRIO_WHO_PROCESS || which > LX_IOPRIO_WHO_USER) + return (set_errno(EINVAL)); + + class = LX_IOPRIO_PRIO_CLASS(mask); + if (class < LX_IOPRIO_CLASS_RT || class > LX_IOPRIO_CLASS_IDLE) + return (set_errno(EINVAL)); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_kill.c b/usr/src/uts/common/brand/lx/syscall/lx_kill.c new file mode 100644 index 0000000000..6fefbde705 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_kill.c @@ -0,0 +1,408 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/thread.h> +#include <sys/signal.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <lx_signum.h> +#include <sys/contract/process_impl.h> + +extern int kill(pid_t, int); + +/* + * Check if it is legal to send this signal to the init process. Linux + * kill(2) semantics dictate that no _unhandled_ signal may be sent to pid + * 1. 
+ */ +static int +lx_init_sig_check(int sig, pid_t pid) +{ + proc_t *p; + int rv = 0; + + mutex_enter(&pidlock); + if ((p = prfind(pid)) == NULL || p->p_stat == SIDL) { + rv = ESRCH; + } else if (sig != 0) { + if (sigismember(&cantmask, sig)) { + rv = EPERM; + } else { + mutex_enter(&p->p_lock); + if (PTOU(p)->u_signal[sig-1] == SIG_DFL || + PTOU(p)->u_signal[sig-1] == SIG_IGN) { + rv = EPERM; + } + mutex_exit(&p->p_lock); + } + } + mutex_exit(&pidlock); + + return (rv); +} + +static long +lx_thrkill(pid_t tgid, pid_t pid, int lx_sig, boolean_t tgkill) +{ + kthread_t *t; + proc_t *pp, *cp = curproc; + sigqueue_t *sqp; + int sig, rv; + + /* + * Unlike kill(2), Linux tkill(2) doesn't allow signals to + * be sent to process IDs <= 0 as it doesn't overlay any special + * semantics on the pid. + */ + if ((pid <= 0) || ((lx_sig < 0) || (lx_sig > LX_NSIG)) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + /* + * If the Linux pid is 1, translate the pid to the actual init + * pid for the zone. Note that Linux dictates that no unhandled + * signals may be sent to init, so check for that, too. + * + * Otherwise, extract the tid and real pid from the Linux pid. + */ + if (pid == 1) { + pid_t initpid; + + initpid = cp->p_zone->zone_proc_initpid; + if ((rv = lx_init_sig_check(sig, initpid)) != 0) { + return (set_errno(rv)); + } + } + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + /* + * Find the process for the passed pid... + */ + if (lx_lpid_lock(pid, curzone, 0, &pp, &t) != 0) { + rv = set_errno(ESRCH); + goto free_and_exit; + } + + /* + * Make sure the thread group matches the thread. 
+ */ + if (tgkill) { + if ((pid == 1 && tgid != 1) || + (pid != 1 && tgid != pp->p_pid)) { + mutex_exit(&pp->p_lock); + rv = set_errno(ESRCH); + goto free_and_exit; + } + } + + /* + * Deny permission to send the signal if either of the following + * is true: + * + * + The signal is SIGCONT and the target pid is not in the same + * session as the sender + * + * + prochasprocperm() shows the user lacks sufficient permission + * to send the signal to the target pid + */ + if (((sig == SIGCONT) && (pp->p_sessp != cp->p_sessp)) || + (!prochasprocperm(pp, cp, CRED()))) { + mutex_exit(&pp->p_lock); + rv = set_errno(EPERM); + goto free_and_exit; + } + + /* a signal of 0 means just check for the existence of the thread */ + if (lx_sig == 0) { + mutex_exit(&pp->p_lock); + rv = 0; + goto free_and_exit; + } + + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = SI_LWP; + sqp->sq_info.si_pid = cp->p_pid; + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(pp, t, sqp); + + mutex_exit(&pp->p_lock); + + return (0); + +free_and_exit: + kmem_free(sqp, sizeof (sigqueue_t)); + return (rv); +} + +long +lx_tgkill(pid_t tgid, pid_t pid, int lx_sig) +{ + return (lx_thrkill(tgid, pid, lx_sig, B_TRUE)); +} + +long +lx_tkill(pid_t pid, int lx_sig) +{ + return (lx_thrkill(0, pid, lx_sig, B_FALSE)); +} + +long +lx_kill(pid_t lx_pid, int lx_sig) +{ + pid_t s_pid, initpid; + sigsend_t v; + zone_t *zone = curzone; + struct proc *p; + int err, sig, nfound; + + if ((lx_sig < 0) || (lx_sig > LX_NSIG) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + initpid = zone->zone_proc_initpid; + if (lx_pid == 0 || lx_pid == -1) { + s_pid = 0; + } else if (lx_pid > 0) { + /* + * Translations for individual processes (including pid 1) is + * all handled by lx_lpid_to_spair. + */ + if (lx_lpid_to_spair(lx_pid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid that means it doesn't + * exist in this zone. 
+ */ + return (set_errno(ESRCH)); + } + } else { + ASSERT(lx_pid < 0); + if (lx_lpid_to_spair(-lx_pid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid it means that the + * process group leader doesn't exist in this zone. + * In this case assuming that the Linux pid is + * the same as the Solaris pid will get us the + * correct behavior. + */ + s_pid = -lx_pid; + } + } + + /* + * Check that it is legal for this signal to be sent to init + */ + if (s_pid == initpid && (err = lx_init_sig_check(sig, s_pid)) != 0) + return (set_errno(err)); + + /* + * For individual processes, kill() semantics are the same between + * Solaris and Linux. + */ + if (lx_pid >= 0) + return (kill(s_pid, sig)); + + /* + * In Solaris, sending a signal to -pid means "send a signal to + * everyone in process group pid." In Linux it means "send a + * signal to everyone in the group other than init." Sending a + * signal to -1 means "send a signal to every process except init + * and myself." + */ + + bzero(&v, sizeof (v)); + v.sig = sig; + v.checkperm = 1; + v.sicode = SI_USER; + err = 0; + + mutex_enter(&pidlock); + + p = (lx_pid == -1) ? practive : pgfind(s_pid); + nfound = 0; + while (err == 0 && p != NULL) { + if ((p->p_zone == zone) && (p->p_stat != SIDL) && + (p->p_pid != initpid) && (lx_pid < -1 || p != curproc)) { + nfound++; + err = sigsendproc(p, &v); + } + + p = (lx_pid == -1) ? p->p_next : p->p_pglink; + } + mutex_exit(&pidlock); + + /* + * If we found no processes, we'll return ESRCH -- but unlike our + * native kill(2), we do not return EPERM if processes are found but + * we did not have permission to send any of them a signal. + */ + if (nfound == 0) + err = ESRCH; + + return (err ? set_errno(err) : 0); +} + +/* + * This handles the unusual case where the user sends a non-queueable signal + * through rt_sigqueueinfo. Signals sent with codes that indicate they are + * queuable are sent through the sigqueue syscall via the user level function + * lx_rt_sigqueueinfo(). 
+ */ +int +lx_helper_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo) +{ + proc_t *target_proc; + pid_t s_pid; + zone_t *zone = curproc->p_zone; + sigsend_t send; + int err; + siginfo_t kinfo; + /* + * Work on a kernel copy of the caller's siginfo; only si_code and + * si_value are consumed below. A failed copy is reported as EFAULT. + */ + if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0) + return (set_errno(EFAULT)); + /* + * Unlike in lx_kill, this process id must be exact, no negatives: + * 0 yields ESRCH and anything below 0 yields EINVAL. + */ + if (tgid == 0) + return (set_errno(ESRCH)); + if (tgid < 0) + return (set_errno(EINVAL)); + /* + * Translate init directly, otherwise use the convenient utility + * function to translate. Since we're sending to the whole group, we + * only need the solaris pid, and not the lwp id. + */ + if (tgid == 1) { + s_pid = zone->zone_proc_initpid; + } else { + if (lx_lpid_to_spair(tgid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid that means it doesn't + * exist in this zone. + */ + return (set_errno(ESRCH)); + } + } + /* + * We shouldn't have queueable signals here, those are sent elsewhere + * by the usermode handler for this emulated call. + * + * NOTE(review): the test below fails with EINVAL when SI_CANQUEUE() + * is false for the si_code, which reads as the opposite of what this + * comment and the function's header comment describe -- confirm the + * intended polarity against SI_CANQUEUE()'s definition and the + * usermode caller. + */ + if (!SI_CANQUEUE(kinfo.si_code)) { + return (set_errno(EINVAL)); + } + /* Since our signal shouldn't queue, we just call sigsendproc(). */ + bzero(&send, sizeof (send)); + send.sig = sig; + send.checkperm = 1; + send.sicode = kinfo.si_code; + send.value = kinfo.si_value; + + mutex_enter(&pidlock); + target_proc = prfind(s_pid); + err = 0; + if (target_proc != NULL) { + err = sigsendproc(target_proc, &send); + if (err == 0 && send.perm == 0) + err = EPERM; + } else { + err = ESRCH; + } + mutex_exit(&pidlock); + + return (err ? set_errno(err) : 0); +} + +/* + * Unlike the above function, this handles all system calls to + * rt_tgsigqueueinfo regardless of si_code. 
+ */ +int +lx_helper_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo) +{ + int err; + proc_t *p = NULL; + kthread_t *t; + sigqueue_t *sqp; + siginfo_t kinfo; + + if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0) { + return (set_errno(EFAULT)); + } + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + + if (lx_lpid_lock(tid, curzone, 0, &p, &t) != 0) { + err = ESRCH; + goto errout; + } + + /* + * For group leaders, the SunOS pid == Linux pid, so the SunOS leader + * pid should be the same as the tgid. Because the tgid comes in via + * the syscall, we need to check for an invalid value. + */ + if (p->p_pid != tgid) { + err = EINVAL; + goto errout; + } + + /* + * In order to match the Linux behavior of emitting ESRCH errors before + * confirming that the signal is valid, this check _must_ be performed + * after the target process/thread is located. + */ + if (sig < 0 || sig >= NSIG) { + err = EINVAL; + goto errout; + } + + /* + * To merely check for the existence of a thread, the caller will pass + * a signal value of 0. + */ + if (sig != 0) { + ASSERT(sqp != NULL); + + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = kinfo.si_code; + sqp->sq_info.si_pid = p->p_pid; + sqp->sq_info.si_ctid = PRCTID(p); + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(p, t, sqp); + } + mutex_exit(&p->p_lock); + return (0); + +errout: + if (p != NULL) { + mutex_exit(&p->p_lock); + } + kmem_free(sqp, sizeof (sigqueue_t)); + return (set_errno(err)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_link.c b/usr/src/uts/common/brand/lx/syscall/lx_link.c new file mode 100644 index 0000000000..f2e268771e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_link.c @@ -0,0 +1,200 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/fcntl.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/systm.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> + +#define LX_LINK_ALLOWED (LX_AT_SYMLINK_FOLLOW | LX_AT_EMPTY_PATH) + +/* From "uts/common/syscall/stat.c" */ +extern int cstatat_getvp(int, char *, int, vnode_t **, cred_t **); +/* From uts/common/syscall/unlink.c */ +extern int unlinkat(int, char *, int); +/* From uts/common/syscall/symlink.c */ +extern int symlinkat(char *, int, char *); +/* From uts/common/syscall/readlink.c */ +extern ssize_t readlinkat(int, char *, char *, size_t); + +static long +lx_link_common(int ffd, char *from, int tfd, char *to, int flags) +{ + int error; + vnode_t *fsvp = NULL, *tsvp = NULL; + enum symfollow follow = NO_FOLLOW; + + if ((flags & ~LX_LINK_ALLOWED) != 0) { + return (set_errno(EINVAL)); + } + if ((flags & LX_AT_EMPTY_PATH) == 0) { + char c; + + /* + * Check that both 'from' and 'to' names are non-empty if + * AT_EMPTY_PATH is not set. + */ + if (copyin(from, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } else if (c == '\0') { + return (set_errno(ENOENT)); + } + if (copyin(to, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } else if (c == '\0') { + return (set_errno(ENOENT)); + } + + /* + * XXX: When our support for LX capabilities improves, ENOENT + * should be thrown when a process lacking CAP_DAC_READ_SEARCH + * attempts to use the AT_EMPTY_PATH flag. 
+ */ + } + if ((flags & LX_AT_SYMLINK_FOLLOW) != 0) { + follow = FOLLOW; + } + + if ((error = fgetstartvp(ffd, from, &fsvp)) != 0) { + goto out; + } + if ((error = fgetstartvp(tfd, to, &tsvp)) != 0) { + goto out; + } + error = vn_linkat(fsvp, from, follow, tsvp, to, UIO_USERSPACE); + +out: + if (fsvp != NULL) { + VN_RELE(fsvp); + } + if (tsvp != NULL) { + VN_RELE(tsvp); + } + if (error) { + return (set_errno(error)); + } + return (0); +} + +long +lx_link(char *from, char *to) +{ + return (lx_link_common(AT_FDCWD, from, AT_FDCWD, to, 0)); +} + +long +lx_linkat(int ffd, char *from, int tfd, char *to, int flags) +{ + ffd = (ffd == LX_AT_FDCWD) ? AT_FDCWD : ffd; + tfd = (tfd == LX_AT_FDCWD) ? AT_FDCWD : tfd; + + return (lx_link_common(ffd, from, tfd, to, flags)); +} + +static boolean_t +lx_isdir(int atfd, char *path) +{ + cred_t *cr = NULL; + vnode_t *vp = NULL; + boolean_t is_dir; + + if (cstatat_getvp(atfd, path, NO_FOLLOW, &vp, &cr) != 0) + return (B_FALSE); + + crfree(cr); + is_dir = (vp->v_type == VDIR); + VN_RELE(vp); + + return (is_dir); +} + +long +lx_unlink(char *path) +{ + int err; + + if ((err = unlinkat(AT_FDCWD, path, 0)) == EPERM) { + /* On Linux, an unlink of a dir returns EISDIR, not EPERM. */ + if (lx_isdir(AT_FDCWD, path)) + return (set_errno(EISDIR)); + } + + return (err); +} + +long +lx_unlinkat(int atfd, char *path, int flag) +{ + int err; + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + if ((flag = ltos_at_flag(flag, AT_REMOVEDIR, B_TRUE)) < 0) + return (set_errno(EINVAL)); + + err = unlinkat(atfd, path, flag); + if (err == EPERM && !(flag & AT_REMOVEDIR)) { + /* On Linux, an unlink of a dir returns EISDIR, not EPERM. */ + if (lx_isdir(atfd, path)) + return (set_errno(EISDIR)); + } + if (err == EEXIST && (flag & AT_REMOVEDIR)) { + /* On Linux, an unlink of a non-empty dir returns ENOTEMPTY, not EEXIST. 
*/ + if (lx_isdir(atfd, path)) + return (set_errno(ENOTEMPTY)); + } + + return (err); +} + +long +lx_symlink(char *name1, char *name2) +{ + return (symlinkat(name1, AT_FDCWD, name2)); +} + +long +lx_symlinkat(char *name1, int atfd, char *name2) +{ + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + return (symlinkat(name1, atfd, name2)); +} + +long +lx_readlink(char *path, char *buf, size_t bufsize) +{ + if (bufsize <= 0) + return (set_errno(EINVAL)); + + return (readlinkat(AT_FDCWD, path, buf, bufsize)); +} + +long +lx_readlinkat(int atfd, char *path, char *buf, size_t bufsize) +{ + if (bufsize <= 0) + return (set_errno(EINVAL)); + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + return (readlinkat(atfd, path, buf, bufsize)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_lseek.c b/usr/src/uts/common/brand/lx/syscall/lx_lseek.c new file mode 100644 index 0000000000..3ac32a2faf --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_lseek.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/errno.h> +#include <sys/debug.h> + + +#if defined(_SYSCALL32_IMPL) || defined(_ILP32) + +/* from uts/common/syscalls/lseek.c */ +extern offset_t llseek32(int32_t, uint32_t, uint32_t, int); +extern off32_t lseek32(int32_t, off32_t, int32_t); +/* + * Seek fd using an offset supplied as two 32-bit halves, which are handed to + * llseek32() as (off_low, off_high). If the LWP's errno is 0 afterwards, the + * resulting offset_t is copied out to 'out'; a failed copyout is reported as + * EFAULT. The return value is the LWP's errno, so 0 indicates success. Only + * valid for ILP32 callers (asserted). + */ +long +lx_llseek(int fd, uint32_t off_high, uint32_t off_low, void *out, int whence) +{ + offset_t res; + + ASSERT(get_udatamodel() == DATAMODEL_ILP32); + res = llseek32(fd, off_low, off_high, whence); + if (ttolwp(curthread)->lwp_errno == 0) { + if (copyout(&res, out, sizeof (offset_t)) != 0) { + return (set_errno(EFAULT)); + } + } + return (ttolwp(curthread)->lwp_errno); +} + +/* + * Seek fd by a 32-bit offset and return the resulting offset, or EOVERFLOW + * if the seek succeeded but the result exceeds MAXOFF32_T. hival is the + * sign extension of 'offset' (all ones when negative, else 0) and is passed + * to llseek32() in the position lx_llseek() uses for off_high. Only valid + * for ILP32 callers (asserted). + */ +long +lx_lseek32(int fd, off32_t offset, int whence) +{ + offset_t res; + const uint32_t hival = (offset < 0) ? (uint32_t)-1 : 0; + + /* + * When returning EOVERFLOW for an offset which is outside the bounds + * of an off32_t, Linux will still perform the actual seek before + * yielding EOVERFLOW. + * + * In order to emulate that behavior, an llseek bound to the 64-bit + * boundary is used. The overflow can then be reported after the + * successful seek. 
+ */ + ASSERT(get_udatamodel() == DATAMODEL_ILP32); + res = llseek32(fd, (uint32_t)offset, hival, whence); + if (ttolwp(curthread)->lwp_errno == 0 && res > MAXOFF32_T) { + return (set_errno(EOVERFLOW)); + } + return (res); + +} +#endif /* defined(_SYSCALL32_IMPL) || defined(_ILP32) */ + +#if defined(_LP64) + +/* from uts/common/syscalls/lseek.c */ +extern off_t lseek64(int, off_t, int); +/* + * Seek for LP64 callers (asserted): a direct pass-through to lseek64(), whose + * return value is returned unchanged. + */ +long +lx_lseek64(int fd, off_t offset, int whence) +{ + ASSERT(get_udatamodel() == DATAMODEL_LP64); + return (lseek64(fd, offset, whence)); +} + +#endif /* defined(_LP64) */ diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mem.c b/usr/src/uts/common/brand/lx/syscall/lx_mem.c new file mode 100644 index 0000000000..15351444c8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_mem.c @@ -0,0 +1,1118 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/mman.h> +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/policy.h> +#include <sys/lx_brand.h> +#include <sys/fcntl.h> +#include <sys/pathname.h> +#include <vm/seg_vn.h> +#include <vm/seg_spt.h> +#include <sys/shm_impl.h> +#include <vm/as.h> + +/* From uts/common/os/grow.c */ +extern int mprotect(caddr_t, size_t, int); +extern caddr_t smmap64(caddr_t, size_t, int, int, int, off_t); +extern int munmap(caddr_t, size_t); +/* From uts/common/syscall/close.c */ +extern int close(int); +/* From uts/common/fs/proc/prsubr.c */ +extern uint_t pr_getprot(struct seg *, int, void **, caddr_t *, caddr_t *, + caddr_t); +/* From uts/common/vm/seg_spt.c */ +extern struct seg_ops segspt_shmops; +/* From uts/common/syscall/memcntl.c */ +extern int memcntl(caddr_t, size_t, int, caddr_t, int, int); +/* From uts/common/os/grow.c */ +extern int smmap_common(caddr_t *, size_t, int, int, struct file *, offset_t); + +/* + * After Linux 2.6.8, an unprivileged process can lock memory up to its + * RLIMIT_MEMLOCK resource limit. + * + * Within memcntl() it assumes we have PRIV_PROC_LOCK_MEMORY, or the check in + * secpolicy_lock_memory() will fail when we attempt to lock memory. Thus, + * to support the Linux semantics, we bypass memcntl() and perform the locking + * operations directly. 
+ */ + +#define LX_MADV_NORMAL 0 +#define LX_MADV_RANDOM 1 +#define LX_MADV_SEQUENTIAL 2 +#define LX_MADV_WILLNEED 3 +#define LX_MADV_DONTNEED 4 +#define LX_MADV_FREE 8 +#define LX_MADV_REMOVE 9 +#define LX_MADV_DONTFORK 10 +#define LX_MADV_DOFORK 11 +#define LX_MADV_MERGEABLE 12 +#define LX_MADV_UNMERGEABLE 13 +#define LX_MADV_HUGEPAGE 14 +#define LX_MADV_NOHUGEPAGE 15 +#define LX_MADV_DONTDUMP 16 +#define LX_MADV_DODUMP 17 + +#define LX_VALID_MSYNC (MS_ASYNC|MS_INVALIDATE|MS_SYNC) + +#define LX_PROT_GROWSDOWN 0x01000000 +#define LX_PROT_GROWSUP 0x02000000 + +/* Internal segment map flags */ +#define LX_SM_READ 0x01 +#define LX_SM_WRITE 0x02 +#define LX_SM_EXEC 0x04 +#define LX_SM_SHM 0x08 +#define LX_SM_ANON 0x10 +#define LX_SM_SHARED 0x20 +#define LX_SM_NORESERVE 0x40 + +/* For convenience */ +#define LX_PROT_GROWMASK (LX_PROT_GROWSUP|LX_PROT_GROWSDOWN) + +/* From lx_rlimit.c */ +extern void lx_get_rctl(char *, struct rlimit64 *); +/* + * Shared implementation behind the lock and unlock entry points. The range + * [addr, addr + len) is widened to page boundaries (addr rounded down, the + * length rounded up to cover the original end) and 'op' is applied to it via + * as_ctl() on the current process's address space. A zero len succeeds + * without doing anything, a widened range that wraps is EINVAL, and EAGAIN + * from as_ctl() is reported as ENOMEM. + */ +static int +lx_mlock_common(int op, uintptr_t addr, size_t len) +{ + int err; + struct as *as = curproc->p_as; + const uintptr_t align_addr = addr & (uintptr_t)PAGEMASK; + const size_t align_len = P2ROUNDUP(len + (addr & PAGEOFFSET), PAGESIZE); + + if (len == 0) { + /* Linux short-circuits to success on zero length */ + return (0); + } else if ((align_addr + align_len) <= align_addr) { + /* Catch overflow (including when aligning len) */ + return (set_errno(EINVAL)); + } + + err = as_ctl(as, (caddr_t)align_addr, align_len, op, 0, 0, NULL, 0); + if (err == EAGAIN) + err = ENOMEM; + return (err == 0 ? 0 : set_errno(err)); +} + +int +lx_mlock(uintptr_t addr, size_t len) +{ + int err; + + /* + * If the caller is not privileged and either the limit is 0, or + * the kernel version is earlier than 2.6.9, then fail with EPERM. See + * LTP mlock2.c. 
+ */ + if ((err = secpolicy_lock_memory(CRED())) != 0) { + struct rlimit64 rlim64; + + lx_get_rctl("process.max-locked-memory", &rlim64); + if (rlim64.rlim_cur == 0 || + lx_kern_release_cmp(curzone, "2.6.9") < 0) + return (set_errno(err)); + } + + return (lx_mlock_common(MC_LOCK, addr, len)); +} +/* + * Unlock a range. No privilege or resource-control check is made here; the + * request goes straight to lx_mlock_common() with MC_UNLOCK. + */ +int +lx_munlock(uintptr_t addr, size_t len) +{ + return (lx_mlock_common(MC_UNLOCK, addr, len)); +} +/* + * Lock the whole address space of the current process. The privilege check + * is performed before 'flags' is validated; flags must be a non-empty subset + * of MCL_CURRENT | MCL_FUTURE or EINVAL results. EAGAIN from as_ctl() is + * reported as ENOMEM. + */ +int +lx_mlockall(int flags) +{ + int err; + struct as *as = curproc->p_as; + + /* + * If the caller is not privileged and either the limit is 0, or + * the kernel version is earlier than 2.6.9, then fail with EPERM. See + * LTP mlockall2.c. + */ + if ((err = secpolicy_lock_memory(CRED())) != 0) { + struct rlimit64 rlim64; + + lx_get_rctl("process.max-locked-memory", &rlim64); + if (rlim64.rlim_cur == 0 || + lx_kern_release_cmp(curzone, "2.6.9") < 0) + return (set_errno(err)); + } + + if ((flags & ~(MCL_FUTURE | MCL_CURRENT)) || flags == 0) + return (set_errno(EINVAL)); + + err = as_ctl(as, 0, 0, MC_LOCKAS, 0, (uintptr_t)flags, NULL, 0); + if (err == EAGAIN) + err = ENOMEM; + return (err == 0 ? 0 : set_errno(err)); +} +/* + * Unlock the whole address space of the current process. The privilege check + * is only applied when the emulated kernel release is older than 2.6.9. + */ +int +lx_munlockall(void) +{ + int err; + struct as *as = curproc->p_as; + + if (lx_kern_release_cmp(curzone, "2.6.9") < 0) { + if ((err = secpolicy_lock_memory(CRED())) != 0) + return (set_errno(err)); + } + + err = as_ctl(as, 0, 0, MC_UNLOCKAS, 0, 0, NULL, 0); + return (err == 0 ? 
0 : set_errno(err)); +} + +/* + * Synchronize a mapped range. addr must be page aligned and flags may only + * contain bits from LX_VALID_MSYNC (EINVAL otherwise). A zero len succeeds + * without doing anything. len is rounded up to a whole number of pages + * before the request is passed to memcntl() as MC_SYNC; a rounded range that + * wraps the address space is ENOMEM. + */ +int +lx_msync(uintptr_t addr, size_t len, int flags) +{ + const size_t align_len = P2ROUNDUP(len, PAGESIZE); + + if ((addr & PAGEOFFSET) != 0 || + (flags & ~LX_VALID_MSYNC) != 0) { + return (set_errno(EINVAL)); + } else if (len == 0) { + /* Linux short-circuits to success on zero length */ + return (0); + } else if ((addr + align_len) <= addr) { + /* + * Catch overflow (including when aligning len). The + * comparison must be <= rather than <: a len within a page + * of SIZE_MAX rounds up to an align_len of 0, which leaves + * the sum equal to addr. A genuine zero len was already + * handled above, so equality here always means a wrap. + */ + return (set_errno(ENOMEM)); + } + + return (memcntl((caddr_t)addr, align_len, MC_SYNC, + (caddr_t)(uintptr_t)flags, 0, 0)); +} + +int +lx_madvise(uintptr_t addr, size_t len, int advice) +{ + int err; + const size_t align_len = P2ROUNDUP(len, PAGESIZE); + + switch (advice) { + case LX_MADV_REMOVE: + /* approximately similar */ + advice = MADV_FREE; + break; + + case LX_MADV_DONTNEED: + /* + * On Linux, MADV_DONTNEED implies an immediate purge of the + * specified region. This is spuriously different from + * (nearly) every other Unix, having apparently been done to + * mimic the semantics on Digital Unix (!). This is bad enough + * (MADV_FREE both has better semantics and results in better + * performance), but it gets worse: Linux applications (and + * notably, jemalloc) have managed to depend on the busted + * semantics of MADV_DONTNEED on Linux. We implement these + * semantics via MADV_PURGE -- and we translate our advice + * accordingly. 
+ */ + advice = MADV_PURGE; + break; + + case LX_MADV_FREE: + advice = MADV_FREE; + break; + + case LX_MADV_NORMAL: + case LX_MADV_RANDOM: + case LX_MADV_SEQUENTIAL: + case LX_MADV_WILLNEED: + /* These map directly to the illumos values */ + break; + + case LX_MADV_DONTFORK: + case LX_MADV_DOFORK: + case LX_MADV_HUGEPAGE: + case LX_MADV_NOHUGEPAGE: + case LX_MADV_DONTDUMP: + case LX_MADV_DODUMP: + /* harmless to pretend these work */ + return (0); + default: + return (set_errno(EINVAL)); + } + + if ((addr & PAGEOFFSET) != 0) { + return (set_errno(EINVAL)); + } else if (len == 0) { + /* Linux short-circuits to success on zero length */ + return (0); + } else if ((addr + align_len) <= addr) { + /* + * Catch overflow (including when aligning len). Unlike + * similar syscalls, this is an EINVAL failure for madvise(2). + */ + return (set_errno(EINVAL)); + } + + err = memcntl((caddr_t)addr, align_len, MC_ADVISE, + (caddr_t)(intptr_t)advice, 0, 0); + if (err == EBUSY) { + if (advice != MADV_PURGE) { + return (set_errno(EINVAL)); + } + /* + * If we received an EBUSY from a MADV_PURGE, we will now try + * again with a MADV_DONTNEED: there are conditions (namely, + * with locked mappings that haven't yet been faulted in) where + * MADV_PURGE will fail but MADV_DONTNEED will succeed. If + * this succeeds, we'll call the operation a success; if not, + * we'll kick back EINVAL. + */ + advice = MADV_DONTNEED; + err = memcntl((caddr_t)addr, align_len, MC_ADVISE, + (caddr_t)(intptr_t)advice, 0, 0); + if (err != 0) { + return (set_errno(EINVAL)); + } + /* Clear the old errno since success was eventually achieved. */ + ttolwp(curthread)->lwp_errno = 0; + } + return (err); +} + +int +lx_mprotect(uintptr_t addr, size_t len, int prot) +{ + const size_t align_len = P2ROUNDUP(len, PAGESIZE); + + /* + * The flags for native mprotect(2) are essentially the same as those + * on Linux, with the exception of PROT_GROWSUP/PROT_GROWSDOWN, for + * which there is no native analog. 
Those flags are presently ignored, + * unless they are both present, which represents an invalid argument. + */ + if ((prot & LX_PROT_GROWMASK) == LX_PROT_GROWMASK) { + return (set_errno(EINVAL)); + } + prot &= ~(LX_PROT_GROWMASK); + + if ((addr & PAGEOFFSET) != 0) { + return (set_errno(EINVAL)); + } else if (len == 0) { + /* Linux short-circuits to success on zero length */ + return (0); + } else if ((addr + align_len) <= addr) { + /* Catch overflow (including when aligning len) */ + return (set_errno(ENOMEM)); + } + + return (mprotect((void *)addr, align_len, prot)); +} + +/* + * There are two forms of mmap, mmap() and mmap2(). The only difference is that + * the final argument to mmap2() specifies the number of pages, not bytes. Also, + * mmap2 is 32-bit only. + * + * Linux has a number of additional flags, but they are all deprecated. We also + * ignore the MAP_GROWSDOWN flag, which has no equivalent on Solaris. + * + * The Linux mmap() returns ENOMEM in some cases where illumos returns + * EOVERFLOW, so we translate the errno as necessary. + */ + +#define LX_MAP_ANONYMOUS 0x00020 +#define LX_MAP_LOCKED 0x02000 +#define LX_MAP_NORESERVE 0x04000 +#define LX_MAP_32BIT 0x00040 + +#define ONE_GB 0x40000000 + +static void lx_remap_anoncache_invalidate(uintptr_t, size_t); + +static int +lx_ltos_mmap_flags(int flags) +{ + int new_flags; + + new_flags = flags & (MAP_TYPE | MAP_FIXED); + + if (flags & LX_MAP_ANONYMOUS) + new_flags |= MAP_ANONYMOUS; + if (flags & LX_MAP_NORESERVE) + new_flags |= MAP_NORESERVE; + +#if defined(_LP64) + if (flags & LX_MAP_32BIT) + new_flags |= MAP_32BIT; +#endif + + return (new_flags); +} + +static void * +lx_mmap_common(void *addr, size_t len, int prot, int flags, int fd, off64_t off) +{ + caddr_t ret; + lx_proc_data_t *lxpd = ptolxproc(curproc); + + /* + * Under Linux, the file descriptor is ignored when mapping zfod + * anonymous memory, On illumos, we want the fd set to -1 for the + * same functionality. 
+ */ + if (flags & LX_MAP_ANONYMOUS) + fd = -1; + + /* + * We refuse, as a matter of principle, to overcommit memory. + * Unfortunately, several bits of important and popular software expect + * to be able to pre-allocate large amounts of virtual memory but then + * probably never use it. One particularly bad example of this + * practice is golang. Another is the JVM. + * + * In the interest of running software, unsafe or not, we fudge + * something vaguely similar to overcommit by permanently enabling + * MAP_NORESERVE unless MAP_LOCKED was requested: + */ + if (!(flags & LX_MAP_LOCKED)) { + flags |= LX_MAP_NORESERVE; + } + + /* + * This is totally insane. The NOTES section in the linux mmap(2) man + * page claims that on some architectures, read protection may + * automatically include exec protection. It has been observed on a + * native linux system that the /proc/<pid>/maps file does indeed + * show that segments mmap'd from userland (such as libraries mapped in + * by the dynamic linker) all have the exec permission set, even for + * data segments. + * + * This insanity is tempered by the fact that the behavior is disabled + * for ELF binaries bearing a PT_GNU_STACK header which lacks PF_X + * (which most do). Such a header will clear the READ_IMPLIES_EXEC + * flag from the process personality. + */ + if (prot & PROT_READ) { + if ((lxpd->l_personality & LX_PER_READ_IMPLIES_EXEC) != 0) { + prot |= PROT_EXEC; + } + } + + ret = smmap64(addr, len, prot, lx_ltos_mmap_flags(flags), fd, off); + if (ttolwp(curthread)->lwp_errno != 0) { + if (ttolwp(curthread)->lwp_errno == EOVERFLOW) + (void) set_errno(ENOMEM); + return ((void *)-1); + } + + if (flags & LX_MAP_LOCKED) { + (void) lx_mlock_common(MC_LOCK, (uintptr_t)ret, len); + /* + * clear any errno from mlock: the lock is best effort, so a + * failure there does not fail the mapping itself + */ + ttolwp(curthread)->lwp_errno = 0; + } + + /* + * We have a new mapping; invalidate any cached anonymous regions that + * overlap(ped) with it. 
+ */ + mutex_enter(&lxpd->l_remap_anoncache_lock); + lx_remap_anoncache_invalidate((uintptr_t)ret, len); + mutex_exit(&lxpd->l_remap_anoncache_lock); + + return (ret); +} +/* + * mmap entry point taking a byte offset: passes its arguments through to + * lx_mmap_common() and returns the resulting pointer as a long. + */ +long +lx_mmap(void *addr, size_t len, int prot, int flags, int fd, off64_t off) +{ + return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd, off)); +} +/* + * mmap2 entry point: identical to lx_mmap() except that 'off' counts pages + * and is multiplied by PAGESIZE, after widening to off64_t, to obtain the + * byte offset. + */ +long +lx_mmap2(void *addr, size_t len, int prot, int flags, + int fd, off_t off) +{ + return ((ssize_t)lx_mmap_common(addr, len, prot, flags, fd, + (off64_t)off * PAGESIZE)); +} +/* + * Unmap a range. The anonymous remap cache is invalidated for the range + * first, under l_remap_anoncache_lock, and then munmap() does the work; its + * return value is returned unchanged. + */ +long +lx_munmap(void *addr, size_t len) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + + /* + * Invalidate any cached anonymous regions that overlap(ped) with the + * range being unmapped. + */ + mutex_enter(&lxpd->l_remap_anoncache_lock); + lx_remap_anoncache_invalidate((uintptr_t)addr, len); + mutex_exit(&lxpd->l_remap_anoncache_lock); + + return (munmap(addr, len)); +} + +#define LX_MREMAP_MAYMOVE 1 /* mapping can be moved */ +#define LX_MREMAP_FIXED 2 /* address is fixed */ + +/* + * Unfortunately, the Linux mremap() manpage contains a statement that is, at + * best, grossly oversimplified: that mremap() "can be used to implement a + * very efficient realloc(3)." To the degree this is true at all, it is only + * true narrowly (namely, when large buffers are being expanded but can't be + * expanded in place due to virtual address space restrictions) -- but + * apparently, someone took this very literally, because variants of glibc + * appear to simply implement realloc() in terms of mremap(). This is + * unfortunate because absent intelligent usage, it forces realloc() to have + * an unnecessary interaction with the VM system for small expansions -- and if + * realloc() is itself abused (e.g., if a consumer repeatedly expands and + * contracts the same memory buffer), the net result can be less efficient + * than a much more naive realloc() implementation. 
And if native Linux is
+ * suboptimal in this case, we are deeply pathological, having not
+ * historically supported mremap() for anonymous mappings at all. To make
+ * this at least palatable, we not only support remap for anonymous mappings
+ * (see lx_remap_anon(), below), we also cache the metadata associated with
+ * these anonymous remappings to reduce the need to search our address space.
+ * We implement the anonymous metadata cache with l_remap_anoncache, an LRU
+ * cache of lx_segmap_t's that correspond to anonymous segments that have been
+ * resized (only anonymous mappings that have been remapped are cached). The
+ * cache is part of the process's lx-brand-specific data.
+ */
+
+/*
+ * Search our address space (as) mappings to find the specified mapping. This
+ * is derived from the procfs prgetmap() code. We implement the "reserved"
+ * behavior on the segment so as to accommodate the case where an mmap()'d and
+ * then ftruncate()'d file is being mremap()'d: we use the size of the
+ * mapping (which we need to validate old_size).
+ *
+ * Return 0 if mapping is found, errno if there is a problem or if mapping
+ * not found. If the mapping is found, we populate the mp parameter, vpp and
+ * offp with the results.
+ *
+ * When *vpp is set to a non-NULL vnode, a hold has been taken on it (VN_HOLD)
+ * and the caller is responsible for releasing it.
+ */
+static int
+lx_get_mapping(uintptr_t find_addr, size_t find_size, lx_segmap_t *mp,
+    vnode_t **vpp, offset_t *offp)
+{
+	struct as *as = curproc->p_as;
+	struct seg *seg;
+	uint_t prot;
+	caddr_t saddr, eaddr, naddr;
+
+	/* pr_getprot asserts that the as is held as a writer */
+	AS_LOCK_ENTER(as, RW_WRITER);
+
+	seg = as_segat(as, (caddr_t)find_addr);
+	if (seg == NULL || (seg->s_flags & S_HOLE) != 0) {
+		AS_LOCK_EXIT(as);
+		return (EFAULT);
+	}
+
+	/*
+	 * We're interested in the reserved space, so we use the size of the
+	 * segment itself.
+	 */
+	eaddr = seg->s_base + seg->s_size;
+	for (saddr = seg->s_base; saddr < eaddr; saddr = naddr) {
+		uintptr_t vaddr;
+		size_t size;
+		struct vnode *vp;
+		void *tmp = NULL;
+
+		prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr);
+		if (saddr == naddr)
+			continue;
+
+		vaddr = (uintptr_t)saddr;
+		size = (uintptr_t)naddr - (uintptr_t)saddr;
+
+		if (vaddr == find_addr && find_size < size &&
+		    (find_size & PAGEOFFSET) != 0) {
+			/*
+			 * We found a mapping but the size being requested is
+			 * less than the mapping and not a multiple of our page
+			 * size. If it is an anonymous mapping, that likely
+			 * means the application did the initial mmap with this
+			 * odd size. We'll round up to the next page boundary
+			 * in this case.
+			 */
+			if (seg->s_ops == &segspt_shmops ||
+			    (seg->s_ops == &segvn_ops &&
+			    (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
+			    vp == NULL))) {
+				/*
+				 * It's anonymous, round up the size.
+				 */
+				find_size = ptob(btopr(find_size));
+			}
+		}
+
+		/* Check if mapping matches our arguments */
+		if (vaddr == find_addr && size == find_size) {
+			struct vattr vattr;
+
+			mp->lxsm_vaddr = vaddr;
+			mp->lxsm_size = size;
+			mp->lxsm_flags = 0;
+
+			*offp = SEGOP_GETOFFSET(seg, saddr);
+
+			/* Translate protections and mapping type to LX_SM_* */
+			if (prot & PROT_READ)
+				mp->lxsm_flags |= LX_SM_READ;
+			if (prot & PROT_WRITE)
+				mp->lxsm_flags |= LX_SM_WRITE;
+			if (prot & PROT_EXEC)
+				mp->lxsm_flags |= LX_SM_EXEC;
+			if (SEGOP_GETTYPE(seg, saddr) & MAP_SHARED)
+				mp->lxsm_flags |= LX_SM_SHARED;
+			if (SEGOP_GETTYPE(seg, saddr) & MAP_NORESERVE)
+				mp->lxsm_flags |= LX_SM_NORESERVE;
+			/* Same "is it anonymous?" test as used above */
+			if (seg->s_ops == &segspt_shmops ||
+			    (seg->s_ops == &segvn_ops &&
+			    (SEGOP_GETVP(seg, saddr, &vp) != 0 ||
+			    vp == NULL)))
+				mp->lxsm_flags |= LX_SM_ANON;
+
+			if (seg->s_ops == &segspt_shmops) {
+				mp->lxsm_flags |= LX_SM_SHM;
+			} else if ((mp->lxsm_flags & LX_SM_SHARED) &&
+			    curproc->p_segacct && shmgetid(curproc,
+			    seg->s_base) != SHMID_NONE) {
+				mp->lxsm_flags |= LX_SM_SHM;
+			}
+
+			/*
+			 * Only hand back (and hold) a vnode for a segvn
+			 * mapping of a regular file whose attributes can be
+			 * fetched; everything else reports a NULL vnode.
+			 */
+			vattr.va_mask = AT_FSID | AT_NODEID;
+			if (seg->s_ops == &segvn_ops &&
+			    SEGOP_GETVP(seg, saddr, &vp) == 0 &&
+			    vp != NULL && vp->v_type == VREG &&
+			    VOP_GETATTR(vp, &vattr, 0, CRED(),
+			    NULL) == 0) {
+				VN_HOLD(vp);
+				*vpp = vp;
+			} else {
+				*vpp = NULL;
+			}
+
+			AS_LOCK_EXIT(as);
+			return (0);
+		}
+
+		if (vaddr <= find_addr &&
+		    find_addr + find_size < vaddr + size) {
+			/*
+			 * We have a mismatch, but our specified range is a
+			 * subset of the actual segment; this is EINVAL.
+			 */
+			AS_LOCK_EXIT(as);
+			DTRACE_PROBE2(lx__mremap__badsubset, caddr_t,
+			    vaddr, size_t, size);
+			return (EINVAL);
+		}
+	}
+
+	AS_LOCK_EXIT(as);
+	return (EFAULT);
+}
+
+/*
+ * Zero every anoncache entry whose range overlaps [addr, addr + size). The
+ * caller must hold l_remap_anoncache_lock.
+ */
+static void
+lx_remap_anoncache_invalidate(uintptr_t addr, size_t size)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	uint_t i;
+
+	ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock));
+
+	/*
+	 * The generation counter is only advanced when an entry is loaded
+	 * (see lx_remap_anoncache_load()), so zero means there is nothing
+	 * cached to invalidate.
+	 */
+	if (lxpd->l_remap_anoncache_generation == 0)
+		return;
+
+	for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+		lx_segmap_t *map = &lxpd->l_remap_anoncache[i];
+
+		/*
+		 * If the ranges overlap at all, we zap it.
+		 */
+		if (addr < map->lxsm_vaddr + map->lxsm_size &&
+		    map->lxsm_vaddr < addr + size) {
+			bzero(map, sizeof (lx_segmap_t));
+		}
+	}
+}
+
+/*
+ * Record the mapping described by map, with the given size, in the anoncache.
+ * An entry that already has the same address is refreshed in place; otherwise
+ * the first empty slot (lxsm_vaddr == 0) is used or, if there is none, the
+ * entry with the smallest lxsm_lru value is overwritten. The caller must
+ * hold l_remap_anoncache_lock.
+ */
+static void
+lx_remap_anoncache_load(lx_segmap_t *map, size_t size)
+{
+	lx_proc_data_t *lxpd = ptolxproc(curproc);
+	uint64_t oldest = UINT64_MAX;
+	lx_segmap_t *evict = NULL;
+	uint_t i;
+
+	ASSERT(MUTEX_HELD(&lxpd->l_remap_anoncache_lock));
+
+	for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) {
+		lx_segmap_t *cp = &lxpd->l_remap_anoncache[i];
+
+		if (cp->lxsm_vaddr == map->lxsm_vaddr) {
+			/*
+			 * We're already in the cache -- we just need to update
+			 * our LRU field and size to reflect the hit.
+			 */
+			cp->lxsm_lru = lxpd->l_remap_anoncache_generation++;
+			cp->lxsm_size = size;
+			return;
+		}
+
+		/* An unused slot always wins over evicting a live entry */
+		if (cp->lxsm_vaddr == 0) {
+			evict = cp;
+			break;
+		}
+
+		if (cp->lxsm_lru < oldest) {
+			oldest = cp->lxsm_lru;
+			evict = cp;
+		}
+	}
+
+	/* Update the entry we're evicting */
+	ASSERT(evict != NULL);
+	evict->lxsm_vaddr = map->lxsm_vaddr;
+	evict->lxsm_size = size;
+	evict->lxsm_flags = map->lxsm_flags;
+	evict->lxsm_lru = lxpd->l_remap_anoncache_generation++;
+}
+
+static int lx_u2u_copy(void *, void *, size_t);
+
+/*
+ * As part of lx_remap() (see below) and to accommodate heavy realloc() use
+ * cases (see the discussion of the l_remap_anoncache, above), we allow
+ * anonymous segments to be "remapped" in that we are willing to truncate them
+ * or append to them (as much as that's allowed by virtual address space
+ * usage). If we fall out of these cases, we take the more expensive option
+ * of actually copying the data to a new segment -- but we locate the address
+ * in a portion of the address space that should give us plenty of VA space to
+ * expand.
+ *
+ * We return the address of the mapping or set errno if there is a problem.
+ */
+static long
+lx_remap_anon(lx_segmap_t *mapin, size_t new_size, uint_t flags,
+    uintptr_t new_addr)
+{
+	lx_segmap_t m;
+	int mflags = MAP_ANON;
+	int prot = 0;
+	void *addr, *hint = NULL;
+
+	ASSERT(MUTEX_HELD(&ptolxproc(curproc)->l_remap_anoncache_lock));
+
+	/*
+	 * Make a copy of the input lx_segmap_t argument since it might be
+	 * a reference into the anon cache, and we're manipulating cache
+	 * entries during this function.
+	 */
+	m = *mapin;
+
+	/*
+	 * If our new size is less than our old size and we're either not
+	 * being ordered to move it or the address we're being ordered to
+	 * move it to is our current address, we can just act as Procrustes
+	 * and chop off anything larger than the new size.
+	 */
+	if (new_size < m.lxsm_size && (!(flags & LX_MREMAP_FIXED) ||
+	    new_addr == m.lxsm_vaddr)) {
+		if (munmap((void *)(m.lxsm_vaddr + new_size),
+		    m.lxsm_size - new_size) != 0) {
+			return (set_errno(EINVAL));
+		}
+
+		lx_remap_anoncache_load(&m, new_size);
+		return (m.lxsm_vaddr);
+	}
+
+	if (m.lxsm_flags & LX_SM_SHM)
+		return (set_errno(EINVAL));
+
+	/* Rebuild the mmap protections and flags from the cached LX_SM_* */
+	if (m.lxsm_flags & LX_SM_WRITE)
+		prot |= PROT_WRITE;
+
+	if (m.lxsm_flags & LX_SM_READ)
+		prot |= PROT_READ;
+
+	if (m.lxsm_flags & LX_SM_EXEC)
+		prot |= PROT_EXEC;
+
+	mflags |= (m.lxsm_flags & LX_SM_SHARED) ? MAP_SHARED : MAP_PRIVATE;
+
+	if (m.lxsm_flags & LX_SM_NORESERVE)
+		mflags |= MAP_NORESERVE;
+
+	/*
+	 * If we're not being told where to move it, let's try to expand our
+	 * mapping in place by adding a fixed mapping after it.
+	 */
+	if (!(flags & LX_MREMAP_FIXED)) {
+		void *tmp_addr = (void *)(m.lxsm_vaddr + m.lxsm_size);
+
+		ASSERT(new_size > m.lxsm_size);
+		addr = smmap64(tmp_addr, new_size - m.lxsm_size, prot,
+		    mflags, -1, 0);
+		if (ttolwp(curthread)->lwp_errno != 0) {
+			/* There is no place to mmap some extra anon */
+			return (set_errno(EINVAL));
+		}
+
+		if (addr == tmp_addr) {
+			/* The expansion worked */
+			lx_remap_anoncache_load(&m, new_size);
+			return (m.lxsm_vaddr);
+		}
+
+		/*
+		 * Our advisory address was not followed -- which, as a
+		 * practical matter, means that the range conflicted with an
+		 * extant mapping. Unmap wherever our attempted expansion
+		 * landed, and drop into the relocation case.
+		 */
+		(void) munmap(addr, new_size - m.lxsm_size);
+	}
+
+	lx_remap_anoncache_invalidate(m.lxsm_vaddr, m.lxsm_size);
+
+	/*
+	 * If we're here, we actually need to move this mapping -- so if we
+	 * can't move it, we're done.
+	 */
+	if (!(flags & LX_MREMAP_MAYMOVE))
+		return (set_errno(ENOMEM));
+
+	/*
+	 * If this is a shared mapping, we can't remap it.
+	 */
+	if (m.lxsm_flags & LX_SM_SHARED)
+		return (set_errno(EINVAL));
+
+	if (flags & LX_MREMAP_FIXED) {
+		mflags |= MAP_FIXED;
+		hint = (void *)new_addr;
+	} else {
+		/*
+		 * Search our address space for a gap to remap into. To give
+		 * ourselves plenty of room for further mremap() expansion,
+		 * we'll multiply our new size by 16 and look for a gap at
+		 * least that big. Historically we looked for an empty gap
+		 * around the 2GB region, so we start our search for the lowest
+		 * gap in that vicinity.
+		 */
+		caddr_t base;
+		size_t upper;
+
+		base = (caddr_t)ONE_GB;
+		upper = (uintptr_t)USERLIMIT - (uintptr_t)base;
+
+		if (as_gap(curproc->p_as, (new_size << 4UL), &base, &upper,
+		    AH_LO, NULL) != -1)
+			hint = base;
+	}
+
+	addr = smmap64(hint, new_size, prot, mflags, -1, 0);
+	if (ttolwp(curthread)->lwp_errno != 0) {
+		return (ttolwp(curthread)->lwp_errno);
+	}
+
+	if (lx_u2u_copy((void *)m.lxsm_vaddr, addr, m.lxsm_size) != 0) {
+		/* We couldn't complete the relocation, backout & fail */
+		(void) munmap(addr, new_size);
+		return (set_errno(ENOMEM));
+	}
+
+	(void) munmap((void *)m.lxsm_vaddr, m.lxsm_size);
+
+	/*
+	 * Add the relocated mapping to the cache.
+	 */
+	m.lxsm_vaddr = (uintptr_t)addr;
+	lx_remap_anoncache_load(&m, new_size);
+
+	return ((long)addr);
+}
+
+/*
+ * We don't have a native mremap() (nor do we particularly want one), so
+ * we emulate it strictly in lx. The idea is simple: we just want to
+ * mmap() the underlying object with the new size and rip down the old mapping.
+ * However, this is slightly complicated because we don't actually have the
+ * file descriptor that corresponds to the resized mapping. So to get a file
+ * descriptor, we may have to search our address space for the mapping and use
+ * the associated vnode to create a file descriptor. Assuming that this
+ * succeeds, we then mmap() it and rip down the original mapping.
There are + * clearly many reasons why this might fail; absent a more apt errno (e.g., + * ENOMEM in some cases), we return EINVAL to denote these cases. + */ +long +lx_mremap(uintptr_t old_addr, size_t old_size, size_t new_size, int flags, + uintptr_t new_addr) +{ + int prot = 0, oflags, mflags = 0, i, res; + lx_segmap_t map, *mp; + int rval = 0; + lx_proc_data_t *lxpd; + offset_t off; + struct vnode *vp = NULL; + file_t *fp; + caddr_t naddr; + + if (flags & LX_MREMAP_FIXED) { + /* MREMAP_FIXED requires MREMAP_MAYMOVE */ + if ((flags & LX_MREMAP_MAYMOVE) == 0) + return (set_errno(EINVAL)); + + if (new_addr & PAGEOFFSET) + return (set_errno(EINVAL)); + + mflags |= MAP_FIXED; + } else { + if (new_size == old_size) + return (old_addr); + + /* new_addr is optional and only valid when LX_MREMAP_FIXED. */ + new_addr = (uintptr_t)NULL; + } + + if (old_addr & PAGEOFFSET) + return (set_errno(EINVAL)); + + if (new_size == 0) + return (set_errno(EINVAL)); + + /* + * First consult the anoncache; if we find the segment there, we'll + * drop straight into lx_remap_anon() and save ourself the pain of + * searching our address space. + */ + lxpd = ptolxproc(curproc); + mutex_enter(&lxpd->l_remap_anoncache_lock); + + for (i = 0; i < LX_REMAP_ANONCACHE_NENTRIES; i++) { + long rv; + + mp = &lxpd->l_remap_anoncache[i]; + + if (mp->lxsm_vaddr != old_addr) + continue; + + if (mp->lxsm_size != old_size) + continue; + + /* + * lx_remap_anon will either: + * a) expand/contract in place, returning old_addr + * b) relocate & expand the mapping, returning a new address + * c) there will be an error of some sort and errno will be set + */ + rv = lx_remap_anon(mp, new_size, flags, new_addr); + mutex_exit(&lxpd->l_remap_anoncache_lock); + return (rv); + } + + mutex_exit(&lxpd->l_remap_anoncache_lock); + + /* + * Search our address space to find the specified mapping. 
+ */ + if ((res = lx_get_mapping(old_addr, old_size, &map, &vp, &off)) > 0) + return (set_errno(res)); + + /* + * We found the mapping. + */ + mp = ↦ + DTRACE_PROBE1(lx__mremap__seg, lx_segmap_t *, mp); + + if (mp->lxsm_flags & LX_SM_SHM) { + /* + * If this is either ISM or System V shared memory, we're not + * going to remap it. + */ + rval = set_errno(EINVAL); + goto out; + } + + if (mp->lxsm_flags & LX_SM_ANON) { + /* + * This is an anonymous mapping -- which is the one case in + * which we perform something that approaches a true remap. + */ + long rv; + + if (vp != NULL) + VN_RELE(vp); + mutex_enter(&lxpd->l_remap_anoncache_lock); + rv = lx_remap_anon(mp, new_size, flags, new_addr); + mutex_exit(&lxpd->l_remap_anoncache_lock); + return (rv); + } + + /* The rest of the code is for a 'named' mapping */ + + if (!(flags & LX_MREMAP_MAYMOVE)) { + /* + * If we're not allowed to move this mapping, we're going to + * act as if we can't expand it. + */ + rval = set_errno(ENOMEM); + goto out; + } + + if (!(mp->lxsm_flags & LX_SM_SHARED)) { + /* + * If this is a private mapping, we're not going to remap it. + */ + rval = set_errno(EINVAL); + goto out; + } + + oflags = (mp->lxsm_flags & LX_SM_WRITE) ? (FWRITE | FREAD) : FREAD; + if (vp == NULL) { + /* + * If vp is NULL, the path might not exist. We're going to kick + * it back with EINVAL. + */ + rval = set_errno(EINVAL); + goto out; + } + + /* falloc cannot fail with a NULL fdp. */ + VERIFY0(falloc(vp, oflags, &fp, NULL)); + mutex_exit(&fp->f_tlock); + + if (mp->lxsm_flags & LX_SM_WRITE) + prot |= PROT_WRITE; + + if (mp->lxsm_flags & LX_SM_READ) + prot |= PROT_READ; + + if (mp->lxsm_flags & LX_SM_EXEC) + prot |= PROT_EXEC; + + mflags |= MAP_SHARED; + + /* + * We're using smmap_common to pass the fp directly, instead of + * initializing a temporary file descriptor for smmap64(), so as to + * prevent any inadvertent use of that temporary fd within the + * application. 
+ */ + naddr = (caddr_t)new_addr; + rval = smmap_common(&naddr, new_size, prot, mflags, fp, off); + + mutex_enter(&fp->f_tlock); + unfalloc(fp); + + if (rval != 0) { + rval = set_errno(ENOMEM); + goto out; + } + + /* + * Our mapping succeeded; we're now going to rip down the old mapping. + */ + (void) munmap((void *)old_addr, old_size); + +out: + if (vp != NULL) + VN_RELE(vp); + + if (rval == 0) + return ((long)naddr); + return ((long)rval); +} + +#pragma GCC diagnostic ignored "-Wclobbered" +/* + * During mremap we had to relocate the initial anonymous mapping to a new + * location (a new anonymous mapping). Copy the user-level data from the first + * mapping to the second mapping. + * + * We have to lock both sides to ensure there is no fault. We do this in 16MB + * chunks at a time and we do not concern ourselves with the zone's + * max-locked-memory rctl. + * + * Keep this function at the end since we're disabling the compiler's "clobber" + * check due to the on_fault call. + */ +static int +lx_u2u_copy(void *src, void *dst, size_t len) +{ + size_t mlen; + caddr_t sp, dp; + int err; + page_t **ppa_src, **ppa_dst; + label_t ljb; + struct as *p_as = curproc->p_as; + + /* Both sides should be page aligned since they're from smmap64 */ + ASSERT(((uintptr_t)src & PAGEOFFSET) == 0); + ASSERT(((uintptr_t)dst & PAGEOFFSET) == 0); + /* Both came from mmap, so they should be valid user pointers */ + ASSERT((uintptr_t)src < USERLIMIT && (uintptr_t)dst < USERLIMIT); + + sp = src; + dp = dst; + + do { + mlen = MIN(len, 16 * 1024 * 1024); + + err = as_pagelock(p_as, &ppa_src, sp, mlen, S_READ); + if (err != 0) { + return (err); + } + err = as_pagelock(p_as, &ppa_dst, dp, mlen, S_WRITE); + if (err != 0) { + as_pageunlock(p_as, ppa_src, sp, mlen, S_READ); + return (err); + } + + DTRACE_PROBE3(lx__mremap__copy, void *, sp, void *, dp, + size_t, mlen); + + /* on_fault calls smap_disable */ + if (on_fault(&ljb)) { + /* + * Given that the pages are locked and smap is disabled, + * 
we really should never get here. If we somehow do + * get here, the copy fails just as if we could not + * lock the pages to begin with. + */ + as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE); + as_pageunlock(p_as, ppa_src, sp, mlen, S_READ); + return (EFAULT); + } + ucopy(sp, dp, mlen); + no_fault(); /* calls smap_enable */ + + as_pageunlock(p_as, ppa_dst, dp, mlen, S_WRITE); + as_pageunlock(p_as, ppa_src, sp, mlen, S_READ); + + len -= mlen; + sp += mlen; + dp += mlen; + } while (len > 0); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c new file mode 100644 index 0000000000..25f06e134b --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_miscsys.c @@ -0,0 +1,495 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +#include <sys/systeminfo.h> +#include <sys/fcntl.h> +#include <sys/resource.h> +#include <sys/uadmin.h> +#include <sys/lx_misc.h> +#include <lx_syscall.h> + +#define LINUX_REBOOT_MAGIC1 0xfee1dead +#define LINUX_REBOOT_MAGIC2 672274793 +#define LINUX_REBOOT_MAGIC2A 85072278 +#define LINUX_REBOOT_MAGIC2B 369367448 +#define LINUX_REBOOT_MAGIC2C 537993216 + +#define LINUX_REBOOT_CMD_RESTART 0x1234567 +#define LINUX_REBOOT_CMD_HALT 0xcdef0123 +#define LINUX_REBOOT_CMD_CAD_ON 0x89abcdef +#define LINUX_REBOOT_CMD_CAD_OFF 0 +#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc +#define LINUX_REBOOT_CMD_RESTART2 0xa1b2c3d4 +#define LINUX_REBOOT_CMD_SW_SUSPEND 0xD000FCE2 +#define LINUX_REBOOT_CMD_KEXEC 0x45584543 + +#define LX_RUSAGE_SELF 0 +#define LX_RUSAGE_CHILDREN (-1) +#define LX_RUSAGE_BOTH (-2) +#define LX_RUSAGE_THREAD 1 + +#define LX_SWAP_PRIOMASK 0x7fff +#define LX_SWAP_PREFER 0x8000 +#define LX_SWAP_DISCARD 0x10000 +#define LX_SWAP_DISCARD_ONCE 0x20000 +#define LX_SWAP_DISCARD_PAGES 0x40000 + +#define LX_SWAP_ALL (LX_SWAP_DISCARD_PAGES | \ + LX_SWAP_DISCARD_ONCE | \ + LX_SWAP_DISCARD | \ + LX_SWAP_PREFER | LX_SWAP_PRIOMASK) + +/* From uts/common/fs/vfs.c */ +extern void vfs_sync(int); +/* From uts/common/os/grow.c */ +extern int mincore(caddr_t, size_t, char *); +extern int munmap(caddr_t, size_t); +/* From uts/common/os/session.c */ +extern int vhangup(); +/* From uts/common/syscall/alarm.c */ +extern int alarm(int); +/* From uts/common/syscall/chdir.c */ +extern int chdir(char *); +extern int chroot(char *); +extern int fchdir(int); +/* From uts/common/syscall/nice.c */ +extern int nice(int); +/* From uts/common/syscall/open.c */ +extern int open(char *, int, int); +/* From uts/common/syscall/pause.c */ +extern int pause(); +/* From uts/common/syscall/rusagesys.c */ +extern int rusagesys(int, void *, void *, void *, void *); +/* From uts/common/syscall/systeminfo.c */ +extern long systeminfo(int, char *, long); +/* From uts/common/syscall/timers.c */ +extern 
int getitimer(uint_t, struct itimerval *); +/* From uts/common/syscall/time.c */ +extern int stime(time_t); +/* From uts/common/syscall/uadmin.c */ +extern int uadmin(int, int, uintptr_t); +/* From uts/common/syscall/chdir.c */ +extern int chdir_proc(proc_t *, vnode_t *, boolean_t, boolean_t); +/* From uts/common/fs/lookup.c */ +extern int lookupname(char *, enum uio_seg, int, vnode_t **, vnode_t **); +/* From uts/common/fs/fs_subr.c */ +extern int fs_need_estale_retry(int); +/* From uts/common/os/acct.c */ +extern int sysacct(char *); + +/* The callback arguments when handling a FS clone group. */ +typedef struct { + vnode_t *lcfa_vp; + boolean_t lcfa_type; + boolean_t lcfa_traverse; +} lx_clone_fs_arg_t; + +long +lx_alarm(int seconds) +{ + return (alarm(seconds)); +} + +static int +lx_clone_fs_cb(proc_t *pp, void *arg) +{ + lx_clone_fs_arg_t *ap = (lx_clone_fs_arg_t *)arg; + int err; + + /* + * Either: + * A) The initial lookupname() from lx_clone_fs_do_group() will have + * added a hold on the vnode to ensure its existence throughout the + * walk. + * B) We added a hold in fchdir. + * We need to add another hold for each process in the group. + */ + VN_HOLD(ap->lcfa_vp); + if ((err = chdir_proc(pp, ap->lcfa_vp, ap->lcfa_type, + ap->lcfa_traverse)) != 0) { + /* if we failed, chdir_proc already did a rele on vp */ + return (err); + } + + return (0); +} + +/* + * Check to see if the process is in a CLONE_FS clone group. Return false + * if not (the normal case), otherwise perform the setup, do the group walk + * and return true. 
+ */ +static boolean_t +lx_clone_fs_do_group(char *path, boolean_t is_chroot, int *errp) +{ + lx_proc_data_t *lproc = ttolxproc(curthread); + vnode_t *vp; + lx_clone_fs_arg_t arg; + int err; + int estale_retry = 0; + + if (!lx_clone_grp_member(lproc, LX_CLONE_FS)) + return (B_FALSE); + + /* Handle the rare case of being in a CLONE_FS clone group */ + +retry: + err = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (err != 0) { + if (err == ESTALE && fs_need_estale_retry(estale_retry++)) + goto retry; + *errp = err; + return (B_TRUE); + } + + arg.lcfa_vp = vp; + arg.lcfa_type = is_chroot; + arg.lcfa_traverse = B_TRUE; + + /* + * We use the VN_HOLD from the lookup to guarantee vp exists for the + * entire walk. + */ + err = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_fs_cb, + (void *)&arg); + VN_RELE(vp); + *errp = err; + return (B_TRUE); +} + +long +lx_chdir(char *path) +{ + int err; + + /* Handle the rare case of being in a CLONE_FS clone group */ + if (lx_clone_fs_do_group(path, B_FALSE, &err)) + return ((err != 0) ? set_errno(err) : 0); + + return (chdir(path)); +} + +long +lx_chroot(char *path) +{ + int err; + + /* Handle the rare case of being in a CLONE_FS clone group */ + if (lx_clone_fs_do_group(path, B_TRUE, &err)) + return ((err != 0) ? set_errno(err) : 0); + + return (chroot(path)); +} + +long +lx_creat(char *path, mode_t mode) +{ + return (open(path, O_WRONLY | O_CREAT | O_TRUNC, mode)); +} + +long +lx_fchdir(int fd) +{ + lx_proc_data_t *lproc = ttolxproc(curthread); + + if (lx_clone_grp_member(lproc, LX_CLONE_FS)) { + /* Handle the rare case of being in a CLONE_FS clone group */ + file_t *fp; + vnode_t *vp; + lx_clone_fs_arg_t arg; + int err; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + vp = fp->f_vnode; + VN_HOLD(vp); + releasef(fd); + + arg.lcfa_vp = vp; + arg.lcfa_type = B_FALSE; + arg.lcfa_traverse = B_FALSE; + + /* + * We use the VN_HOLD above to guarantee vp exists for the + * entire walk. 
+ */ + err = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_fs_cb, + (void *)&arg); + VN_RELE(vp); + if (err) + return (set_errno(err)); + return (0); + } + + return (fchdir(fd)); +} + +long +lx_getitimer(int which, struct itimerval *value) +{ + return (getitimer(which, value)); +} + +/* Linux and illumos have the same rusage structures. */ +long +lx_getrusage(int who, struct rusage *rup) +{ + int code; + + switch (who) { + case LX_RUSAGE_SELF: + code = _RUSAGESYS_GETRUSAGE; + break; + case LX_RUSAGE_CHILDREN: + code = _RUSAGESYS_GETRUSAGE_CHLD; + break; + case LX_RUSAGE_THREAD: + code = _RUSAGESYS_GETRUSAGE_LWP; + break; + default: + return (set_errno(EINVAL)); + } + + return (rusagesys(code, rup, NULL, NULL, NULL)); +} + +long +lx_mincore(caddr_t addr, size_t len, char *vec) +{ + int r; + + r = mincore(addr, len, vec); + if (r == EINVAL) { + /* + * LTP mincore01 expects mincore with a huge len to fail with + * ENOMEM on a modern kernel, although on Linux 2.6.11 and + * earlier, it will return EINVAL. + */ + if (lx_kern_release_cmp(curzone, "2.6.11") > 0 && (long)len < 0) + return (set_errno(ENOMEM)); + } + return (r); +} + +long +lx_nice(int incr) +{ + return (nice(incr)); +} + +long +lx_pause(void) +{ + return (pause()); +} + +/*ARGSUSED*/ +long +lx_reboot(int magic1, int magic2, uint_t flag, uintptr_t p4) +{ + if (magic1 != LINUX_REBOOT_MAGIC1) + return (set_errno(EINVAL)); + + switch (magic2) { + case LINUX_REBOOT_MAGIC2: + case LINUX_REBOOT_MAGIC2A: + case LINUX_REBOOT_MAGIC2B: + case LINUX_REBOOT_MAGIC2C: + break; + default: + return (set_errno(EINVAL)); + } + + /* + * Once we have better Linux capabilities(7) support we should check + * CAP_SYS_BOOT instead. 
+ */ + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + switch (flag) { + case LINUX_REBOOT_CMD_CAD_ON: + case LINUX_REBOOT_CMD_CAD_OFF: + /* ignored */ + return (0); + + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_HALT: + return (uadmin(A_SHUTDOWN, AD_HALT, (uintptr_t)NULL)); + + case LINUX_REBOOT_CMD_RESTART: + case LINUX_REBOOT_CMD_RESTART2: + /* RESTART2 may need more work */ + return (uadmin(A_SHUTDOWN, AD_BOOT, (uintptr_t)NULL)); + + default: + return (set_errno(EINVAL)); + } +} + +long +lx_setdomainname(char *name, long len) +{ + if (len < 0 || len >= LX_SYS_UTS_LN) + return (set_errno(EINVAL)); + + ttolwp(curthread)->lwp_errno = 0; + (void) systeminfo(SI_SET_SRPC_DOMAIN, name, len); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + return (0); +} + +long +lx_sethostname(char *name, size_t len) +{ + ttolwp(curthread)->lwp_errno = 0; + (void) systeminfo(SI_SET_HOSTNAME, name, len); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + return (0); +} + +long +lx_stime(time_t *tp) +{ + time_t time; + + if (copyin(tp, &time, sizeof (time)) != 0) + return (set_errno(EFAULT)); + + return (stime(time)); +} + +long +lx_sync(void) +{ + vfs_sync(0); + return (0); +} + +/* + * For syslog, since there is no Linux kernel and nothing to log, we simply + * emulate a kernel buffer (LOG_BUF_LEN) of 0 bytes and only handle errors for + * bad input. All actions except 3 and 10 require CAP_SYS_ADMIN or CAP_SYSLOG + * so without full capabilities support, for now we just perform an euid check. 
+ */ +long +lx_syslog(int type, char *bufp, int len) +{ + if (type < 0 || type > 10) + return (set_errno(EINVAL)); + + if (type != 3 && type != 10 && crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + if (type >= 2 && type <= 4 && (bufp == NULL || len < 0)) + return (set_errno(EINVAL)); + + if (type == 8 && (len < 1 || len > 8)) + return (set_errno(EINVAL)); + + return (0); +} + +long +lx_vhangup(void) +{ + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * The native vhangup code does nothing except check for the sys_config + * privilege. Eventually we'll first want to check our emulation for the + * Linux CAP_SYS_TTY_CONFIG capability, but currently, since we've + * already checked that our process is root, just succeed. + */ + return (0); +} + +long +lx_acct(char *p) +{ + return (sysacct(p)); +} + +/* + * Support for Linux namespaces is not yet implemented. Normally we would + * simply return ENOSYS for this. However, "systemd" uses mount namespaces to + * provide the PrivateTmp feature for some services. Use of this feature is + * becoming common and these services will fail to run without namespace + * support. "systemd" has a fallback to allow these types of services to run if + * it sees either EACCES or EPERM when it tries to setup the namespace. Until + * we have namespace support, we return EPERM to workaround this issue. + */ +/*ARGSUSED*/ +long +lx_unshare(int flags) +{ + return (set_errno(EPERM)); +} + +/* + * The whole idea of "swap space" within a zone is a complete fabrication. + * However, some apps expect to be able to see swap space data in the /proc + * files, while other apps actually don't want there to be any swap space + * configured. We use the swapon/off syscalls to allow this visibility to be + * controlled from within the zone iself. Note that the "swapon" CLI tends to + * do a lot of additional validation which will fail within a zone. 
+ * + * Once we have better Linux capabilities(7) support we should check + * CAP_SYS_ADMIN instead of uid == 0. + */ +long +lx_swapoff(char *path) +{ + char buf[MAXPATHLEN]; + size_t len; + lx_zone_data_t *lxzd; + + /* Simple validaton of the argument */ + if (copyinstr(path, buf, sizeof (buf), &len) != 0) + return (set_errno(EFAULT)); + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + lxzd = ztolxzd(curzone); + ASSERT(lxzd != NULL); + + lxzd->lxzd_swap_disabled = B_TRUE; + return (0); +} + +long +lx_swapon(char *path, int flags) +{ + char buf[MAXPATHLEN]; + size_t len; + lx_zone_data_t *lxzd; + + /* Simple validaton of the arguments */ + if (copyinstr(path, buf, sizeof (buf), &len) != 0) + return (set_errno(EFAULT)); + if (flags & ~LX_SWAP_ALL) + return (set_errno(EINVAL)); + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + + lxzd = ztolxzd(curzone); + ASSERT(lxzd != NULL); + + lxzd->lxzd_swap_disabled = B_FALSE; + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c new file mode 100644 index 0000000000..2f29f56d5f --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c @@ -0,0 +1,38 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/fcntl.h> +#include <sys/lx_fcntl.h> + +/* + * From "uts/common/syscall/mkdir.c": + */ +extern int mkdirat(int, char *, int); + +long +lx_mkdirat(int fd, char *dname, int dmode) +{ + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + + return (mkdirat(fd, dname, dmode)); +} + +long +lx_mkdir(char *dname, int dmode) +{ + return (mkdirat(AT_FDCWD, dname, dmode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c new file mode 100644 index 0000000000..aa6e12a7d8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/segments.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/sysi86.h> +#include <sys/cmn_err.h> +#include <sys/lx_ldt.h> + +/* + * Read the ldt_info structure in from the Linux app, convert it to an ssd + * structure, and then call setdscr() to do all the heavy lifting. + */ +static int +write_ldt(void *data, ulong_t count) +{ + user_desc_t usd; + struct ssd ssd; + struct ldt_info ldt_inf; + proc_t *pp = curthread->t_procp; + int err; + + if (count != sizeof (ldt_inf)) + return (set_errno(EINVAL)); + + if (copyin(data, &ldt_inf, sizeof (ldt_inf))) + return (set_errno(EFAULT)); + + if (ldt_inf.entry_number >= MAXNLDT) + return (set_errno(EINVAL)); + + LDT_INFO_TO_DESC(&ldt_inf, &usd); + usd_to_ssd(&usd, &ssd, SEL_LDT(ldt_inf.entry_number)); + + /* + * Get everyone into a safe state before changing the LDT. + */ + if (!holdlwps(SHOLDFORK1)) + return (set_errno(EINTR)); + + err = setdscr(&ssd); + + /* + * Release the hounds! + */ + mutex_enter(&pp->p_lock); + continuelwps(pp); + mutex_exit(&pp->p_lock); + + return (err ? 
set_errno(err) : 0); +} + +static int +read_ldt(void *uptr, ulong_t count) +{ + proc_t *pp = curproc; + int bytes; + + if (pp->p_ldt == NULL) + return (0); + + bytes = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); + if (bytes > count) + bytes = count; + + if (copyout(pp->p_ldt, uptr, bytes)) + return (set_errno(EFAULT)); + + return (bytes); +} + +long +lx_modify_ldt(int op, void *data, ulong_t count) +{ + int rval; + + switch (op) { + case 0: + rval = read_ldt(data, count); + break; + + case 1: + rval = write_ldt(data, count); + break; + + default: + rval = set_errno(ENOSYS); + break; + } + + return (rval); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mount.c b/usr/src/uts/common/brand/lx/syscall/lx_mount.c new file mode 100644 index 0000000000..fff6c81339 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_mount.c @@ -0,0 +1,744 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/ctype.h> +#include <sys/types.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/types.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_autofs.h> + +#define tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x)) + +/* + * mount(2) is significantly different between Linux and illumos. One of the + * main differences is between the set of flags. Some flags on Linux can be + * translated to an illumos equivalent, some are converted to a + * filesystem-specific option, while others have no equivalent whatsoever. 
+ * + * Another big difference is that mounting NFS is fully handled in the kernel on + * Linux whereas on illumos a lot of preliminary work is done by the NFS mount + * command before calling mount(2). As a simplification, we forward NFS + * mount calls back out to the user-level library which does the same kind of + * preliminary processing that is done by the native user-level NFS mount code. + */ +#define LX_MS_MGC_VAL 0xC0ED0000 +#define LX_MS_RDONLY 0x00000001 +#define LX_MS_NOSUID 0x00000002 +#define LX_MS_NODEV 0x00000004 +#define LX_MS_NOEXEC 0x00000008 +#define LX_MS_SYNCHRONOUS 0x00000010 +#define LX_MS_REMOUNT 0x00000020 +#define LX_MS_MANDLOCK 0x00000040 +#define LX_MS_NOATIME 0x00000400 +#define LX_MS_NODIRATIME 0x00000800 +#define LX_MS_BIND 0x00001000 +#define LX_MS_MOVE 0x00002000 +#define LX_MS_REC 0x00004000 +#define LX_MS_SILENT 0x00008000 +#define LX_MS_POSIXACL 0x00010000 +#define LX_MS_UNBINDABLE 0x00020000 +#define LX_MS_PRIVATE 0x00040000 +#define LX_MS_SLAVE 0x00080000 +#define LX_MS_SHARED 0x00100000 +#define LX_MS_RELATIME 0x00200000 +#define LX_MS_KERNMOUNT 0x00400000 +#define LX_MS_I_VERSION 0x00800000 +#define LX_MS_STRICTATIME 0x01000000 +#define LX_MS_LAZYTIME 0x02000000 + +/* Linux kernel-internal flags - ignored if passed in */ +#define LX_MS_NOSEC 0x10000000 +#define LX_MS_BORN 0x20000000 +#define LX_MS_ACTIVE 0x40000000 +#define LX_MS_NOUSER 0x80000000 + +#define LX_MS_SUPPORTED (LX_MS_MGC_VAL | \ + LX_MS_RDONLY | LX_MS_NOSUID | \ + LX_MS_NODEV | LX_MS_NOEXEC | \ + LX_MS_REMOUNT | LX_MS_NOATIME | \ + LX_MS_BIND | LX_MS_SILENT | \ + LX_MS_STRICTATIME | LX_MS_NOSEC | \ + LX_MS_BORN | LX_MS_ACTIVE | LX_MS_NOUSER) + +/* + * support definitions + */ +typedef enum mount_opt_type { + MOUNT_OPT_INVALID = 0, + MOUNT_OPT_NORMAL = 1, /* option value: none */ + MOUNT_OPT_UINT = 2, /* option value: unsigned int */ + MOUNT_OPT_PASSTHRU = 3 /* option value: validated downstream */ +} mount_opt_type_t; + +typedef struct mount_opt { + char 
*mo_name; + mount_opt_type_t mo_type; +} mount_opt_t; + +/* From uts/common/syscall/umount.c */ +extern int umount2(char *, int); + +/* From lx_chown.c */ +extern long lx_vn_chown(vnode_t *, uid_t, gid_t); + +/* + * Globals + */ +static mount_opt_t lofs_options[] = { + { NULL, MOUNT_OPT_INVALID } +}; + +static mount_opt_t lx_proc_options[] = { + { NULL, MOUNT_OPT_INVALID } +}; + +static mount_opt_t lx_sysfs_options[] = { + { NULL, MOUNT_OPT_INVALID } +}; + +static mount_opt_t lx_tmpfs_options[] = { + { "size", MOUNT_OPT_PASSTHRU }, + { "mode", MOUNT_OPT_UINT }, + { "uid", MOUNT_OPT_UINT }, + { "gid", MOUNT_OPT_UINT }, + { "nr_inodes", MOUNT_OPT_PASSTHRU }, + { NULL, MOUNT_OPT_INVALID } +}; + +static mount_opt_t lx_autofs_options[] = { + { LX_MNTOPT_FD, MOUNT_OPT_UINT }, + { LX_MNTOPT_PGRP, MOUNT_OPT_UINT }, + { LX_MNTOPT_MINPROTO, MOUNT_OPT_UINT }, + { LX_MNTOPT_MAXPROTO, MOUNT_OPT_UINT }, + { LX_MNTOPT_INDIRECT, MOUNT_OPT_NORMAL }, + { LX_MNTOPT_DIRECT, MOUNT_OPT_NORMAL }, + { LX_MNTOPT_OFFSET, MOUNT_OPT_NORMAL }, + { NULL, MOUNT_OPT_INVALID } +}; + +static const char *lx_common_mnt_opts[] = { + "exec", + "noexec", + "devices", + "nodevices", + "dev", + "nodev", + "suid", + "nosuid", + NULL +}; + +/* + * Check the mount options. + * + * On illumos all mount option verification is done by the user-level mount + * command. Invalid options are simply ignored by domount(). Thus, we check + * here for invalid/unsupported options. + * + * Returns 0 if every comma-separated option is acceptable, ENOTSUP if + * options were supplied but the table allows none, and EINVAL otherwise. + */ +static int +lx_mnt_opt_verify(char *opts, mount_opt_t *mop) +{ + int opts_len = strlen(opts); + char *opt, *tp; + int opt_len, i; + boolean_t last = B_FALSE; + + ASSERT((opts != NULL) && (mop != NULL)); + + /* If no options were specified, nothing to do. */ + if (opts_len == 0) + return (0); + + /* If no options are allowed, fail. */ + if (mop[0].mo_name == NULL) + return (ENOTSUP); + + /* + * Don't accept leading or trailing ','. opts_len is non-zero here, + * so the last character lives at opts_len - 1; opts[opts_len] is the + * terminating NUL and can never be a comma. + */ + if ((opts[0] == ',') || (opts[opts_len - 1] == ',')) + return (EINVAL); + + /* Don't accept sequential ','.
*/ + for (i = 1; i < opts_len; i++) { + if ((opts[i - 1] == ',') && (opts[i] == ',')) + return (EINVAL); + } + + /* + * Verify each prop one at a time. There is no strtok in the kernel but + * it's easy to tokenize the entry ourselves. + */ + opt = opts; + for (tp = opt; *tp != ',' && *tp != '\0'; tp++) + ; + if (*tp == ',') { + *tp = '\0'; + } else { + last = B_TRUE; + } + for (;;) { + opt_len = strlen(opt); + + /* Check common options we support on all filesystems */ + for (i = 0; lx_common_mnt_opts[i] != NULL; i++) { + if (strcmp(opt, lx_common_mnt_opts[i]) == 0) + goto next_opt; + } + + /* Check for matching option/value pair. */ + for (i = 0; mop[i].mo_name != NULL; i++) { + char *ovalue; + int ovalue_len, mo_len; + + /* If the options is too short don't bother comparing */ + mo_len = strlen(mop[i].mo_name); + if (opt_len < mo_len) { + /* Keep trying to find a match. */ + continue; + } + + /* Compare the option to an allowed option. */ + if (strncmp(mop[i].mo_name, opt, mo_len) != 0) { + /* Keep trying to find a match. */ + continue; + } + + if (mop[i].mo_type == MOUNT_OPT_NORMAL) { + /* The option doesn't take a value. */ + if (opt_len == mo_len) { + /* This option is ok. */ + break; + } else { + /* Keep trying to find a match. */ + continue; + } + } + + /* This options takes a value. */ + if ((opt_len == mo_len) || (opt[mo_len] != '=')) { + /* Keep trying to find a match. */ + continue; + } + + /* We have an option match. Verify option value. */ + ovalue = &opt[mo_len] + 1; + ovalue_len = strlen(ovalue); + + /* Value can't be zero length string. */ + if (ovalue_len == 0) { + goto bad; + } + + if (mop[i].mo_type == MOUNT_OPT_UINT) { + int j; + /* Verify that value is an unsigned int. */ + for (j = 0; j < ovalue_len; j++) { + if (!ISDIGIT(ovalue[j])) { + goto bad; + } + } + } else if (mop[i].mo_type == MOUNT_OPT_PASSTHRU) { + /* Filesystem will do its own validation. */ + break; + } else { + /* Unknown option type specified. 
*/ + goto bad; + } + + /* The option is ok. */ + break; + } + + /* If there were no matches this is an unsupported option. */ + if (mop[i].mo_name == NULL) { + goto bad; + } + +next_opt: + /* + * This option is ok, either we're done or move on to the next + * option. + */ + if (last) + break; + + *tp = ','; + opt = tp + 1; + for (tp = opt; *tp != ',' && *tp != '\0'; tp++) + ; + if (*tp == ',') { + *tp = '\0'; + } else { + last = B_TRUE; + } + }; + + /* We verified all the options. */ + return (0); + +bad: + if (!last) { + *tp = ','; + } + return (EINVAL); +} + +/* + * Remove an option from the string and save it in the provided buffer. + * The option string should have already been verified as valid. + * Return 0 if not present, -1 if error, and 1 if present and fine. + */ +static int +lx_mnt_opt_rm(char *opts, char *rmopt, char *retstr, int retlen) +{ + int opts_len = strlen(opts); + char *optstart, *optend; + int optlen; + + ASSERT((opts != NULL) && (rmopt != NULL)); + + if (retstr != NULL) + retstr[0] = '\0'; + + /* If no options were specified, there's no problem. 
*/ + if (opts_len == 0) + return (0); + + if ((optstart = strstr(opts, rmopt)) == NULL) + return (0); + + for (optend = optstart; *optend != ',' && *optend != '\0'; optend++) + ; + + optlen = optend - optstart; + if (retstr != NULL) { + if (optlen >= retlen) + return (-1); + (void) strncpy(retstr, optstart, optlen); + retstr[optlen] = '\0'; + } + + if (*optend == ',') + optend++; + + optlen = strlen(optend) + 1; + bcopy(optend, optstart, optlen); + + if (*optstart == '\0' && optstart != opts) { + /* removed last opt and it had a preceding opt, remove comma */ + *(optstart - 1) = '\0'; + } + + return (1); +} + +/* + * Parse the decimal value of a "name=value" option into *valp. + * Returns 0 on success and -1 if there is no '=', the value does not start + * with a digit, has trailing characters, or does not fit in an int. + */ +static int +lx_mnt_opt_val(char *opt, int *valp) +{ + char *op, *ep; + ulong_t lval; + + if ((op = strchr(opt, '=')) == NULL) + return (-1); + + op++; + if (!ISDIGIT(*op)) + return (-1); + + /* + * Keep the parsed value unsigned so that inputs above LONG_MAX cannot + * wrap to a negative number and slip past the INT_MAX bound. + */ + if (ddi_strtoul(op, &ep, 10, &lval) != 0 || lval > INT_MAX) { + return (-1); + } + + if (*ep != '\0') + return (-1); + + *valp = (int)lval; + return (0); +} + +/* + * Append "option" to the comma-separated option string in buf, adding a + * leading ',' when buf is not empty. Returns EOVERFLOW if it does not fit. + */ +static int +lx_mnt_add_opt(char *option, char *buf, size_t buf_size) +{ + char *fmt_str = NULL; + size_t len; + + ASSERT((option != NULL) && (strlen(option) > 0)); + ASSERT((buf != NULL) && (buf_size > 0)); + + if (buf[0] == '\0') { + fmt_str = "%s"; + } else { + fmt_str = ",%s"; + } + + len = strlen(buf); + VERIFY(len <= buf_size); + buf_size -= len; + buf += len; + + if (snprintf(buf, buf_size, fmt_str, option) > (buf_size - 1)) + return (EOVERFLOW); + return (0); +} + +/* + * Copy a user string into the kernel buffer "to" of size "len". + * Returns 0 on success, ENAMETOOLONG if the string does not fit (a string + * that exactly fills the buffer is also rejected), and EFAULT for any other + * copyinstr() failure. + */ +static int +lx_mnt_copyin_arg(const char *from, char *to, size_t len) +{ + size_t slen; + int rv; + + rv = copyinstr(from, to, len, &slen); + if (rv == ENAMETOOLONG) + return (ENAMETOOLONG); + if (rv != 0) + return (EFAULT); + + /* + * Only look at slen once copyinstr() has succeeded; it may not have + * been written on a failed copy. + */ + if (slen == len) + return (ENAMETOOLONG); + + return (0); +} + +long +lx_mount(const char *sourcep, const char *targetp, const char *fstypep, + uint_t flags, const void *datap) +{ + char fstype[16]; + char *source, *target, *options; + size_t sourcel, targetl, optionsl; + int sflags, rv; + struct mounta ma, *map = &ma; + vfs_t *vfsp; + vnode_t
*vp = NULL; + int uid = -1; + int gid = -1; + + if ((rv = lx_mnt_copyin_arg(fstypep, fstype, sizeof (fstype))) != 0) { + if (rv == ENAMETOOLONG) + return (set_errno(ENODEV)); + return (set_errno(rv)); + } + + /* + * Vector back out to userland emulation for NFS. + */ + if (strcmp(fstype, "nfs") == 0 || strcmp(fstype, "nfs4") == 0) { + uintptr_t uargs[5] = {(uintptr_t)sourcep, (uintptr_t)targetp, + (uintptr_t)fstypep, (uintptr_t)flags, (uintptr_t)datap}; + + /* The userspace emulation will do the lx_syscall_return() */ + ttolxlwp(curthread)->br_eosys = JUSTRETURN; + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_emulate_user32(ttolwp(curthread), LX_SYS32_mount, + uargs); + } else +#endif + { + lx_emulate_user(ttolwp(curthread), LX_SYS_mount, uargs); + } + return (0); + } + + /* Make sure we support the requested mount flags. */ + if ((flags & ~LX_MS_SUPPORTED) != 0) + return (set_errno(ENOTSUP)); + + sourcel = targetl = MAXPATHLEN; + optionsl = MAX_MNTOPT_STR; + source = kmem_alloc(sourcel, KM_SLEEP); + target = kmem_alloc(targetl, KM_SLEEP); + options = kmem_alloc(optionsl, KM_SLEEP); + + sflags = MS_SYSSPACE | MS_OPTIONSTR; + options[0] = '\0'; + + /* Copy in parameters that are always present. */ + if ((rv = lx_mnt_copyin_arg(sourcep, source, sourcel)) != 0) + goto out; + + if ((rv = lx_mnt_copyin_arg(targetp, target, targetl)) != 0) + goto out; + + /* + * While SunOS is picky about mount(2) target paths being absolute, + * Linux is not so strict. In order to facilitate this looser + * requirement we must lookup the full path. + */ + if (target[0] != '/') { + if ((rv = lookupname(target, UIO_SYSSPACE, FOLLOW, + NULLVPP, &vp)) != 0) { + goto out; + } + + rv = vnodetopath(NULL, vp, target, targetl, CRED()); + VN_RELE(vp); + if (rv != 0) + goto out; + } + + /* + * Following commits* in September 2020, systemd mounts most + * filesystems via an open file descriptor in order to avoid + * following mount point symlinks. 
+ * It does this by opening the mount point with O_NOFOLLOW|O_PATH + * and then performing the mount on /proc/self/fd/<fd>. + * In illumos, this results in a mount attempt on the lx_proc vnode + * instead of the intended mount point, which fails since /proc files + * are generally marked as unmountable (and we wouldn't want this + * anyway). + * As a workaround, the /proc vnode's link target is retrieved and + * used for mount. This lookup causes procfs to return the path of + * the vnode's realvp which is the path of the underlying directory. + * Additionally, since systemd is holding an open file descriptor for + * the mount point, overlay mounts also need to be allowed. + * + * * https://github.com/systemd/systemd/commit/28126409b20bca9aa6f + * * https://github.com/systemd/systemd/commit/21935150a0c42b91a32 + */ + if (strncmp(target, "/proc/self/fd/", 14) == 0 && + lookupname(target, UIO_SYSSPACE, NO_FOLLOW, NULLVPP, &vp) == 0) { + struct iovec iov = {0}; + struct uio uio = {0}; + + /* + * Offer one byte less than the buffer so that there is always + * room for the terminating NUL written below, even if the + * link target fills the space offered. + */ + iov.iov_base = target; + iov.iov_len = targetl - 1; + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_resid = targetl - 1; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_llimit = MAXOFFSET_T; + + rv = VOP_READLINK(vp, &uio, CRED(), NULL); + VN_RELE(vp); + if (rv != 0) + goto out; + target[(targetl - 1) - uio.uio_resid] = '\0'; + sflags |= MS_OVERLAY; + } + + /* Copy in Linux mount options. */ + if (datap != NULL && + (rv = lx_mnt_copyin_arg(datap, options, optionsl)) != 0) { + goto out; + } + + /* Do filesystem specific mount work. */ + if (flags & LX_MS_BIND) { + /* If MS_BIND is set, we turn this into a lofs mount. */ + (void) strcpy(fstype, "lofs"); + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lofs_options)) != 0) + goto out; + } else if (strcmp(fstype, "tmpfs") == 0) { + char idstr[64]; + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lx_tmpfs_options)) != 0) + goto out; + + /* + * Linux defaults to mode=1777 for tmpfs mounts.
+ */ + if (strstr(options, "mode=") == NULL) { + if (options[0] != '\0') + (void) strlcat(options, ",", optionsl); + (void) strlcat(options, "mode=1777", optionsl); + } + + /* + * Linux supports "nr_inodes=<val>" for tmpfs. There is no + * analogue in illumos - drop the option. + */ + (void) lx_mnt_opt_rm(options, "nr_inodes=", NULL, 0); + + switch (lx_mnt_opt_rm(options, "uid=", idstr, sizeof (idstr))) { + case 0: + uid = -1; + break; + case 1: + if (lx_mnt_opt_val(idstr, &uid) < 0) { + rv = EINVAL; + goto out; + } + break; + default: + rv = E2BIG; + goto out; + } + switch (lx_mnt_opt_rm(options, "gid=", idstr, sizeof (idstr))) { + case 0: + gid = -1; + break; + case 1: + if (lx_mnt_opt_val(idstr, &gid) < 0) { + rv = EINVAL; + goto out; + } + break; + default: + rv = E2BIG; + goto out; + } + + /* + * Linux seems to always allow overlay mounts. We allow this + * everywhere except under /dev where it interferes with device + * emulation. + */ + if (strcmp(target, "/dev") != 0 && + strncmp(target, "/dev/", 5) != 0) + sflags |= MS_OVERLAY; + } else if (strcmp(fstype, "proc") == 0) { + /* Translate proc mount requests to lx_proc requests. */ + (void) strcpy(fstype, "lx_proc"); + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lx_proc_options)) != 0) + goto out; + } else if (strcmp(fstype, "sysfs") == 0) { + /* Translate sysfs mount requests to lx_sysfs requests. */ + (void) strcpy(fstype, "lx_sysfs"); + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lx_sysfs_options)) != 0) + goto out; + } else if (strcmp(fstype, "cgroup") == 0) { + /* Translate cgroup mount requests to lx_cgroup requests. */ + (void) strcpy(fstype, "lx_cgroup"); + + /* + * Currently don't verify Linux mount options since we can + * have a subsystem string provided. + */ + } else if (strcmp(fstype, "autofs") == 0) { + /* Translate autofs mount requests to lxautofs requests.
*/ + (void) strcpy(fstype, LX_AUTOFS_NAME); + + /* Verify Linux mount options. */ + if ((rv = lx_mnt_opt_verify(options, lx_autofs_options)) != 0) + goto out; + + /* Linux seems to always allow overlay mounts */ + sflags |= MS_OVERLAY; + } else { + /* + * Unsupported filesystem type. Leave through the common exit + * path so that the source, target and options buffers + * allocated above are freed instead of leaked. + */ + rv = ENODEV; + goto out; + } + + /* Convert some Linux flags to illumos flags. */ + if (flags & LX_MS_RDONLY) + sflags |= MS_RDONLY; + if (flags & LX_MS_NOSUID) + sflags |= MS_NOSUID; + if (flags & LX_MS_REMOUNT) + sflags |= MS_REMOUNT; + + /* + * Convert some Linux flags to illumos option strings. + */ + if (flags & LX_MS_STRICTATIME) { + /* + * The "strictatime" mount option ensures that none of the + * weaker atime-related mode options are in effect. + */ + flags &= ~(LX_MS_RELATIME | LX_MS_NOATIME); + } + if ((flags & LX_MS_NODEV) && + (rv = lx_mnt_add_opt("nodev", options, optionsl)) != 0) { + goto out; + } + if ((flags & LX_MS_NOEXEC) && + (rv = lx_mnt_add_opt("noexec", options, optionsl)) != 0) { + goto out; + } + if ((flags & LX_MS_NOATIME) && + (rv = lx_mnt_add_opt("noatime", options, optionsl)) != 0) { + goto out; + } + + if ((rv = lookupname(target, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp)) != 0) + goto out; + + /* If mounting proc over itself, just return ok */ + if (strcmp(fstype, "lx_proc") == 0 && strcmp("lx_proc", + vfssw[vp->v_vfsp->vfs_fstype].vsw_name) == 0) { + VN_RELE(vp); + rv = 0; + goto out; + } + + map->spec = source; + map->dir = target; + map->flags = sflags; + map->fstype = fstype; + map->dataptr = NULL; + map->datalen = 0; + map->optptr = options; + map->optlen = optionsl; + + rv = domount(NULL, map, vp, CRED(), &vfsp); + VN_RELE(vp); + if (rv != 0) + goto out; + + VFS_RELE(vfsp); + if (strcmp(fstype, "tmpfs") == 0 && (uid != -1 || gid != -1)) { + /* Handle tmpfs uid/gid mount options.
*/ + if (lookupname(target, UIO_SYSSPACE, FOLLOW, NULLVPP, + &vp) == 0) { + (void) lx_vn_chown(vp, (uid_t)uid, (gid_t)gid); + VN_RELE(vp); + } + } + +out: + kmem_free(source, sourcel); + kmem_free(target, targetl); + kmem_free(options, optionsl); + + return (rv == 0 ? rv : set_errno(rv)); +} + +/* + * umount() is identical to illumos, though implemented on top of umount2(). + */ +long +lx_umount(char *path) +{ + return (umount2(path, 0)); +} + +/* + * The Linux umount2() system call is identical to illumos but has a different + * value for MNT_FORCE (the logical equivalent to MS_FORCE). + */ +#define LX_MNT_FORCE 0x1 + +long +lx_umount2(char *path, int flg) +{ + int flags = 0; + + if (flg & ~LX_MNT_FORCE) + return (set_errno(EINVAL)); + + if (flg & LX_MNT_FORCE) + flags |= MS_FORCE; + + return (umount2(path, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_open.c b/usr/src/uts/common/brand/lx/syscall/lx_open.c new file mode 100644 index 0000000000..5d5e4397ce --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_open.c @@ -0,0 +1,224 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. + */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/inttypes.h> +#include <sys/mutex.h> + +#include <sys/lx_types.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> +#include <sys/brand.h> + +extern int fcntl(int, int, intptr_t); +extern int openat(int, char *, int, int); +extern int open(char *, int, int); +extern int close(int); +extern int cioctl(file_t *, int, intptr_t, int *); +extern int lookupnameat(char *, enum uio_seg, int, vnode_t **, vnode_t **, + vnode_t *); + + +static int +ltos_open_flags(int input) +{ + int flags; + + if (input & LX_O_PATH) + input &= (LX_O_DIRECTORY | LX_O_NOFOLLOW | LX_O_CLOEXEC); + + /* + * The illumos O_ACCMODE also includes O_SEARCH|O_EXEC + * so this has the effect of stripping those here. + */ + flags = (input & LX_O_ACCMODE); + + if (input & LX_O_CREAT) + flags |= O_CREAT; + if (input & LX_O_EXCL) + flags |= O_EXCL; + if (input & LX_O_NOCTTY) + flags |= O_NOCTTY; + if (input & LX_O_TRUNC) + flags |= O_TRUNC; + if (input & LX_O_APPEND) + flags |= O_APPEND; + if (input & LX_O_NONBLOCK) + flags |= O_NONBLOCK; + if (input & LX_O_SYNC) + flags |= O_SYNC; + if (input & LX_O_LARGEFILE) + flags |= O_LARGEFILE; + if (input & LX_O_NOFOLLOW) + flags |= O_NOFOLLOW; + if (input & LX_O_CLOEXEC) + flags |= O_CLOEXEC; + if (input & LX_O_DIRECTORY) + flags |= O_DIRECTORY; + + /* + * Linux uses the LX_O_DIRECT flag to do raw, synchronous I/O to the + * device backing the fd in question. illumos has O_DIRECT but + * we additionally need O_RSYNC|O_SYNC to simulate the Linux + * semantics as far as possible. 
+ * + * The LX_O_DIRECT flag also requires that the transfer size and + * alignment of I/O buffers be a multiple of the logical block size for + * the underlying file system, but frankly there isn't an easy way to + * support that functionality without doing something like adding an + * fcntl(2) flag to denote LX_O_DIRECT mode. + * + * Since LX_O_DIRECT is merely a performance advisory, we'll just + * emulate what we can and trust that the only applications expecting + * an error when performing I/O from a misaligned buffer or when + * passing a transfer size is not a multiple of the underlying file + * system block size will be test suites. + */ + if (input & LX_O_DIRECT) + flags |= (O_RSYNC|O_SYNC|O_DIRECT); + + return (flags); +} + +#define LX_POSTPROCESS_OPTS (LX_O_ASYNC | LX_O_PATH) + +static int +lx_open_postprocess(int fd, int fmode) +{ + file_t *fp; + int error = 0; + + if ((fmode & LX_POSTPROCESS_OPTS) == 0) { + /* Skip out early, if possible */ + return (0); + } + + if ((fp = getf(fd)) == NULL) { + /* + * It is possible that this fd was closed by the time we + * arrived here if some one is hammering away with close(). + */ + return (EIO); + } + + if (fmode & LX_O_ASYNC && error == 0) { + if ((error = VOP_SETFL(fp->f_vnode, fp->f_flag, FASYNC, + fp->f_cred, NULL)) == 0) { + mutex_enter(&fp->f_tlock); + fp->f_flag |= FASYNC; + mutex_exit(&fp->f_tlock); + } + } + + if (fmode & LX_O_PATH && error == 0) { + /* + * While the O_PATH flag has no direct analog in SunOS, it is + * emulated by removing both FREAD and FWRITE from f_flag. + * This causes read(2) and write(2) result in EBADF and can be + * checked for in other syscalls to trigger the correct behavior + * there. 
+ */ + mutex_enter(&fp->f_tlock); + fp->f_flag &= ~(FREAD|FWRITE); + mutex_exit(&fp->f_tlock); + } + + releasef(fd); + if (error != 0) { + (void) closeandsetf(fd, NULL); + } + return (error); +} + +long +lx_openat(int atfd, char *path, int fmode, int cmode) +{ + int flags, fd, error; + mode_t mode = 0; + + if (atfd == LX_AT_FDCWD) + atfd = AT_FDCWD; + + flags = ltos_open_flags(fmode); + + if ((fmode & (LX_O_NOFOLLOW|LX_O_PATH|__FLXPATH)) == + (LX_O_NOFOLLOW|LX_O_PATH|__FLXPATH)) { + flags |= __FLXPATH; + } + + if (flags & O_CREAT) + mode = (mode_t)cmode; + + ttolwp(curthread)->lwp_errno = 0; + fd = openat(atfd, path, flags, mode); + if (ttolwp(curthread)->lwp_errno != 0) { + if ((fmode & (LX_O_NOFOLLOW|LX_O_PATH|__FLXPATH)) == + (LX_O_NOFOLLOW|LX_O_PATH) && + ttolwp(curthread)->lwp_errno == ELOOP) { + /* + * On Linux, if O_NOFOLLOW and O_PATH are set together + * and the target is a symbolic link, then openat + * should return a file descriptor referring to the + * symbolic link. + * + * This file descriptor can be used with fchownat(2), + * fstatat(2), linkat(2), and readlinkat(2) alongside + * an empty pathname. + * + * illumos has a private interface flag that causes + * openat() to return a file descriptor attached to + * the symlink's vnode. This, in conjunction with the + * other adjustments made in lx_open_postprocess() + * for O_PATH, is enough to satisfy systemd and + * other parts of Linux. 
+ */ + return (lx_openat(atfd, path, fmode|__FLXPATH, cmode)); + } + + if (ttolwp(curthread)->lwp_errno == EINTR) + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + return (ttolwp(curthread)->lwp_errno); + } + + if ((error = lx_open_postprocess(fd, fmode)) != 0) { + return (set_errno(error)); + } + return (fd); +} + +long +lx_open(char *path, int fmode, int cmode) +{ + return (lx_openat(LX_AT_FDCWD, path, fmode, cmode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_personality.c b/usr/src/uts/common/brand/lx/syscall/lx_personality.c new file mode 100644 index 0000000000..e7aa945b50 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_personality.c @@ -0,0 +1,112 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/mutex.h> +#include <sys/brand.h> + +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> + + +/* + * These flags are for what Linux calls "bug emulation". + * (Descriptions from the personality(2) Linux man page.) + * + * Flags which are currently actionable in LX: + * - READ_IMPLIES_EXEC (since Linux 2.6.8) + * With this flag set, PROT_READ implies PROT_EXEC for mmap(2). + * + * Flags which are current accepted but ignored: + * - UNAME26 (since Linux 3.1) + * Have uname(2) report a 2.6.40+ version number rather than a 3.x version + * number. Added as a stopgap measure to support broken applications that + * could not handle the kernel version- numbering switch from 2.6.x to 3.x. 
+ * + * - ADDR_NO_RANDOMIZE (since Linux 2.6.12) + * With this flag set, disable address-space-layout randomization. + * + * - FDPIC_FUNCPTRS (since Linux 2.6.11) + * User-space function pointers to signal handlers point (on certain + * architectures) to descriptors. + * + * - MMAP_PAGE_ZERO (since Linux 2.4.0) + * Map page 0 as read-only (to support binaries that depend on this SVr4 + * behavior). + * + * - ADDR_COMPAT_LAYOUT (since Linux 2.6.9) + * With this flag set, provide legacy virtual address space layout. + * + * - ADDR_LIMIT_32BIT (since Linux 2.2) + * Limit the address space to 32 bits. + * + * - SHORT_INODE (since Linux 2.4.0) + * No effects(?). + * + * - WHOLE_SECONDS (since Linux 1.2.0) + * No effects(?). + * + * - STICKY_TIMEOUTS (since Linux 1.2.0) + * With this flag set, select(2), pselect(2), and ppoll(2) do not modify the + * returned timeout argument when interrupted by a signal handler. + * + * - ADDR_LIMIT_3GB (since Linux 2.4.0) + * With this flag set, use 0xc0000000 as the offset at which to search a + * virtual memory chunk on mmap(2); otherwise use 0xffffe000. + */ + +#define LX_PER_GET 0xffffffff + +long +lx_personality(unsigned int arg) +{ + lx_proc_data_t *lxpd = ptolxproc(curproc); + unsigned int result = 0; + + mutex_enter(&curproc->p_lock); + result = lxpd->l_personality; + + if (arg == LX_PER_GET) { + mutex_exit(&curproc->p_lock); + return (result); + } + + /* + * Prevent changes to the personality if the process is undergoing an + * exec. This will allow elfexec and friends to manipulate the + * personality without hinderance. + */ + if ((curproc->p_flag & P_PR_EXEC) != 0) { + mutex_exit(&curproc->p_lock); + return (set_errno(EINVAL)); + } + + /* + * Keep tabs when a non-Linux personality is set. This is silently + * allowed to succeed, even though the emulation required is almost + * certainly missing. 
+ */ + if ((arg & LX_PER_MASK) != LX_PER_LINUX) { + char buf[64]; + + (void) snprintf(buf, sizeof (buf), "invalid personality: %02X", + arg & LX_PER_MASK); + lx_unsupported(buf); + } + + lxpd->l_personality = arg; + mutex_exit(&curproc->p_lock); + return (result); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pgrp.c b/usr/src/uts/common/brand/lx/syscall/lx_pgrp.c new file mode 100644 index 0000000000..2acd9d431e --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_pgrp.c @@ -0,0 +1,189 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/lx_misc.h> + +#define LX_INIT_PGID 1 +#define LX_INIT_SID 1 + +/* From uts/common/syscall/pgrpsys.c */ +extern int setpgrp(int, int, int); + +long +lx_getpgrp(void) +{ + int pg; + + /* getpgrp() */ + pg = setpgrp(0, 0, 0); + + /* + * If the pgrp is that of the init process, return the value Linux + * expects. + */ + if (pg == curzone->zone_proc_initpid) + return (LX_INIT_PGID); + + return (pg); +} + +long +lx_getpgid(int pid) +{ + pid_t spid; + int tid; + int pg; + + if (pid < 0) + return (set_errno(ESRCH)); + + /* + * If the supplied pid matches that of the init process, return the pgid + * Linux expects. 
+ */ + if (pid == curzone->zone_proc_initpid) + return (LX_INIT_PGID); + + if (pid == 0) { + spid = curproc->p_pid; + } else if (lx_lpid_to_spair(pid, &spid, &tid) < 0) { + return (set_errno(ESRCH)); + } + + /* getpgid() */ + ttolwp(curthread)->lwp_errno = 0; + pg = setpgrp(4, spid, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + /* + * If the pgid is that of the init process, return the value Linux + * expects. + */ + if (pg == curzone->zone_proc_initpid) + return (LX_INIT_PGID); + + return (pg); +} + +long +lx_setpgid(pid_t pid, pid_t pgid) +{ + pid_t spid, spgid; + int tid; + int pg; + int ret; + + if (pid < 0) + return (set_errno(ESRCH)); + + if (pgid < 0) + return (set_errno(EINVAL)); + + if (pid == 0) { + spid = curproc->p_pid; + } else if (lx_lpid_to_spair(pid, &spid, &tid) < 0) { + return (set_errno(ESRCH)); + } + + if (pgid == 0) { + spgid = spid; + } else if (lx_lpid_to_spair(pgid, &spgid, &tid) < 0) { + return (set_errno(ESRCH)); + } + + /* setpgid() */ + ret = setpgrp(5, spid, spgid); + + if (ret == EPERM) { + /* + * On Linux, when calling setpgid with a desired pgid that is + * equal to the current pgid of the process, no error is + * emitted. This differs slightly from illumos which would + * return EPERM. To emulate the Linux behavior, we check + * specifically for matching pgids. + */ + + /* getpgid() */ + ttolwp(curthread)->lwp_errno = 0; + pg = setpgrp(4, spid, 0); + if (ttolwp(curthread)->lwp_errno == 0 && spgid == pg) + return (0); + return (set_errno(EPERM)); + } + + return (ret); +} + +long +lx_getsid(int pid) +{ + pid_t spid; + int tid; + int sid; + + if (pid < 0) + return (set_errno(ESRCH)); + + /* + * If the supplied pid matches that of the init process, return the sid + * Linux expects. 
+ */ + if (pid == curzone->zone_proc_initpid) + return (LX_INIT_SID); + + if (pid == 0) { + spid = curproc->p_pid; + } else if (lx_lpid_to_spair(pid, &spid, &tid) < 0) { + return (set_errno(ESRCH)); + } + + /* getsid() */ + ttolwp(curthread)->lwp_errno = 0; + sid = setpgrp(2, spid, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + + /* + * If the sid is that of the init process, return the value Linux + * expects. + */ + if (sid == curzone->zone_proc_initpid) + return (LX_INIT_SID); + + return (sid); +} + +long +lx_setsid(void) +{ + int sid; + + /* setsid() */ + ttolwp(curthread)->lwp_errno = 0; + sid = setpgrp(3, 0, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + /* + * If the sid is that of the init process, return the value Linux + * expects. + */ + if (sid == curzone->zone_proc_initpid) + return (LX_INIT_SID); + + return (sid); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c new file mode 100644 index 0000000000..96959e40df --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c @@ -0,0 +1,309 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. All Rights Reserved. + * + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/errno.h> +#include <sys/debug.h> +#include <sys/fs/fifonode.h> +#include <sys/fcntl.h> +#include <sys/policy.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/sysmacros.h> + +#define LX_DEFAULT_PIPE_SIZE 65536 + +/* + * Our default value for fs.pipe-size-max mirrors Linux. The enforced maximum + * is meant to provide some sort of upper bound on pipe buffer sizing. Its + * value was chosen somewhat arbitrarily. + */ +uint_t lx_pipe_max_default = 1048576; +uint_t lx_pipe_max_limit = 8388608; + +int +lx_pipe_setsz(stdata_t *str, uint_t size, boolean_t is_init) +{ + int err; + stdata_t *mate; + lx_zone_data_t *lxzd = ztolxzd(curzone); + uint_t max_size = lxzd->lxzd_pipe_max_sz; + fifonode_t *fnp1, *fnp2; + + size = P2ROUNDUP(size, PAGESIZE); + if (size == 0) { + return (EINVAL); + } else if (size > max_size && secpolicy_resource(CRED()) != 0) { + if (!is_init) { + return (EPERM); + } + /* + * If the size limit is breached during initial pipe setup, + * simply clamp it to the maximum. 
On Linux kernels prior to + * 4.9, this clamping would not occur and it would be possible + * to open a pipe with the default buffer size even if it + * exceeded the sysctl limit. Rather than trigger behavior + * here based on the configured kernel version, it is applied + * to all callers. + */ + size = max_size; + ASSERT(max_size <= lx_pipe_max_limit); + } else if (size > lx_pipe_max_limit) { + /* + * Unlike Linux, we do maintain a global hard cap on pipe + * buffer limits. + */ + return (EPERM); + } + + if (!STRMATED(str)) { + err = strqset(RD(str->sd_wrq), QHIWAT, 0, (intptr_t)size); + if (err == 0) { + fnp1 = VTOF(str->sd_vnode); + mutex_enter(&fnp1->fn_lock->flk_lock); + fnp1->fn_hiwat = size; + mutex_exit(&fnp1->fn_lock->flk_lock); + } + return (err); + } + + /* + * Ensure consistent order so the set operation is always attempted on + * the "higher" stream first. + */ + if (str > str->sd_mate) { + VERIFY((mate = str->sd_mate) != NULL); + } else { + mate = str; + VERIFY((str = mate->sd_mate) != NULL); + } + + /* + * While it is unfortunate that an error could occur for the latter + * half of the stream pair, there is little to be done about it aside + * from reporting the failure. + */ + if ((err = strqset(RD(str->sd_wrq), QHIWAT, 0, (intptr_t)size)) == 0) { + err = strqset(RD(mate->sd_wrq), QHIWAT, 0, (intptr_t)size); + } + + if (err == 0) { + fnp1 = VTOF(str->sd_vnode); + fnp2 = VTOF(str->sd_mate->sd_vnode); + + /* + * See fnode_constructor. Both sides should have the same + * lock. We expect our callers to ensure that the vnodes + * are VFIFO and have v_op == fifovnops. + */ + ASSERT(str->sd_vnode->v_type == VFIFO); + ASSERT(str->sd_mate->sd_vnode->v_type == VFIFO); + ASSERT(fnp1->fn_lock == fnp2->fn_lock); + + mutex_enter(&fnp1->fn_lock->flk_lock); + + fnp1->fn_hiwat = size; + fnp2->fn_hiwat = size; + + mutex_exit(&fnp1->fn_lock->flk_lock); + } + + return (err); +} + +/* + * Based on native pipe(2) system call, except that the pipe is half-duplex. 
+ */ +static int +lx_hd_pipe(intptr_t arg, int flags) +{ + vnode_t *vp1, *vp2; + struct file *fp1, *fp2; + int error = 0; + int flag1, flag2, iflags; + int fd1, fd2; + stdata_t *str; + + /* + * Validate allowed flags. + */ + if ((flags & ~(FCLOEXEC|FNONBLOCK)) != 0) { + return (set_errno(EINVAL)); + } + /* + * Allocate and initialize two vnodes. + */ + makepipe(&vp1, &vp2); + + /* + * Allocate and initialize two file table entries and two + * file pointers. The first file pointer is open for read and the + * second is open for write. + */ + if ((error = falloc(vp1, FREAD, &fp1, &fd1)) != 0) { + VN_RELE(vp1); + VN_RELE(vp2); + return (set_errno(error)); + } + + if ((error = falloc(vp2, FWRITE, &fp2, &fd2)) != 0) + goto out2; + + /* + * Create two stream heads and attach to each vnode. + */ + if ((error = fifo_stropen(&vp1, FREAD, fp1->f_cred, 0, 0)) != 0) + goto out; + + if ((error = fifo_stropen(&vp2, FWRITE, fp2->f_cred, 0, 0)) != 0) { + (void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0, + fp1->f_cred, NULL); + goto out; + } + + strmate(vp1, vp2); + + VTOF(vp1)->fn_ino = VTOF(vp2)->fn_ino = fifogetid(); + + /* + * Attempt to set pipe buffer sizes to expected value. + */ + VERIFY((str = vp1->v_stream) != NULL); + (void) lx_pipe_setsz(str, LX_DEFAULT_PIPE_SIZE, B_TRUE); + + /* + * Set the O_NONBLOCK flag if requested. + */ + if (flags & FNONBLOCK) { + flag1 = fp1->f_flag; + flag2 = fp2->f_flag; + iflags = flags & FNONBLOCK; + + if ((error = VOP_SETFL(vp1, flag1, iflags, fp1->f_cred, + NULL)) != 0) { + goto out_vop_close; + } + fp1->f_flag |= iflags; + + if ((error = VOP_SETFL(vp2, flag2, iflags, fp2->f_cred, + NULL)) != 0) { + goto out_vop_close; + } + fp2->f_flag |= iflags; + } + + /* + * Return the file descriptors to the user. They now + * point to two different vnodes which have different + * stream heads. 
+ */ + if (copyout(&fd1, &((int *)arg)[0], sizeof (int)) || + copyout(&fd2, &((int *)arg)[1], sizeof (int))) { + error = EFAULT; + goto out_vop_close; + } + + /* + * Now fill in the entries that falloc reserved + */ + mutex_exit(&fp1->f_tlock); + mutex_exit(&fp2->f_tlock); + setf(fd1, fp1); + setf(fd2, fp2); + + /* + * Optionally set the FCLOEXEC flag + */ + if ((flags & FCLOEXEC) != 0) { + f_setfd(fd1, FD_CLOEXEC); + f_setfd(fd2, FD_CLOEXEC); + } + + return (0); +out_vop_close: + (void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0, fp1->f_cred, NULL); + (void) VOP_CLOSE(vp2, FWRITE, 1, (offset_t)0, fp2->f_cred, NULL); +out: + setf(fd2, NULL); + unfalloc(fp2); +out2: + setf(fd1, NULL); + unfalloc(fp1); + VN_RELE(vp1); + VN_RELE(vp2); + return (set_errno(error)); +} + +/* + * pipe(2) system call. + */ +long +lx_pipe(intptr_t arg) +{ + return (lx_hd_pipe(arg, 0)); +} + +/* + * pipe2(2) system call. + */ +long +lx_pipe2(intptr_t arg, int lxflags) +{ + int flags = 0; + + /* + * Validate allowed flags. + */ + if ((lxflags & ~(LX_O_NONBLOCK | LX_O_CLOEXEC)) != 0) { + return (set_errno(EINVAL)); + } + + /* + * Convert from Linux flags to illumos flags. + */ + if (lxflags & LX_O_NONBLOCK) { + flags |= FNONBLOCK; + } + if (lxflags & LX_O_CLOEXEC) { + flags |= FCLOEXEC; + } + + return (lx_hd_pipe(arg, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_poll.c b/usr/src/uts/common/brand/lx/syscall/lx_poll.c new file mode 100644 index 0000000000..e54130aff1 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_poll.c @@ -0,0 +1,786 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/sunddi.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/poll_impl.h> +#include <sys/schedctl.h> +#include <sys/lx_signal.h> + +/* + * Max number of FDs that can be given to poll() or select() before we return + * EINVAL (the Linux man page documents this value as {OPEN_MAX}, and defaults + * it to this value). + */ +int lx_poll_max_fds = 1048576; + +/* From uts/common/syscall/poll.c */ +extern int poll_copyin(pollstate_t *, pollfd_t *, nfds_t); +extern int poll_common(pollstate_t *, pollfd_t *, nfds_t, timespec_t *, int *); + +/* + * These events are identical between Linux and SunOS + */ +#define LX_POLLIN 0x001 +#define LX_POLLPRI 0x002 +#define LX_POLLOUT 0x004 +#define LX_POLLERR 0x008 +#define LX_POLLHUP 0x010 +#define LX_POLLNVAL 0x020 +#define LX_POLLRDNORM 0x040 +#define LX_POLLRDBAND 0x080 + +#define LX_POLL_COMMON_EVENTS (LX_POLLIN | LX_POLLPRI | LX_POLLOUT | \ + LX_POLLERR | LX_POLLHUP | LX_POLLNVAL | LX_POLLRDNORM | LX_POLLRDBAND) + +/* + * These events differ between Linux and SunOS + */ +#define LX_POLLWRNORM 0x0100 +#define LX_POLLWRBAND 0x0200 +#define LX_POLLRDHUP 0x2000 + + +#define LX_POLL_SUPPORTED_EVENTS \ + (LX_POLL_COMMON_EVENTS | LX_POLLWRNORM | LX_POLLWRBAND | LX_POLLRDHUP) + + +static int +lx_poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, short *oldevt) +{ + int i, error = 0; + pollfd_t *pollfdp; + + if ((error = poll_copyin(ps, fds, nfds)) != 0) { + return (error); + } + pollfdp = ps->ps_pollfd; + + /* Convert the Linux events bitmask into SunOS equivalent. */ + for (i = 0; i < nfds; i++) { + short lx_events = pollfdp[i].events; + short events; + + /* + * If the caller is polling for an unsupported event, we + * have to bail out. 
+ */ + if (lx_events & ~LX_POLL_SUPPORTED_EVENTS) { + return (ENOTSUP); + } + + events = lx_events & LX_POLL_COMMON_EVENTS; + if (lx_events & LX_POLLWRNORM) + events |= POLLWRNORM; + if (lx_events & LX_POLLWRBAND) + events |= POLLWRBAND; + if (lx_events & LX_POLLRDHUP) + events |= POLLRDHUP; + pollfdp[i].events = events; + oldevt[i] = lx_events; + } + return (0); +} + +static int +lx_poll_copyout(pollfd_t *pollfdp, pollfd_t *fds, nfds_t nfds, short *oldevt) +{ + int i; + + /* + * Convert SunOS revents bitmask into Linux equivalent and restore + * cached events field which was swizzled by lx_poll_copyin. + */ + for (i = 0; i < nfds; i++) { + short revents = pollfdp[i].revents; + short lx_revents = revents & LX_POLL_COMMON_EVENTS; + short orig_events = oldevt[i]; + + if (revents & POLLWRBAND) + lx_revents |= LX_POLLWRBAND; + if (revents & POLLRDHUP) + lx_revents |= LX_POLLRDHUP; + /* + * Because POLLOUT and POLLWRNORM are natively defined as the + * same value, care must be taken when translating them to + * Linux where they differ. + */ + if (revents & POLLOUT) { + if ((orig_events & LX_POLLOUT) == 0) + lx_revents &= ~LX_POLLOUT; + if (orig_events & LX_POLLWRNORM) + lx_revents |= LX_POLLWRNORM; + } + + pollfdp[i].revents = lx_revents; + pollfdp[i].events = orig_events; + } + + if (copyout(pollfdp, fds, sizeof (pollfd_t) * nfds) != 0) + return (EFAULT); + + return (0); +} + +static long +lx_poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + pollstate_t *ps = NULL; + pollfd_t *pollfdp = NULL; + short *oldevt = NULL; + int error = 0, fdcnt = 0; + + /* + * Reset our signal mask, if requested. + */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = *ksetp; + t->t_flag |= T_TOMASK; + /* + * Call cv_reltimedwait_sig() just to check for signals.
+ * We will return immediately with either 0 or -1. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, + TR_CLOCK_TICK)) { + mutex_exit(&p->p_lock); + error = EINTR; + goto pollout; + } + mutex_exit(&p->p_lock); + } + + /* + * Initialize pollstate and copy in pollfd data if present. + */ + if (nfds != 0) { + /* + * Cap the number of FDs they can give us so we don't go + * allocating a huge chunk of memory. Note that this is *not* + * the RLIMIT_NOFILE rctl. + */ + if (nfds > lx_poll_max_fds) { + error = EINVAL; + goto pollout; + } + + /* + * Need to allocate memory for pollstate before anything + * because the mutex and cv are created in this space + */ + ps = pollstate_create(); + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + + /* + * Certain event types which are distinct on Linux are aliased + * against each other on illumos. In order to properly translate + * back into the Linux format, the original events of interest + * are stored in 'oldevt' for use during lx_poll_copyout. + */ + oldevt = kmem_alloc(nfds * sizeof (short), KM_SLEEP); + if ((error = lx_poll_copyin(ps, fds, nfds, oldevt)) != 0) + goto pollout; + pollfdp = ps->ps_pollfd; + + /* + * The Linux poll(2) implicitly polls for POLLERR and POLLHUP + * in addition to any other events specified for the file + * descriptors in question. It does not modify pollfd_t`events + * to reflect that fact when performing a later copyout. + */ + ps->ps_implicit_ev = POLLERR | POLLHUP; + } + + /* + * Perform the actual poll. + */ + error = poll_common(ps, fds, nfds, tsp, &fdcnt); + + /* + * Clear implicit event interest, if needed. + */ + if (ps != NULL) { + ps->ps_implicit_ev = 0; + } + + +pollout: + /* + * If we changed the signal mask but we received no signal then restore + * the signal mask. Otherwise psig() will deal with the signal mask.
+ */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + if (lwp->lwp_cursig == 0) { + t->t_hold = lwp->lwp_sigoldmask; + t->t_flag &= ~T_TOMASK; + } + mutex_exit(&p->p_lock); + } + + /* + * Copy out the events and return the fdcnt to the user. + */ + if (nfds != 0 && error == 0) { + error = lx_poll_copyout(pollfdp, fds, nfds, oldevt); + } + if (oldevt != NULL) { + kmem_free(oldevt, nfds * sizeof (short)); + } + if (error) { + return (set_errno(error)); + } + return (fdcnt); +} + +long +lx_poll(pollfd_t *fds, nfds_t nfds, int timeout) +{ + timespec_t ts, *tsp = NULL; + + if (timeout >= 0) { + ts.tv_sec = timeout / MILLISEC; + ts.tv_nsec = (timeout % MILLISEC) * MICROSEC; + tsp = &ts; + } + + return (lx_poll_common(fds, nfds, tsp, NULL)); +} + +long +lx_ppoll(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, lx_sigset_t *setp) +{ + timespec_t ts, *tsp = NULL; + k_sigset_t kset, *ksetp = NULL; + + /* + * Copy in timeout and sigmask. + */ + if (timeoutp != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + if (setp != NULL) { + lx_sigset_t lset; + + if (copyin(setp, &lset, sizeof (lset))) + return (set_errno(EFAULT)); + lx_ltos_sigset(&lset, &kset); + ksetp = &kset; + } + + return (lx_poll_common(fds, nfds, tsp, ksetp)); +} + +typedef struct lx_select_buf_s { + long *lsb_rfds; + long *lsb_wfds; + long *lsb_efds; + unsigned int lsb_size; +} lx_select_buf_t; + +/* + * Size (in bytes) of buffer appropriate for fd_set copyin/copyout. + * Linux uses buffers of 'long' to accomplish this. 
+ */ +#define LX_FD_SET_BYTES (sizeof (long)) +#define LX_FD_SET_BITS (8 * LX_FD_SET_BYTES) +#define LX_FD_SET_SIZE(nfds) \ + ((((nfds) + (LX_FD_SET_BITS - 1)) / LX_FD_SET_BITS) * LX_FD_SET_BYTES) + +static int +lx_select_copyin(pollstate_t *ps, lx_select_buf_t *sbuf, int nfds, + long *rfds, long *wfds, long *efds) +{ + int n; + long *in, *out, *ex; + long absent = 0; + pollfd_t *pfd; + nfds_t old_nfds; + + /* + * Just like pollsys and lx_poll, attempt to reuse ps_pollfd if it is + * appropriately sized. See poll_copyin for more detail. + */ + old_nfds = ps->ps_nfds; + if (nfds != old_nfds) { + kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); + pfd = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); + ps->ps_pollfd = pfd; + ps->ps_nfds = nfds; + } else { + pfd = ps->ps_pollfd; + } + + if (rfds != NULL) { + if (copyin(rfds, sbuf->lsb_rfds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + if (wfds != NULL) { + if (copyin(wfds, sbuf->lsb_wfds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + if (efds != NULL) { + if (copyin(efds, sbuf->lsb_efds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + + /* + * For each fd, if any bits are set convert them into the appropriate + * pollfd struct. (Derived from libc's select logic) + */ + in = (rfds != NULL) ? sbuf->lsb_rfds : &absent; + out = (wfds != NULL) ? sbuf->lsb_wfds : &absent; + ex = (efds != NULL) ? 
sbuf->lsb_efds : &absent; + for (n = 0; n < nfds; n += LX_FD_SET_BITS) { + unsigned long b, m, j; + + b = (unsigned long)(*in | *out | *ex); + m = 1; + for (j = 0; j < LX_FD_SET_BITS; j++) { + int fd = n + j; + + if (fd >= nfds) + return (0); + pfd->events = 0; + if (b & 1) { + pfd->fd = fd; + if (*in & m) + pfd->events |= POLLRDNORM; + if (*out & m) + pfd->events |= POLLWRNORM; + if (*ex & m) + pfd->events |= POLLRDBAND; + } else { + pfd->fd = -1; + } + pfd++; + b >>= 1; + m <<= 1; + } + + if (rfds != NULL) + in++; + if (wfds != NULL) + out++; + if (efds != NULL) + ex++; + } + return (0); +} + +static int +lx_select_copyout(pollfd_t *pollfdp, lx_select_buf_t *sbuf, int nfds, + long *rfds, long *wfds, long *efds, int *fdcnt) +{ + int n; + pollfd_t *pfd; + long rv = 0; + + /* + * If poll did not find any fds of interest, we can just zero out the + * fd_set fields for copyout. + */ + if (*fdcnt == 0) { + if (rfds != NULL) { + bzero(sbuf->lsb_rfds, sbuf->lsb_size); + } + if (wfds != NULL) { + bzero(sbuf->lsb_wfds, sbuf->lsb_size); + } + if (efds != NULL) { + bzero(sbuf->lsb_efds, sbuf->lsb_size); + } + goto copyout; + } + + /* + * For each fd, convert any events reported in its pollfd struct back + * into the appropriate fd_set bits. (Derived from libc's select logic) + */ + pfd = pollfdp; + for (n = 0; n < nfds; n += LX_FD_SET_BITS) { + unsigned long m, j; + long in = 0, out = 0, ex = 0; + + m = 1; + for (j = 0; j < LX_FD_SET_BITS; j++) { + if ((n + j) >= nfds) + break; + if (pfd->revents != 0) { + if (pfd->revents & POLLNVAL) { + return (EBADF); + } + if (pfd->revents & POLLRDNORM) { + in |= m; + rv++; + } + if (pfd->revents & POLLWRNORM) { + out |= m; + rv++; + } + if (pfd->revents & POLLRDBAND) { + ex |= m; + rv++; + } + /* + * Only set this bit on return if we asked + * about input conditions.
+ */ + if ((pfd->revents & (POLLHUP|POLLERR)) && + (pfd->events & POLLRDNORM)) { + if ((in & m) == 0) { + /* wasn't already set */ + rv++; + } + in |= m; + } + /* + * Only set this bit on return if we asked + * about output conditions. + */ + if ((pfd->revents & (POLLHUP|POLLERR)) && + (pfd->events & POLLWRNORM)) { + if ((out & m) == 0) { + /* wasn't already set */ + rv++; + } + out |= m; + } + /* + * Only set this bit on return if we asked + * about exceptional conditions. + */ + if ((pfd->revents & (POLLHUP|POLLERR)) && + (pfd->events & POLLRDBAND)) { + if ((ex & m) == 0) { + /* wasn't already set */ + rv++; + } + ex |= m; + } + } + m <<= 1; + pfd++; + } + if (rfds != NULL) + sbuf->lsb_rfds[n / LX_FD_SET_BITS] = in; + if (wfds != NULL) + sbuf->lsb_wfds[n / LX_FD_SET_BITS] = out; + if (efds != NULL) + sbuf->lsb_efds[n / LX_FD_SET_BITS] = ex; + } + +copyout: + if (rfds != NULL) { + if (copyout(sbuf->lsb_rfds, rfds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + if (wfds != NULL) { + if (copyout(sbuf->lsb_wfds, wfds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + if (efds != NULL) { + if (copyout(sbuf->lsb_efds, efds, sbuf->lsb_size) != 0) { + return (EFAULT); + } + } + *fdcnt = rv; + return (0); +} + + +static long +lx_select_common(int nfds, long *rfds, long *wfds, long *efds, + timespec_t *tsp, k_sigset_t *ksetp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + pollstate_t *ps = NULL; + pollfd_t *pollfdp = NULL, *fake_fds = NULL; + lx_select_buf_t sbuf = {0}; + int error = 0, fdcnt = 0; + + if (nfds < 0) { + return (set_errno(EINVAL)); + } + + /* + * Reset our signal mask, if requested. + */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = *ksetp; + t->t_flag |= T_TOMASK; + /* + * Call cv_reltimedwait_sig() just to check for signals. + * We will return immediately with either 0 or -1.
+ */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, + TR_CLOCK_TICK)) { + mutex_exit(&p->p_lock); + error = EINTR; + goto out; + } + mutex_exit(&p->p_lock); + } + + /* + * Because poll caching uses the userspace pollfd_t pointer to verify + * cache reuse validity, a simulated value must be supplied when + * emulating Linux select(2). The first non-NULL pointer from + * rfds/wfds/efds is used for this purpose. + */ + if (rfds != NULL) { + fake_fds = (pollfd_t *)rfds; + } else if (wfds != NULL) { + fake_fds = (pollfd_t *)wfds; + } else if (efds != NULL) { + fake_fds = (pollfd_t *)efds; + } else { + /* + * A non-zero nfds was supplied but all three fd_set pointers + * were null. Fall back to doing a simple timeout. + */ + nfds = 0; + } + + /* + * Initialize pollstate and copy in pollfd data if present. + */ + if (nfds != 0) { + /* + * Cap the number of FDs they can give us so we don't go + * allocating a huge chunk of memory. Note that this is *not* + * the RLIMIT_NOFILE rctl. + */ + if (nfds > lx_poll_max_fds) { + error = EINVAL; + goto out; + } + + /* + * Need to allocate memory for pollstate before anything + * because the mutex and cv are created in this space + */ + ps = pollstate_create(); + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + + sbuf.lsb_size = LX_FD_SET_SIZE(nfds); + if (rfds != NULL) + sbuf.lsb_rfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP); + if (wfds != NULL) + sbuf.lsb_wfds = kmem_alloc(sbuf.lsb_size, KM_SLEEP); + if (efds != NULL) + sbuf.lsb_efds = kmem_alloc(sbuf.lsb_size, KM_SLEEP); + + error = lx_select_copyin(ps, &sbuf, nfds, rfds, wfds, efds); + if (error != 0) { + goto out; + } + + pollfdp = ps->ps_pollfd; + } + + /* + * Perform the actual poll. + */ + error = poll_common(ps, fake_fds, (nfds_t)nfds, tsp, &fdcnt); + +out: + /* + * If we changed the signal mask but we received no signal then restore + * the signal mask. Otherwise psig() will deal with the signal mask. 
+ */ + if (ksetp != NULL) { + mutex_enter(&p->p_lock); + if (lwp->lwp_cursig == 0) { + t->t_hold = lwp->lwp_sigoldmask; + t->t_flag &= ~T_TOMASK; + } + mutex_exit(&p->p_lock); + } + + /* + * Copy out the events and return the fdcnt to the user. + */ + if (error == 0 && nfds != 0) { + error = lx_select_copyout(pollfdp, &sbuf, nfds, rfds, wfds, + efds, &fdcnt); + } + if (sbuf.lsb_size != 0) { + if (sbuf.lsb_rfds != NULL) + kmem_free(sbuf.lsb_rfds, sbuf.lsb_size); + if (sbuf.lsb_wfds != NULL) + kmem_free(sbuf.lsb_wfds, sbuf.lsb_size); + if (sbuf.lsb_efds != NULL) + kmem_free(sbuf.lsb_efds, sbuf.lsb_size); + } + if (error) { + return (set_errno(error)); + } + return (fdcnt); +} + +long +lx_select(int nfds, long *rfds, long *wfds, long *efds, + struct timeval *timeoutp) +{ + timespec_t ts, *tsp = NULL; + + if (timeoutp != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + struct timeval tv; + + if (copyin(timeoutp, &tv, sizeof (tv))) + return (set_errno(EFAULT)); + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * (NANOSEC / MICROSEC); + } else { + struct timeval32 tv32; + + if (copyin(timeoutp, &tv32, sizeof (tv32))) + return (set_errno(EFAULT)); + ts.tv_sec = tv32.tv_sec; + ts.tv_nsec = tv32.tv_usec * (NANOSEC / MICROSEC); + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + + return (lx_select_common(nfds, rfds, wfds, efds, tsp, NULL)); +} + + +typedef struct { + uintptr_t lpsa_addr; + unsigned long lpsa_len; +} lx_pselect_sig_arg_t; + +#if defined(_LP64) +typedef struct { + caddr32_t lpsa_addr; + uint32_t lpsa_len; +} lx_pselect_sig_arg32_t; +#endif /* defined(_LP64) */ + +long +lx_pselect(int nfds, long *rfds, long *wfds, long *efds, + timespec_t *timeoutp, void *setp) +{ + timespec_t ts, *tsp = NULL; + k_sigset_t kset, *ksetp = NULL; + + /* + * Copy in timeout and sigmask. 
+ */ + if (timeoutp != NULL) { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + if (setp != NULL) { + lx_sigset_t lset, *sigaddr = NULL; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_pselect_sig_arg_t lpsa; + + if (copyin(setp, &lpsa, sizeof (lpsa)) != 0) + return (set_errno(EFAULT)); + /* + * Linux forces a size to be passed only so it can + * check that it's the size of a sigset_t. + */ + if (lpsa.lpsa_len != sizeof (lx_sigset_t)) + return (set_errno(EINVAL)); + + sigaddr = (lx_sigset_t *)lpsa.lpsa_addr; + } +#if defined(_LP64) + else { + lx_pselect_sig_arg32_t lpsa32; + + if (copyin(setp, &lpsa32, sizeof (lpsa32)) != 0) + return (set_errno(EFAULT)); + /* + * Linux forces a size to be passed only so it can + * check that it's the size of a sigset_t. + */ + if (lpsa32.lpsa_len != sizeof (lx_sigset_t)) + return (set_errno(EINVAL)); + + sigaddr = (lx_sigset_t *)(uint64_t)lpsa32.lpsa_addr; + } +#endif /* defined(_LP64) */ + + /* This is where we check if the sigset is *really* NULL. */ + if (sigaddr != NULL) { + if (copyin(sigaddr, &lset, sizeof (lset)) != 0) + return (set_errno(EFAULT)); + + lx_ltos_sigset(&lset, &kset); + ksetp = &kset; + } + } + + return (lx_select_common(nfds, rfds, wfds, efds, tsp, ksetp)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_prctl.c b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c new file mode 100644 index 0000000000..8b3c267653 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_prctl.c @@ -0,0 +1,351 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/user.h>
+#include <sys/priv.h>
+#include <sys/brand.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_misc.h>
+#include <lx_signum.h>
+
+#define	LX_PR_SET_PDEATHSIG		1
+#define	LX_PR_GET_PDEATHSIG		2
+#define	LX_PR_GET_DUMPABLE		3
+#define	LX_PR_SET_DUMPABLE		4
+#define	LX_PR_GET_UNALIGN		5
+#define	LX_PR_SET_UNALIGN		6
+#define	LX_PR_GET_KEEPCAPS		7
+#define	LX_PR_SET_KEEPCAPS		8
+#define	LX_PR_GET_FPEMU			9
+#define	LX_PR_SET_FPEMU			10
+#define	LX_PR_GET_FPEXC			11
+#define	LX_PR_SET_FPEXC			12
+#define	LX_PR_GET_TIMING		13
+#define	LX_PR_SET_TIMING		14
+#define	LX_PR_SET_NAME			15
+#define	LX_PR_GET_NAME			16
+#define	LX_PR_GET_ENDIAN		19
+#define	LX_PR_SET_ENDIAN		20
+#define	LX_PR_GET_SECCOMP		21
+#define	LX_PR_SET_SECCOMP		22
+#define	LX_PR_CAPBSET_READ		23
+#define	LX_PR_CAPBSET_DROP		24
+#define	LX_PR_GET_TSC			25
+#define	LX_PR_SET_TSC			26
+#define	LX_PR_GET_SECUREBITS		27
+#define	LX_PR_SET_SECUREBITS		28
+#define	LX_PR_SET_TIMERSLACK		29
+#define	LX_PR_GET_TIMERSLACK		30
+#define	LX_PR_TASK_PERF_EVENTS_DISABLE	31
+#define	LX_PR_TASK_PERF_EVENTS_ENABLE	32
+#define	LX_PR_MCE_KILL			33
+#define	LX_PR_MCE_KILL_GET		34
+#define	LX_PR_SET_MM			35
+#define	LX_PR_SET_CHILD_SUBREAPER	36
+#define	LX_PR_GET_CHILD_SUBREAPER	37
+#define	LX_PR_SET_NO_NEW_PRIVS		38
+#define	LX_PR_GET_NO_NEW_PRIVS		39
+#define	LX_PR_GET_TID_ADDRESS		40
+#define	LX_PR_SET_THP_DISABLE		41
+#define	LX_PR_GET_THP_DISABLE		42
+
+#define	SECCOMP_MODE_FILTER	2
+
+/*
+ * Emulation of the Linux prctl(2) system call.
+ *
+ * 'opt' selects the operation (one of the LX_PR_* values above); the meaning
+ * of 'arg2' and 'arg3' depends on the option.  On success the option-specific
+ * value is returned (0 for most "set" operations).  On failure the result of
+ * set_errno() is returned.  Any option without a case below, and any
+ * LX_PR_SET_SECCOMP mode other than SECCOMP_MODE_FILTER, is reported through
+ * lx_unsupported() and fails with EINVAL.
+ */
+long
+lx_prctl(int opt, uintptr_t arg2, uintptr_t arg3)
+{
+	long err;
+	char ebuf[64];
+
+	switch (opt) {
+	case LX_PR_GET_DUMPABLE: {
+		/* Only track in brand data - could hook into SNOCD later */
+		lx_proc_data_t *lxpd;
+		int val;
+
+		mutex_enter(&curproc->p_lock);
+		VERIFY((lxpd = ptolxproc(curproc)) != NULL);
+		val = lxpd->l_flags & LX_PROC_NO_DUMP;
+		mutex_exit(&curproc->p_lock);
+
+		/* Dumpable (1) exactly when the no-dump flag is clear. */
+		return (val == 0);
+	}
+
+	case LX_PR_SET_DUMPABLE: {
+		lx_proc_data_t *lxpd;
+
+		if (arg2 != 0 && arg2 != 1) {
+			return (set_errno(EINVAL));
+		}
+
+		mutex_enter(&curproc->p_lock);
+		VERIFY((lxpd = ptolxproc(curproc)) != NULL);
+		if (arg2 == 0) {
+			lxpd->l_flags |= LX_PROC_NO_DUMP;
+		} else {
+			lxpd->l_flags &= ~LX_PROC_NO_DUMP;
+		}
+		mutex_exit(&curproc->p_lock);
+
+		return (0);
+	}
+
+	case LX_PR_GET_SECUREBITS: {
+		/* Our bits are always 0 */
+		return (0);
+	}
+
+	case LX_PR_SET_SECUREBITS: {
+		/* Ignore setting any bits from arg2 */
+		return (0);
+	}
+
+	case LX_PR_SET_KEEPCAPS: {
+		/*
+		 * The closest illumos analog to SET_KEEPCAPS is the PRIV_AWARE
+		 * flag.  There are probably some cases where it's not exactly
+		 * the same, but this will do for a first try.
+		 */
+		if (arg2 == 0) {
+			err = setpflags(PRIV_AWARE_RESET, 1, NULL);
+		} else {
+			err = setpflags(PRIV_AWARE, 1, NULL);
+		}
+
+		if (err != 0) {
+			return (set_errno(err));
+		}
+		return (0);
+	}
+
+	case LX_PR_GET_NAME: {
+		/*
+		 * We allow longer thread names than Linux for compatibility
+		 * with other OSes (Solaris, NetBSD) that also allow larger
+		 * names.  We just truncate (with NUL termination) if
+		 * the name is longer.
+		 */
+		char name[LX_PR_SET_NAME_NAMELEN] = { 0 };
+		kthread_t *t = curthread;
+
+		mutex_enter(&ttoproc(t)->p_lock);
+		if (t->t_name != NULL) {
+			(void) strlcpy(name, t->t_name, sizeof (name));
+		}
+		mutex_exit(&ttoproc(t)->p_lock);
+
+		/*
+		 * FWIW, the prctl(2) manpage says that the user-supplied
+		 * buffer should be at least 16 (LX_PR_SET_NAME_NAMELEN) bytes
+		 * long.
+		 */
+		if (copyout(name, (void *)arg2, LX_PR_SET_NAME_NAMELEN) != 0) {
+			return (set_errno(EFAULT));
+		}
+		return (0);
+	}
+
+	case LX_PR_SET_NAME: {
+		char name[LX_PR_SET_NAME_NAMELEN] = { 0 };
+		kthread_t *t = curthread;
+		proc_t *p = ttoproc(t);
+		int ret;
+
+		ret = copyinstr((const char *)arg2, name, sizeof (name), NULL);
+		/*
+		 * prctl(2) explicitly states that over length strings are
+		 * silently truncated
+		 */
+		if (ret != 0 && ret != ENAMETOOLONG) {
+			return (set_errno(EFAULT));
+		}
+		name[LX_PR_SET_NAME_NAMELEN - 1] = '\0';
+
+		if ((ret = thread_setname(t, name)) != 0) {
+			return (set_errno(ret));
+		}
+
+		/*
+		 * In Linux, PR_SET_NAME sets the name of the thread, not the
+		 * process.  Due to the historical quirks of Linux's asinine
+		 * thread model, this name is effectively the name of the
+		 * process (as visible via ps(1)) if the thread is the first of
+		 * its task group.  The first thread is therefore special, and
+		 * to best mimic Linux semantics we set the thread name, and if
+		 * we are setting LWP 1, we also update the name of the process.
+		 */
+		if (t->t_tid != 1) {
+			return (0);
+		}
+
+		/*
+		 * We are currently choosing to not allow an empty thread
+		 * name to clear p->p_user.u_comm and p->p_user.u_psargs.
+		 * This is a slight divergence from linux behavior (which
+		 * allows this) so that we can preserve the original command.
+		 */
+		if (strlen(name) == 0) {
+			return (0);
+		}
+
+		/*
+		 * We explicitly use t->t_name here instead of name in case
+		 * a thread has come in between the above thread_setname()
+		 * call and the setting of u_comm/u_psargs below.  On Linux,
+		 * one can also change the name of a thread (either itself or
+		 * another thread in the same process) via writing to /proc, so
+		 * while racy, this is no worse than what might happen on
+		 * Linux.
+		 */
+		mutex_enter(&p->p_lock);
+		(void) strncpy(p->p_user.u_comm, t->t_name, MAXCOMLEN + 1);
+		(void) strncpy(p->p_user.u_psargs, t->t_name, PSARGSZ);
+		/*
+		 * strncpy() leaves the destination unterminated when the
+		 * source fills it.  Because t->t_name may have been replaced
+		 * (see above) by something longer than the name we truncated,
+		 * terminate both buffers explicitly.
+		 */
+		p->p_user.u_comm[MAXCOMLEN] = '\0';
+		p->p_user.u_psargs[PSARGSZ - 1] = '\0';
+		mutex_exit(&p->p_lock);
+		return (0);
+	}
+
+	case LX_PR_GET_PDEATHSIG: {
+		int sig;
+		lx_proc_data_t *lxpd;
+
+		mutex_enter(&curproc->p_lock);
+		VERIFY((lxpd = ptolxproc(curproc)) != NULL);
+		sig = lxpd->l_parent_deathsig;
+		mutex_exit(&curproc->p_lock);
+
+		return (sig);
+	}
+
+	case LX_PR_SET_PDEATHSIG: {
+		int sig = lx_ltos_signo((int)arg2, 0);
+		proc_t *pp = NULL;
+		lx_proc_data_t *lxpd;
+
+		/* A non-zero Linux signal that failed to translate. */
+		if (sig == 0 && arg2 != 0) {
+			return (set_errno(EINVAL));
+		}
+
+		mutex_enter(&pidlock);
+		/* Set signal on our self */
+		mutex_enter(&curproc->p_lock);
+		VERIFY((lxpd = ptolxproc(curproc)) != NULL);
+		lxpd->l_parent_deathsig = sig;
+		pp = curproc->p_parent;
+		mutex_exit(&curproc->p_lock);
+
+		/* Configure parent to potentially signal children on death */
+		mutex_enter(&pp->p_lock);
+		if (PROC_IS_BRANDED(pp)) {
+			VERIFY((lxpd = ptolxproc(pp)) != NULL);
+			/*
+			 * Mark the parent as having children which wish to be
+			 * signaled on death of parent.
+			 */
+			lxpd->l_flags |= LX_PROC_CHILD_DEATHSIG;
+		} else {
+			/*
+			 * If the parent is not a branded process, the needed
+			 * hooks to facilitate this mechanism will not fire
+			 * when it dies. We lie about success in this case.
+			 */
+			/* EMPTY */
+		}
+		mutex_exit(&pp->p_lock);
+		mutex_exit(&pidlock);
+		return (0);
+	}
+
+	case LX_PR_CAPBSET_DROP: {
+		/*
+		 * On recent versions of Linux the login svc drops capabilities
+		 * and if that fails the svc dies and is restarted by systemd.
+		 * For now we pretend dropping capabilities succeeded.
+		 */
+		return (0);
+	}
+
+	case LX_PR_SET_NO_NEW_PRIVS: {
+		/*
+		 * On recent versions of Linux more services are starting to set
+		 * NoNewPrivs=yes in their systemd unit file. Since we currently
+		 * just return success for LX_PR_CAPBSET_DROP there is currently
+		 * no need to map this to the illumos privileges.
+		 */
+		return (0);
+	}
+
+	case LX_PR_GET_NO_NEW_PRIVS: {
+		/*
+		 * Some Linux applications (such as Elasticsearch) use
+		 * PR_SET_NO_NEW_PRIVS to enable the NoNewPrivs flag, and then
+		 * query the flag status, and fail unless the query indicates
+		 * the flag is enabled. Since we return success in
+		 * LX_PR_SET_NO_NEW_PRIVS, we assume here that the application
+		 * has intended to enable the flag, so we return 1 indicating
+		 * that the flag is enabled.
+		 */
+		return (1);
+	}
+
+	case LX_PR_SET_SECCOMP: {
+		if (arg2 == SECCOMP_MODE_FILTER) {
+			if (arg3 == (uintptr_t)NULL) {
+				/*
+				 * prctl(2) says PR_SET_SECCOMP should indicate
+				 * EFAULT if arg3 is an invalid address.
+				 */
+				return (set_errno(EFAULT));
+			}
+
+			/*
+			 * Some Linux applications install seccomp BPF rules.
+			 * For example, Elasticsearch installs rules that
+			 * prevent fork/exec. Since Illumos doesn't have an
+			 * analogous system call rule engine, for now we just
+			 * pretend and lie that the rule installation
+			 * succeeded.
+			 */
+			/* So we can track who needs this */
+			DTRACE_PROBE(lx__SECCOMP);
+			return (0);
+		}
+		/* Other seccomp modes fall through to "unsupported". */
+		break;
+	}
+
+	case LX_PR_GET_SECCOMP: {
+		/*
+		 * Some Linux applications (such as Elasticsearch) use
+		 * PR_SET_SECCOMP to install seccomp BPF rules, and then query
+		 * the status, and fail unless the query indicates its rules
+		 * are installed. Since we return success in LX_PR_SET_SECCOMP,
+		 * we assume here that the application has intended to install
+		 * its rules, so we return SECCOMP_MODE_FILTER indicating that
+		 * its rules are installed.
+		 */
+		return (SECCOMP_MODE_FILTER);
+	}
+
+	default:
+		break;
+	}
+
+	(void) snprintf(ebuf, sizeof (ebuf), "prctl option %d", opt);
+	lx_unsupported(ebuf);
+	return (set_errno(EINVAL));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_priority.c b/usr/src/uts/common/brand/lx/syscall/lx_priority.c
new file mode 100644
index 0000000000..44c60b66bf
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_priority.c
@@ -0,0 +1,192 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/systm.h>
+#include <sys/procset.h>
+#include <sys/resource.h>
+#include <sys/priocntl.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+
+/* From uts/common/disp/priocntl.c */
+extern int donice(procset_t *, pcnice_t *);
+
+/*
+ * The Linux syscall returns priorities in the range (highest) 40-1 (lowest)
+ * and then glibc adjusts these to the range -20 - 19.
+ *
+ * 'which' is one of PRIO_PROCESS, PRIO_PGRP or PRIO_USER and 'who' is the
+ * matching id (0 meaning "the caller").  Fails with EINVAL for an unknown
+ * 'which' and with ESRCH for a negative or untranslatable 'who'.
+ */
+long
+lx_getpriority(int which, id_t who)
+{
+	int rval;
+	idtype_t idtype;
+	id_t id, lid;
+	pcnice_t pcnice;
+	procset_t procset;
+
+	switch (which) {
+	case PRIO_PROCESS:
+		idtype = P_PID;
+		/* Translate the Linux pid in place; 'lid' is not used. */
+		if (who > 0 && lx_lpid_to_spair(who, &who, &lid) < 0)
+			return (set_errno(ESRCH));
+		break;
+	case PRIO_PGRP:
+		idtype = P_PGID;
+		break;
+	case PRIO_USER:
+		idtype = P_UID;
+		break;
+	default:
+		return (set_errno(EINVAL));
+	}
+
+	/* Linux fails with a different errno on a negative id */
+	if (who < 0)
+		return (set_errno(ESRCH));
+
+	id = (who == 0 ? P_MYID : who);
+
+	pcnice.pc_val = 0;
+	pcnice.pc_op = PC_GETNICE;
+
+	setprocset(&procset, POP_AND, idtype, id, P_ALL, 0);
+
+	rval = donice(&procset, &pcnice);
+	if (rval != 0) {
+		if (which == PRIO_PROCESS &&
+		    (who == curproc->p_pid || who == 0) &&
+		    strcmp(sclass[curthread->t_cid].cl_name, "RT") == 0) {
+			/*
+			 * donice() will always return EINVAL if we're in the
+			 * RT class. The zone won't be able to put itself or any
+			 * of its processes into RT, but if we put the whole
+			 * zone into RT via the scheduling-class property, then
+			 * getpriority would always fail. This breaks pam and
+			 * prevents any login. Just pretend to be the highest
+			 * priority.
+			 */
+			return (40);
+		}
+
+		/*
+		 * Linux does not return EINVAL for invalid 'who' values, it
+		 * returns ESRCH instead. We already validated 'which' above.
+		 */
+		if (rval == EINVAL)
+			rval = ESRCH;
+		return (set_errno(rval));
+	}
+
+	/*
+	 * The return value of the getpriority syscall is biased by 20 to avoid
+	 * returning negative values when successful (-20 internally is our
+	 * highest priority and 19 is our lowest).
+	 */
+	return (20 - pcnice.pc_val);
+}
+
+/*
+ * Return EPERM if the current process is not allowed to operate on the target
+ * process (which is part of the procset for setpriority).
+ */
+/* ARGSUSED */
+static int
+lx_chk_pripriv(proc_t *pp, char *dummy)
+{
+	int err;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	/* Evaluate the permission check while holding the target's p_lock. */
+	mutex_enter(&pp->p_lock);
+	err = prochasprocperm(pp, curproc, CRED()) ? 0 : EPERM;
+	mutex_exit(&pp->p_lock);
+
+	return (err);
+}
+
+/*
+ * Linux setpriority(2).  'which'/'who' select the target set exactly as for
+ * lx_getpriority(); 'prio' is clamped to the range [-NZERO, NZERO - 1]
+ * before being handed to donice().
+ */
+long
+lx_setpriority(int which, id_t who, int prio)
+{
+	int error;
+	idtype_t idtype;
+	id_t lid;
+	pcnice_t nice;
+	procset_t pset;
+
+	if (which == PRIO_PROCESS) {
+		idtype = P_PID;
+		if (who > 0 && lx_lpid_to_spair(who, &who, &lid) < 0)
+			return (set_errno(ESRCH));
+	} else if (which == PRIO_PGRP) {
+		idtype = P_PGID;
+	} else if (which == PRIO_USER) {
+		idtype = P_UID;
+	} else {
+		return (set_errno(EINVAL));
+	}
+
+	/* Linux fails with a different errno on a negative id */
+	if (who < 0)
+		return (set_errno(ESRCH));
+
+	/* Clamp the requested value into the supported nice range. */
+	if (prio >= NZERO)
+		prio = NZERO - 1;
+	if (prio < -NZERO)
+		prio = -NZERO;
+
+	nice.pc_val = prio;
+	nice.pc_op = PC_SETNICE;
+
+	setprocset(&pset, POP_AND, idtype, (who == 0 ? P_MYID : who),
+	    P_ALL, 0);
+
+	error = donice(&pset, &nice);
+	if (error == 0)
+		return (0);
+
+	/*
+	 * donice() returns EPERM under two conditions:
+	 * 1) if either the real or eff. uid don't match
+	 * 2) we lack the privileges to raise the priority
+	 *
+	 * setpriority() must instead report:
+	 *	EPERM - real or eff. uid did not match
+	 *	EACCES - trying to increase priority
+	 *
+	 * lx_chk_pripriv tells the two cases apart.  The native
+	 * setpriority(3C) code has the same race on re-checking.
+	 *
+	 * Once we fully support Linux capabilities, the uid test should be
+	 * replaced by a check of the CAP_SYS_NICE capability.
+	 */
+	if (error == EPERM && crgetuid(CRED()) != 0 &&
+	    dotoprocs(&pset, lx_chk_pripriv, NULL) != EPERM)
+		error = EACCES;
+
+	return (set_errno(error));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rename.c b/usr/src/uts/common/brand/lx/syscall/lx_rename.c
new file mode 100644
index 0000000000..2fad627771
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rename.c
@@ -0,0 +1,39 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/fcntl.h>
+#include <sys/lx_fcntl.h>
+
+/* From uts/common/syscall/rename.c */
+extern int rename(char *, char *);
+extern int renameat(int, char *, int, char *);
+
+/* Linux rename(2): passed straight through to the native rename(). */
+long
+lx_rename(char *p1, char *p2)
+{
+	return (rename(p1, p2));
+}
+
+/* Linux renameat(2): map LX_AT_FDCWD to AT_FDCWD, then call renameat(). */
+long
+lx_renameat(int atfd1, char *p1, int atfd2, char *p2)
+{
+	int fromfd = (atfd1 == LX_AT_FDCWD) ? AT_FDCWD : atfd1;
+	int tofd = (atfd2 == LX_AT_FDCWD) ? AT_FDCWD : atfd2;
+
+	return (renameat(fromfd, p1, tofd, p2));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c
new file mode 100644
index 0000000000..eadc588824
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c
@@ -0,0 +1,597 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/zone.h>
+#include <sys/cpuvar.h>
+#include <sys/cmn_err.h>
+#include <sys/lx_impl.h>
+#include <sys/lx_brand.h>
+#include <sys/sysmacros.h>
+#include <sys/var.h>
+
+#define	LX_RLIMIT_CPU		0
+#define	LX_RLIMIT_FSIZE		1
+#define	LX_RLIMIT_DATA		2
+#define	LX_RLIMIT_STACK		3
+#define	LX_RLIMIT_CORE		4
+#define	LX_RLIMIT_RSS		5
+#define	LX_RLIMIT_NPROC		6
+#define	LX_RLIMIT_NOFILE	7
+#define	LX_RLIMIT_MEMLOCK	8
+#define	LX_RLIMIT_AS		9
+#define	LX_RLIMIT_LOCKS		10	/* NA limit on locks, early 2.4 only */
+#define	LX_RLIMIT_SIGPENDING	11
+#define	LX_RLIMIT_MSGQUEUE	12
+#define	LX_RLIMIT_NICE		13	/* NA ceiling for nice */
+#define	LX_RLIMIT_RTPRIO	14	/* NA ceiling on the RT priority */
+#define	LX_RLIMIT_RTTIME	15	/* NA cpu limit for RT proc. */
+
+#define	LX_RLIMIT_NLIMITS	16
+
+/*
+ * True when the rctl value pointed to by 'x' has both the local "maximal" and
+ * global "infinite" flags set.  The argument is parenthesized so that any
+ * pointer expression may be passed, not just a plain identifier.
+ */
+#define	RCTL_INFINITE(x) \
+	(((x)->rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \
+	((x)->rcv_flagaction & RCTL_GLOBAL_INFINITE))
+
+/* Layout of struct rlimit as seen by a native (64-bit) Linux caller. */
+typedef struct {
+	ulong_t	rlim_cur;
+	ulong_t	rlim_max;
+} lx_rlimit_t;
+
+/* Layout of struct rlimit as seen by a 32-bit Linux caller. */
+typedef struct {
+	uint32_t rlim_cur;
+	uint32_t rlim_max;
+} lx_rlimit32_t;
+
+/*
+ * Linux supports many of the same resources that we do, but on illumos these
+ * are rctls. Instead of using rlimit, we use rctls for all of the limits.
+ * This table is used to translate Linux rlimit keys into the illumos legacy
+ * rlimit. We then primarily use the rctl/rlimit compatibility code to
+ * manage these.
+ */
+static int l_to_r[LX_RLIMIT_NLIMITS] = {
+	RLIMIT_CPU,		/* 0 CPU */
+	RLIMIT_FSIZE,		/* 1 FSIZE */
+	RLIMIT_DATA,		/* 2 DATA */
+	RLIMIT_STACK,		/* 3 STACK */
+	RLIMIT_CORE,		/* 4 CORE */
+	-1,			/* 5 RSS */
+	-1,			/* 6 NPROC */
+	RLIMIT_NOFILE,		/* 7 NOFILE */
+	-1,			/* 8 MEMLOCK */
+	RLIMIT_AS,		/* 9 AS */
+	-1,			/* 10 LOCKS */
+	-1,			/* 11 SIGPENDING */
+	-1,			/* 12 MSGQUEUE */
+	-1,			/* 13 NICE */
+	-1,			/* 14 RTPRIO */
+	-1			/* 15 RTTIME */
+};
+
+/*
+ * Magic value Linux uses to indicate infinity
+ */
+#define	LX_RLIM_INFINITY_N	ULONG_MAX
+
+/*
+ * Read the current process's local values of the rctl named 'nm' into
+ * 'rlp64': a finite basic value becomes rlim_cur and a finite privileged
+ * value becomes rlim_max.  Anything not found is left at RLIM_INFINITY, and a
+ * missing soft limit is filled in from a finite hard limit.
+ */
+void
+lx_get_rctl(char *nm, struct rlimit64 *rlp64)
+{
+	rctl_hndl_t hndl;
+	rctl_val_t *val;
+
+	rlp64->rlim_cur = RLIM_INFINITY;
+	rlp64->rlim_max = RLIM_INFINITY;
+
+	val = kmem_alloc(sizeof (rctl_val_t), KM_SLEEP);
+	mutex_enter(&curproc->p_lock);
+
+	hndl = rctl_hndl_lookup(nm);
+	if (hndl != -1) {
+		rctl_val_t *prev = NULL;
+
+		/* The same buffer serves as both cursor and result. */
+		while (rctl_local_get(hndl, prev, val, curproc) == 0) {
+			prev = val;
+
+			if (RCTL_INFINITE(val))
+				continue;
+
+			if (val->rcv_privilege == RCPRIV_BASIC)
+				rlp64->rlim_cur = val->rcv_value;
+			else if (val->rcv_privilege == RCPRIV_PRIVILEGED)
+				rlp64->rlim_max = val->rcv_value;
+		}
+	}
+
+	mutex_exit(&curproc->p_lock);
+	kmem_free(val, sizeof (rctl_val_t));
+
+	if (rlp64->rlim_cur == RLIM_INFINITY &&
+	    rlp64->rlim_max != RLIM_INFINITY)
+		rlp64->rlim_cur = rlp64->rlim_max;
+}
+
+/*
+ * Fetch the soft and hard limit for the Linux resource 'lx_resource' into
+ * '*rlim_curp' and '*rlim_maxp'.  Returns 0 on success or EINVAL (as a plain
+ * errno value, not via set_errno()) when the resource number is out of range.
+ */
+static int
+lx_getrlimit_common(int lx_resource, uint64_t *rlim_curp, uint64_t *rlim_maxp)
+{
+	lx_proc_data_t *pd = ptolxproc(curproc);
+	struct rlimit64 rlim64;
+	int64_t cur, max;
+	int fake = -1;
+
+	if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS)
+		return (EINVAL);
+
+	switch (lx_resource) {
+	/* Limits that are only remembered in the brand data. */
+	case LX_RLIMIT_LOCKS:
+		fake = LX_RLFAKE_LOCKS;
+		break;
+
+	case LX_RLIMIT_NICE:
+		fake = LX_RLFAKE_NICE;
+		break;
+
+	case LX_RLIMIT_RTPRIO:
+		fake = LX_RLFAKE_RTPRIO;
+		break;
+
+	case LX_RLIMIT_RTTIME:
+		fake = LX_RLFAKE_RTTIME;
+		break;
+
+	case LX_RLIMIT_RSS:
+		/* zone.max-physical-memory */
+		zone_get_physmem_data(curzone->zone_id,
+		    (pgcnt_t *)&rlim64.rlim_cur,
+		    (pgcnt_t *)&rlim64.rlim_max); /* max is dummy variable */
+		rlim64.rlim_cur = rlim64.rlim_max = ptob(rlim64.rlim_cur);
+		break;
+
+	case LX_RLIMIT_NPROC:
+		/*
+		 * This is a limit on the number of processes for a
+		 * real user ID (not enforced for privileged processes).
+		 *
+		 * This is analogous to v.v_maxup but is further capped
+		 * by zone.max-processes
+		 */
+		rlim64.rlim_cur = rlim64.rlim_max =
+		    MIN(v.v_maxup, curzone->zone_nprocs_ctl);
+		break;
+
+	case LX_RLIMIT_MEMLOCK:
+		lx_get_rctl("process.max-locked-memory", &rlim64);
+
+		/* If unlimited, use zone.max-locked-memory */
+		if (rlim64.rlim_max == RLIM64_INFINITY)
+			rlim64.rlim_max = curzone->zone_locked_mem_ctl;
+		if (rlim64.rlim_cur == RLIM64_INFINITY)
+			rlim64.rlim_cur = curzone->zone_locked_mem_ctl;
+		break;
+
+	case LX_RLIMIT_SIGPENDING:
+		lx_get_rctl("process.max-sigqueue-size", &rlim64);
+		break;
+
+	case LX_RLIMIT_MSGQUEUE:
+		lx_get_rctl("process.max-msg-messages", &rlim64);
+		break;
+
+	default:
+		/* Everything else maps onto a legacy illumos rlimit. */
+		mutex_enter(&curproc->p_lock);
+		(void) rctl_rlimit_get(rctlproc_legacy[l_to_r[lx_resource]],
+		    curproc, &rlim64);
+		mutex_exit(&curproc->p_lock);
+		break;
+	}
+
+	if (fake != -1) {
+		rlim64.rlim_cur = pd->l_fake_limits[fake].rlim_cur;
+		rlim64.rlim_max = pd->l_fake_limits[fake].rlim_max;
+	}
+
+	/* Translate the native notion of "infinite" into the Linux one. */
+	cur = (rlim64.rlim_cur == RLIM64_INFINITY) ?
+	    LX_RLIM_INFINITY_N : rlim64.rlim_cur;
+	max = (rlim64.rlim_max == RLIM64_INFINITY) ?
+	    LX_RLIM_INFINITY_N : rlim64.rlim_max;
+
+	if (lx_resource == LX_RLIMIT_STACK && cur > INT_MAX) {
+		/*
+		 * NPTL uses the RLIMIT_STACK soft limit as the _default_
+		 * thread stack size whenever it is not "unlimited" (see the
+		 * Linux pthread_create() man page).  A typical stack limit
+		 * for us is 32TB, which is unusable as a default stack size,
+		 * so any stack limit above the old (32-bit) infinity value is
+		 * reported to Linux as infinite for both soft and hard limit.
+		 */
+		*rlim_curp = LX_RLIM64_INFINITY;
+		*rlim_maxp = LX_RLIM64_INFINITY;
+	} else {
+		*rlim_curp = cur;
+		*rlim_maxp = max;
+	}
+
+	return (0);
+}
+
+/*
+ * This is the 'new' getrlimit, variously called getrlimit or ugetrlimit
+ * in Linux headers and code. The only difference between this and the old
+ * getrlimit (variously called getrlimit or old_getrlimit) is the value of
+ * RLIM_INFINITY, which is smaller for the older version. Modern code will
+ * use this version by default.
+ */
+long
+lx_getrlimit(int resource, lx_rlimit_t *rlp)
+{
+	uint64_t cur, max;
+	int err;
+
+	if ((err = lx_getrlimit_common(resource, &cur, &max)) != 0)
+		return (set_errno(err));
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		lx_rlimit_t out;
+
+		/* Infinite or out-of-range values become the Linux infinity. */
+		out.rlim_cur = (cur == LX_RLIM64_INFINITY ||
+		    cur > LX_RLIM_INFINITY_N) ?
+		    LX_RLIM_INFINITY_N : (ulong_t)cur;
+		out.rlim_max = (max == LX_RLIM64_INFINITY ||
+		    max > LX_RLIM_INFINITY_N) ?
+		    LX_RLIM_INFINITY_N : (ulong_t)max;
+
+		if (copyout(&out, rlp, sizeof (out)) != 0)
+			return (set_errno(EFAULT));
+
+		return (0);
+	}
+#ifdef _SYSCALL32_IMPL
+	{
+		lx_rlimit32_t out32;
+
+		/* 32-bit callers see everything saturated at UINT_MAX. */
+		out32.rlim_cur = (cur > UINT_MAX) ? UINT_MAX : (uint32_t)cur;
+		out32.rlim_max = (max > UINT_MAX) ? UINT_MAX : (uint32_t)max;
+
+		if (copyout(&out32, rlp, sizeof (out32)) != 0)
+			return (set_errno(EFAULT));
+	}
+#endif
+
+	return (0);
+}
+
+/*
+ * This is the 'old' getrlimit, variously called getrlimit or old_getrlimit
+ * in Linux headers and code. The only difference between this and the new
+ * getrlimit (variously called getrlimit or ugetrlimit) is the value of
+ * RLIM_INFINITY, which is smaller for the older version.
+ *
+ * This is only used for 32-bit code.
+ */
+long
+lx_oldgetrlimit(int resource, lx_rlimit_t *rlp)
+{
+	int rv;
+	lx_rlimit32_t rl32;
+	uint64_t rlim_cur, rlim_max;
+
+	rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max);
+	if (rv != 0)
+		return (set_errno(rv));
+
+	/* The old interface saturates both limits at INT_MAX. */
+	if (rlim_cur > INT_MAX)
+		rl32.rlim_cur = INT_MAX;
+	else
+		rl32.rlim_cur = (ulong_t)rlim_cur;
+
+	if (rlim_max > INT_MAX)
+		rl32.rlim_max = INT_MAX;
+	else
+		rl32.rlim_max = (ulong_t)rlim_max;
+
+	if (copyout(&rl32, rlp, sizeof (rl32)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+/*
+ * Apply the soft/hard pair in 'rlp64' to the current process's rctl named
+ * 'nm'.  Returns 0 or the errno value produced by rctl_rlimit_set(); errno
+ * itself is not set here.
+ */
+static int
+lx_set_rctl(char *nm, struct rlimit64 *rlp64)
+{
+	int err;
+	rctl_hndl_t hndl;
+	rctl_alloc_gp_t *gp;
+
+	/* Preallocate before taking p_lock. */
+	gp = rctl_rlimit_set_prealloc(1);
+
+	mutex_enter(&curproc->p_lock);
+
+	hndl = rctl_hndl_lookup(nm);
+
+	/*
+	 * We're not supposed to do this but since we want all our rctls to
+	 * behave like rlimits, we take advantage of this function to set up
+	 * this way.
+	 */
+	err = rctl_rlimit_set(hndl, curproc, rlp64, gp, RCTL_LOCAL_DENY, 0,
+	    CRED());
+
+	mutex_exit(&curproc->p_lock);
+
+	rctl_prealloc_destroy(gp);
+
+	return (err);
+}
+
+/*
+ * Set the soft and hard limit for the Linux resource 'lx_resource'.
+ *
+ * Returns 0 on success or a plain errno value on failure.  This function
+ * never calls set_errno() itself: every caller passes a non-zero result to
+ * set_errno(), so returning set_errno()'s own return value from here would
+ * replace the real error number.
+ */
+static int
+lx_setrlimit_common(int lx_resource, uint64_t rlim_cur, uint64_t rlim_max)
+{
+	lx_proc_data_t *pd = ptolxproc(curproc);
+	int err;
+	int resource;
+	rctl_alloc_gp_t *gp;
+	struct rlimit64 rl64;
+
+	if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS)
+		return (EINVAL);
+
+	switch (lx_resource) {
+	/* These four are simply remembered in the brand data. */
+	case LX_RLIMIT_LOCKS:
+		pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = rlim_cur;
+		pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = rlim_max;
+		break;
+
+	case LX_RLIMIT_NICE:
+		pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = rlim_cur;
+		pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = rlim_max;
+		break;
+
+	case LX_RLIMIT_RTPRIO:
+		pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = rlim_cur;
+		pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = rlim_max;
+		break;
+
+	case LX_RLIMIT_RTTIME:
+		pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = rlim_cur;
+		pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = rlim_max;
+		break;
+
+	case LX_RLIMIT_RSS:
+		/*
+		 * zone.max-physical-memory
+		 * Since we're emulating the value via a zone rctl, we can't
+		 * set that from within the zone. Lie and say we set the value.
+		 */
+		break;
+
+	case LX_RLIMIT_NPROC:
+		/*
+		 * zone.max-processes
+		 * Since we're emulating the value via a zone rctl, we can't
+		 * set that from within the zone. Lie and say we set the value.
+		 */
+		break;
+
+	case LX_RLIMIT_MEMLOCK:
+		/*
+		 * We allow setting to unlimited (LX_RLIM_INFINITY_N). The zone
+		 * limit will always apply.
+		 */
+		rl64.rlim_cur = rlim_cur;
+		rl64.rlim_max = rlim_max;
+		err = lx_set_rctl("process.max-locked-memory", &rl64);
+		if (err != 0)
+			return (err);
+		break;
+
+	case LX_RLIMIT_SIGPENDING:
+		/*
+		 * On Ubuntu at least, the login and sshd processes expect to
+		 * set this limit to 16k and login will fail if this fails. On
+		 * illumos we have a system limit of 8k and normally the
+		 * privileged limit is 512. We simply pretend this works to
+		 * allow login to work.
+		 */
+		if (rlim_max > 8192)
+			return (0);
+
+		rl64.rlim_cur = rlim_cur;
+		rl64.rlim_max = rlim_max;
+		if ((err = lx_set_rctl("process.max-sigqueue-size", &rl64))
+		    != 0)
+			return (err);
+		break;
+
+	case LX_RLIMIT_MSGQUEUE:
+		rl64.rlim_cur = rlim_cur;
+		rl64.rlim_max = rlim_max;
+		if ((err = lx_set_rctl("process.max-msg-messages", &rl64)) != 0)
+			return (err);
+		break;
+
+	default:
+		resource = l_to_r[lx_resource];
+
+		/*
+		 * Linux limits the max number of open files to 1m and there is
+		 * a test for this.
+		 */
+		if (lx_resource == LX_RLIMIT_NOFILE && rlim_max > (1024 * 1024))
+			return (EPERM);
+
+		rl64.rlim_cur = rlim_cur;
+		rl64.rlim_max = rlim_max;
+		gp = rctl_rlimit_set_prealloc(1);
+
+		mutex_enter(&curproc->p_lock);
+		err = rctl_rlimit_set(rctlproc_legacy[resource], curproc,
+		    &rl64, gp, rctlproc_flags[resource],
+		    rctlproc_signals[resource], CRED());
+		mutex_exit(&curproc->p_lock);
+
+		rctl_prealloc_destroy(gp);
+		if (err != 0)
+			return (err);
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * Linux setrlimit(2).  Copies in the caller's rlimit structure (native or
+ * 32-bit layout), rejects a soft limit above the hard limit with EINVAL, and
+ * converts the Linux infinity value to LX_RLIM64_INFINITY before applying the
+ * limits.
+ */
+long
+lx_setrlimit(int resource, lx_rlimit_t *rlp)
+{
+	int rv;
+	lx_rlimit_t rl;
+	uint64_t rlim_cur, rlim_max;
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (copyin(rlp, &rl, sizeof (rl)) != 0)
+			return (set_errno(EFAULT));
+	}
+#ifdef _SYSCALL32_IMPL
+	else {
+		lx_rlimit32_t rl32;
+
+		if (copyin(rlp, &rl32, sizeof (rl32)) != 0)
+			return (set_errno(EFAULT));
+
+		rl.rlim_cur = rl32.rlim_cur;
+		rl.rlim_max = rl32.rlim_max;
+	}
+#endif
+
+	if ((rl.rlim_max != LX_RLIM_INFINITY_N &&
+	    rl.rlim_cur == LX_RLIM_INFINITY_N) ||
+	    rl.rlim_cur > rl.rlim_max)
+		return (set_errno(EINVAL));
+
+	if (rl.rlim_cur == LX_RLIM_INFINITY_N)
+		rlim_cur = LX_RLIM64_INFINITY;
+	else
+		rlim_cur = rl.rlim_cur;
+
+	if (rl.rlim_max == LX_RLIM_INFINITY_N)
+		rlim_max = LX_RLIM64_INFINITY;
+	else
+		rlim_max = rl.rlim_max;
+
+	rv = lx_setrlimit_common(resource, rlim_cur, rlim_max);
+	if (rv != 0)
+		return (set_errno(rv));
+	return (0);
+}
+
+/*
+ * From the man page:
+ * The Linux-specific prlimit() system call combines and extends the
+ * functionality of setrlimit() and getrlimit(). It can be used to both set
+ * and get the resource limits of an arbitrary process.
+ *
+ * If pid is 0, then the call applies to the calling process.
+ */
+long
+lx_prlimit64(pid_t pid, int resource, lx_rlimit64_t *nrlp, lx_rlimit64_t *orlp)
+{
+	int rv;
+	lx_rlimit64_t nrl, orl;
+
+	if (pid != 0) {
+		/* XXX TBD if needed */
+		char buf[80];
+
+		(void) snprintf(buf, sizeof (buf),
+		    "setting prlimit %d for another process\n", resource);
+		lx_unsupported(buf);
+		/*
+		 * Fail through set_errno() like every other error path here;
+		 * returning the bare errno value would look like a successful
+		 * call with a positive return value.
+		 */
+		return (set_errno(ENOTSUP));
+	}
+
+	if (orlp != NULL) {
+		/* we first get the current limits */
+		rv = lx_getrlimit_common(resource, &orl.rlim_cur,
+		    &orl.rlim_max);
+		if (rv != 0)
+			return (set_errno(rv));
+	}
+
+	if (nrlp != NULL) {
+		if (copyin(nrlp, &nrl, sizeof (nrl)) != 0)
+			return (set_errno(EFAULT));
+
+		/* The soft limit may not exceed the hard limit. */
+		if ((nrl.rlim_max != LX_RLIM64_INFINITY &&
+		    nrl.rlim_cur == LX_RLIM64_INFINITY) ||
+		    nrl.rlim_cur > nrl.rlim_max)
+			return (set_errno(EINVAL));
+
+		rv = lx_setrlimit_common(resource, nrl.rlim_cur, nrl.rlim_max);
+		if (rv != 0)
+			return (set_errno(rv));
+	}
+
+	if (orlp != NULL) {
+		/* now return the original limits, if necessary */
+		if (copyout(&orl, orlp, sizeof (orl)) != 0)
+			return (set_errno(EFAULT));
+	}
+
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rw.c b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
new file mode 100644
index 0000000000..34aafcaf5d
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
@@ -0,0 +1,956 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2017 Joyent, Inc.
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/nbmlock.h> +#include <sys/limits.h> + +/* uts/common/syscall/rw.c */ +extern size_t copyout_max_cached; + + +/* Common routines */ + +static int +lx_iovec_copyin(void *uiovp, int iovcnt, iovec_t *kiovp, ssize_t *count) +{ +#ifdef _SYSCALL32_IMPL + /* + * 32-bit callers need to have their iovec expanded, while ensuring + * that they can't move more than 2Gbytes of data in a single call. + */ + if (get_udatamodel() == DATAMODEL_ILP32) { + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len = 0; + ssize32_t total32 = 0; + int i; + + if (iovcnt > IOV_MAX_STACK) { + aiov32len = iovcnt * sizeof (iovec32_t); + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + } + + if (copyin(uiovp, aiov32, iovcnt * sizeof (iovec32_t))) { + if (aiov32len != 0) { + kmem_free(aiov32, aiov32len); + } + return (EFAULT); + } + + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32 = aiov32[i].iov_len; + total32 += iovlen32; + if (iovlen32 < 0 || total32 < 0) { + if (aiov32len != 0) { + kmem_free(aiov32, aiov32len); + } + return (EINVAL); + } + kiovp[i].iov_len = iovlen32; + kiovp[i].iov_base = + (caddr_t)(uintptr_t)aiov32[i].iov_base; + /* Linux does a basic sanity test on the address */ + if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT32) { + if (aiov32len != 0) { + kmem_free(aiov32, aiov32len); + } + return (EFAULT); + } + } + *count = total32; + + if (aiov32len != 0) + kmem_free(aiov32, aiov32len); + } else +#endif + { + ssize_t total = 0; + int i; + + if (copyin(uiovp, kiovp, iovcnt * sizeof (iovec_t))) + return (EFAULT); + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = kiovp[i].iov_len; + total += iovlen; + if (iovlen < 0 || total < 0) { + return (EINVAL); + } + /* Linux does a basic sanity test on the address */ + if ((uintptr_t)kiovp[i].iov_base >= USERLIMIT) { + return (EFAULT); + } 
+ } + *count = total; + } + return (0); +} + +int +lx_read_common(file_t *fp, uio_t *uiop, size_t *nread, boolean_t positioned) +{ + vnode_t *vp = fp->f_vnode; + int error = 0, rwflag = 0, ioflag; + ssize_t count = uiop->uio_resid; + size_t rcount = 0; + struct cpu *cp; + boolean_t in_crit = B_FALSE; + + if (fp->f_vnode->v_type == VDIR) { + return (EISDIR); + } + if (positioned && + (fp->f_vnode->v_type == VFIFO || fp->f_vnode->v_type == VSOCK)) { + return (ESPIPE); + } + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. + */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = B_TRUE; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_READ, uiop->uio_offset, count, svmand, + NULL) != 0) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + /* + * For non-positioned reads, recheck offset/count validity inside + * VOP_WRLOCK to prevent filesize from changing during validation. + */ + if (!positioned) { + u_offset_t uoffset = (u_offset_t)(ulong_t)fp->f_offset; + + if ((vp->v_type == VREG) && (uoffset >= OFFSET_MAX(fp))) { + struct vattr va; + + va.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &va, 0, fp->f_cred, NULL); + VOP_RWUNLOCK(vp, rwflag, NULL); + if (error != 0) + goto out; + /* We have to return EOF if fileoff is >= file size. */ + if (uoffset >= va.va_size) + goto out; + /* + * File is greater than or equal to maxoff and + * therefore we return EOVERFLOW. 
+ */ + error = EOVERFLOW; + goto out; + } + if ((vp->v_type == VREG) && + (uoffset + count > OFFSET_MAX(fp))) { + count = (ssize_t)(OFFSET_MAX(fp) - uoffset); + uiop->uio_resid = count; + } + uiop->uio_offset = uoffset; + } + ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + /* If read sync is not asked for, filter sync flags */ + if ((ioflag & FRSYNC) == 0) + ioflag &= ~(FSYNC|FDSYNC); + error = VOP_READ(vp, uiop, ioflag, fp->f_cred, NULL); + rcount = count - uiop->uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, sysread, 1); + CPU_STATS_ADDQ(cp, sys, readch, (ulong_t)rcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)rcount; + /* Store offset for non-positioned reads */ + if (!positioned) { + if (vp->v_type == VFIFO) { + /* Backward compatibility */ + fp->f_offset = rcount; + } else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (count != 0)) { + /* POSIX */ + fp->f_offset = uiop->uio_loffset; + } + } + VOP_RWUNLOCK(vp, rwflag, NULL); + +out: + if (in_crit) + nbl_end_crit(vp); + *nread = rcount; + return (error); +} + +int +lx_write_common(file_t *fp, uio_t *uiop, size_t *nwrite, boolean_t positioned) +{ + vnode_t *vp = fp->f_vnode; + int error = 0, rwflag = 1, ioflag; + ssize_t count = uiop->uio_resid; + size_t wcount = 0; + struct cpu *cp; + boolean_t in_crit = B_FALSE; + + if (positioned && + (fp->f_vnode->v_type == VFIFO || fp->f_vnode->v_type == VSOCK)) { + return (ESPIPE); + } + + /* + * We have to enter the critical region before calling VOP_RWLOCK + * to avoid a deadlock with ufs. 
+ */ + if (nbl_need_check(vp)) { + int svmand; + + nbl_start_crit(vp, RW_READER); + in_crit = B_TRUE; + error = nbl_svmand(vp, fp->f_cred, &svmand); + if (error != 0) + goto out; + if (nbl_conflict(vp, NBL_WRITE, uiop->uio_loffset, count, + svmand, NULL) != 0) { + error = EACCES; + goto out; + } + } + + (void) VOP_RWLOCK(vp, rwflag, NULL); + + if (!positioned) { + /* + * For non-positioned writes, the value of fp->f_offset is + * re-queried while inside VOP_RWLOCK. This ensures that other + * writes which alter the filesize will be taken into account. + */ + uiop->uio_loffset = fp->f_offset; + ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + } else { + /* + * In a senseless departure from POSIX, positioned write calls + * on Linux do _not_ ignore the O_APPEND flag. + */ + ioflag = uiop->uio_fmode & (FAPPEND|FSYNC|FDSYNC|FRSYNC); + } + if (vp->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)(ulong_t)uiop->uio_loffset; + + if (fileoff >= curproc->p_fsz_ctl) { + VOP_RWUNLOCK(vp, rwflag, NULL); + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + error = EFBIG; + goto out; + } + if (fileoff >= OFFSET_MAX(fp)) { + VOP_RWUNLOCK(vp, rwflag, NULL); + error = EFBIG; + goto out; + } + if (fileoff + count > OFFSET_MAX(fp)) { + count = (ssize_t)(OFFSET_MAX(fp) - fileoff); + uiop->uio_resid = count; + } + } + + error = VOP_WRITE(vp, uiop, ioflag, fp->f_cred, NULL); + wcount = count - uiop->uio_resid; + CPU_STATS_ENTER_K(); + cp = CPU; + CPU_STATS_ADDQ(cp, sys, syswrite, 1); + CPU_STATS_ADDQ(cp, sys, writech, (ulong_t)wcount); + CPU_STATS_EXIT_K(); + ttolwp(curthread)->lwp_ru.ioch += (ulong_t)wcount; + + /* Store offset for non-positioned writes */ + if (!positioned) { + if (vp->v_type == VFIFO) { + /* Backward compatibility */ + fp->f_offset = wcount; + } else if (((fp->f_flag & FAPPEND) == 0) || + (vp->v_type != VREG) || (count != 0)) { + /* POSIX 
*/ + fp->f_offset = uiop->uio_loffset; + } + } + VOP_RWUNLOCK(vp, rwflag, NULL); + +out: + if (in_crit) + nbl_end_crit(vp); + *nwrite = wcount; + return (error); +} + +/* + * The Linux routines for reading and writing data from file descriptors behave + * differently from their SunOS counterparts in a few key ways: + * + * - Passing an iovcnt of 0 to the vectored functions results in an error on + * SunOS, but on Linux it yields return value of 0. + * + * - If any data is successfully read or written, Linux will return a success. + * This is unlike SunOS which would return an error code for the entire + * operation in cases where vectors had gone unprocessed. + * + * - Breaking from POSIX, Linux positioned writes (pwrite/pwritev) on Linux + * will obey the O_APPEND flag if it is set on the descriptor. + */ + +ssize_t +lx_read(int fdes, void *cbuf, size_t ccount) +{ + struct uio auio; + struct iovec aiov; + file_t *fp; + ssize_t count = (ssize_t)ccount; + size_t nread = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + if (count <= copyout_max_cached) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_read_common(fp, &auio, &nread, B_FALSE); + + if (error == EINTR) { + if (nread != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (error != 0) + return (set_errno(error)); + return ((ssize_t)nread); +} + +ssize_t +lx_write(int fdes, void *cbuf, size_t ccount) +{ + struct uio 
auio; + struct iovec aiov; + file_t *fp; + ssize_t count = (ssize_t)ccount; + size_t nwrite = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_write_common(fp, &auio, &nwrite, B_FALSE); + + if (error == EINTR) { + if (nwrite != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (error != 0) + return (set_errno(error)); + return (nwrite); +} + +ssize_t +lx_readv(int fdes, struct iovec *iovp, int iovcnt) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nread = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = 
MAXOFFSET_T; + auio.uio_fmode = fflag; + if (count <= copyout_max_cached) + auio.uio_extflg = UIO_COPY_CACHED; + else + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_read_common(fp, &auio, &nread, B_FALSE); + + if (error != 0) { + if (nread != 0) { + error = 0; + } else if (error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nread); +} + +ssize_t +lx_writev(int fdes, struct iovec *iovp, int iovcnt) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nwrite = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FWRITE) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG && count == 0) { + goto out; + } + + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_loffset = fp->f_offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_DEFAULT; + + error = lx_write_common(fp, &auio, &nwrite, B_FALSE); + + if (error != 0) { + if (nwrite != 0) { + error = 0; + } else if (error == EINTR) { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + if (error != 0) { + return (set_errno(error)); + } + return (nwrite); +} + +ssize_t 
+lx_pread_fp(file_t *fp, void *cbuf, size_t ccount, off64_t offset) +{ + struct uio auio; + struct iovec aiov; + ssize_t count = (ssize_t)ccount; + size_t nread = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * Return EINVAL if an invalid offset comes to pread. + * Negative offset from user will cause this error. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Limit offset such that we don't read or write + * a file beyond the maximum offset representable in + * an off_t structure. + */ + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((offset_t)MAXOFFSET_T - fileoff); + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = MAXOFFSET_T; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + error = lx_read_common(fp, &auio, &nread, B_TRUE); + + if (error == EINTR) { + if (nread != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + if (error) { + return (set_errno(error)); + } + return ((ssize_t)nread); + +} + +ssize_t +lx_pread(int fdes, void *cbuf, size_t ccount, off64_t offset) +{ + file_t *fp; + size_t nread; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + + nread = lx_pread_fp(fp, cbuf, ccount, offset); + releasef(fdes); + return (nread); +} + +ssize_t +lx_pwrite_fp(file_t *fp, void *cbuf, size_t ccount, off64_t offset) +{ + struct uio auio; + struct iovec aiov; + ssize_t count = (ssize_t)ccount; + size_t nwrite = 0; + int fflag, error = 0; + + if (count < 0) + return (set_errno(EINVAL)); + if (((fflag = fp->f_flag) & (FWRITE)) == 0) { + error = EBADF; + goto out; + } + if 
(fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * return EINVAL for offsets that cannot be + * represented in an off_t. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Take appropriate action if we are trying to write above the + * resource limit. + */ + if (fileoff >= curproc->p_fsz_ctl) { + mutex_enter(&curproc->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], + curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO); + mutex_exit(&curproc->p_lock); + + error = EFBIG; + goto out; + } + /* + * Don't allow pwrite to cause file sizes to exceed maxoffset. + */ + if (fileoff == MAXOFFSET_T) { + error = EFBIG; + goto out; + } + if (fileoff + count > MAXOFFSET_T) + count = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff); + } + + aiov.iov_base = cbuf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = offset; + auio.uio_resid = count; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_llimit = curproc->p_fsz_ctl; + auio.uio_fmode = fflag; + auio.uio_extflg = UIO_COPY_CACHED; + + error = lx_write_common(fp, &auio, &nwrite, B_TRUE); + + if (error == EINTR) { + if (nwrite != 0) { + error = 0; + } else { + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + } +out: + if (error) { + return (set_errno(error)); + } + return (nwrite); +} + +ssize_t +lx_pwrite(int fdes, void *cbuf, size_t ccount, off64_t offset) +{ + file_t *fp; + size_t nwrite; + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + + nwrite = lx_pwrite_fp(fp, cbuf, ccount, offset); + releasef(fdes); + return (nwrite); +} + +ssize_t +lx_pread32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo, + uint32_t off_hi) +{ + return (lx_pread(fdes, cbuf, ccount, LX_32TO64(off_lo, off_hi))); +} + +ssize_t +lx_pwrite32(int fdes, void *cbuf, size_t ccount, uint32_t off_lo, + uint32_t off_hi) +{ + return (lx_pwrite(fdes, cbuf, ccount, LX_32TO64(off_lo, off_hi))); +} + +ssize_t 
+lx_preadv(int fdes, void *iovp, int iovcnt, off64_t offset) +{ + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; + file_t *fp; + ssize_t count; + size_t nread = 0; + int fflag, error = 0; + + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EINVAL)); + } else if (iovcnt == 0) { + return (0); + } + + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(error)); + } + + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); + return (set_errno(EBADF)); + } + if (((fflag = fp->f_flag) & FREAD) == 0) { + error = EBADF; + goto out; + } + if (fp->f_vnode->v_type == VREG) { + u_offset_t fileoff = (u_offset_t)offset; + + if (count == 0) + goto out; + /* + * Return EINVAL if an invalid offset comes to pread. + * Negative offset from user will cause this error. + */ + if (fileoff > MAXOFFSET_T) { + error = EINVAL; + goto out; + } + /* + * Limit offset such that we don't read or write a file beyond + * the maximum offset representable in an off_t structure. 
/*
 * pwritev(2) emulation: gather-write iovcnt user iovecs to descriptor fdes
 * at the explicit file offset, via lx_write_common() in positioned mode.
 *
 * An iovcnt of 0 returns 0; a negative iovcnt or one above IOV_MAX fails with
 * EINVAL.  For regular files the offset is validated against MAXOFFSET_T and
 * the process file-size limit before any I/O is attempted.  If any data was
 * written, the byte count is returned even when an error was reported.
 */
ssize_t
lx_pwritev(int fdes, void *iovp, int iovcnt, off64_t offset)
{
	struct uio auio;
	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	int aiovlen = 0;
	file_t *fp;
	ssize_t count;
	size_t nwrite = 0;
	int fflag, error = 0;

	if (iovcnt < 0 || iovcnt > IOV_MAX) {
		return (set_errno(EINVAL));
	} else if (iovcnt == 0) {
		return (0);
	}

	/* aiovlen stays 0 unless the iovec array had to be heap-allocated. */
	if (iovcnt > IOV_MAX_STACK) {
		aiovlen = iovcnt * sizeof (iovec_t);
		aiov = kmem_alloc(aiovlen, KM_SLEEP);
	}
	if ((error = lx_iovec_copyin(iovp, iovcnt, aiov, &count)) != 0) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(error));
	}

	if ((fp = getf(fdes)) == NULL) {
		if (aiovlen != 0)
			kmem_free(aiov, aiovlen);
		return (set_errno(EBADF));
	}
	if (((fflag = fp->f_flag) & FWRITE) == 0) {
		error = EBADF;
		goto out;
	}
	if (fp->f_vnode->v_type == VREG) {
		u_offset_t fileoff = (u_offset_t)offset;

		/* A zero-length write to a regular file is a no-op. */
		if (count == 0)
			goto out;
		/*
		 * Return EINVAL if an invalid offset comes to pwritev.
		 * Negative offset from user will cause this error.
		 */
		if (fileoff > MAXOFFSET_T) {
			error = EINVAL;
			goto out;
		}
		/*
		 * Take appropriate action if we are trying to write above the
		 * resource limit.
		 */
		if (fileoff >= curproc->p_fsz_ctl) {
			mutex_enter(&curproc->p_lock);
			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
			    curproc->p_rctls, curproc, RCA_UNSAFE_SIGINFO);
			mutex_exit(&curproc->p_lock);

			error = EFBIG;
			goto out;
		}
		/*
		 * Don't allow pwritev to cause file sizes to exceed maxoffset.
		 */
		if (fileoff == MAXOFFSET_T) {
			error = EFBIG;
			goto out;
		}
		/*
		 * Limit offset such that we don't read or write a file beyond
		 * the maximum offset representable in an off_t structure.
		 */
		if (fileoff + count > MAXOFFSET_T)
			count = (ssize_t)((u_offset_t)MAXOFFSET_T - fileoff);
	}

	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_loffset = offset;
	auio.uio_resid = count;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_llimit = curproc->p_fsz_ctl;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_DEFAULT;

	error = lx_write_common(fp, &auio, &nwrite, B_TRUE);

	if (error != 0) {
		if (nwrite != 0) {
			/* Partial progress is reported as success. */
			error = 0;
		} else if (error == EINTR) {
			ttolxlwp(curthread)->br_syscall_restart = B_TRUE;
		}
	}
out:
	releasef(fdes);
	if (aiovlen != 0)
		kmem_free(aiov, aiovlen);
	if (error != 0) {
		return (set_errno(error));
	}
	return (nwrite);
}
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * Emulation for scheduling related syscalls. + * + * Under a typical zone configuration the zones will always be running under + * FSS so that no single zone can monopolize the system. Zones do not have the + * privilege to leave FSS (for the obvious reason that this would violate the + * global zone resource management policies). Thus, for the sched_* syscalls + * we typically will never be able to emulate those using our other native + * scheduling classes. Under this common case we simply track the scheduler + * settings on the lwp's lx brand structure and we also try to adjust the + * lwp priority within the valid range to approximate the intended effect. 
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/cpu.h> +#include <sys/rtpriocntl.h> +#include <sys/tspriocntl.h> +#include <sys/processor.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/sysmacros.h> +#include <sys/policy.h> +#include <sys/procset.h> +#include <sys/priocntl.h> + +typedef int l_pid_t; + +extern int yield(); +extern long priocntl_common(int, procset_t *, int, caddr_t, caddr_t, uio_seg_t); + +static int lx_sched_setprocset(procset_t *, l_pid_t); +static long lx_do_priocntlsys(int, procset_t *, void *); + +#define BITS_PER_BYTE 8 + +/* + * Linux scheduler policies. + */ +#define LX_SCHED_OTHER 0 +#define LX_SCHED_FIFO 1 +#define LX_SCHED_RR 2 +#define LX_SCHED_BATCH 3 +#define LX_SCHED_IDLE 5 +#define LX_SCHED_DEADLINE 6 + +/* + * Linux scheduler priority ranges. + */ +#define LX_SCHED_PRIORITY_MIN_OTHER 0 +#define LX_SCHED_PRIORITY_MAX_OTHER 0 +#define LX_SCHED_PRIORITY_MIN_RRFIFO 1 +#define LX_SCHED_PRIORITY_MAX_RRFIFO 99 + +#define MAXPRI 60 /* See FSS_MAXUPRI */ + +/* + * When emulating scheduling priorities (e.g. under FSS) we'll do the best we + * can by adjusting the thread's priority within our range. + */ +static int lx_emul_pri_map[] = { + 0, /* LX_SCHED_OTHER */ + MAXPRI, /* LX_SCHED_FIFO */ + MAXPRI - 1, /* LX_SCHED_RR */ + -MAXPRI + 1, /* LX_SCHED_BATCH */ + 0, /* UNUSED */ + -MAXPRI, /* LX_SCHED_IDLE */ + MAXPRI /* LX_SCHED_DEADLINE */ +}; + +/* + * Determine if we should emulate the sched_* syscalls. A zone is almost always + * going to be running under FSS in any kind of production configuration, and + * FSS is currently the only class which zone processes won't have the privilege + * to leave. Instead of checking for FSS explicitly, we generalize our check + * using CL_CANEXIT. 
+ */ +#define EMUL_SCHED() (CL_CANEXIT(curthread, CRED()) != 0) + +struct lx_sched_param { + int lx_sched_prio; +}; + +typedef struct lx_sched_attr { + uint32_t lx_size; + + uint32_t lx_sched_policy; + uint64_t lx_sched_flags; + + /* For LX_SCHED_OTHER or LX_SCHED_BATCH */ + int lx_sched_nice; + + /* For LX_SCHED_FIFO or LX_SCHED_RR */ + uint32_t lx_sched_priority; + + /* For LX_SCHED_DEADLINE */ + uint64_t lx_sched_runtime; + uint64_t lx_sched_deadline; + uint64_t lx_sched_period; +} lx_sched_attr_t; + +long +lx_sched_yield(void) +{ + yield(); + + return (0); +} + +static void +ltos_cpuset(lx_affmask_t *lmask, cpuset_t *smask) +{ + /* NOTE: fix this code if NCPU is ever made > LX_NCPU */ + + cpuset_zero(smask); + for (int i = 0; i < NCPU; i++) { + if (BT_TEST(*lmask, i)) { + cpuset_add(smask, i); + } + } +} + +static void +stol_cpuset(cpuset_t *smask, lx_affmask_t *lmask) +{ + /* NOTE: fix this code if NCPU is ever made > LX_NCPU */ + + bzero(lmask, sizeof (*lmask)); + for (int i = 0; i < NCPU; i++) { + if (cpu_in_set(smask, i)) { + BT_SET(*lmask, i); + } + } +} + +/* + * Find and lock a process for lx_sched_* operations. + * Sets 'pp' and 'tp' on success, with P_PR_LOCK set and p_lock held. + * The target process must be branded. 
+ */ +static int +lx_sched_pidlock(l_pid_t pid, proc_t **pp, kthread_t **tp, boolean_t is_write) +{ + proc_t *p; + kthread_t *t = NULL; + int err = 0; + + if (pid < 0) { + return (EINVAL); + } + if (pid == 0) { + p = curproc; + ASSERT(PROC_IS_BRANDED(p)); + mutex_enter(&p->p_lock); + sprlock_proc(p); + + *tp = curthread; + *pp = p; + return (0); + } + + if (lx_lpid_lock((pid_t)pid, curzone, LXP_PRLOCK, &p, &t) != 0) { + return (ESRCH); + } + + ASSERT(MUTEX_HELD(&p->p_lock)); + if (!(PROC_IS_BRANDED(p))) { + sprunlock(p); + return (EPERM); + } + + if (is_write) { + cred_t *cr = CRED(); + + /* + * To perform a sched_* operation on a thread outside of the + * current process, either the euid/egid of the target must + * match, or the calling process must hold CAP_SYS_NICE. + * (PRIV_PROC_PRIOUP maps to CAP_SYS_NICE) + */ + err = 0; + if (secpolicy_raisepriority(cr) != 0) { + err = 0; + mutex_exit(&p->p_lock); + mutex_enter(&p->p_crlock); + if (crgetuid(cr) != crgetuid(p->p_cred) || + crgetgid(cr) != crgetgid(p->p_cred)) { + err = EPERM; + } + mutex_exit(&p->p_crlock); + mutex_enter(&p->p_lock); + if (err != 0) { + sprunlock(p); + return (err); + } + } + } + *pp = p; + *tp = t; + ASSERT(MUTEX_HELD(&p->p_lock)); + return (0); +} + +long +lx_sched_getaffinity(l_pid_t pid, unsigned int len, void *maskp) +{ + proc_t *p; + kthread_t *tp = NULL; + lx_lwp_data_t *lwpd; + int err; + unsigned int pmin, pmax, compare_size; + lx_affmask_t lmask; + cpuset_t *smask; + + /* + * The length boundary requirement is to match Linux's behavior. 
+ */ + switch (get_udatamodel()) { + case DATAMODEL_ILP32: + compare_size = sizeof (uint32_t); + break; + default: + compare_size = sizeof (ulong_t); + break; + } + if ((len & (compare_size - 1)) != 0) { + return (set_errno(EINVAL)); + } + + smask = cpuset_alloc(KM_SLEEP); + if ((err = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) { + cpuset_free(smask); + return (set_errno(err)); + } + + mutex_exit(&p->p_lock); + mutex_enter(&cpu_lock); + mutex_enter(&p->p_lock); + /* + * Grab the existing affinity mask and constrain it by the current set + * of active CPUs (which may have changed since it was assigned. + */ + lwpd = ttolxlwp(tp); + cpuset_or(smask, lwpd->br_affinitymask); + cpuset_and(smask, &cpu_active_set); + sprunlock(p); + mutex_exit(&cpu_lock); + + cpuset_bounds(smask, &pmin, &pmax); + stol_cpuset(smask, &lmask); + cpuset_free(smask); + + /* + * It is out of convenience that this check is performed so late. If + * the need arises, it could be altered to be done earlier in order to + * match Linux error ordering. + */ + if (pmax >= (len * BITS_PER_BYTE)) { + return (set_errno(EINVAL)); + } + + len = MIN(len, sizeof (lx_affmask_t)); + if (copyout(&lmask, maskp, len) != 0) { + return (set_errno(EFAULT)); + } + return (len); +} + +long +lx_sched_setaffinity(l_pid_t pid, unsigned int len, void *maskp) +{ + proc_t *p; + kthread_t *tp = NULL; + lx_lwp_data_t *lwpd; + int err; + unsigned int pmin, pmax; + lx_affmask_t lmask; + cpuset_t *smask; + + if (pid < 0) { + return (set_errno(EINVAL)); + } + + if (len < sizeof (lmask)) { + bzero(&lmask, sizeof (lmask)); + } else if (len > sizeof (lmask)) { + len = sizeof (lmask); + } + if (copyin(maskp, &lmask, len) != 0) { + return (set_errno(EFAULT)); + } + smask = cpuset_alloc(KM_SLEEP); + ltos_cpuset(&lmask, smask); + if ((err = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0) { + cpuset_free(smask); + return (set_errno(err)); + } + + /* + * Constrain the mask to currently active CPUs. 
+ */ + mutex_exit(&p->p_lock); + mutex_enter(&cpu_lock); + mutex_enter(&p->p_lock); + lwpd = ttolxlwp(tp); + + cpuset_and(smask, &cpu_active_set); + if (cpuset_isnull(smask)) { + err = EINVAL; + goto out; + } + if (cpuset_isequal(lwpd->br_affinitymask, smask)) { + err = 0; + goto out; + } + + /* + * If one (and only one) CPU is selected in the affinity mask, bind the + * thread to that CPU. + */ + cpuset_bounds(smask, &pmin, &pmax); + VERIFY(pmin != CPUSET_NOTINSET); + if (pmin == pmax) { + processorid_t obind; + + (void) cpu_bind_thread(tp, pmin, &obind, &err); + if (err != 0) { + goto out; + } + } else { + /* + * If the thread transitions away from a single-CPU mask, it + * should be unbound from that processor. + */ + cpuset_bounds(lwpd->br_affinitymask, &pmin, &pmax); + if (pmin == pmax) { + processorid_t obind; + (void) cpu_bind_thread(tp, PBIND_NONE, &obind, &err); + } + } + cpuset_zero(lwpd->br_affinitymask); + cpuset_or(lwpd->br_affinitymask, smask); + err = 0; + +out: + mutex_exit(&cpu_lock); + sprunlock(p); + cpuset_free(smask); + if (err != 0) { + return (set_errno(err)); + } + return (0); +} + +void +lx_affinity_forklwp(klwp_t *srclwp, klwp_t *dstlwp) +{ + proc_t *pp = lwptoproc(srclwp); + lx_lwp_data_t *slwpd = lwptolxlwp(srclwp); + lx_lwp_data_t *dlwpd = lwptolxlwp(dstlwp); + + /* + * Copy over the affinity mask. This could be enhanced in the future + * to perform single-CPU binding like sched_setaffinity. 
+ */ + mutex_enter(&pp->p_lock); + cpuset_zero(dlwpd->br_affinitymask); + cpuset_or(dlwpd->br_affinitymask, slwpd->br_affinitymask); + mutex_exit(&pp->p_lock); +} + +long +lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int prio, maxupri; + int rv; + + if (pid < 0 || param == NULL) + return (set_errno(EINVAL)); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = NULL; + int incr; + lx_lwp_data_t *lwpd; + + switch (policy) { + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + case LX_SCHED_DEADLINE: + if (prio != LX_SCHED_PRIORITY_MIN_OTHER) + return (set_errno(EINVAL)); + break; + case LX_SCHED_FIFO: + case LX_SCHED_RR: + if (crgetuid(CRED()) != 0) + return (set_errno(EPERM)); + if (prio < LX_SCHED_PRIORITY_MIN_RRFIFO || + prio > LX_SCHED_PRIORITY_MAX_RRFIFO) + return (set_errno(EINVAL)); + break; + default: + return (set_errno(EINVAL)); + } + + /* Find and operate on the target lwp. 
*/ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0) + return (set_errno(rv)); + + lwpd = lwptolxlwp(ttolwp(tp)); + if (lwpd->br_schd_class == LX_SCHED_IDLE && + policy != LX_SCHED_IDLE && crgetuid(CRED()) != 0) { + + sprunlock(p); + return (set_errno(EPERM)); + } + + lwpd->br_schd_class = policy; + lwpd->br_schd_pri = prio; + + ASSERT(policy <= LX_SCHED_DEADLINE); + incr = lx_emul_pri_map[policy]; + + CL_DOPRIO(tp, CRED(), incr, &rv); + + sprunlock(p); + return (0); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* get the class id */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* get the current policy */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (policy < 0) { + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + policy = LX_SCHED_OTHER; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + } else { + return (set_errno(EINVAL)); + } + } + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? 
RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) lx_do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getscheduler(l_pid_t pid) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int policy; + int rv; + + if (pid < 0) + return (set_errno(EINVAL)); + + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = NULL; + + /* Find and operate on the target lwp. */ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) + return (set_errno(rv)); + + policy = lwptolxlwp(ttolwp(tp))->br_schd_class; + sprunlock(p); + + return (policy); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + policy = LX_SCHED_OTHER; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? 
LX_SCHED_FIFO : LX_SCHED_RR; + } else { + policy = set_errno(EINVAL); + } + + return (policy); +} + +long +lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int policy; + int prio, maxupri; + int rv; + + if (pid < 0 || param == NULL) + return (set_errno(EINVAL)); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = NULL; + int incr; + + /* Find and operate on the target lwp. */ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0) + return (set_errno(rv)); + + policy = lwptolxlwp(ttolwp(tp))->br_schd_class; + switch (policy) { + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + case LX_SCHED_DEADLINE: + if (prio != LX_SCHED_PRIORITY_MIN_OTHER) { + sprunlock(p); + return (set_errno(EINVAL)); + } + break; + case LX_SCHED_FIFO: + case LX_SCHED_RR: + if (crgetuid(CRED()) != 0) { + sprunlock(p); + return (set_errno(EPERM)); + } + if (prio < LX_SCHED_PRIORITY_MIN_RRFIFO || + prio > LX_SCHED_PRIORITY_MAX_RRFIFO) { + sprunlock(p); + return (set_errno(EINVAL)); + } + break; + default: + /* this shouldn't happen */ + ASSERT(0); + sprunlock(p); + return (set_errno(EINVAL)); + } + + lwptolxlwp(ttolwp(tp))->br_schd_pri = prio; + + ASSERT(policy <= LX_SCHED_DEADLINE); + incr = lx_emul_pri_map[policy]; + + CL_DOPRIO(tp, CRED(), incr, &rv); + sprunlock(p); + return (0); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the current policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) 
lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) + policy = LX_SCHED_OTHER; + else if (strcmp(pcinfo.pc_clname, "RT") == 0) + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + else + return (set_errno(EINVAL)); + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) lx_do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) lx_do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_sched_param local_param; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + tsinfo_t *tsi; + int prio, scale; + int rv; + + if (pid < 0 || param == NULL) + return (set_errno(EINVAL)); + + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = 
NULL; + + /* Find and operate on the target lwp. */ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) + return (set_errno(rv)); + + local_param.lx_sched_prio = lwptolxlwp(ttolwp(tp))->br_schd_pri; + sprunlock(p); + if (copyout(&local_param, param, sizeof (local_param))) + return (set_errno(EFAULT)); + + return (0); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + bzero(&local_param, sizeof (local_param)); + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + /* + * I don't know if we need to do this, coz it can't be + * changed from zero anyway..... + */ + tsi = (tsinfo_t *)pcinfo.pc_clinfo; + prio = ((tsparms_t *)pcparm.pc_clparms)->ts_upri; + scale = tsi->ts_maxupri; + if (scale == 0) + local_param.lx_sched_prio = 0; + else + local_param.lx_sched_prio = -(prio * 20) / scale; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + local_param.lx_sched_prio = + ((rtparms_t *)pcparm.pc_clparms)->rt_pri; + } else { + rv = set_errno(EINVAL); + } + + if (rv == 0) + if (copyout(&local_param, param, sizeof (local_param))) + return (set_errno(EFAULT)); + + return (rv); +} + +long +lx_sched_rr_get_interval(l_pid_t pid, struct timespec *ival) +{ + klwp_t *lwp = ttolwp(curthread); + struct timespec interval; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int rv; + + if (pid < 0) + return (set_errno(EINVAL)); + + if (EMUL_SCHED()) { + int policy; + proc_t *p; + kthread_t *tp = NULL; + + /* Find and operate on the target lwp. 
*/ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) + return (set_errno(rv)); + + policy = lwptolxlwp(ttolwp(tp))->br_schd_class; + sprunlock(p); + + interval.tv_sec = 0; + if (policy == LX_SCHED_RR) { + /* Use a made-up value (100ms) similar to Linux */ + interval.tv_nsec = 100000000; + } else { + /* Every other emulated policy reports a zero interval */ + interval.tv_nsec = 0; + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + /* + * A timespec may overflow for 32-bit but EOVERFLOW + * is not documented as an acceptable error for + * sched_rr_get_interval. Such an occurrence would be + * exceptionally weird for the RR interval. + */ + TIMESPEC_TO_TIMESPEC32(&t32, &interval); + + if (copyout(&t32, ival, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + } + else +#endif + { + if (copyout(&interval, ival, sizeof (interval))) + return (set_errno(EFAULT)); + } + + return (0); + } + + if ((rv = lx_sched_setprocset(&procset, pid))) + return (rv); + + /* + * Get the class id and class parameters of the target lwp. + */ + pcparm.pc_cid = PC_CLNULL; + (void) lx_do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * Fetch the class info for that class id. pcinfo is cleared and + * reused below, so only the error status matters here. + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) lx_do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * Look up the RT class by name (PC_GETCID); a failure here is + * returned to the caller. + */ + setprocset(&procset, POP_AND, P_PID, 0, P_ALL, 0); + bzero(&pcinfo, sizeof (pcinfo)); + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) lx_do_priocntlsys(PC_GETCID, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * Contrary to what the man page says, you don't have to be in RR to + * get this interval. 
+ */ + if (((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs != RT_TQINF) { + interval.tv_sec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqsecs; + interval.tv_nsec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs; + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + /* + * Like above, the 32-bit EOVERFLOW check is not + * appropriate here. + */ + TIMESPEC_TO_TIMESPEC32(&t32, &interval); + + if (copyout(&t32, ival, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + } + else +#endif + { + if (copyout(&interval, ival, sizeof (interval))) + return (set_errno(EFAULT)); + } + + return (0); + } + + return (set_errno(EINVAL)); +} + +long +lx_sched_get_priority_min(uintptr_t policy) +{ + /* + * Linux scheduling priorities are not alterable, so there is no + * illumos translation necessary. + */ + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + return (LX_SCHED_PRIORITY_MIN_RRFIFO); + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + case LX_SCHED_DEADLINE: + return (LX_SCHED_PRIORITY_MIN_OTHER); + default: + break; + } + return (set_errno(EINVAL)); +} + +long +lx_sched_get_priority_max(uintptr_t policy) +{ + /* + * Linux scheduling priorities are not alterable, so there is no + * illumos translation necessary. 
+ */ + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + return (LX_SCHED_PRIORITY_MAX_RRFIFO); + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + case LX_SCHED_DEADLINE: + return (LX_SCHED_PRIORITY_MAX_OTHER); + default: + break; + } + return (set_errno(EINVAL)); +} + +long +lx_sched_setattr(l_pid_t pid, lx_sched_attr_t *attr, uint32_t flags) +{ + int rv; + uint32_t lx_size; + lx_sched_attr_t local_attr; + uint64_t flg; + + if (pid < 0 || attr == NULL || flags != 0) + return (set_errno(EINVAL)); + + if (copyin(attr, &lx_size, sizeof (lx_size))) + return (set_errno(EFAULT)); + + if (lx_size > sizeof (local_attr)) + return (set_errno(E2BIG)); + + bzero(&local_attr, sizeof (local_attr)); + if (copyin(attr, &local_attr, lx_size)) + return (set_errno(EFAULT)); + + flg = local_attr.lx_sched_flags; + if ((flg & ~LX_SCHED_FLAG_RESET_ON_FORK) != 0) + return (set_errno(EINVAL)); + + if (EMUL_SCHED()) { + int policy; + proc_t *p; + kthread_t *tp = NULL; + int incr; + lx_lwp_data_t *lwpd; + + /* Find and operate on the target lwp. 
*/ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_TRUE)) != 0) + return (set_errno(rv)); + + policy = local_attr.lx_sched_policy; + + /* + * Validate the requested policy/priority pair. lx_sched_pidlock() + * succeeded, so every failure path below must sprunlock(p) before + * returning. + */ + switch (policy) { + case LX_SCHED_OTHER: + case LX_SCHED_BATCH: + case LX_SCHED_IDLE: + /* + * Non-realtime policies only accept the single fixed + * priority; this is the same rule lx_sched_setparam() + * enforces for these classes. + */ + if (local_attr.lx_sched_priority != + LX_SCHED_PRIORITY_MIN_OTHER) { + sprunlock(p); + return (set_errno(EINVAL)); + } + break; + case LX_SCHED_FIFO: + case LX_SCHED_RR: + if (crgetuid(CRED()) != 0) { + sprunlock(p); + return (set_errno(EPERM)); + } + if (local_attr.lx_sched_priority < + LX_SCHED_PRIORITY_MIN_RRFIFO || + local_attr.lx_sched_priority > + LX_SCHED_PRIORITY_MAX_RRFIFO) { + sprunlock(p); + return (set_errno(EINVAL)); + } + break; + + case LX_SCHED_DEADLINE: + if (crgetuid(CRED()) != 0) { + sprunlock(p); + return (set_errno(EPERM)); + } + /* As in lx_sched_setparam(), only the fixed priority. */ + if (local_attr.lx_sched_priority != + LX_SCHED_PRIORITY_MIN_OTHER) { + sprunlock(p); + return (set_errno(EINVAL)); + } + break; + default: + sprunlock(p); + return (set_errno(EINVAL)); + } + + /* Record the emulated Linux scheduling state on the target lwp. */ + lwpd = lwptolxlwp(ttolwp(tp)); + lwpd->br_schd_class = policy; + lwpd->br_schd_flags = flg; + lwpd->br_schd_pri = local_attr.lx_sched_priority; + + lwpd->br_schd_runtime = local_attr.lx_sched_runtime; + lwpd->br_schd_deadline = local_attr.lx_sched_deadline; + lwpd->br_schd_period = local_attr.lx_sched_period; + + ASSERT(policy <= LX_SCHED_DEADLINE); + incr = lx_emul_pri_map[policy]; + + /* The CL_DOPRIO result left in rv is not examined. */ + CL_DOPRIO(tp, CRED(), incr, &rv); + sprunlock(p); + return (0); + } + + /* Currently not supported under other classes */ + return (set_errno(ENOSYS)); +} + +long +lx_sched_getattr(l_pid_t pid, lx_sched_attr_t *attr, uint32_t size, + uint32_t flags) +{ + lx_sched_attr_t local_attr; + int rv; + + if (pid < 0 || attr == NULL || flags != 0 || size < sizeof (local_attr)) + return (set_errno(EINVAL)); + + bzero(&local_attr, sizeof (local_attr)); + if (EMUL_SCHED()) { + proc_t *p; + kthread_t *tp = NULL; + lx_lwp_data_t *lwpd; + + /* Find and operate on the target lwp. 
*/ + if ((rv = lx_sched_pidlock(pid, &p, &tp, B_FALSE)) != 0) + return (set_errno(rv)); + + lwpd = lwptolxlwp(ttolwp(tp)); + local_attr.lx_sched_policy = lwpd->br_schd_class; + local_attr.lx_sched_priority = lwpd->br_schd_pri; + local_attr.lx_sched_flags = lwpd->br_schd_flags; + + local_attr.lx_sched_runtime = lwpd->br_schd_runtime; + local_attr.lx_sched_deadline = lwpd->br_schd_deadline; + local_attr.lx_sched_period = lwpd->br_schd_period; + + sprunlock(p); + + local_attr.lx_size = sizeof (lx_sched_attr_t); + + if (copyout(&local_attr, attr, sizeof (local_attr))) + return (set_errno(EFAULT)); + + return (0); + } + + /* Currently not supported under other classes */ + return (set_errno(ENOSYS)); +} + +/* + * Fill in *procset for the lwp named by the Linux pid. A pid of 0 is + * replaced by the calling process id (curproc->p_pid). The Linux pid is + * translated with lx_lpid_to_spair(); if that fails, errno is set to ESRCH + * and the set_errno() result is returned. On success the set is built as + * (P_LWPID, lid) POP_AND (P_ALL, 0) and 0 is returned. Note that the + * translated process id itself is not stored in the set. + */ +static int +lx_sched_setprocset(procset_t *procset, l_pid_t pid) +{ + id_t lid, rid; + idtype_t lidtype, ridtype; + + /* + * define the target lwp + */ + if (pid == 0) + pid = curproc->p_pid; + + if (lx_lpid_to_spair(pid, &pid, &lid) < 0) + return (set_errno(ESRCH)); + rid = 0; + ridtype = P_ALL; + lidtype = P_LWPID; + + setprocset(procset, POP_AND, lidtype, lid, ridtype, rid); + + return (0); +} + +/* + * Forward a priocntl command to priocntl_common() with PC_VERSION and + * UIO_SYSSPACE, returning its result unchanged. + * NOTE(review): UIO_SYSSPACE suggests arg must be a kernel address -- + * confirm against priocntl_common(). + */ +static long +lx_do_priocntlsys(int cmd, procset_t *procset, void *arg) +{ + return (priocntl_common(PC_VERSION, procset, cmd, (caddr_t)arg, 0, + UIO_SYSSPACE)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_socket.c b/usr/src/uts/common/brand/lx/syscall/lx_socket.c new file mode 100644 index 0000000000..a433020f90 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_socket.c @@ -0,0 +1,4832 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2022 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/sockio.h> +#include <sys/thread.h> +#include <sys/stropts.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/kmem.h> +#include <sys/un.h> +#include <sys/sunddi.h> +#include <sys/cred.h> +#include <sys/ucred.h> +#include <sys/model.h> +#include <sys/brand.h> +#include <sys/vmsystm.h> +#include <sys/limits.h> +#include <sys/fcntl.h> +#include <sys/sysmacros.h> +#include <netpacket/packet.h> +#include <sockcommon.h> +#include <socktpi_impl.h> +#include <netinet/udp.h> +#include <sys/sdt.h> +#include <netinet/tcp.h> +#include <netinet/igmp.h> +#include <netinet/icmp6.h> +#include <inet/cc.h> +#include <inet/tcp_impl.h> +#include <lx_errno.h> + +#include <sys/lx_brand.h> +#include <sys/lx_socket.h> +#include <sys/lx_types.h> +#include <sys/lx_impl.h> + +/* From uts/common/fs/sockfs/socksyscalls.c */ +extern int listen(int, int, int); +extern int shutdown(int, int, int); + +typedef struct lx_ucred { + pid_t lxu_pid; + lx_uid_t lxu_uid; + lx_gid_t lxu_gid; +} lx_ucred_t; + +typedef struct lx_socket_aux_data +{ + kmutex_t lxsad_lock; + enum lxsad_status_t { + LXSS_NONE = 0, + LXSS_CONNECTING, + LXSS_CONNECTED + } lxsad_status; + uint_t lxsad_flags; +} lx_socket_aux_data_t; + +#define LX_SS_MAXSIZE 128 + +typedef struct lx_sockaddr_storage { + unsigned short 
lxss_family; + char lxdata[LX_SS_MAXSIZE - sizeof (unsigned short)]; +} lx_sockaddr_storage_t; + +typedef struct lx_group_req { + uint32_t lxgr_interface; +#ifdef _LP64 + /* On 64-bit linux kernels, gr_interface is padded by 4 bytes. */ + uint32_t _lxgr_pad; +#endif + lx_sockaddr_storage_t lxgr_group; +} lx_group_req_t; + +#if defined(_SYSCALL32_IMPL) + +typedef struct lx_group_req32 { + uint32_t lxgr_interface; + lx_sockaddr_storage_t lxgr_group; +} lx_group_req32_t; + +#endif /* defined(_SYSCALL32_IMPL) */ + +/* lxsad_flags */ +#define LXSAD_FL_STRCRED 0x1 +#define LXSAD_FL_EMULSEQPKT 0x2 +/* These two work together to implement Linux SO_REUSEADDR semantics. */ +#define LXSAD_FL_EMULRUADDR 0x4 +#define LXSAD_FL_EMULRUPORT 0x8 + +static lx_socket_aux_data_t *lx_sad_acquire(vnode_t *); + +/* VSD key for lx-specific socket information */ +static uint_t lx_socket_vsd = 0; + +/* Convenience enum to enforce translation direction */ +typedef enum lx_xlate_dir { + SUNOS_TO_LX, + LX_TO_SUNOS +} lx_xlate_dir_t; + +/* enum for getpeername/getsockname handling */ +typedef enum lx_getname_type { + LX_GETPEERNAME, + LX_GETSOCKNAME +} lx_getname_type_t; + +/* + * What follows are a series of tables we use to translate Linux constants + * into equivalent Illumos constants and back again. I wish this were + * cleaner, more programmatic, and generally nicer. Sadly, life is messy, + * and Unix networking even more so. 
+ */ +static const int ltos_family[LX_AF_MAX + 1] = { + AF_UNSPEC, /* LX_AF_UNSPEC */ + AF_UNIX, /* LX_AF_UNIX */ + AF_INET, /* LX_AF_INET */ + AF_NOTSUPPORTED, /* LX_AF_AX25 */ + AF_NOTSUPPORTED, /* LX_AF_IPX */ + AF_NOTSUPPORTED, /* LX_AF_APPLETALK */ + AF_NOTSUPPORTED, /* LX_AF_NETROM */ + AF_NOTSUPPORTED, /* LX_AF_BRIDGE */ + AF_NOTSUPPORTED, /* LX_AF_ATMPVC */ + AF_NOTSUPPORTED, /* LX_AF_X25 */ + AF_INET6, /* LX_AF_INET6 */ + AF_NOTSUPPORTED, /* LX_AF_ROSE */ + AF_NOTSUPPORTED, /* LX_AF_DECNET */ + AF_NOTSUPPORTED, /* LX_AF_NETBEUI */ + AF_NOTSUPPORTED, /* LX_AF_SECURITY */ + AF_NOTSUPPORTED, /* LX_AF_KEY */ + AF_LX_NETLINK, /* LX_AF_NETLINK */ + AF_PACKET, /* LX_AF_PACKET */ + AF_NOTSUPPORTED, /* LX_AF_ASH */ + AF_NOTSUPPORTED, /* LX_AF_ECONET */ + AF_NOTSUPPORTED, /* LX_AF_ATMSVC */ + AF_NOTSUPPORTED, /* LX_AF_RDS */ + AF_NOTSUPPORTED, /* LX_AF_SNA */ + AF_NOTSUPPORTED, /* LX_AF_IRDA */ + AF_NOTSUPPORTED, /* LX_AF_PPOX */ + AF_NOTSUPPORTED, /* LX_AF_WANPIPE */ + AF_NOTSUPPORTED, /* LX_AF_LLC */ + AF_NOTSUPPORTED, /* NONE */ + AF_NOTSUPPORTED, /* NONE */ + AF_NOTSUPPORTED, /* LX_AF_CAN */ + AF_NOTSUPPORTED, /* LX_AF_TIPC */ + AF_NOTSUPPORTED, /* LX_AF_BLUETOOTH */ + AF_NOTSUPPORTED, /* LX_AF_IUCV */ + AF_NOTSUPPORTED /* LX_AF_RXRPC */ + /* LX_AF_ISDN */ + /* LX_AF_PHONET */ + /* LX_AF_IEEE802154 */ + /* LX_AF_CAIF */ + /* LX_AF_ALG */ + /* LX_AF_NFC */ + /* LX_AF_VSOCK */ +}; + +static const int stol_family[LX_AF_MAX + 1] = { + AF_UNSPEC, /* AF_UNSPEC */ + AF_UNIX, /* AF_UNIX */ + AF_INET, /* AF_INET */ + AF_NOTSUPPORTED, /* AF_IMPLINK */ + AF_NOTSUPPORTED, /* AF_PUP */ + AF_NOTSUPPORTED, /* AF_CHAOS */ + AF_NOTSUPPORTED, /* AF_NS */ + AF_NOTSUPPORTED, /* AF_NBS */ + AF_NOTSUPPORTED, /* AF_ECMA */ + AF_NOTSUPPORTED, /* AF_DATAKIT */ + AF_NOTSUPPORTED, /* AF_CCITT */ + AF_NOTSUPPORTED, /* AF_SNA */ + AF_NOTSUPPORTED, /* AF_DECNET */ + AF_NOTSUPPORTED, /* AF_DLI */ + AF_NOTSUPPORTED, /* AF_LAT */ + AF_NOTSUPPORTED, /* AF_HYLINK */ + AF_NOTSUPPORTED, /* 
AF_APPLETALK */ + AF_NOTSUPPORTED, /* AF_NIT */ + AF_NOTSUPPORTED, /* AF_802 */ + AF_NOTSUPPORTED, /* AF_OSI */ + AF_NOTSUPPORTED, /* AF_X25 */ + AF_NOTSUPPORTED, /* AF_OSINET */ + AF_NOTSUPPORTED, /* AF_GOSIP */ + AF_NOTSUPPORTED, /* AF_IPX */ + AF_NOTSUPPORTED, /* AF_ROUTE */ + AF_NOTSUPPORTED, /* AF_LINK */ + LX_AF_INET6, /* AF_INET6 */ + AF_NOTSUPPORTED, /* AF_KEY */ + AF_NOTSUPPORTED, /* AF_NCA */ + AF_NOTSUPPORTED, /* AF_POLICY */ + AF_NOTSUPPORTED, /* AF_INET_OFFLOAD */ + AF_NOTSUPPORTED, /* AF_TRILL */ + LX_AF_PACKET, /* AF_PACKET */ + LX_AF_NETLINK /* AF_LX_NETLINK */ +}; + +#define LTOS_FAMILY(d) ((d) <= LX_AF_MAX ? ltos_family[(d)] : AF_INVAL) +#define STOL_FAMILY(d) ((d) <= LX_AF_MAX ? stol_family[(d)] : AF_INVAL) + + +static const int ltos_socktype[LX_SOCK_PACKET + 1] = { + SOCK_NOTSUPPORTED, SOCK_STREAM, SOCK_DGRAM, SOCK_RAW, + SOCK_RDM, SOCK_SEQPACKET, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, + SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED, SOCK_NOTSUPPORTED +}; + +static const int stol_socktype[SOCK_SEQPACKET + 1] = { + SOCK_NOTSUPPORTED, LX_SOCK_DGRAM, LX_SOCK_STREAM, SOCK_NOTSUPPORTED, + LX_SOCK_RAW, LX_SOCK_RDM, LX_SOCK_SEQPACKET +}; + +#define LTOS_SOCKTYPE(t) \ + ((t) <= LX_SOCK_PACKET ? ltos_socktype[(t)] : SOCK_INVAL) +#define STOL_SOCKTYPE(t) \ + ((t) <= SOCK_SEQPACKET ? stol_socktype[(t)] : SOCK_INVAL) + + +/* + * This string is used to prefix all abstract namespace Unix sockets, ie all + * abstract namespace sockets are converted to regular sockets in the /tmp + * directory with .ABSK_ prefixed to their names. 
+ */ +#define ABST_PRFX "/tmp/.ABSK_" +#define ABST_PRFX_LEN (sizeof (ABST_PRFX) - 1) + +#define DATAFILT "datafilt" + +typedef enum { + lxa_none, + lxa_abstract, + lxa_devlog +} lx_addr_type_t; + +static int +ltos_pkt_proto(int protocol) +{ + switch (ntohs(protocol)) { + case LX_ETH_P_802_2: + return (ETH_P_802_2); + case LX_ETH_P_IP: + return (ETH_P_IP); + case LX_ETH_P_ARP: + return (ETH_P_ARP); + case LX_ETH_P_IPV6: + return (ETH_P_IPV6); + case LX_ETH_P_ALL: + case LX_ETH_P_802_3: + return (ETH_P_ALL); + default: + return (-1); + } +} + + +typedef struct lx_flag_map { + enum { + LXFM_MAP, + LXFM_IGNORE, + LXFM_UNSUP + } lxfm_action; + int lxfm_sunos_flag; + int lxfm_linux_flag; + char *lxfm_name; +} lx_flag_map_t; + +static lx_flag_map_t lx_flag_map_tbl[] = { + { LXFM_MAP, MSG_OOB, LX_MSG_OOB, NULL }, + { LXFM_MAP, MSG_PEEK, LX_MSG_PEEK, NULL }, + { LXFM_MAP, MSG_DONTROUTE, LX_MSG_DONTROUTE, NULL }, + { LXFM_MAP, MSG_CTRUNC, LX_MSG_CTRUNC, NULL }, + { LXFM_MAP, MSG_TRUNC, LX_MSG_TRUNC, NULL }, + { LXFM_MAP, MSG_DONTWAIT, LX_MSG_DONTWAIT, NULL }, + { LXFM_MAP, MSG_EOR, LX_MSG_EOR, NULL }, + { LXFM_MAP, MSG_WAITALL, LX_MSG_WAITALL, NULL }, + /* MSG_CONFIRM is safe to ignore */ + { LXFM_IGNORE, 0, LX_MSG_CONFIRM, NULL }, + /* + * The NOSIGNAL and CMSG_CLOEXEC flags are handled by the emulation + * outside of the flag-conversion routine. 
+ */ + { LXFM_IGNORE, 0, LX_MSG_NOSIGNAL, NULL }, + { LXFM_IGNORE, 0, LX_MSG_CMSG_CLOEXEC, NULL }, + /* + * Linux-only flags that have no translation. They belong in the + * Linux column (third field) so that they are matched, and reported + * by name, when translating in the LX_TO_SUNOS direction. + */ + { LXFM_UNSUP, 0, LX_MSG_PROXY, "MSG_PROXY" }, + { LXFM_UNSUP, 0, LX_MSG_FIN, "MSG_FIN" }, + { LXFM_UNSUP, 0, LX_MSG_SYN, "MSG_SYN" }, + { LXFM_UNSUP, 0, LX_MSG_RST, "MSG_RST" }, + { LXFM_UNSUP, 0, LX_MSG_ERRQUEUE, "MSG_ERRQUEUE" }, + { LXFM_UNSUP, 0, LX_MSG_MORE, "MSG_MORE" }, + { LXFM_UNSUP, 0, LX_MSG_WAITFORONE, "MSG_WAITFORONE" }, + { LXFM_UNSUP, 0, LX_MSG_FASTOPEN, "MSG_FASTOPEN" }, +}; + +#define LX_FLAG_MAP_MAX \ + (sizeof (lx_flag_map_tbl) / sizeof (lx_flag_map_tbl[0])) + +#define LX_UNSUP_BUFSZ 64 + +/* + * Translate a set of socket message flags between the illumos and Linux + * encodings, in the direction given by dir, by walking lx_flag_map_tbl. + * LXFM_MAP entries are translated, LXFM_IGNORE entries are dropped silently + * and LXFM_UNSUP entries are reported by name through lx_unsupported(). + * Any input bits that no table entry claimed are reported once, in hex. + * Returns the translated flags; ignored and unsupported bits never appear + * in the result. + */ +static int +lx_xlate_sock_flags(int inflags, lx_xlate_dir_t dir) +{ + int i, outflags = 0; + char buf[LX_UNSUP_BUFSZ]; + + VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS); + + for (i = 0; i < LX_FLAG_MAP_MAX; i++) { + lx_flag_map_t *map = &lx_flag_map_tbl[i]; + int match, out; + + /* Pick the input and output columns for this direction. */ + if (dir == SUNOS_TO_LX) { + match = inflags & map->lxfm_sunos_flag; + out = map->lxfm_linux_flag; + } else { + match = inflags & map->lxfm_linux_flag; + out = map->lxfm_sunos_flag; + } + switch (map->lxfm_action) { + case LXFM_MAP: + if (match != 0) { + inflags &= ~(match); + outflags |= out; + } + break; + case LXFM_IGNORE: + if (match != 0) { + inflags &= ~(match); + } + break; + case LXFM_UNSUP: + if (match != 0) { + (void) snprintf(buf, LX_UNSUP_BUFSZ, + "unsupported sock flag %s", map->lxfm_name); + lx_unsupported(buf); + /* Reported by name; keep it out of the hex report. */ + inflags &= ~(match); + } + break; + } + } + /* Whatever is left was not claimed by any table entry. */ + if (inflags != 0) { + (void) snprintf(buf, LX_UNSUP_BUFSZ, + "unsupported sock flags 0x%08x", inflags); + lx_unsupported(buf); + } + + return (outflags); +} + +typedef enum lx_sun_type { + LX_SUN_NORMAL, + LX_SUN_ABSTRACT, +} lx_sun_type_t; + +static void +ltos_sockaddr_ux(const struct sockaddr *inaddr, const socklen_t inlen, + struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type) +{ + struct sockaddr_un buf; + /* Calculate size of (sun_family + any padding) in sockaddr */ + int sizediff = (sizeof (buf) - sizeof (buf.sun_path)); + 
int len = inlen - sizediff; + + VERIFY(len > 0); + VERIFY(len <= sizeof (buf.sun_path)); + bzero(&buf, sizeof (buf)); + + if (inaddr->sa_data[0] == '\0') { + /* + * Linux supports abstract Unix sockets, which are simply + * sockets that do not exist on the file system. These sockets + * are denoted by beginning the path with a NULL character. To + * support these, we strip out the leading NULL character and + * change the path to point to a real place in /tmp directory, + * by prepending ABST_PRFX and replacing all illegal characters + * with * '_'. + * + * Since these sockets are supposed to exist outside the + * filesystem, they must be cleaned up after use. This removal + * is performed during bind(). + */ + int idx, odx; + + /* Add our abstract prefix */ + (void) strcpy(buf.sun_path, ABST_PRFX); + for (idx = 1, odx = ABST_PRFX_LEN; + idx < len && odx < sizeof (buf.sun_path); + idx++, odx++) { + char c = inaddr->sa_data[idx]; + if (c == '\0' || c == '/') { + buf.sun_path[odx] = '_'; + } else { + buf.sun_path[odx] = c; + } + } + + /* + * Since abstract socket addresses might not be NUL terminated, + * we must explicitly NUL terminate the translated path. + * Care is taken not to overflow the buffer. 
+ */ + if (odx == sizeof (buf.sun_path)) { + buf.sun_path[odx - 1] = '\0'; + } else { + buf.sun_path[odx] = '\0'; + } + + if (sun_type != NULL) { + *sun_type = LX_SUN_ABSTRACT; + } + } else { + /* Copy the address directly, minding termination */ + (void) strncpy(buf.sun_path, inaddr->sa_data, len); + len = strnlen(buf.sun_path, len); + if (len == sizeof (buf.sun_path)) { + buf.sun_path[len - 1] = '\0'; + } else { + VERIFY(len < sizeof (buf.sun_path)); + buf.sun_path[len] = '\0'; + } + + if (sun_type != NULL) { + *sun_type = LX_SUN_NORMAL; + } + } + buf.sun_family = AF_UNIX; + *outlen = strlen(buf.sun_path) + 1 + sizediff; + VERIFY(*outlen <= sizeof (struct sockaddr_un)); + + *outaddr = kmem_alloc(*outlen, KM_SLEEP); + bcopy(&buf, *outaddr, *outlen); +} + +/* + * Copy in a Linux-native socket address from userspace and convert it into + * illumos format. When successful, it will allocate an appropriately sized + * struct to be freed by the caller. + */ +static long +ltos_sockaddr_copyin(const struct sockaddr *inaddr, const socklen_t inlen, + struct sockaddr **outaddr, socklen_t *outlen, lx_sun_type_t *sun_type) +{ + sa_family_t family; + struct sockaddr *laddr; + struct sockaddr_ll *sal; + int proto, error = 0; + + VERIFY(inaddr != NULL); + + if (inlen < sizeof (sa_family_t) || + inlen > sizeof (struct sockaddr_storage)) { + return (EINVAL); + } + laddr = kmem_alloc(inlen, KM_SLEEP); + if (copyin(inaddr, laddr, inlen) != 0) { + kmem_free(laddr, inlen); + return (EFAULT); + } + + family = LTOS_FAMILY(laddr->sa_family); + switch (family) { + case (sa_family_t)AF_NOTSUPPORTED: + error = EPROTONOSUPPORT; + break; + + case (sa_family_t)AF_INVAL: + error = EAFNOSUPPORT; + break; + + case AF_UNIX: + if (inlen < sizeof (sa_family_t) + 2 || + inlen > sizeof (struct sockaddr_un)) { + error = EINVAL; + break; + } + ltos_sockaddr_ux(laddr, inlen, outaddr, outlen, + sun_type); + + /* AF_UNIX bypasses the standard copy logic */ + kmem_free(laddr, inlen); + return (0); + + case 
AF_PACKET: + if (inlen < sizeof (struct sockaddr_ll)) { + error = EINVAL; + break; + } + *outlen = sizeof (struct sockaddr_ll); + + /* sll_protocol must be translated */ + /* LINTED: alignment */ + sal = (struct sockaddr_ll *)laddr; + proto = ltos_pkt_proto(sal->sll_protocol); + if (proto < 0) { + error = EINVAL; + } + sal->sll_protocol = proto; + break; + + case AF_INET: + if (inlen < sizeof (struct sockaddr)) { + error = EINVAL; + break; + } + *outlen = sizeof (struct sockaddr); + break; + + case AF_INET6: + /* + * The illumos sockaddr_in6 has one more 32-bit field + * than the Linux version. We simply zero that field + * via kmem_zalloc. + */ + if (inlen < sizeof (lx_sockaddr_in6_t)) { + error = EINVAL; + break; + } + *outlen = sizeof (struct sockaddr_in6); + *outaddr = (struct sockaddr *)kmem_zalloc(*outlen, + KM_SLEEP); + bcopy(laddr, *outaddr, sizeof (lx_sockaddr_in6_t)); + (*outaddr)->sa_family = AF_INET6; + /* AF_INET6 bypasses the standard copy logic */ + kmem_free(laddr, inlen); + return (0); + + default: + *outlen = inlen; + } + + if (error == 0) { + /* + * For most address families, just copying into a sockaddr of + * the correct size and updating sa_family is adequate. + */ + VERIFY(inlen >= *outlen); + + *outaddr = (struct sockaddr *)kmem_zalloc(*outlen, KM_SLEEP); + bcopy(laddr, *outaddr, *outlen); + (*outaddr)->sa_family = family; + } + kmem_free(laddr, inlen); + return (error); +} + +/* + * Convert an illumos-native socket address into Linux format and copy it out + * to userspace. + */ +static long +stol_sockaddr_copyout(struct sockaddr *inaddr, socklen_t inlen, + struct sockaddr *outaddr, void *outlenp, socklen_t orig) +{ + socklen_t size = inlen; + struct sockaddr_storage buf; + struct sockaddr *bufaddr; + + /* + * Either we were passed a valid sockaddr (with length) or the length + * is set to 0. 
+ */ + VERIFY(inaddr != NULL || inlen == 0); + + if (inlen == 0) { + goto finish; + } + + + switch (inaddr->sa_family) { + case AF_INET: + if (inlen != sizeof (struct sockaddr)) { + return (EINVAL); + } + break; + + case AF_INET6: + if (inlen != sizeof (struct sockaddr_in6)) { + return (EINVAL); + } + /* + * The linux sockaddr_in6 is shorter than illumos. + * Truncate the extra field on the way out. + */ + size = (sizeof (lx_sockaddr_in6_t)); + inlen = (sizeof (lx_sockaddr_in6_t)); + break; + + case AF_UNIX: + if (inlen > sizeof (struct sockaddr_un)) { + return (EINVAL); + } + + /* + * On Linux an empty AF_UNIX address is returned as NULL, which + * means setting the returned length to only encompass the + * address family part of the buffer. However, some code also + * references the address portion of the buffer and uses it, + * even though the returned length has been shortened. Thus, we + * clear the buffer to ensure that the address portion is NULL. + */ + if (inaddr->sa_data[0] == '\0') { + bzero(&buf, sizeof (buf)); + inlen = sizeof (inaddr->sa_family); + } + break; + + case (sa_family_t)AF_NOTSUPPORTED: + return (EPROTONOSUPPORT); + + case (sa_family_t)AF_INVAL: + return (EAFNOSUPPORT); + + default: + break; + } + + /* + * The input should be smaller than sockaddr_storage, the largest + * sockaddr we support. + */ + VERIFY(inlen <= sizeof (buf)); + + bufaddr = (struct sockaddr *)&buf; + bcopy(inaddr, bufaddr, inlen); + bufaddr->sa_family = STOL_FAMILY(bufaddr->sa_family); + + /* + * It is possible that userspace passed us a smaller buffer than we + * hope to output. When this is the case, we will truncate our output + * to the max size of their buffer but report the true size of the + * sockaddr when outputting the outlen value. + */ + size = (orig < size) ? 
orig : size; + + if (copyout(bufaddr, outaddr, size) != 0) { + return (EFAULT); + } + +finish: +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + int32_t len32 = (int32_t)inlen; + if (copyout(&len32, outlenp, sizeof (len32)) != 0) { + return (EFAULT); + } + } else +#endif /* defined(_LP64) */ + { + if (copyout(&inlen, outlenp, sizeof (inlen)) != 0) { + return (EFAULT); + } + } + + return (0); +} + +typedef struct lx_cmsg_xlate { + int lcx_sunos_level; + int lcx_sunos_type; + int (*lcx_stol_conv)(struct cmsghdr *, struct cmsghdr *); + int lcx_linux_level; + int lcx_linux_type; + int (*lcx_ltos_conv)(struct cmsghdr *, struct cmsghdr *); +} lx_cmsg_xlate_t; + +static int cmsg_conv_generic(struct cmsghdr *, struct cmsghdr *); +static int stol_conv_ucred(struct cmsghdr *, struct cmsghdr *); +static int ltos_conv_ucred(struct cmsghdr *, struct cmsghdr *); +static int stol_conv_recvttl(struct cmsghdr *, struct cmsghdr *); + +/* + * Table describing SunOS <-> Linux cmsg translation mappings. + * Certain types (IP_RECVTTL) are only converted in one direction and are + * indicated by one of the translation functions being set to NULL. 
+ */ +static lx_cmsg_xlate_t lx_cmsg_xlate_tbl[] = { + { SOL_SOCKET, SCM_RIGHTS, cmsg_conv_generic, + LX_SOL_SOCKET, LX_SCM_RIGHTS, cmsg_conv_generic }, + { SOL_SOCKET, SCM_UCRED, stol_conv_ucred, + LX_SOL_SOCKET, LX_SCM_CRED, ltos_conv_ucred }, + { SOL_SOCKET, SCM_TIMESTAMP, cmsg_conv_generic, + LX_SOL_SOCKET, LX_SCM_TIMESTAMP, cmsg_conv_generic }, + { IPPROTO_IP, IP_PKTINFO, cmsg_conv_generic, + LX_IPPROTO_IP, LX_IP_PKTINFO, cmsg_conv_generic }, + { IPPROTO_IP, IP_RECVTTL, stol_conv_recvttl, + LX_IPPROTO_IP, LX_IP_TTL, NULL }, + { IPPROTO_IP, IP_RECVTOS, cmsg_conv_generic, + LX_IPPROTO_IP, LX_IP_TOS, cmsg_conv_generic }, + { IPPROTO_IP, IP_TTL, cmsg_conv_generic, + LX_IPPROTO_IP, LX_IP_TTL, cmsg_conv_generic }, + { IPPROTO_IPV6, IPV6_HOPLIMIT, cmsg_conv_generic, + LX_IPPROTO_IPV6, LX_IPV6_HOPLIMIT, cmsg_conv_generic }, + { IPPROTO_IPV6, IPV6_PKTINFO, cmsg_conv_generic, + LX_IPPROTO_IPV6, LX_IPV6_PKTINFO, cmsg_conv_generic }, + { IPPROTO_IPV6, IPV6_TCLASS, cmsg_conv_generic, + LX_IPPROTO_IPV6, LX_IPV6_TCLASS, cmsg_conv_generic } +}; + +#define LX_MAX_CMSG_XLATE \ + (sizeof (lx_cmsg_xlate_tbl) / sizeof (lx_cmsg_xlate_tbl[0])) + +#if defined(_LP64) + +typedef struct { + int64_t cmsg_len; + int32_t cmsg_level; + int32_t cmsg_type; +} lx_cmsghdr64_t; + +/* The alignment/padding for 64bit Linux cmsghdr is not the same. */ +#define LX_CMSG64_ALIGNMENT 8 +#define ISALIGNED_LX_CMSG64(addr) \ + (((uintptr_t)(addr) & (LX_CMSG64_ALIGNMENT - 1)) == 0) +#define ROUNDUP_LX_CMSG64_LEN(len) \ + (((len) + LX_CMSG64_ALIGNMENT - 1) & ~(LX_CMSG64_ALIGNMENT - 1)) + +#define LX_CMSG64_IS_ALIGNED(m) \ + (((uintptr_t)(m) & (_CMSG_DATA_ALIGNMENT - 1)) == 0) +#define LX_CMSG64_DATA(c) ((unsigned char *)(((lx_cmsghdr64_t *)(c)) + 1)) +/* + * LX_CMSG64_VALID is closely derived from CMSG_VALID with one particularly + * important addition. Since cmsg_len is 64bit, (cmsg + cmsg_len) is checked + * against the start address as well. 
This prevents bogus inputs from wrapping + * around the address space. + */ +#define LX_CMSG64_VALID(cmsg, start, end) \ + (ISALIGNED_LX_CMSG64(cmsg) && \ + ((uintptr_t)(cmsg) >= (uintptr_t)(start)) && \ + ((uintptr_t)(cmsg) < (uintptr_t)(end)) && \ + ((cmsg)->cmsg_len >= sizeof (lx_cmsghdr64_t)) && \ + ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)) && \ + ((uintptr_t)(cmsg) + (cmsg)->cmsg_len >= (uintptr_t)(start))) +#define LX_CMSG64_NEXT(cmsg) \ + (lx_cmsghdr64_t *)((uintptr_t)(cmsg) + \ + ROUNDUP_LX_CMSG64_LEN((cmsg)->cmsg_len)) +#define LX_CMSG64_DIFF sizeof (uint32_t) + +#endif /* defined(_LP64) */ + +/* + * convert ucred_s to lx_ucred. + */ +static int +stol_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + /* + * Format the data correctly in the omsg buffer. + */ + if (omsg != NULL) { + struct ucred_s *scred; + prcred_t *cr; + lx_ucred_t lcred; + + scred = (struct ucred_s *)CMSG_CONTENT(inmsg); + lcred.lxu_pid = scred->uc_pid; + /* LINTED: alignment */ + cr = UCCRED(scred); + if (cr != NULL) { + lcred.lxu_uid = cr->pr_euid; + lcred.lxu_gid = cr->pr_egid; + } else { + lcred.lxu_uid = lcred.lxu_gid = 0; + } + + bcopy(&lcred, CMSG_CONTENT(omsg), sizeof (lx_ucred_t)); + } + + return (sizeof (struct cmsghdr) + sizeof (lx_ucred_t)); +} + +static int +ltos_conv_ucred(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + if (omsg != NULL) { + struct ucred_s *uc; + prcred_t *pc; + lx_ucred_t *lcred; + + uc = (struct ucred_s *)CMSG_CONTENT(omsg); + /* LINTED: alignment */ + pc = (prcred_t *)((char *)uc + sizeof (struct ucred_s)); + + uc->uc_credoff = sizeof (struct ucred_s); + + lcred = (lx_ucred_t *)CMSG_CONTENT(inmsg); + + uc->uc_pid = lcred->lxu_pid; + pc->pr_euid = lcred->lxu_uid; + pc->pr_egid = lcred->lxu_gid; + } + + return (sizeof (struct cmsghdr) + sizeof (struct ucred_s) + + sizeof (prcred_t)); + +} + +static int +stol_conv_recvttl(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + /* + * SunOS communicates the TTL of incoming packets via 
IP_RECVTTL using + * a uint8_t value instead of IP_TTL using an int. This conversion is + * only needed in the one direction since Linux does not handle + * IP_RECVTTL in the sendmsg path. + */ + if (omsg != NULL) { + uint8_t *inttl = (uint8_t *)CMSG_CONTENT(inmsg); + int *ottl = (int *)CMSG_CONTENT(omsg); + + *ottl = (int)*inttl; + } + + return (sizeof (struct cmsghdr) + sizeof (int)); +} + +static int +cmsg_conv_generic(struct cmsghdr *inmsg, struct cmsghdr *omsg) +{ + if (omsg != NULL) { + size_t data_len; + + data_len = inmsg->cmsg_len - sizeof (struct cmsghdr); + bcopy(CMSG_CONTENT(inmsg), CMSG_CONTENT(omsg), data_len); + } + + return (inmsg->cmsg_len); +} + +static int +lx_xlate_cmsg(struct cmsghdr *inmsg, struct cmsghdr *omsg, lx_xlate_dir_t dir) +{ + int i; + int len; + + VERIFY(dir == SUNOS_TO_LX || dir == LX_TO_SUNOS); + + for (i = 0; i < LX_MAX_CMSG_XLATE; i++) { + lx_cmsg_xlate_t *xlate = &lx_cmsg_xlate_tbl[i]; + if (dir == LX_TO_SUNOS && + inmsg->cmsg_level == xlate->lcx_linux_level && + inmsg->cmsg_type == xlate->lcx_linux_type && + xlate->lcx_ltos_conv != NULL) { + len = xlate->lcx_ltos_conv(inmsg, omsg); + if (omsg != NULL) { + omsg->cmsg_len = len; + omsg->cmsg_level = xlate->lcx_sunos_level; + omsg->cmsg_type = xlate->lcx_sunos_type; + } + return (len); + } else if (dir == SUNOS_TO_LX && + inmsg->cmsg_level == xlate->lcx_sunos_level && + inmsg->cmsg_type == xlate->lcx_sunos_type && + xlate->lcx_stol_conv != NULL) { + len = xlate->lcx_stol_conv(inmsg, omsg); + if (omsg != NULL) { + omsg->cmsg_len = len; + omsg->cmsg_level = xlate->lcx_linux_level; + omsg->cmsg_type = xlate->lcx_linux_type; + } + return (len); + } + } + /* + * The Linux man page for sendmsg does not define a specific error for + * unsupported cmsgs. While it is meant to indicated bad values for + * passed flags, EOPNOTSUPP appears to be the next closest choice. 
+ */ + return (-EOPNOTSUPP); +} + +static long +ltos_cmsgs_copyin(void *addr, socklen_t inlen, void **outmsg, + socklen_t *outlenp) +{ + void *inbuf, *obuf; + struct cmsghdr *inmsg, *omsg; + int slen = 0; + + if (inlen < sizeof (struct cmsghdr) || inlen > SO_MAXARGSIZE) { + return (EINVAL); + } + +#if defined(_LP64) + if (get_udatamodel() == DATAMODEL_NATIVE && + inlen < sizeof (lx_cmsghdr64_t)) { + /* The size requirements are more strict for 64bit. */ + return (EINVAL); + } +#endif /* defined(_LP64) */ + + inbuf = kmem_alloc(inlen, KM_SLEEP); + if (copyin(addr, inbuf, inlen) != 0) { + kmem_free(inbuf, inlen); + return (EFAULT); + } + +#if defined(_LP64) + if (get_udatamodel() == DATAMODEL_NATIVE) { + /* + * Linux cmsg headers are longer than illumos under x86_64. + * Convert to regular cmsgs first. + */ + lx_cmsghdr64_t *lmsg; + struct cmsghdr *smsg; + void *newbuf; + int len = 0; + + /* Inventory the new cmsg size */ + for (lmsg = (lx_cmsghdr64_t *)inbuf; + LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + lmsg = LX_CMSG64_NEXT(lmsg)) { + len += ROUNDUP_cmsglen(lmsg->cmsg_len - LX_CMSG64_DIFF); + } + + VERIFY(len < inlen); + if (len == 0) { + /* Input was bogus, so we can give up early. 
*/ + kmem_free(inbuf, inlen); + *outmsg = NULL; + *outlenp = 0; + return (EINVAL); + } + + newbuf = kmem_alloc(len, KM_SLEEP); + + for (lmsg = (lx_cmsghdr64_t *)inbuf, + smsg = (struct cmsghdr *)newbuf; + LX_CMSG64_VALID(lmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + lmsg = LX_CMSG64_NEXT(lmsg), smsg = CMSG_NEXT(smsg)) { + smsg->cmsg_level = lmsg->cmsg_level; + smsg->cmsg_type = lmsg->cmsg_type; + smsg->cmsg_len = lmsg->cmsg_len - LX_CMSG64_DIFF; + + /* The above length measurement should ensure this */ + ASSERT(CMSG_VALID(smsg, newbuf, + (uintptr_t)newbuf + len)); + + bcopy(LX_CMSG64_DATA(lmsg), CMSG_CONTENT(smsg), + smsg->cmsg_len - sizeof (*smsg)); + } + + kmem_free(inbuf, inlen); + inbuf = newbuf; + inlen = len; + } +#endif /* defined(_LP64) */ + + /* + * Now determine how much space we need for the conversion. + */ + for (inmsg = (struct cmsghdr *)inbuf; + CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + inmsg = CMSG_NEXT(inmsg)) { + int sz; + + if ((sz = lx_xlate_cmsg(inmsg, NULL, LX_TO_SUNOS)) < 0) { + /* unsupported msg */ + kmem_free(inbuf, inlen); + return (-sz); + } + + slen += ROUNDUP_cmsglen(sz); + } + + obuf = kmem_zalloc(slen, KM_SLEEP); + + /* + * Now do the conversion. 
+ */ + for (inmsg = (struct cmsghdr *)inbuf, omsg = (struct cmsghdr *)obuf; + CMSG_VALID(inmsg, inbuf, (uintptr_t)inbuf + inlen) != 0; + inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) { + VERIFY(lx_xlate_cmsg(inmsg, omsg, LX_TO_SUNOS) >= 0); + } + + kmem_free(inbuf, inlen); + *outmsg = obuf; + *outlenp = slen; + return (0); +} + +static long +stol_cmsgs_copyout(void *input, socklen_t inlen, void *addr, + void *outlenp, socklen_t orig_outlen) +{ + void *obuf; + struct cmsghdr *inmsg, *omsg; + int error = 0; + socklen_t lx_len = 0; +#if defined(_LP64) + model_t model = get_udatamodel(); +#endif + + if (inlen == 0) { + /* Simply output the zero controllen */ + goto finish; + } + + VERIFY(inlen >= sizeof (struct cmsghdr)); + + /* + * First determine how much space we need for the conversion and + * make sure the caller has provided at least that much space to return + * results. + */ + for (inmsg = (struct cmsghdr *)input; + CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0; + inmsg = CMSG_NEXT(inmsg)) { + int sz; + + if ((sz = lx_xlate_cmsg(inmsg, NULL, SUNOS_TO_LX)) < 0) { + /* unsupported msg */ + return (-sz); + } + +#if defined(_LP64) + if (model == DATAMODEL_NATIVE) { + /* + * The converted 64-bit cmsgs require an additional 4 + * bytes of header space and must be aligned to 8 bytes + * (instead of the typical 4 for x86) + */ + sz = ROUNDUP_LX_CMSG64_LEN(sz + LX_CMSG64_DIFF); + } else +#endif /* defined(_LP64) */ + { + /* + * The converted 32-bit cmsgs do not require additional + * header space or padding for Linux conversion. + */ + sz = ROUNDUP_cmsglen(sz); + } + + /* + * Unlike SunOS, Linux requires that the last cmsg be + * adequately padded for alignment. 
+ */ + lx_len += sz; + } + + if (lx_len > orig_outlen || addr == NULL) { + /* This will be interpreted by the caller */ + error = EMSGSIZE; + lx_len = 0; + goto finish; + } + + /* + * Since cmsgs are often padded to an aligned size, kmem_zalloc is + * necessary to prevent leaking the contents of uninitialized memory. + */ + obuf = kmem_zalloc(lx_len, KM_SLEEP); + + /* + * Convert the msgs. + */ + for (inmsg = (struct cmsghdr *)input, omsg = (struct cmsghdr *)obuf; + CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0; + inmsg = CMSG_NEXT(inmsg), omsg = CMSG_NEXT(omsg)) { + VERIFY(lx_xlate_cmsg(inmsg, omsg, SUNOS_TO_LX) >= 0); + } + +#if defined(_LP64) + if (model == DATAMODEL_NATIVE) { + /* Linux cmsg headers are longer than illumos under x86_64. */ + struct cmsghdr *smsg; + lx_cmsghdr64_t *lmsg; + void *newbuf; + + /* + * Once again, kmem_zalloc is needed to avoid leaking the + * contents of uninialized memory + */ + newbuf = kmem_zalloc(lx_len, KM_SLEEP); + for (smsg = (struct cmsghdr *)obuf, + lmsg = (lx_cmsghdr64_t *)newbuf; + CMSG_VALID(smsg, obuf, (uintptr_t)obuf + inlen) != 0; + smsg = CMSG_NEXT(smsg), lmsg = LX_CMSG64_NEXT(lmsg)) { + lmsg->cmsg_level = smsg->cmsg_level; + lmsg->cmsg_type = smsg->cmsg_type; + lmsg->cmsg_len = smsg->cmsg_len + LX_CMSG64_DIFF; + + ASSERT(LX_CMSG64_VALID(lmsg, newbuf, + (uintptr_t)newbuf + lx_len) != 0); + + bcopy(CMSG_CONTENT(smsg), LX_CMSG64_DATA(lmsg), + smsg->cmsg_len - sizeof (*smsg)); + } + + kmem_free(obuf, lx_len); + obuf = newbuf; + } +#endif /* defined(_LP64) */ + + if (copyout(obuf, addr, lx_len) != 0) { + kmem_free(obuf, lx_len); + return (EFAULT); + } + kmem_free(obuf, lx_len); + +finish: + if (outlenp != NULL) { +#if defined(_LP64) + if (model != DATAMODEL_NATIVE) { + int32_t len32 = (int32_t)lx_len; + if (copyout(&len32, outlenp, sizeof (len32)) != 0) { + return (EFAULT); + } + } else +#endif /* defined(_LP64) */ + { + if (copyout(&lx_len, outlenp, sizeof (lx_len)) != 0) { + return (EFAULT); + } + } + } + 
return (error); +} + +static void +lx_cmsg_set_cloexec(void *input, socklen_t inlen) +{ + struct cmsghdr *inmsg; + + if (inlen == 0) { + return; + } + + for (inmsg = (struct cmsghdr *)input; + CMSG_VALID(inmsg, input, (uintptr_t)input + inlen) != 0; + inmsg = CMSG_NEXT(inmsg)) { + if (inmsg->cmsg_level == SOL_SOCKET && + inmsg->cmsg_type == SCM_RIGHTS) { + int *fds = (int *)CMSG_CONTENT(inmsg); + int i, num = (int)CMSG_CONTENTLEN(inmsg) / sizeof (int); + + for (i = 0; i < num; i++) { + char flags; + file_t *fp; + + fp = getf(fds[i]); + if (fp == NULL) { + /* + * It is possible that a received fd + * will already have been closed if a + * thread in the local process is + * indiscriminately issuing close(2) + * calls while the message is being + * received. If that is the case, no + * further processing of the fd is + * needed. It will still be passed + * up in the cmsg even though the + * caller chose to close it already. + */ + continue; + } + + flags = f_getfd(fds[i]); + flags |= FD_CLOEXEC; + f_setfd(fds[i], flags); + releasef(fds[i]); + } + } + } +} + +static int +lx_cmsg_try_ucred(sonode_t *so, struct nmsghdr *msg, socklen_t origlen) +{ + lx_socket_aux_data_t *sad; + struct cmsghdr *cmsg = NULL; + int msgsize; + cred_t *cred; + + if (origlen == 0) { + return (0); + } + sad = lx_sad_acquire(SOTOV(so)); + if ((sad->lxsad_flags & LXSAD_FL_STRCRED) == 0) { + mutex_exit(&sad->lxsad_lock); + return (0); + } + mutex_exit(&sad->lxsad_lock); + + mutex_enter(&so->so_lock); + if (so->so_peercred == NULL) { + mutex_exit(&so->so_lock); + return (0); + } + crhold(cred = so->so_peercred); + mutex_exit(&so->so_lock); + + msgsize = ucredminsize(cred) + sizeof (struct cmsghdr); + if (msg->msg_control == NULL) { + msg->msg_controllen = msgsize; + msg->msg_control = cmsg = kmem_zalloc(msgsize, KM_SLEEP); + } else { + /* + * The so_recvmsg operation may have allocated a msg_control + * buffer which precisely fits all returned cmsgs. 
We must + * manually verify the length of that cmsg data and reallocate + * the buffer if it lacks the necessary space. + */ + uintptr_t start = (uintptr_t)msg->msg_control; + uintptr_t end = start + msg->msg_controllen; + + ASSERT(msg->msg_controllen > 0); + cmsg = (struct cmsghdr *)msg->msg_control; + while (CMSG_VALID(cmsg, start, end) != 0) { + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_UCRED) { + /* + * If some later code change results in a ucred + * being attached anyways, there is no need for + * us to do it manually + */ + crfree(cred); + return (0); + } + cmsg = CMSG_NEXT(cmsg); + } + if (((uintptr_t)cmsg + msgsize) > end) { + socklen_t offset = (uintptr_t)cmsg - start; + socklen_t newsize = offset + msgsize; + void *newbuf; + + if (newsize < msg->msg_controllen) { + /* size overflow, bail */ + crfree(cred); + return (-1); + } + newbuf = kmem_alloc(newsize, KM_SLEEP); + bcopy(msg->msg_control, newbuf, msg->msg_controllen); + kmem_free(msg->msg_control, msg->msg_controllen); + + msg->msg_control = newbuf; + msg->msg_controllen = newsize; + cmsg = (struct cmsghdr *)((uintptr_t)newbuf + offset); + } + } + + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_UCRED; + cmsg->cmsg_len = msgsize; + (void) cred2ucred(cred, so->so_cpid, CMSG_CONTENT(cmsg), CRED()); + crfree(cred); + return (0); +} + +static lx_socket_aux_data_t * +lx_sad_acquire(vnode_t *vp) +{ + lx_socket_aux_data_t *cur, *created; + + mutex_enter(&vp->v_vsd_lock); + cur = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd); + if (cur == NULL) { + /* perform our allocation carefully */ + mutex_exit(&vp->v_vsd_lock); + + created = (lx_socket_aux_data_t *)kmem_zalloc( + sizeof (*created), KM_SLEEP); + + mutex_enter(&vp->v_vsd_lock); + cur = (lx_socket_aux_data_t *)vsd_get(vp, lx_socket_vsd); + if (cur == NULL) { + mutex_init(&created->lxsad_lock, NULL, MUTEX_DEFAULT, + NULL); + (void) vsd_set(vp, lx_socket_vsd, created); + cur = created; + } else { + kmem_free(created, sizeof 
(*created)); + } + } + mutex_exit(&vp->v_vsd_lock); + mutex_enter(&cur->lxsad_lock); + return (cur); +} + +static int +lx_convert_pkt_proto(int protocol) +{ + switch (ntohs(protocol)) { + case LX_ETH_P_802_2: + return (ETH_P_802_2); + case LX_ETH_P_IP: + return (ETH_P_IP); + case LX_ETH_P_ARP: + return (ETH_P_ARP); + case LX_ETH_P_IPV6: + return (ETH_P_IPV6); + case LX_ETH_P_ALL: + case LX_ETH_P_802_3: + return (ETH_P_ALL); + default: + return (-1); + } +} + +static int +lx_convert_sock_args(int in_dom, int in_type, int in_proto, int *out_dom, + int *out_type, int *out_options, int *out_proto) +{ + int domain, type, options; + + if (in_dom < 0 || in_type < 0 || in_proto < 0) + return (EINVAL); + + domain = LTOS_FAMILY(in_dom); + if (domain == AF_NOTSUPPORTED || domain == AF_UNSPEC) + return (EAFNOSUPPORT); + if (domain == AF_INVAL) + return (EINVAL); + + type = LTOS_SOCKTYPE(in_type & LX_SOCK_TYPE_MASK); + if (type == SOCK_INVAL) + return (EINVAL); + /* + * Linux does not allow the app to specify IP Protocol for raw sockets. + * SunOS does, so bail out here. + */ + if (type == SOCK_NOTSUPPORTED || + (domain == AF_INET && type == SOCK_RAW && in_proto == IPPROTO_IP)) { + if (lx_kern_release_cmp(curzone, "2.6.15") < 0) { + /* + * Use error appropriate for kernel version. + * See lx_socket_create for more detail. 
+ */ + return (ESOCKTNOSUPPORT); + } + return (EPROTONOSUPPORT); + } + + options = 0; + in_type &= ~(LX_SOCK_TYPE_MASK); + if (in_type & LX_SOCK_NONBLOCK) { + in_type ^= LX_SOCK_NONBLOCK; + options |= SOCK_NONBLOCK; + } + if (in_type & LX_SOCK_CLOEXEC) { + in_type ^= LX_SOCK_CLOEXEC; + options |= SOCK_CLOEXEC; + } + if (in_type != 0) { + return (EINVAL); + } + + /* Protocol definitions for PF_PACKET differ between Linux and SunOS */ + if (domain == PF_PACKET && + (in_proto = lx_convert_pkt_proto(in_proto)) < 0) + return (EINVAL); + + *out_dom = domain; + *out_type = type; + *out_options = options; + *out_proto = in_proto; + return (0); +} + +/* + * For restartable socket syscall handling, the relevant syscalls are only + * restarted when a timeout is not set on the socket. + */ +static void +lx_sock_syscall_restart(sonode_t *so, boolean_t recv) +{ + if (recv) { + if (so->so_rcvtimeo != 0) + return; + } else { + if (so->so_sndtimeo != 0) + return; + } + + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; +} + +static int +lx_socket_create(int domain, int type, int protocol, int options, file_t **fpp, + int *fdp) +{ + sonode_t *so; + vnode_t *vp; + file_t *fp; + int err, fd; + + /* + * EACCES is returned in Linux when the user isn't allowed to use a + * "ping socket". EACCES is also used by the iputils-ping userland + * application to determine if fallback to SOCK_RAW is necessary. + * + * This can be removed if we ever implement SOCK_DGRAM + IPPROTO_ICMP. 
+ */ + if ((domain == AF_INET && type == SOCK_DGRAM && protocol == + IPPROTO_ICMP) || (domain == AF_INET6 && type == SOCK_DGRAM && + protocol == IPPROTO_ICMPV6)) + return (EACCES); + + /* logic cloned from so_socket */ + so = socket_create(domain, type, protocol, NULL, NULL, SOCKET_SLEEP, + SOV_DEFAULT, CRED(), &err); + + if (so == NULL) { + switch (err) { + case EPROTOTYPE: + case EPROTONOSUPPORT: + if (lx_kern_release_cmp(curzone, "2.6.15") < 0) { + /* + * Linux changed its socket error behavior in + * versions 2.6.15 and later. See git commit + * 86c8f9d158f68538a971a47206a46a22c7479bac in + * the Linux repository. + * + * LTP presently checks for version 2.6.16. + */ + return (ESOCKTNOSUPPORT); + } + return (EPROTONOSUPPORT); + default: + return (err); + } + } + + /* Allocate a file descriptor for the socket */ + vp = SOTOV(so); + if ((err = falloc(vp, FWRITE|FREAD, &fp, &fd)) != 0) { + (void) socket_close(so, 0, CRED()); + socket_destroy(so); + return (err); + } + + /* + * Linux programs do not tolerate errors appearing from asynchronous + * events (such as ICMP messages arriving). Setting SM_DEFERERR will + * prevent checking/delivery of such errors. 
+ */ + so->so_mode |= SM_DEFERERR; + + /* Now fill in the entries that falloc reserved */ + if (options & SOCK_NONBLOCK) { + so->so_state |= SS_NONBLOCK; + fp->f_flag |= FNONBLOCK; + } + mutex_exit(&fp->f_tlock); + *fpp = fp; + *fdp = fd; + return (0); +} + +static void +lx_socket_destroy(file_t *fp, int fd) +{ + sonode_t *so = VTOSO(fp->f_vnode); + + setf(fd, NULL); + + mutex_enter(&fp->f_tlock); + unfalloc(fp); + + (void) socket_close(so, 0, CRED()); + socket_destroy(so); +} + +long +lx_socket(int domain, int type, int protocol) +{ + int error, options, fd = -1; + file_t *fp = NULL; + + if ((error = lx_convert_sock_args(domain, type, protocol, &domain, + &type, &options, &protocol)) != 0) { + return (set_errno(error)); + } + + error = lx_socket_create(domain, type, protocol, options, &fp, &fd); + if (error != 0) { + return (set_errno(error)); + } + + setf(fd, fp); + if ((options & SOCK_CLOEXEC) != 0) { + f_setfd(fd, FD_CLOEXEC); + } + return (fd); +} + +long +lx_bind(long sock, uintptr_t name, socklen_t namelen) +{ + struct sonode *so; + struct sockaddr *addr = NULL; + socklen_t len = 0; + file_t *fp; + int error; + lx_sun_type_t sun_type; + boolean_t not_sock = B_FALSE; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + if (namelen != 0) { + error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen, + &addr, &len, &sun_type); + if (error != 0) { + releasef(sock); + return (set_errno(error)); + } + } + + if (addr != NULL && addr->sa_family == AF_UNIX) { + vnode_t *vp; + + error = so_ux_lookup(so, (struct sockaddr_un *)addr, B_TRUE, + &vp); + if (error == 0) { + /* A valid socket exists and is open at this address. */ + VN_RELE(vp); + } else { + /* Keep track of paths which are not valid sockets. */ + if (error == ENOTSOCK) { + not_sock = B_TRUE; + } + + /* + * When binding to an abstract namespace address or + * /dev/log, implicit clean-up must occur if there is + * not a valid socket at the specififed address. 
See + * ltos_sockaddr_copyin for details about why these + * socket types act differently. + */ + if (sun_type == LX_SUN_ABSTRACT) { + (void) vn_removeat(NULL, addr->sa_data, + UIO_SYSSPACE, RMFILE); + } + } + } + + error = socket_bind(so, addr, len, _SOBIND_XPG4_2, CRED()); + + /* + * Linux returns EADDRINUSE for attempts to bind to Unix domain + * sockets that aren't sockets. + */ + if (error == EINVAL && addr != NULL && addr->sa_family == AF_UNIX && + not_sock == B_TRUE) { + error = EADDRINUSE; + } + + releasef(sock); + + if (addr != NULL) { + kmem_free(addr, len); + } + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_connect(long sock, uintptr_t name, socklen_t namelen) +{ + struct sonode *so; + struct sockaddr *addr = NULL; + lx_socket_aux_data_t *sad = NULL; + socklen_t len = 0; + file_t *fp; + int error; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + /* + * Ensure the name is sized appropriately before we alloc memory and + * copy it in from userspace. We need at least the address family to + * make later sizing decisions. + */ + if (namelen != 0) { + error = ltos_sockaddr_copyin((struct sockaddr *)name, namelen, + &addr, &len, NULL); + if (error != 0) { + releasef(sock); + return (set_errno(error)); + } + } + + error = socket_connect(so, addr, len, fp->f_flag, + _SOCONNECT_XPG4_2, CRED()); + + if (error == EINTR) + lx_sock_syscall_restart(so, B_FALSE); + + /* + * Linux connect(2) behavior is rather strange when using the + * O_NONBLOCK flag. The first call will return EINPROGRESS, as + * expected. Provided that is successful, a second call to connect + * will return 0 instead of EISCONN. Subsequent connect calls will + * return EISCONN. 
+ */ + if ((fp->f_flag & FNONBLOCK) != 0 && error != 0) { + sad = lx_sad_acquire(SOTOV(so)); + if (error == EISCONN && + sad->lxsad_status == LXSS_CONNECTING) { + /* Report the one success */ + sad->lxsad_status = LXSS_CONNECTED; + error = 0; + } else if (error == EINPROGRESS) { + sad->lxsad_status = LXSS_CONNECTING; + } + mutex_exit(&sad->lxsad_lock); + } + + /* + * When connecting to a UDP socket, configure it so that future + * sendto/sendmsg operations are allowed to specify a destination + * address. See the Posix spec. for sendto(2). Linux allows this while + * illumos would return EISCONN if the option is not set. + */ + if (error == 0 && so->so_protocol == IPPROTO_UDP && + (so->so_family == AF_INET || so->so_family == AF_INET6)) { + int val = 1; + + DTRACE_PROBE(lx__connect__udp); + (void) socket_setsockopt(so, IPPROTO_UDP, UDP_SND_TO_CONNECTED, + &val, sizeof (val), CRED()); + } + + releasef(sock); + + if (addr != NULL) { + kmem_free(addr, len); + } + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +/* + * Custom version of socket_recvmsg for error-handling overrides. + */ +static int +lx_socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + cred_t *cr) +{ + int error; + ssize_t orig_resid = uiop->uio_resid; + + /* + * Do not bypass the cache when reading data, as the application + * is likely to access the data shortly. 
+ */ + uiop->uio_extflg |= UIO_COPY_CACHED; + + error = SOP_RECVMSG(so, msg, uiop, cr); + + switch (error) { + case EINTR: + /* EAGAIN is EWOULDBLOCK */ + case EWOULDBLOCK: + /* We did a partial read */ + if (uiop->uio_resid != orig_resid) + error = 0; + break; + case ENOTCONN: + /* + * The rules are different for non-blocking sockets which are + * still in the process of making a connection + */ + if ((msg->msg_flags & MSG_DONTWAIT) != 0 || + (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) { + error = EAGAIN; + } + break; + default: + break; + } + return (error); +} + +static long +lx_recv_common(int sock, struct nmsghdr *msg, xuio_t *xuiop, int flags, + void *namelenp, void *controllenp, void *flagsp) +{ + struct sonode *so; + file_t *fp; + void *name; + socklen_t namelen; + void *control; + socklen_t controllen; + ssize_t len; + int error; + boolean_t fd_cloexec; + boolean_t is_peek_trunc; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + fd_cloexec = ((flags & LX_MSG_CMSG_CLOEXEC) != 0); + flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS); + is_peek_trunc = (flags & (MSG_PEEK|MSG_TRUNC)) == (MSG_PEEK|MSG_TRUNC); + len = xuiop->xu_uio.uio_resid; + xuiop->xu_uio.uio_fmode = fp->f_flag; + xuiop->xu_uio.uio_extflg = UIO_COPY_CACHED; + + /* + * Linux accepts MSG_TRUNC as an input flag, unlike SunOS and many + * other UNIX distributions. When combined with MSG_PEEK, it causes + * recvmsg to return the size of the waiting message, regardless of + * buffer size. This behavior is commonly used with a 0-length buffer + * to interrogate the size of a queued message prior to allocating a + * buffer for it. + * + * In order to support this functionality, a custom XUIO type is used + * to communicate the total message size out from the depths of sockfs. 
+ */ + if (is_peek_trunc) { + xuiop->xu_uio.uio_extflg |= UIO_XUIO; + xuiop->xu_type = UIOTYPE_PEEKSIZE; + xuiop->xu_ext.xu_ps.xu_ps_set = B_FALSE; + xuiop->xu_ext.xu_ps.xu_ps_size = 0; + } + + name = msg->msg_name; + namelen = msg->msg_namelen; + control = msg->msg_control; + controllen = msg->msg_controllen; + + /* + * socket_recvmsg will allocate these if needed. + * NULL them out to prevent any confusion. + */ + msg->msg_name = NULL; + msg->msg_control = NULL; + + msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL | + MSG_DONTWAIT); + /* Default to XPG4.2 operation */ + msg->msg_flags |= MSG_XPG4_2; + + error = lx_socket_recvmsg(so, msg, (struct uio *)xuiop, CRED()); + if (error) { + if (error == EINTR) + lx_sock_syscall_restart(so, B_TRUE); + releasef(sock); + return (set_errno(error)); + } + lwp_stat_update(LWP_STAT_MSGRCV, 1); + releasef(sock); + + if (namelen != 0) { + error = stol_sockaddr_copyout(msg->msg_name, msg->msg_namelen, + name, namelenp, namelen); + + if (msg->msg_namelen != 0) { + kmem_free(msg->msg_name, (size_t)msg->msg_namelen); + msg->msg_namelen = 0; + } + + /* + * Errors during copyout of the name are not a concern to Linux + * callers at this point in the syscall + */ + if (error != 0 && error != EFAULT) { + goto err; + } + } + + if (controllen != 0) { + if (fd_cloexec) { + /* + * If CLOEXEC needs to set on file descriptors passed + * via SCM_RIGHTS, do so before formatting the cmsgs + * for Linux. + */ + lx_cmsg_set_cloexec(msg->msg_control, + msg->msg_controllen); + } + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) != 0) { + /* + * It may be necessary to append a SCM_UCRED cmsg to + * the controls if SO_PASSCRED is set on a + * connection-oriented AF_UNIX socket. + * + * See lx_setsockopt_socket for more details. 
+ */ + if (lx_cmsg_try_ucred(so, msg, controllen) != 0) { + msg->msg_flags |= MSG_CTRUNC; + } + } + + error = stol_cmsgs_copyout(msg->msg_control, + msg->msg_controllen, control, controllenp, controllen); + + if (error != 0) { + /* + * If there was an error during cmsg translation or + * copyout, we need to clean up any FDs that are being + * passed back via SCM_RIGHTS. This prevents us from + * leaking those open files. + */ + so_closefds(msg->msg_control, msg->msg_controllen, 0, + 0); + + /* + * An error during cmsg_copyout means we had + * _something_ to process. + */ + VERIFY(msg->msg_controllen != 0); + + kmem_free(msg->msg_control, + (size_t)msg->msg_controllen); + msg->msg_controllen = 0; + + if (error == EMSGSIZE) { + /* Communicate that messages were truncated */ + msg->msg_flags |= MSG_CTRUNC; + error = 0; + } else { + goto err; + } + } else if (msg->msg_controllen != 0) { + kmem_free(msg->msg_control, + (size_t)msg->msg_controllen); + msg->msg_controllen = 0; + } + } + + if (flagsp != NULL) { + int flags; + + /* Clear internal flag. */ + flags = msg->msg_flags & ~MSG_XPG4_2; + flags = lx_xlate_sock_flags(flags, SUNOS_TO_LX); + + if (copyout(&flags, flagsp, sizeof (flags) != 0)) { + error = EFAULT; + goto err; + } + } + + /* + * If both MSG_PEEK|MSG_TRUNC were set on the input flags and the + * socket layer was able to calculate the total message size for us, + * return that instead of the copied size. 
+ */ + if (is_peek_trunc && xuiop->xu_ext.xu_ps.xu_ps_set == B_TRUE) { + return (xuiop->xu_ext.xu_ps.xu_ps_size); + } + + return (len - xuiop->xu_uio.uio_resid); + +err: + if (msg->msg_controllen != 0) { + /* Prevent FD leakage (see above) */ + so_closefds(msg->msg_control, msg->msg_controllen, 0, 0); + kmem_free(msg->msg_control, (size_t)msg->msg_controllen); + } + if (msg->msg_namelen != 0) { + kmem_free(msg->msg_name, (size_t)msg->msg_namelen); + } + return (set_errno(error)); +} + +long +lx_recv(int sock, void *buffer, size_t len, int flags) +{ + struct nmsghdr smsg; + xuio_t xuio; + struct iovec uiov; + + if ((ssize_t)len < 0) { + /* + * The input len is unsigned, so limit it to SSIZE_MAX since + * the return value is signed. + */ + return (set_errno(EINVAL)); + } + + uiov.iov_base = buffer; + uiov.iov_len = len; + xuio.xu_uio.uio_loffset = 0; + xuio.xu_uio.uio_iov = &uiov; + xuio.xu_uio.uio_iovcnt = 1; + xuio.xu_uio.uio_resid = len; + xuio.xu_uio.uio_segflg = UIO_USERSPACE; + xuio.xu_uio.uio_limit = 0; + + smsg.msg_namelen = 0; + smsg.msg_controllen = 0; + smsg.msg_flags = 0; + return (lx_recv_common(sock, &smsg, &xuio, flags, NULL, NULL, NULL)); +} + +long +lx_recvfrom(int sock, void *buffer, size_t len, int flags, + struct sockaddr *srcaddr, socklen_t *addrlenp) +{ + struct nmsghdr smsg; + xuio_t xuio; + struct iovec uiov; + + if ((ssize_t)len < 0) { + /* Keep len reasonably limited (see lx_recv) */ + return (set_errno(EINVAL)); + } + + uiov.iov_base = buffer; + uiov.iov_len = len; + xuio.xu_uio.uio_loffset = 0; + xuio.xu_uio.uio_iov = &uiov; + xuio.xu_uio.uio_iovcnt = 1; + xuio.xu_uio.uio_resid = len; + xuio.xu_uio.uio_segflg = UIO_USERSPACE; + xuio.xu_uio.uio_limit = 0; + + smsg.msg_name = (char *)srcaddr; + if (addrlenp != NULL && srcaddr != NULL) { + /* + * Despite addrlenp being defined as a socklen_t *, Linux + * treats it internally as an int *. Certain LTP tests depend + * upon this behavior, so we must emulate it as well. 
+ */ + int namelen; + + if (copyin(addrlenp, &namelen, sizeof (namelen)) != 0) { + return (set_errno(EFAULT)); + } + if (namelen < 0) { + return (set_errno(EINVAL)); + } + smsg.msg_namelen = namelen; + } else { + smsg.msg_namelen = 0; + } + smsg.msg_controllen = 0; + smsg.msg_flags = 0; + + return (lx_recv_common(sock, &smsg, &xuio, flags, addrlenp, NULL, + NULL)); +} + +long +lx_recvmsg(int sock, void *msg, int flags) +{ + struct nmsghdr smsg; + xuio_t xuio; + struct iovec luiov[IOV_MAX_STACK], *uiov; + int i, iovcnt, iovsize; + long res; + ssize_t len = 0; + void *namelenp, *controllenp, *flagsp; + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_msghdr32_t lmsg32; + if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name; + smsg.msg_namelen = lmsg32.msg_namelen; + smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov; + smsg.msg_iovlen = lmsg32.msg_iovlen; + smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control; + smsg.msg_controllen = lmsg32.msg_controllen; + smsg.msg_flags = lmsg32.msg_flags; + + namelenp = &((lx_msghdr32_t *)msg)->msg_namelen; + controllenp = &((lx_msghdr32_t *)msg)->msg_controllen; + flagsp = &((lx_msghdr32_t *)msg)->msg_flags; + } else +#endif /* defined(_LP64) */ + { + lx_msghdr_t lmsg; + if (copyin(msg, &lmsg, sizeof (lmsg)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = lmsg.msg_name; + smsg.msg_namelen = lmsg.msg_namelen; + smsg.msg_iov = lmsg.msg_iov; + smsg.msg_iovlen = lmsg.msg_iovlen; + smsg.msg_control = lmsg.msg_control; + smsg.msg_controllen = lmsg.msg_controllen; + smsg.msg_flags = lmsg.msg_flags; + + namelenp = &((lx_msghdr_t *)msg)->msg_namelen; + controllenp = &((lx_msghdr_t *)msg)->msg_controllen; + flagsp = &((lx_msghdr_t *)msg)->msg_flags; + } + + iovcnt = smsg.msg_iovlen; + if (iovcnt < 0 || iovcnt > IOV_MAX) { + return (set_errno(EMSGSIZE)); + } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * 
sizeof (struct iovec); + uiov = kmem_alloc(iovsize, KM_SLEEP); + } else if (iovcnt > 0) { + iovsize = 0; + uiov = luiov; + } else { + iovsize = 0; + uiov = NULL; + goto noiov; + } + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + /* convert from 32bit iovec structs */ + struct iovec32 luiov32[IOV_MAX_STACK], *uiov32; + ssize_t iov32size; + ssize32_t count32; + + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) { + uiov32 = kmem_alloc(iov32size, KM_SLEEP); + } else { + uiov32 = luiov32; + } + + if (copyin((struct iovec32 *)smsg.msg_iov, uiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(uiov32, iov32size); + kmem_free(uiov, iovsize); + } + + return (set_errno(EFAULT)); + } + + count32 = 0; + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32; + + iovlen32 = uiov32[i].iov_len; + count32 += iovlen32; + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(uiov32, iov32size); + kmem_free(uiov, iovsize); + } + + return (set_errno(EINVAL)); + } + + uiov[i].iov_len = iovlen32; + uiov[i].iov_base = + (caddr_t)(uintptr_t)uiov32[i].iov_base; + } + len = count32; + + if (iovsize != 0) { + kmem_free(uiov32, iov32size); + } + } else +#endif /* defined(_LP64) */ + { + if (copyin(smsg.msg_iov, uiov, + iovcnt * sizeof (struct iovec)) != 0) { + if (iovsize != 0) { + kmem_free(uiov, iovsize); + } + return (set_errno(EFAULT)); + } + + len = 0; + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = uiov[i].iov_len; + len += iovlen; + if (iovlen < 0 || len < 0) { + if (iovsize != 0) { + kmem_free(uiov, iovsize); + } + return (set_errno(EINVAL)); + } + } + } + +noiov: + /* Since the iovec is passed via the uio, NULL it out in the msg */ + smsg.msg_iov = NULL; + + xuio.xu_uio.uio_loffset = 0; + xuio.xu_uio.uio_iov = uiov; + xuio.xu_uio.uio_iovcnt = iovcnt; + xuio.xu_uio.uio_resid = len; + xuio.xu_uio.uio_segflg = UIO_USERSPACE; + xuio.xu_uio.uio_limit = 0; + + res = lx_recv_common(sock, &smsg, &xuio, flags, namelenp, controllenp, + 
flagsp); + + if (iovsize != 0) { + kmem_free(uiov, iovsize); + } + + return (res); +} + +long +lx_recvmmsg(int sock, void *msg, uint_t vlen, int flags, timespec_t *timeoutp) +{ + hrtime_t deadline = 0; + uint_t rcvd = 0; + long ret = 0; + boolean_t waitforone; + + waitforone = ((flags & LX_MSG_WAITFORONE) != 0); + flags &= ~LX_MSG_WAITFORONE; + + /* + * We want to limit the work that a thread calling recvmmsg() can + * perform in the kernel so that it cannot accrue too high a priority. + * Artificially capping vlen means that the thread will return to + * userspace after processing at most IOV_MAX messages, giving the + * system a chance to reset the thread priority. + * + * Linux does not cap vlen here and recvmmsg() is expected to return + * once vlen messages have been received, a timeout occurs, or if an + * error is encountered; the artificial cap adds another case. + * + * It is possible that returning "early" in this emulation will + * cause problems with some applications however a properly written + * recvmmsg() consumer should consume only the received datagrams + * and try again if it wants more. This may need revisiting in the + * future. + */ + if (vlen > IOV_MAX) + vlen = IOV_MAX; + + if (timeoutp != NULL) { + timespec_t timeout; + uhrtime_t utime = (uhrtime_t)gethrtime(); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &timeout, sizeof (timestruc_t))) + return (set_errno(EFAULT)); + } else { + timestruc32_t timeout32; + if (copyin(timeoutp, &timeout32, + sizeof (timestruc32_t))) + return (set_errno(EFAULT)); + timeout.tv_sec = (time_t)timeout32.tv_sec; + timeout.tv_nsec = timeout32.tv_nsec; + } + + if (itimerspecfix(&timeout)) + return (set_errno(EINVAL)); + + /* + * Make sure that deadline will not overflow. 
itimerspecfix() + * has already checked for negative values and too big a value + * in tv_nsec + */ + if (timeout.tv_sec >= HRTIME_MAX / NANOSEC) + return (set_errno(EINVAL)); + + utime += timeout.tv_sec * NANOSEC; + utime += timeout.tv_nsec; + + if (utime > HRTIME_MAX) + return (set_errno(EINVAL)); + + deadline = (hrtime_t)utime; + } + + for (rcvd = 0; rcvd < vlen; rcvd++) { + uint_t *ptr; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_mmsghdr_t *hdr = (lx_mmsghdr_t *)msg; + hdr += rcvd; + ret = lx_recvmsg(sock, (lx_msghdr_t *)hdr, flags); + ptr = &hdr->msg_len; + } else { + lx_mmsghdr32_t *hdr = (lx_mmsghdr32_t *)msg; + hdr += rcvd; + ret = lx_recvmsg(sock, (lx_msghdr32_t *)hdr, flags); + ptr = &hdr->msg_len; + } + if (ttolwp(curthread)->lwp_errno != 0) + break; + copyout(&ret, ptr, sizeof (*ptr)); + /* + * If MSG_WAITFORONE is set, set MSG_DONTWAIT after the + * first packet has been received. + */ + if (waitforone) { + flags |= LX_MSG_DONTWAIT; + waitforone = B_FALSE; + } + /* + * The Linux man page documents the timeout option as + * only being checked after each datagram is received. + * The man page does not document ETIMEDOUT as a return + * code so we do not set an errno. + */ + if (deadline > 0 && gethrtime() >= deadline) + break; + } + + if (rcvd > 0) { + /* + * Any error code is deliberately discarded if any message + * was successfully received. + */ + ttolwp(curthread)->lwp_errno = 0; + return (rcvd); + } + + return (ret); +} + +/* + * Custom version of socket_sendmsg for error-handling overrides. + */ +static int +lx_socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + cred_t *cr, boolean_t nosig) +{ + int error = 0; + ssize_t orig_resid = uiop->uio_resid; + + /* + * Do not bypass the cache if we are doing a local (AF_UNIX) write. 
+ */ + if (so->so_family == AF_UNIX) { + uiop->uio_extflg |= UIO_COPY_CACHED; + } else { + uiop->uio_extflg &= ~UIO_COPY_CACHED; + } + + error = SOP_SENDMSG(so, msg, uiop, cr); + + switch (error) { + case EINTR: + case ENOMEM: + /* EAGAIN is EWOULDBLOCK */ + case EWOULDBLOCK: + /* We did a partial send */ + if (uiop->uio_resid != orig_resid) { + error = 0; + } + break; + + case ENOTCONN: + /* + * The rules are different for non-blocking sockets which are + * still in the process of making a connection + */ + if ((msg->msg_flags & MSG_DONTWAIT) != 0 || + (uiop->uio_fmode & (FNONBLOCK|FNDELAY)) != 0) { + error = EAGAIN; + break; + } + + /* Appease LTP and match behavior detailed in the man page */ + error = EPIPE; + /* FALLTHROUGH */ + case EPIPE: + if (nosig == B_FALSE) { + tsignal(curthread, SIGPIPE); + } + break; + + default: + break; + } + + return (error); +} + +static long +lx_send_common(int sock, struct nmsghdr *msg, struct uio *uiop, int flags) +{ + struct sonode *so; + file_t *fp; + struct sockaddr *name = NULL; + socklen_t namelen; + void *control = NULL; + socklen_t controllen; + ssize_t len = 0; + int error; + boolean_t nosig; + + if ((so = getsonode(sock, &error, &fp)) == NULL) { + return (set_errno(error)); + } + + uiop->uio_fmode = fp->f_flag; + + /* Allocate and copyin name and control */ + if (msg->msg_name != NULL && msg->msg_namelen != 0) { + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + + error = ltos_sockaddr_copyin((struct sockaddr *)msg->msg_name, + msg->msg_namelen, &name, &namelen, NULL); + if (error != 0) { + goto done; + } + /* copyin_name null terminates addresses for AF_UNIX */ + msg->msg_namelen = namelen; + msg->msg_name = name; + } else { + msg->msg_name = name = NULL; + msg->msg_namelen = namelen = 0; + } + + if (msg->msg_control != NULL && msg->msg_controllen != 0) { + /* + * Verify that the length is not excessive to prevent + * an application from consuming all of kernel memory. 
+ */ + if (msg->msg_controllen > SO_MAXARGSIZE) { + error = EINVAL; + goto done; + } + if ((error = ltos_cmsgs_copyin(msg->msg_control, + msg->msg_controllen, &control, &controllen)) != 0) { + goto done; + } + msg->msg_control = control; + msg->msg_controllen = controllen; + } else { + msg->msg_control = control = NULL; + msg->msg_controllen = controllen = 0; + } + + len = uiop->uio_resid; + msg->msg_flags = lx_xlate_sock_flags(flags, LX_TO_SUNOS); + /* Default to XPG4.2 operation */ + msg->msg_flags |= MSG_XPG4_2; + nosig = ((flags & LX_MSG_NOSIGNAL) != 0); + + error = lx_socket_sendmsg(so, msg, uiop, CRED(), nosig); + if (error == EINTR) + lx_sock_syscall_restart(so, B_FALSE); +done: + if (control != NULL) { + kmem_free(control, controllen); + } + if (name != NULL) { + kmem_free(name, namelen); + } + if (error != 0) { + releasef(sock); + return (set_errno(error)); + } + lwp_stat_update(LWP_STAT_MSGSND, 1); + releasef(sock); + return (len - uiop->uio_resid); +} + +/* + * For both send and sendto Linux evaluates errors in a different order than + * we do internally. Specifically it will check the buffer address before + * checking if the socket is connected. This can lead to a different errno on + * us vs. Linux (seen with LTP) but we don't bother to emulate this. 
+ */ +long +lx_send(int sock, void *buffer, size_t len, int flags) +{ + struct nmsghdr smsg; + struct uio auio; + struct iovec aiov[1]; + + if ((ssize_t)len < 0) { + /* Keep len reasonably limited (see lx_recv) */ + return (set_errno(EINVAL)); + } + + aiov[0].iov_base = buffer; + aiov[0].iov_len = len; + auio.uio_loffset = 0; + auio.uio_iov = aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_limit = 0; + + smsg.msg_name = NULL; + smsg.msg_control = NULL; + return (lx_send_common(sock, &smsg, &auio, flags)); +} + +long +lx_sendto(int sock, void *buffer, size_t len, int flags, + struct sockaddr *dstaddr, socklen_t addrlen) +{ + struct nmsghdr smsg; + struct uio auio; + struct iovec aiov[1]; + + if ((ssize_t)len < 0) { + /* Keep len reasonably limited (see lx_recv) */ + return (set_errno(EINVAL)); + } + + aiov[0].iov_base = buffer; + aiov[0].iov_len = len; + auio.uio_loffset = 0; + auio.uio_iov = aiov; + auio.uio_iovcnt = 1; + auio.uio_resid = len; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_limit = 0; + + smsg.msg_name = (char *)dstaddr; + smsg.msg_namelen = addrlen; + smsg.msg_control = NULL; + return (lx_send_common(sock, &smsg, &auio, flags)); +} + +long +lx_sendmsg(int sock, void *msg, int flags) +{ + struct nmsghdr smsg; + struct uio auio; + struct iovec buf[IOV_MAX_STACK], *aiov; + int i, iovcnt, iovsize; + long res; + ssize_t len = 0; + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_msghdr32_t lmsg32; + if (copyin(msg, &lmsg32, sizeof (lmsg32)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = (void *)(uintptr_t)lmsg32.msg_name; + smsg.msg_namelen = lmsg32.msg_namelen; + smsg.msg_iov = (struct iovec *)(uintptr_t)lmsg32.msg_iov; + smsg.msg_iovlen = lmsg32.msg_iovlen; + smsg.msg_control = (void *)(uintptr_t)lmsg32.msg_control; + smsg.msg_controllen = lmsg32.msg_controllen; + smsg.msg_flags = lmsg32.msg_flags; + } else +#endif /* defined(_LP64) */ + { + lx_msghdr_t lmsg; + if 
(copyin(msg, &lmsg, sizeof (lmsg)) != 0) { + return (set_errno(EFAULT)); + } + smsg.msg_name = lmsg.msg_name; + smsg.msg_namelen = lmsg.msg_namelen; + smsg.msg_iov = lmsg.msg_iov; + smsg.msg_iovlen = lmsg.msg_iovlen; + smsg.msg_control = lmsg.msg_control; + smsg.msg_controllen = lmsg.msg_controllen; + smsg.msg_flags = lmsg.msg_flags; + } + + iovcnt = smsg.msg_iovlen; + if (iovcnt <= 0 || iovcnt > IOV_MAX) { + return (set_errno(EMSGSIZE)); + } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } else { + iovsize = 0; + aiov = buf; + } + +#if defined(_LP64) + if (get_udatamodel() != DATAMODEL_NATIVE) { + /* convert from 32bit iovec structs */ + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; + ssize32_t count32; + + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) { + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + } + + if (copyin((struct iovec32 *)smsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + + return (set_errno(EFAULT)); + } + + count32 = 0; + for (i = 0; i < iovcnt; i++) { + ssize32_t iovlen32; + + iovlen32 = aiov32[i].iov_len; + count32 += iovlen32; + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + + return (set_errno(EINVAL)); + } + + aiov[i].iov_len = iovlen32; + aiov[i].iov_base = + (caddr_t)(uintptr_t)aiov32[i].iov_base; + } + len = count32; + + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + } + } else +#endif /* defined(_LP64) */ + { + if (copyin(smsg.msg_iov, aiov, + iovcnt * sizeof (struct iovec)) != 0) { + if (iovsize != 0) { + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } + + len = 0; + for (i = 0; i < iovcnt; i++) { + ssize_t iovlen = aiov[i].iov_len; + + len += iovlen; + if (iovlen < 0 || len < 0) { + if (iovsize != 0) { + kmem_free(aiov, iovsize); + } + return 
(set_errno(EINVAL)); + } + } + } + /* Since the iovec is passed via the uio, NULL it out in the msg */ + smsg.msg_iov = NULL; + + auio.uio_loffset = 0; + auio.uio_iov = aiov; + auio.uio_iovcnt = iovcnt; + auio.uio_resid = len; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_limit = 0; + + res = lx_send_common(sock, &smsg, &auio, flags); + + if (iovsize != 0) { + kmem_free(aiov, iovsize); + } + + return (res); +} + +long +lx_sendmmsg(int sock, void *msg, uint_t vlen, int flags) +{ + long ret = 0; + uint_t sent = 0; + + /* + * Linux caps vlen to UIO_MAXIOV (1024). + */ + if (vlen > IOV_MAX) + vlen = IOV_MAX; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + lx_mmsghdr_t *hdr = msg; + + for (sent = 0; sent < vlen; sent++, hdr++) { + ret = lx_sendmsg(sock, (lx_msghdr_t *)hdr, flags); + if (ttolwp(curthread)->lwp_errno != 0) + break; + copyout(&ret, &hdr->msg_len, sizeof (hdr->msg_len)); + } + } else { + lx_mmsghdr32_t *hdr = msg; + + for (sent = 0; sent < vlen; sent++, hdr++) { + ret = lx_sendmsg(sock, (lx_msghdr32_t *)hdr, flags); + if (ttolwp(curthread)->lwp_errno != 0) + break; + copyout(&ret, &hdr->msg_len, sizeof (hdr->msg_len)); + } + } + + if (sent > 0) { + /* + * Any error code is deliberately discarded if any message + * was successfully sent. + */ + ttolwp(curthread)->lwp_errno = 0; + return (sent); + } + + return (ret); +} + +/* + * Linux socket option type definitions + * + * The protocol `levels` are well defined (see in.h) The option values are + * not so well defined. Linux often uses different values vs. Illumos + * although they mean the same thing. For example, IP_TOS in Linux is + * defined as value 1 but in Illumos it is defined as value 3. This table + * maps all the Protocol levels to their options and maps them between + * Linux and Illumos and vice versa. Hence the reason for the complexity. + * + * For a certain subset of sockopts, Linux will implicitly truncate optval + * input, so long as optlen meets a minimum size. 
/*
 * One entry of a Linux->native socket option table.  Tables are indexed
 * directly by the Linux option number.
 */
typedef struct lx_sockopt_map {
	const int lsm_opt;	/* Illumos-native equivalent, or OPTNOTSUP */
	const int lsm_lcap;	/* Cap optlen to this size. (Ignored if 0) */
} lx_sockopt_map_t;

/*
 * A table of lx_sockopt_map_t entries together with its length, as built
 * by PROTO_SOCKOPTS().
 */
typedef struct lx_proto_opts {
	const lx_sockopt_map_t *lpo_entries;	/* Linux->SunOS map entries */
	/*
	 * Number of entries in lpo_entries.  PROTO_SOCKOPTS() sets this to
	 * the array element count, so it is one past the largest valid
	 * index rather than the largest valid index itself.
	 */
	unsigned int lpo_max;
} lx_proto_opts_t;

#define	OPTNOTSUP	-1		/* we don't support it */

/* Initializer for a lx_proto_opts_t describing the array `opts'. */
#define	PROTO_SOCKOPTS(opts)	\
	{ (opts), sizeof ((opts)) / sizeof ((opts)[0]) }

/* Shorten name so the columns can line up */
#define	IP_MREQ_SZ	sizeof (struct ip_mreq)
IP_MULTICAST_LOOP, sizeof (int) }, /* IP_MULTICAST_LOOP */ + { IP_ADD_MEMBERSHIP, IP_MREQ_SZ }, /* IP_ADD_MEMBERSHIP */ + { IP_DROP_MEMBERSHIP, IP_MREQ_SZ }, /* IP_DROP_MEMBERSHIP */ + { IP_UNBLOCK_SOURCE, 0 }, /* IP_UNBLOCK_SOURCE */ + { IP_BLOCK_SOURCE, 0 }, /* IP_BLOCK_SOURCE */ + { IP_ADD_SOURCE_MEMBERSHIP, 0 }, /* IP_ADD_SOURCE_MEMBERSHIP */ + { OPTNOTSUP, 0 }, /* IP_DROP_SOURCE_MEMBERSHIP */ + { OPTNOTSUP, 0 }, /* IP_MSFILTER */ + { MCAST_JOIN_GROUP, 0 }, /* MCAST_JOIN_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_BLOCK_SOURCE */ + { OPTNOTSUP, 0 }, /* MCAST_UNBLOCK_SOURCE */ + { MCAST_LEAVE_GROUP, 0 }, /* MCAST_LEAVE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_JOIN_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_LEAVE_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_MSFILTER */ + { OPTNOTSUP, 0 }, /* IP_MULTICAST_ALL */ + { OPTNOTSUP, 0 } /* IP_UNICAST_IF */ +}; + +/* Shorten name so the columns can line up */ +#define IP6_MREQ_SZ sizeof (struct ipv6_mreq) + +static const lx_sockopt_map_t ltos_ipv6_sockopts[LX_IPV6_TCLASS + 1] = { + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* IPV6_ADDRFORM */ + { OPTNOTSUP, 0 }, /* IPV6_2292PKTINFO */ + { OPTNOTSUP, 0 }, /* IPV6_2292HOPOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_2292DSTOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_2292RTHDR */ + { OPTNOTSUP, 0 }, /* IPV6_2292PKTOPTIONS */ + { IPV6_CHECKSUM, sizeof (int) }, /* IPV6_CHECKSUM */ + { OPTNOTSUP, 0 }, /* IPV6_2292HOPLIMIT */ + { OPTNOTSUP, 0 }, /* IPV6_NEXTHOP */ + { OPTNOTSUP, 0 }, /* IPV6_AUTHHDR */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IPV6_UNICAST_HOPS, sizeof (int) }, /* IPV6_UNICAST_HOPS */ + { IPV6_MULTICAST_IF, sizeof (int) }, /* IPV6_MULTICAST_IF */ + { IPV6_MULTICAST_HOPS, sizeof (int) }, /* IPV6_MULTICAST_HOPS */ + { IPV6_MULTICAST_LOOP, sizeof (int) }, /* IPV6_MULTICAST_LOOP */ + { IPV6_ADD_MEMBERSHIP, IP6_MREQ_SZ }, /* IPV6_JOIN_GROUP */ + { IPV6_DROP_MEMBERSHIP, IP6_MREQ_SZ }, /* IPV6_LEAVE_GROUP */ + { OPTNOTSUP, 0 }, /* 
IPV6_ROUTER_ALERT */ + { OPTNOTSUP, 0 }, /* IPV6_MTU_DISCOVER */ + { OPTNOTSUP, 0 }, /* IPV6_MTU */ + { OPTNOTSUP, 0 }, /* IPV6_RECVERR */ + { IPV6_V6ONLY, sizeof (int) }, /* IPV6_V6ONLY */ + { OPTNOTSUP, 0 }, /* IPV6_JOIN_ANYCAST */ + { OPTNOTSUP, 0 }, /* IPV6_LEAVE_ANYCAST */ + { OPTNOTSUP, 0 }, /* IPV6_IPSEC_POLICY */ + { OPTNOTSUP, 0 }, /* IPV6_XFRM_POLICY */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { MCAST_JOIN_GROUP, 0 }, /* MCAST_JOIN_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_BLOCK_SOURCE */ + { OPTNOTSUP, 0 }, /* MCAST_UNBLOCK_SOURCE */ + { MCAST_LEAVE_GROUP, 0 }, /* MCAST_LEAVE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_JOIN_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_LEAVE_SOURCE_GROUP */ + { OPTNOTSUP, 0 }, /* MCAST_MSFILTER */ + { IPV6_RECVPKTINFO, sizeof (int) }, /* IPV6_RECVPKTINFO */ + { IPV6_PKTINFO, 0 }, /* IPV6_PKTINFO */ + { IPV6_RECVHOPLIMIT, sizeof (int) }, /* IPV6_RECVHOPLIMIT */ + { IPV6_HOPLIMIT, 0 }, /* IPV6_HOPLIMIT */ + { OPTNOTSUP, 0 }, /* IPV6_RECVHOPOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_HOPOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_RTHDRDSTOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_RECVRTHDR */ + { OPTNOTSUP, 0 }, /* IPV6_RTHDR */ + { OPTNOTSUP, 0 }, /* IPV6_RECVDSTOPTS */ + { OPTNOTSUP, 0 }, /* IPV6_DSTOPTS */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IPV6_RECVTCLASS, sizeof (int) }, /* IPV6_RECVTCLASS */ + { IPV6_TCLASS, sizeof (int) } /* IPV6_TCLASS */ +}; + +static const lx_sockopt_map_t ltos_icmpv6_sockopts[LX_ICMP6_FILTER + 1] = { + { OPTNOTSUP, 0 }, + { ICMP6_FILTER, 0 } /* ICMP6_FILTER */ +}; + +/* + * Options marked as "in code" in their comment are handled in the + * lx_setsockopt_tcp() and lx_getsockopt_tcp() functions. 
+ * + * For the Linux TCP_SYNCNT option (the number of SYN retransmits) we emulate + * that by interpreting the two connection interval settings: + * TCP_CONN_NOTIFY_THRESHOLD + * tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval + * TCP_CONN_ABORT_THRESHOLD + * tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval + * The system (re)transmits a SYN and performs a doubling backoff from the + * first timer until it passes the second timer. We determine the SYN count + * from these two values. Normally it will be 5. Also see the TCPS_SYN_SENT + * case in tcp_timer(); a tcp_second_ctimer_threshold value of 0 means to + * retransmit SYN indefinitely. + * + * For the Linux TCP_USER_TIMEOUT option we use our TCP_ABORT_THRESHOLD since + * this seems to be the closest match. This value is the + * tcp_second_timer_threshold, which gets initialized to the + * tcp_ip_abort_interval value. The tunable guide describes this as: + * For a given TCP connection, if TCP has been retransmitting for + * tcp_ip_abort_interval period of time and it has not received any + * acknowledgment from the other endpoint during this period, TCP closes + * this connection. + * The value is in milliseconds, which matches TCP_USER_TIMEOUT. 
+ */ +static const lx_sockopt_map_t ltos_tcp_sockopts[LX_TCP_NOTSENT_LOWAT + 1] = { + { OPTNOTSUP, 0 }, + { TCP_NODELAY, sizeof (int) }, /* TCP_NODELAY */ + { TCP_MAXSEG, sizeof (int) }, /* TCP_MAXSEG - in code */ + { TCP_CORK, sizeof (int) }, /* TCP_CORK */ + { TCP_KEEPIDLE, sizeof (int) }, /* TCP_KEEPIDLE */ + { TCP_KEEPINTVL, sizeof (int) }, /* TCP_KEEPINTVL */ + { TCP_KEEPCNT, sizeof (int) }, /* TCP_KEEPCNT */ + { OPTNOTSUP, 0 }, /* TCP_SYNCNT - in code */ + { TCP_LINGER2, sizeof (int) }, /* TCP_LINGER2 */ + { OPTNOTSUP, 0 }, /* TCP_DEFER_ACCEPT - in code */ + { OPTNOTSUP, 0 }, /* TCP_WINDOW_CLAMP - in code */ + { OPTNOTSUP, 0 }, /* TCP_INFO */ + { TCP_QUICKACK, sizeof (int) }, /* TCP_QUICKACK */ + { TCP_CONGESTION, CC_ALGO_NAME_MAX }, /* TCP_CONGESTION */ + { OPTNOTSUP, 0 }, /* TCP_MD5SIG */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* TCP_THIN_LINEAR_TIMEOUTS */ + { OPTNOTSUP, 0 }, /* TCP_THIN_DUPACK */ + { TCP_ABORT_THRESHOLD, sizeof (int) }, /* TCP_USER_TIMEOUT */ + { OPTNOTSUP, 0 }, /* TCP_REPAIR */ + { OPTNOTSUP, 0 }, /* TCP_REPAIR_QUEUE */ + { OPTNOTSUP, 0 }, /* TCP_QUEUE_SEQ */ + { OPTNOTSUP, 0 }, /* TCP_REPAIR_OPTIONS */ + { OPTNOTSUP, 0 }, /* TCP_FASTOPEN */ + { OPTNOTSUP, 0 }, /* TCP_TIMESTAMP */ + { OPTNOTSUP, 0 } /* TCP_NOTSENT_LOWAT */ +}; + +static const lx_sockopt_map_t ltos_igmp_sockopts[IGMP_MTRACE + 1] = { + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IGMP_MINLEN, 0 }, /* IGMP_MINLEN */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IGMP_MEMBERSHIP_QUERY, 0 }, /* IGMP_HOST_MEMBERSHIP_QUERY */ + { IGMP_V1_MEMBERSHIP_REPORT, 0 }, /* IGMP_HOST_MEMBERSHIP_REPORT */ + { IGMP_DVMRP, 0 }, /* IGMP_DVMRP */ + { IGMP_PIM, 0 }, /* IGMP_PIM */ + { OPTNOTSUP, 0 }, /* IGMP_TRACE */ + { IGMP_V2_MEMBERSHIP_REPORT, 0 }, /* 
IGMPV2_HOST_MEMBERSHIP_REPORT */ + { IGMP_V2_LEAVE_GROUP, 0 }, /* IGMP_HOST_LEAVE_MESSAGE */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, + { IGMP_MTRACE_RESP, 0 }, /* IGMP_MTRACE_RESP */ + { IGMP_MTRACE, 0 } /* IGMP_MTRACE */ +}; + +static const lx_sockopt_map_t ltos_socket_sockopts[LX_SO_BPF_EXTENSIONS + 1] = { + { OPTNOTSUP, 0 }, + { SO_DEBUG, sizeof (int) }, /* SO_DEBUG */ + { SO_REUSEADDR, sizeof (int) }, /* SO_REUSEADDR */ + { SO_TYPE, 0 }, /* SO_TYPE */ + { SO_ERROR, 0 }, /* SO_ERROR */ + { SO_DONTROUTE, sizeof (int) }, /* SO_DONTROUTE */ + { SO_BROADCAST, sizeof (int) }, /* SO_BROADCAST */ + { SO_SNDBUF, sizeof (int) }, /* SO_SNDBUF */ + { SO_RCVBUF, sizeof (int) }, /* SO_RCVBUF */ + { SO_KEEPALIVE, sizeof (int) }, /* SO_KEEPALIVE */ + { SO_OOBINLINE, sizeof (int) }, /* SO_OOBINLINE */ + { OPTNOTSUP, 0 }, /* SO_NO_CHECK */ + { OPTNOTSUP, 0 }, /* SO_PRIORITY */ + { SO_LINGER, 0 }, /* SO_LINGER */ + { OPTNOTSUP, 0 }, /* SO_BSDCOMPAT */ + { SO_REUSEPORT, sizeof (int) }, /* SO_REUSEPORT */ + { SO_RECVUCRED, sizeof (int) }, /* SO_PASSCRED */ + { OPTNOTSUP, 0 }, /* SO_PEERCRED */ + { SO_RCVLOWAT, sizeof (int) }, /* SO_RCVLOWAT */ + { SO_SNDLOWAT, sizeof (int) }, /* SO_SNDLOWAT */ + { SO_RCVTIMEO, 0 }, /* SO_RCVTIMEO */ + { SO_SNDTIMEO, 0 }, /* SO_SNDTIMEO */ + { OPTNOTSUP, 0 }, /* SO_SECURITY_AUTHENTICATION */ + { OPTNOTSUP, 0 }, /* SO_SECURITY_ENCRYPTION_TRANSPORT */ + { OPTNOTSUP, 0 }, /* SO_SECURITY_ENCRYPTION_NETWORK */ + { OPTNOTSUP, 0 }, /* SO_BINDTODEVICE */ + { SO_ATTACH_FILTER, 0 }, /* SO_ATTACH_FILTER */ + { SO_DETACH_FILTER, 0 }, /* SO_DETACH_FILTER */ + { OPTNOTSUP, 0 }, /* SO_PEERNAME */ + { SO_TIMESTAMP, sizeof (int) }, /* SO_TIMESTAMP */ + { SO_ACCEPTCONN, 0 }, /* SO_ACCEPTCONN */ + { OPTNOTSUP, 0 }, /* SO_PEERSEC */ + { SO_SNDBUF, sizeof (int) }, /* SO_SNDBUFFORCE */ + { SO_RCVBUF, sizeof (int) }, /* SO_RCVBUFFORCE */ + { OPTNOTSUP, 0 }, /* SO_PASSSEC */ + { OPTNOTSUP, 0 }, 
/* SO_TIMESTAMPNS */ + { OPTNOTSUP, 0 }, /* SO_MARK */ + { OPTNOTSUP, 0 }, /* SO_TIMESTAMPING */ + { SO_PROTOTYPE, 0 }, /* SO_PROTOCOL */ + { SO_DOMAIN, 0 }, /* SO_DOMAIN */ + { OPTNOTSUP, 0 }, /* SO_RXQ_OVFL */ + { OPTNOTSUP, 0 }, /* SO_WIFI_STATUS */ + { OPTNOTSUP, 0 }, /* SO_PEEK_OFF */ + { OPTNOTSUP, 0 }, /* SO_NOFCS */ + { OPTNOTSUP, 0 }, /* SO_LOCK_FILTER */ + { OPTNOTSUP, 0 }, /* SO_SELECT_ERR_QUEUE */ + { OPTNOTSUP, 0 }, /* SO_BUSY_POLL */ + { OPTNOTSUP, 0 }, /* SO_MAX_PACING_RATE */ + { OPTNOTSUP, 0 } /* SO_BPF_EXTENSIONS */ +}; + +static const lx_sockopt_map_t ltos_raw_sockopts[LX_ICMP_FILTER + 1] = { + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 } /* ICMP_FILTER */ +}; + +static const lx_sockopt_map_t ltos_packet_sockopts[LX_PACKET_STATISTICS + 1] = { + { OPTNOTSUP, 0 }, + { PACKET_ADD_MEMBERSHIP, 0 }, /* PACKET_ADD_MEMBERSHIP */ + { PACKET_DROP_MEMBERSHIP, 0 }, /* PACKET_DROP_MEMBERSHIP */ + { OPTNOTSUP, 0 }, /* PACKET_RECV_OUTPUT */ + { OPTNOTSUP, 0 }, + { OPTNOTSUP, 0 }, /* PACKET_RX_RING */ + { PACKET_STATISTICS, 0 } /* PACKET_STATISTICS */ +}; + +/* Needed for SO_ATTACH_FILTER */ +struct lx_bpf_program { + unsigned short bf_len; + caddr_t bf_insns; +}; + +/* Invert filter fields as Linux expects */ +#define LX_ICMP6_FILTER_INVERT(filterp) ( \ + ((filterp)->__icmp6_filt[0] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[1] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[2] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[3] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[4] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[5] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[6] ^= 0xFFFFFFFFU), \ + ((filterp)->__icmp6_filt[7] ^= 0xFFFFFFFFU)) + +static boolean_t +lx_sockopt_lookup(lx_proto_opts_t tbl, int *optname, socklen_t *optlen) +{ + const lx_sockopt_map_t *entry; + + if (*optname > tbl.lpo_max) { + return (B_FALSE); + } + entry = &tbl.lpo_entries[*optname]; + if (entry->lsm_opt == OPTNOTSUP) { + return (B_FALSE); + } + *optname = entry->lsm_opt; + /* Truncate the optlen 
if needed/allowed */ + if (entry->lsm_lcap != 0 && *optlen > entry->lsm_lcap) { + *optlen = entry->lsm_lcap; + } + return (B_TRUE); +} + +static int +lx_mcast_common(sonode_t *so, int level, int optname, void *optval, + socklen_t optlen) +{ + int error; + struct group_req gr; + lx_sockaddr_storage_t *lxss; + + ASSERT(optname == LX_MCAST_JOIN_GROUP || + optname == LX_MCAST_LEAVE_GROUP); + + /* + * For MCAST_JOIN_GROUP and MCAST_LEAVE_GROUP, Linux uses a + * gr_group that has a different size from the native gr_group. + * We need to translate to the native gr_group taking special + * care to do the right thing when dealing with a 32-bit program + * making a call into a 64-bit kernel. + */ + + bzero(&gr, sizeof (gr)); + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + if (optlen != sizeof (lx_group_req32_t)) { + return (EINVAL); + } + + lx_group_req32_t *lxgr = optval; + + /* use the 32-bit type */ + gr.gr_interface = lxgr->lxgr_interface; + lxss = &lxgr->lxgr_group; + } else +#endif /* defined(_SYSCALL32_IMPL) */ + { + if (optlen != sizeof (lx_group_req_t)) { + return (EINVAL); + } + + lx_group_req_t *lxgr = optval; + + gr.gr_interface = lxgr->lxgr_interface; + lxss = &lxgr->lxgr_group; + } + + bcopy(lxss, &gr.gr_group, sizeof (*lxss)); + gr.gr_group.ss_family = LTOS_FAMILY(lxss->lxss_family); + + optlen = sizeof (gr); + optname = (optname == LX_MCAST_JOIN_GROUP) ? + MCAST_JOIN_GROUP : MCAST_LEAVE_GROUP; + + error = socket_setsockopt(so, level, optname, &gr, + optlen, CRED()); + return (error); +} + + +/* + * NOTE: For now, the following mess applies to TCP (i.e. AF_INET{,6} + + * SOCK_STREAM) only, until we enable SO_REUSEPORT for other socket/protocol + * types as well. The lx_so_needs_reusehandling() macro indicates what + * socket(s) apply to the following mess. 
+ */ +#define lx_so_needs_reusehandling(so) ((so)->so_type == SOCK_STREAM && \ + ((so)->so_family == AF_INET || (so)->so_family == AF_INET6)) + +/* + * So in Linux, the SO_REUSEADDR includes, essentially, SO_REUSEPORT as part + * of its functionality. Experiments on CentOS 7 with a 3.10-ish kernel show + * that querying on SO_REUSEPORT show it's "off" if SO_REUSEADDR gets set. + * This means we can't count on directly querying the native socket state. We + * munge things here in LX-land to essentially turn on both REUSEADDR and + * REUSEPORT in native conn_t state for LX processes that set SO_REUSEADDR. + * + * We also keep track if the wily Linux app sends BOTH REUSEADDR and REUSEPORT + * down. We can return that both are on, or if it uses just REUSEADDR, we + * don't return yes for a check of REUSEPORT. This means our conn_t state may + * be different than what an LX process will see. "REUSEPORT" for LX may be + * off, but internally it will be on. + * + * BEGIN CSTYLED + * State table for internal conn_reuse{addr,port}: + * + * LX ADDR,PORT Int. ADDR,PORT New ADDR New LX New Int. LXchg? Intchg? + * ============ ============== ======== ====== ======== ====== ======= + * + * off,off off,off off off,off off,off NO NO + * + * off,off off,off on on,off on,on YES YES(2) + * + * off,on off,on off off,on off,on NO NO + * + * off,on off,on on on,on on,on YES YES + * + * on,off on,on off off,off off,off YES YES(2) + * + * on,off on,on on on,off on,on NO NO + * + * on,on on,on off off,on off,on YES YES + * + * on,on on,on on on,on on,on NO NO + * + * + * LX ADDR,PORT Int. ADDR,PORT New PORT New LX New Int. LXchg? Intchg? 
/*
 * Apply a Linux SO_REUSEADDR or SO_REUSEPORT setsockopt to a socket.
 *
 * The Linux-visible state of the two options is tracked in the socket's
 * auxiliary data (LXSAD_FL_EMULRUADDR / LXSAD_FL_EMULRUPORT), while the
 * native SO_REUSEADDR / SO_REUSEPORT options are set via
 * socket_setsockopt().  Enabling Linux SO_REUSEADDR also enables native
 * SO_REUSEPORT; an explicit SO_REUSEPORT change made while the emulated
 * SO_REUSEADDR is on only updates the flag.
 *
 *	so	target socket
 *	optname	LX_SO_REUSEADDR or LX_SO_REUSEPORT (asserted)
 *	optval	pointer to an int; non-zero enables
 *	optlen	must be sizeof (int)
 *
 * Returns 0 on success, EINVAL for a bad optlen, or the error from
 * socket_setsockopt().  The auxiliary data lock obtained through
 * lx_sad_acquire() is dropped on every return path after acquisition.
 */
static int
lx_set_reuse_handler(sonode_t *so, int optname, void *optval, socklen_t optlen)
{
	lx_socket_aux_data_t *sad;
	boolean_t enable;
	int error;

	if (optlen != sizeof (int))
		return (EINVAL);
	enable = (*((int *)optval) != 0);

	ASSERT(optname == LX_SO_REUSEADDR || optname == LX_SO_REUSEPORT);
	sad = lx_sad_acquire(SOTOV(so));

	/*
	 * lx_sad_acquire() holds its mutex for us.  This protects us
	 * against racing option-setters on the same socket.
	 */
	if (optname == LX_SO_REUSEADDR) {
		/* Check if already set to what we want! */
		if (enable ==
		    ((sad->lxsad_flags & LXSAD_FL_EMULRUADDR) != 0)) {
			mutex_exit(&sad->lxsad_lock);
			return (0);
		}

		/*
		 * At this point, we know we need to change SO_REUSEADDR,
		 * Linux-style.  We know these are supported options too,
		 * so we don't bother with any lookup.
		 */
		error = socket_setsockopt(so, SOL_SOCKET, SO_REUSEADDR,
		    optval, optlen, CRED());
		if (error != 0) {
			mutex_exit(&sad->lxsad_lock);
			return (error);
		}
		if (enable)
			sad->lxsad_flags |= LXSAD_FL_EMULRUADDR;
		else
			sad->lxsad_flags &= ~LXSAD_FL_EMULRUADDR;

		/*
		 * At THIS point, we need to figure out if we ALSO need to
		 * toggle the native-side SO_REUSEPORT state because Linux's
		 * SO_REUSEADDR ALSO includes the moral equivalent of
		 * SO_REUSEPORT.  There may be further subtleties, but for now
		 * assume a Linux app that uses SO_REUSEADDR wants that
		 * SO_REUSEPORT functionality thrown in for free.
		 *
		 * Check for SO_REUSEPORT already enabled first.
		 */
		if ((sad->lxsad_flags & LXSAD_FL_EMULRUPORT) != 0) {
			/* Someone turned on REUSEPORT first, we're good. */
			mutex_exit(&sad->lxsad_lock);
			return (0);
		}

		/*
		 * Fall through to REUSEPORT setting, it'll know it's a
		 * supplement based on (optname == LX_SO_REUSEADDR).
		 */
	} else if (enable ==
	    ((sad->lxsad_flags & LXSAD_FL_EMULRUPORT) != 0)) {
		/*
		 * If we reach here, we're setting REUSEPORT to what it's
		 * already set.
		 */
		ASSERT3U(optname, ==, LX_SO_REUSEPORT);
		mutex_exit(&sad->lxsad_lock);
		return (0);
	}

	if (optname == LX_SO_REUSEPORT &&
	    ((sad->lxsad_flags & LXSAD_FL_EMULRUADDR) != 0)) {
		/*
		 * Corner case: REUSEPORT change *but* REUSEADDR is still
		 * enabled.  We must not alter conn_t/native state here, as
		 * REUSEADDR *needs* REUSEPORT enabled on conn_t/native state.
		 * If we want to enable REUSEPORT, the setsockopt would be a
		 * NOP.  If we want to disable it, we MUST NOT turn off native
		 * REUSEPORT lest we break Linux-like behavior, and instead
		 * merely turn off the LXSAD_FL_EMULRUPORT flag.
		 */
		error = 0;
	} else {
		/*
		 * At this point, we need to change REUSEPORT.  We may be
		 * doing it for an actual REUSEPORT change, OR for Linux
		 * REUSEADDR semantics.  As earlier, we know the option map
		 * lookup is superfluous.
		 */
		error = socket_setsockopt(so, SOL_SOCKET, SO_REUSEPORT, optval,
		    optlen, CRED());
	}

	if (error != 0 && optname == LX_SO_REUSEADDR) {
		int addr_error, revert_to_optval;

		ASSERT0(sad->lxsad_flags & LXSAD_FL_EMULRUPORT);
		/*
		 * We need more cleanup if the REUSEPORT change fails during
		 * an actual REUSEADDR set: undo the flag update made above
		 * and put native SO_REUSEADDR back to its previous value.
		 */
		if (enable) {
			sad->lxsad_flags &= ~LXSAD_FL_EMULRUADDR;
			revert_to_optval = 0;
		} else {
			sad->lxsad_flags |= LXSAD_FL_EMULRUADDR;
			revert_to_optval = 1;
		}

		/* Just hardwire it, we're in trouble! */
		addr_error = socket_setsockopt(so, SOL_SOCKET, SO_REUSEADDR,
		    &revert_to_optval, optlen, CRED());
		if (addr_error != 0) {
			/*
			 * Well this sucks, we really shot ourselves in the
			 * foot.  We should somehow signal a catastrophic
			 * error.  For now, just return the one we had earlier.
			 * Note that the emulation flag has already been
			 * reverted above even though the native option was
			 * not.
			 */
			DTRACE_PROBE1(lx__reuse__seconderr, int, addr_error);
			mutex_exit(&sad->lxsad_lock);
			return (error);
		}
		/*
		 * Else we managed successfully to clean up and can fall
		 * through the normal error path.
		 */
	} else if (error == 0 && optname == LX_SO_REUSEPORT) {
		/* We successfully changed REUSEPORT explicitly. */
		if (enable)
			sad->lxsad_flags |= LXSAD_FL_EMULRUPORT;
		else
			sad->lxsad_flags &= ~LXSAD_FL_EMULRUPORT;
	}
	/* Else it's an error for an explicit REUSEPORT, just return. */

	mutex_exit(&sad->lxsad_lock);
	return (error);
}
+ */ + if (optlen < sizeof (int)) { + val = *((uint8_t *)optval); + } else { + val = *((int *)optval); + } + + switch (val) { + case LX_IP_PMTUDISC_DONT: + val = 1; + break; + + case LX_IP_PMTUDISC_DO: + case LX_IP_PMTUDISC_WANT: + val = 0; + break; + + default: + return (EOPNOTSUPP); + } + + error = socket_setsockopt(so, IPPROTO_IP, IP_DONTFRAG, + &val, sizeof (val), CRED()); + return (error); + } + + case LX_IP_MULTICAST_TTL: + case LX_IP_MULTICAST_LOOP: + /* + * For IP_MULTICAST_TTL and IP_MULTICAST_LOOP, Linux defines + * the option value to be an integer while we define it to be + * an unsigned character. To prevent the kernel from spitting + * back an error on an illegal length, verify that the option + * value is less than UCHAR_MAX before truncating optlen. + */ + if (optlen <= 0 || optlen > sizeof (int) || + *intval > UINT8_MAX) { + return (EINVAL); + } + optlen = sizeof (uchar_t); + break; + + case LX_MCAST_JOIN_GROUP: + case LX_MCAST_LEAVE_GROUP: + error = lx_mcast_common(so, IPPROTO_IP, optname, optval, + optlen); + return (error); + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_IP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts); + + switch (optname) { + case LX_IPV6_MTU: + /* + * There isn't a good translation for IPV6_MTU and certain apps + * such as bind9 will bail if it cannot be set. + * We just lie about the success for now. 
+ */ + return (0); + case LX_MCAST_JOIN_GROUP: + case LX_MCAST_LEAVE_GROUP: + error = lx_mcast_common(so, IPPROTO_IPV6, optname, optval, + optlen); + return (error); + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + error = socket_setsockopt(so, IPPROTO_IPV6, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_icmpv6(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts); + + if (optname == LX_ICMP6_FILTER && optval != NULL) { + /* + * Surprise! The input to ICMP6_FILTER on Linux is inverted + * when compared to illumos. + */ + if (optlen != sizeof (icmp6_filter_t)) { + return (EINVAL); + } + LX_ICMP6_FILTER_INVERT((icmp6_filter_t *)optval); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + error = socket_setsockopt(so, IPPROTO_ICMPV6, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts); + cred_t *cr = CRED(); + uint32_t rto_max, abrt_thresh; + boolean_t abrt_changed = B_FALSE, rto_max_changed = B_FALSE; + + if (optname == LX_TCP_WINDOW_CLAMP) { + /* It appears safe to lie and say we did this. */ + return (0); + } + + if (optname == LX_TCP_MAXSEG) { + /* + * We can get, but not set, TCP_MAXSEG. However, it appears + * safe to lie and say we did this. A future extension might + * be to allow setting this before a connection is established. + */ + return (0); + } + + if (optname == LX_TCP_SYNCNT) { + int intval; + uint64_t syn_last_backoff; + uint_t syn_cnt, syn_backoff, len; + + /* + * See the comment above the ltos_tcp_sockopts table for an + * explanation of the TCP_SYNCNT emulation. 
+ */ + if (optlen != sizeof (int)) { + return (EINVAL); + } + intval = *(int *)optval; + if (intval > 255) { + return (EINVAL); + } + + len = sizeof (syn_backoff); + error = socket_getsockopt(so, IPPROTO_TCP, + TCP_CONN_NOTIFY_THRESHOLD, &syn_backoff, &len, 0, cr); + if (error != 0) + return (error); + + syn_last_backoff = syn_backoff; + for (syn_cnt = 0; syn_cnt < intval; syn_cnt++) { + syn_last_backoff *= 2; + /* + * Since the tcps_ip_abort_cinterval is milliseconds and + * stored as a uint_t, it's basically impossible to get + * up to the Linux limit of 255 SYN retries due to the + * doubling on the backoff. + */ + if (syn_last_backoff > UINT_MAX) { + return (EINVAL); + } + } + + syn_backoff = (uint_t)syn_last_backoff; + error = socket_setsockopt(so, IPPROTO_TCP, + TCP_CONN_ABORT_THRESHOLD, &syn_backoff, len, cr); + return (error); + } + + if (optname == LX_TCP_DEFER_ACCEPT) { + int *intval; + char *dfp; + + /* + * Emulate TCP_DEFER_ACCEPT using the datafilt(7M) socket + * filter but we can't emulate the timeout aspect so treat any + * non-zero value as enabling and zero as disabling. + */ + if (optlen != sizeof (int)) { + return (EINVAL); + } + intval = (int *)optval; + + /* + * socket_setsockopt asserts that the optval is aligned, so + * we use kmem_alloc to ensure this. + */ + dfp = (char *)kmem_alloc(sizeof (DATAFILT), KM_SLEEP); + (void) strcpy(dfp, DATAFILT); + + if (*intval > 0) { + error = socket_setsockopt(so, SOL_FILTER, FIL_ATTACH, + dfp, 9, cr); + if (error == EEXIST) { + error = 0; + } + } else { + error = socket_setsockopt(so, SOL_FILTER, FIL_DETACH, + dfp, 9, cr); + if (error == ENXIO) { + error = 0; + } + } + kmem_free(dfp, sizeof (DATAFILT)); + return (error); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + if (optname == TCP_KEEPINTVL) { + /* + * When setting TCP_KEEPINTVL there is an unfortunate set of + * dependencies. 
TCP_KEEPINTVL must be <= TCP_RTO_MAX and + * TCP_RTO_MAX must be <= TCP_ABORT_THRESHOLD. Thus, we may + * have to increase one or both of these in order to increase + * TCP_KEEPINTVL. Note that TCP_KEEPINTVL is passed in seconds + * but TCP_RTO_MAX and TCP_ABORT_THRESHOLD are in milliseconds. + * Also note that we currently make no attempt to handle + * concurrent application threads simultaneously changing + * TCP_KEEPINTVL, since that is unlikely. We could revisit + * locking if it ever becomes an issue. + */ + uint32_t new_val = *(uint_t *)optval * 1000; + uint32_t len; + + /* + * Linux limits this to 32k, so we do too. However, anything + * over 2 hours (7200000 ms) will fail anyway due to the + * system-wide default (see "_rexmit_interval_max" in + * tcp_tunables.c). Our 2 hour default seems reasonable as a + * practical limit for now. + */ + if (*(uint_t *)optval > SHRT_MAX) + return (EINVAL); + + len = sizeof (rto_max); + if ((error = socket_getsockopt(so, IPPROTO_TCP, TCP_RTO_MAX, + &rto_max, &len, 0, cr)) != 0) + return (error); + len = sizeof (abrt_thresh); + if ((error = socket_getsockopt(so, IPPROTO_TCP, + TCP_ABORT_THRESHOLD, &abrt_thresh, &len, 0, cr)) != 0) + return (error); + + if (new_val > abrt_thresh) { + error = socket_setsockopt(so, IPPROTO_TCP, + TCP_ABORT_THRESHOLD, &new_val, sizeof (new_val), + cr); + if (error != 0) + goto fail; + abrt_changed = B_TRUE; + } + if (new_val > rto_max) { + error = socket_setsockopt(so, IPPROTO_TCP, + TCP_RTO_MAX, &new_val, sizeof (new_val), cr); + if (error != 0) + goto fail; + rto_max_changed = B_TRUE; + } + } + + error = socket_setsockopt(so, IPPROTO_TCP, optname, optval, optlen, cr); + +fail: + if (error != 0 && optname == TCP_KEEPINTVL) { + /* + * If changing TCP_KEEPINTVL failed then we may need to + * restore the previous values for TCP_ABORT_THRESHOLD and + * TCP_RTO_MAX. 
+ */ + if (rto_max_changed) { + (void) socket_setsockopt(so, IPPROTO_TCP, + TCP_RTO_MAX, &rto_max, + sizeof (rto_max), cr); + } + if (abrt_changed) { + (void) socket_setsockopt(so, IPPROTO_TCP, + TCP_ABORT_THRESHOLD, &abrt_thresh, + sizeof (abrt_thresh), cr); + } + } + + return (error); +} + +static int +lx_setsockopt_socket(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts); + struct lx_bpf_program *lbp; + int *intval; + struct bpf_program bp; + + switch (optname) { + case LX_SO_BSDCOMPAT: + /* Linux ignores this option. */ + return (0); + + case LX_SO_TIMESTAMP: + /* + * SO_TIMESTAMP is not supported on AF_UNIX sockets but we have + * some of those which apps use for logging, etc., so pretend + * this worked. + */ + if (so->so_family == AF_UNIX) { + return (0); + } + break; + + case LX_SO_ATTACH_FILTER: + /* + * Convert bpf program struct + */ + if (optlen != sizeof (struct lx_bpf_program)) { + return (EINVAL); + } + lbp = (struct lx_bpf_program *)optval; + bp.bf_len = lbp->bf_len; + /* LINTED: alignment */ + bp.bf_insns = (struct bpf_insn *)lbp->bf_insns; + optval = &bp; + break; + + case LX_SO_PASSSEC: + /* + * SO_PASSSEC is very similar to SO_PASSCRED (emulated by + * SO_RECVUCRED) in that it requests that cmsgs containing + * identity information be attached to recieved messages. + * Instead of ucred information, security-module-specific + * information such as selinux label is expected + * + * Since LX does not at all support selinux today, the + * option is silently accepted. + */ + return (0); + + case LX_SO_REUSEADDR: + case LX_SO_REUSEPORT: + if (lx_so_needs_reusehandling(so)) { + /* + * See lx_set_reuse_handler's comments for the oddness + * of REUSE* in some cases. 
+ */ + return (lx_set_reuse_handler(so, optname, optval, + optlen)); + } + break; + + case LX_SO_PASSCRED: + /* + * In many cases, the Linux SO_PASSCRED is mapped to the SunOS + * SO_RECVUCRED to enable the passing of peer credential + * information via received cmsgs. One exception is for + * connection-oriented AF_UNIX sockets which do not yet support + * that option. Instead, we track the setting internally and, + * when there is appropriate cmsg space, emulate the credential + * passing by querying the STREAMS ioctl. + * + * Note: this approach is broken for the case when a process + * sets up a Unix-domain socket with SO_PASSCRED, then forks + * one or more children, and expects to use the cmsg cred to + * accurately know which child pid sent the message (currently + * a pid is recorded when the socket is connected, not for each + * msg sent). getpeerucred(3c) suffers from the same problem. + * We have a workaround in lx_socketpair (use DGRAM if + * SEQPACKET), but the general case requires enhancing our + * streams support to allow passing credential cmsgs on a + * connection-oriented Unix socket. 
+ */ + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) != 0) { + lx_socket_aux_data_t *sad; + + if (optlen != sizeof (int)) { + return (EINVAL); + } + intval = (int *)optval; + sad = lx_sad_acquire(SOTOV(so)); + if (*intval == 0) { + sad->lxsad_flags &= ~LXSAD_FL_STRCRED; + } else { + sad->lxsad_flags |= LXSAD_FL_STRCRED; + } + mutex_exit(&sad->lxsad_lock); + return (0); + } + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, SOL_SOCKET, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts); + + switch (optname) { + case LX_ICMP_FILTER: + /* + * This option is currently ignored to appease ping. + */ + return (0); + + case LX_IPV6_CHECKSUM: + /* + * Ping6 tries to set the IPV6_CHECKSUM offset in a way that + * illumos won't allow. Quietly ignore this to prevent it from + * complaining. 
+ */ + return (0); + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_TCP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_packet(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts); + struct packet_mreq *mr; + + switch (optname) { + case LX_PACKET_ADD_MEMBERSHIP: + case LX_PACKET_DROP_MEMBERSHIP: + /* Convert Linux mr_type to illumos */ + if (optlen != sizeof (struct packet_mreq)) { + return (EINVAL); + } + mr = (struct packet_mreq *)optval; + if (--mr->mr_type > PACKET_MR_ALLMULTI) + return (EINVAL); + optval = mr; + break; + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, SOL_PACKET, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_setsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t optlen) +{ + int error; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, &optlen)) { + return (ENOPROTOOPT); + } + + error = socket_setsockopt(so, IPPROTO_IGMP, optname, optval, optlen, + CRED()); + return (error); +} + +static int +lx_getsockopt_ip(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ip_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_IP, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_ipv6(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_ipv6_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return 
(ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_IPV6, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_icmpv6(sonode_t *so, int optname, void *optval, + socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_icmpv6_sockopts); + + if (optname == LX_ICMP6_FILTER) { + error = socket_getsockopt(so, IPPROTO_ICMPV6, ICMP6_FILTER, + optval, optlen, 0, CRED()); + + /* + * ICMP6_FILTER is inverted on Linux. Make it so before copying + * back to caller's buffer. + */ + if (error == 0) { + LX_ICMP6_FILTER_INVERT((icmp6_filter_t *)optval); + } + return (error); + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_ICMPV6, optname, optval, optlen, + 0, CRED()); + return (error); +} + +/* + * When attempting to get socket options on AF_UNIX sockets we need to be a bit + * careful with the returned errno values. It turns out different OSes return + * different errno values here: + * - illumos: ENOPROTOOPT + * - Linux: EOPNOTSUPP + * - FreeBSD: EINVAL + * Therefore we remap ENOPROTOOPT to EOPNOTSUPP when a userland program attempts + * to get one of the various TCP_XXX options under this condition. + */ +static int +lx_getsockopt_tcp(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + cred_t *cr = CRED(); + int *intval = (int *)optval; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_tcp_sockopts); + + switch (optname) { + case LX_TCP_WINDOW_CLAMP: + /* + * We do not support these options but some apps rely on them. + * Rather than return an error we just return 0. This isn't + * exactly a lie, since the options really aren't set, but it's + * not the whole truth either. Fortunately, we aren't under + * oath. 
+ */ + if (*optlen < sizeof (int)) { + return (EINVAL); + } + if (so->so_family == AF_UNIX) { + return (EOPNOTSUPP); + } else { + *intval = 0; + } + *optlen = sizeof (int); + return (error); + + case LX_TCP_SYNCNT: + /* + * See the comment above the ltos_tcp_sockopts table for an + * explanation of the TCP_SYNCNT emulation. + */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + uint_t syn_cnt, syn_backoff, syn_abortconn, len; + + len = sizeof (syn_backoff); + error = socket_getsockopt(so, IPPROTO_TCP, + TCP_CONN_NOTIFY_THRESHOLD, &syn_backoff, &len, 0, + cr); + if (error != 0) + goto out; + error = socket_getsockopt(so, IPPROTO_TCP, + TCP_CONN_ABORT_THRESHOLD, &syn_abortconn, &len, 0, + cr); + if (error != 0) + goto out; + + syn_cnt = 0; + while (syn_backoff < syn_abortconn) { + syn_cnt++; + syn_backoff *= 2; + } + if (syn_cnt > 255) /* clamp to Linux limit */ + syn_cnt = 255; + + *intval = syn_cnt; + *optlen = sizeof (int); + } + + goto out; + + case LX_TCP_DEFER_ACCEPT: + /* + * We do support TCP_DEFER_ACCEPT using the datafilt(7M) socket + * filter but we don't emulate the timeout aspect so treat the + * existence as 1 and absence as 0. 
+ */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + struct fil_info fi[10]; + int i; + socklen_t len = sizeof (fi); + + if ((error = socket_getsockopt(so, SOL_FILTER, + FIL_LIST, fi, &len, 0, cr)) != 0) { + *optlen = sizeof (int); + goto out; + } + + *intval = 0; + len = len / sizeof (struct fil_info); + for (i = 0; i < len; i++) { + if (fi[i].fi_flags == FILF_PROG && + strcmp(fi[i].fi_name, "datafilt") == 0) { + *intval = 1; + break; + } + } + } + *optlen = sizeof (int); + goto out; + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + if (optname <= sockopts_tbl.lpo_max && + so->so_family == AF_UNIX) { + return (EOPNOTSUPP); + } + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_TCP, optname, optval, optlen, 0, + cr); + +out: + if (error == ENOPROTOOPT && so->so_family == AF_UNIX) { + return (EOPNOTSUPP); + } + return (error); +} + +static int +lx_getsockopt_socket(sonode_t *so, int optname, void *optval, + socklen_t *optlen) +{ + int error = 0; + int *intval = (int *)optval; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_socket_sockopts); + lx_socket_aux_data_t *sad; + + switch (optname) { + case LX_SO_PROTOCOL: + /* + * We need to special-case netlink and AF_UNIX too. + */ + if (so->so_family != AF_LX_NETLINK && so->so_family != AF_UNIX) + break; /* Common-case it. */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + *intval = so->so_protocol; + } + *optlen = sizeof (int); + return (error); + + case LX_SO_TYPE: + /* + * Special handling for connectionless AF_UNIX sockets. + * See lx_socketpair for more details. 
+ */ + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) == 0) { + if (*optlen < sizeof (int)) + return (EINVAL); + sad = lx_sad_acquire(SOTOV(so)); + if ((sad->lxsad_flags & LXSAD_FL_EMULSEQPKT) != 0) { + *intval = LX_SOCK_SEQPACKET; + *optlen = sizeof (int); + mutex_exit(&sad->lxsad_lock); + return (0); + } + mutex_exit(&sad->lxsad_lock); + } + break; + + case LX_SO_PASSSEC: + /* + * Communicate value of 0 since selinux-related functionality + * is not supported. + */ + if (*optlen < sizeof (int)) { + error = EINVAL; + } else { + *intval = 0; + } + *optlen = sizeof (int); + return (error); + + case LX_SO_PASSCRED: + /* + * Special handling for connection-oriented AF_UNIX sockets. + * See lx_setsockopt_socket for more details. + */ + if (so->so_family == AF_UNIX && + (so->so_mode & SM_CONNREQUIRED) != 0) { + if (*optlen < sizeof (int)) { + return (EINVAL); + } + sad = lx_sad_acquire(SOTOV(so)); + *intval = ((sad->lxsad_flags & LXSAD_FL_STRCRED) == 0 ? + 0 : 1); + *optlen = sizeof (int); + mutex_exit(&sad->lxsad_lock); + return (0); + } + break; + + case LX_SO_REUSEPORT: + /* + * See lx_so_needs_reusehandling() and lx_set_reuse_handler() + * for the sordid details. + */ + if (!lx_so_needs_reusehandling(so)) + break; + + if (*optlen < sizeof (int)) + return (EINVAL); + sad = lx_sad_acquire(SOTOV(so)); + *optlen = sizeof (int); + *intval = + (sad->lxsad_flags & LXSAD_FL_EMULRUPORT) == 0 ? 
0 : 1; + mutex_exit(&sad->lxsad_lock); + return (0); + + case LX_SO_PEERCRED: + if (*optlen < sizeof (struct lx_ucred)) { + error = EINVAL; + } else { + struct lx_ucred *lcred = (struct lx_ucred *)optval; + + mutex_enter(&so->so_lock); + if ((so->so_mode & SM_CONNREQUIRED) == 0) { + error = ENOTSUP; + } else if (so->so_peercred == NULL) { + error = EINVAL; + } else { + lcred->lxu_uid = crgetuid(so->so_peercred); + lcred->lxu_gid = crgetgid(so->so_peercred); + lcred->lxu_pid = so->so_cpid; + } + mutex_exit(&so->so_lock); + } + *optlen = sizeof (struct lx_ucred); + return (error); + + default: + break; + } + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, SOL_SOCKET, optname, optval, optlen, 0, + CRED()); + + if (error == 0) { + switch (optname) { + case SO_TYPE: + /* translate our type back to Linux */ + *intval = STOL_SOCKTYPE(*intval); + break; + + case SO_ERROR: + *intval = lx_errno(*intval, EINVAL); + break; + default: + break; + } + } + return (error); +} + +static int +lx_getsockopt_raw(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_raw_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_RAW, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_packet(sonode_t *so, int optname, void *optval, + socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_packet_sockopts); + + if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, SOL_PACKET, optname, optval, optlen, 0, + CRED()); + return (error); +} + +static int +lx_getsockopt_igmp(sonode_t *so, int optname, void *optval, socklen_t *optlen) +{ + int error = 0; + lx_proto_opts_t sockopts_tbl = PROTO_SOCKOPTS(ltos_igmp_sockopts); + + 
if (!lx_sockopt_lookup(sockopts_tbl, &optname, optlen)) { + return (ENOPROTOOPT); + } + + error = socket_getsockopt(so, IPPROTO_IGMP, optname, optval, optlen, 0, + CRED()); + return (error); +} + +long +lx_setsockopt(int sock, int level, int optname, void *optval, socklen_t optlen) +{ + struct sonode *so; + file_t *fp; + int buflen = 0; + intptr_t stkbuf[2]; + void *optbuf = stkbuf; + int error = 0; + + if (optlen != 0) { + if (optlen > SO_MAXARGSIZE) { + return (set_errno(EINVAL)); + } + if (optlen > sizeof (stkbuf)) { + buflen = optlen; + optbuf = kmem_alloc(optlen, KM_SLEEP); + } else { + /* + * Zero the on-stack buffer to avoid poisoning smaller + * optvals with stack garbage. + */ + stkbuf[0] = 0; + stkbuf[1] = 0; + } + if (copyin(optval, optbuf, optlen) != 0) { + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + return (set_errno(EFAULT)); + } + } else { + optbuf = NULL; + } + if ((so = getsonode(sock, &error, &fp)) == NULL) { + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + return (set_errno(error)); + } + + switch (level) { + case LX_IPPROTO_IP: + error = lx_setsockopt_ip(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_IPV6: + error = lx_setsockopt_ipv6(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_ICMPV6: + error = lx_setsockopt_icmpv6(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_TCP: + error = lx_setsockopt_tcp(so, optname, optbuf, optlen); + break; + case LX_SOL_SOCKET: + error = lx_setsockopt_socket(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_RAW: + error = lx_setsockopt_raw(so, optname, optbuf, optlen); + break; + case LX_SOL_PACKET: + error = lx_setsockopt_packet(so, optname, optbuf, optlen); + break; + case LX_IPPROTO_IGMP: + error = lx_setsockopt_igmp(so, optname, optbuf, optlen); + break; + case LX_SOL_NETLINK: + /* + * Since our netlink implmentation is modeled after Linux, + * sockopts can be passed directly through. 
+ */ + error = socket_setsockopt(so, LX_SOL_NETLINK, optname, optval, + optlen, CRED()); + break; + default: + error = ENOPROTOOPT; + break; + } + + if (error == ENOPROTOOPT) { + char buf[LX_UNSUP_BUFSZ]; + + (void) snprintf(buf, LX_UNSUP_BUFSZ, "setsockopt(%d, %d)", + level, optname); + lx_unsupported(buf); + } + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + releasef(sock); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_getsockopt(int sock, int level, int optname, void *optval, + socklen_t *optlenp) +{ + struct sonode *so; + file_t *fp; + int error = 0, buflen = 0; + socklen_t optlen; + intptr_t stkbuf[2]; + void *optbuf = stkbuf; + + if (copyin(optlenp, &optlen, sizeof (optlen)) != 0) { + return (set_errno(EFAULT)); + } + if (optlen != 0) { + if (optlen > SO_MAXARGSIZE) { + return (set_errno(EINVAL)); + } + if (optlen > sizeof (stkbuf)) { + buflen = optlen; + optbuf = kmem_zalloc(optlen, KM_SLEEP); + } else { + /* zero the on-stack buffer, just in case */ + stkbuf[0] = 0; + stkbuf[1] = 0; + } + } else { + optbuf = NULL; + } + if ((so = getsonode(sock, &error, &fp)) == NULL) { + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + return (set_errno(error)); + } + + switch (level) { + case LX_IPPROTO_IP: + error = lx_getsockopt_ip(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_IPV6: + error = lx_getsockopt_ipv6(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_ICMPV6: + error = lx_getsockopt_icmpv6(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_TCP: + error = lx_getsockopt_tcp(so, optname, optbuf, &optlen); + break; + case LX_SOL_SOCKET: + error = lx_getsockopt_socket(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_RAW: + error = lx_getsockopt_raw(so, optname, optbuf, &optlen); + break; + case LX_SOL_PACKET: + error = lx_getsockopt_packet(so, optname, optbuf, &optlen); + break; + case LX_IPPROTO_IGMP: + error = lx_getsockopt_igmp(so, optname, optbuf, &optlen); + break; + case 
LX_SOL_NETLINK: + /* + * Since our netlink implmentation is modeled after Linux, + * sockopts can be passed directly through. + */ + error = socket_getsockopt(so, LX_SOL_NETLINK, optname, optval, + &optlen, 0, CRED()); + break; + default: + error = EOPNOTSUPP; + break; + } + + if (error == ENOPROTOOPT) { + char buf[LX_UNSUP_BUFSZ]; + + (void) snprintf(buf, LX_UNSUP_BUFSZ, "getsockopt(%d, %d)", + level, optname); + lx_unsupported(buf); + } + if (copyout(&optlen, optlenp, sizeof (optlen)) != 0) { + error = EFAULT; + } + if (error == 0 && optlen > 0) { + VERIFY(optlen <= sizeof (stkbuf) || optlen <= buflen); + if (copyout(optbuf, optval, optlen) != 0) { + error = EFAULT; + } + } + if (buflen != 0) { + kmem_free(optbuf, buflen); + } + releasef(sock); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_getname_common(lx_getname_type_t type, int sockfd, void *np, int *nlp) +{ + struct sockaddr_storage buf; + struct sockaddr *name = (struct sockaddr *)&buf; + socklen_t namelen, namelen_orig; + int err, tmp; + struct sonode *so; + + /* We need to validate the name address up front to pass LTP. */ + if (copyin(np, &tmp, sizeof (tmp)) != 0) + return (set_errno(EFAULT)); + + if (copyin(nlp, &namelen, sizeof (socklen_t)) != 0) + return (set_errno(EFAULT)); + namelen_orig = namelen; + + /* LTP can pass -1 */ + if ((int)namelen < 0) + return (set_errno(EINVAL)); + + if ((so = getsonode(sockfd, &err, NULL)) == NULL) + return (set_errno(err)); + + bzero(&buf, sizeof (buf)); + namelen = sizeof (struct sockaddr_storage); + if (type == LX_GETPEERNAME) { + err = socket_getpeername(so, name, &namelen, B_FALSE, CRED()); + } else { + err = socket_getsockname(so, name, &namelen, CRED()); + } + + if (err == 0) { + ASSERT(namelen <= so->so_max_addr_len); + err = stol_sockaddr_copyout(name, namelen, + (struct sockaddr *)np, (socklen_t *)nlp, namelen_orig); + } + + releasef(sockfd); + return (err != 0 ? 
set_errno(err) : 0); +} + +long +lx_getpeername(int sockfd, void *np, int *nlp) +{ + return (lx_getname_common(LX_GETPEERNAME, sockfd, np, nlp)); +} + +long +lx_getsockname(int sockfd, void *np, int *nlp) +{ + return (lx_getname_common(LX_GETSOCKNAME, sockfd, np, nlp)); +} + +static int +lx_accept_common(int sock, struct sockaddr *name, socklen_t *nlp, int flags) +{ + struct sonode *so; + file_t *fp; + int error; + socklen_t namelen; + struct sonode *nso; + struct vnode *nvp; + struct file *nfp; + int nfd; + int arg; + + if (flags & ~(LX_SOCK_CLOEXEC | LX_SOCK_NONBLOCK)) { + return (set_errno(EINVAL)); + } + + if ((so = getsonode(sock, &error, &fp)) == NULL) + return (set_errno(error)); + + if (name != NULL) { + /* + * The Linux man page says that -1 is returned and errno is set + * to EFAULT if the "name" address is bad, but it is silent on + * what to set errno to if the "namelen" address is bad. + * LTP expects EINVAL. + * + * Note that we must first check the name pointer, as the Linux + * docs state nothing is copied out if the "name" pointer is + * NULL. If it is NULL, we don't care about the namelen + * pointer's value or about dereferencing it. + */ + if (copyin(nlp, &namelen, sizeof (namelen))) { + releasef(sock); + return (set_errno(EINVAL)); + } + if (namelen == 0) { + name = NULL; + } + } else { + namelen = 0; + } + + /* + * Allocate the user fd before socket_accept() in order to + * catch EMFILE errors before calling socket_accept(). 
+ */ + if ((error = falloc(NULL, FWRITE|FREAD, &nfp, &nfd)) != 0) { + eprintsoline(so, EMFILE); + releasef(sock); + return (set_errno(error)); + } + if ((error = socket_accept(so, fp->f_flag, CRED(), &nso)) != 0) { + if (error == EINTR) + lx_sock_syscall_restart(so, B_TRUE); + setf(nfd, NULL); + unfalloc(nfp); + releasef(sock); + return (set_errno(error)); + } + + nvp = SOTOV(nso); + + if (namelen != 0) { + socklen_t addrlen = sizeof (struct sockaddr_storage); + struct sockaddr_storage buf; + struct sockaddr *addrp = (struct sockaddr *)&buf; + + if ((error = socket_getpeername(nso, addrp, &addrlen, B_TRUE, + CRED())) == 0) { + error = stol_sockaddr_copyout(addrp, addrlen, + name, nlp, namelen); + /* + * Logic might dictate that we should check if we can + * write to the namelen pointer earlier so we don't + * accept a pending connection only to fail the call + * because we can't write the namelen value back out. + * However, testing shows Linux does indeed fail the + * call after accepting the connection so we must + * behave in a compatible manner. + */ + } else { + ASSERT(error == EINVAL || error == ENOTCONN); + error = ECONNABORTED; + } + } + + if (error != 0) { + setf(nfd, NULL); + unfalloc(nfp); + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); + releasef(sock); + return (set_errno(error)); + } + + /* Fill in the entries that falloc reserved */ + nfp->f_vnode = nvp; + mutex_exit(&nfp->f_tlock); + setf(nfd, nfp); + + /* Act on LX_SOCK_CLOEXEC from flags */ + if (flags & LX_SOCK_CLOEXEC) { + f_setfd(nfd, FD_CLOEXEC); + } + + /* + * In Linux, accept()ed sockets do not inherit anything set by fcntl(), + * so either explicitly set the flags or filter those out. + * + * The VOP_SETFL code is a simplification of the F_SETFL code in + * fcntl(). Ignore any errors from VOP_SETFL. 
+ */ + arg = 0; + if (flags & LX_SOCK_NONBLOCK) + arg |= FNONBLOCK; + + error = VOP_SETFL(nvp, nfp->f_flag, arg, nfp->f_cred, NULL); + if (error != 0) { + eprintsoline(so, error); + error = 0; + } else { + mutex_enter(&nfp->f_tlock); + nfp->f_flag &= ~FMASK | (FREAD|FWRITE); + nfp->f_flag |= arg; + mutex_exit(&nfp->f_tlock); + } + + releasef(sock); + return (nfd); +} + +long +lx_accept(int sockfd, void *np, int *nlp) +{ + return (lx_accept_common(sockfd, (struct sockaddr *)np, + (socklen_t *)nlp, 0)); +} + +long +lx_accept4(int sockfd, void *np, int *nlp, int flags) +{ + return (lx_accept_common(sockfd, (struct sockaddr *)np, + (socklen_t *)nlp, flags)); +} + +long +lx_listen(int sockfd, int backlog) +{ + return (listen(sockfd, backlog, 0)); +} + +long +lx_shutdown(int sockfd, int how) +{ + return (shutdown(sockfd, how, 0)); +} + +/* + * Connect two sockets together for a socketpair. This is derived from + * so_socketpair, but forgoes the task of dealing with file descriptors. + */ +static int +lx_socketpair_connect(file_t *fp1, file_t *fp2) +{ + sonode_t *so1, *so2; + sotpi_info_t *sti1, *sti2; + struct sockaddr_ux name; + int error; + + so1 = VTOSO(fp1->f_vnode); + so2 = VTOSO(fp2->f_vnode); + sti1 = SOTOTPI(so1); + sti2 = SOTOTPI(so2); + + VERIFY(so1->so_ops == &sotpi_sonodeops && + so2->so_ops == &sotpi_sonodeops); + + if (so1->so_type == SOCK_DGRAM) { + /* + * Bind both sockets and connect them with each other. 
+ */ + error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED()); + if (error) { + return (error); + } + error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED()); + if (error) { + return (error); + } + name.sou_family = AF_UNIX; + name.sou_addr = sti2->sti_ux_laddr; + error = socket_connect(so1, (struct sockaddr *)&name, + (socklen_t)sizeof (name), 0, _SOCONNECT_NOXLATE, CRED()); + if (error) { + return (error); + } + name.sou_addr = sti1->sti_ux_laddr; + error = socket_connect(so2, (struct sockaddr *)&name, + (socklen_t)sizeof (name), 0, _SOCONNECT_NOXLATE, CRED()); + return (error); + } else { + sonode_t *nso; + + /* + * Bind both sockets, with 'so1' being a listener. Connect + * 'so2' to 'so1', doing so as nonblocking to avoid waiting for + * soaccept to complete. Accept the connection on 'so1', + * replacing the socket/vnode in 'fp1' with the new connection. + * + * We could simply call socket_listen() here (which would do the + * binding automatically) if the code didn't rely on passing + * _SOBIND_NOXLATE to the TPI implementation of socket_bind(). 
+ */ + error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC| + _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR, CRED()); + if (error) { + return (error); + } + error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED()); + if (error) { + return (error); + } + + name.sou_family = AF_UNIX; + name.sou_addr = sti1->sti_ux_laddr; + error = socket_connect(so2, + (struct sockaddr *)&name, + (socklen_t)sizeof (name), + FNONBLOCK, _SOCONNECT_NOXLATE, CRED()); + if (error != 0 && error != EINPROGRESS) { + return (error); + } + + error = socket_accept(so1, 0, CRED(), &nso); + if (error) { + return (error); + } + + /* wait for so2 being SS_CONNECTED */ + mutex_enter(&so2->so_lock); + error = sowaitconnected(so2, 0, 0); + mutex_exit(&so2->so_lock); + if (error != 0) { + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); + return (error); + } + + (void) socket_close(so1, 0, CRED()); + socket_destroy(so1); + fp1->f_vnode = SOTOV(nso); + } + return (0); +} + +long +lx_socketpair(int domain, int type, int protocol, int *sv) +{ + int err, options, fds[2]; + file_t *fps[2]; + boolean_t emul_seqp = B_FALSE; + + /* + * For the special case of SOCK_SEQPACKET for AF_UNIX, we want to treat + * this as a SOCK_DGRAM. The semantics are similar, but our native code + * will not pass cmsg creds over a connection-oriented socket, unlike a + * connectionless one. Some Linux code depends on this for Unix-domain + * sockets. In particular, a sockopt of SO_PASSCRED, which we map into + * our native SO_RECVUCRED, must work across fork so that the correct + * pid of the sender is available in the cmsg. See the comment in + * lx_setsockopt_socket(). 
+ */ + if (domain == LX_AF_UNIX && type == LX_SOCK_SEQPACKET) { + type = LX_SOCK_DGRAM; + emul_seqp = B_TRUE; + } + + if ((err = lx_convert_sock_args(domain, type, protocol, &domain, &type, + &options, &protocol)) != 0) { + return (set_errno(err)); + } + + if ((err = lx_socket_create(domain, type, protocol, options, &fps[0], + &fds[0])) != 0) { + return (set_errno(err)); + } + + /* + * While it seems silly to check the family after socket creation, this + * is done to appease LTP when it tries some outlandish combinations of + * domain/type/protocol. The socket_create function is relied upon to + * emit the expected errors. + */ + if (VTOSO(fps[0]->f_vnode)->so_family != AF_UNIX) { + lx_socket_destroy(fps[0], fds[0]); + return (set_errno(EOPNOTSUPP)); + } + + if ((err = lx_socket_create(domain, type, protocol, options, &fps[1], + &fds[1])) != 0) { + lx_socket_destroy(fps[0], fds[0]); + return (set_errno(err)); + } + + err = lx_socketpair_connect(fps[0], fps[1]); + if (err != 0) { + lx_socket_destroy(fps[0], fds[0]); + lx_socket_destroy(fps[1], fds[1]); + return (set_errno(err)); + } + + if (emul_seqp) { + int i; + for (i = 0; i < 2; i++) { + sonode_t *so = VTOSO(fps[i]->f_vnode); + lx_socket_aux_data_t *sad = lx_sad_acquire(SOTOV(so)); + sad->lxsad_flags |= LXSAD_FL_EMULSEQPKT; + mutex_exit(&sad->lxsad_lock); + } + } + + setf(fds[0], fps[0]); + setf(fds[1], fps[1]); + + if ((options & SOCK_CLOEXEC) != 0) { + f_setfd(fds[0], FD_CLOEXEC); + f_setfd(fds[1], FD_CLOEXEC); + } + if (copyout(fds, sv, sizeof (fds)) != 0) { + (void) closeandsetf(fds[0], NULL); + (void) closeandsetf(fds[1], NULL); + return (set_errno(EFAULT)); + } + return (0); +} + + +#if defined(_SYSCALL32_IMPL) + +#define LX_SYS_SOCKETCALL 102 +#define LX_SOCKETCALL_MAX 20 + +typedef long (*lx_sockfn_t)(); + +static struct { + lx_sockfn_t s_fn; /* Function implementing the subcommand */ + int s_nargs; /* Number of arguments the function takes */ +} lx_socketcall_fns[] = { + lx_socket, 3, /* socket */ + 
lx_bind, 3, /* bind */ + lx_connect, 3, /* connect */ + lx_listen, 2, /* listen */ + lx_accept, 3, /* accept */ + lx_getsockname, 3, /* getsockname */ + lx_getpeername, 3, /* getpeername */ + lx_socketpair, 4, /* socketpair */ + lx_send, 4, /* send */ + lx_recv, 4, /* recv */ + lx_sendto, 6, /* sendto */ + lx_recvfrom, 6, /* recvfrom */ + lx_shutdown, 2, /* shutdown */ + lx_setsockopt, 5, /* setsockopt */ + lx_getsockopt, 5, /* getsockopt */ + lx_sendmsg, 3, /* sendmsg */ + lx_recvmsg, 3, /* recvmsg */ + lx_accept4, 4, /* accept4 */ + lx_recvmmsg, 5, /* recvmmsg */ + lx_sendmmsg, 4 /* sendmmsg */ +}; + +long +lx_socketcall(long p1, uint32_t *p2) +{ + int subcmd, i; + unsigned long args[6] = { 0, 0, 0, 0, 0, 0 }; + + /* incoming subcmds are 1-indexed */ + subcmd = (int)p1 - 1; + + if (subcmd < 0 || subcmd >= LX_SOCKETCALL_MAX || + lx_socketcall_fns[subcmd].s_fn == NULL) { + return (set_errno(EINVAL)); + } + + /* + * Copy the arguments to the subcommand in from the app's address + * space, returning EFAULT if we get a bogus pointer. 
+ */ + for (i = 0; i < lx_socketcall_fns[subcmd].s_nargs; i++) { + uint32_t arg; + + if (copyin(&p2[i], &arg, sizeof (uint32_t)) != 0) { + return (set_errno(EFAULT)); + } + args[i] = (unsigned long)arg; + } + + return ((lx_socketcall_fns[subcmd].s_fn)(args[0], args[1], args[2], + args[3], args[4], args[5])); +} + +#endif /* defined(_SYSCALL32_IMPL) */ + +static void +lx_socket_vsd_free(void *data) +{ + lx_socket_aux_data_t *entry; + + entry = (lx_socket_aux_data_t *)data; + mutex_destroy(&entry->lxsad_lock); + kmem_free(entry, sizeof (*entry)); +} + +void +lx_socket_init() +{ + vsd_create(&lx_socket_vsd, lx_socket_vsd_free); +} + +void +lx_socket_fini() +{ + vsd_destroy(&lx_socket_vsd); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_splice.c b/usr/src/uts/common/brand/lx/syscall/lx_splice.c new file mode 100644 index 0000000000..64db538413 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_splice.c @@ -0,0 +1,491 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/sunddi.h> +#include <sys/fs/fifonode.h> +#include <sys/strsun.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_misc.h> +#include <sys/lx_signal.h> + +/* Splice flags */ +#define LX_SPLICE_F_MOVE 0x01 +#define LX_SPLICE_F_NONBLOCK 0x02 +#define LX_SPLICE_F_MORE 0x04 +#define LX_SPLICE_F_GIFT 0x08 + +/* + * Use a max buffer size of 32k. 
This is a good compromise between doing I/O in + * large chunks, the limit on how much data we can write into an lx pipe by + * default (LX_DEFAULT_PIPE_SIZE), and how much kernel memory we'll allocate. + */ +#define LX_SPL_BUF_SIZE (32 * 1024) + +/* + * We only want to read as much from the input fd as we can write into the + * output fd, up to our buffer size. Figure out what that quantity is. + * Note that len will continuously decrease to 0 which triggers the typical + * end of the splice loop. + */ +static size_t +lx_spl_wr_sz(file_t *fp_out, u_offset_t fileoff, size_t bsz, size_t len, + boolean_t first) +{ + size_t sz; + + sz = MIN(bsz, len); + if (fp_out->f_vnode->v_type == VFIFO) { + /* + * If no readers on pipe, or if it would go over high water + * mark then return 0. Note that the first write into a + * pipe is expected to block if we're over the high water mark. + */ + fifonode_t *fn_dest = VTOF(fp_out->f_vnode)->fn_dest; + fifolock_t *fn_lock = fn_dest->fn_lock; + + mutex_enter(&fn_lock->flk_lock); + if (fn_dest->fn_rcnt == 0) { + sz = 0; + } else if (!first && + (sz + fn_dest->fn_count) > fn_dest->fn_hiwat) { + sz = 0; + } + mutex_exit(&fn_lock->flk_lock); + } else if (fp_out->f_vnode->v_type == VREG) { + if (fileoff >= curproc->p_fsz_ctl || + fileoff >= OFFSET_MAX(fp_out)) { + sz = 0; + } else { + sz = MIN(sz, (size_t)curproc->p_fsz_ctl - fileoff); + sz = MIN(sz, (size_t)OFFSET_MAX(fp_out) - fileoff); + } + } + + /* + * if (fp_out->f_vnode->v_type == VSOCK) + * + * There is no good way to determine if a socket is "full". A write for + * the different protocol implementations can return EWOULDBLOCK under + * different conditions, none of which we can easily check for in + * advance. + */ + + return (sz); +} + +/* + * The splice read function handles "reading" from a pipe and passes everything + * else along to our normal VOP_READ code path. 
+ *
+ * When we have a pipe as our input, we don't want to consume the data out
+ * of the pipe until the write has succeeded. This aligns more closely with
+ * the Linux behavior when a write error occurs. Thus, when a pipe is the input
+ * and we got some data, we return with the fifo flagged as FIFORDBLOCK. This
+ * ensures that the data we're writing cannot be consumed by another thread
+ * until we consume it ourselves.
+ *
+ * The pipe "read" code here is derived from the fifo I_PEEK code.
+ *
+ * Returns 0 or an errno value, with the number of bytes placed into the uio
+ * stored through 'nread'.  For a pipe, a return of 0 with *nread == 0 means
+ * either EOF or an FNDELAY read that found no data.  When a pipe read returns
+ * data, FIFORDBLOCK is left set and fifo_stayfast_exit() has not been called;
+ * lx_spl_consume(), or the write-error path in lx_splice(), undoes both.
+ */
+static int
+lx_spl_read(file_t *fp, uio_t *uiop, size_t *nread, boolean_t pipe_in,
+    boolean_t rd_pos)
+{
+	fifonode_t *fnp;
+	fifolock_t *fn_lock;
+	int count;
+	mblk_t *bp;
+
+	/* Anything other than a fast-mode pipe takes the ordinary read path. */
+	if (!pipe_in)
+		return (lx_read_common(fp, uiop, nread, rd_pos));
+
+	ASSERT(fp->f_vnode->v_type == VFIFO);
+	fnp = VTOF(fp->f_vnode);
+	fn_lock = fnp->fn_lock;
+	*nread = 0;
+
+	mutex_enter(&fn_lock->flk_lock);
+
+	/*
+	 * If the pipe has been switched to socket mode then this implies an
+	 * internal programmatic error. Likewise, if it was switched to
+	 * socket mode because we dropped the lock to set the stayfast flag.
+	 */
+	if ((fnp->fn_flag & FIFOFAST) == 0 || !fifo_stayfast_enter(fnp)) {
+		mutex_exit(&fn_lock->flk_lock);
+		return (EBADF);
+	}
+
+	/*
+	 * Wait until there is data that no other splice has already claimed
+	 * (FIFORDBLOCK clear).  Every early return below drops the stayfast
+	 * hold taken above before releasing the lock.
+	 */
+	while (fnp->fn_count == 0 || (fnp->fn_flag & FIFORDBLOCK) != 0) {
+		fifonode_t *fn_dest = fnp->fn_dest;
+
+		/* No writer, EOF */
+		if (fn_dest->fn_wcnt == 0 || fn_dest->fn_rcnt == 0) {
+			fifo_stayfast_exit(fnp);
+			mutex_exit(&fn_lock->flk_lock);
+			return (0);
+		}
+
+		/* If non-blocking, return EAGAIN otherwise 0. */
+		if (uiop->uio_fmode & (FNDELAY|FNONBLOCK)) {
+			fifo_stayfast_exit(fnp);
+			mutex_exit(&fn_lock->flk_lock);
+			if (uiop->uio_fmode & FNONBLOCK)
+				return (EAGAIN);
+			return (0);
+		}
+
+		/* Wait for data */
+		fnp->fn_flag |= FIFOWANTR;
+		if (!cv_wait_sig_swap(&fnp->fn_wait_cv, &fn_lock->flk_lock)) {
+			fifo_stayfast_exit(fnp);
+			mutex_exit(&fn_lock->flk_lock);
+			return (EINTR);
+		}
+	}
+
+	VERIFY((fnp->fn_flag & FIFORDBLOCK) == 0);
+	VERIFY((fnp->fn_flag & FIFOSTAYFAST) != 0);
+
+	/* Get up to our read size or whatever is currently available. */
+	count = MIN(uiop->uio_resid, fnp->fn_count);
+	ASSERT(count > 0);
+	*nread = count;
+	/*
+	 * Copy from the message chain without unlinking anything; the mblks
+	 * stay on fn_mp and fn_count is not changed here.
+	 */
+	bp = fnp->fn_mp;
+	while (count > 0) {
+		uint_t cnt = MIN(uiop->uio_resid, MBLKL(bp));
+
+		/*
+		 * We have the input pipe locked and we know there is data
+		 * available to consume. We're doing a UIO_SYSSPACE move into
+		 * an internal buffer that we allocated in lx_splice() so
+		 * this should never fail.
+		 */
+		VERIFY(uiomove((char *)bp->b_rptr, cnt, UIO_READ, uiop) == 0);
+		count -= cnt;
+		bp = bp->b_cont;
+	}
+
+	/* Claim the peeked data until the caller consumes or abandons it. */
+	fnp->fn_flag |= FIFORDBLOCK;
+
+	mutex_exit(&fn_lock->flk_lock);
+	return (0);
+}
+
+/*
+ * We've already "read" the data out of the pipe without actually consuming it.
+ * Here we update the pipe to consume the data and discard it. This is derived
+ * from the fifo_read code, except that we already know the amount of data
+ * in the pipe to consume and we don't have to actually move any data.
+ *
+ * 'count' is the number of bytes to discard and must not exceed the pipe's
+ * fn_count (this is VERIFY'd).  Besides dropping the data, this clears
+ * FIFORDBLOCK and releases the stayfast hold, so it is the counterpart of a
+ * pipe read by lx_spl_read() that returned data.
+ */
+static void
+lx_spl_consume(file_t *fp, uint_t count)
+{
+	fifonode_t *fnp, *fn_dest;
+	fifolock_t *fn_lock;
+
+	ASSERT(fp->f_vnode->v_type == VFIFO);
+
+	fnp = VTOF(fp->f_vnode);
+	fn_lock = fnp->fn_lock;
+
+	mutex_enter(&fn_lock->flk_lock);
+	VERIFY(fnp->fn_count >= count);
+
+	/*
+	 * Walk the message chain from the head: free each mblk that is fully
+	 * covered by 'count' and advance the read pointer of the last,
+	 * partially covered one.
+	 */
+	while (count > 0) {
+		int bpsize = MBLKL(fnp->fn_mp);
+		int decr_size = MIN(bpsize, count);
+
+		fnp->fn_count -= decr_size;
+		if (bpsize <= decr_size) {
+			mblk_t *bp = fnp->fn_mp;
+			fnp->fn_mp = fnp->fn_mp->b_cont;
+			freeb(bp);
+		} else {
+			fnp->fn_mp->b_rptr += decr_size;
+		}
+
+		count -= decr_size;
+	}
+
+	/* Release the claim taken when the data was peeked. */
+	fnp->fn_flag &= ~FIFORDBLOCK;
+	fifo_stayfast_exit(fnp);
+
+	fifo_wakereader(fnp, fn_lock);
+
+	/*
+	 * Wake up any blocked writers, processes sleeping on POLLWRNORM, or
+	 * processes waiting for SIGPOLL.
+	 */
+	fn_dest = fnp->fn_dest;
+	if (fn_dest->fn_flag & (FIFOWANTW | FIFOHIWATW) &&
+	    fnp->fn_count < fn_dest->fn_hiwat) {
+		fifo_wakewriter(fn_dest, fn_lock);
+	}
+
+	/* Update the access time on both ends of the pipe */
+	fnp->fn_atime = fnp->fn_dest->fn_atime = gethrestime_sec();
+
+	mutex_exit(&fn_lock->flk_lock);
+}
+
+/*
+ * Transfer data from the input file descriptor to the output file descriptor
+ * without leaving the kernel. For Linux this is limited by its kernel
+ * implementation which forces at least one of the file descriptors to be a
+ * pipe. Our implementation is likely quite different from the Linux
+ * one, which appears to play some VM tricks with shared pages from the pipe
+ * code. Instead, our implementation uses our normal VOP_READ/VOP_WRITE
+ * operations to internally move the data while using a single uio buffer. We
+ * implement the additional Linux behavior around the various checks and
+ * limitations.
+ *
+ * One key point on the read side is how we handle an input pipe. We don't
+ * want to consume the data out of the pipe until the write has succeeded.
+ * This aligns more closely with the Linux behavior when a write error occurs.
+ * The lx_spl_read() and lx_spl_consume() functions are used to handle this + * case. + */ +long +lx_splice(int fd_in, off_t *off_in, int fd_out, off_t *off_out, size_t len, + uint_t flags) +{ + int error = 0; + file_t *fp_in = NULL, *fp_out = NULL; + boolean_t found_pipe = B_FALSE, rd_pos = B_FALSE, wr_pos = B_FALSE; + boolean_t first = B_TRUE, pipe_in = B_FALSE; + iovec_t iov; + uio_t uio; + void *buf = NULL; + off_t r_off = 0, w_off = 0; + ushort_t r_flag, w_flag; + size_t bsize = 0, wr_sz, nread, nwrite, total = 0; + + /* + * Start by validating the inputs. + * + * Linux doesn't bother to check for valid flags, so neither do we. + * Also, aside from SPLICE_F_NONBLOCK, we ignore the rest of the + * flags since they're just hints to the Linux kernel implementation + * and have no effect on the proper functioning of the syscall. + */ + + if (len == 0) + return (0); + + if ((fp_in = getf(fd_in)) == NULL) { + error = EBADF; + goto done; + } + switch (fp_in->f_vnode->v_type) { + case VFIFO: + /* A fifo that is not in fast mode does not count as a pipe */ + if (((VTOF(fp_in->f_vnode))->fn_flag & FIFOFAST) != 0) { + found_pipe = B_TRUE; + pipe_in = B_TRUE; + } + /*FALLTHROUGH*/ + case VSOCK: + if (off_in != NULL) { + error = ESPIPE; + goto done; + } + break; + case VREG: + case VBLK: + case VCHR: + case VPROC: + if (off_in != NULL) { + if (copyin(off_in, &r_off, sizeof (r_off)) != 0) { + error = EFAULT; + goto done; + } + rd_pos = B_TRUE; + } + break; + default: + error = EBADF; + goto done; + } + r_flag = fp_in->f_flag; + if ((r_flag & FREAD) == 0) { + error = EBADF; + goto done; + } + + if ((fp_out = getf(fd_out)) == NULL) { + error = EBADF; + goto done; + } + switch (fp_out->f_vnode->v_type) { + case VFIFO: + found_pipe = B_TRUE; + /* Splicing to ourself returns EINVAL on Linux */ + if (pipe_in) { + fifonode_t *fnp = VTOF(fp_in->f_vnode); + if (VTOF(fp_out->f_vnode) == fnp->fn_dest) { + error = EINVAL; + goto done; + } + } + /*FALLTHROUGH*/ + case VSOCK: + if 
(off_out != NULL) { + error = ESPIPE; + goto done; + } + break; + case VREG: + case VBLK: + case VCHR: + case VPROC: + if (off_out != NULL) { + if (copyin(off_out, &w_off, sizeof (w_off)) != 0) { + error = EFAULT; + goto done; + } + wr_pos = B_TRUE; + } + break; + default: + error = EBADF; + goto done; + } + w_flag = fp_out->f_flag; + if ((w_flag & FWRITE) == 0) { + error = EBADF; + goto done; + } + /* Appending is invalid for output fd in splice */ + if ((w_flag & FAPPEND) != 0) { + error = EINVAL; + goto done; + } + + if (!found_pipe) { + error = EINVAL; + goto done; + } + + /* + * Check for non-blocking pipe operations. If no data in the input + * pipe, return EAGAIN. If the output pipe is full, return EAGAIN. + */ + if (flags & LX_SPLICE_F_NONBLOCK) { + fifonode_t *fn_dest; + + if (fp_in->f_vnode->v_type == VFIFO) { + fn_dest = VTOF(fp_in->f_vnode)->fn_dest; + if (fn_dest->fn_count == 0) { + error = EAGAIN; + goto done; + } + } + if (fp_out->f_vnode->v_type == VFIFO) { + fn_dest = VTOF(fp_out->f_vnode)->fn_dest; + fifolock_t *fn_lock = fn_dest->fn_lock; + mutex_enter(&fn_lock->flk_lock); + if (fn_dest->fn_count >= fn_dest->fn_hiwat) { + mutex_exit(&fn_lock->flk_lock); + error = EAGAIN; + goto done; + } + mutex_exit(&fn_lock->flk_lock); + } + } + + bsize = MIN(LX_SPL_BUF_SIZE, len); + + buf = kmem_alloc(bsize, KM_SLEEP); + bzero(&uio, sizeof (uio)); + uio.uio_iovcnt = 1; + uio.uio_iov = &iov; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_llimit = curproc->p_fsz_ctl; + + /* + * Loop reading data from fd_in and writing to fd_out. This is + * controlled by how much of the requested data we can actually write, + * particularly when the destination is a pipe. This matches the Linux + * behavior, which may terminate earlier than the full 'len' if the + * pipe fills up. However, we need to block when writing into a full + * pipe on the first iteration of the loop. We already checked above + * for a full output pipe when non-blocking. 
+ */ + while ((wr_sz = lx_spl_wr_sz(fp_out, w_off, bsize, len, first)) > 0) { + first = B_FALSE; + + /* (re)setup for a read */ + uio.uio_resid = iov.iov_len = wr_sz; /* only rd. max writable */ + iov.iov_base = buf; + uio.uio_offset = r_off; + uio.uio_extflg = UIO_COPY_CACHED; + uio.uio_fmode = r_flag; + error = lx_spl_read(fp_in, &uio, &nread, pipe_in, rd_pos); + if (error != 0 || nread == 0) + break; + r_off = uio.uio_offset; + + /* Setup and perform a write from the same buffer */ + uio.uio_resid = iov.iov_len = nread; + iov.iov_base = buf; + uio.uio_offset = w_off; + uio.uio_extflg = UIO_COPY_DEFAULT; + uio.uio_fmode = w_flag; + error = lx_write_common(fp_out, &uio, &nwrite, wr_pos); + if (error != 0) { + if (pipe_in) { + /* Need to unblock reading from the fifo. */ + fifonode_t *fnp = VTOF(fp_in->f_vnode); + + mutex_enter(&fnp->fn_lock->flk_lock); + fnp->fn_flag &= ~FIFORDBLOCK; + fifo_stayfast_exit(fnp); + fifo_wakereader(fnp, fnp->fn_lock); + mutex_exit(&fnp->fn_lock->flk_lock); + } + break; + } + w_off = uio.uio_offset; + + /* + * If input is a pipe, then we can consume the amount of data + * out of the pipe that we successfully wrote. + */ + if (pipe_in) + lx_spl_consume(fp_in, nwrite); + + total += nwrite; + len -= nwrite; + } + +done: + if (buf != NULL) + kmem_free(buf, bsize); + if (fp_in != NULL) + releasef(fd_in); + if (fp_out != NULL) + releasef(fd_out); + if (error != 0) + return (set_errno(error)); + + return (total); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_stat.c b/usr/src/uts/common/brand/lx/syscall/lx_stat.c new file mode 100644 index 0000000000..0f5460816b --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_stat.c @@ -0,0 +1,481 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/model.h> +#include <sys/mode.h> +#include <sys/stat.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_types.h> +#include <sys/lx_impl.h> +#include <sys/brand.h> +#include <sys/ddi.h> + +/* From "uts/common/syscall/stat.c" */ +extern int cstatat_getvp(int, char *, int, vnode_t **, cred_t **); + +typedef struct lx_timespec32 { + int32_t ts_sec; + int32_t ts_nsec; +} lx_timespec32_t; + +typedef struct lx_timespec64 { + int64_t ts_sec; + int64_t ts_nsec; +}lx_timespec64_t; + +struct lx_stat32 { + uint16_t st_dev; + uint16_t st_pad1; + uint32_t st_ino; + uint16_t st_mode; + uint16_t st_nlink; + uint16_t st_uid; + uint16_t st_gid; + uint16_t st_rdev; + uint16_t st_pad2; + uint32_t st_size; + uint32_t st_blksize; + uint32_t st_blocks; + lx_timespec32_t st_atime; + lx_timespec32_t st_mtime; + lx_timespec32_t st_ctime; + uint32_t st_pad3; + uint32_t st_pad4; +}; + +#pragma pack(4) +struct lx_stat64_32 { + uint64_t st_dev; + uint32_t st_pad1; + uint32_t st_small_ino; + uint32_t st_mode; + uint32_t st_nlink; + uint32_t st_uid; + uint32_t st_gid; + uint64_t st_rdev; + uint32_t st_pad2; + uint64_t st_size; + uint32_t st_blksize; + uint64_t st_blocks; + lx_timespec32_t st_atime; + lx_timespec32_t st_mtime; + lx_timespec32_t st_ctime; + uint64_t st_ino; +}; +#pragma pack() + +#if defined(_LP64) +struct lx_stat64_64 { + uint64_t st_dev; + uint64_t st_ino; + uint64_t st_nlink; /* yes, the order really is */ + uint32_t st_mode; /* different for these two */ + uint32_t st_uid; + uint32_t st_gid; + uint32_t st_pad0; + uint64_t st_rdev; + int64_t 
st_size; + int64_t st_blksize; + int64_t st_blocks; + lx_timespec64_t st_atime; + lx_timespec64_t st_mtime; + lx_timespec64_t st_ctime; + int64_t st_unused[3]; +}; +#endif /* defined(_LP64) */ + +typedef enum lx_stat_fmt { + LXF_STAT32, + LXF_STAT64_32, + LXF_STAT64_64 +} lx_stat_fmt_t; + +static void +lx_stat_xlate_dev(vattr_t *vattr) +{ + lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone); + dev_t dev; + lx_virt_disk_t *vd; + boolean_t is_dev; + + if (S_ISCHR(vattr->va_mode) || S_ISBLK(vattr->va_mode)) { + dev = vattr->va_rdev; + is_dev = B_TRUE; + } else { + dev = vattr->va_fsid; + is_dev = B_FALSE; + } + + /* + * See if this is the /dev/zfs device. If it is, the device number has + * already been converted to Linux format in the lx devfs so we have + * to check for that and not a native major/minor style. + */ + if (S_ISCHR(vattr->va_mode) && + LX_GETMAJOR(dev) == getmajor(lxzd->lxzd_zfs_dev) && + LX_GETMINOR(dev) == 0) { + /* + * We use the /dev/zfs device as a placeholder for our in-zone + * fabricated /dev/zfsds0 device that we're pretending / is + * mounted on. lx_zone_get_zfsds has pre-allocated this + * entry in the emulated device list. Reset dev so we can + * properly match in the following loop. + */ + dev = curproc->p_zone->zone_rootvp->v_vfsp->vfs_dev; + } + + /* Substitute emulated major/minor on zvols or mounted datasets. */ + vd = list_head(lxzd->lxzd_vdisks); + while (vd != NULL) { + if (vd->lxvd_real_dev == dev) { + dev = vd->lxvd_emul_dev; + /* + * We only update rdev for matching zfds/zvol devices + * so that the other devices are unchanged. 
+ */ + if (is_dev) { + vattr->va_rdev = LX_MAKEDEVICE(getmajor(dev), + getminor(dev)); + } + break; + } + vd = list_next(lxzd->lxzd_vdisks, vd); + } + + /* Mangle st_dev into expected format */ + vattr->va_fsid = LX_MAKEDEVICE(getmajor(dev), getminor(dev)); +} + +static long +lx_stat_common(vnode_t *vp, cred_t *cr, void *outp, lx_stat_fmt_t fmt, + int follow) +{ + vattr_t vattr; + mode_t mode; + int error, flags; + + /* + * When symlink following is desired, the ATTR_REAL flag is necessary + * to circumvent some of the weird behavior present in filesystems like + * lx_proc. + */ + flags = (follow == FOLLOW) ? ATTR_REAL : 0; + + vattr.va_mask = AT_STAT | AT_NBLOCKS | AT_BLKSIZE | AT_SIZE; + if ((error = VOP_GETATTR(vp, &vattr, flags, cr, NULL)) != 0) { + return (error); + } + + mode = VTTOIF(vattr.va_type) | vattr.va_mode; + if ((mode & S_IFMT) == S_IFBLK) { + /* Linux seems to report a 0 st_size for all block devices */ + vattr.va_size = 0; + } + if (vattr.va_rdev == NODEV) { + /* Linux leaves st_rdev zeroed when it is absent */ + vattr.va_rdev = 0; + } + + lx_stat_xlate_dev(&vattr); + + if (fmt == LXF_STAT32) { + struct lx_stat32 sb; + + if (vattr.va_fsid > USHRT_MAX || vattr.va_rdev > USHRT_MAX || + vattr.va_nlink > USHRT_MAX || vattr.va_size > INT_MAX) { + return (EOVERFLOW); + } + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = vattr.va_nodeid; + sb.st_mode = mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = LX_UID32_TO_UID16(vattr.va_uid); + sb.st_gid = LX_GID32_TO_GID16(vattr.va_gid); + sb.st_rdev = vattr.va_rdev; + sb.st_size = vattr.va_size; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = vattr.va_nblocks; + sb.st_atime.ts_sec = vattr.va_atime.tv_sec; + sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec; + sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec; + sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec; + sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec; + sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec; + if (copyout(&sb, outp, sizeof (sb)) != 0) { 
+ return (EFAULT); + } + return (0); + } else if (fmt == LXF_STAT64_32) { + struct lx_stat64_32 sb; + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = vattr.va_nodeid; + sb.st_small_ino = (vattr.va_nodeid & UINT_MAX); + sb.st_mode = mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = vattr.va_uid; + sb.st_gid = vattr.va_gid; + sb.st_rdev = vattr.va_rdev; + sb.st_size = vattr.va_size; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = vattr.va_nblocks; + sb.st_atime.ts_sec = vattr.va_atime.tv_sec; + sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec; + sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec; + sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec; + sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec; + sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec; + if (copyout(&sb, outp, sizeof (sb)) != 0) { + return (EFAULT); + } + return (0); + } else if (fmt == LXF_STAT64_64) { +#if defined(_LP64) + struct lx_stat64_64 sb; + + bzero(&sb, sizeof (sb)); + sb.st_dev = vattr.va_fsid; + sb.st_ino = vattr.va_nodeid; + sb.st_mode = mode; + sb.st_nlink = vattr.va_nlink; + sb.st_uid = vattr.va_uid; + sb.st_gid = vattr.va_gid; + sb.st_rdev = vattr.va_rdev; + sb.st_size = vattr.va_size; + sb.st_blksize = vattr.va_blksize; + sb.st_blocks = vattr.va_nblocks; + sb.st_atime.ts_sec = vattr.va_atime.tv_sec; + sb.st_atime.ts_nsec = vattr.va_atime.tv_nsec; + sb.st_mtime.ts_sec = vattr.va_mtime.tv_sec; + sb.st_mtime.ts_nsec = vattr.va_mtime.tv_nsec; + sb.st_ctime.ts_sec = vattr.va_ctime.tv_sec; + sb.st_ctime.ts_nsec = vattr.va_ctime.tv_nsec; + if (copyout(&sb, outp, sizeof (sb)) != 0) { + return (EFAULT); + } + return (0); +#else + /* Invalid output format on 32-bit */ + VERIFY(0); +#endif + } + + /* Invalid output format */ + VERIFY(0); + return (0); +} + +long +lx_stat32(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, 
outp, LXF_STAT32, FOLLOW); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fstat32(int fd, void *outp) +{ + file_t *fp; + int error; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + error = lx_stat_common(fp->f_vnode, fp->f_cred, outp, LXF_STAT32, + FOLLOW); + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_lstat32(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, LXF_STAT32, NO_FOLLOW); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_stat64(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + model_t model = get_udatamodel(); + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, + (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32, FOLLOW); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fstat64(int fd, void *outp) +{ + file_t *fp; + model_t model = get_udatamodel(); + int error; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + error = lx_stat_common(fp->f_vnode, fp->f_cred, outp, + (model == DATAMODEL_LP64) ? 
LXF_STAT64_64 : LXF_STAT64_32, FOLLOW); + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +#define LX_FSTATAT_ALLOWED (LX_AT_SYMLINK_NOFOLLOW | LX_AT_EMPTY_PATH | \ + LX_AT_NO_AUTOMOUNT) + +long +lx_fstatat64(int fd, char *name, void *outp, int flag) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + model_t model = get_udatamodel(); + int follow = FOLLOW; + int error; + char c; + + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + if ((flag & ~LX_FSTATAT_ALLOWED) != 0) { + return (set_errno(EINVAL)); + } + if ((flag & LX_AT_SYMLINK_NOFOLLOW) != 0) { + follow = NO_FOLLOW; + } + + if ((flag & LX_AT_NO_AUTOMOUNT) != 0) + follow |= __FLXNOAUTO; + + if (copyin(name, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } + if (c == '\0') { + if ((flag & LX_AT_EMPTY_PATH) == 0) { + return (set_errno(ENOENT)); + } + + /* + * When AT_EMPTY_PATH is set and an empty string has been + * passed for the name parameter, direct the lookup against the + * vnode for that fd. + */ + if (fd == AT_FDCWD) { + mutex_enter(&curproc->p_lock); + vp = PTOU(curproc)->u_cdir; + VN_HOLD(vp); + mutex_exit(&curproc->p_lock); + cr = CRED(); + crhold(cr); + } else { + file_t *fp; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + vp = fp->f_vnode; + VN_HOLD(vp); + cr = fp->f_cred; + crhold(cr); + releasef(fd); + } + } else { + if ((error = cstatat_getvp(fd, name, follow, &vp, &cr)) != 0) { + return (set_errno(error)); + } + } + + error = lx_stat_common(vp, cr, outp, + (model == DATAMODEL_LP64) ? 
LXF_STAT64_64 : LXF_STAT64_32, follow); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_lstat64(char *name, void *outp) +{ + vnode_t *vp = NULL; + cred_t *cr = NULL; + model_t model = get_udatamodel(); + int error; + + if ((error = cstatat_getvp(AT_FDCWD, name, NO_FOLLOW, &vp, &cr)) != 0) { + return (set_errno(error)); + } + error = lx_stat_common(vp, cr, outp, + (model == DATAMODEL_LP64) ? LXF_STAT64_64 : LXF_STAT64_32, + NO_FOLLOW); + VN_RELE(vp); + crfree(cr); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sync.c b/usr/src/uts/common/brand/lx/syscall/lx_sync.c new file mode 100644 index 0000000000..614afca0b0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sync.c @@ -0,0 +1,86 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> + +long +lx_syncfs(int fd) +{ + file_t *fp; + vfs_t *vfsp; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + vfsp = fp->f_vnode->v_vfsp; + releasef(fd); + + (void) (vfsp->vfs_op->vfs_sync)(vfsp, 0, CRED()); + + return (0); +} + +#define LX_SYNC_FILE_RANGE_WAIT_BEFORE 0x1 +#define LX_SYNC_FILE_RANGE_WRITE 0x2 +#define LX_SYNC_FILE_RANGE_WAIT_AFTER 0x4 + +#define LX_SYNC_FILE_RANGE_VALID (LX_SYNC_FILE_RANGE_WAIT_BEFORE | \ + LX_SYNC_FILE_RANGE_WRITE | LX_SYNC_FILE_RANGE_WAIT_AFTER) + + +long +lx_sync_file_range(int fd, off_t offset, off_t nbytes, int flags) +{ + file_t *fp; + int error, sflags = 0; + + if ((flags & ~LX_SYNC_FILE_RANGE_VALID) != 0) + return (set_errno(EINVAL)); + if (offset < 0 || nbytes < 0) + return (set_errno(EINVAL)); + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + /* + * Since sync_file_range is implemented in terms of VOP_PUTPAGE, both + * SYNC_FILE_RANGE_WAIT flags are treated as forcing synchronous + * operation. While this differs from the Linux behavior where + * BEFORE/AFTER are distinct, it achieves an adequate level of safety + * since the requested data is synced out at the end of the call. 
+ */ + if ((flags & (LX_SYNC_FILE_RANGE_WAIT_BEFORE | + LX_SYNC_FILE_RANGE_WAIT_AFTER)) == 0) { + sflags |= B_ASYNC; + } + + error = VOP_PUTPAGE(fp->f_vnode, offset, nbytes, sflags, CRED(), NULL); + if (error == ENOSYS) { + error = ESPIPE; + } + + releasef(fd); + if (error != 0) { + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c new file mode 100644 index 0000000000..052ad322a7 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c @@ -0,0 +1,207 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
+ */ + +#include <vm/anon.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/zone.h> +#include <sys/time.h> + +typedef struct lx_sysinfo { + int64_t si_uptime; /* Seconds since boot */ + uint64_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ + uint64_t si_totalram; /* Total memory size */ + uint64_t si_freeram; /* Available memory */ + uint64_t si_sharedram; /* Shared memory */ + uint64_t si_bufferram; /* Buffer memory */ + uint64_t si_totalswap; /* Total swap space */ + uint64_t si_freeswap; /* Avail swap space */ + uint16_t si_procs; /* Process count */ + uint16_t si_pad; /* Padding */ + uint64_t si_totalhigh; /* High memory size */ + uint64_t si_freehigh; /* Avail high memory */ + uint32_t si_mem_unit; /* Unit size of memory fields */ +} lx_sysinfo_t; + +#if defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit usermode struct. + */ +#pragma pack(4) +typedef struct lx_sysinfo32 { + int32_t si_uptime; /* Seconds since boot */ + uint32_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ + uint32_t si_totalram; /* Total memory size */ + uint32_t si_freeram; /* Available memory */ + uint32_t si_sharedram; /* Shared memory */ + uint32_t si_bufferram; /* Buffer memory */ + uint32_t si_totalswap; /* Total swap space */ + uint32_t si_freeswap; /* Avail swap space */ + uint16_t si_procs; /* Process count */ + uint16_t si_pad; /* Padding */ + uint32_t si_totalhigh; /* High memory size */ + uint32_t si_freehigh; /* Avail high memory */ + uint32_t si_mem_unit; /* Unit size of memory fields */ + char __si_pad[8]; +} lx_sysinfo32_t; +#pragma pack() +#endif + +extern pgcnt_t swapfs_minfree; + +static void +lx_sysinfo_common(lx_sysinfo_t *si) +{ + zone_t *zone = curzone; + pgcnt_t zphysmem, zfreemem; + ulong_t ztotswap, zfreeswap; + + si->si_uptime = gethrestime_sec() - zone->zone_boot_time; + + si->si_loads[0] = zone->zone_hp_avenrun[0]; + si->si_loads[1] = zone->zone_hp_avenrun[1]; + si->si_loads[2] = zone->zone_hp_avenrun[2]; + + 
/* + * In Linux each thread looks like a process, so we conflate the + * two in this stat as well. + */ + si->si_procs = (int32_t)zone->zone_nlwps; + + zone_get_physmem_data(zone->zone_id, &zphysmem, &zfreemem); + + if (zone->zone_max_swap_ctl == UINT64_MAX) { + ztotswap = k_anoninfo.ani_max; + zfreeswap = k_anoninfo.ani_free; + } else { + /* + * See the comment in swapctl for a description of how free is + * calculated within a zone. + */ + rctl_qty_t used; + spgcnt_t avail; + uint64_t max; + + avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0); + max = k_anoninfo.ani_max + k_anoninfo.ani_mem_resv + avail; + + mutex_enter(&zone->zone_mem_lock); + ztotswap = btop(zone->zone_max_swap_ctl); + used = btop(zone->zone_max_swap); + mutex_exit(&zone->zone_mem_lock); + + zfreeswap = MIN(ztotswap, max) - used; + } + + /* + * If the maximum memory stat is less than 2^20 pages (i.e. 4GB), + * then we report the result in bytes. Otherwise we use pages. + * Once we start supporting >1TB systems/zones, we'll need a third + * option. + */ + if (MAX(zphysmem, ztotswap) < 1024 * 1024) { + si->si_totalram = ptob(zphysmem); + si->si_freeram = ptob(zfreemem); + si->si_totalswap = ptob(ztotswap); + si->si_freeswap = ptob(zfreeswap); + si->si_mem_unit = 1; + } else { + si->si_totalram = zphysmem; + si->si_freeram = zfreemem; + si->si_totalswap = ztotswap; + si->si_freeswap = zfreeswap; + si->si_mem_unit = PAGESIZE; + } + si->si_bufferram = 0; + si->si_sharedram = 0; + + /* + * These two stats refer to high physical memory. If an + * application running in a Linux zone cares about this, then + * either it or we are broken. 
+ */ + si->si_totalhigh = 0; + si->si_freehigh = 0; +} + +long +lx_sysinfo64(caddr_t sip) +{ + lx_sysinfo_t si; + + bzero(&si, sizeof (si)); + lx_sysinfo_common(&si); + + if (copyout(&si, sip, sizeof (si)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +long +lx_sysinfo32(caddr_t sip) +{ + lx_sysinfo_t si; + lx_sysinfo32_t si32; + int i; + + lx_sysinfo_common(&si); + + /* + * Convert the lx_sysinfo_t into the legacy 32-bit view: + */ + bzero(&si32, sizeof (si32)); + si32.si_uptime = si.si_uptime; + + for (i = 0; i < 3; i++) { + if ((si.si_loads[i]) > 0x7fffffff) + si32.si_loads[i] = 0x7fffffff; + else + si32.si_loads[i] = si.si_loads[i]; + } + + si32.si_procs = si.si_procs; + si32.si_totalram = si.si_totalram; + si32.si_freeram = si.si_freeram; + si32.si_totalswap = si.si_totalswap; + si32.si_freeswap = si.si_freeswap; + si32.si_mem_unit = si.si_mem_unit; + + si32.si_bufferram = si.si_bufferram; + si32.si_sharedram = si.si_sharedram; + + si32.si_totalhigh = si.si_totalhigh; + si32.si_freehigh = si.si_freehigh; + + if (copyout(&si32, sip, sizeof (si32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} +#endif diff --git a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c new file mode 100644 index 0000000000..a84c17e139 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/cpuvar.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_ldt.h> +#include <sys/lx_misc.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> +#include <lx_syscall.h> + +/* ARGSUSED */ +long +lx_arch_prctl(int code, ulong_t addr) +{ +#if defined(__amd64) + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *llwp = lwptolxlwp(lwp); + pcb_t *pcb = &lwp->lwp_pcb; + + switch (code) { + case LX_ARCH_GET_FS: + if (copyout(&llwp->br_lx_fsbase, (void *)addr, + sizeof (llwp->br_lx_fsbase)) != 0) { + return (set_errno(EFAULT)); + } + break; + + case LX_ARCH_SET_FS: + llwp->br_lx_fsbase = addr; + + kpreempt_disable(); + if (pcb->pcb_fsbase != llwp->br_lx_fsbase) { + pcb->pcb_fsbase = llwp->br_lx_fsbase; + + /* + * Ensure we go out via update_sregs. + */ + PCB_SET_UPDATE_SEGS(pcb); + } + kpreempt_enable(); + break; + + case LX_ARCH_GET_GS: + if (copyout(&llwp->br_lx_gsbase, (void *)addr, + sizeof (llwp->br_lx_gsbase)) != 0) { + return (set_errno(EFAULT)); + } + break; + + case LX_ARCH_SET_GS: + llwp->br_lx_gsbase = addr; + + kpreempt_disable(); + if (pcb->pcb_gsbase != llwp->br_lx_gsbase) { + pcb->pcb_gsbase = llwp->br_lx_gsbase; + + /* + * Ensure we go out via update_sregs. 
+ */ + PCB_SET_UPDATE_SEGS(pcb); + } + kpreempt_enable(); + break; + + default: + return (set_errno(EINVAL)); + } +#endif + + return (0); +} + +long +lx_get_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + + if (fuword32(&inf->entry_number, (uint32_t *)&entry)) + return (set_errno(EFAULT)); + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + /* + * convert the solaris ldt to the linux format expected by the + * caller + */ + DESC_TO_LDT_INFO(dscrp, &ldt_inf); + ldt_inf.entry_number = entry; + + if (copyout(&ldt_inf, inf, sizeof (struct ldt_info))) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_set_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + int i; + + if (copyin(inf, &ldt_inf, sizeof (ldt_inf))) + return (set_errno(EFAULT)); + + entry = ldt_inf.entry_number; + if (entry == -1) { + /* + * Find an empty entry in the tls for this thread. + * The casts assume each user_desc_t entry is 8 bytes. 
+ */ + for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) { + if (((uint_t *)dscrp)[0] == 0 && + ((uint_t *)dscrp)[1] == 0) + break; + } + + if (i < LX_TLSNUM) { + /* + * found one + */ + entry = i + GDT_TLSMIN; + if (suword32(&inf->entry_number, entry)) + return (set_errno(EFAULT)); + } else { + return (set_errno(ESRCH)); + } + } + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + /* + * convert the linux ldt info to standard intel descriptor + */ + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + if (LDT_INFO_EMPTY(&ldt_inf)) { + ((uint_t *)dscrp)[0] = 0; + ((uint_t *)dscrp)[1] = 0; + } else { + LDT_INFO_TO_DESC(&ldt_inf, dscrp); + } + + /* + * update the gdt with the new descriptor + */ + kpreempt_disable(); + + for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) + lx_set_gdt(GDT_TLSMIN + i, dscrp); + + kpreempt_enable(); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_time.c b/usr/src/uts/common/brand/lx/syscall/lx_time.c new file mode 100644 index 0000000000..b9bc8e5ab4 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_time.c @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/times.h> +#include <sys/msacct.h> +#include <sys/lx_userhz.h> + +/* See the comment on LX_USERHZ for more details. 
*/ +#define LX_NSEC_PER_USERHZ (NANOSEC / LX_USERHZ) +#define NSEC_TO_LX_USERHZ(nsec) ((nsec) / LX_NSEC_PER_USERHZ) + +/* + * Our times(2) implementation is based on the native times(2), but with + * the necessary scaling to adjust to USER_HZ. Also, Linux avoids writing + * to a NULL tp, whereas our native code returns EFAULT. + */ +long +lx_times(struct tms *tp) +{ + proc_t *p = curproc; + struct tms p_time; + clock_t ret_lbolt; + + mutex_enter(&p->p_lock); + p_time.tms_utime = + (clock_t)NSEC_TO_LX_USERHZ(mstate_aggr_state(p, LMS_USER)); + p_time.tms_stime = + (clock_t)NSEC_TO_LX_USERHZ(mstate_aggr_state(p, LMS_SYSTEM)); + p_time.tms_cutime = HZ_TO_LX_USERHZ(p->p_cutime); + p_time.tms_cstime = HZ_TO_LX_USERHZ(p->p_cstime); + mutex_exit(&p->p_lock); + +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + struct tms32 t32; + + t32.tms_utime = p_time.tms_utime; + t32.tms_stime = p_time.tms_stime; + t32.tms_cutime = p_time.tms_cutime; + t32.tms_cstime = p_time.tms_cstime; + + if (tp != NULL && copyout(&t32, tp, sizeof (t32)) != 0) + return (set_errno(EFAULT)); + + ret_lbolt = ddi_get_lbolt(); + return ((clock32_t)HZ_TO_LX_USERHZ(ret_lbolt)); + } else +#endif /* _SYSCALL32_IMPL */ + { + if (tp != NULL && copyout(&p_time, tp, sizeof (p_time)) != 0) + return (set_errno(EFAULT)); + + ret_lbolt = ddi_get_lbolt(); + return (HZ_TO_LX_USERHZ(ret_lbolt)); + } +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_timer.c b/usr/src/uts/common/brand/lx/syscall/lx_timer.c new file mode 100644 index 0000000000..279bdbddc7 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_timer.c @@ -0,0 +1,637 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * The illumos kernel provides two clock backends: CLOCK_REALTIME, the + * adjustable system wall clock; and CLOCK_HIGHRES, the monotonically + * increasing time source that is not subject to drift or adjustment. By + * contrast, the Linux kernel is furnished with an overabundance of narrowly + * differentiated clock types. + * + * Fortunately, most of the commonly used Linux clock types are either similar + * enough to the native clock backends that they can be directly mapped, or + * represent queries to the per-process and per-LWP microstate counters. + * + * CLOCK_BOOTTIME is identical to CLOCK_MONOTONIC, except that it takes into + * account time that the system is suspended. Since that is uninteresting to + * us, we treat it the same. + */ + +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/cmn_err.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <lx_signum.h> + +/* + * From "uts/common/os/timer.c": + */ +extern int clock_settime(clockid_t, timespec_t *); +extern int clock_gettime(clockid_t, timespec_t *); +extern int clock_getres(clockid_t, timespec_t *); +extern int nanosleep(timespec_t *, timespec_t *); + + +static int lx_emul_clock_getres(clockid_t, timespec_t *); +static int lx_emul_clock_gettime(clockid_t, timespec_t *); +static int lx_emul_clock_settime(clockid_t, timespec_t *); + +typedef struct lx_clock_backend { + clockid_t lclk_ntv_id; + int (*lclk_clock_getres)(clockid_t, timespec_t *); + int (*lclk_clock_gettime)(clockid_t, timespec_t *); + int (*lclk_clock_settime)(clockid_t, timespec_t *); +} lx_clock_backend_t; + +/* + * NOTE: The Linux man pages state this structure is obsolete and is + * unsupported, so it is declared here for sizing purposes only. 
+ */ +struct lx_timezone { + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +/* + * Use the native clock_* system call implementation, but with a translated + * clock identifier: + */ +#define NATIVE(ntv_id) \ + { ntv_id, clock_getres, clock_gettime, clock_settime } + +/* + * This backend is not supported, so we provide an emulation handler: + */ +#define EMUL(ntv_id) \ + { ntv_id, lx_emul_clock_getres, lx_emul_clock_gettime, \ + lx_emul_clock_settime } + +static lx_clock_backend_t lx_clock_backends[] = { + NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC */ + EMUL(CLOCK_PROCESS_CPUTIME_ID), /* LX_CLOCK_PROCESS_CPUTIME_ID */ + EMUL(CLOCK_THREAD_CPUTIME_ID), /* LX_CLOCK_THREAD_CPUTIME_ID */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC_RAW */ + NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME_COARSE */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC_COARSE */ + NATIVE(CLOCK_HIGHRES) /* LX_CLOCK_BOOTTIME */ +}; + +#define LX_CLOCK_MAX \ + (sizeof (lx_clock_backends) / sizeof (lx_clock_backends[0])) +#define LX_CLOCK_BACKEND(clk) (((clk) < LX_CLOCK_MAX && (clk) >= 0) ? \ + &lx_clock_backends[(clk)] : NULL) + +/* + * Linux defines the size of the sigevent structure to be 64 bytes. In order + * to meet that definition, the trailing union includes a member which pads it + * out to the desired length for the given architecture. 
+ */ +#define LX_SIGEV_PAD_SIZE ((64 - \ + (sizeof (int) * 2 + sizeof (union sigval))) / sizeof (int)) + +typedef struct { + union sigval lx_sigev_value; + int lx_sigev_signo; + int lx_sigev_notify; + union { + int lx_pad[LX_SIGEV_PAD_SIZE]; + int lx_tid; + struct { + void (*lx_notify_function)(union sigval); + void *lx_notify_attribute; + } lx_sigev_thread; + } lx_sigev_un; +} lx_sigevent_t; + + +#ifdef _SYSCALL32_IMPL + +#define LX_SIGEV32_PAD_SIZE ((64 - \ + (sizeof (int) * 2 + sizeof (union sigval32))) / sizeof (int)) + +typedef struct { + union sigval32 lx_sigev_value; + int lx_sigev_signo; + int lx_sigev_notify; + union { + int lx_pad[LX_SIGEV32_PAD_SIZE]; + int lx_tid; + struct { + caddr32_t lx_notify_function; + caddr32_t lx_notify_attribute; + } lx_sigev_thread; + } lx_sigev_un; +} lx_sigevent32_t; + +#endif /* _SYSCALL32_IMPL */ + +#define LX_SIGEV_SIGNAL 0 +#define LX_SIGEV_NONE 1 +#define LX_SIGEV_THREAD 2 +#define LX_SIGEV_THREAD_ID 4 + +/* + * Access private SIGEV_THREAD_ID callback state in itimer_t + */ +#define LX_SIGEV_THREAD_ID_LPID(it) ((it)->it_cb_data[0]) +#define LX_SIGEV_THREAD_ID_TID(it) ((it)->it_cb_data[1]) + + +/* ARGSUSED */ +static int +lx_emul_clock_settime(clockid_t clock, timespec_t *tp) +{ + return (set_errno(EINVAL)); +} + +static int +lx_emul_clock_gettime(clockid_t clock, timespec_t *tp) +{ + timespec_t t; + + switch (clock) { + case CLOCK_PROCESS_CPUTIME_ID: { + proc_t *p = ttoproc(curthread); + hrtime_t snsecs, unsecs; + + /* + * Based on getrusage() in "rusagesys.c": + */ + mutex_enter(&p->p_lock); + unsecs = mstate_aggr_state(p, LMS_USER); + snsecs = mstate_aggr_state(p, LMS_SYSTEM); + mutex_exit(&p->p_lock); + + hrt2ts(unsecs + snsecs, &t); + break; + } + + case CLOCK_THREAD_CPUTIME_ID: { + klwp_t *lwp = ttolwp(curthread); + struct mstate *ms = &lwp->lwp_mstate; + hrtime_t snsecs, unsecs; + + /* + * Based on getrusage_lwp() in "rusagesys.c": + */ + unsecs = ms->ms_acct[LMS_USER]; + snsecs = ms->ms_acct[LMS_SYSTEM] + 
ms->ms_acct[LMS_TRAP]; + + scalehrtime(&unsecs); + scalehrtime(&snsecs); + + hrt2ts(unsecs + snsecs, &t); + break; + } + + default: + return (set_errno(EINVAL)); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + if (TIMESPEC_OVERFLOW(&t)) { + return (set_errno(EOVERFLOW)); + } + TIMESPEC_TO_TIMESPEC32(&t32, &t); + + if (copyout(&t32, tp, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); + } +#endif + + if (copyout(&t, tp, sizeof (t)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +static int +lx_emul_clock_getres(clockid_t clock, timespec_t *tp) +{ + timespec_t t; + + if (tp == NULL) { + return (0); + } + + switch (clock) { + case CLOCK_PROCESS_CPUTIME_ID: + case CLOCK_THREAD_CPUTIME_ID: + /* + * These clock backends return microstate accounting values for + * the LWP or the entire process. The Linux kernel claims they + * have nanosecond resolution; so will we. + */ + t.tv_sec = 0; + t.tv_nsec = 1; + break; + + default: + return (set_errno(EINVAL)); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + if (TIMESPEC_OVERFLOW(&t)) { + return (set_errno(EOVERFLOW)); + } + TIMESPEC_TO_TIMESPEC32(&t32, &t); + + if (copyout(&t32, tp, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); + } +#endif + + if (copyout(&t, tp, sizeof (t)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +static void +lx_clock_unsupported(int clock) +{ + char buf[100]; + + (void) snprintf(buf, sizeof (buf), "unsupported clock: %d", clock); + lx_unsupported(buf); +} + +long +lx_clock_settime(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_settime(backend->lclk_ntv_id, tp)); +} + +long +lx_clock_gettime(int clock, timespec_t *tp) +{ + lx_clock_backend_t 
*backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_gettime(backend->lclk_ntv_id, tp)); +} + +long +lx_clock_getres(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + /* + * It is important this check is performed after the clock + * check. Both glibc and musl, in their clock_getcpuclockid(), + * use clock_getres() with a NULL tp to validate a clock + * value. Performing the tp check before the clock check could + * indicate a valid clock to libc when it shouldn't. + */ + if (tp == NULL) { + return (0); + } + + return (backend->lclk_clock_getres(backend->lclk_ntv_id, tp)); +} + +static int +lx_ltos_sigev(lx_sigevent_t *lev, struct sigevent *sev) +{ + bzero(sev, sizeof (*sev)); + + switch (lev->lx_sigev_notify) { + case LX_SIGEV_NONE: + sev->sigev_notify = SIGEV_NONE; + break; + + case LX_SIGEV_SIGNAL: + case LX_SIGEV_THREAD_ID: + sev->sigev_notify = SIGEV_SIGNAL; + break; + + case LX_SIGEV_THREAD: + /* + * Just as in illumos, SIGEV_THREAD handling is performed in + * userspace with the help of SIGEV_SIGNAL/SIGEV_THREAD_ID. + * + * It's not expected to make an appearance in the syscall. + */ + default: + return (EINVAL); + } + + sev->sigev_signo = lx_ltos_signo(lev->lx_sigev_signo, 0); + sev->sigev_value = lev->lx_sigev_value; + + /* Ensure SIGEV_SIGNAL has a valid signo to work with. 
*/ + if (sev->sigev_notify == SIGEV_SIGNAL && sev->sigev_signo == 0) { + return (EINVAL); + } + return (0); +} + +static int +lx_sigev_copyin(lx_sigevent_t *userp, lx_sigevent_t *levp) +{ +#ifdef _SYSCALL32_IMPL + if (get_udatamodel() != DATAMODEL_NATIVE) { + lx_sigevent32_t lev32; + + if (copyin(userp, &lev32, sizeof (lev32)) != 0) { + return (EFAULT); + } + levp->lx_sigev_value.sival_int = lev32.lx_sigev_value.sival_int; + levp->lx_sigev_signo = lev32.lx_sigev_signo; + levp->lx_sigev_notify = lev32.lx_sigev_notify; + levp->lx_sigev_un.lx_tid = lev32.lx_sigev_un.lx_tid; + } else +#endif /* _SYSCALL32_IMPL */ + { + if (copyin(userp, levp, sizeof (lx_sigevent_t)) != 0) { + return (EFAULT); + } + } + return (0); +} + +static void +lx_sigev_thread_fire(itimer_t *it) +{ + proc_t *p = it->it_proc; + pid_t lpid = (pid_t)LX_SIGEV_THREAD_ID_LPID(it); + id_t tid = (id_t)LX_SIGEV_THREAD_ID_TID(it); + lwpdir_t *ld; + + ASSERT(MUTEX_HELD(&it->it_mutex)); + ASSERT(it->it_pending == 0); + ASSERT(it->it_flags & IT_SIGNAL); + ASSERT(MUTEX_HELD(&p->p_lock)); + + ld = lwp_hash_lookup(p, tid); + if (ld != NULL) { + lx_lwp_data_t *lwpd; + kthread_t *t; + + t = ld->ld_entry->le_thread; + lwpd = ttolxlwp(t); + if (lwpd != NULL && lwpd->br_pid == lpid) { + /* + * A thread matching the LX pid is still present in the + * process. Send a targeted signal as requested. + */ + it->it_pending = 1; + mutex_exit(&it->it_mutex); + sigaddqa(p, t, it->it_sigq); + return; + } + } + + mutex_exit(&it->it_mutex); +} + +long +lx_timer_create(int clock, lx_sigevent_t *sevp, timer_t *tidp) +{ + int error; + lx_sigevent_t lev; + struct sigevent sev; + clock_backend_t *backend = NULL; + proc_t *p = curproc; + itimer_t *itp; + timer_t tid; + + if (clock == -2) { + /* + * A change was made to the old userspace timer emulation to + * handle this specific clock ID for MapR. It was wrongly + * mapped to CLOCK_REALTIME rather than CLOCK_THREAD_CPUTIME_ID + * which it maps to. 
Until the CLOCK_*_CPUTIME_ID timers can + * be emulated, the admittedly incorrect mapping will remain. + */ + backend = clock_get_backend(CLOCK_REALTIME); + } else { + lx_clock_backend_t *lback = LX_CLOCK_BACKEND(clock); + + if (lback != NULL) { + backend = clock_get_backend(lback->lclk_ntv_id); + } + } + if (backend == NULL) { + return (set_errno(EINVAL)); + } + + /* We have to convert the Linux sigevent layout to the illumos layout */ + if (sevp != NULL) { + if ((error = lx_sigev_copyin(sevp, &lev)) != 0) { + return (set_errno(error)); + } + if ((error = lx_ltos_sigev(&lev, &sev)) != 0) { + return (set_errno(error)); + } + } else { + bzero(&sev, sizeof (sev)); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGALRM; + } + + if ((error = timer_setup(backend, &sev, NULL, &itp, &tid)) != 0) { + return (set_errno(error)); + } + + /* + * The SIGEV_THREAD_ID notification method in Linux allows the caller + * to target a specific thread to receive the signal. The IT_CALLBACK + * timer functionality is used to fulfill this need. After translating + * the LX pid to a SunOS thread ID (ensuring it exists in the current + * process), those IDs are attached to the timer along with the custom + * lx_sigev_thread_fire callback. This targets the signal notification + * properly when the timer fires. + */ + if (lev.lx_sigev_notify == LX_SIGEV_THREAD_ID) { + pid_t lpid, spid; + id_t stid; + + lpid = (pid_t)lev.lx_sigev_un.lx_tid; + if (lx_lpid_to_spair(lpid, &spid, &stid) != 0 || + spid != curproc->p_pid) { + error = EINVAL; + goto err; + } + + itp->it_flags |= IT_CALLBACK; + itp->it_cb_func = lx_sigev_thread_fire; + LX_SIGEV_THREAD_ID_LPID(itp) = lpid; + LX_SIGEV_THREAD_ID_TID(itp) = stid; + } + + /* + * When the sigevent is not specified, its sigev_value field is + * expected to be populated with the timer ID. 
+ */ + if (sevp == NULL) { + itp->it_sigq->sq_info.si_value.sival_int = tid; + } + + if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { + error = EFAULT; + goto err; + } + + timer_release(p, itp); + return (0); + +err: + timer_delete_grabbed(p, tid, itp); + return (set_errno(error)); +} + +long +lx_gettimeofday(struct timeval *tvp, struct lx_timezone *tzp) +{ + struct lx_timezone tz; + + bzero(&tz, sizeof (tz)); + + /* + * We want to be similar to libc which just does a fasttrap to + * gethrestime and simply converts that result. We follow how uniqtime + * does the conversion but we can't use that code since it does some + * extra work which can cause the result to bounce around based on which + * CPU we run on. + */ + if (tvp != NULL) { + struct timeval tv; + timestruc_t ts; + int usec, nsec; + + gethrestime(&ts); + nsec = ts.tv_nsec; + usec = nsec + (nsec >> 2); + usec = nsec + (usec >> 1); + usec = nsec + (usec >> 2); + usec = nsec + (usec >> 4); + usec = nsec - (usec >> 3); + usec = nsec + (usec >> 2); + usec = nsec + (usec >> 3); + usec = nsec + (usec >> 4); + usec = nsec + (usec >> 1); + usec = nsec + (usec >> 6); + usec = usec >> 10; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = usec; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyout(&tv, tvp, sizeof (tv)) != 0) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + else { + struct timeval32 tv32; + + if (TIMEVAL_OVERFLOW(&tv)) + return (set_errno(EOVERFLOW)); + TIMEVAL_TO_TIMEVAL32(&tv32, &tv); + + if (copyout(&tv32, tvp, sizeof (tv32))) + return (set_errno(EFAULT)); + } +#endif + } + + /* + * The Linux man page states use of the second parameter is obsolete, + * but gettimeofday(2) should still return EFAULT if it is set + * to a bad non-NULL pointer (sigh...) + */ + if (tzp != NULL && copyout(&tz, tzp, sizeof (tz)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* + * On Linux a bad buffer will set errno to EFAULT, and on Illumos the failure + * mode is documented as "undefined." 
+ */ +long +lx_time(time_t *tp) +{ + timestruc_t ts; + struct timeval tv; + + gethrestime(&ts); + tv.tv_sec = ts.tv_sec; + tv.tv_usec = 0; /* only whole seconds are reported; tv exists so the TIMEVAL_* macros can be reused */ + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (tp != NULL && + copyout(&tv.tv_sec, tp, sizeof (tv.tv_sec)) != 0) + return (set_errno(EFAULT)); + + return (tv.tv_sec); /* the time is returned even when tp is NULL */ + } +#ifdef _SYSCALL32_IMPL + else { + struct timeval32 tv32; + + if (TIMEVAL_OVERFLOW(&tv)) + return (set_errno(EOVERFLOW)); + TIMEVAL_TO_TIMEVAL32(&tv32, &tv); + + if (tp != NULL && + copyout(&tv32.tv_sec, tp, sizeof (tv32.tv_sec))) + return (set_errno(EFAULT)); + + return (tv32.tv_sec); + } +#endif /* _SYSCALL32_IMPL */ + /* NOTREACHED */ +} + +long +lx_nanosleep(timespec_t *rqtp, timespec_t *rmtp) +{ + return (nanosleep(rqtp, rmtp)); /* both arguments are handed to nanosleep() unchanged */ +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_umask.c b/usr/src/uts/common/brand/lx/syscall/lx_umask.c new file mode 100644 index 0000000000..cb5e4ed232 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_umask.c @@ -0,0 +1,52 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/lx_misc.h> +#include <lx_syscall.h> + +/* From usr/src/uts/common/syscall/umask.c */ +extern int umask(int); + +/* + * Just do what umask() does, but for the given process.
+ */ +static int +lx_clone_umask_cb(proc_t *pp, void *arg) +{ + mode_t cmask = (mode_t)(intptr_t)arg; /* the new mask arrives packed into the opaque walker argument */ + mode_t orig; + + orig = PTOU(pp)->u_cmask; + PTOU(pp)->u_cmask = (mode_t)(cmask & PERMMASK); + return ((int)orig); /* report the mask that was in effect before this update */ +} + +long +lx_umask(mode_t cmask) +{ + lx_proc_data_t *lproc = ttolxproc(curthread); + + /* Handle the rare case of being in a CLONE_FS clone group */ + if (lx_clone_grp_member(lproc, LX_CLONE_FS)) { + int omask; + + omask = lx_clone_grp_walk(lproc, LX_CLONE_FS, lx_clone_umask_cb, + (void *)(intptr_t)cmask); + return (omask); + } + + return (umask(cmask)); /* not in a CLONE_FS group: the native umask() is used directly */ +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_uname.c b/usr/src/uts/common/brand/lx/syscall/lx_uname.c new file mode 100644 index 0000000000..2d18408eaa --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_uname.c @@ -0,0 +1,82 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc.
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> + +struct lx_utsname { + char lxu_sysname[LX_SYS_UTS_LN]; + char lxu_nodename[LX_SYS_UTS_LN]; + char lxu_release[LX_SYS_UTS_LN]; + char lxu_version[LX_SYS_UTS_LN]; + char lxu_machine[LX_SYS_UTS_LN]; + char lxu_domainname[LX_SYS_UTS_LN]; +}; + +long +lx_uname(void *uptr) +{ + proc_t *p = curproc; + lx_proc_data_t *lxpd = ptolxproc(p); + lx_zone_data_t *lxzd = ztolxzd(p->p_zone); + struct lx_utsname un; + + bzero(&un, sizeof (un)); /* the whole struct is copied out below, so start from all-zero */ + + (void) strlcpy(un.lxu_sysname, LX_UNAME_SYSNAME, LX_SYS_UTS_LN); + (void) strlcpy(un.lxu_nodename, p->p_zone->zone_nodename, + LX_SYS_UTS_LN); + + mutex_enter(&lxzd->lxzd_lock); /* held while the release/version strings are read */ + + if (lxpd->l_uname_release[0] != '\0') { /* a non-empty per-process release takes precedence over the zone's */ + (void) strlcpy(un.lxu_release, lxpd->l_uname_release, + LX_SYS_UTS_LN); + } else { + (void) strlcpy(un.lxu_release, lxzd->lxzd_kernel_release, + LX_SYS_UTS_LN); + } + if (lxpd->l_uname_version[0] != '\0') { /* same precedence rule for the version string */ + (void) strlcpy(un.lxu_version, lxpd->l_uname_version, + LX_SYS_UTS_LN); + } else { + (void) strlcpy(un.lxu_version, lxzd->lxzd_kernel_version, + LX_SYS_UTS_LN); + } + + mutex_exit(&lxzd->lxzd_lock); + + if (get_udatamodel() == DATAMODEL_LP64) { /* machine string follows the caller's data model */ + (void) strlcpy(un.lxu_machine, LX_UNAME_MACHINE64, + LX_SYS_UTS_LN); + } else { + (void) strlcpy(un.lxu_machine, LX_UNAME_MACHINE32, + LX_SYS_UTS_LN); + } + (void) strlcpy(un.lxu_domainname, p->p_zone->zone_domain, + LX_SYS_UTS_LN); + + if (copyout(&un, uptr, sizeof (un)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_wait.c b/usr/src/uts/common/brand/lx/syscall/lx_wait.c new file mode 100644 index 0000000000..3a5ba69b93 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_wait.c @@ -0,0 +1,377 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common
Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + */ + +/* + * wait() family of functions. + * + * The first minor difference between the Linux and Solaris family of wait() + * calls is that the values for WNOHANG and WUNTRACED are different. Thankfully, + * the exit status values are identical between the two implementations. + * + * Things get very different and very complicated when we introduce the Linux + * threading model. Under linux, both threads and child processes are + * represented as processes. However, the behavior of wait() with respect to + * each child varies according to the flags given to clone() + * + * SIGCHLD The SIGCHLD signal should be sent on termination + * CLONE_THREAD The child shares the same thread group as the parent + * CLONE_DETACHED The parent receives no notification when the child exits + * + * The following flags control the Linux behavior w.r.t. 
the above attributes: + * + * __WALL Wait on all children, regardless of type + * __WCLONE Wait only on non-SIGCHLD children + * __WNOTHREAD Don't wait on children of other threads in this group + * + * The following chart shows whether wait() returns when the child exits: + * + * default __WCLONE __WALL + * no SIGCHLD - X X + * SIGCHLD X - X + * + * The following chart shows whether wait() returns when the grandchild exits: + * + * default __WNOTHREAD + * no CLONE_THREAD - - + * CLONE_THREAD X - + * + * The CLONE_DETACHED flag is universal - when the child exits, no state is + * stored and wait() has no effect. + * + * XXX Support the above combination of options, or some reasonable subset that + * covers at least fork() and pthread_create(). + */ + +#include <sys/wait.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_misc.h> +#include <lx_signum.h> +#include <lx_errno.h> +#include <lx_syscall.h> + +/* + * From "uts/common/os/exit.c" and "uts/common/syscall/rusagesys.c": + */ +extern int waitid(idtype_t, id_t, k_siginfo_t *, int); +extern int rusagesys(int, void *, void *, void *, void *); + +/* + * Convert between Linux options and Solaris options, returning -1 if any + * invalid flags are found. + */ +#define LX_WNOHANG 0x00000001 +#define LX_WUNTRACED 0x00000002 +#define LX_WSTOPPED LX_WUNTRACED +#define LX_WEXITED 0x00000004 +#define LX_WCONTINUED 0x00000008 +#define LX_WNOWAIT 0x01000000 + +#define LX_WNOTHREAD 0x20000000 +#define LX_WALL 0x40000000 +#define LX_WCLONE 0x80000000 + +#define LX_P_ALL 0x0 +#define LX_P_PID 0x1 +#define LX_P_GID 0x2 + +/* + * Split the passed waitpid/waitid options into two separate variables: + * those for the native illumos waitid(2), and the extra Linux-specific + * options we will handle in our brand-specific code. 
+ */ +static int +ltos_options(uintptr_t options, int *native_options, int *extra_options) +{ + int newoptions = 0; + + if (((options) & ~(LX_WNOHANG | LX_WUNTRACED | LX_WEXITED | + LX_WCONTINUED | LX_WNOWAIT | LX_WNOTHREAD | LX_WALL | + LX_WCLONE)) != 0) { + return (-1); /* a bit outside the supported set was passed; outputs are left untouched */ + } + + *extra_options = options & (LX_WNOTHREAD | LX_WALL | LX_WCLONE); + + if (options & LX_WNOHANG) + newoptions |= WNOHANG; + if (options & LX_WUNTRACED) + newoptions |= WUNTRACED; + if (options & LX_WEXITED) + newoptions |= WEXITED; + if (options & LX_WCONTINUED) + newoptions |= WCONTINUED; + if (options & LX_WNOWAIT) + newoptions |= WNOWAIT; + + /* + * The trapped option is implicit on Linux. + */ + newoptions |= WTRAPPED; + + *native_options = newoptions; + return (0); +} + +static int +lx_wstat(int code, int status) +{ + int stat = 0; /* stays 0 for any code not listed in the switch */ + + switch (code) { + case CLD_EXITED: + stat = status << 8; /* exit value is placed above the low byte */ + break; + case CLD_DUMPED: + stat = lx_stol_signo(status, SIGKILL) | WCOREFLG; + break; + case CLD_KILLED: + stat = lx_stol_signo(status, SIGKILL); + break; + case CLD_TRAPPED: + case CLD_STOPPED: + stat = (lx_stol_status(status, SIGKILL) << 8) | WSTOPFLG; /* NOTE(review): sibling cases call lx_stol_signo(); confirm lx_stol_status() is intended here */ + break; + case CLD_CONTINUED: + stat = WCONTFLG; + break; + } + + return (stat); +} + +static int +lx_call_waitid(idtype_t idtype, id_t id, k_siginfo_t *sip, int native_options, + int extra_options) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + int error; /* returned as a plain errno value; callers apply set_errno() */ + + /* + * Our brand-specific waitid helper only understands a subset of + * the possible idtypes. Ensure we keep to that subset here: + */ + if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { + return (EINVAL); + } + + /* + * Enable the return of emulated ptrace(2) stop conditions + * through lx_waitid_helper, and stash the Linux-specific + * extra waitid() flags.
+ */ + lwpd->br_waitid_emulate = B_TRUE; + lwpd->br_waitid_flags = extra_options; + + if ((error = waitid(idtype, id, sip, native_options)) == EINTR) { + /* + * According to signal(7), the wait4(2), waitid(2), and + * waitpid(2) system calls are restartable. + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + + lwpd->br_waitid_emulate = B_FALSE; /* undo the per-call emulation state set before waitid() */ + lwpd->br_waitid_flags = 0; + + return (error); +} + +long +lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) +{ + k_siginfo_t info = { 0 }; + idtype_t idtype; + id_t id; + int status = 0; + pid_t pid = (pid_t)p1; /* p1..p4 are used as: pid, status pointer, Linux options, rusage pointer */ + int error; + int native_options, extra_options; + int *statusp = (int *)p2; + void *rup = (void *)p4; + + if (ltos_options(p3, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } + + if (pid > maxpid) { + return (set_errno(ECHILD)); + } + + /* + * While not listed as a valid return code, Linux's wait4(2) does, + * in fact, get an EFAULT if either the status pointer or rusage + * pointer is invalid. Since a failed waitpid should leave child + * process in a state where a future wait4(2) will succeed, we + * check them by copying out the values their buffers originally + * contained. (We need to do this as a failed system call should + * never affect the contents of a passed buffer.) + * + * This will fail if the buffers in question are write-only. + */ + if (statusp != NULL) { + if (copyin(statusp, &status, sizeof (status)) != 0 || + copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } + + /* + * Do the same check for the "struct rusage" pointer, which differs + * in size for 32- and 64-bit processes.
+ */ + if (rup != NULL) { + struct rusage ru; + void *krup = &ru; + size_t rusz = sizeof (ru); +#if defined(_SYSCALL32_IMPL) + struct rusage32 ru32; + + if (get_udatamodel() != DATAMODEL_NATIVE) { + krup = &ru32; + rusz = sizeof (ru32); + } +#endif /* _SYSCALL32_IMPL */ + + if (copyin(rup, krup, rusz) != 0 || + copyout(krup, rup, rusz) != 0) { + return (set_errno(EFAULT)); + } + } + + if (pid < -1) { /* any child in process group -pid */ + idtype = P_PGID; + id = -pid; + } else if (pid == -1) { /* any child */ + idtype = P_ALL; + id = 0; + } else if (pid == 0) { /* any child in the caller's own process group */ + idtype = P_PGID; + mutex_enter(&pidlock); + id = curproc->p_pgrp; + mutex_exit(&pidlock); + } else { /* exactly this pid */ + idtype = P_PID; + id = pid; + } + + native_options |= (WEXITED | WTRAPPED); /* always requested here, whatever the caller passed */ + + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); + } + + /* + * If the WNOHANG flag was specified and no child was found return 0. + */ + if ((native_options & WNOHANG) && info.si_pid == 0) { + return (0); + } + + status = lx_wstat(info.si_code, info.si_status); + + /* + * Unfortunately if this attempt to copy out either the status or the + * rusage fails, the process will be in an inconsistent state as + * subsequent calls to wait for the same child will fail where they + * should succeed on a Linux system. This, however, is rather + * unlikely since we tested the validity of both above.
+ */ + if (statusp != NULL) { + if (copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } + + if (rup != NULL) { + if ((error = rusagesys(_RUSAGESYS_GETRUSAGE_CHLD, rup, NULL, + NULL, NULL)) != 0) { + return (set_errno(error)); + } + } + + return (info.si_pid); /* success: the pid reported by waitid() */ +} + +long +lx_waitpid(uintptr_t p1, uintptr_t p2, uintptr_t p3) +{ + return (lx_wait4(p1, p2, p3, (uintptr_t)NULL)); /* same as lx_wait4() with no rusage buffer */ +} + +long +lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt) +{ + int error; + int native_options, extra_options; + k_siginfo_t info = { 0 }; + + if (ltos_options(opt, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } + + if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0) { /* at least one child state must be requested */ + return (set_errno(EINVAL)); + } + + switch (idtype) { /* map the Linux idtype constant onto the native one */ + case LX_P_ALL: + idtype = P_ALL; + break; + case LX_P_PID: + idtype = P_PID; + break; + case LX_P_GID: + idtype = P_PGID; + break; + default: + return (set_errno(EINVAL)); + } + + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); + } + + /* + * If the WNOHANG flag was specified and no child was found return 0. + */ + if ((native_options & WNOHANG) && info.si_pid == 0) { + return (0); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + return (stol_ksiginfo32_copyout(&info, (void *)infop)); + } else +#endif + { + return (stol_ksiginfo_copyout(&info, (void *)infop)); + } +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_xattr.c b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c new file mode 100644 index 0000000000..19bf9a4ebb --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c @@ -0,0 +1,519 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL.
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/lx_acl.h> + + +#define LX_XATTR_NAME_MAX 255 +#define LX_XATTR_SIZE_MAX 65536 +#define LX_XATTR_LIST_MAX 65536 + +#define LX_XATTR_FLAG_CREATE 0x1 +#define LX_XATTR_FLAG_REPLACE 0x2 +#define LX_XATTR_FLAGS_VALID (LX_XATTR_FLAG_CREATE | LX_XATTR_FLAG_REPLACE) + +enum lx_xattr_ns { + LX_XATTR_NS_SECURITY, + LX_XATTR_NS_SYSTEM, + LX_XATTR_NS_TRUSTED, + LX_XATTR_NS_USER, + LX_XATTR_NS_INVALID /* Catch-all for invalid namespaces */ +}; + +/* Present under the 'security.' namespace */ +#define LX_XATTR_CAPABILITY "capability" + +typedef struct lx_xattr_ns_list { + const char *lxnl_name; /* namespace prefix, trailing '.' included */ + unsigned lxnl_len; /* length of lxnl_name in bytes */ + enum lx_xattr_ns lxnl_ns; /* namespace reported when the prefix matches */ +} lx_xattr_ns_list_t; + +static lx_xattr_ns_list_t lx_xattr_namespaces[] = { + { "user.", 5, LX_XATTR_NS_USER }, + { "system.", 7, LX_XATTR_NS_SYSTEM }, + { "trusted.", 8, LX_XATTR_NS_TRUSTED }, + { "security.", 9, LX_XATTR_NS_SECURITY }, + { NULL, 0, LX_XATTR_NS_INVALID } /* sentinel: a NULL name ends the lookup loop */ +}; + +static int +lx_xattr_parse(const char *name, size_t nlen, const char **key) +{ + lx_xattr_ns_list_t *lxn = lx_xattr_namespaces; + + for (; lxn->lxnl_name != NULL; lxn++) { + if (nlen < lxn->lxnl_len) { /* name is too short to carry this prefix */ + continue; + } + if (strncmp(lxn->lxnl_name, name, lxn->lxnl_len) == 0) { + *key = name + (lxn->lxnl_len); /* key is whatever follows the namespace prefix */ + return (lxn->lxnl_ns); + } + } + + *key = name; /* no known prefix: hand back the whole name */ + return (LX_XATTR_NS_INVALID); +} + +/* + * *xattr() family of functions. + * + * These are largely unimplemented. In most cases we return EOPNOTSUPP, rather + * than using NOSYS_NO_EQUIV to avoid unwanted stderr output from ls(1). + * + * Note that CRED() is used instead of f_cred in the f*xattr functions.
This + * is intentional as Linux does not have the same notion of per-fd credentials. + */ + +/* ARGSUSED */ +static int +lx_setxattr_common(vnode_t *vp, char *name, void *value, size_t sz, int flags) +{ + int error, type; + char name_buf[LX_XATTR_NAME_MAX + 1]; + const char *key; + size_t name_len; + void *buf = NULL; /* kernel copy of the value; stays NULL when sz is 0 */ + + if ((flags & ~LX_XATTR_FLAGS_VALID) != 0) { + return (EINVAL); + } + error = copyinstr(name, name_buf, sizeof (name_buf), &name_len); + if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) { /* over-long names are reported as ERANGE */ + return (ERANGE); + } else if (error != 0) { + return (EFAULT); + } + + type = lx_xattr_parse(name_buf, name_len, &key); + + if (sz != 0) { + if (sz > LX_XATTR_SIZE_MAX) { + return (E2BIG); + } + buf = kmem_alloc(sz, KM_SLEEP); + if (copyin(value, buf, sz) != 0) { + kmem_free(buf, sz); + return (EFAULT); + } + } + + error = EOPNOTSUPP; /* result for every namespace/key the switch does not handle */ + switch (type) { + case LX_XATTR_NS_SECURITY: + /* + * In order to keep package management software happy, despite + * lacking support for file-based Linux capabilities via + * xattrs, we fake success when root attempts a setxattr on + * that attribute.
+ */ + if (crgetuid(CRED()) == 0 && + strcmp(key, LX_XATTR_CAPABILITY) == 0) { + error = 0; + } + break; + case LX_XATTR_NS_SYSTEM: + if (strcmp(key, LX_XATTR_POSIX_ACL_ACCESS) == 0) { + error = lx_acl_setxattr(vp, LX_ACL_ACCESS, buf, sz); + } else if (strcmp(key, LX_XATTR_POSIX_ACL_DEFAULT) == 0) { + error = lx_acl_setxattr(vp, LX_ACL_DEFAULT, buf, sz); + } /* FALLTHROUGH */ + default: + break; + } + + if (buf != NULL) { + kmem_free(buf, sz); + } + return (error); +} + +/* ARGSUSED */ +static int +lx_getxattr_common(vnode_t *vp, char *name, char *value, size_t sz, + ssize_t *osz) +{ + int error, type; + char name_buf[LX_XATTR_NAME_MAX + 1]; + const char *key; + size_t name_len; + void *buf = NULL; /* kernel staging buffer; stays NULL when sz is 0 */ + + error = copyinstr(name, name_buf, sizeof (name_buf), &name_len); + if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) { + return (ERANGE); + } else if (error != 0) { + return (EFAULT); + } + if (sz != 0) { + if (sz > LX_XATTR_SIZE_MAX) { + sz = LX_XATTR_SIZE_MAX; /* oversized requests are clamped rather than rejected */ + } + buf = kmem_alloc(sz, KM_SLEEP); + } + + type = lx_xattr_parse(name_buf, name_len, &key); + + error = EOPNOTSUPP; /* result for every namespace/key the switch does not handle */ + switch (type) { + case LX_XATTR_NS_SYSTEM: + if (strcmp(key, LX_XATTR_POSIX_ACL_ACCESS) == 0) { + error = lx_acl_getxattr(vp, LX_ACL_ACCESS, buf, sz, + osz); + } else if (strcmp(key, LX_XATTR_POSIX_ACL_DEFAULT) == 0) { + error = lx_acl_getxattr(vp, LX_ACL_DEFAULT, buf, sz, + osz); + } + break; + default: + break; + } + + if (error == 0 && buf != NULL) { /* with sz == 0 there is no buffer, so nothing is copied out */ + VERIFY(*osz <= sz); + + if (copyout(buf, value, *osz) != 0) { + error = EFAULT; + } + } + if (buf != NULL) { + kmem_free(buf, sz); + } + return (error); +} + +/* ARGSUSED */ +static int +lx_listxattr_common(vnode_t *vp, void *value, size_t size, ssize_t *osize) +{ + struct uio auio; + struct iovec aiov; + int err = 0; + + aiov.iov_base = value; /* single-segment uio describing the caller's buffer */ + aiov.iov_len = size; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_loffset = 0; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_resid = size; + auio.uio_fmode = 0; + auio.uio_extflg =
UIO_COPY_CACHED; + + /* + * Call into all the listxattr routines (which may be no-ops) which are + * currently implemented. + */ + err = lx_acl_listxattr(vp, &auio); + + if (err == 0) { + *osize = size - auio.uio_resid; + } + + return (err); +} + +/* ARGSUSED */ +static int +lx_removexattr_common(vnode_t *vp, char *name) +{ + int error, type; + char name_buf[LX_XATTR_NAME_MAX + 1]; + const char *key; + size_t name_len; + + error = copyinstr(name, name_buf, sizeof (name_buf), &name_len); + if (error == ENAMETOOLONG || name_len == sizeof (name_buf)) { + return (ERANGE); + } else if (error != 0) { + return (EFAULT); + } + + + type = lx_xattr_parse(name_buf, name_len, &key); + + error = EOPNOTSUPP; + switch (type) { + case LX_XATTR_NS_SYSTEM: + if (strcmp(key, LX_XATTR_POSIX_ACL_ACCESS) == 0) { + error = lx_acl_removexattr(vp, LX_ACL_ACCESS); + } else if (strcmp(key, LX_XATTR_POSIX_ACL_DEFAULT) == 0) { + error = lx_acl_removexattr(vp, LX_ACL_DEFAULT); + } + default: + break; + } + + return (EOPNOTSUPP); +} + + +long +lx_setxattr(char *path, char *name, void *value, size_t size, int flags) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_setxattr_common(vp, name, value, size, flags); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_lsetxattr(char *path, char *name, void *value, size_t size, int flags) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_setxattr_common(vp, name, value, size, flags); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +long +lx_fsetxattr(int fd, char *name, void *value, size_t size, int flags) +{ + int error; + file_t *fp; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = 
lx_setxattr_common(fp->f_vnode, name, value, size, flags); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +ssize_t +lx_getxattr(char *path, char *name, void *value, size_t size) +{ + int error; + vnode_t *vp = NULL; + ssize_t osize; /* only meaningful once lx_getxattr_common() has returned 0 */ + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_getxattr_common(vp, name, value, size, &osize); + VN_RELE(vp); /* done with the vnode obtained from lookupname() */ + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_lgetxattr(char *path, char *name, void *value, size_t size) +{ + + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); /* differs from lx_getxattr() only in passing NO_FOLLOW */ + if (error != 0) { + return (set_errno(error)); + } + + error = lx_getxattr_common(vp, name, value, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_fgetxattr(int fd, char *name, void *value, size_t size) +{ + int error; + file_t *fp; + ssize_t osize; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + /* + * When a file is opened with O_PATH we clear read/write and fgetxattr + * is expected to return EBADF.
+ */ + if ((fp->f_flag & (FREAD | FWRITE)) == 0) { + releasef(fd); /* balance the getf() above before bailing out */ + return (set_errno(EBADF)); + } + + error = lx_getxattr_common(fp->f_vnode, name, value, size, &osize); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_listxattr(char *path, char *list, size_t size) +{ + int error; + vnode_t *vp = NULL; + ssize_t osize; /* only meaningful once lx_listxattr_common() has returned 0 */ + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_listxattr_common(vp, list, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_llistxattr(char *path, char *list, size_t size) +{ + int error; + vnode_t *vp = NULL; + ssize_t osize; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); /* differs from lx_listxattr() only in passing NO_FOLLOW */ + if (error != 0) { + return (set_errno(error)); + } + + error = lx_listxattr_common(vp, list, size, &osize); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +ssize_t +lx_flistxattr(int fd, char *list, size_t size) +{ + int error; + file_t *fp; + ssize_t osize; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = lx_listxattr_common(fp->f_vnode, list, size, &osize); + releasef(fd); + + if (error != 0) { + return (set_errno(error)); + } + return (osize); +} + +int +lx_removexattr(char *path, char *name) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, FOLLOW, NULLVPP, &vp); + if (error != 0) { + return (set_errno(error)); + } + + error = lx_removexattr_common(vp, name); + VN_RELE(vp); + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} + +int +lx_lremovexattr(char *path, char *name) +{ + int error; + vnode_t *vp = NULL; + + error = lookupname(path, UIO_USERSPACE, NO_FOLLOW, NULLVPP, &vp); /* differs from lx_removexattr() only in passing NO_FOLLOW */ + if (error != 0) { + return (set_errno(error)); + } + + error = lx_removexattr_common(vp, name); + VN_RELE(vp); + + if (error != 0) { +
return (set_errno(error)); + } + return (0); +} + +int +lx_fremovexattr(int fd, char *name) +{ + int error; + file_t *fp; + + if ((fp = getf(fd)) == NULL) { + return (set_errno(EBADF)); + } + + error = lx_removexattr_common(fp->f_vnode, name); + releasef(fd); /* balance the getf() above */ + + if (error != 0) { + return (set_errno(error)); + } + return (0); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h new file mode 100644 index 0000000000..f34ed31dcb --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h @@ -0,0 +1,198 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc.
+ */ + +#ifndef _LXSYSFS_H +#define _LXSYSFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lx_sysfs.h: declarations, data structures and macros for lx_sysfs + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> +#include <sys/netstack.h> +#include <inet/ip.h> +#include <inet/ip_if.h> + +/* + * Convert a vnode into an lxsys_mnt_t + */ +#define VTOLXSM(vp) ((lxsys_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxsys_node + */ +#define VTOLXS(vp) ((lxsys_node_t *)(vp)->v_data) + +/* + * convert a lxsys_node into a vnode + */ +#define LXSTOV(lxsnp) ((lxsnp)->lxsys_vnode) + +/* + * convert a lxsys_node into zone for fs + */ +#define LXSTOZ(lxsnp) \ + (((lxsys_mnt_t *)(lxsnp)->lxsys_vnode->v_vfsp->vfs_data)->lxsysm_zone) + +#define LXSNSIZ 256 /* max size of lx /sys file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXSYS_SDSIZE 16 + +/* Root sysfs lxsys_instance */ +#define LXSYS_INST_ROOT 0 + +/* + * Node/file types for lx /sys files + * (directories and files contained therein). 
+ */ +typedef enum lxsys_nodetype { + LXSYS_NONE, /* None-type to keep inodes non-zero */ + LXSYS_STATIC, /* Statically defined entries */ + LXSYS_CLASS_NET, /* /sys/class/net/<iface> */ + LXSYS_DEV_NET, /* /sys/devices/virtual/net/<iface> */ + LXSYS_BLOCK, /* /sys/block/<dev> */ + LXSYS_DEV_ZFS, /* /sys/devices/zfs/<dev> */ + LXSYS_DEV_SYS_CPU, /* /sys/devices/system/cpu/<cpu> */ + LXSYS_DEV_SYS_CPUINFO, /* /sys/devices/system/cpu/cpuN/<info> */ + LXSYS_DEV_SYS_NODE, /* /sys/devices/system/node/node0/<info> */ + LXSYS_MAXTYPE, /* type limit */ +} lxsys_nodetype_t; + +/* + * external dirent characteristics + */ +typedef struct { + unsigned int d_idnum; + char *d_name; +} lxsys_dirent_t; + +typedef struct { + unsigned int dl_instance; + lxsys_dirent_t *dl_list; + int dl_length; +} lxsys_dirlookup_t; + +/* + * This is the lx sysfs private data object + * which is attached to v_data in the vnode structure + */ +struct lxsys_node; +typedef struct lxsys_node lxsys_node_t; +struct lxsys_node { + lxsys_nodetype_t lxsys_type; /* type ID of node */ + unsigned int lxsys_instance; /* instance ID node */ + unsigned int lxsys_endpoint; /* endpoint ID node */ + vnode_t *lxsys_vnode; /* vnode for the node */ + vnode_t *lxsys_parentvp; /* parent directory */ + lxsys_node_t *lxsys_next; /* next list entry */ + timestruc_t lxsys_time; /* creation time */ + mode_t lxsys_mode; /* file mode bits */ + uid_t lxsys_uid; /* file owner */ + gid_t lxsys_gid; /* file group owner */ + ino_t lxsys_ino; /* node id */ +}; + +/* + * This is the lxsysfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxsys_mnt { + kmutex_t lxsysm_lock; /* protects fields */ + lxsys_node_t *lxsysm_node; /* node at root of sys mount */ + zone_t *lxsysm_zone; /* zone for this mount */ +} lxsys_mnt_t; + +extern vnodeops_t *lxsys_vnodeops; + +typedef struct mounta mounta_t; + +extern void lxsys_initnodecache(); +extern void lxsys_fininodecache(); +extern ino_t 
lxsys_inode(lxsys_nodetype_t, unsigned int, unsigned int); +extern ino_t lxsys_parentinode(lxsys_node_t *); +extern lxsys_node_t *lxsys_getnode(vnode_t *, lxsys_nodetype_t, unsigned int, + unsigned int); +extern lxsys_node_t *lxsys_getnode_static(vnode_t *, unsigned int); +extern void lxsys_freenode(lxsys_node_t *); + +extern netstack_t *lxsys_netstack(lxsys_node_t *); +extern ill_t *lxsys_find_ill(ip_stack_t *, uint_t); + +extern int lxsys_ino_get_type(ino_t); + +typedef struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t bufsize; + char *pos; + size_t beg; + int error; +} lxsys_uiobuf_t; + +extern lxsys_uiobuf_t *lxsys_uiobuf_new(uio_t *); +extern void lxsys_uiobuf_free(lxsys_uiobuf_t *); +extern void lxsys_uiobuf_seterr(lxsys_uiobuf_t *, int); +extern int lxsys_uiobuf_flush(lxsys_uiobuf_t *); +extern void lxsys_uiobuf_write(lxsys_uiobuf_t *, const char *, size_t); +extern void lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...); + +#ifdef __cplusplus +} +#endif + +#ifndef islower +#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z')) +#endif +#ifndef toupper +#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x)) +#endif + +#endif /* _LXSYSFS_H */ diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c new file mode 100644 index 0000000000..69234ddbaa --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c @@ -0,0 +1,443 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +/* + * lx_syssubr.c: Various functions for the /sys vnodeops. 
+ */ + +#include <sys/varargs.h> + +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lx_sysfs.h" + +#define LXSYSCACHE_NAME "lxsys_cache" + +static int lxsys_node_constructor(void *, void *, int); +static void lxsys_node_destructor(void *, void *); + +static kmem_cache_t *lxsys_node_cache; + +void +lxsys_initnodecache() +{ + lxsys_node_cache = kmem_cache_create(LXSYSCACHE_NAME, + sizeof (lxsys_node_t), 0, + lxsys_node_constructor, lxsys_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxsys_fininodecache() +{ + kmem_cache_destroy(lxsys_node_cache); +} + +/* ARGSUSED */ +static int +lxsys_node_constructor(void *buf, void *un, int kmflags) +{ + lxsys_node_t *lxsnp = buf; + vnode_t *vp; + + vp = lxsnp->lxsys_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxsys_vnodeops); + vp->v_data = lxsnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxsys_node_destructor(void *buf, void *un) +{ + lxsys_node_t *lxsnp = buf; + + vn_free(LXSTOV(lxsnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxsys node + */ +ino_t +lxsys_inode(lxsys_nodetype_t type, unsigned int instance, + unsigned int endpoint) +{ + /* + * Sysfs Inode format: + * 0000AABBBBCC + * + * AA - TYPE + * BBBB - INSTANCE + * CC - ENDPOINT + */ + ASSERT(instance <= 0xffff); + ASSERT(endpoint <= 0xff); + + return ((ino_t)(type << 24)|(instance << 8)|endpoint); +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxsys_parentinode(lxsys_node_t *lxsnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxsnp->lxsys_type == LXSYS_STATIC && + lxsnp->lxsys_instance == LXSYS_INST_ROOT) { + return (lxsnp->lxsys_ino); + } else { + return (VTOLXS(lxsnp->lxsys_parentvp)->lxsys_ino); + } +} + +/* + * Allocate a new lxsys node + * + * This 
also allocates the vnode associated with it
+ */
+lxsys_node_t *
+lxsys_getnode(vnode_t *dp, lxsys_nodetype_t type, unsigned int instance,
+    unsigned int endpoint)
+{
+	lxsys_node_t *lxsnp;
+	vnode_t *vp;
+	timestruc_t now;
+
+	/*
+	 * Allocate a new node. It is deallocated in vop_inactive
+	 */
+	lxsnp = kmem_cache_alloc(lxsys_node_cache, KM_SLEEP);
+
+	/*
+	 * Set defaults (may be overridden below).  A hold is taken on the
+	 * parent vnode; lxsys_freenode() drops it again.
+	 */
+	gethrestime(&now);
+	lxsnp->lxsys_type = type;
+	lxsnp->lxsys_instance = instance;
+	lxsnp->lxsys_endpoint = endpoint;
+	lxsnp->lxsys_next = NULL;
+	lxsnp->lxsys_parentvp = dp;
+	VN_HOLD(dp);
+
+	lxsnp->lxsys_time = now;
+	lxsnp->lxsys_uid = lxsnp->lxsys_gid = 0;
+	lxsnp->lxsys_ino = lxsys_inode(type, instance, endpoint);
+
+	/* initialize the vnode data */
+	vp = lxsnp->lxsys_vnode;
+	vn_reinit(vp);
+	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+	vp->v_vfsp = dp->v_vfsp;
+
+	/*
+	 * Default to a directory with open permissions.
+	 * Specific components will override this
+	 */
+	if (type == LXSYS_STATIC && instance == LXSYS_INST_ROOT) {
+		vp->v_flag |= VROOT;
+	}
+	vp->v_type = VDIR;
+	lxsnp->lxsys_mode = 0555;
+
+	return (lxsnp);
+}
+
+/*
+ * Return the persistent LXSYS_STATIC node for the given instance.
+ *
+ * Static nodes are kept on a singly-linked list anchored at the mount's root
+ * node and protected by lxsysm_lock.  If a node for this instance is already
+ * on the list, a new hold is placed on its vnode and it is returned;
+ * otherwise a fresh node is created with lxsys_getnode(), appended to the
+ * tail of the list and returned.
+ */
+lxsys_node_t *
+lxsys_getnode_static(vnode_t *dp, unsigned int instance)
+{
+	lxsys_mnt_t *lxsm = VTOLXSM(dp);
+	lxsys_node_t *lnp, *tail = NULL;
+
+	mutex_enter(&lxsm->lxsysm_lock);
+	for (lnp = lxsm->lxsysm_node; lnp != NULL; lnp = lnp->lxsys_next) {
+		if (lnp->lxsys_instance == instance) {
+			VERIFY(lnp->lxsys_parentvp == dp);
+
+			VN_HOLD(lnp->lxsys_vnode);
+			mutex_exit(&lxsm->lxsysm_lock);
+			return (lnp);
+		} else if (lnp->lxsys_next == NULL) {
+			/* Found no match by the end of the list */
+			tail = lnp;
+			break;
+		}
+	}
+
+	/*
+	 * The root node installed by lxsys_mount() anchors the list, so a
+	 * tail must have been found.  Make that invariant explicit rather
+	 * than dereferencing a NULL pointer if it is ever violated.
+	 */
+	VERIFY(tail != NULL);
+
+	tail->lxsys_next = lxsys_getnode(dp, LXSYS_STATIC, instance, 0);
+	lnp = tail->lxsys_next;
+	/* Allow mounts on static entries */
+	LXSTOV(lnp)->v_flag &= (~VNOMOUNT);
+	mutex_exit(&lxsm->lxsysm_lock);
+	return (lnp);
+}
+
+/* Clean up persistence for static lxsys_node */
+int
+lxsys_freenode_static(lxsys_node_t *lnp) +{ + lxsys_node_t *plnp; + vnode_t *vp = LXSTOV(lnp); + lxsys_mnt_t *lxsm = VTOLXSM(vp); + + if (lnp->lxsys_instance == LXSYS_INST_ROOT) { + /* + * The root vnode does not need special cleanup since it + * anchors the list and is freed by lxsys_unmount. + */ + return (0); + } + + mutex_enter(&lxsm->lxsysm_lock); + + /* + * It is possible that a different process acquired a fresh reference + * to this vnode via lookup while we were waiting on the lxsysm_lock. + * To avoid freeing the vnode out from under them, we will double-check + * v_count and bail from the fop_inactive if it was grabbed. + */ + mutex_enter(&vp->v_lock); + if (vp->v_count != 1) { + VERIFY(vp->v_count > 0); + + /* Release our hold before bailing out of lxsys_inactive */ + vp->v_count--; + + mutex_exit(&vp->v_lock); + mutex_exit(&lxsm->lxsysm_lock); + return (-1); + } + mutex_exit(&vp->v_lock); + + /* search for the record pointing to lnp */ + plnp = lxsm->lxsysm_node; + while (plnp != NULL && plnp->lxsys_next != lnp) { + plnp = plnp->lxsys_next; + } + /* entry should always be found */ + VERIFY(plnp != NULL); + plnp->lxsys_next = lnp->lxsys_next; + + mutex_exit(&lxsm->lxsysm_lock); + return (0); +} + +/* + * Free the storage obtained from lxsys_getnode(). + */ +void +lxsys_freenode(lxsys_node_t *lxsnp) +{ + vnode_t *vp = LXSTOV(lxsnp); + + VERIFY(vp != NULL); + + if (lxsnp->lxsys_type == LXSYS_STATIC) { + if (lxsys_freenode_static(lxsnp) != 0) { + return; + } + } + + /* + * delete any association with parent vp + */ + if (lxsnp->lxsys_parentvp != NULL) + VN_RELE(lxsnp->lxsys_parentvp); + + /* + * Release the lxsysnode. 
+ */ + kmem_cache_free(lxsys_node_cache, lxsnp); +} + +/* + * Get the netstack associated with this lxsys mount + */ +netstack_t * +lxsys_netstack(lxsys_node_t *lnp) +{ + zone_t *zone = VTOLXSM(LXSTOV(lnp))->lxsysm_zone; + + return (netstack_hold_if_active(zone->zone_netstack)); +} + +ill_t * +lxsys_find_ill(ip_stack_t *ipst, uint_t ifindex) +{ + ill_t *ill; + phyint_t *phyi; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + (void *) &ifindex, NULL); + if (phyi != NULL) { + /* + * Since interface information presented via /sys is not + * specific to IPv4 or IPv6, an ill reference from either + * protocol will be adequate. Check both, starting with IPv4 + * for a valid reference to use. + */ + for (ill = phyi->phyint_illv4; ill != phyi->phyint_illv6; + ill = phyi->phyint_illv6) { + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + if (!ILL_IS_CONDEMNED(ill)) { + ill_refhold_locked(ill); + mutex_exit(&ill->ill_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (ill); + } + mutex_exit(&ill->ill_lock); + } + } + } + rw_exit(&ipst->ips_ill_g_lock); + return (NULL); +} + + +#define LXSYSUIOBUFSZ 4096 + +lxsys_uiobuf_t * +lxsys_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxsys_uiobuf and output buffer */ + int bufsize = LXSYSUIOBUFSZ; + lxsys_uiobuf_t *uiobuf = + kmem_alloc(sizeof (lxsys_uiobuf_t) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->bufsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxsys_uiobuf_free(lxsys_uiobuf_t *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (lxsys_uiobuf_t) + uiobuf->bufsize); +} + +void +lxsys_uiobuf_seterr(lxsys_uiobuf_t *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxsys_uiobuf_flush(lxsys_uiobuf_t *uiobuf) +{ + off_t off = 
uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxsys_uiobuf_write(lxsys_uiobuf_t *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->bufsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxsys_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...) 
+{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxsys_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxsys_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c new file mode 100644 index 0000000000..fddc1e0234 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c @@ -0,0 +1,365 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * lxsysvfsops.c: vfs operations for lx sysfs. + * + * sysfs has a close relationship with the lx getdents(2) syscall. This is + * necessary so that the getdents code can populate the 'd_type' entries + * during a sysfs readdir operation. The glibc code which accesses sysfs + * (specifically the 'cpu' subtree) expects dirents to have the d_type field + * populated. One problematic consumer is java, which becomes unstable if it + * gets the incorrect data from glibc. 
When sysfs loads, it populates the + * lx_sysfs_vfs_type and lx_sysfs_vtype variables defined in lx_getdents.c. + * The getdents code can then call into sysfs to determine the d_type for any + * given inode directory entry. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/lx_impl.h> + +#include "lx_sysfs.h" + +/* Module level parameters */ +static int lxsysfstype; +static dev_t lxsysdev; +static kmutex_t lxsys_mount_lock; + +extern int lx_sysfs_vfs_type; +extern int (*lx_sysfs_vtype)(ino_t); + +static int lxsys_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxsys_unmount(vfs_t *, int, cred_t *); +static int lxsys_root(vfs_t *, vnode_t **); +static int lxsys_statvfs(vfs_t *, statvfs64_t *); +static int lxsys_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_sysfs", + lxsys_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx brand sysfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + lx_sysfs_vfs_type = 0; + lx_sysfs_vtype = NULL; + + /* + * destroy lxsys_node cache + */ + lxsys_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxsysfstype); + vn_freevnodeops(lxsys_vnodeops); + + mutex_destroy(&lxsys_mount_lock); +done: + return (retval); +} + +static int +lxsys_init(int fstype, char *name) +{ + static const fs_operation_def_t lxsys_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxsys_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxsys_unmount }, + VFSNAME_ROOT, { .vfs_root = lxsys_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxsys_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxsys_vnodeops_template[]; + int error; + major_t dev; + + lx_sysfs_vtype = lxsys_ino_get_type; + lx_sysfs_vfs_type = lxsysfstype = fstype; + ASSERT(lxsysfstype != 0); + + mutex_init(&lxsys_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxsys_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxsys_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxsys_vnodeops_template, &lxsys_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxsys_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxsys_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxsysdev = makedevice(dev, 0); + + /* + * Initialise cache for lxsys_nodes + */ + lxsys_initnodecache(); + + return (0); +} + +static int +lxsys_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxsys_mnt_t *lxsys_mnt; + zone_t *zone = curproc->p_zone; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxsys" doesn't make sense + */ + vfs_setresource(vfsp, "lxsys", 0); + + lxsys_mnt = kmem_alloc(sizeof (*lxsys_mnt), KM_SLEEP); + + mutex_enter(&lxsys_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxsys_mount_lock); + kmem_free(lxsys_mnt, sizeof ((*lxsys_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + + mutex_init(&lxsys_mnt->lxsysm_lock, NULL, MUTEX_DEFAULT, NULL); + zone_hold(lxsys_mnt->lxsysm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxsys_mnt->lxsysm_node = lxsys_getnode(mvp, LXSYS_STATIC, + LXSYS_INST_ROOT, 0); + lxsys_mnt->lxsysm_node->lxsys_next = NULL; + + /* Correctly set the fs for the root node */ + lxsys_mnt->lxsysm_node->lxsys_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxsysdev, lxsysfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxsysfstype; + vfsp->vfs_data = (caddr_t)lxsys_mnt; + vfsp->vfs_dev = lxsysdev; + + 
mutex_exit(&lxsys_mount_lock); + + return (0); +} + +static int +lxsys_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxsys_mnt_t *lxsys_mnt = (lxsys_mnt_t *)vfsp->vfs_data; + lxsys_node_t *lnp; + vnode_t *vp; + int count; + + VERIFY(lxsys_mnt != NULL); + + mutex_enter(&lxsys_mount_lock); + + /* must be root to unmount */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxsys_mount_lock); + return (EPERM); + } + + /* forced unmount is not supported by this fs */ + if (flag & MS_FORCE) { + mutex_exit(&lxsys_mount_lock); + return (ENOTSUP); + } + + /* Ensure that no vnodes are in use on this mount point. */ + lnp = lxsys_mnt->lxsysm_node; + vp = LXSTOV(lnp); + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxsys_mount_lock); + return (EBUSY); + } + + /* + * If there are no references to the root vnode the list of persistent + * static vnodes should be empty + */ + VERIFY(lnp->lxsys_next == NULL); + + (void) dnlc_purge_vfsp(vfsp, 0); + + lxsys_mnt->lxsysm_node = NULL; + lxsys_freenode(lnp); + zone_rele(lxsys_mnt->lxsysm_zone); + vfsp->vfs_data = NULL; + kmem_free(lxsys_mnt, sizeof (*lxsys_mnt)); + + mutex_exit(&lxsys_mount_lock); + + return (0); +} + +static int +lxsys_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxsys_mnt_t *lxsm = (lxsys_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + + VERIFY(lxsm != NULL); + VERIFY(lxsm->lxsysm_node != NULL); + + vp = LXSTOV(lxsm->lxsysm_node); + VN_HOLD(vp); + *vpp = vp; + + return (0); +} + +static int +lxsys_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + dev32_t d32; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)3; + sp->f_ffree = (fsfilcnt64_t)0; /* none */ + sp->f_favail = (fsfilcnt64_t)0; /* none */ + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name 
will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxsysfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + + /* We know f_fstr is 32 chars */ + (void) strcpy(sp->f_fstr, "/sys"); + (void) strcpy(&sp->f_fstr[6], "/sys"); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c new file mode 100644 index 0000000000..10c99baa7b --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c @@ -0,0 +1,2165 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * lx_sysfs -- a Linux-compatible /sys for the LX brand + */ + +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> +#include <sys/param.h> +#include <sys/utsname.h> +#include <sys/lx_misc.h> +#include <sys/brand.h> +#include <sys/cred_impl.h> +#include <sys/tihdr.h> +#include <sys/sunddi.h> +#include <sys/vnode.h> +#include <sys/netstack.h> +#include <sys/ethernet.h> +#include <inet/ip_arp.h> + +#include "lx_sysfs.h" + +/* + * Pointer to the vnode ops vector for this fs. 
+ * This is instantiated in lxsys_init() in lx_sysvfsops.c + */ +vnodeops_t *lxsys_vnodeops; + +static int lxsys_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxsys_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxsys_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxsys_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxsys_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxsys_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxsys_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxsys_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxsys_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxsys_sync(void); +static void lxsys_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxsys_lookup_static(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_class_netdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_virtual_netdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_blockdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_zfsdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_syscpu(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_syscpuinfo(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_sysnode(lxsys_node_t *, char *); + +static int lxsys_read_static(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_virtual_net(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_zfs_block(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_syscpu(lxsys_node_t *, lxsys_uiobuf_t *); +static int lxsys_read_devices_sysnode(lxsys_node_t *, lxsys_uiobuf_t *); + +static int lxsys_readdir_devices_syscpu(lxsys_node_t *, uio_t *, int *); +static int 
lxsys_readdir_devices_syscpuinfo(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_sysnode(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_static(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_class_netdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_virtual_netdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_blockdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_zfsdir(lxsys_node_t *, uio_t *, int *); + +static int lxsys_readlink_class_net(lxsys_node_t *, char *, size_t); +static int lxsys_readlink_block(lxsys_node_t *, char *, size_t); + +/* + * The lx /sys vnode operations vector + */ +const fs_operation_def_t lxsys_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxsys_open }, + VOPNAME_CLOSE, { .vop_close = lxsys_close }, + VOPNAME_READ, { .vop_read = lxsys_read }, + VOPNAME_GETATTR, { .vop_getattr = lxsys_getattr }, + VOPNAME_ACCESS, { .vop_access = lxsys_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxsys_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxsys_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxsys_readlink }, + VOPNAME_FSYNC, { .error = lxsys_sync }, + VOPNAME_SEEK, { .error = lxsys_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxsys_inactive }, + VOPNAME_CMP, { .vop_cmp = lxsys_cmp }, + NULL, NULL +}; + +typedef enum lxsys_cpu_state { + LXSYS_CPU_ON, /* online */ + LXSYS_CPU_OFF, /* offline */ + LXSYS_CPU_ANY, /* don't care */ +} lxsys_cpu_state_t; + +static void lxsys_format_cpu(char *, int, lxsys_cpu_state_t); + +/* + * Sysfs Inode format: + * 0000AABBBBCC + * + * AA - TYPE + * BBBB - INSTANCE + * CC - ENDPOINT + * + * Where TYPE is one of: + * 1 - SYS_STATIC + * 2 - SYS_CLASS_NET + * 3 - SYS_DEV_NET + * 4 - SYS_BLOCK + * 5 - SYS_DEV_ZFS + * 6 - SYS_DEV_SYS_CPU + * 7 - SYS_DEV_SYS_CPUINFO + * 8 - SYS_DEV_SYS_NODE + * + * Static entries will have assigned INSTANCE identifiers: + * - 0x00: /sys + * - 0x01: /sys/class + * - 0x02: /sys/devices + * 
- 0x03: /sys/fs + * - 0x04: /sys/class/net + * - 0x05: /sys/devices/virtual + * - 0x06: /sys/devices/system + * - 0x07: /sys/fs/cgroup + * - 0x08: /sys/devices/virtual/net + * - 0x09: /sys/block + * - 0x0a: /sys/devices/zfs + * - 0x0b: /sys/devices/system/cpu + * - 0x0c: /sys/devices/system/node + * - 0x0d: /sys/bus + * + * Dynamic /sys/class/net/<interface> symlinks will use an INSTANCE derived + * from the corresponding ifindex. + * + * Dynamic /sys/devices/virtual/net/<interface>/<entries> directories will use + * an INSTANCE derived from the ifindex and statically assigned ENDPOINT IDs + * for the contained entries. + * + * Dynamic /sys/block/<dev> symlinks will use an INSTANCE derived from the + * device major and instance from records listed in kstat or zvols. + * + * Dynamic /sys/devices/zfs/<dev> directories will use an INSTANCE derived from + * the emulated minor number. + * + * Semi-static/Dynamic /sys/devices/system/cpu contains the fixed 'kernel_max', + * 'offline', 'online', 'possible', and 'present' files, and a dynamic set of + * cpuN subdirectories. All of these are dynamic nodes. + * + * Static /sys/devices/system/node/node0 currently only contains the + * static cpulist and cpumap files, but will likely need future dynamic entries + * for cpuN symlinks, and perhaps other static files. By only providing 'node0' + * we pretend that there is only a single NUMA node available to a zone (trying + * to be NUMA-aware inside a zone is generally not going to work anyway). + * If dynamic entries are added under node0, it must be converted to the + * semi-static/dynamic approach as used under /sys/devices/system/cpu. + * + * The dyn_ino_type table must be updated whenever a new static instance is + * defined.
+ */ + +#define LXSYS_INST_CLASSDIR 0x1 +#define LXSYS_INST_DEVICESDIR 0x2 +#define LXSYS_INST_FSDIR 0x3 +#define LXSYS_INST_CLASS_NETDIR 0x4 +#define LXSYS_INST_DEVICES_VIRTUALDIR 0x5 +#define LXSYS_INST_DEVICES_SYSTEMDIR 0x6 +#define LXSYS_INST_FS_CGROUPDIR 0x7 +#define LXSYS_INST_DEVICES_VIRTUAL_NETDIR 0x8 +#define LXSYS_INST_BLOCKDIR 0x9 +#define LXSYS_INST_DEVICES_ZFSDIR 0xa +#define LXSYS_INST_DEVICES_SYSCPU 0xb +#define LXSYS_INST_DEVICES_SYSNODE 0xc +#define LXSYS_INST_BUSDIR 0xd +#define LXSYS_INST_MAX LXSYS_INST_BUSDIR /* limit */ + +/* + * These are of dynamic type (LXSYS_DEV_SYS_CPU), but essentially fixed + * instances. Under /sys/devices/system/cpu we have: kernel_max, offline, + * online, possible and present. We also have a dynamic set of cpuN subdirs. + * The cpuN subdirs are actually of type LXSYS_DEV_SYS_CPUINFO, so we can use + * the following instance IDs for the fixed files. + */ +#define LXSYS_INST_DEV_SYSCPU_KMAX 0x1 +#define LXSYS_INST_DEV_SYSCPU_OFFLINE 0x2 +#define LXSYS_INST_DEV_SYSCPU_ONLINE 0x3 +#define LXSYS_INST_DEV_SYSCPU_POSSIBLE 0x4 +#define LXSYS_INST_DEV_SYSCPU_PRESENT 0x5 + +/* + * This array is used for directory inode correction in lxsys_readdir_common + * when a directory's static-type entry is actually a dynamic-type. + */ +static int dyn_ino_type [] = { + 0, /* invalid */ + 0, /* LXSYS_INST_CLASSDIR */ + 0, /* LXSYS_INST_DEVICESDIR */ + 0, /* LXSYS_INST_FSDIR */ + LXSYS_CLASS_NET, /* LXSYS_INST_CLASS_NETDIR */ + 0, /* LXSYS_INST_DEVICES_VIRTUALDIR */ + 0, /* LXSYS_INST_DEVICES_SYSTEMDIR */ + 0, /* LXSYS_INST_FS_CGROUPDIR */ + LXSYS_DEV_NET, /* LXSYS_INST_DEV_VIRTUAL_NETDIR */ + LXSYS_BLOCK, /* LXSYS_INST_BLOCKDIR */ + LXSYS_DEV_ZFS, /* LXSYS_INST_DEVICES_ZFSDIR */ + LXSYS_DEV_SYS_CPU, /* LXSYS_INST_DEVICES_SYSCPU */ + LXSYS_DEV_SYS_NODE, /* LXSYS_INST_DEV_SYSNODE */ + 0, /* LXSYS_INST_BUSDIR */ +}; +#define DYN_INO_LEN \ + (sizeof (dyn_ino_type) / sizeof ((dyn_ino_type)[0])) + +/* + * file contents of an lx /sys directory. 
+ */ +static lxsys_dirent_t dirlist_root[] = { + { LXSYS_INST_BLOCKDIR, "block" }, + { LXSYS_INST_BUSDIR, "bus" }, + { LXSYS_INST_CLASSDIR, "class" }, + { LXSYS_INST_DEVICESDIR, "devices" }, + { LXSYS_INST_FSDIR, "fs" } +}; +static lxsys_dirent_t dirlist_class[] = { + { LXSYS_INST_CLASS_NETDIR, "net" } +}; +static lxsys_dirent_t dirlist_fs[] = { + { LXSYS_INST_FS_CGROUPDIR, "cgroup" } +}; +static lxsys_dirent_t dirlist_devices[] = { + { LXSYS_INST_DEVICES_SYSTEMDIR, "system" }, + { LXSYS_INST_DEVICES_VIRTUALDIR, "virtual" }, + { LXSYS_INST_DEVICES_ZFSDIR, "zfs" } +}; +static lxsys_dirent_t dirlist_devices_virtual[] = { + { LXSYS_INST_DEVICES_VIRTUAL_NETDIR, "net" } +}; + +static lxsys_dirent_t dirlist_devices_system[] = { + { LXSYS_INST_DEVICES_SYSCPU, "cpu" }, + { LXSYS_INST_DEVICES_SYSNODE, "node" } +}; + +#define LXSYS_ENDP_NET_ADDRESS 1 +#define LXSYS_ENDP_NET_ADDRLEN 2 +#define LXSYS_ENDP_NET_FLAGS 3 +#define LXSYS_ENDP_NET_IFINDEX 4 +#define LXSYS_ENDP_NET_MTU 5 +#define LXSYS_ENDP_NET_TXQLEN 6 +#define LXSYS_ENDP_NET_TYPE 7 + +#define LXSYS_ENDP_BLOCK_DEVICE 1 + +#define LXSYS_ENDP_NODE_CPULIST 1 +#define LXSYS_ENDP_NODE_CPUMAP 2 + +static lxsys_dirent_t dirlist_devices_virtual_net[] = { + { LXSYS_ENDP_NET_ADDRESS, "address" }, + { LXSYS_ENDP_NET_ADDRLEN, "addr_len" }, + { LXSYS_ENDP_NET_FLAGS, "flags" }, + { LXSYS_ENDP_NET_IFINDEX, "ifindex" }, + { LXSYS_ENDP_NET_MTU, "mtu" }, + { LXSYS_ENDP_NET_TXQLEN, "tx_queue_len" }, + { LXSYS_ENDP_NET_TYPE, "type" } +}; + +static lxsys_dirent_t dirlist_devices_zfs_block[] = { + { LXSYS_ENDP_BLOCK_DEVICE, "device" } +}; + +static lxsys_dirent_t dirlist_devices_sysnode[] = { + { LXSYS_ENDP_NODE_CPULIST, "cpulist" }, + { LXSYS_ENDP_NODE_CPUMAP, "cpumap" } +}; + +#define SYSDIRLISTSZ(l) (sizeof (l) / sizeof ((l)[0])) + +#define SYSDLENT(i, l) { i, l, SYSDIRLISTSZ(l) } +static lxsys_dirlookup_t lxsys_dirlookup[] = { + SYSDLENT(LXSYS_INST_ROOT, dirlist_root), + SYSDLENT(LXSYS_INST_CLASSDIR, dirlist_class), + 
SYSDLENT(LXSYS_INST_FSDIR, dirlist_fs), + { LXSYS_INST_FS_CGROUPDIR, NULL, 0 }, + SYSDLENT(LXSYS_INST_DEVICESDIR, dirlist_devices), + SYSDLENT(LXSYS_INST_DEVICES_SYSTEMDIR, dirlist_devices_system), + SYSDLENT(LXSYS_INST_DEVICES_VIRTUALDIR, dirlist_devices_virtual), + SYSDLENT(LXSYS_INST_DEVICES_SYSNODE, dirlist_devices_sysnode), + { LXSYS_INST_BUSDIR, NULL, 0 }, +}; + + +/* + * Array of lookup functions, indexed by lx /sys file type. + */ +static vnode_t *(*lxsys_lookup_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_lookup_static, /* LXSYS_STATIC */ + lxsys_lookup_class_netdir, /* LXSYS_CLASS_NET */ + lxsys_lookup_devices_virtual_netdir, /* LXSYS_DEV_NET */ + lxsys_lookup_blockdir, /* LXSYS_BLOCK */ + lxsys_lookup_devices_zfsdir, /* LXSYS_DEV_ZFS */ + lxsys_lookup_devices_syscpu, /* LXSYS_DEV_SYS_CPU */ + lxsys_lookup_devices_syscpuinfo, /* LXSYS_DEV_SYS_CPUINFO */ + lxsys_lookup_devices_sysnode, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Array of readdir functions, indexed by /sys file type. + */ +static int (*lxsys_readdir_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_readdir_static, /* LXSYS_STATIC */ + lxsys_readdir_class_netdir, /* LXSYS_CLASS_NET */ + lxsys_readdir_devices_virtual_netdir, /* LXSYS_DEV_NET */ + lxsys_readdir_blockdir, /* LXSYS_BLOCK */ + lxsys_readdir_devices_zfsdir, /* LXSYS_DEV_ZFS */ + lxsys_readdir_devices_syscpu, /* LXSYS_DEV_SYS_CPU */ + lxsys_readdir_devices_syscpuinfo, /* LXSYS_DEV_SYS_CPUINFO */ + lxsys_readdir_devices_sysnode, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Array of read functions, indexed by /sys file type. 
+ */ +static int (*lxsys_read_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_read_static, /* LXSYS_STATIC */ + NULL, /* LXSYS_CLASS_NET */ + lxsys_read_devices_virtual_net, /* LXSYS_DEV_NET */ + NULL, /* LXSYS_BLOCK */ + lxsys_read_devices_zfs_block, /* LXSYS_DEV_ZFS */ + lxsys_read_devices_syscpu, /* LXSYS_DEV_SYS_CPU */ + NULL, /* LXSYS_DEV_SYS_CPUINFO */ + lxsys_read_devices_sysnode, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Array of readlink functions, indexed by /sys file type. + */ +static int (*lxsys_readlink_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + NULL, /* LXSYS_STATIC */ + lxsys_readlink_class_net, /* LXSYS_CLASS_NET */ + NULL, /* LXSYS_DEV_NET */ + lxsys_readlink_block, /* LXSYS_BLOCK */ + NULL, /* LXSYS_DEV_ZFS */ + NULL, /* LXSYS_DEV_SYS_CPU */ + NULL, /* LXSYS_DEV_SYS_CPUINFO */ + NULL, /* LXSYS_DEV_SYS_NODE */ +}; + +/* + * Given one of our inodes, return the vnode type. + * + * lxsys_getnode will always set the vnode type to VDIR. It expects the + * caller (normally the lookup functions) to fix the type. Those same rules are + * encoded here for our inode-to-type translation. + */ +int +lxsys_ino_get_type(ino_t ino) +{ + lxsys_nodetype_t type; + unsigned int instance; + unsigned int endpoint; + + type = (ino & 0xff000000) >> 24; + instance = (ino & 0xffff00) >> 8; + endpoint = (ino & 0xff); + + if (instance > LXSYS_INST_MAX) + return (VNON); + + /* Validate non-static node types */ + if (type != LXSYS_STATIC && + (type <= LXSYS_STATIC || type >= LXSYS_MAXTYPE)) { + return (VNON); + } + + if (type != LXSYS_STATIC) { + /* Non-static node types */ + switch (type) { + case LXSYS_CLASS_NET: + if (instance != 0) { + return (VLNK); + } + break; + case LXSYS_DEV_NET: + /* + * /sys/devices/virtual/net usually has the eth0 and + * lo directories. Each network device directory is an + * instances with a 0 endpoint. The files within + * that directory have a non-0 endpoint. 
+ */ + if (endpoint != 0) { + return (VREG); + } + break; + case LXSYS_BLOCK: + if (instance != 0) { + return (VLNK); + } + break; + case LXSYS_DEV_ZFS: + /* + * /sys/devices/zfs usually has the zfsds0 directory + * instance with a 0 endpoint. The device file within + * that directory has a non-0 endpoint. + */ + if (endpoint != 0) { + return (VREG); + } + break; + case LXSYS_DEV_SYS_CPU: + if (instance != 0) { + return (VREG); + } + break; + case LXSYS_DEV_SYS_CPUINFO: + /* + * There is an instance of /sys/devices/system/cpu/cpuN + * for each CPU. These have an instance per CPU and + * currently the endpoint is 0 since there is nothing + * underneath the cpuN subdirectories. Future + * regular file entries are likely to be added there. + */ + if (endpoint != 0) { + return (VREG); + } + break; + case LXSYS_DEV_SYS_NODE: + /* + * /sys/devices/system/node has the node0 directory + * instance with a 0 endpoint. The cpulist file within + * that directory has a non-0 endpoint. + */ + if (endpoint != 0) { + return (VREG); + } + break; + default: + break; + } + } + return (VDIR); +} + +/* + * lxsys_open(): Vnode operation for VOP_OPEN() + */ +/* ARGSUSED */ +static int +lxsys_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + /* + * We only allow reading in this file system + */ + if (flag & FWRITE) + return (EROFS); + + return (0); +} + + +/* + * lxsys_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxsys_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + + +/* + * lxsys_read(): Vnode operation for VOP_READ() + * All we currently have in this fs are directories. 
+ */ +/* ARGSUSED */ +static int +lxsys_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxsys_node_t *lnp = VTOLXS(vp); + lxsys_nodetype_t type = lnp->lxsys_type; + int (*rlfunc)(); + int error; + lxsys_uiobuf_t *luio; + + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + if (vp->v_type == VDIR) { + return (EISDIR); + } + + rlfunc = lxsys_read_function[type]; + if (rlfunc != NULL) { + luio = lxsys_uiobuf_new(uiop); + if ((error = rlfunc(lnp, luio)) == 0) { + error = lxsys_uiobuf_flush(luio); + } + lxsys_uiobuf_free(luio); + } else { + error = EIO; + } + + return (error); +} + +/* + * lxsys_getattr(): Vnode operation for VOP_GETATTR() + */ +/* ARGSUSED */ +static int +lxsys_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxsys_node_t *lxsnp = VTOLXS(vp); + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxsnp->lxsys_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxsnp->lxsys_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxsnp->lxsys_uid; + vap->va_gid = lxsnp->lxsys_gid; + vap->va_nodeid = lxsnp->lxsys_ino; + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxsys_access(): Vnode operation for VOP_ACCESS() + */ +/* ARGSUSED */ +static int +lxsys_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxsys_node_t *lxsnp = VTOLXS(vp); + int shift = 0; + + /* + * Although our lx sysfs is basically a read only file system, Linux + * expects it to be writable so we can't just error if (mode & VWRITE). + */ + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. 
If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxsnp->lxsys_uid) { + shift += 3; + if (!groupmember((uid_t)lxsnp->lxsys_gid, cr)) + shift += 3; + } + + mode &= ~(lxsnp->lxsys_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* + * lxsys_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxsys_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxsys_node_t *lxsnp = VTOLXS(dp); + lxsys_nodetype_t type = lxsnp->lxsys_type; + int error; + + VERIFY(dp->v_type == VDIR); + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxsys_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxsnp->lxsys_parentvp); + *vpp = lxsnp->lxsys_parentvp; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxsys_lookup_function[type](lxsnp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +static lxsys_node_t * +lxsys_lookup_disk(lxsys_node_t *ldp, char *comp, lxsys_nodetype_t type) +{ + lxsys_node_t *lnp = NULL; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata == NULL) + return (NULL); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + int inst = getminor(vd->lxvd_emul_dev) & 0xffff; + + if (strcmp(vd->lxvd_name, comp) == 0 && inst != 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, type, inst, 0); + break; + } + + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + return (lnp); +} + +static vnode_t * +lxsys_lookup_static(lxsys_node_t *ldp, char *comp) +{ + lxsys_dirent_t *dirent = NULL; + int i, len = 0; + + for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) { + if (ldp->lxsys_instance == lxsys_dirlookup[i].dl_instance) { + dirent = lxsys_dirlookup[i].dl_list; + len = lxsys_dirlookup[i].dl_length; + break; + } + } + if (dirent == NULL) { + return (NULL); + } + + for (i = 0; i < len; i++) { + if (strncmp(comp, dirent[i].d_name, MAXPATHLEN) == 0) { + lxsys_nodetype_t node_type = ldp->lxsys_type; + unsigned int node_instance = 0; + lxsys_node_t *lnp; + + switch (dirent[i].d_idnum) { + case LXSYS_INST_BLOCKDIR: + node_type = LXSYS_BLOCK; + break; + case LXSYS_INST_CLASS_NETDIR: + node_type = LXSYS_CLASS_NET; + break; + case LXSYS_INST_DEVICES_VIRTUAL_NETDIR: + node_type = LXSYS_DEV_NET; + break; + case LXSYS_INST_DEVICES_ZFSDIR: + node_type = LXSYS_DEV_ZFS; + break; + case LXSYS_INST_DEVICES_SYSCPU: + node_type = LXSYS_DEV_SYS_CPU; + break; + case LXSYS_INST_DEVICES_SYSNODE: + node_type = LXSYS_DEV_SYS_NODE; + break; + default: + /* Another static node */ + node_instance = dirent[i].d_idnum; + } + if (node_type == LXSYS_STATIC) { + lnp = lxsys_getnode_static(ldp->lxsys_vnode, + node_instance); + } else { + lnp = lxsys_getnode(ldp->lxsys_vnode, + node_type, node_instance, 0); + } + return (lnp->lxsys_vnode); + } + } + return 
(NULL); +} + +static vnode_t * +lxsys_lookup_class_netdir(lxsys_node_t *ldp, char *comp) +{ + vnode_t *result = NULL; + lxsys_node_t *lnp; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + char ifname[LIFNAMSIZ]; + + if (ldp->lxsys_type != LXSYS_CLASS_NET || + ldp->lxsys_instance != 0) { + /* Lookups only allowed at directory level */ + return (NULL); + } + + (void) strncpy(ifname, comp, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_TONATIVE); + + if ((ns = lxsys_netstack(ldp)) == NULL) { + return (NULL); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name; + phyi = avl_find(phytree, ifname, NULL); + if (phyi != NULL) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + phyi->phyint_ifindex, 0); + result = lnp->lxsys_vnode; + result->v_type = VLNK; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + return (result); +} + +static vnode_t * +lxsys_lookup_devices_virtual_netdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if (ldp->lxsys_instance == 0) { + /* top-level interface listing */ + vnode_t *result = NULL; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + char ifname[LIFNAMSIZ]; + + (void) strncpy(ifname, comp, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_TONATIVE); + + if ((ns = lxsys_netstack(ldp)) == NULL) { + return (NULL); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name; + phyi = avl_find(phytree, ifname, NULL); + if (phyi != NULL) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + phyi->phyint_ifindex, 0); + result = lnp->lxsys_vnode; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + return (result); + } else if (ldp->lxsys_endpoint == 0) { + /* interface-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + size = 
SYSDIRLISTSZ(dirlist_devices_virtual_net); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_virtual_net[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_blockdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if (ldp->lxsys_instance == 0) { + /* top-level dev listing */ + lnp = lxsys_lookup_disk(ldp, comp, LXSYS_BLOCK); + + if (lnp != NULL) { + lnp->lxsys_vnode->v_type = VLNK; + return (lnp->lxsys_vnode); + } + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_zfsdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if (ldp->lxsys_instance == 0) { + /* top-level dev listing */ + lnp = lxsys_lookup_disk(ldp, comp, LXSYS_DEV_ZFS); + + if (lnp != NULL) { + return (lnp->lxsys_vnode); + } + } else if (ldp->lxsys_endpoint == 0) { + /* disk-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + /* + * All of these entries currently look like regular files + * but on a real Linux system some will be subdirs. This should + * be fixed when we populate the directory for real. 
+ */ + size = SYSDIRLISTSZ(dirlist_devices_zfs_block); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_zfs_block[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_syscpu(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp = NULL; + + if (ldp->lxsys_instance == 0) { + /* top-level cpu listing */ + + /* If fixed entry */ + if (strcmp(comp, "kernel_max") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_KMAX, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else if (strcmp(comp, "offline") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_OFFLINE, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else if (strcmp(comp, "online") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_ONLINE, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else if (strcmp(comp, "possible") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_POSSIBLE, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else if (strcmp(comp, "present") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, LXSYS_DEV_SYS_CPU, + LXSYS_INST_DEV_SYSCPU_PRESENT, 0); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + } else { + /* Else dynamic cpuN entry */ + cpuset_t *avail; /* all installed CPUs */ + uint_t i, avlo, avhi; + + avail = cpuset_alloc(KM_SLEEP); + cpuset_all(avail); + + /* Take a snapshot of the available set */ + mutex_enter(&cpu_lock); + cpuset_and(avail, &cpu_available); + mutex_exit(&cpu_lock); + + cpuset_bounds(avail, &avlo, &avhi); + + for (i = avlo; i <= avhi; i++) { + char 
cpunm[16]; + + if (!cpu_in_set(avail, i)) + continue; + + (void) snprintf(cpunm, sizeof (cpunm), "cpu%u", + i); + + if (strcmp(comp, cpunm) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + LXSYS_DEV_SYS_CPUINFO, i + 1, 0); + break; + } + } + cpuset_free(avail); + } + + if (lnp != NULL) { + return (lnp->lxsys_vnode); + } + } else if (ldp->lxsys_endpoint == 0) { + /* cpu-level sub-item listing, currently empty */ + /* EMPTY */ + } + + return (NULL); +} + +/* ARGSUSED */ +static vnode_t * +lxsys_lookup_devices_syscpuinfo(lxsys_node_t *ldp, char *comp) +{ + return (NULL); +} + +static vnode_t * +lxsys_lookup_devices_sysnode(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp = NULL; + + if (ldp->lxsys_instance == 0) { + /* + * The system is presently represented as a single node, + * regardless of any NUMA topology which exists. + * The instances are offset by 1 to account for the top level + * directory occupying instance 0. + */ + if (strcmp(comp, "node0") == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + 1, 0); + return (lnp->lxsys_vnode); + } + } else { + /* interface-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + size = SYSDIRLISTSZ(dirlist_devices_sysnode); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_sysnode[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + +static int +lxsys_read_devices_virtual_net(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + netstack_t *ns; + ill_t *ill; + uint_t ifindex = lnp->lxsys_instance; + uint8_t *addr; + uint64_t flags; + int error = 0; + + if (ifindex == 0 || lnp->lxsys_endpoint == 0) { + return (EISDIR); + } + + if ((ns = lxsys_netstack(lnp)) == NULL) { + return (EIO); + } + + ill = lxsys_find_ill(ns->netstack_ip, ifindex); + if (ill == NULL) { + 
netstack_rele(ns); + return (EIO); + } + + switch (lnp->lxsys_endpoint) { + case LXSYS_ENDP_NET_ADDRESS: + if (ill->ill_phys_addr_length != ETHERADDRL) { + lxsys_uiobuf_printf(luio, "00:00:00:00:00:00\n"); + break; + } + addr = ill->ill_phys_addr; + lxsys_uiobuf_printf(luio, + "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx\n", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + break; + case LXSYS_ENDP_NET_ADDRLEN: + lxsys_uiobuf_printf(luio, "%u\n", + IS_LOOPBACK(ill) ? ETHERADDRL : ill->ill_phys_addr_length); + break; + case LXSYS_ENDP_NET_FLAGS: + flags = (ill->ill_flags | ill->ill_ipif->ipif_flags | + ill->ill_phyint->phyint_flags) & 0xffff; + lx_ifflags_convert(&flags, LX_IF_FROMNATIVE); + lxsys_uiobuf_printf(luio, "0x%x\n", flags); + break; + case LXSYS_ENDP_NET_IFINDEX: + lxsys_uiobuf_printf(luio, "%u\n", ifindex); + break; + case LXSYS_ENDP_NET_MTU: + lxsys_uiobuf_printf(luio, "%u\n", ill->ill_mtu); + break; + case LXSYS_ENDP_NET_TXQLEN: + /* perpetuate the txqlen lie */ + if (IS_LOOPBACK(ill)) { + lxsys_uiobuf_printf(luio, "0\n"); + } else { + lxsys_uiobuf_printf(luio, "1\n"); + } + break; + case LXSYS_ENDP_NET_TYPE: + lxsys_uiobuf_printf(luio, "%u\n", + IS_LOOPBACK(ill) ? LX_ARPHRD_LOOPBACK : + arp_hw_type(ill->ill_mactype)); + break; + default: + error = EIO; + } + + ill_refrele(ill); + netstack_rele(ns); + return (error); +} + +/* ARGSUSED1 */ +static int +lxsys_read_devices_zfs_block(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + uint_t dskindex = lnp->lxsys_instance; + + if (dskindex == 0 || lnp->lxsys_endpoint == 0) { + return (EISDIR); + } + + return (EIO); +} + +/* + * In the Linux src tree, see ABI/stable/sysfs-devices-node. + * + * For the 'cpumap' file, each CPU is treated as a bit, then those are + * accumulated and printed as a hex digit, with CPU0 as the rightmost bit. + * Each set of 8 digits (i.e. 32 CPUs) is then delimited with a comma. + * Since we are emulating a single NUMA group, all of our CPUs will be listed + * in this file. 
For example, a 48 CPU system would look like: + * 00000000,00000000,00000000,00000000,00000000,00000000,0000ffff,ffffffff + * It comes out this way because 'kernel_max' is NCPU, which is currently + * defined to be 256. + */ +static int +lxsys_read_devices_sysnode(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + if (lnp->lxsys_instance == 1) { + char outbuf[256]; + + if (lnp->lxsys_endpoint == LXSYS_ENDP_NODE_CPULIST) { + /* Show the range of CPUs */ + lxsys_format_cpu(outbuf, sizeof (outbuf), + LXSYS_CPU_ANY); + } else if (lnp->lxsys_endpoint == LXSYS_ENDP_NODE_CPUMAP) { + int i; + uint_t j, ndigits; + cpuset_t *avail; /* all installed CPUs */ + + avail = cpuset_alloc(KM_SLEEP); + cpuset_all(avail); + + /* Take a snapshot of the available set */ + mutex_enter(&cpu_lock); + cpuset_and(avail, &cpu_available); + mutex_exit(&cpu_lock); + + outbuf[0] = '\0'; + ndigits = 0; + for (i = NCPU - 1; i >= 0; i -= 4) { + char buf[8]; + int cnt = 3; + uint_t digit = 0; + + for (j = i; cnt >= 0; j--, cnt--) { + if (cpu_in_set(avail, j)) + digit |= 1 << cnt; + } + (void) snprintf(buf, sizeof (buf), "%x", digit); + if (ndigits == 8) { + (void) strlcat(outbuf, ",", + sizeof (outbuf)); + ndigits = 0; + } + (void) strlcat(outbuf, buf, sizeof (outbuf)); + ndigits++; + } + + cpuset_free(avail); + } else { + return (EISDIR); + } + + lxsys_uiobuf_printf(luio, "%s\n", outbuf); + return (0); + } + return (EISDIR); +} + +static void +lxsys_format_range(char *buf, int blen, boolean_t *first, uint_t start, + uint_t cnt) +{ + char tmp[256]; + char *delim; + + if (cnt == 0) + return; + + if (*first) { + *first = B_FALSE; + delim = ""; + } else { + delim = ","; + } + if (cnt > 1) { + (void) snprintf(tmp, sizeof (tmp), "%s%u-%u", delim, start, + start + cnt - 1); + } else { + (void) snprintf(tmp, sizeof (tmp), "%s%u", delim, start); + } + (void) strlcat(buf, tmp, blen); +} + +/* + * Format a string of which CPUs are online, offline, or don't care (depending + * on chk_state), and which would be 
formatted like this: + * 0-31 + * or + * 0-12,14,20-31 + */ +static void +lxsys_format_cpu(char *buf, int blen, lxsys_cpu_state_t chk_state) +{ + uint_t start, cnt, avlo, avhi; + boolean_t first = B_TRUE; + cpuset_t *active; /* CPUs online */ + cpuset_t *avail; /* all installed CPUs */ + + active = cpuset_alloc(KM_SLEEP); + avail = cpuset_alloc(KM_SLEEP); + cpuset_all(active); + cpuset_all(avail); + + /* Take a snapshot of the available and active sets */ + mutex_enter(&cpu_lock); + cpuset_and(avail, &cpu_available); + cpuset_and(active, &cpu_active_set); + mutex_exit(&cpu_lock); + + cpuset_bounds(avail, &avlo, &avhi); + + buf[0] = '\0'; + if (chk_state == LXSYS_CPU_ANY) { + start = avlo; + cnt = avhi + 1; + } else { + uint_t i; + boolean_t incl_cpu = B_TRUE; + + start = 0; + cnt = 0; + for (i = avlo; i <= avhi; i++) { + if (chk_state == LXSYS_CPU_ON) { + if (!cpu_in_set(active, i)) + incl_cpu = B_FALSE; + } else { + if (cpu_in_set(active, i)) + incl_cpu = B_FALSE; + } + + if (incl_cpu && cpu_in_set(avail, i)) { + cnt++; + } else { + /* + * Note: this may print nothing if our 'cnt' + * is 0, but we advance 'start' properly so we + * handle the next range of elements we're + * looking for. + */ + lxsys_format_range(buf, blen, &first, start, + cnt); + start += cnt + 1; + cnt = 0; + incl_cpu = B_TRUE; + } + } + } + + cpuset_free(avail); + cpuset_free(active); + + lxsys_format_range(buf, blen, &first, start, cnt); +} + +static int +lxsys_read_devices_syscpu(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + uint_t inst = lnp->lxsys_instance; + char outbuf[256]; + + /* + * For 'kernel_max', 'offline', 'online', 'possible', and 'present', + * see the Documentaion/cputopology.txt file in the Linux src tree. 
+ */ + if (inst == LXSYS_INST_DEV_SYSCPU_KMAX) { + lxsys_uiobuf_printf(luio, "%d\n", NCPU - 1); + return (0); + } + + if (inst == LXSYS_INST_DEV_SYSCPU_OFFLINE) { + lxsys_format_cpu(outbuf, sizeof (outbuf), LXSYS_CPU_OFF); + lxsys_uiobuf_printf(luio, "%s\n", outbuf); + return (0); + } + + if (inst == LXSYS_INST_DEV_SYSCPU_ONLINE) { + lxsys_format_cpu(outbuf, sizeof (outbuf), LXSYS_CPU_ON); + lxsys_uiobuf_printf(luio, "%s\n", outbuf); + return (0); + } + + if (inst == LXSYS_INST_DEV_SYSCPU_POSSIBLE || + inst == LXSYS_INST_DEV_SYSCPU_PRESENT) { + lxsys_format_cpu(outbuf, sizeof (outbuf), LXSYS_CPU_ANY); + lxsys_uiobuf_printf(luio, "%s\n", outbuf); + return (0); + } + + /* All other nodes are directories */ + return (EISDIR); +} + +/* ARGSUSED */ +static int +lxsys_read_static(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + /* All static nodes are directories */ + return (EISDIR); +} + +/* + * lxsys_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxsys_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxsys_node_t *lxsnp = VTOLXS(dp); + lxsys_nodetype_t type = lxsnp->lxsys_type; + ssize_t uresid; + off_t uoffset; + int error, leof; + + ASSERT(dp->v_type == VDIR); + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxsys_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! 
*/ + if (uoffset % LXSYS_SDSIZE) + return (ENOENT); + + /* Free lower functions from having to check eofp == NULL */ + if (eofp == NULL) { + eofp = &leof; + } + + return (lxsys_readdir_function[lxsnp->lxsys_type](lxsnp, uiop, eofp)); +} + +static int +lxsys_dirent_out(dirent64_t *d, ushort_t n, struct uio *uio) +{ + int error; + off_t offset = uio->uio_offset; + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset by the + * same amount. But we want uiop->uio_offset to change in increments + * of LXSYS_SDSIZE, which is different from the number of bytes being + * returned to the user. To accomplish this, we set uiop->uio_offset + * separately on success, overriding what uiomove() does. + */ + d->d_off = (off64_t)(offset + LXSYS_SDSIZE); + d->d_reclen = n; + if ((error = uiomove(d, n, UIO_READ, uio)) != 0) { + return (error); + } + uio->uio_offset = offset + LXSYS_SDSIZE; + return (0); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxsys_readdir_common(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp, + lxsys_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Satisfy user request */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXSYS_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxsnp->lxsys_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXSYS_SDSIZE) { + + dirent->d_ino = lxsys_parentinode(lxsnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if 
(dirindex >= 0 && dirindex < dirtablen) { + + int slen = strlen(dirtab[dirindex].d_name); + int idnum, ino_type = 0; + + idnum = dirtab[dirindex].d_idnum; + if (idnum > 0 && idnum < DYN_INO_LEN) + ino_type = dyn_ino_type[idnum]; + + if (ino_type != 0) { + /* + * Correct the inode for static directories + * which contain non-static lxsys_nodetype_t's. + */ + dirent->d_ino = lxsys_inode(ino_type, 0, 0); + DTRACE_PROBE3(lxsys__fix__inode, + char *, dirtab[dirindex].d_name, + int, ino_type, int, dirent->d_ino); + } else { + dirent->d_ino = lxsys_inode(LXSYS_STATIC, + idnum, 0); + } + + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + *eofp = 1; + return (0); + } + + /* + * If the size of the data to transfer is greater than the + * user-provided buffer, we cannot continue. + */ + if (reclen > uresid) { + /* Error if no entries have been returned yet. */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (error); + } + } + + /* Have run out of space, but could have just done last table entry */ + *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 
1 : 0; + return (0); +} + +static int +lxsys_readdir_subdir(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp, + lxsys_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + VERIFY(dirtab != NULL || dirtablen == 0); + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Satisfy user request */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXSYS_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxsnp->lxsys_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXSYS_SDSIZE) { + + dirent->d_ino = lxsys_parentinode(lxsnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxsys_inode(lxsnp->lxsys_type, + lxsnp->lxsys_instance, dirtab[dirindex].d_idnum); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + *eofp = 1; + return (0); + } + + /* + * If the size of the data to transfer is greater than the + * user-provided buffer, we cannot continue. + */ + if (reclen > uresid) { + /* Error if no entries have been returned yet. */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (error); + } + } + + /* Have run out of space, but could have just done last table entry */ + *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 
1 : 0; + return (0); +} + +static int +lxsys_readdir_ifaces(lxsys_node_t *ldp, struct uio *uiop, int *eofp, + lxsys_nodetype_t type) +{ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + int error, i; + + + /* Emit "." and ".." entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + if ((ns = lxsys_netstack(ldp)) == NULL) { + *eofp = 1; + return (0); + } + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index; + phyi = avl_first(phytree); + if (phyi == NULL) { + *eofp = 1; + } + bzero(bp, sizeof (bp)); + + /* + * Skip records we have already passed with the offset. + * This accounts for the two "." and ".." records already seen. + */ + for (i = (uiop->uio_offset/LXSYS_SDSIZE) - 2; i > 0; i--) { + if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) { + *eofp = 1; + break; + } + } + + while ((uresid = uiop->uio_resid) > 0 && phyi != NULL) { + uint_t ifindex; + int reclen; + + ifindex = phyi->phyint_ifindex; + (void) strncpy(dirent->d_name, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(dirent->d_name, LX_IF_FROMNATIVE); + dirent->d_ino = lxsys_inode(type, ifindex, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + + if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) { + *eofp = 1; + break; + } + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + return (error); +} + +static int +lxsys_readdir_disks(lxsys_node_t *ldp, struct uio *uiop, int *eofp, + lxsys_nodetype_t type) +{ + longlong_t 
bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + int skip, error; + int reclen; + uint_t instance; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + + /* Emit "." and ".." entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2; + + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata == NULL) + return (EINVAL); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + if (skip > 0) { + skip--; + goto next; + } + + if (strnlen(vd->lxvd_name, sizeof (vd->lxvd_name)) > LXSNSIZ) + goto next; + + (void) strncpy(dirent->d_name, vd->lxvd_name, LXSNSIZ); + + instance = getminor(vd->lxvd_emul_dev) & 0xffff; + if (instance == 0) + goto next; + + dirent->d_ino = lxsys_inode(type, instance, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + +next: + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + /* Indicate EOF if we reached the end of the virtual disks. 
*/ + if (vd == NULL) { + *eofp = 1; + } + + return (error); +} + + +static int +lxsys_readdir_static(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + lxsys_dirent_t *dirent = NULL; + int i, len = 0; + boolean_t found = B_FALSE; + + for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) { + if (lnp->lxsys_instance == lxsys_dirlookup[i].dl_instance) { + dirent = lxsys_dirlookup[i].dl_list; + len = lxsys_dirlookup[i].dl_length; + found = B_TRUE; + break; + } + } + + if (!found) { + return (ENOTDIR); + } + + return (lxsys_readdir_common(lnp, uiop, eofp, dirent, len)); +} + +static int +lxsys_readdir_class_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + if (lnp->lxsys_type != LXSYS_CLASS_NET || + lnp->lxsys_instance != 0) { + /* + * Since /sys/class/net contains only symlinks, readdir + * operations should not be performed anywhere except the top + * level (instance == 0). + */ + return (ENOTDIR); + } + + return (lxsys_readdir_ifaces(lnp, uiop, eofp, LXSYS_CLASS_NET)); +} + +static int +lxsys_readdir_devices_virtual_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level interface listing */ + error = lxsys_readdir_ifaces(lnp, uiop, eofp, + LXSYS_DEV_NET); + } else if (lnp->lxsys_endpoint == 0) { + /* interface-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_virtual_net, + SYSDIRLISTSZ(dirlist_devices_virtual_net)); + } else { + /* there shouldn't be subdirs below this */ + error = ENOTDIR; + } + + return (error); +} + +static int +lxsys_readdir_blockdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + if (lnp->lxsys_type != LXSYS_BLOCK || + lnp->lxsys_instance != 0) { + /* + * Since /sys/block contains only symlinks, readdir operations + * should not be performed anywhere except the top level + * (instance == 0). 
+ */ + return (ENOTDIR); + } + + return (lxsys_readdir_disks(lnp, uiop, eofp, LXSYS_BLOCK)); +} + +static int +lxsys_readdir_devices_zfsdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level dev listing */ + error = lxsys_readdir_disks(lnp, uiop, eofp, + LXSYS_DEV_ZFS); + } else if (lnp->lxsys_endpoint == 0) { + /* disk-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_zfs_block, + SYSDIRLISTSZ(dirlist_devices_zfs_block)); + } else { + /* + * Currently there shouldn't be subdirs below this but + * on a real Linux system some will be subdirs. This should + * be fixed when we populate the directory for real. + */ + error = ENOTDIR; + } + + return (error); +} + +/* Handle fixed entries within the cpu directory. */ +static int +lxsys_do_sub_cpu(struct uio *uiop, ssize_t oresid, dirent64_t *dirent, + char *nm, int inst, int *errp) +{ + int reclen; + ssize_t uresid; + + (void) strncpy(dirent->d_name, nm, LXSNSIZ); + + dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_CPU, inst, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + *errp = EINVAL; + } + return (-1); + } + if ((*errp = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (-1); + } + + return (0); +} + +static int +lxsys_readdir_cpu(lxsys_node_t *ldp, struct uio *uiop, int *eofp) +{ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + int skip, error; + int reclen; + cpuset_t *avail; + uint_t i, avlo, avhi; + + /* Emit "." and ".." 
entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2; + + /* Fixed entries */ + if (skip > 0) { + skip--; + } else { + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "kernel_max", + LXSYS_INST_DEV_SYSCPU_KMAX, &error) != 0) + goto done; + + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "offline", + LXSYS_INST_DEV_SYSCPU_OFFLINE, &error) != 0) + goto done; + + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "online", + LXSYS_INST_DEV_SYSCPU_ONLINE, &error) != 0) + goto done; + + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "possible", + LXSYS_INST_DEV_SYSCPU_POSSIBLE, &error) != 0) + goto done; + + if (lxsys_do_sub_cpu(uiop, oresid, dirent, "present", + LXSYS_INST_DEV_SYSCPU_PRESENT, &error) != 0) + goto done; + } + + avail = cpuset_alloc(KM_SLEEP); + cpuset_all(avail); + + /* Take a snapshot of the available set */ + mutex_enter(&cpu_lock); + cpuset_and(avail, &cpu_available); + mutex_exit(&cpu_lock); + + cpuset_bounds(avail, &avlo, &avhi); + + /* Output dynamic CPU info */ + for (i = avlo; i <= avhi; i++) { + char cpunm[16]; + + if (skip > 0) { + skip--; + continue; + } + + if (!cpu_in_set(avail, i)) + continue; + + (void) snprintf(cpunm, sizeof (cpunm), "cpu%u", i); + (void) strncpy(dirent->d_name, cpunm, LXSNSIZ); + + dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_CPUINFO, i + 1, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + } + cpuset_free(avail); + + /* Indicate EOF if we reached the end of the CPU list. 
*/ + if (i == avhi) { + *eofp = 1; + } + +done: + return (error); +} + +static int +lxsys_readdir_devices_syscpu(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level cpu listing */ + error = lxsys_readdir_cpu(lnp, uiop, eofp); + } else if (lnp->lxsys_endpoint == 0) { + /* cpu-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, NULL, 0); + } else { + /* + * Currently there shouldn't be subdirs below this but + * on a real Linux system some will be subdirs. This should + * be fixed when we populate the directory for real. + */ + error = ENOTDIR; + } + + return (error); +} + +static int +lxsys_readdir_devices_syscpuinfo(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_type != LXSYS_DEV_SYS_CPUINFO) { + /* + * Since /sys/devices/system/cpu/cpuN is empty, readdir + * operations should not be performed anywhere except the top + * level. + */ + return (ENOTDIR); + } + + /* + * Emit "." and ".." entries + * All cpuN directories are currently empty. + */ + error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + /* Indicate EOF */ + *eofp = 1; + + return (error); +} + +static int +lxsys_readdir_devices_sysnode(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level node listing */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + int reclen, skip; + + /* Emit "." and ".." 
entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(lnp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + skip = (uiop->uio_offset/LXSYS_SDSIZE) - 2; + + /* Fixed entries */ + if (skip > 0) { + skip--; + } else { + (void) strncpy(dirent->d_name, "node0", LXSNSIZ); + + dirent->d_ino = lxsys_inode(LXSYS_DEV_SYS_NODE, + 1, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + uresid = uiop->uio_resid; + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + return (EINVAL); + } + return (0); + } + error = lxsys_dirent_out(dirent, reclen, uiop); + } + /* Indicate EOF */ + if (error == 0) { + *eofp = 1; + } + } else if (lnp->lxsys_endpoint == 0) { + /* node-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_sysnode, + SYSDIRLISTSZ(dirlist_devices_sysnode)); + } else { + /* there shouldn't be subdirs below this */ + error = ENOTDIR; + } + + return (error); +} + +/* + * lxsys_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxsys_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char buf[MAXPATHLEN + 1]; + lxsys_node_t *lnp = VTOLXS(vp); + lxsys_nodetype_t type = lnp->lxsys_type; + int (*rlfunc)(); + int error; + + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + if (vp->v_type != VLNK) { + return (EINVAL); + } + + rlfunc = lxsys_readlink_function[lnp->lxsys_type]; + if (rlfunc != NULL) { + if ((error = rlfunc(lnp, buf, sizeof (buf))) == 0) { + error = uiomove(buf, strlen(buf), UIO_READ, uiop); + } + } else { + error = EINVAL; + } + + return (error); +} + + +static int +lxsys_readlink_class_net(lxsys_node_t *lnp, char *buf, size_t len) +{ + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + uint_t ifindex; + char ifname[LIFNAMSIZ]; + int error = EINVAL; + + if ((ifindex = lnp->lxsys_instance) == 0) { + return (error); + } + + if ((ns = 
lxsys_netstack(lnp)) == NULL) { + return (error); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index; + phyi = avl_find(phytree, &ifindex, NULL); + if (phyi != NULL) { + (void) strncpy(ifname, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_FROMNATIVE); + (void) snprintf(buf, len, "/sys/devices/virtual/net/%s", + ifname); + error = 0; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + return (error); +} + +static int +lxsys_readlink_block(lxsys_node_t *lnp, char *buf, size_t len) +{ + int inst, error = EINVAL; + lx_zone_data_t *lxzdata; + lx_virt_disk_t *vd; + + if ((inst = lnp->lxsys_instance) == 0) { + return (error); + } + + lxzdata = ztolxzd(curproc->p_zone); + if (lxzdata == NULL) + return (error); + ASSERT(lxzdata->lxzd_vdisks != NULL); + + vd = list_head(lxzdata->lxzd_vdisks); + while (vd != NULL) { + int vinst = getminor(vd->lxvd_emul_dev) & 0xffff; + + if (vinst == inst) { + (void) snprintf(buf, len, + "../devices/zfs/%s", vd->lxvd_name); + error = 0; + break; + } + vd = list_next(lxzdata->lxzd_vdisks, vd); + } + + return (error); +} + +/* + * lxsys_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
+ */ +/* ARGSUSED */ +static void +lxsys_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxsys_freenode(VTOLXS(vp)); +} + +/* + * lxsys_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxsys_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxsys_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxsys_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + if (vn_matchops(vp1, lxsys_vnodeops) || + vn_matchops(vp2, lxsys_vnodeops)) + return (vp1 == vp2); + return (VOP_CMP(vp1, vp2, ct)); +} diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c index 02d293fcb2..ebdabce2b5 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.c +++ b/usr/src/uts/common/brand/sn1/sn1_brand.c @@ -43,43 +43,69 @@ char *sn1_emulation_table = NULL; -void sn1_init_brand_data(zone_t *); +void sn1_init_brand_data(zone_t *, kmutex_t *); void sn1_free_brand_data(zone_t *); void sn1_setbrand(proc_t *); int sn1_getattr(zone_t *, int, void *, size_t *); int sn1_setattr(zone_t *, int, void *, size_t); int sn1_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t); void sn1_copy_procdata(proc_t *, proc_t *); -void sn1_proc_exit(struct proc *, klwp_t *); +void sn1_proc_exit(struct proc *); void sn1_exec(); -int sn1_initlwp(klwp_t *); +void sn1_initlwp(klwp_t *, void *); void sn1_forklwp(klwp_t *, klwp_t *); void sn1_freelwp(klwp_t *); void sn1_lwpexit(klwp_t *); int sn1_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - size_t *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); /* sn1 brand */ struct brand_ops sn1_brops = { - sn1_init_brand_data, - sn1_free_brand_data, - sn1_brandsys, - sn1_setbrand, - sn1_getattr, - sn1_setattr, - sn1_copy_procdata, - sn1_proc_exit, - sn1_exec, - lwp_setrval, - sn1_initlwp, - sn1_forklwp, - sn1_freelwp, - sn1_lwpexit, - sn1_elfexec, - NULL, - NULL, - NSIG, + 
sn1_init_brand_data, /* b_init_brand_data */ + sn1_free_brand_data, /* b_free_brand_data */ + sn1_brandsys, /* b_brandsys */ + sn1_setbrand, /* b_setbrand */ + sn1_getattr, /* b_getattr */ + sn1_setattr, /* b_setattr */ + sn1_copy_procdata, /* b_copy_procdata */ + sn1_proc_exit, /* b_proc_exit */ + sn1_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + NULL, /* b_lwpdata_alloc */ + NULL, /* b_lwpdata_free */ + sn1_initlwp, /* b_initlwp */ + NULL, /* b_initlwp_post */ + sn1_forklwp, /* b_forklwp */ + sn1_freelwp, /* b_freelwp */ + sn1_lwpexit, /* b_lwpexit */ + sn1_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + NULL, /* b_sigfd_translate */ + NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_sig_ignorable */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL, /* b_sendsig */ + NULL, /* b_setid_clear */ + NULL, /* b_pagefault */ + B_TRUE, /* b_intp_parse_arg */ + NULL, /* b_clearbrand */ + NULL, /* b_rpc_statd */ + NULL /* b_acct_out */ }; #ifdef sparc @@ -95,9 +121,12 @@ struct brand_mach_ops sn1_mops = { struct brand_mach_ops sn1_mops = { sn1_brand_sysenter_callback, + NULL, sn1_brand_int91_callback, sn1_brand_syscall_callback, - sn1_brand_syscall32_callback + sn1_brand_syscall32_callback, + NULL, + NULL }; #else /* ! 
__amd64 */ @@ -105,7 +134,10 @@ struct brand_mach_ops sn1_mops = { struct brand_mach_ops sn1_mops = { sn1_brand_sysenter_callback, NULL, + NULL, sn1_brand_syscall_callback, + NULL, + NULL, NULL }; #endif /* __amd64 */ @@ -116,7 +148,8 @@ struct brand sn1_brand = { BRAND_VER_1, "sn1", &sn1_brops, - &sn1_mops + &sn1_mops, + sizeof (brand_proc_data_t), }; static struct modlbrand modlbrand = { @@ -149,10 +182,10 @@ sn1_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) return (EINVAL); } -/*ARGSUSED*/ +/* ARGSUSED5 */ int sn1_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg3, uintptr_t arg4) { int res; @@ -172,9 +205,9 @@ sn1_copy_procdata(proc_t *child, proc_t *parent) } void -sn1_proc_exit(struct proc *p, klwp_t *l) +sn1_proc_exit(struct proc *p) { - brand_solaris_proc_exit(p, l, &sn1_brand); + brand_solaris_proc_exit(p, &sn1_brand); } void @@ -183,10 +216,10 @@ sn1_exec() brand_solaris_exec(&sn1_brand); } -int -sn1_initlwp(klwp_t *l) +void +sn1_initlwp(klwp_t *l, void *bd) { - return (brand_solaris_initlwp(l, &sn1_brand)); + brand_solaris_initlwp(l, &sn1_brand); } void @@ -215,18 +248,18 @@ sn1_free_brand_data(zone_t *zone) /*ARGSUSED*/ void -sn1_init_brand_data(zone_t *zone) +sn1_init_brand_data(zone_t *zone, kmutex_t *zsl) { } int sn1_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, setid, exec_file, cred, brand_action, &sn1_brand, SN1_BRANDNAME, - SN1_LIB, SN1_LIB32, SN1_LINKER, SN1_LINKER32)); + SN1_LIB, SN1_LIB32)); } int diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.h b/usr/src/uts/common/brand/sn1/sn1_brand.h index b487745e21..fef9dc128b 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.h +++ b/usr/src/uts/common/brand/sn1/sn1_brand.h @@ -20,6 +20,7 
@@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #ifndef _SN1_BRAND_H @@ -37,20 +38,14 @@ extern "C" { #define SN1_VERSION SN1_VERSION_1 #define SN1_LIB_NAME "sn1_brand.so.1" -#define SN1_LINKER_NAME "ld.so.1" #define SN1_LIB32 BRAND_NATIVE_DIR "usr/lib/" SN1_LIB_NAME -#define SN1_LINKER32 "/lib/" SN1_LINKER_NAME - #define SN1_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" SN1_LIB_NAME -#define SN1_LINKER64 "/lib/64/" SN1_LINKER_NAME #if defined(_LP64) #define SN1_LIB SN1_LIB64 -#define SN1_LINKER SN1_LINKER64 #else /* !_LP64 */ #define SN1_LIB SN1_LIB32 -#define SN1_LINKER SN1_LINKER32 #endif /* !_LP64 */ #if defined(_KERNEL) diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c index b3ea043cdb..4de7cbcc05 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.c +++ b/usr/src/uts/common/brand/solaris10/s10_brand.c @@ -46,45 +46,71 @@ char *s10_emulation_table = NULL; -void s10_init_brand_data(zone_t *); +void s10_init_brand_data(zone_t *, kmutex_t *); void s10_free_brand_data(zone_t *); void s10_setbrand(proc_t *); int s10_getattr(zone_t *, int, void *, size_t *); int s10_setattr(zone_t *, int, void *, size_t); int s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t); void s10_copy_procdata(proc_t *, proc_t *); -void s10_proc_exit(struct proc *, klwp_t *); +void s10_proc_exit(struct proc *); void s10_exec(); -int s10_initlwp(klwp_t *); +void s10_initlwp(klwp_t *, void *); void s10_forklwp(klwp_t *, klwp_t *); void s10_freelwp(klwp_t *); void s10_lwpexit(klwp_t *); int s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - size_t *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); void s10_sigset_native_to_s10(sigset_t *); void s10_sigset_s10_to_native(sigset_t *); /* s10 brand */ struct brand_ops s10_brops = { - 
s10_init_brand_data, - s10_free_brand_data, - s10_brandsys, - s10_setbrand, - s10_getattr, - s10_setattr, - s10_copy_procdata, - s10_proc_exit, - s10_exec, - lwp_setrval, - s10_initlwp, - s10_forklwp, - s10_freelwp, - s10_lwpexit, - s10_elfexec, - s10_sigset_native_to_s10, - s10_sigset_s10_to_native, - S10_NSIG, + s10_init_brand_data, /* b_init_brand_data */ + s10_free_brand_data, /* b_free_brand_data */ + s10_brandsys, /* b_brandsys */ + s10_setbrand, /* b_setbrand */ + s10_getattr, /* b_getattr */ + s10_setattr, /* b_setattr */ + s10_copy_procdata, /* b_copy_procdata */ + s10_proc_exit, /* b_proc_exit */ + s10_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + NULL, /* b_lwpdata_alloc */ + NULL, /* b_lwpdata_free */ + s10_initlwp, /* b_initlwp */ + NULL, /* b_initlwp_post */ + s10_forklwp, /* b_forklwp */ + s10_freelwp, /* b_freelwp */ + s10_lwpexit, /* b_lwpexit */ + s10_elfexec, /* b_elfexec */ + s10_sigset_native_to_s10, /* b_sigset_native_to_brand */ + s10_sigset_s10_to_native, /* b_sigset_brand_to_native */ + NULL, /* b_sigfd_translate */ + S10_NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_sig_ignorable */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL, /* b_sendsig */ + NULL, /* b_setid_clear */ + NULL, /* b_pagefault */ + B_TRUE, /* b_intp_parse_arg */ + NULL, /* b_clearbrand */ + NULL, /* b_rpc_statd */ + NULL /* b_acct_out */ }; #ifdef sparc @@ -100,9 +126,12 @@ struct brand_mach_ops s10_mops = { struct brand_mach_ops s10_mops = { s10_brand_sysenter_callback, + NULL, s10_brand_int91_callback, s10_brand_syscall_callback, - s10_brand_syscall32_callback + s10_brand_syscall32_callback, + NULL, + NULL }; #else /* ! 
__amd64 */ @@ -110,7 +139,10 @@ struct brand_mach_ops s10_mops = { struct brand_mach_ops s10_mops = { s10_brand_sysenter_callback, NULL, + NULL, s10_brand_syscall_callback, + NULL, + NULL, NULL }; #endif /* __amd64 */ @@ -121,7 +153,8 @@ struct brand s10_brand = { BRAND_VER_1, "solaris10", &s10_brops, - &s10_mops + &s10_mops, + sizeof (brand_proc_data_t), }; static struct modlbrand modlbrand = { @@ -250,10 +283,10 @@ s10_native(void *cmd, void *args) return (0); } -/*ARGSUSED*/ +/* ARGSUSED5 */ int s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg3, uintptr_t arg4) { proc_t *p = curproc; int res; @@ -327,9 +360,9 @@ s10_copy_procdata(proc_t *child, proc_t *parent) } void -s10_proc_exit(struct proc *p, klwp_t *l) +s10_proc_exit(struct proc *p) { - brand_solaris_proc_exit(p, l, &s10_brand); + brand_solaris_proc_exit(p, &s10_brand); } void @@ -338,10 +371,10 @@ s10_exec() brand_solaris_exec(&s10_brand); } -int -s10_initlwp(klwp_t *l) +void +s10_initlwp(klwp_t *l, void *bd) { - return (brand_solaris_initlwp(l, &s10_brand)); + brand_solaris_initlwp(l, &s10_brand); } void @@ -381,7 +414,7 @@ s10_free_brand_data(zone_t *zone) } void -s10_init_brand_data(zone_t *zone) +s10_init_brand_data(zone_t *zone, kmutex_t *zsl) { ASSERT(zone->zone_brand == &s10_brand); ASSERT(zone->zone_brand_data == NULL); @@ -391,11 +424,11 @@ s10_init_brand_data(zone_t *zone) int s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME, - S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32)); + S10_LIB, S10_LIB32)); } void diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.h b/usr/src/uts/common/brand/solaris10/s10_brand.h index 
11f9853f48..ffef485e12 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.h +++ b/usr/src/uts/common/brand/solaris10/s10_brand.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #ifndef _S10_BRAND_H @@ -42,17 +43,12 @@ extern "C" { #define S10_LINKER_NAME "ld.so.1" #define S10_LIB32 BRAND_NATIVE_DIR "usr/lib/" S10_LIB_NAME -#define S10_LINKER32 "/lib/" S10_LINKER_NAME - #define S10_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" S10_LIB_NAME -#define S10_LINKER64 "/lib/64/" S10_LINKER_NAME #if defined(_LP64) #define S10_LIB S10_LIB64 -#define S10_LINKER S10_LINKER64 #else /* !_LP64 */ #define S10_LIB S10_LIB32 -#define S10_LINKER S10_LINKER32 #endif /* !_LP64 */ /* diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index a4266ba267..93481c378e 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -22,6 +22,7 @@ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright 2022 Garrett D'Amore <garrett@damore.org> */ @@ -538,8 +539,8 @@ char *isa_list = architecture; static pgcnt_t original_physmem = 0; #define MIN_DEFAULT_MAXUSERS 8u -#define MAX_DEFAULT_MAXUSERS 2048u -#define MAX_MAXUSERS 4096u +#define MAX_DEFAULT_MAXUSERS 10000u +#define MAX_MAXUSERS 20000u void param_preset(void) @@ -551,7 +552,7 @@ void param_calc(int platform_max_nprocs) { /* - * Default to about one "user" per megabyte, taking into + * Default to about one "user" per 8MB, taking into * account both physical and virtual constraints. * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT) * converts pages to megs without integer overflow. 
@@ -565,8 +566,9 @@ param_calc(int platform_max_nprocs) if (maxusers == 0) { pgcnt_t physmegs = physmem >> (20 - PAGESHIFT); pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20; - maxusers = MIN(MAX(MIN(physmegs, virtmegs), - MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS); + maxusers = MIN(physmegs, virtmegs) >> 3; /* divide by 8 */ + maxusers = MAX(maxusers, MIN_DEFAULT_MAXUSERS); + maxusers = MIN(maxusers, MAX_DEFAULT_MAXUSERS); } if (maxusers > MAX_MAXUSERS) { maxusers = MAX_MAXUSERS; @@ -583,15 +585,26 @@ param_calc(int platform_max_nprocs) /* * We need to dynamically change any variables now so that - * the setting of maxusers and pidmax propagate to the other + * the setting of maxusers and maxpid propagate to the other * variables that are dependent on them. */ if (reserved_procs == 0) reserved_procs = 5; - if (pidmax < reserved_procs || pidmax > MAX_MAXPID) + if (pidmax < reserved_procs || pidmax > MAX_MAXPID) { maxpid = MAX_MAXPID; - else + } else { + /* + * If pidmax has not been explicitly set in /etc/system, then + * increase it to the maximum on larger machines. We choose a + * 128GB memory size as the threshold to increase pidmax. 
+ */ + if (pidmax == DEFAULT_MAXPID) { + if (physmem > (btop(128ULL * 0x40000000ULL))) { + pidmax = MAX_MAXPID; + } + } maxpid = pidmax; + } /* * This allows platform-dependent code to constrain the maximum diff --git a/usr/src/uts/common/contract/process.c b/usr/src/uts/common/contract/process.c index 3f25547b50..32e5707cb2 100644 --- a/usr/src/uts/common/contract/process.c +++ b/usr/src/uts/common/contract/process.c @@ -952,6 +952,18 @@ contract_process_exit(cont_process_t *ctp, proc_t *p, int exitstatus) (void) cte_publish_all(ct, event, nvl, NULL); mutex_enter(&ct->ct_lock); } + + /* + * CT_PR_EV_EXIT is not part of the CT_PR_ALLFATAL definition since + * we never allow including this in the fatal set via a user-land + * application, but we do allow CT_PR_EV_EXIT in the contract's fatal + * set for a process setup for zone init. See zone_start_init(). + */ + if (EVFATALP(ctp, CT_PR_EV_EXIT)) { + ASSERT(MUTEX_HELD(&ct->ct_lock)); + contract_process_kill(ct, p, B_TRUE); + } + if (empty) { /* * Send EMPTY message. @@ -1054,6 +1066,17 @@ contract_process_fork(ctmpl_process_t *rtmpl, proc_t *cp, proc_t *pp, event->cte_type = CT_PR_EV_FORK; (void) cte_publish_all(ct, event, nvl, NULL); } + + /* + * Because the CT_PR_KEEP_EXEC flag is meant to be used by applications + * which are not contract aware, we can assume that these applications + * will never explicitly abandon the child's new contract. Thus, we + * abandon it now. 
+ */ + if (ctp->conp_params & CT_PR_KEEP_EXEC) { + (void) contract_abandon(ct, pp, 1); + } + return (ctp); } diff --git a/usr/src/uts/common/crypto/api/kcf_random.c b/usr/src/uts/common/crypto/api/kcf_random.c index b99a834503..80bbb64505 100644 --- a/usr/src/uts/common/crypto/api/kcf_random.c +++ b/usr/src/uts/common/crypto/api/kcf_random.c @@ -70,6 +70,7 @@ #include <sys/cpuvar.h> #include <sys/taskq.h> #include <rng/fips_random.h> +#include <sys/strlog.h> #define RNDPOOLSIZE 1024 /* Pool size in bytes */ #define MINEXTRACTBYTES 20 diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c index 9e079a079e..ec9df915c5 100644 --- a/usr/src/uts/common/crypto/core/kcf_sched.c +++ b/usr/src/uts/common/crypto/core/kcf_sched.c @@ -1027,9 +1027,9 @@ kcfpool_svc(void *arg) case 0: case -1: /* - * Woke up with no work to do. Check - * if this thread should exit. We keep - * at least kcf_minthreads. + * Woke up with no work to do. Check if we + * should lwp_exit() (which won't return). We + * keep at least kcf_minthreads. */ if (kcfpool->kp_threads > kcf_minthreads) { KCF_ATOMIC_DECR(kcfpool->kp_threads); diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c index fd734bd229..30a69a23f0 100644 --- a/usr/src/uts/common/disp/cmt.c +++ b/usr/src/uts/common/disp/cmt.c @@ -201,13 +201,15 @@ pg_cmt_cpu_startup(cpu_t *cp) /* * Return non-zero if thread can migrate between "from" and "to" - * without a performance penalty + * without a performance penalty. This is true only if we share a core on + * virtually any CPU; sharing the last-level cache is insufficient to make + * migration possible without penalty. 
*/ int pg_cmt_can_migrate(cpu_t *from, cpu_t *to) { - if (from->cpu_physid->cpu_cacheid == - to->cpu_physid->cpu_cacheid) + if (from->cpu_physid->cpu_coreid == + to->cpu_physid->cpu_coreid) return (1); return (0); } diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c index 46f53faab6..2a4365ff73 100644 --- a/usr/src/uts/common/disp/cpucaps.c +++ b/usr/src/uts/common/disp/cpucaps.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013 Joyent, Inc. All rights reserved. */ #include <sys/disp.h> @@ -74,6 +75,32 @@ * Putting threads on wait queues in random places while running in the * kernel might lead to all kinds of locking problems. * + * Bursting + * ======== + * + * CPU bursting occurs when the CPU usage is over the baseline but under the + * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant + * environment so that we know how much CPU is allocated for a tenant under + * normal utilization. We can then track how much time a zone is spending + * over the "normal" CPU utilization expected for that zone using the + * "above_base_sec" kstat. This kstat is cumulative. + * + * If the zone has a burst limit (zone.cpu-burst-time) then the zone can + * burst for that period of time (in seconds) before the effective cap is + * lowered to the baseline. Once the effective cap is lowered, the zone + * will run at the baseline for the burst limit before the effective cap is + * raised again to the full value. This will allow the zone to burst again. + * We can watch this behavior using the kstats. The "effective" kstat shows + * which cap is being used, the baseline value or the burst value. The + * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the + * "bursting_sec" kstat shows how many seconds the zone has currently been + * bursting. 
When the CPU load is continuously greater than the baseline, + * bursting_sec will increase, up to the burst_limit_sec value, then the + * effective kstat will drop to the baseline and the bursting_sec value will + * decrease until it hits 0, at which time the effective kstat will return to + * the full burst value and the bursting_sec value will begin to increase + * again. + * * Accounting * ========== * @@ -203,18 +230,28 @@ static void caps_update(); */ struct cap_kstat { kstat_named_t cap_value; + kstat_named_t cap_baseline; + kstat_named_t cap_effective; + kstat_named_t cap_burst_limit; + kstat_named_t cap_bursting; kstat_named_t cap_usage; kstat_named_t cap_nwait; kstat_named_t cap_below; kstat_named_t cap_above; + kstat_named_t cap_above_base; kstat_named_t cap_maxusage; kstat_named_t cap_zonename; } cap_kstat = { { "value", KSTAT_DATA_UINT64 }, + { "baseline", KSTAT_DATA_UINT64 }, + { "effective", KSTAT_DATA_UINT64 }, + { "burst_limit_sec", KSTAT_DATA_UINT64 }, + { "bursting_sec", KSTAT_DATA_UINT64 }, { "usage", KSTAT_DATA_UINT64 }, { "nwait", KSTAT_DATA_UINT64 }, { "below_sec", KSTAT_DATA_UINT64 }, { "above_sec", KSTAT_DATA_UINT64 }, + { "above_base_sec", KSTAT_DATA_UINT64 }, { "maxusage", KSTAT_DATA_UINT64 }, { "zonename", KSTAT_DATA_STRING }, }; @@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) cap->cap_below = cap->cap_above = 0; cap->cap_maxusage = 0; cap->cap_usage = 0; - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; waitq_unblock(&cap->cap_waitq); if (CPUCAPS_OFF()) { cpucaps_enabled = B_TRUE; @@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap) ASSERT(CAP_ENABLED(cap)); waitq_block(&cap->cap_waitq); + + /* do this first to avoid race with cap_kstat_update */ + if (cap->cap_kstat != NULL) { + kstat_delete(cap->cap_kstat); + cap->cap_kstat = NULL; + } + list_remove(l, cap); if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) { cpucaps_enabled = B_FALSE; cpucaps_clock_callout = 
NULL; } - cap->cap_value = 0; + cap->cap_value = cap->cap_chk_value = 0; cap->cap_project = NULL; cap->cap_zone = NULL; - if (cap->cap_kstat != NULL) { - kstat_delete(cap->cap_kstat); - cap->cap_kstat = NULL; - } - } /* @@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) * The waitq_isempty check is performed without the waitq lock. If a new thread * is placed on the waitq right after the check, it will be picked up during the * next invocation of cap_poke_waitq(). + * + * Called once per tick for zones. */ /* ARGSUSED */ static void @@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen) { ASSERT(MUTEX_HELD(&caps_lock)); - if (cap->cap_usage >= cap->cap_value) { + if (cap->cap_base != 0) { + /* + * Because of the way usage is calculated and decayed, it's + * possible for the zone to be slightly over its cap, but we + * don't want to count that after we have reduced the effective + * cap to the baseline. That way the zone will be able to + * burst again after the burst_limit has expired. + */ + if (cap->cap_usage > cap->cap_base && + cap->cap_chk_value == cap->cap_value) { + cap->cap_above_base++; + + /* + * If bursting is limited and we've been bursting + * longer than we're supposed to, then set the + * effective cap to the baseline. + */ + if (cap->cap_burst_limit != 0) { + cap->cap_bursting++; + if (cap->cap_bursting >= cap->cap_burst_limit) + cap->cap_chk_value = cap->cap_base; + } + } else if (cap->cap_bursting > 0) { + /* + * We're not bursting now, but we were, decay the + * bursting timer. + */ + cap->cap_bursting--; + /* + * Reset the effective cap once we decay to 0 so we + * can burst again. 
+ */ + if (cap->cap_bursting == 0 && + cap->cap_chk_value != cap->cap_value) + cap->cap_chk_value = cap->cap_value; + } + } + + if (cap->cap_usage >= cap->cap_chk_value) { cap->cap_above++; } else { waitq_t *wq = &cap->cap_waitq; cap->cap_below++; - if (!waitq_isempty(wq)) - waitq_runone(wq); + if (!waitq_isempty(wq)) { + int i, ndequeue, p; + + /* + * Since this function is only called once per tick, + * we can hit a situation where we have artificially + * limited the project/zone below its cap. This would + * happen if we have multiple threads queued up but + * only dequeued one thread/tick. To avoid this we + * dequeue multiple threads, calculated based on the + * usage percentage of the cap. It is possible that we + * could dequeue too many threads and some of them + * might be put back on the wait queue quickly, but + * since we know that threads are on the wait queue + * because we're capping, we know that there is unused + * CPU cycles anyway, so this extra work would not + * hurt. Also, the ndequeue number is only an upper + * bound and we might dequeue less, depending on how + * many threads are actually in the wait queue. The + * ndequeue values are empirically derived and could be + * adjusted or calculated in another way if necessary. + */ + p = (int)((100 * cap->cap_usage) / cap->cap_chk_value); + if (p >= 98) + ndequeue = 10; + else if (p >= 95) + ndequeue = 20; + else if (p >= 90) + ndequeue = 40; + else if (p >= 85) + ndequeue = 80; + else + ndequeue = 160; + + for (i = 0; i < ndequeue; i++) { + waitq_runone(wq); + if (waitq_isempty(wq)) + break; + } + DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i); + } } } @@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg) * Remove all projects in this zone without caps * from the capped_projects list. 
*/ - if (project_cap->cap_value == MAX_USAGE) { + if (project_cap->cap_chk_value == MAX_USAGE) { cap_project_disable(kpj); } } else if (CAP_DISABLED(project_cap)) { /* * Add the project to capped_projects list. */ - ASSERT(project_cap->cap_value == 0); + ASSERT(project_cap->cap_chk_value == 0); cap_project_enable(kpj, MAX_USAGE); } mutex_exit(&caps_lock); @@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) /* * No state transitions, just change the value */ - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } ASSERT(MUTEX_HELD(&caps_lock)); @@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) } /* + * Set zone's base cpu value to base_val + */ +int +cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= MAXCAP); + if (base_val > MAXCAP) + base_val = MAXCAP; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = base_val * cap_tick_cost; + if (value < 0 || value > cap->cap_value) + value = 0; + + cap->cap_base = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* + * Set zone's maximum burst time in seconds. A burst time of 0 means that + * the zone can run over its baseline indefinitely. 
+ */ +int +cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= INT_MAX); + /* Treat the default as 0 - no limit */ + if (base_val == INT_MAX) + base_val = 0; + if (base_val > INT_MAX) + base_val = INT_MAX; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = SEC_TO_TICK(base_val); + if (value < 0) + value = 0; + + cap->cap_burst_limit = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* * The project is going away so disable its cap. */ void @@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) if (CAP_DISABLED(cap)) cap_project_enable(kpj, value); else - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } else if (CAP_ENABLED(cap)) { /* * User requested to drop a cap on the project. If it is part of @@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) * otherwise disable the cap. */ if (ZONE_IS_CAPPED(kpj->kpj_zone)) { - cap->cap_value = MAX_USAGE; + cap->cap_value = cap->cap_chk_value = MAX_USAGE; } else { cap_project_disable(kpj); } @@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone) } /* + * Get current zone baseline. + */ +rctl_qty_t +cpucaps_zone_get_base(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0); +} + +/* + * Get current zone maximum burst time. + */ +rctl_qty_t +cpucaps_zone_get_burst_time(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? 
+ (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0); +} + +/* * Charge project of thread t the time thread t spent on CPU since previously * adjusted. * @@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) project_cap = kpj->kpj_cpucap; - if (project_cap->cap_usage >= project_cap->cap_value) { + if (project_cap->cap_usage >= project_cap->cap_chk_value) { t->t_schedflag |= TS_PROJWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_PROJWAITQ) { @@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) } else { cpucap_t *zone_cap = zone->zone_cpucap; - if (zone_cap->cap_usage >= zone_cap->cap_value) { + if (zone_cap->cap_usage >= zone_cap->cap_chk_value) { t->t_schedflag |= TS_ZONEWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_ZONEWAITQ) { @@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t) /* * Convert internal cap statistics into values exported by cap kstat. + * Note that the kstat is held throughout this function but caps_lock is not. 
*/ static int cap_kstat_update(kstat_t *ksp, int rw) @@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_value.value.ui64 = ROUND_SCALE(cap->cap_value, cap_tick_cost); + capsp->cap_baseline.value.ui64 = + ROUND_SCALE(cap->cap_base, cap_tick_cost); + capsp->cap_effective.value.ui64 = + ROUND_SCALE(cap->cap_chk_value, cap_tick_cost); + capsp->cap_burst_limit.value.ui64 = + ROUND_SCALE(cap->cap_burst_limit, tick_sec); capsp->cap_usage.value.ui64 = ROUND_SCALE(cap->cap_usage, cap_tick_cost); capsp->cap_maxusage.value.ui64 = @@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); + capsp->cap_above_base.value.ui64 = + ROUND_SCALE(cap->cap_above_base, tick_sec); + capsp->cap_bursting.value.ui64 = + ROUND_SCALE(cap->cap_bursting, tick_sec); kstat_named_setstr(&capsp->cap_zonename, zonename); return (0); diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c index a9d5f969dc..f0e4aaecab 100644 --- a/usr/src/uts/common/disp/disp.c +++ b/usr/src/uts/common/disp/disp.c @@ -2256,7 +2256,7 @@ disp_getbest(disp_t *dp) * placed earlier. */ if (tcp == NULL || - pri >= minclsyspri || + (pri >= minclsyspri && tp->t_procp == &p0) || tp->t_cpu != tcp) break; diff --git a/usr/src/uts/common/disp/fx.c b/usr/src/uts/common/disp/fx.c index 7fc81e7815..191075e032 100644 --- a/usr/src/uts/common/disp/fx.c +++ b/usr/src/uts/common/disp/fx.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -70,16 +70,6 @@ static struct modlinkage modlinkage = { }; -/* - * control flags (kparms->fx_cflags). 
- */ -#define FX_DOUPRILIM 0x01 /* change user priority limit */ -#define FX_DOUPRI 0x02 /* change user priority */ -#define FX_DOTQ 0x04 /* change FX time quantum */ - - -#define FXMAXUPRI 60 /* maximum user priority setting */ - #define FX_MAX_UNPRIV_PRI 0 /* maximum unpriviledge priority */ /* diff --git a/usr/src/uts/common/disp/priocntl.c b/usr/src/uts/common/disp/priocntl.c index 5412df83f5..60e870ba28 100644 --- a/usr/src/uts/common/disp/priocntl.c +++ b/usr/src/uts/common/disp/priocntl.c @@ -114,7 +114,7 @@ copyin_vaparms32(caddr_t arg, pc_vaparms_t *vap, uio_seg_t seg) #endif -static int donice(procset_t *, pcnice_t *); +int donice(procset_t *, pcnice_t *); static int doprio(procset_t *, pcprio_t *); static int proccmp(proc_t *, struct pcmpargs *); static int setparms(proc_t *, struct stprmargs *); @@ -991,7 +991,7 @@ setprocnice(proc_t *pp, pcnice_t *pcnice) /* * Update the nice value of the specified LWP or set of processes. */ -static int +int donice(procset_t *procset, pcnice_t *pcnice) { int err_proc = 0; diff --git a/usr/src/uts/common/disp/rt.c b/usr/src/uts/common/disp/rt.c index f87f8c56ce..115e42ccb8 100644 --- a/usr/src/uts/common/disp/rt.c +++ b/usr/src/uts/common/disp/rt.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -103,13 +103,6 @@ _info(struct modinfo *modinfop) pri_t rt_maxpri = RTMAXPRI; /* maximum real-time priority */ rtdpent_t *rt_dptbl; /* real-time dispatcher parameter table */ -/* - * control flags (kparms->rt_cflags). 
- */ -#define RT_DOPRI 0x01 /* change priority */ -#define RT_DOTQ 0x02 /* change RT time quantum */ -#define RT_DOSIG 0x04 /* change RT time quantum signal */ - static int rt_admin(caddr_t, cred_t *); static int rt_enterclass(kthread_t *, id_t, void *, cred_t *, void *); static int rt_fork(kthread_t *, kthread_t *, void *); diff --git a/usr/src/uts/common/disp/rt_dptbl.c b/usr/src/uts/common/disp/rt_dptbl.c index 1012b5aef2..a5c8836518 100644 --- a/usr/src/uts/common/disp/rt_dptbl.c +++ b/usr/src/uts/common/disp/rt_dptbl.c @@ -68,8 +68,6 @@ _info(struct modinfo *modinfop) return (mod_info(&modlinkage, modinfop)); } -#define RTGPPRIO0 100 /* Global priority for RT priority 0 */ - rtdpent_t config_rt_dptbl[] = { /* prilevel Time quantum */ diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index 764942d4df..b2b28ec06f 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -77,6 +77,10 @@ #include <sys/ctype.h> #include <sys/smt.h> +#ifndef STACK_GROWTH_DOWN +#error Stacks do not grow downward; 3b2 zombie attack detected! 
+#endif + struct kmem_cache *thread_cache; /* cache of free threads */ struct kmem_cache *lwp_cache; /* cache of free lwps */ struct kmem_cache *turnstile_cache; /* cache of free turnstiles */ @@ -374,7 +378,7 @@ thread_create( if (stksize <= sizeof (kthread_t) + PTR24_ALIGN) cmn_err(CE_PANIC, "thread_create: proposed stack size" " too small to hold thread."); -#ifdef STACK_GROWTH_DOWN + stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1); stksize &= -PTR24_ALIGN; /* make thread aligned */ t = (kthread_t *)(stk + stksize); @@ -383,13 +387,6 @@ thread_create( audit_thread_create(t); t->t_stk = stk + stksize; t->t_stkbase = stk; -#else /* stack grows to larger addresses */ - stksize -= SA(sizeof (kthread_t)); - t = (kthread_t *)(stk); - bzero(t, sizeof (kthread_t)); - t->t_stk = stk + sizeof (kthread_t); - t->t_stkbase = stk + stksize + sizeof (kthread_t); -#endif /* STACK_GROWTH_DOWN */ t->t_flag |= T_TALLOCSTK; t->t_swap = stk; } else { @@ -402,13 +399,8 @@ thread_create( * Initialize t_stk to the kernel stack pointer to use * upon entry to the kernel */ -#ifdef STACK_GROWTH_DOWN t->t_stk = stk + stksize; t->t_stkbase = stk; -#else - t->t_stk = stk; /* 3b2-like */ - t->t_stkbase = stk + stksize; -#endif /* STACK_GROWTH_DOWN */ } if (kmem_stackinfo != 0) { @@ -584,6 +576,9 @@ thread_exit(void) if ((t->t_proc_flag & TP_ZTHREAD) != 0) cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called"); + if ((t->t_flag & T_SPLITSTK) != 0) + cmn_err(CE_PANIC, "thread_exit: called when stack is split"); + tsd_exit(); /* Clean up this thread's TSD */ kcpc_passivate(); /* clean up performance counter state */ @@ -2050,6 +2045,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front) return (on_rq); } + +/* + * There are occasions in the kernel when we need much more stack than we + * allocate by default, but we do not wish to have that work done + * asynchronously by another thread. 
To accommodate these scenarios, we allow + * for a split stack (also known as a "segmented stack") whereby a new stack + * is dynamically allocated and the current thread jumps onto it for purposes + * of executing the specified function. After the specified function returns, + * the stack is deallocated and control is returned to the caller. This + * functionality is implemented by thread_splitstack(), below; there are a few + * constraints on its use: + * + * - The caller must be in a context where it is safe to block for memory. + * - The caller cannot be in a t_onfault context + * - The called function must not call thread_exit() while on the split stack + * + * The code will explicitly panic if these constraints are violated. Notably, + * however, thread_splitstack() _can_ be called on a split stack -- there + * is no limit to the level that split stacks can nest. + * + * When the stack is split, it is constructed such that stack backtraces + * from kernel debuggers continue to function -- though note that DTrace's + * stack() action and stackdepth function will only show the stack up to and + * including thread_splitstack_run(); DTrace explicitly bounds itself to + * pointers that exist within the current declared stack as a safety + * mechanism. 
+ */ +void +thread_splitstack(void (*func)(void *), void *arg, size_t stksize) +{ + kthread_t *t = curthread; + caddr_t ostk, ostkbase, stk; + ushort_t otflag; + + if (t->t_onfault != NULL) + panic("thread_splitstack: called with non-NULL t_onfault"); + + ostk = t->t_stk; + ostkbase = t->t_stkbase; + otflag = t->t_flag; + + stksize = roundup(stksize, PAGESIZE); + + if (stksize < default_stksize) + stksize = default_stksize; + + if (stksize == default_stksize) { + stk = (caddr_t)segkp_cache_get(segkp_thread); + } else { + stksize = roundup(stksize, PAGESIZE); + stk = (caddr_t)segkp_get(segkp, stksize, + (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED)); + } + + /* + * We're going to lock ourselves before we set T_SPLITSTK to assure + * that we're not swapped out in the meantime. (Note that we don't + * bother to set t_swap, as we're not going to be swapped out.) + */ + thread_lock(t); + + if (!(otflag & T_SPLITSTK)) + t->t_flag |= T_SPLITSTK; + + t->t_stk = stk + stksize; + t->t_stkbase = stk; + + thread_unlock(t); + + /* + * Now actually run on the new (split) stack... + */ + thread_splitstack_run(t->t_stk, func, arg); + + /* + * We're back onto our own stack; lock ourselves and restore our + * pre-split state. + */ + thread_lock(t); + + t->t_stk = ostk; + t->t_stkbase = ostkbase; + + if (!(otflag & T_SPLITSTK)) + t->t_flag &= ~T_SPLITSTK; + + thread_unlock(t); + + /* + * Now that we are entirely back on our own stack, call back into + * the platform layer to perform any platform-specific cleanup. + */ + thread_splitstack_cleanup(); + + segkp_release(segkp, stk); +} + /* * Tunable kmem_stackinfo is set, fill the kernel thread stack with a * specific pattern. 
diff --git a/usr/src/uts/common/dtrace/dtrace.c b/usr/src/uts/common/dtrace/dtrace.c index 054cb43c9b..5ecd546d01 100644 --- a/usr/src/uts/common/dtrace/dtrace.c +++ b/usr/src/uts/common/dtrace/dtrace.c @@ -7793,7 +7793,7 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp) priv = DTRACE_PRIV_ALL; } else { *uidp = crgetuid(cr); - *zoneidp = crgetzoneid(cr); + *zoneidp = crgetzonedid(cr); priv = 0; if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) @@ -8289,7 +8289,7 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, provider->dtpv_priv.dtpp_flags = priv; if (cr != NULL) { provider->dtpv_priv.dtpp_uid = crgetuid(cr); - provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr); + provider->dtpv_priv.dtpp_zoneid = crgetzonedid(cr); } provider->dtpv_pops = *pops; @@ -8893,6 +8893,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) uint32_t priv; uid_t uid; zoneid_t zoneid; + dtrace_state_t *state = enab->dten_vstate->dtvs_state; ASSERT(MUTEX_HELD(&dtrace_lock)); dtrace_ecb_create_cache = NULL; @@ -8907,8 +8908,22 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) } dtrace_probekey(desc, &pkey); - dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred, - &priv, &uid, &zoneid); + dtrace_cred2priv(state->dts_cred.dcr_cred, &priv, &uid, &zoneid); + + if ((priv & DTRACE_PRIV_ZONEOWNER) && + state->dts_options[DTRACEOPT_ZONE] != DTRACEOPT_UNSET) { + /* + * If we have the privilege of instrumenting all zones but we + * have been told to instrument but one, we will spoof this up + * depriving ourselves of DTRACE_PRIV_ZONEOWNER for purposes + * of dtrace_match(). (Note that DTRACEOPT_ZONE is not for + * security but rather for performance: it allows the global + * zone to instrument USDT probes in a local zone without + * requiring all zones to be instrumented.) 
+ */ + priv &= ~DTRACE_PRIV_ZONEOWNER; + zoneid = state->dts_options[DTRACEOPT_ZONE]; + } return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab)); diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c index 5112b1dc37..2db9d5447e 100644 --- a/usr/src/uts/common/dtrace/sdt_subr.c +++ b/usr/src/uts/common/dtrace/sdt_subr.c @@ -114,6 +114,10 @@ static dtrace_pattr_t smb_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, }; +/* + * When adding a new provider you must add it before sdt as sdt is a catch all + * for remaining probes. + */ sdt_provider_t sdt_providers[] = { { "vtrace", "__vtrace_", &vtrace_attr }, { "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER }, @@ -136,6 +140,7 @@ sdt_provider_t sdt_providers[] = { { "fc", "__fc_", &fc_attr }, { "srp", "__srp_", &fc_attr }, { "sysevent", "__sysevent_", &stab_attr }, + { "vnd", "__vnd_", &stab_attr }, { "sdt", NULL, &sdt_attr }, { NULL } }; @@ -1473,6 +1478,34 @@ sdt_argdesc_t sdt_args[] = { { "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *", "fc_port_info_t *" }, + { "vnd", "flow-blocked", 0, 0, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "flow-blocked", 1, 1, "uint64_t", "uint64_t" }, + { "vnd", "flow-blocked", 2, 2, "uintptr_t", "uintptr_t" }, + { "vnd", "flow-resumed", 0, 0, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "flow-resumed", 1, 1, "uint64_t", "uint64_t" }, + { "vnd", "flow-resumed", 2, 2, "uintptr_t", "uintptr_t" }, + { "vnd", "drop-in", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-in", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-in", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-in", 3, 3, "const char *", "const char *" }, + { "vnd", "drop-out", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-out", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-out", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-out", 3, 3, "const char *", "const char *" }, + { "vnd", 
"drop-ctl", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-ctl", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-ctl", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-ctl", 3, 3, "const char *", "const char *" }, + { "vnd", "send", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "send", 1, 1, "void *", "csinfo_t *" }, + { "vnd", "send", 2, 2, "void *", "ipinfo_t *" }, + { "vnd", "send", 3, 3, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "send", 4, 4, "mblk_t *", "etherinfo_t *" }, + { "vnd", "recv", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "recv", 1, 1, "void *", "csinfo_t *" }, + { "vnd", "recv", 2, 2, "void *", "ipinfo_t *" }, + { "vnd", "recv", 3, 3, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "recv", 4, 4, "mblk_t *", "etherinfo_t *" }, { NULL } }; diff --git a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c index 73d302aaa5..53bbd078ba 100644 --- a/usr/src/uts/common/exec/elf/elf.c +++ b/usr/src/uts/common/exec/elf/elf.c @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright 2019, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2022 Oxide Computer Company */ @@ -94,7 +94,6 @@ static int mapelfexec(vnode_t *, Ehdr *, uint_t, caddr_t, Phdr **, Phdr **, Phdr **, Phdr **, Phdr *, caddr_t *, caddr_t *, intptr_t *, uintptr_t *, size_t, size_t *, size_t *); - #ifdef _ELF32_COMPAT /* Link against the non-compat instances when compiling the 32-bit version. */ extern size_t elf_datasz_max; @@ -181,12 +180,16 @@ elf_ctx_resize_scratch(elf_core_ctx_t *ctx, size_t sz) #endif /* _ELF32_COMPAT */ /* - * Map in the executable pointed to by vp. Returns 0 on success. + * Map in the executable pointed to by vp. Returns 0 on success. Note that + * this function currently has the maximum number of arguments allowed by + * modstubs on x86 (MAXNARG)! Do _not_ add to this function signature without + * adding to MAXNARG. 
(Better yet, do not add to this monster of a function + * signature!) */ int mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, - intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase, - caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap) + intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase, + caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp) { size_t len, phdrsize; struct vattr vat; @@ -197,12 +200,16 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, Phdr *junk = NULL; Phdr *dynphdr = NULL; Phdr *dtrphdr = NULL; + char *interp = NULL; uintptr_t lddata, minaddr; size_t execsz; if (lddatap != NULL) *lddatap = 0; + if (minaddrp != NULL) + *minaddrp = (uintptr_t)NULL; + if (error = execpermissions(vp, &vat, args)) { uprintf("%s: Cannot execute %s\n", exec_file, args->pathname); return (error); @@ -234,24 +241,85 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, return (error); } + if (minaddrp != NULL) + *minaddrp = minaddr; + /* - * Inform our caller if the executable needs an interpreter. + * If the executable requires an interpreter, determine its name. */ - *interp = (dynphdr == NULL) ? 0 : 1; + if (dynphdr != NULL) { + ssize_t resid; + + if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) { + uprintf("%s: Invalid interpreter\n", exec_file); + kmem_free(phdrbase, phdrsize); + return (ENOEXEC); + } + + interp = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if ((error = vn_rdwr(UIO_READ, vp, interp, + (ssize_t)dynphdr->p_filesz, + (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0, + (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 || + interp[dynphdr->p_filesz - 1] != '\0') { + uprintf("%s: Cannot obtain interpreter pathname\n", + exec_file); + kmem_free(interp, MAXPATHLEN); + kmem_free(phdrbase, phdrsize); + return (error != 0 ? 
error : ENOEXEC); + } + } /* * If this is a statically linked executable, voffset should indicate * the address of the executable itself (it normally holds the address * of the interpreter). */ - if (ehdr->e_type == ET_EXEC && *interp == 0) + if (ehdr->e_type == ET_EXEC && interp == NULL) *voffset = minaddr; + /* + * If the caller has asked for the interpreter name, return it (it's + * up to the caller to free it); if the caller hasn't asked for it, + * free it ourselves. + */ + if (interpp != NULL) { + *interpp = interp; + } else if (interp != NULL) { + kmem_free(interp, MAXPATHLEN); + } + if (uphdr != NULL) { *uphdr_vaddr = uphdr->p_vaddr; if (uphdr->p_flags == 0) kmem_free(uphdr, sizeof (Phdr)); + } else if (ehdr->e_type == ET_DYN) { + /* + * If we don't have a uphdr, we'll apply the logic found + * in mapelfexec() and use the p_vaddr of the first PT_LOAD + * section as the base address of the object. + */ + const Phdr *phdr = (Phdr *)phdrbase; + const uint_t hsize = ehdr->e_phentsize; + uint_t i; + + for (i = nphdrs; i > 0; i--) { + if (phdr->p_type == PT_LOAD) { + *uphdr_vaddr = (uintptr_t)phdr->p_vaddr + + ehdr->e_phoff; + break; + } + + phdr = (Phdr *)((caddr_t)phdr + hsize); + } + + /* + * If we don't have a PT_LOAD segment, we should have returned + * ENOEXEC when elfsize() returned 0, above. 
+ */ + VERIFY(i > 0); } else { *uphdr_vaddr = (Addr)-1; } @@ -263,13 +331,13 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, int elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { caddr_t phdrbase = NULL; caddr_t bssbase = 0; caddr_t brkbase = 0; size_t brksize = 0; - size_t dlnsize; + size_t dlnsize, nsize = 0; aux_entry_t *aux; int error; ssize_t resid; @@ -349,7 +417,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, *execsz = btopr(SINCR) + btopr(SSIZE) + btopr(NCARGS32-1); } else { args->to_model = DATAMODEL_LP64; - args->stk_prot &= ~PROT_EXEC; + if (!args->stk_prot_override) { + args->stk_prot &= ~PROT_EXEC; + } #if defined(__x86) args->dat_prot &= ~PROT_EXEC; #endif @@ -361,11 +431,25 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #endif /* _LP64 */ /* - * We delay invoking the brand callback until we've figured out - * what kind of elf binary we're trying to run, 32-bit or 64-bit. - * We do this because now the brand library can just check - * args->to_model to see if the target is 32-bit or 64-bit without - * having do duplicate all the code above. + * We delay invoking the brand callback until we've figured out what + * kind of elf binary we're trying to run, 32-bit or 64-bit. We do this + * because now the brand library can just check args->to_model to see if + * the target is 32-bit or 64-bit without having do duplicate all the + * code above. + * + * We also give the brand a chance to indicate that based on the ELF + * OSABI of the target binary it should become unbranded and optionally + * indicate that it should be treated as existing in a specific prefix. + * + * Note that if a brand opts to go down this route it does not actually + * end up being debranded. 
In other words, future programs that exec + * will still be considered for branding unless this escape hatch is + * used. Consider the case of lx brand for example. If a user runs + * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable + * of DTrace that's in /native will take this escape hatch and be run + * and interpreted using the normal system call table; however, the + * execution of a non-illumos binary in the form of /bin/ls will still + * be branded and be subject to all of the normal actions of the brand. * * The level checks associated with brand handling below are used to * prevent a loop since the brand elfexec function typically comes back @@ -373,8 +457,20 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * handling in the #! interpreter code will increment the level before * calling gexec to run the final elfexec interpreter. */ + if ((level <= INTP_MAXDEPTH) && (*brand_action != EBA_NATIVE) && + (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) { + if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI], + &args->brand_nroot) == B_TRUE) { + ASSERT(ehdrp->e_ident[EI_OSABI]); + *brand_action = EBA_NATIVE; + /* Add one for the trailing '/' in the path */ + if (args->brand_nroot != NULL) + nsize = strlen(args->brand_nroot) + 1; + } + } + if ((level <= INTP_MAXDEPTH) && - (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { error = BROP(p)->b_elfexec(vp, uap, args, idatap, level + 1, execsz, setid, exec_file, cred, brand_action); @@ -448,6 +544,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * AT_BASE * AT_FLAGS * AT_PAGESZ + * AT_RANDOM (added in stk_copyout) * AT_SUN_AUXFLAGS * AT_SUN_HWCAP * AT_SUN_HWCAP2 @@ -456,7 +553,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * AT_SUN_EXECNAME (added in stk_copyout) * AT_NULL * - * total == 10 + * total == 11 */ if (hasintp && hasu) { /* @@ -471,7 +568,7 
@@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 5 */ - args->auxsize = (10 + 5) * sizeof (aux_entry_t); + args->auxsize = (11 + 5) * sizeof (aux_entry_t); } else if (hasintp) { /* * Has PT_INTERP but no PT_PHDR @@ -481,9 +578,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 2 */ - args->auxsize = (10 + 2) * sizeof (aux_entry_t); + args->auxsize = (11 + 2) * sizeof (aux_entry_t); } else { - args->auxsize = 10 * sizeof (aux_entry_t); + args->auxsize = 11 * sizeof (aux_entry_t); } } else { args->auxsize = 0; @@ -497,6 +594,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, args->auxsize += sizeof (aux_entry_t); /* + * If this is a native binary that's been given a modified interpreter + * root, inform it that the native system exists at that root. + */ + if (args->brand_nroot != NULL) { + args->auxsize += sizeof (aux_entry_t); + } + + + /* * On supported kernels (x86_64) make room in the auxv for the * AT_SUN_COMMPAGE entry. This will go unpopulated on i86xpv systems * which do not provide such functionality. @@ -508,13 +614,24 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, args->auxsize += 3 * sizeof (aux_entry_t); #endif /* defined(__amd64) */ - if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + /* + * If we have user credentials, we'll supply the following entries: + * AT_SUN_UID + * AT_SUN_RUID + * AT_SUN_GID + * AT_SUN_RGID + */ + if (cred != NULL) { + args->auxsize += 4 * sizeof (aux_entry_t); + } + + if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { branded = 1; /* - * We will be adding 4 entries to the aux vectors. One for - * the the brandname and 3 for the brand specific aux vectors. + * We will be adding 5 entries to the aux vectors. One for + * the brandname and 4 for the brand specific aux vectors. 
*/ - args->auxsize += 4 * sizeof (aux_entry_t); + args->auxsize += 5 * sizeof (aux_entry_t); } /* If the binary has an explicit ASLR flag, it must be honoured */ @@ -595,7 +712,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, aux = bigwad->elfargs; /* * Move args to the user's stack. - * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries. + * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM + * aux entries. */ if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) { if (error == -1) { @@ -645,7 +763,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, char *p; struct vnode *nvp; - dlnsize = intphdr->p_filesz; + dlnsize = intphdr->p_filesz + nsize; /* * Make sure none of the component pieces of dlnsize result in @@ -656,10 +774,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, goto bad; } + if (nsize != 0) { + bcopy(args->brand_nroot, dlnp, nsize - 1); + dlnp[nsize - 1] = '/'; + } + /* * Read in "interpreter" pathname. */ - if ((error = vn_rdwr(UIO_READ, vp, dlnp, + if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize, (ssize_t)intphdr->p_filesz, (offset_t)intphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0, CRED(), &resid)) != 0) { uprintf("%s: Cannot obtain interpreter pathname\n", @@ -842,8 +965,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #endif /* defined(__amd64) */ /* - * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via - * exec_args() + * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were + * filled in via exec_args() */ ADDAUX(aux, AT_BASE, voffset) ADDAUX(aux, AT_FLAGS, at_flags) @@ -871,7 +994,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * malicious user within the zone from crafting a wrapper to * run native suid commands with unsecure libraries interposed. 
*/ - if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && + if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && (setid &= ~EXECSETID_SETID) != 0)) auxf &= ~AF_SUN_SETUGID; @@ -886,6 +1009,17 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ADDAUX(aux, AT_SUN_AUXFLAGS, auxf); /* + * Record information about the real and effective user and + * group IDs. + */ + if (cred != NULL) { + ADDAUX(aux, AT_SUN_UID, crgetuid(cred)); + ADDAUX(aux, AT_SUN_RUID, crgetruid(cred)); + ADDAUX(aux, AT_SUN_GID, crgetgid(cred)); + ADDAUX(aux, AT_SUN_RGID, crgetrgid(cred)); + } + + /* * Hardware capability flag word (performance hints) * Used for choosing faster library routines. * (Potentially different between 32-bit and 64-bit ABIs) @@ -912,6 +1046,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ADDAUX(aux, AT_SUN_BRAND_AUX1, 0) ADDAUX(aux, AT_SUN_BRAND_AUX2, 0) ADDAUX(aux, AT_SUN_BRAND_AUX3, 0) + ADDAUX(aux, AT_SUN_BRAND_AUX4, 0) } /* @@ -1119,10 +1254,10 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs, * We got here by the first two bytes in ident, * now read the entire ELF header. 
*/ - if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, - sizeof (Ehdr), (offset_t)0, UIO_SYSSPACE, 0, - (rlim64_t)0, credp, &resid)) != 0) + if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)ehdr, sizeof (Ehdr), + (offset_t)0, UIO_SYSSPACE, 0, (rlim64_t)0, credp, &resid)) != 0) { return (error); + } /* * Since a separate version is compiled for handling 32-bit and @@ -1131,8 +1266,9 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs, */ if (resid != 0 || ehdr->e_ident[EI_MAG2] != ELFMAG2 || - ehdr->e_ident[EI_MAG3] != ELFMAG3) + ehdr->e_ident[EI_MAG3] != ELFMAG3) { return (ENOEXEC); + } if ((ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) || #if defined(_ILP32) || defined(_ELF32_COMPAT) @@ -1141,8 +1277,9 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs, ehdr->e_ident[EI_CLASS] != ELFCLASS64 || #endif !elfheadcheck(ehdr->e_ident[EI_DATA], ehdr->e_machine, - ehdr->e_flags)) + ehdr->e_flags)) { return (EINVAL); + } *nshdrs = ehdr->e_shnum; *shstrndx = ehdr->e_shstrndx; @@ -1162,9 +1299,8 @@ getelfhead(vnode_t *vp, cred_t *credp, Ehdr *ehdr, uint_t *nshdrs, if ((error = vn_rdwr(UIO_READ, vp, (caddr_t)&shdr, sizeof (shdr), (offset_t)ehdr->e_shoff, UIO_SYSSPACE, 0, - (rlim64_t)0, credp, NULL)) != 0) { + (rlim64_t)0, credp, NULL)) != 0) return (error); - } if (*nshdrs == 0) *nshdrs = shdr.sh_size; @@ -1335,7 +1471,7 @@ mapelfexec( size_t *brksize) { Phdr *phdr; - int error, page, prot; + int error, page, prot, lastprot = 0; caddr_t addr = NULL; caddr_t minaddr = (caddr_t)UINTPTR_MAX; uint_t i; @@ -1343,9 +1479,11 @@ mapelfexec( boolean_t ptload = B_FALSE; off_t offset; const uint_t hsize = ehdr->e_phentsize; + uintptr_t lastaddr = 0; extern int use_brk_lpg; if (ehdr->e_type == ET_DYN) { + caddr_t vaddr; secflagset_t flags = 0; /* * Obtain the virtual address of a hole in the @@ -1357,23 +1495,65 @@ mapelfexec( map_addr(&addr, len, (offset_t)0, 1, flags); if (addr == NULL) return (ENOMEM); - *voffset = (intptr_t)addr; /* - * Calculate 
the minimum vaddr so it can be subtracted out. - * According to the ELF specification, since PT_LOAD sections - * must be sorted by increasing p_vaddr values, this is - * guaranteed to be the first PT_LOAD section. + * Despite the fact that mmapobj(2) refuses to load them, we + * need to support executing ET_DYN objects that have a + * non-NULL p_vaddr. When found in the wild, these objects + * are likely to be due to an old (and largely obviated) Linux + * facility, prelink(8), that rewrites shared objects to + * prefer specific (disjoint) virtual address ranges. (Yes, + * this is putatively for performance -- and yes, it has + * limited applicability, many edge conditions and grisly + * failure modes; even for Linux, it's insane.) As ELF + * mandates that the PT_LOAD segments be in p_vaddr order, we + * find the lowest p_vaddr by finding the first PT_LOAD + * segment. */ phdr = (Phdr *)phdrbase; for (i = nphdrs; i > 0; i--) { if (phdr->p_type == PT_LOAD) { - *voffset -= (uintptr_t)phdr->p_vaddr; + addr = (caddr_t)(uintptr_t)phdr->p_vaddr; break; } phdr = (Phdr *)((caddr_t)phdr + hsize); } + /* + * We have a non-zero p_vaddr in the first PT_LOAD segment -- + * presumably because we're directly executing a prelink(8)'d + * ld-linux.so. While we could correctly execute such an + * object without locating it at its desired p_vaddr (it is, + * after all, still relocatable), our inner antiquarian + * derives a perverse pleasure in accommodating the steampunk + * prelink(8) contraption -- goggles on! + */ + if ((vaddr = addr) != NULL) { + if (as_gap(curproc->p_as, len, &addr, &len, + AH_LO, NULL) == -1 || addr != vaddr) { + addr = NULL; + } + } + + if (addr == NULL) { + /* + * We either have a NULL p_vaddr (the common case, by + * many orders of magnitude) or we have a non-NULL + * p_vaddr and we were unable to obtain the specified + * VA range (presumably because it's an illegal + * address). Either way, obtain an address in which + * to map the interpreter. 
+ */ + map_addr(&addr, len, (offset_t)0, 1, 0); + if (addr == NULL) + return (ENOMEM); + } + + /* + * Our voffset is the difference between where we landed and + * where we wanted to be. + */ + *voffset = (uintptr_t)addr - (uintptr_t)vaddr; } else { *voffset = 0; } @@ -1437,6 +1617,41 @@ mapelfexec( if (addr < minaddr) minaddr = addr; + /* + * Segments need not correspond to page boundaries: + * they are permitted to share a page. If two PT_LOAD + * segments share the same page, and the permissions + * of the segments differ, the behavior is historically + * that the permissions of the latter segment are used + * for the page that the two segments share. This is + * also historically a non-issue: binaries generated + * by most anything will make sure that two PT_LOAD + * segments with differing permissions don't actually + * share any pages. However, there exist some crazy + * things out there (including at least an obscure + * Portuguese teaching language called G-Portugol) that + * actually do the wrong thing and expect it to work: + * they have a segment with execute permission share + * a page with a subsequent segment that does not + * have execute permissions and expect the resulting + * shared page to in fact be executable. To accommodate + * such broken link editors, we take advantage of a + * latitude explicitly granted to the loader: it is + * permitted to make _any_ PT_LOAD segment executable + * (provided that it is readable or writable). If we + * see that we're sharing a page and that the previous + * page was executable, we will add execute permissions + * to our segment. 
+ */ + if (btop(lastaddr) == btop((uintptr_t)addr) && + (phdr->p_flags & (PF_R | PF_W)) && + (lastprot & PROT_EXEC)) { + prot |= PROT_EXEC; + } + + lastaddr = (uintptr_t)addr + phdr->p_filesz; + lastprot = prot; + zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz; offset = phdr->p_offset; @@ -1521,8 +1736,22 @@ mapelfexec( break; case PT_INTERP: - if (ptload) - goto bad; + /* + * The ELF specification is unequivocal about the + * PT_INTERP program header with respect to any PT_LOAD + * program header: "If it is present, it must precede + * any loadable segment entry." Linux, however, makes + * no attempt to enforce this -- which has allowed some + * binary editing tools to get away with generating + * invalid ELF binaries in the respect that PT_INTERP + * occurs after the first PT_LOAD program header. This + * is unfortunate (and of course, disappointing) but + * it's no worse than that: there is no reason that we + * can't process the PT_INTERP entry (if present) after + * one or more PT_LOAD entries. We therefore + * deliberately do not check ptload here and always + * store dyphdr to be the PT_INTERP program header. + */ *intphdr = phdr; break; @@ -1629,6 +1858,7 @@ elfnote(vnode_t *vp, offset_t *offsetp, int type, int descsz, void *desc, return (0); } + /* * Copy the section data from one vnode to the section of another vnode. */ @@ -1676,28 +1906,38 @@ elf_copy_scn(elf_core_ctx_t *ctx, const Shdr *src, vnode_t *src_vp, Shdr *dst) } /* + * The design of this check is intentional. + * In particular, we want to capture any sections that begin with '.debug_' for + * a few reasons: + * + * 1) Various revisions to the DWARF spec end up changing the set of section + * headers that exist. This ensures that we don't need to change the kernel + * to get a new version. + * + * 2) Other software uses .debug_ sections for things which aren't DWARF. This + * allows them to be captured as well. 
+ */ +#define IS_DEBUGSECTION(name) (strncmp(name, ".debug_", strlen(".debug_")) == 0) + +/* * Walk sections for a given ELF object, counting (or copying) those of * interest (CTF, symtab, strtab, .debug_*). */ -static int +static uint_t elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, - Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab, uint_t *countp) + Shdr *v, uint_t idx, uint_t remain, shstrtab_t *shstrtab, int *errp) { Ehdr ehdr; const core_content_t content = ctx->ecc_content; cred_t *credp = ctx->ecc_credp; Shdr *ctf = NULL, *symtab = NULL, *strtab = NULL; uintptr_t off = 0; - uint_t nshdrs, shstrndx, nphdrs, count = 0; + uint_t nshdrs, shstrndx, nphdrs, ndebug, count = 0; u_offset_t *doffp = &ctx->ecc_doffset; boolean_t ctf_link = B_FALSE; caddr_t shbase; size_t shsize, shstrsize; char *shstrbase; - int error = 0; - const boolean_t justcounting = v == NULL; - - *countp = 0; if ((content & (CC_CONTENT_CTF | CC_CONTENT_SYMTAB | CC_CONTENT_DEBUG)) == 0) { @@ -1712,6 +1952,7 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, /* Starting at index 1 skips SHT_NULL which is expected at index 0 */ off = ehdr.e_shentsize; + ndebug = 0; for (uint_t i = 1; i < nshdrs; i++, off += ehdr.e_shentsize) { Shdr *shdr, *symchk = NULL, *strchk; const char *name; @@ -1739,51 +1980,8 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, strcmp(name, shstrtab_data[STR_SYMTAB]) == 0) { symchk = shdr; } else if ((content & CC_CONTENT_DEBUG) != 0 && - strncmp(name, ".debug_", strlen(".debug_")) == 0) { - /* - * The design of the above check is intentional. In - * particular, we want to capture any sections that - * begin with '.debug_' for a few reasons: - * - * 1) Various revisions to the DWARF spec end up - * changing the set of section headers that exist. This - * ensures that we don't need to change the kernel to - * get a new version. 
- * - * 2) Other software uses .debug_ sections for things - * which aren't DWARF. This allows them to be captured - * as well. - */ - count++; - - if (!justcounting) { - if (count > remain) { - error = ENOMEM; - goto done; - } - - elf_ctx_resize_scratch(ctx, shdr->sh_size); - - if (!shstrtab_ndx(shstrtab, - name, &v[idx].sh_name)) { - error = ENOMEM; - goto done; - } - - v[idx].sh_addr = (Addr)(uintptr_t)saddr; - v[idx].sh_type = shdr->sh_type; - v[idx].sh_addralign = shdr->sh_addralign; - *doffp = roundup(*doffp, v[idx].sh_addralign); - v[idx].sh_offset = *doffp; - v[idx].sh_size = shdr->sh_size; - v[idx].sh_link = 0; - v[idx].sh_entsize = shdr->sh_entsize; - v[idx].sh_info = shdr->sh_info; - - elf_copy_scn(ctx, shdr, mvp, &v[idx]); - idx++; - } - + IS_DEBUGSECTION(name)) { + ndebug++; continue; } else { continue; @@ -1815,24 +2013,19 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, count += 1; if (symtab != NULL) count += 2; - - if (count > remain) { - count = remain; - if (!justcounting) - error = ENOMEM; + count += ndebug; + if (v == NULL || count == 0 || count > remain) { + count = MIN(count, remain); goto done; } - if (justcounting) - goto done; - /* output CTF section */ if (ctf != NULL) { elf_ctx_resize_scratch(ctx, ctf->sh_size); if (!shstrtab_ndx(shstrtab, shstrtab_data[STR_CTF], &v[idx].sh_name)) { - error = ENOMEM; + *errp = ENOMEM; goto done; } v[idx].sh_addr = (Addr)(uintptr_t)saddr; @@ -1875,12 +2068,12 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, if (!shstrtab_ndx(shstrtab, shstrtab_data[symtab_type], &symtab_name)) { - error = ENOMEM; + *errp = ENOMEM; goto done; } if (!shstrtab_ndx(shstrtab, shstrtab_data[strtab_type], &strtab_name)) { - error = ENOMEM; + *errp = ENOMEM; goto done; } @@ -1915,14 +2108,52 @@ elf_process_obj_scns(elf_core_ctx_t *ctx, vnode_t *mvp, caddr_t saddr, idx++; } + if (ndebug == 0) + goto done; + + /* output DEBUG sections */ + off = 0; + for (uint_t i = 1; i < nshdrs; 
i++, off += ehdr.e_shentsize) { + const char *name; + Shdr *shdr; + + shdr = (Shdr *)(shbase + off); + if (shdr->sh_name >= shstrsize || shdr->sh_type == SHT_NULL) + continue; + + name = shstrbase + shdr->sh_name; + + if (!IS_DEBUGSECTION(name)) + continue; + + elf_ctx_resize_scratch(ctx, shdr->sh_size); + + if (!shstrtab_ndx(shstrtab, name, &v[idx].sh_name)) { + *errp = ENOMEM; + goto done; + } + + v[idx].sh_addr = (Addr)(uintptr_t)saddr; + v[idx].sh_type = shdr->sh_type; + v[idx].sh_addralign = shdr->sh_addralign; + *doffp = roundup(*doffp, v[idx].sh_addralign); + v[idx].sh_offset = *doffp; + v[idx].sh_size = shdr->sh_size; + v[idx].sh_link = 0; + v[idx].sh_entsize = shdr->sh_entsize; + v[idx].sh_info = shdr->sh_info; + + elf_copy_scn(ctx, shdr, mvp, &v[idx]); + idx++; + + if (--ndebug == 0) + break; + } + done: kmem_free(shstrbase, shstrsize); kmem_free(shbase, shsize); - - if (error == 0) - *countp = count; - - return (error); + return (count); } /* @@ -1979,8 +2210,9 @@ elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp) if (seg->s_ops != &segvn_ops || SEGOP_GETVP(seg, seg->s_base, &mvp) != 0 || mvp == lastvp || mvp == NULL || mvp->v_type != VREG || - (segsize = pr_getsegsize(seg, 1)) == 0) + (segsize = pr_getsegsize(seg, 1)) == 0) { continue; + } eaddr = saddr + segsize; prot = pr_getprot(seg, 1, &tmp, &saddr, &naddr, eaddr); @@ -1993,8 +2225,8 @@ elf_process_scns(elf_core_ctx_t *ctx, Shdr *v, uint_t nv, uint_t *nshdrsp) if ((prot & (PROT_WRITE | PROT_EXEC)) != PROT_EXEC) continue; - error = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain, - &shstrtab, &count); + count = elf_process_obj_scns(ctx, mvp, saddr, v, idx, remain, + &shstrtab, &error); if (error != 0) goto done; @@ -2106,8 +2338,9 @@ top: * we overflow the 16 bits allotted to the program header count in * the ELF header, we'll need that program header at index zero. 
*/ - if (nshdrs == 0 && nphdrs >= PN_XNUM) + if (nshdrs == 0 && nphdrs >= PN_XNUM) { nshdrs = 1; + } /* * Allocate a buffer which is sized adequately to hold the ehdr, phdrs @@ -2556,7 +2789,7 @@ static struct modlexec modlexec = { extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, size_t *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action); + int *brand_action); extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig, core_content_t content); diff --git a/usr/src/uts/common/exec/elf/elf_notes.c b/usr/src/uts/common/exec/elf/elf_notes.c index 78305cc076..0a0d405eba 100644 --- a/usr/src/uts/common/exec/elf/elf_notes.c +++ b/usr/src/uts/common/exec/elf/elf_notes.c @@ -347,11 +347,13 @@ write_elfnotes(proc_t *p, int sig, vnode_t *vp, offset_t offset, /* open file table */ + mutex_enter(&p->p_lock); vroot = PTOU(p)->u_rdir; if (vroot == NULL) vroot = rootdir; VN_HOLD(vroot); + mutex_exit(&p->p_lock); fip = P_FINFO(p); diff --git a/usr/src/uts/common/exec/intp/intp.c b/usr/src/uts/common/exec/intp/intp.c index 935bad0a8c..95583a57ce 100644 --- a/usr/src/uts/common/exec/intp/intp.c +++ b/usr/src/uts/common/exec/intp/intp.c @@ -48,6 +48,7 @@ #include <sys/kmem.h> #include <sys/note.h> #include <sys/sdt.h> +#include <sys/brand.h> /* * This is the loadable module wrapper. @@ -55,7 +56,7 @@ #include <sys/modctl.h> extern int intpexec(struct vnode *, struct execa *, struct uarg *, - struct intpdata *, int, size_t *, int, caddr_t, struct cred *, int); + struct intpdata *, int, size_t *, int, caddr_t, struct cred *, int *); static struct execsw esw = { intpmagicstr, @@ -127,13 +128,20 @@ getintphead(struct vnode *vp, struct intpdata *idatap) *cp = '\0'; /* - * Locate the beginning and end of the interpreter name. - * In addition to the name, one additional argument may - * optionally be included here, to be prepended to the - * arguments provided on the command line. 
Thus, for - * example, you can say + * Locate the beginning and end of the interpreter name. Historically, + * for illumos and its predecessors, in addition to the name, one + * additional argument may optionally be included here, to be prepended + * to the arguments provided on the command line. Thus, for example, + * you can say * - * #! /usr/bin/awk -f + * #! /usr/bin/awk -f + * + * However, handling of interpreter arguments varies across operating + * systems and other systems allow more than one argument. In + * particular, Linux allows more than one and delivers all arguments + * as a single string (argv[1] is "-arg1 -arg2 ..."). We support this + * style of argument handling as a brand-specific option (setting + * b_intp_parse_arg to B_FALSE). */ for (cp = &linep[2]; *cp == ' '; cp++) ; @@ -152,9 +160,12 @@ getintphead(struct vnode *vp, struct intpdata *idatap) idatap->intp_arg[0] = NULL; else { idatap->intp_arg[0] = cp; - while (*cp && *cp != ' ') - cp++; - *cp = '\0'; + if (!PROC_IS_BRANDED(curproc) || + BROP(curproc)->b_intp_parse_arg) { + while (*cp && *cp != ' ') + cp++; + *cp = '\0'; + } } } return (0); @@ -189,9 +200,8 @@ intpexec( int setid, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { - _NOTE(ARGUNUSED(brand_action)) vnode_t *nvp; int error = 0; struct intpdata idata; @@ -282,7 +292,7 @@ intpexec( } error = gexec(&nvp, uap, args, &idata, ++level, execsz, exec_file, cred, - EBA_NONE); + brand_action); if (!error) { /* diff --git a/usr/src/uts/common/exec/java/java.c b/usr/src/uts/common/exec/java/java.c index 9b38dec5a0..7e9029fa55 100644 --- a/usr/src/uts/common/exec/java/java.c +++ b/usr/src/uts/common/exec/java/java.c @@ -86,10 +86,10 @@ char *jexec_arg = "-jar"; static int javaexec(vnode_t *vp, struct execa *uap, struct uarg *args, struct intpdata *idatap, int level, size_t *execsz, int setid, - caddr_t execfile, cred_t *cred, int brand_action) + caddr_t execfile, cred_t *cred, int *brand_action) { struct 
intpdata idata; - int error; + int error, eba; ssize_t resid; vnode_t *nvp; off_t xoff, xoff_end; @@ -161,8 +161,9 @@ javaexec(vnode_t *vp, struct execa *uap, struct uarg *args, args->pathname = resolvepn.pn_path; /* don't free resolvepn until we are done with args */ pn_free(&lookpn); - error = gexec(&nvp, uap, args, &idata, level + 1, execsz, execfile, - cred, EBA_NONE); + eba = EBA_NONE; + error = gexec(&nvp, uap, args, &idata, level + 1, execsz, + execfile, cred, &eba); if (!error) { /* diff --git a/usr/src/uts/common/exec/shbin/shbin.c b/usr/src/uts/common/exec/shbin/shbin.c index 25a88b05c1..c1f69ed4b1 100644 --- a/usr/src/uts/common/exec/shbin/shbin.c +++ b/usr/src/uts/common/exec/shbin/shbin.c @@ -59,7 +59,7 @@ shbinexec( int setid, caddr_t exec_file, struct cred *cred, - int brand_action); + int *brand_action); #define SHBIN_CNTL(x) ((x)&037) #define SHBINMAGIC_LEN 4 @@ -161,11 +161,11 @@ shbinexec( int setid, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { _NOTE(ARGUNUSED(brand_action)) vnode_t *nvp; - int error = 0; + int error = 0, eba; struct intpdata idata; struct pathname intppn; struct pathname resolvepn; @@ -245,8 +245,9 @@ shbinexec( args->fname = devfd; } + eba = EBA_NONE; error = gexec(&nvp, uap, args, &idata, ++level, execsz, exec_file, cred, - EBA_NONE); + &eba); if (!error) { /* diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c index a426eeaf10..ce08e3697b 100644 --- a/usr/src/uts/common/fs/dev/sdev_netops.c +++ b/usr/src/uts/common/fs/dev/sdev_netops.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. 
*/ /* @@ -41,8 +42,102 @@ #include <sys/zone.h> #include <sys/dls.h> +static const char *devnet_zpath = "/dev/net/zone/"; struct vnodeops *devnet_vnodeops; +static zoneid_t +devnet_nodetozone(sdev_node_t *dv) +{ + char *zname = NULL, *dup; + zone_t *zone; + int duplen; + zoneid_t zid; + + /* + * If in a non-global zone, always return it's zid no matter what the + * node is. + */ + zid = getzoneid(); + if (zid != GLOBAL_ZONEID) + return (zid); + + /* + * If it doesn't have /dev/net/zone/ then it can't be a specific zone + * we're targetting. + */ + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0) + return (GLOBAL_ZONEID); + + if (dv->sdev_vnode->v_type == VDIR) { + zone = zone_find_by_name(dv->sdev_name); + } else { + /* Non directories have the form /dev/net/zone/%z/%s */ + dup = strdup(dv->sdev_path); + duplen = strlen(dup); + zname = strrchr(dup, '/'); + *zname = '\0'; + zname--; + zname = strrchr(dup, '/'); + zname++; + zone = zone_find_by_name(zname); + kmem_free(dup, duplen + 1); + } + if (zone == NULL) + return (GLOBAL_ZONEID); + zid = zone->zone_id; + zone_rele(zone); + return (zid); +} + +static int +devnet_mkdir(struct sdev_node *ddv, char *name) +{ + sdev_node_t *dv; + struct vattr va; + int ret; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + dv = sdev_cache_lookup(ddv, name); + if (dv != NULL) { + SDEV_SIMPLE_RELE(dv); + return (EEXIST); + } + + va = *sdev_getdefault_attr(VDIR); + gethrestime(&va.va_atime); + va.va_mtime = va.va_atime; + va.va_ctime = va.va_atime; + + ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(dv); + return (0); +} + +/* + * We basically need to walk down the directory path to determine what we should + * do. At the top level of /dev/net, only the directory /dev/net/zone is valid, + * and it is always valid. Following on that, /dev/net/zone/%zonename is valid + * if and only if we can look up that zone name. 
If it's not, or it's some other + * name, then it's SDEV_VTOR_INVALID. + */ +static int +devnet_dirvalidate(struct sdev_node *dv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, dv->sdev_path) == 0) + return (SDEV_VTOR_VALID); + + zonep = zone_find_by_name(dv->sdev_name); + if (zonep == NULL) + return (SDEV_VTOR_INVALID); + zone_rele(zonep); + return (SDEV_VTOR_VALID); +} + /* * Check if a net sdev_node is still valid - i.e. it represents a current * network link. @@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv) ASSERT(dv->sdev_state == SDEV_READY); - if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0) + if (dv->sdev_vnode->v_type == VDIR) + return (devnet_dirvalidate(dv)); + + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) { + ASSERT(SDEV_IS_GLOBAL(dv)); + zoneid = devnet_nodetozone(dv); + } else { + zoneid = getzoneid(); + } + + if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0) return (SDEV_VTOR_INVALID); - if (SDEV_IS_GLOBAL(dv)) + if (zoneid == GLOBAL_ZONEID) return (SDEV_VTOR_VALID); - zoneid = getzoneid(); return (zone_check_datalink(&zoneid, linkid) == 0 ? SDEV_VTOR_VALID : SDEV_VTOR_INVALID); } @@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv) * a net entry when the node is not found in the cache. 
*/ static int -devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp) +devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp, + zoneid_t zid) { timestruc_t now; dev_t dev; int error; - if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) { + if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) { sdcmn_err12(("devnet_create_rvp: not a valid vanity name " "network node: %s\n", nm)); return (error); @@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, struct sdev_node *ddv = VTOSDEV(dvp); struct sdev_node *dv = NULL; dls_dl_handle_t ddh = NULL; + zone_t *zone; struct vattr vattr; int nmlen; int error = ENOENT; @@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, if (SDEVTOV(ddv)->v_type != VDIR) return (ENOTDIR); + if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID) + return (EPERM); + /* * Empty name or ., return node itself. */ @@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, rw_enter(&ddv->sdev_contents, RW_WRITER); /* + * ZOMBIED parent does not allow new node creation, bail out early. + */ + if (ddv->sdev_state == SDEV_ZOMBIE) + goto failed; + + /* * directory cache lookup: */ if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) { @@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, goto found; } + if (SDEV_IS_GLOBAL(ddv)) { + /* + * Check for /dev/net/zone + */ + if (strcmp("zone", nm) == 0 && strcmp("/dev/net", + ddv->sdev_path) == 0) { + (void) devnet_mkdir(ddv, nm); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + + /* + * Check for /dev/net/zone/%z. We can't use devnet_zpath due to + * its trailing slash. 
+ */ + if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) { + zone = zone_find_by_name(nm); + if (zone == NULL) + goto failed; + (void) devnet_mkdir(ddv, nm); + zone_rele(zone); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + } else if (strcmp("/dev/net", ddv->sdev_path) != 0) { + goto failed; + } + /* - * ZOMBIED parent does not allow new node creation, bail out early. + * We didn't find what we were looking for. What that is depends a lot + * on what directory we're in. */ - if (ddv->sdev_state == SDEV_ZOMBIE) - goto failed; - error = devnet_create_rvp(nm, &vattr, &ddh); + error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv)); if (error != 0) goto failed; @@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg) if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL) goto found; - if (devnet_create_rvp(link, &vattr, &ddh) != 0) + if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0) return (0); ASSERT(ddh != NULL); @@ -244,16 +388,77 @@ found: return (0); } +/* + * Fill in all the entries for the current zone. + */ static void -devnet_filldir(struct sdev_node *ddv) +devnet_fillzone(struct sdev_node *ddv, zoneid_t zid) { - sdev_node_t *dv, *next; datalink_id_t linkid; + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + if (zid == GLOBAL_ZONEID) { + ASSERT(SDEV_IS_GLOBAL(ddv)); + linkid = DATALINK_INVALID_LINKID; + do { + linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, + DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); + if (linkid != DATALINK_INVALID_LINKID) + (void) devnet_filldir_datalink(linkid, ddv); + } while (linkid != DATALINK_INVALID_LINKID); + } else { + (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv); + } +} + +/* + * Callback for zone_walk when filling up /dev/net/zone/... 
+ */ +static int +devnet_fillzdir_cb(zone_t *zonep, void *arg) +{ + sdev_node_t *ddv = arg; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + (void) devnet_mkdir(ddv, zonep->zone_name); + return (0); +} + +/* + * Fill in a directory that isn't the top level /dev/net. + */ +static void +devnet_fillzdir(struct sdev_node *ddv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, ddv->sdev_path) == 0) { + (void) zone_walk(devnet_fillzdir_cb, ddv); + return; + } + + zonep = zone_find_by_name(ddv->sdev_name); + if (zonep == NULL) + return; + devnet_fillzone(ddv, zonep->zone_id); + zone_rele(zonep); +} + +static void +devnet_filldir(struct sdev_node *ddv) +{ + int ret; + sdev_node_t *dv, *next; + ASSERT(RW_READ_HELD(&ddv->sdev_contents)); if (rw_tryupgrade(&ddv->sdev_contents) == 0) { rw_exit(&ddv->sdev_contents); rw_enter(&ddv->sdev_contents, RW_WRITER); + if (ddv->sdev_state == SDEV_ZOMBIE) { + rw_exit(&ddv->sdev_contents); + return; + } } for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) { @@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv) if (SDEVTOV(dv)->v_count > 0) continue; + SDEV_HOLD(dv); + + /* + * Clean out everything underneath before we remove ourselves. 
+ */ + if (SDEVTOV(dv)->v_type == VDIR) { + ret = sdev_cleandir(dv, NULL, 0); + ASSERT(ret == 0); + } /* remove the cache node */ (void) sdev_cache_update(ddv, &dv, dv->sdev_name, SDEV_CACHE_DELETE); SDEV_RELE(dv); } + if (strcmp(ddv->sdev_path, "/dev/net") != 0) { + devnet_fillzdir(ddv); + goto done; + } + if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild()) goto done; if (SDEV_IS_GLOBAL(ddv)) { - linkid = DATALINK_INVALID_LINKID; - do { - linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, - DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); - if (linkid != DATALINK_INVALID_LINKID) - (void) devnet_filldir_datalink(linkid, ddv); - } while (linkid != DATALINK_INVALID_LINKID); + devnet_fillzone(ddv, GLOBAL_ZONEID); + (void) devnet_mkdir(ddv, "zone"); } else { - (void) zone_datalink_walk(getzoneid(), - devnet_filldir_datalink, ddv); + devnet_fillzone(ddv, getzoneid()); } ddv->sdev_flags &= ~SDEV_BUILD; - done: rw_downgrade(&ddv->sdev_contents); } @@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, ASSERT(sdvp); + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + if (uiop->uio_offset == 0) devnet_filldir(sdvp); diff --git a/usr/src/uts/common/fs/dev/sdev_vnops.c b/usr/src/uts/common/fs/dev/sdev_vnops.c index 8fe926f6fb..5a00242482 100644 --- a/usr/src/uts/common/fs/dev/sdev_vnops.c +++ b/usr/src/uts/common/fs/dev/sdev_vnops.c @@ -894,6 +894,9 @@ sdev_remove(struct vnode *dvp, char *nm, struct cred *cred, } } + if (error == 0) + i_ddi_di_cache_invalidate(); + return (error); } @@ -1218,6 +1221,7 @@ sdev_symlink(struct vnode *dvp, char *lnm, struct vattr *tva, sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME); if (SDEV_IS_GLOBAL(parent)) atomic_inc_ulong(&parent->sdev_gdir_gen); + i_ddi_di_cache_invalidate(); /* wake up other threads blocked on looking up this node */ mutex_enter(&self->sdev_lookup_lock); @@ -1290,6 +1294,7 @@ sdev_mkdir(struct vnode *dvp, char *nm, struct vattr *va, 
struct vnode **vpp, sdev_update_timestamps(dvp, kcred, AT_MTIME|AT_ATIME); if (SDEV_IS_GLOBAL(parent)) atomic_inc_ulong(&parent->sdev_gdir_gen); + i_ddi_di_cache_invalidate(); /* wake up other threads blocked on looking up this node */ mutex_enter(&self->sdev_lookup_lock); @@ -1405,6 +1410,9 @@ sdev_rmdir(struct vnode *dvp, char *nm, struct vnode *cdir, struct cred *cred, } + if (error == 0) + i_ddi_di_cache_invalidate(); + return (error); } diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c index 8f22ef32f0..e236eb3f72 100644 --- a/usr/src/uts/common/fs/dev/sdev_zvolops.c +++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c @@ -472,8 +472,10 @@ devzvol_create_pool_dirs(struct vnode *dvp) ASSERT(dvp->v_count > 0); rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0, NULL, kcred, NULL, 0, NULL); - /* should either work, or not be visible from a zone */ - ASSERT(rc == 0 || rc == ENOENT); + /* + * should either work or we should get an error if this should + * not be visible from the zone, or disallowed in the zone + */ if (rc == 0) VN_RELE(vp); pools++; diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c index 9f7f284842..769316bb4c 100644 --- a/usr/src/uts/common/fs/fem.c +++ b/usr/src/uts/common/fs/fem.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/types.h> #include <sys/atomic.h> #include <sys/kmem.h> @@ -33,11 +37,12 @@ #include <sys/systm.h> #include <sys/cmn_err.h> #include <sys/debug.h> - #include <sys/fem.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> +#include <sys/stack.h> +#include <sys/archsystm.h> #define NNODES_DEFAULT 8 /* Default number of nodes in a fem_list */ /* @@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1) } #endif +/* + * File event monitoring handoffs + * + * File event monitoring relies on being able to inject stack frames between + * vnode consumers and the underlying file systems. This becomes problematic + * when there exist many monitors, as kernel stack depth is finite. The model + * very much encodes this injected frame: the flow of control deliberately + * lies with the monitor, not with the monitoring system. While we could + * conceivably address this by allowing each subsystem to install at most + * one monitor per vnode (and impose on subsystems that they handle any + * of their own consumer multiplexing internally), this in fact exports a + * substantial amount of run-time complexity to deal with an uncommon case + * (and, it must be said, assumes a small number of consuming subsystems). + * To allow our abstraction to remain clean, we instead check our remaining + * stack in every vnext_*() call; if the amount of stack remaining is lower + * than a threshold (fem_stack_needed), we call thread_splitstack() to carry + * on the execution of the monitors and the underlying vnode operation on a + * split stack. 
Because we can only pass a single argument to our split stack + * function, we must marshal our arguments, the mechanics of which are somewhat + * ornate in terms of the code: to marshal in a type-safe manner, we define a + * baton that is a union of payload structures for each kind of operation, + * loading the per-operation payload explicitly and calling into common handoff + * code that itself calls thread_splitstack(). The function passed to + * thread_splitstack() is a per-entry point function that continues monitor + * processing given the specified (marshalled) arguments. While this method + * is a little verbose to implement, it has the advantage of being relatively + * robust (that is, broadly type-safe) while imposing minimal burden on each + * vnext_*() entry point. + * + * In terms of the implementation: + * + * - The FEM_BATON_n macros define the per-entry point baton structures + * - The fem_baton_payload_t contains the union of these structures + * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point + * - The FEM_VNEXTn macros constitute the per-handoff entry point + * + * Note that we don't use variadic macros -- we define a variant of these + * macros for each of our relevant argument counts. This may seem overly + * explicit, but it is deliberate: the object here is to minimize the + * future maintenance burden by minimizing the likelihood of introduced + * error -- not to minimize the number of characters in this source file. + */ + +#ifndef STACK_GROWTH_DOWN +#error Downward stack growth assumed. 
+#endif + +int fem_stack_toodeep; +uintptr_t fem_stack_needed = 8 * 1024; +size_t fem_handoff_stacksize = 128 * 1024; + +#define FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ + (uintptr_t)curthread->t_stkbase < fem_stack_needed) + +#define FEM_BATON_1(what, t0, l0) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + } fb_##what + +#define FEM_BATON_2(what, t0, l0, t1, l1) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + } fb_##what + +#define FEM_BATON_3(what, t0, l0, t1, l1, t2, l2) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + } fb_##what + +#define FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + } fb_##what + +#define FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + } fb_##what + +#define FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + } fb_##what + +#define FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ 
+ t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + } fb_##what + +#define FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7, t8, l8) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + t8 fb_##what##_##l8; \ + } fb_##what + +typedef union { + FEM_BATON_2(open, int, mode, cred_t *, cr); + FEM_BATON_4(close, int, flag, int, count, + offset_t, offset, cred_t *, cr); + FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_5(ioctl, int, cmd, intptr_t, arg, + int, flag, cred_t *, cr, int *, rvalp); + FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr); + FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr); + FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp, + pathname_t *, pnp, int, flags, vnode_t *, rdir, + cred_t *, cr, int *, direntflags, pathname_t *, realpnp); + FEM_BATON_8(create, char *, name, vattr_t *, vap, + vcexcl_t, excl, int, mode, vnode_t **, vpp, + cred_t *, cr, int, flag, vsecattr_t *, vsecp); + FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags); + FEM_BATON_4(link, vnode_t *, svp, char *, tnm, + cred_t *, cr, int, flags); + FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp, + char *, tnm, cred_t *, cr, int, flags); + FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap, + vnode_t **, vpp, cred_t *, cr, int, flags, + vsecattr_t *, vsecp); + FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir, + cred_t *, cr, int, flags); + FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr, + int *, eofp, int, flags); + FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap, + char *, target, 
cred_t *, cr, int, flags); + FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr); + FEM_BATON_2(fsync, int, syncflag, cred_t *, cr); + FEM_BATON_1(inactive, cred_t *, cr); + FEM_BATON_1(fid, fid_t *, fidp); + FEM_BATON_1(rwlock, int, write_lock); + FEM_BATON_1(rwunlock, int, write_lock); + FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp); + FEM_BATON_1(cmp, vnode_t *, vp2); + FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, struct flk_callback *, flk_cbp, + cred_t *, cr); + FEM_BATON_5(space, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, cred_t *, cr); + FEM_BATON_1(realvp, vnode_t **, vpp); + FEM_BATON_9(getpage, offset_t, off, size_t, len, + uint_t *, protp, struct page **, plarr, size_t, plsz, + struct seg *, seg, caddr_t, addr, enum seg_rw, rw, + cred_t *, cr); + FEM_BATON_4(putpage, offset_t, off, size_t, len, + int, flags, cred_t *, cr); + FEM_BATON_8(map, offset_t, off, struct as *, as, + caddr_t *, addrp, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(addmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(delmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uint_t, prot, + uint_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_4(poll, short, events, int, anyyet, + short *, reventsp, struct pollhead **, phpp); + FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks); + FEM_BATON_3(pathconf, int, cmd, ulong_t *, valp, cred_t *, cr); + FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off, + size_t, io_len, int, flags, cred_t *, cr); + FEM_BATON_2(dumpctl, int, action, offset_t *, blkp); + FEM_BATON_4(dispose, struct page *, pp, int, flag, + int, dn, cred_t *, cr); + FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_4(shrlock, int, 
cmd, struct shrlock *, shr, + int, flag, cred_t *, cr); + FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname); + FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag, + xuio_t *, xuiop, cred_t *, cr); + FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr); +} fem_baton_payload_t; + +typedef struct { + fem_baton_payload_t fb_payload; + int (*fb_func)(); + void (*fb_handoff)(); + int fb_rval; +} fem_baton_t; + +static int +fem_handoff(fem_baton_t *bp) +{ + fem_stack_toodeep++; + thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize); + + return (bp->fb_rval); +} + +#define FEM_VNEXT3_DECL(what, a0, a1, a2) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2); \ +} + +#define FEM_VNEXT4_DECL(what, a0, a1, a2, a3) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3); \ +} + +#define FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4); \ +} + +#define FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5); \ +} + +#define 
FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6); \ +} + +#define FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7); \ +} + +#define FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9); \ +} + +#define FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + 
bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9, \ + bp->fb_payload.fb_##what.fb_##what##_##a10); \ +} + +#define FEM_VNEXT3(what, func, a0, a1, a2) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2)) + +#define FEM_VNEXT4(what, func, a0, a1, a2, a3) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3)) + +#define FEM_VNEXT5(what, func, a0, a1, a2, a3, a4) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = 
func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4)) + +#define FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5)) + +#define FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6)) + +#define FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7)) + +#define FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)) + +#define FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_payload.fb_##what.fb_##what##_##a10 = a10; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)) + static fem_t * fem_alloc() { @@ -2040,10 +2575,60 @@ static struct fs_operation_def fshead_vfs_spec[] = { * 5. Return by invoking the base operation with the base object. * * for each classification, there needs to be at least one "next" operation - * for each "head"operation. - * + * for each "head" operation. Note that we also use the FEM_VNEXTn_DECL macros + * to define the function to run when the stack is split; see the discussion + * on "File event monitoring handoffs", above. 
*/ +FEM_VNEXT4_DECL(open, arg0, mode, cr, ct) +FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct) +FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct) +FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct) +FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct) +FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp) +FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp) +FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags) +FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags) +FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags) +FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp) +FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags) +FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags) +FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags) +FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct) +FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct) +FEM_VNEXT3_DECL(fid, arg0, fidp, ct) +FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct) +FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct) +FEM_VNEXT3_DECL(cmp, arg0, vp2, ct) +FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct) +FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct) +FEM_VNEXT3_DECL(realvp, arg0, vpp, ct) +FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz, + seg, addr, rw, cr, ct) +FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct) +FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct) +FEM_VNEXT5_DECL(dump, arg0, 
addr, lbdn, dblks, ct) +FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct) +FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct) +FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct) +FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct) +FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct) +FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct) +FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct) + int vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) { @@ -2055,7 +2640,7 @@ vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_open, femop_open); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, cr, ct)); + FEM_VNEXT4(open, func, arg0, mode, cr, ct); } int @@ -2070,7 +2655,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_close, femop_close); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, flag, count, offset, cr, ct)); + FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct); } int @@ -2085,7 +2670,7 @@ vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_read, femop_read); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct); } int @@ -2100,7 +2685,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_write, femop_write); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct); } int @@ -2115,7 +2700,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return 
((*func)(arg0, cmd, arg, flag, cr, rvalp, ct)); + FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct); } int @@ -2130,7 +2715,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, oflags, nflags, cr, ct)); + FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct); } int @@ -2145,7 +2730,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct); } int @@ -2160,7 +2745,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct); } int @@ -2175,7 +2760,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_access, femop_access); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, flags, cr, ct)); + FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct); } int @@ -2191,8 +2776,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp, vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct, - direntflags, realpnp)); + FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct, + direntflags, realpnp); } int @@ -2208,7 +2793,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, vsop_find(vf, &func, int, &arg0, vop_create, femop_create); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)); + FEM_VNEXT10(create, func, arg0, name, 
vap, excl, + mode, vpp, cr, flag, ct, vsecp); } int @@ -2223,7 +2809,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cr, ct, flags)); + FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags); } int @@ -2238,7 +2824,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_link, femop_link); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, svp, tnm, cr, ct, flags)); + FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags); } int @@ -2253,7 +2839,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rename, femop_rename); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags)); + FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags); } int @@ -2268,7 +2854,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp, vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp)); + FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp); } int @@ -2283,7 +2869,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cdir, cr, ct, flags)); + FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags); } int @@ -2298,7 +2884,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, eofp, ct, flags)); + FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags); } int @@ -2313,7 +2899,7 @@ vnext_symlink(femarg_t *vf, char 
*linkname, vattr_t *vap, char *target, vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, linkname, vap, target, cr, ct, flags)); + FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags); } int @@ -2327,7 +2913,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, ct)); + FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct); } int @@ -2341,7 +2927,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, syncflag, cr, ct)); + FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct); } void @@ -2369,7 +2955,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, fidp, ct)); + FEM_VNEXT3(fid, func, arg0, fidp, ct); } int @@ -2383,7 +2969,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, write_lock, ct)); + FEM_VNEXT3(rwlock, func, arg0, write_lock, ct); } void @@ -2411,7 +2997,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ooff, noffp, ct)); + FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct); } int @@ -2425,7 +3011,7 @@ vnext_cmp(femarg_t *vf, vnode_t *vp2, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vp2, ct)); + FEM_VNEXT3(cmp, func, arg0, vp2, ct); } 
int @@ -2441,7 +3027,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)); + FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct); } int @@ -2456,7 +3042,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_space, femop_space); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct)); + FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct); } int @@ -2470,7 +3056,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vpp, ct)); + FEM_VNEXT3(realvp, func, arg0, vpp, ct); } int @@ -2486,8 +3072,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp, vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw, - cr, ct)); + FEM_VNEXT11(getpage, func, arg0, off, len, protp, + plarr, plsz, seg, addr, rw, cr, ct); } int @@ -2502,7 +3088,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags, vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, flags, cr, ct)); + FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct); } int @@ -2518,8 +3104,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp, vsop_find(vf, &func, int, &arg0, vop_map, femop_map); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags, + cr, ct); } int @@ -2535,8 
+3121,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2552,8 +3138,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2568,7 +3154,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp, vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, events, anyyet, reventsp, phpp, ct)); + FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct); } int @@ -2583,7 +3169,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks, vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, addr, lbdn, dblks, ct)); + FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct); } int @@ -2598,7 +3184,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, valp, cr, ct)); + FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct); } int @@ -2613,7 +3199,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off, vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct)); + FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct); } int @@ -2627,7 +3213,7 @@ 
vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, action, blkp, ct)); + FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct); } void @@ -2657,7 +3243,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2672,7 +3258,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2687,7 +3273,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag, vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, shr, flag, cr, ct)); + FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct); } int @@ -2702,7 +3288,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname, vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vnevent, dvp, cname, ct)); + FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct); } int @@ -2717,7 +3303,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ioflag, xuiop, cr, ct)); + FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct); } int @@ -2731,7 +3317,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, 
&arg0, vop_retzcbuf, femop_retzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, xuiop, cr, ct)); + FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct); } int diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c index 6e56000ffe..a908f91267 100644 --- a/usr/src/uts/common/fs/fifofs/fifosubr.c +++ b/usr/src/uts/common/fs/fifofs/fifosubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. */ /* @@ -61,7 +62,6 @@ #if FIFODEBUG int Fifo_fastmode = 1; /* pipes/fifos will be opened in fast mode */ int Fifo_verbose = 0; /* msg when switching out of fast mode */ -int Fifohiwat = FIFOHIWAT; /* Modifiable FIFO high water mark */ #endif /* @@ -196,6 +196,7 @@ fnode_constructor(void *buf, void *cdrarg, int kmflags) fnp->fn_dest = fnp; fnp->fn_mp = NULL; fnp->fn_count = 0; + fnp->fn_hiwat = FIFOHIWAT; fnp->fn_rsynccnt = 0; fnp->fn_wsynccnt = 0; fnp->fn_wwaitcnt = 0; @@ -388,11 +389,7 @@ fifoinit(int fstype, char *name) pipe_constructor, pipe_destructor, NULL, (void *)(sizeof (fifodata_t)), NULL, 0); -#if FIFODEBUG - if (Fifohiwat < FIFOHIWAT) - Fifohiwat = FIFOHIWAT; -#endif /* FIFODEBUG */ - fifo_strdata.qi_minfo->mi_hiwat = Fifohiwat; + fifo_strdata.qi_minfo->mi_hiwat = FIFOHIWAT; return (0); } @@ -614,9 +611,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld) /* * The other end of the pipe is almost closed so * reject any other open on this end of the pipe - * This only happens with a pipe mounted under namefs + * This normally only happens with a pipe mounted under namefs, but + * we can also see an open via proc/fd, which should still succeed. + * To indicate the proc/fd case the FKLYR flag is passed. 
*/ - if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) { + if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) && + (flag & FKLYR) == 0) { fifo_cleanup(oldvp, flag); cv_broadcast(&fnp->fn_wait_cv); if (!lockheld) @@ -1161,7 +1161,8 @@ fifo_wakewriter(fifonode_t *fn_dest, fifolock_t *fn_lock) int fn_dflag = fn_dest->fn_flag; ASSERT(MUTEX_HELD(&fn_lock->flk_lock)); - ASSERT(fn_dest->fn_dest->fn_count < Fifohiwat); + ASSERT(fn_dest->fn_dest->fn_count < fn_dest->fn_dest->fn_hiwat); + if ((fn_dflag & FIFOWANTW)) { cv_broadcast(&fn_dest->fn_wait_cv); } diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index c1b4652633..ceec9bd012 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -28,7 +28,7 @@ */ /* - * Copyright 2015, Joyent, Inc. + * Copyright 2017, Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. */ @@ -104,10 +104,6 @@ static int fifo_setsecattr(struct vnode *, vsecattr_t *, int, struct cred *, static int fifo_getsecattr(struct vnode *, vsecattr_t *, int, struct cred *, caller_context_t *); -/* functions local to this file */ -static boolean_t fifo_stayfast_enter(fifonode_t *); -static void fifo_stayfast_exit(fifonode_t *); - /* * Define the data structures external to this file. */ @@ -645,7 +641,7 @@ fifo_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *crp, * (3) write-only FIFO with no data * (4) no data and FNDELAY flag is set. * Otherwise return - * EAGAIN if FNONBLOCK is set and no data to read + * EAGAIN if FNONBLOCK is set and no data to read or FIFORDBLOCK is set * EINTR if signal received while waiting for data * * While there is no data to read.... 
@@ -681,7 +677,7 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp, * Check for data on our input queue */ - while (fnp->fn_count == 0) { + while (fnp->fn_count == 0 || (fnp->fn_flag & FIFORDBLOCK) != 0) { /* * No data on first attempt and no writer, then EOF */ @@ -731,6 +727,7 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp, } ASSERT(fnp->fn_mp != NULL); + VERIFY((fnp->fn_flag & FIFORDBLOCK) == 0); /* For pipes copy should not bypass cache */ uiop->uio_extflg |= UIO_COPY_CACHED; @@ -772,6 +769,18 @@ fifo_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *crp, &fn_lock->flk_lock)) goto trywake; + /* + * If another thread snuck in and started to + * consume data using read-blocking out of + * the pipe while we were blocked in the + * cv_wait, then since we have already consumed + * some of the data out of the pipe we need + * to return with a short read. + */ + if ((fnp->fn_flag & FIFORDBLOCK) != 0) { + goto trywake; + } + if (!(fnp->fn_flag & FIFOFAST)) goto stream_mode; } @@ -787,11 +796,11 @@ trywake: /* * wake up any blocked writers, processes * sleeping on POLLWRNORM, or processes waiting for SIGPOLL - * Note: checking for fn_count < Fifohiwat emulates + * Note: checking for fn_count < fn_hiwat emulates * STREAMS functionality when low water mark is 0 */ if (fn_dest->fn_flag & (FIFOWANTW | FIFOHIWATW) && - fnp->fn_count < Fifohiwat) { + fnp->fn_count < fn_dest->fn_hiwat) { fifo_wakewriter(fn_dest, fn_lock); } goto done; @@ -904,7 +913,7 @@ fifo_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *crp, /* * check to make sure we are not over high water mark */ - while (fn_dest->fn_count >= Fifohiwat) { + while (fn_dest->fn_count >= fn_dest->fn_hiwat) { /* * Indicate that we have gone over high * water mark @@ -962,7 +971,7 @@ fifo_write(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *crp, * then we must break the message up into PIPE_BUF * chunks to stay compliant with STREAMS */ - if 
(uiop->uio_resid + fn_dest->fn_count > Fifohiwat) + if (uiop->uio_resid + fn_dest->fn_count > fn_dest->fn_hiwat) size = MIN(uiop->uio_resid, PIPE_BUF); else size = uiop->uio_resid; @@ -1213,7 +1222,8 @@ fifo_fastioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr, if (arg != 0) { goto turn_fastoff; } - *rvalp = (fnp->fn_dest->fn_count < Fifohiwat) ? 1 : 0; + *rvalp = (fnp->fn_dest->fn_count < fnp->fn_dest->fn_hiwat) ? + 1 : 0; mutex_exit(&fn_lock->flk_lock); return (0); @@ -1827,7 +1837,7 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp, retevents = POLLHUP; } else if (events & (POLLWRNORM | POLLWRBAND)) { if (events & POLLWRNORM) { - if (fn_dest->fn_count < Fifohiwat) + if (fn_dest->fn_count < fn_dest->fn_hiwat) retevents = POLLWRNORM; else fnp->fn_flag |= FIFOHIWATW; @@ -1996,7 +2006,7 @@ fifo_getsecattr(struct vnode *vp, vsecattr_t *vsap, int flag, struct cred *crp, * the lock. * If the fifo switches into stream mode while we are waiting, return failure. */ -static boolean_t +boolean_t fifo_stayfast_enter(fifonode_t *fnp) { ASSERT(MUTEX_HELD(&fnp->fn_lock->flk_lock)); @@ -2018,7 +2028,7 @@ fifo_stayfast_enter(fifonode_t *fnp) * - threads wanting to turn into stream mode waiting in fifo_fastoff(), * - other writers threads waiting in fifo_stayfast_enter(). */ -static void +void fifo_stayfast_exit(fifonode_t *fnp) { fifonode_t *fn_dest = fnp->fn_dest; diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..cc03f41c8d --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 + +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) + +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} + +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + 
h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list. */ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} + +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched + * + * On success *foundtp points to the found hlnode with its vnode held. + */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. 
hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it. + */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' and 'hp' into directory 'dir' + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. */ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. 
*/ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made. */ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . or .. */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. + */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list. 
*/ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + kmem_free(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' entries without + * checking perms and locking + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP); + + /* Initialize the entries */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list. 
*/ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. + */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. + */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + kmem_free(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} + +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS. 
*/ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = kmem_zalloc(alloc_size, KM_NOSLEEP_LAZY); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. */ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent. 
+ */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} + +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit. 
+ */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..1d857309f3 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. 
+ */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..bf80da6dbe --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,613 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. + * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. 
+ * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. 
+ */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for its data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. 
+ */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if it's not patched. 
+ */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if ((error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, + &dpn)) != 0) + goto out; + + if ((hm = kmem_zalloc(sizeof (hlfsmount_t), KM_NOSLEEP_LAZY)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. 
+ */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. + */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. 
+ */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. 
*/ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. + */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + kmem_free(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. 
+ */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..52dba31761 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1450 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single add/rm ioctl call. 
This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + if (vp->v_type == VDIR) + return (EISDIR); + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + uint_t len, cnt; + int i, error; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen >= sizeof (nm)) { + kmem_free(e, len); + return (EINVAL); + } + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen >= sizeof (path)) { + kmem_free(e, len); + return (EINVAL); + } + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen >= sizeof (nm)) { + kmem_free(e32, len); + return (EINVAL); + } + + if (copyin((void *)(unsigned long) + 
e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen >= sizeof (path)) { + kmem_free(e32, len); + return (EINVAL); + } + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + vattr_t tmp_va; + + if (tp->hln_looped == 1) { + int error; + + if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr, + ct)) != 0) + return (error); + } + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = (u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + if (tp->hln_looped == 1) { + vap->va_nblocks = tmp_va.va_nblocks; + } else { + vap->va_nblocks = + (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + } + mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int 
+hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + 
/* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. 
+ */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* + * Get vnode for the real file/dir. We'll have a hold on realvp which + * we won't vn_rele until hyprlofs_inactive. + */ + if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, + &realvp)) != 0) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) { + VN_RELE(realvp); + return (ENODEV); + } + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. 
This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if ((error = traverse(&realvp)) != 0) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. + */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') { + VN_RELE(realvp); + return (EINVAL); + } + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) { + VN_RELE(realvp); + return (EINVAL); + } + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) { + VN_RELE(realvp); + return (error); + } + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') { + VN_RELE(realvp); + return (EINVAL); + } + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* Make the vnode */ + error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct); + if (error != 0) + VN_RELE(realvp); + return (error); +} + +/* + * Remove a looped in file from the namespace. 
+ */
+/*
+ * fsname is a '/'-separated path relative to dvp.  It must not be absolute,
+ * must not contain empty or ".." components, and must not end in '/'.  The
+ * string is modified in place: every '/' is overwritten with a NUL as the
+ * path is walked.
+ *
+ * hyprlofs_dirlookup() hands back each intermediate directory with a hold on
+ * it.  That hold is tracked in 'held' and dropped as soon as we step past the
+ * directory, and on every exit path, so that no hold is leaked.
+ *
+ * Returns 0 on success, EINVAL for a malformed path, or the error from
+ * hyprlofs_dirlookup()/hyprlofs_remove().
+ */
+static int
+hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	int error = 0;
+	char *p, *pnm;
+	hlnode_t *parent;
+	hlnode_t *fndtp;
+	hlnode_t *held = NULL;	/* intermediate dir we currently hold */
+
+	pnm = p = fsname;
+
+	/* path cannot be absolute */
+	if (*p == '/')
+		return (EINVAL);
+
+	/*
+	 * If the target name is a path, get the containing dir and simple
+	 * file name.
+	 */
+	parent = (hlnode_t *)VTOHLN(dvp);
+	for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) {
+		*p = '\0';
+
+		/* Path component cannot be empty or relative */
+		if (pnm[0] == '\0' ||
+		    (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) {
+			error = EINVAL;
+			goto out;
+		}
+
+		/* This holds the fndtp vnode */
+		if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0)
+			goto out;
+
+		/* Finished with the previous directory; keep only the new one */
+		if (held != NULL) {
+			hlnode_rele(held);
+		}
+		held = fndtp;
+
+		dvp = HLNTOV(fndtp);
+		parent = fndtp;
+		pnm = p + 1;
+	}
+
+	/* The file name is required */
+	if (pnm[0] == '\0') {
+		error = EINVAL;
+		goto out;
+	}
+
+	/* Remove the entry from the parent dir */
+	error = hyprlofs_remove(dvp, pnm, cr, ct, flags);
+
+out:
+	/* Drop the hold on the containing directory, if we descended at all */
+	if (held != NULL) {
+		hlnode_rele(held);
+	}
+	return (error);
+}
+
+/*
+ * Remove all looped in files from the namespace.
+ */
+static int
+hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	int error = 0;
+	hlnode_t *hp = (hlnode_t *)VTOHLN(dvp);
+	hldirent_t *hdp;
+
+	hlnode_hold(hp);
+
+	/*
+	 * There's a window here where someone could have removed
+	 * all the entries in the directory after we put a hold on the
+	 * vnode but before we grabbed the rwlock. Just return.
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. + */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, uint_t *pcnt, uint_t n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + uint_t cnt; + uint_t len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. 
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == vn_vpath_empty) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
+ */
+/*
+ * 'data' is the user address of a hyprlofs_curr_entries_t (or the 32-bit
+ * variant, chosen by the caller's data model).  On entry hce_cnt is the
+ * capacity of the user's hce_entries array; on return it is overwritten with
+ * the count reported by hyprlofs_get_all_entries().
+ *
+ * The structure read by copyin() is the one written back by copyout(), with
+ * only hce_cnt changed.  Copying out a fresh, uninitialized structure would
+ * overwrite the caller's hce_entries pointer and expose kernel stack contents.
+ *
+ * Returns 0, E2BIG (count still reported), EFAULT on a bad user address,
+ * EINVAL if hce_cnt exceeds MAX_IOCTL_PARAMS, or the error from the walk.
+ */
+static int
+hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	uint_t limit, cnt;
+	int error;
+	model_t model;
+	hyprlofs_curr_entry_t *e;
+	hyprlofs_curr_entries_t ebuf;
+	hyprlofs_curr_entries32_t ebuf32;
+
+	model = get_udatamodel();
+
+	if (model == DATAMODEL_NATIVE) {
+		if (copyin((void *)data, &ebuf, sizeof (ebuf)))
+			return (EFAULT);
+		limit = ebuf.hce_cnt;
+		e = ebuf.hce_entries;
+	} else {
+		if (copyin((void *)data, &ebuf32, sizeof (ebuf32)))
+			return (EFAULT);
+
+		limit = ebuf32.hce_cnt;
+		e = (hyprlofs_curr_entry_t *)(unsigned long)
+		    (ebuf32.hce_entries);
+	}
+
+	if (limit > MAX_IOCTL_PARAMS)
+		return (EINVAL);
+
+	cnt = 0;
+	error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct,
+	    flags);
+
+	if (error == 0 || error == E2BIG) {
+		/* Report the count; every other field goes back unchanged */
+		if (model == DATAMODEL_NATIVE) {
+			ebuf.hce_cnt = cnt;
+			if (copyout(&ebuf, (void *)data, sizeof (ebuf)))
+				return (EFAULT);
+		} else {
+			ebuf32.hce_cnt = cnt;
+			if (copyout(&ebuf32, (void *)data, sizeof (ebuf32)))
+				return (EFAULT);
+		}
+	}
+
+	return (error);
+}
+
+/* ARGSUSED3 */
+static int
+hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	hlnode_t *parent = (hlnode_t *)VTOHLN(dvp);
+	int error;
+	hlnode_t *hp = NULL;
+
+	/* This holds the hp vnode */
+	error = hyprlofs_dirlookup(parent, nm, &hp, cr);
+	if (error)
+		return (error);
+
+	ASSERT(hp);
+	rw_enter(&parent->hln_rwlock, RW_WRITER);
+	rw_enter(&hp->hln_rwlock, RW_WRITER);
+
+	error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr);
+
+	rw_exit(&hp->hln_rwlock);
+	rw_exit(&parent->hln_rwlock);
+	vnevent_remove(HLNTOV(hp), dvp, nm, ct);
+
+	/*
+	 * We've now dropped the dir link so by rele-ing our vnode we should
+	 * clean up in hyprlofs_inactive.
+ */ + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." + */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. 
+ */ + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + ulong_t outcount = 0; + ulong_t bufsize; + size_t reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. 
*/ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. + */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. 
+ */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* release hold on the real vnode now */ + if (hp->hln_looped == 1 && hp->hln_realvp != NULL) + VN_RELE(hp->hln_realvp); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + kmem_free(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if 
(VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? 
EINVAL : 0); + + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { 
.vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c index 2e48a21150..71e2aeb48b 100644 --- a/usr/src/uts/common/fs/lookup.c +++ b/usr/src/uts/common/fs/lookup.c @@ -58,6 +58,7 @@ #include <sys/zone.h> #include <sys/dnlc.h> #include <sys/fs/snode.h> +#include <sys/brand.h> /* Controls whether paths are stored with vnodes. */ int vfs_vnode_path = 1; @@ -244,6 +245,9 @@ lookuppnvp( pp = &presrvd; } + if (flags & __FLXNOAUTO) + lookup_flags |= __FLXNOAUTO; + if (auditing) audit_anchorpath(pnp, vp == rootvp); diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..e19281fc15 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +int lxpr_bufsize = 4000; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lxpr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = 
(off_t)offset;
+}
+
+/*
+ * Record an error on the uiobuf.  May only be called while no error is set.
+ */
+void
+lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err)
+{
+	ASSERT(uiobuf->error == 0);
+
+	uiobuf->error = err;
+}
+
+/*
+ * Push the buffered bytes out through the uio.  Only the part of the buffer
+ * at or beyond the uio's current offset is moved; 'beg' is then advanced by
+ * the number of bytes that were buffered and the buffer is reset to empty.
+ * Returns the uiobuf's (possibly newly set) error.
+ */
+int
+lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf)
+{
+	off_t off = uiobuf->uiop->uio_offset;
+	caddr_t uaddr = uiobuf->buffer;
+	size_t beg = uiobuf->beg;
+	size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+	if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		ASSERT(off >= beg);
+
+		if (beg + size > off && off >= 0)
+			uiobuf->error =
+			    uiomove(uaddr + (off - beg), size - (off - beg),
+			    UIO_READ, uiobuf->uiop);
+
+		uiobuf->beg += size;
+	}
+
+	uiobuf->pos = uaddr;
+
+	return (uiobuf->error);
+}
+
+/*
+ * Append 'size' bytes from 'buf' to the uiobuf, flushing whenever the
+ * internal buffer fills.  Output is silently dropped once an error has been
+ * recorded or the uio has no residual space left.
+ */
+void
+lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size)
+{
+	/* While we can still carry on */
+	while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		uintptr_t remain = (uintptr_t)uiobuf->buffsize -
+		    ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer);
+
+		/* Enough space in buffer? */
+		if (remain >= size) {
+			bcopy(buf, uiobuf->pos, size);
+			uiobuf->pos += size;
+			return;
+		}
+
+		/* Not enough space, so copy all we can and try again */
+		bcopy(buf, uiobuf->pos, remain);
+		uiobuf->pos += remain;
+		(void) lxpr_uiobuf_flush(uiobuf);
+		buf += remain;
+		size -= remain;
+	}
+}
+
+#define	TYPBUFFSIZE	256
+
+/*
+ * printf-style formatting into the uiobuf.  The output is first formatted
+ * into a TYPBUFFSIZE-byte stack buffer; if it does not fit, a buffer of
+ * exactly the required size is allocated and the format is run again.
+ *
+ * A va_list is indeterminate once it has been handed to vsnprintf(), so the
+ * argument list is re-initialized with va_start() before the second pass
+ * rather than reusing the one already consumed by the first.
+ */
+void
+lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...)
+{
+	va_list args;
+	char buff[TYPBUFFSIZE];
+	int len;
+	char *buffer;
+
+	/* Can we still do any output */
+	if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0)
+		return;
+
+	/* Try using stack allocated buffer */
+	va_start(args, fmt);
+	len = vsnprintf(buff, TYPBUFFSIZE, fmt, args);
+	va_end(args);
+
+	if (len < TYPBUFFSIZE) {
+		lxpr_uiobuf_write(uiobuf, buff, len);
+		return;
+	}
+
+	/* Not enough space in pre-allocated buffer */
+	buffer = kmem_alloc(len + 1, KM_SLEEP);
+
+	/*
+	 * We know we allocated the correct amount of space
+	 * so no check on the return value
+	 */
+	va_start(args, fmt);
+	(void) vsnprintf(buffer, len + 1, fmt, args);
+	va_end(args);
+
+	lxpr_uiobuf_write(uiobuf, buffer, len);
+	kmem_free(buffer, len + 1);
+}
+
+/*
+ * lxpr_lock():
+ *
+ * Lookup process from pid and return with p_lock and P_PR_LOCK held.
+ */
+proc_t *
+lxpr_lock(pid_t pid)
+{
+	proc_t *p;
+	kmutex_t *mp;
+
+	ASSERT(!MUTEX_HELD(&pidlock));
+
+	for (;;) {
+		mutex_enter(&pidlock);
+
+		/*
+		 * If the pid is 1, we really want the zone's init process
+		 */
+		p = prfind((pid == 1) ?
+		    curproc->p_zone->zone_proc_initpid : pid);
+
+		if (p == NULL || p->p_stat == SIDL) {
+			mutex_exit(&pidlock);
+			return (NULL);
+		}
+
+		/*
+		 * p_lock is persistent, but p itself is not -- it could
+		 * vanish during cv_wait(). Load p->p_lock now so we can
+		 * drop it after cv_wait() without referencing p.
+		 */
+		mp = &p->p_lock;
+		mutex_enter(mp);
+
+		mutex_exit(&pidlock);
+
+		if (p->p_flag & SEXITING) {
+			/*
+			 * This process is exiting -- let it go.
+ */ + mutex_exit(mp); + return (NULL); + } + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + 
*/ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ? 1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transisions from SRUN to SZOMB + * while p_lock is held. 
Aside from this, the only other + * p_stat transition that we need to be aware about is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + lxpnp->lxpr_realvp = NULL; + } else { + ASSERT(MUTEX_HELD(&p->p_lock)); + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. see locking comment above */ + if (p->p_stat == SZOMB || (p->p_flag & SEXITING) != 0) { + lxpnp->lxpr_realvp = NULL; + } else { + ASSERT(MUTEX_HELD(&p->p_lock)); + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! 
*/ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode(). + */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = 
(caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. + */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, 
vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..60b3d52f09 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support the LX brand with a Linux /proc entirely compatible with the Linux + * world view; the other -- this one -- is to support native (but Linux-borne) + * programs that wish to view the native system via the Linux /proc model. So + * the aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). 
However, it is not + * intended to exactly mimic Linux semantics; when choosing between offering + * compatibility and telling the truth, we emphatically pick the truth. A + * particular glaring example of this is the Linux notion of "tasks" (that is, + * threads), which -- due to historical misadventures on Linux -- allocate their + * identifiers from the process identifier space. (That is, each thread has in + * effect a pid.) Some Linux /proc readers have come to depend on this + * attribute, and become confused when threads appear with proper identifiers, + * so we simply opt for the pre-2.6 behavior, and do not present the tasks + * directory at all. Similarly, when choosing between offering compatibility + * and remaining consistent with our broader security model, we (obviously) + * choose security over compatibility. In short, this is meant to be a best + * effort -- no more -- and as such, it should not be unified with the much + * more complete Linux /proc implementation found in the LX brand. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lxproc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. 
+ * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, 
lxpr_uiobuf_t *, ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, 
lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lxproc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + +/* + * file contents of an lxproc directory. + */ +static lxpr_dirent_t lxpr_dir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0])) + +/* + * Contents of an /lxproc/<pid> directory. 
+ */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of /lxproc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * These are the major signal number differences between Linux and native: + * + * ==================================== + * | Number | Linux | Native | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * 
==================================== + * + * Not every Linux signal maps to a native signal, nor does every native + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. + */ +static int +lxpr_sigmap[NSIG] = { + 0, + LX_SIGHUP, + LX_SIGINT, + LX_SIGQUIT, + LX_SIGILL, + LX_SIGTRAP, + LX_SIGABRT, + LX_SIGSTKFLT, + LX_SIGFPE, + LX_SIGKILL, + LX_SIGBUS, + LX_SIGSEGV, + LX_SIGSYS, + LX_SIGPIPE, + LX_SIGALRM, + LX_SIGTERM, + LX_SIGUSR1, + LX_SIGUSR2, + LX_SIGCHLD, + LX_SIGPWR, + LX_SIGWINCH, + LX_SIGURG, + LX_SIGPOLL, + LX_SIGSTOP, + LX_SIGTSTP, + LX_SIGCONT, + LX_SIGTTIN, + LX_SIGTTOU, + LX_SIGVTALRM, + LX_SIGPROF, + LX_SIGXCPU, + LX_SIGXFSZ, + -1, /* 32: illumos SIGWAITING */ + -1, /* 33: illumos SIGLWP */ + -1, /* 34: illumos SIGFREEZE */ + -1, /* 35: illumos SIGTHAW */ + -1, /* 36: illumos SIGCANCEL */ + -1, /* 37: illumos SIGLOST */ + -1, /* 38: illumos SIGXRES */ + -1, /* 39: illumos SIGJVM1 */ + -1, /* 40: illumos SIGJVM2 */ + -1, /* 41: illumos SIGINFO */ + LX_SIGRTMIN, /* 42: illumos _SIGRTMIN */ + LX_SIGRTMIN + 1, + LX_SIGRTMIN + 2, + LX_SIGRTMIN + 3, + LX_SIGRTMIN + 4, + LX_SIGRTMIN + 5, + LX_SIGRTMIN + 6, + LX_SIGRTMIN + 7, + LX_SIGRTMIN + 8, + LX_SIGRTMIN + 9, + LX_SIGRTMIN + 10, + LX_SIGRTMIN + 11, + LX_SIGRTMIN + 12, + LX_SIGRTMIN + 13, + LX_SIGRTMIN + 14, + LX_SIGRTMIN + 15, + LX_SIGRTMIN + 16, + LX_SIGRTMIN + 17, + LX_SIGRTMIN + 18, + LX_SIGRTMIN + 19, + LX_SIGRTMIN + 20, + LX_SIGRTMIN + 21, + LX_SIGRTMIN + 22, + LX_SIGRTMIN + 23, + LX_SIGRTMIN + 24, + LX_SIGRTMIN + 25, + LX_SIGRTMIN + 26, + LX_SIGRTMIN + 27, + LX_SIGRTMIN + 28, + LX_SIGRTMIN + 29, + LX_SIGRTMIN + 30, + LX_SIGRTMAX +}; + +/* + * lxpr_open(): Vnode operation for VOP_OPEN() + */ +static int +lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + vnode_t *rvp; + int error = 0; + + /* + * We only allow reading in this file systrem + 
*/ + if (flag & FWRITE) + return (EROFS); + + /* + * If we are opening an underlying file only allow regular files + * reject the open for anything but a regular file. + * Just do it if we are opening the current or root directory. + */ + if (lxpnp->lxpr_realvp != NULL) { + rvp = lxpnp->lxpr_realvp; + + if (type == LXPR_PID_FD_FD && rvp->v_type != VREG) + error = EACCES; + else { + /* + * Need to hold rvp since VOP_OPEN() may release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* 
/proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by /lxproc file type. 
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* 
/proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. + */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + 
lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. 
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + if ((seg->s_flags & S_HOLE) != 0) { + continue; + } + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + 
*print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, RW_READER); + vsize = btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; 
+ int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & (SSYS | SEXITING)) && + (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(&current); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(&current, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread 
basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal 
*/ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... + */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t 
*uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. 
+ */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + ulong_t total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + zone_get_physmem_data(zone->zone_id, (pgcnt_t *)&total_mem, + (pgcnt_t *)&free_mem); + total_mem = ptob(total_mem); + free_mem = ptob(free_mem); + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = ptob(k_anoninfo.ani_max); + used_swap = ptob(k_anoninfo.ani_phys_resv); + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemAvailable: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached: %8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + btok(free_mem), /* MemAvailable */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t 
*vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? 
+ vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu %lu %lu %lu\n", + cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks, + 0L, irq_ticks, 0L); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." + */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). 
+ */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to 
these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = 
cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case 
LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. 
+ */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. 
+ */ + mutex_exit(&p->p_lock); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are doing pid lookups. + * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) 
+ */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! 
*/ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * than that requested then we can't do it in this transfer. 
+ */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are between calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. 
+ */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. + */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * than that requested then we can't do it in this transfer. 
+ */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * the loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? 
+ curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize = -1; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + if ((p->p_stat == SZOMB) || (p->p_flag & (SSYS | SEXITING)) || + (p->p_as == &kas)) + fddirsize = 0; + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fddirsize == -1) + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined. 
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? 
curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. + */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..eadb2ccd27 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,278 @@ +/* + * CDDL HEADER START + 
* + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LXPROC_H +#define _LXPROC_H + +#ifdef _LXPROC_BRANDED_H +#error Attempted to include native lxproc.h after branded lx_proc.h +#endif + +#define _LXPROC_NATIVE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 
1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG 64 /* Linux _NSIG */ + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + 
LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void 
lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c index d6a88a97c3..f6c6b62925 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c index 74d47dd93d..279cc60877 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c @@ -29,7 +29,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
* Copyright 2022 Oxide Computer Company */ @@ -3354,10 +3354,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { @@ -5537,8 +5536,13 @@ nfs3_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfs3setattr(vp, &va, 0, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c index f0320aaee0..25088aafcb 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index 2a501bc898..b57ad066e4 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -38,7 +38,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
* Copyright 2022 Oxide Computer Company */ @@ -3757,8 +3757,13 @@ nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, */ error = nfs4setattr(vp, vap, flags, cr, NULL); - if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0 && (vap->va_mask & AT_SIZE)) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } return (error); } @@ -8074,8 +8079,9 @@ link_call: * vnode if it already existed. */ if (error == 0) { - vnode_t *tvp; + vnode_t *tvp, *tovp; rnode4_t *trp; + /* * Notify the vnode. Each links is represented by * a different vnode, in nfsv4. @@ -8088,23 +8094,20 @@ link_call: vnevent_rename_dest(tvp, ndvp, nnm, ct); } - /* - * if the source and destination directory are not the - * same notify the destination directory. - */ - if (VTOR4(odvp) != VTOR4(ndvp)) { - trp = VTOR4(ndvp); - tvp = ndvp; - if (IS_SHADOW(ndvp, trp)) - tvp = RTOV4(trp); - vnevent_rename_dest_dir(tvp, ct); - } - trp = VTOR4(ovp); - tvp = ovp; + tovp = ovp; if (IS_SHADOW(ovp, trp)) + tovp = RTOV4(trp); + + vnevent_rename_src(tovp, odvp, onm, ct); + + trp = VTOR4(ndvp); + tvp = ndvp; + + if (IS_SHADOW(ndvp, trp)) tvp = RTOV4(trp); - vnevent_rename_src(tvp, odvp, onm, ct); + + vnevent_rename_dest_dir(tvp, tovp, nnm, ct); } if (nvp) { @@ -11021,8 +11024,13 @@ nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfs4setattr(vp, &va, 0, cr, NULL); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/nfs/nfs_sys.c b/usr/src/uts/common/fs/nfs/nfs_sys.c index 434c9a2a3e..8048d13ca3 100644 --- a/usr/src/uts/common/fs/nfs/nfs_sys.c +++ b/usr/src/uts/common/fs/nfs/nfs_sys.c @@ -30,6 +30,7 @@ */ /* + * Copyright 2017 Joyent, Inc. 
* Copyright 2018 Nexenta Systems, Inc. */ @@ -242,7 +243,7 @@ nfssys(enum nfssys_op opcode, void *arg) lsa.n_fmly = STRUCT_FGET(ulsa, n_fmly); lsa.n_proto = STRUCT_FGET(ulsa, n_proto); lsa.n_rdev = expldev(STRUCT_FGET(ulsa, n_rdev)); - lsa.debug = STRUCT_FGET(ulsa, debug); + lsa.n_v4_only = STRUCT_FGET(ulsa, n_v4_only); lsa.timout = STRUCT_FGET(ulsa, timout); lsa.grace = STRUCT_FGET(ulsa, grace); lsa.retransmittimeout = STRUCT_FGET(ulsa, diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c index c9cc306f95..5041ebb6fe 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. * * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c index 1a1082bcb8..ee3bac484f 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c @@ -26,7 +26,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -1174,8 +1174,13 @@ nfs_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, error = nfssetattr(vp, vap, flags, cr); - if (error == 0 && (mask & AT_SIZE) && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0 && (mask & AT_SIZE)) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } return (error); } @@ -2688,11 +2693,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); - ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { @@ -4620,8 +4623,13 @@ nfs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, va.va_size = bfp->l_start; error = nfssetattr(vp, &va, 0, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else error = EINVAL; } diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c index 976715e346..275330a0ae 100644 --- a/usr/src/uts/common/fs/pcfs/pc_dir.c +++ b/usr/src/uts/common/fs/pcfs/pc_dir.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/param.h> @@ -826,8 +826,7 @@ top: if (error == 0) { vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); - if (dp != tdp) - vnevent_rename_dest_dir(PCTOV(tdp), ctp); + vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp); } done: diff --git a/usr/src/uts/common/fs/pcfs/pc_vnops.c b/usr/src/uts/common/fs/pcfs/pc_vnops.c index 013a6d3352..1965444071 100644 --- a/usr/src/uts/common/fs/pcfs/pc_vnops.c +++ b/usr/src/uts/common/fs/pcfs/pc_vnops.c @@ -782,8 +782,11 @@ pcfs_setattr( if (error) goto out; - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } } /* * Change file modified times. diff --git a/usr/src/uts/common/fs/portfs/port_fop.c b/usr/src/uts/common/fs/portfs/port_fop.c index c9c417fda8..e11d5c8be4 100644 --- a/usr/src/uts/common/fs/portfs/port_fop.c +++ b/usr/src/uts/common/fs/portfs/port_fop.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright 2022 Oxide Computer Company */ @@ -540,14 +540,14 @@ port_fop_trimpfplist(vnode_t *vp) port_pcache_remove_fop(pfcp, pfp); mutex_exit(&pfcp->pfc_lock); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); } } } /* * This routine returns 1, if the vnode can be rele'ed by the caller. - * The caller has to VN_RELE the vnode with out holding any + * The caller has to VN_PHANTOM_RELE the vnode with out holding any * locks. */ int @@ -617,7 +617,7 @@ port_fop_femuninstall(vnode_t *vp) * able to remove it from the port's queue). * * vpp and dvpp will point to the vnode and directory vnode which the caller - * is required to VN_RELE without holding any locks. + * is required to VN_PHANTOM_RELE without holding any locks. 
*/ int port_remove_fop(portfop_t *pfp, portfop_cache_t *pfcp, int cleanup, @@ -727,8 +727,8 @@ port_cache_lookup_fop(portfop_cache_t *pfcp, pid_t pid, uintptr_t obj) /* * Given the file name, get the vnode and also the directory vnode - * On return, the vnodes are held (VN_HOLD). The caller has to VN_RELE - * the vnode(s). + * On return, the vnodes are held with phantom holds (VN_PHANTOM_HOLD). The + * caller has to VN_PHANTOM_RELE the vnode(s). */ int port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp, char **cname, @@ -778,6 +778,17 @@ port_fop_getdvp(void *objptr, vnode_t **vp, vnode_t **dvp, char **cname, } } + /* Trade VN_HOLD()s from lookuppn with VN_PHANTOM_HOLD()s */ + if (dvp != NULL && *dvp != NULL) { + VN_PHANTOM_HOLD(*dvp); + VN_RELE(*dvp); + } + + if (vp != NULL && *vp != NULL) { + VN_PHANTOM_HOLD(*vp); + VN_RELE(*vp); + } + pn_free(&pn); return (error); } @@ -1177,7 +1188,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, * Hold a reference to the vnode since * we successfully installed the hooks. */ - VN_HOLD(vp); + VN_PHANTOM_HOLD(vp); } else { (void) fem_uninstall(vp, femp, vp); pvp->pvp_femp = NULL; @@ -1210,7 +1221,7 @@ port_pfp_setup(portfop_t **pfpp, port_t *pp, vnode_t *vp, portfop_cache_t *pfcp, * Hold the directory vnode since we have a reference now. */ if (dvp != NULL) - VN_HOLD(dvp); + VN_PHANTOM_HOLD(dvp); *pfpp = pfp; return (0); } @@ -1225,9 +1236,9 @@ port_resolve_vp(vnode_t *vp) */ if (vfs_mntdummyvp && mntfstype != 0 && vp->v_vfsp->vfs_fstype == mntfstype) { - VN_RELE(vp); + VN_PHANTOM_RELE(vp); vp = vfs_mntdummyvp; - VN_HOLD(vfs_mntdummyvp); + VN_PHANTOM_HOLD(vfs_mntdummyvp); } /* @@ -1235,8 +1246,8 @@ port_resolve_vp(vnode_t *vp) * hardlinks. 
*/ if ((VOP_REALVP(vp, &rvp, NULL) == 0) && vp != rvp) { - VN_HOLD(rvp); - VN_RELE(vp); + VN_PHANTOM_HOLD(rvp); + VN_PHANTOM_RELE(vp); vp = rvp; } return (vp); @@ -1248,10 +1259,10 @@ port_resolve_vp(vnode_t *vp) * The association is identified by the object pointer and the pid. * The events argument contains the events to be monitored for. * - * The vnode will have a VN_HOLD once the fem hooks are installed. + * The vnode will have a VN_PHANTOM_HOLD once the fem hooks are installed. * - * Every reference(pfp) to the directory vnode will have a VN_HOLD to ensure - * that the directory vnode pointer does not change. + * Every reference(pfp) to the directory vnode will have a VN_PHANTOM_HOLD to + * ensure that the directory vnode pointer does not change. */ int port_associate_fop(port_t *pp, int source, uintptr_t object, int events, @@ -1331,7 +1342,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, */ if (dvp != NULL && dvp->v_vfsp != vp->v_vfsp && !(orig->v_type == VPROC && vp != NULL && vp->v_type != VPROC)) { - VN_RELE(dvp); + VN_PHANTOM_RELE(dvp); dvp = NULL; } @@ -1351,8 +1362,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, pfp = port_cache_lookup_fop(pfcp, curproc->p_pid, object); /* - * If it is not the same vnode, just discard it. VN_RELE needs to be - * called with no locks held, therefore save vnode pointers and + * If it is not the same vnode, just discard it. VN_PHANTOM_RELE needs + * to be called with no locks held, therefore save vnode pointers and * vn_rele them later. */ if (pfp != NULL && (pfp->pfop_vp != vp || pfp->pfop_dvp != dvp)) { @@ -1405,7 +1416,7 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, * This vnode pointer is just used * for comparison, so rele it */ - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); } } @@ -1438,8 +1449,8 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, * active and it is not being removed from * the vnode list. 
This is checked in * port_remove_fop with the vnode lock held. - * The vnode returned is VN_RELE'ed after dropping - * the locks. + * The vnode returned is VN_PHANTOM_RELE'ed after + * dropping the locks. */ tdvp = tvp = NULL; if (port_remove_fop(pfp, pfcp, 0, NULL, &tvp, &tdvp)) { @@ -1452,9 +1463,9 @@ port_associate_fop(port_t *pp, int source, uintptr_t object, int events, } mutex_exit(&pfcp->pfc_lock); if (tvp != NULL) - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); goto errout; } } else { @@ -1521,14 +1532,14 @@ errout: * Release the hold acquired due to the lookup operation. */ if (vp != NULL) - VN_RELE(vp); + VN_PHANTOM_RELE(vp); if (dvp != NULL) - VN_RELE(dvp); + VN_PHANTOM_RELE(dvp); if (oldvp != NULL) - VN_RELE(oldvp); + VN_PHANTOM_RELE(oldvp); if (olddvp != NULL) - VN_RELE(olddvp); + VN_PHANTOM_RELE(olddvp); /* * copied file name not used, free it. @@ -1589,9 +1600,9 @@ port_dissociate_fop(port_t *pp, uintptr_t object) (void) port_remove_fop(pfp, pfcp, 1, &active, &tvp, &tdvp); mutex_exit(&pfcp->pfc_lock); if (tvp != NULL) - VN_RELE(tvp); + VN_PHANTOM_RELE(tvp); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); return (active ? 0 : ENOENT); } @@ -1629,7 +1640,7 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) * be possible as the port is being closed. * * The common case is that the port is not shared and all the entries - * are of this pid and have to be freed. Since VN_RELE has to be + * are of this pid and have to be freed. Since VN_PHANTOM_RELE has to be * called outside the lock, we do it in batches. */ hashtbl = (portfop_t **)pfcp->pfc_hash; @@ -1656,14 +1667,14 @@ port_close_fop(void *arg, int port, pid_t pid, int lastclose) if (pfp == NULL) index++; /* - * Now call VN_RELE if we have collected enough vnodes or - * we have reached the end of the hash table. + * Now call VN_PHANTOM_RELE if we have collected enough vnodes + * or we have reached the end of the hash table. 
*/ if (i >= (PORTFOP_NVP - 1) || (i > 0 && index == PORTFOP_HASHSIZE)) { mutex_exit(&pfcp->pfc_lock); while (i > 0) { - VN_RELE(vpl[--i]); + VN_PHANTOM_RELE(vpl[--i]); vpl[i] = NULL; } mutex_enter(&pfcp->pfc_lock); @@ -1771,7 +1782,7 @@ port_fop_excep(list_t *tlist, int op) port_pcache_remove_fop(pfcp, pfp); mutex_exit(&pfcp->pfc_lock); if (tdvp != NULL) - VN_RELE(tdvp); + VN_PHANTOM_RELE(tdvp); } } @@ -1935,7 +1946,7 @@ port_fop_sendevent(vnode_t *vp, int events, vnode_t *dvp, char *cname) * that may be attempting to remove an object from the vnode's. */ if (port_fop_femuninstall(vp)) - VN_RELE(vp); + VN_PHANTOM_RELE(vp); /* * Send exception events and discard the watch entries. @@ -2070,7 +2081,7 @@ port_fop_unmount(fsemarg_t *vf, int flag, cred_t *cr) * unmount is in process. */ port_fop_sendevent(pvp->pvp_vp, UNMOUNTED, NULL, NULL); - VN_RELE(pvp->pvp_vp); + VN_PHANTOM_RELE(pvp->pvp_vp); } error = vfsnext_unmount(vf, flag, cr); diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c new file mode 100644 index 0000000000..60d098d125 --- /dev/null +++ b/usr/src/uts/common/fs/proc/prargv.c @@ -0,0 +1,530 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/sysmacros.h> +#include <vm/as.h> + +/* + * Safely read a contiguous region of memory from 'addr' in the address space + * of a particular process into the supplied kernel buffer (*buf, sz). 
+ * Partially mapped regions will result in a partial read terminating at the + * first hole in the address space. The number of bytes actually read is + * returned to the caller via 'rdsz'. + */ +int +prreadbuf(proc_t *p, uintptr_t ustart, char *buf, size_t sz, size_t *rdsz) +{ + int error = 0; + size_t rem = sz; + off_t pos = 0; + + if (rdsz != NULL) + *rdsz = 0; + + while (rem != 0) { + uintptr_t addr = ustart + pos; + size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET)); + + if ((error = uread(p, buf + pos, len, addr)) != 0) { + if (error == ENXIO) { + /* + * ENXIO from uread() indicates that the page + * does not exist. This will simply be a + * partial read. + */ + error = 0; + } + break; + } + + rem -= len; + pos += len; + } + + if (rdsz != NULL) + *rdsz = pos; + + return (error); +} + + +/* + * Effectively a truncating version of copyinstr(). + * + * The resulting string is guaranteed to be truncated to fit within the buffer + * (hence sz == 0 is not supported). The returned size includes the truncating + * NUL. + */ +int +prreadstr(proc_t *p, uintptr_t ustart, char *buf, size_t bufsz, size_t *rdsz) +{ + size_t slen; + int err; + + VERIFY(bufsz != 0); + + if ((err = prreadbuf(p, ustart, buf, bufsz, &slen)) != 0) + return (err); + + slen = strnlen(buf, slen); + + if (slen == bufsz) + slen--; + + buf[slen++] = '\0'; + + if (rdsz != NULL) + *rdsz = slen; + return (0); +} + +/* + * /proc/pid/cmdline: Linux-compatible '\0'-separated process argv. + * + * Unlike /proc/pid/argv, this looks at the exec()-time argv string area, rather + * than starting from the argv[] array. Thus changes to the array are not + * noticed, but direct modifications of the string are visible here. Since it's + * common for applications to expect it, we implement the Linux semantics here. 
+ * + * There is special handling if the process has modified its argv: if the last + * byte of the argv string area is no longer NUL, then we presume that it has + * done setproctitle() or similar, and we should copy it as a single string from + * the start, even though it overflows into the env string area. Note that we + * can't use copyinstr() as that returns ENAMETOOLONG rather than truncating as + * we need. + * + * Otherwise, we provide the argv string area in toto. + */ +int +prreadcmdline(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + user_t *up = &p->p_user; + uint8_t term; + int err = 0; + + VERIFY(bufsz == PRMAXARGVLEN); + VERIFY(MUTEX_HELD(&p->p_lock)); + + if ((p->p_flag & SSYS) || p->p_as == &kas || up->u_argvstrsize == 0) { + bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs))); + buf[bufsz - 1] = '\0'; + *slen = strlen(buf) + 1; + return (0); + } + + VERIFY(up->u_argvstrs != (uintptr_t)NULL); + + mutex_exit(&p->p_lock); + + if (uread(p, &term, sizeof (term), + up->u_argvstrs + up->u_argvstrsize - 1) != 0) { + err = EFAULT; + goto out; + } + + if (term != '\0') { + err = prreadstr(p, up->u_argvstrs, buf, bufsz, slen); + } else { + size_t size = MIN(bufsz, up->u_argvstrsize); + err = prreadbuf(p, up->u_argvstrs, buf, size, slen); + } + +out: + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (err); +} + + +/* + * Attempt to read the argument vector (argv) from this process. The caller + * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via + * prlock or lx_prlock). + * + * The caller must provide a buffer (buf, buflen). We will concatenate each + * argument string (including the NUL terminator) into this buffer. The number + * of characters written to this buffer (including the final NUL terminator) + * will be stored in 'slen'. 
+ */ +int +prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *argv = NULL; + size_t argvsz = 0; + int i; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_argv == (uintptr_t)NULL) { + /* + * Return the regular psargs string to the caller. + */ + bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs))); + buf[bufsz - 1] = '\0'; + *slen = strlen(buf) + 1; + + return (0); + } + + /* + * Allocate space to store argv array. + */ + argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + argv = kmem_alloc(argvsz, KM_SLEEP); + + /* + * Extract the argv array from the target process. Drop p_lock + * while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + if ((error = prreadbuf(p, up->u_argv, (char *)argv, + argvsz, NULL)) != 0) { + kmem_free(argv, argvsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each argument string from the pointers in the argv array. + */ + pos = 0; + for (i = 0; i < up->u_argc; i++) { + size_t rdsz, trysz; + uintptr_t arg; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + arg = (uintptr_t)((caddr32_t *)argv)[i]; + } else { + arg = (uintptr_t)argv[i]; + } +#else + arg = (uintptr_t)argv[i]; +#endif + + /* + * Stop trying to read arguments if we reach a NULL + * pointer in the vector. + */ + if (arg == (uintptr_t)NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual argument strings are less than 80 + * characters long. 
+ */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this argument. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, arg, (char *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(argv, argvsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} + +/* + * Similar to prreadargv except reads the env vector. This is slightly more + * complex because there is no count for the env vector that corresponds to + * u_argc. 
+ */ +int +prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *envp = NULL; + uintptr_t tmpp = (uintptr_t)NULL; + size_t envpsz = 0, rdsz = 0; + int i; + int cnt, bound; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_envp == (uintptr_t)NULL) { + /* + * Return empty string. + */ + buf[0] = '\0'; + *slen = 1; + + return (0); + } + + /* + * Drop p_lock while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + + /* + * We first have to count how many env entries we have. This is + * somewhat painful. We extract the env entries from the target process + * one entry at a time. Stop trying to read env entries if we reach a + * NULL pointer in the vector or hit our upper bound (which we take + * as the bufsz/4) to ensure we don't run off. + */ + rdsz = (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + bound = (int)(bufsz / 4); + for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) { + caddr_t tmp = NULL; + + if ((error = prreadbuf(p, tmpp, (char *)&tmp, rdsz, + NULL)) != 0) { + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + if (tmp == NULL) + break; + } + if (cnt == 0) { + /* Return empty string. */ + buf[0] = '\0'; + *slen = 1; + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (0); + } + + /* + * Allocate space to store env array. + */ + envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + envp = kmem_alloc(envpsz, KM_SLEEP); + + /* + * Extract the env array from the target process. 
+ */ + if ((error = prreadbuf(p, up->u_envp, (char *)envp, envpsz, + NULL)) != 0) { + kmem_free(envp, envpsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each env string from the pointers in the env array. + */ + pos = 0; + for (i = 0; i < cnt; i++) { + size_t rdsz, trysz; + uintptr_t ev; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + ev = (uintptr_t)((caddr32_t *)envp)[i]; + } else { + ev = (uintptr_t)envp[i]; + } +#else + ev = (uintptr_t)envp[i]; +#endif + + /* + * Stop trying to read env entries if we reach a NULL + * pointer in the vector. + */ + if (ev == (uintptr_t)NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual env strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this env var. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, ev, (char *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. 
+ */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(envp, envpsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index 6b151a6369..07dcb1e7db 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip) } else if (t->t_state == TS_STOPPED && sig == SIGKILL) { /* If SIGKILL, set stopped lwp running */ p->p_stopsig = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; t->t_dtrace_stop = 0; setrun_locked(t); } @@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr) return (EPERM); if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id) return (EINVAL); - if ((zptr = zone_find_by_id(zoneid)) == NULL) - return (EINVAL); + /* + * We cannot hold p_lock when we call zone_find_by_id since that can + * lead to a deadlock. zone_find_by_id() takes zonehash_lock. + * zone_enter() can hold the zonehash_lock and needs p_lock when it + * calls task_join. 
+ */ mutex_exit(&p->p_lock); + if ((zptr = zone_find_by_id(zoneid)) == NULL) { + mutex_enter(&p->p_lock); + return (EINVAL); + } mutex_enter(&p->p_crlock); oldcred = p->p_cred; crhold(oldcred); diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index a661478c50..6d8ac2e6ed 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ @@ -124,6 +124,8 @@ typedef enum prnodetype { #if defined(__i386) || defined(__amd64) PR_LDT, /* /proc/<pid>/ldt */ #endif + PR_ARGV, /* /proc/<pid>/argv */ + PR_CMDLINE, /* /proc/<pid>/cmdline */ PR_USAGE, /* /proc/<pid>/usage */ PR_LUSAGE, /* /proc/<pid>/lusage */ PR_PAGEDATA, /* /proc/<pid>/pagedata */ @@ -355,6 +357,9 @@ extern file_t *pr_getf(proc_t *, uint_t, short *); extern void pr_releasef(file_t *); extern void pr_setfault(proc_t *, fltset_t *); extern int prusrio(proc_t *, enum uio_rw, struct uio *, int); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadcmdline(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); extern int prwritectl(vnode_t *, struct uio *, cred_t *); extern int prlock(prnode_t *, int); extern void prunmark(proc_t *); diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index 5591ffd89b..be41826b54 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -222,6 +222,7 @@ prchoose(proc_t *p) case PR_SYSEXIT: case PR_SIGNALLED: case PR_FAULTED: + case PR_BRAND: /* * Make an lwp calling exit() be the * last lwp seen in the process. @@ -555,6 +556,12 @@ prexecend(void) pcp->prc_tslot = tslot; } } + + /* + * There may be threads waiting for the flag change blocked behind the + * pr_pid_cv as well. 
+ */ + cv_signal(&pr_pid_cv[p->p_slot]); } /* diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index e535b1f647..39f8e6f01e 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. * Copyright 2022 MNX Cloud, Inc. @@ -171,8 +171,12 @@ static prdirent_t piddir[] = { "contracts" }, { PR_SECFLAGS, 28 * sizeof (prdirent_t), sizeof (prdirent_t), "secflags" }, + { PR_ARGV, 29 * sizeof (prdirent_t), sizeof (prdirent_t), + "argv" }, + { PR_CMDLINE, 30 * sizeof (prdirent_t), sizeof (prdirent_t), + "cmdline" }, #if defined(__x86) - { PR_LDT, 29 * sizeof (prdirent_t), sizeof (prdirent_t), + { PR_LDT, 31 * sizeof (prdirent_t), sizeof (prdirent_t), "ldt" }, #endif }; @@ -593,6 +597,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(), #if defined(__x86) pr_read_ldt(), #endif + pr_read_argv(), pr_read_cmdline(), pr_read_usage(), pr_read_lusage(), pr_read_pagedata(), pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(), pr_read_lwpusage(), pr_read_lwpname(), @@ -623,6 +628,8 @@ static int (*pr_read_function[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ + pr_read_cmdline, /* /proc/<pid>/cmdline */ pr_read_usage, /* /proc/<pid>/usage */ pr_read_lusage, /* /proc/<pid>/lusage */ pr_read_pagedata, /* /proc/<pid>/pagedata */ @@ -689,6 +696,76 @@ pr_uioread(void *base, long count, uio_t *uiop) } static int +pr_read_cmdline(prnode_t *pnp, uio_t *uiop) +{ + char *args; + int error; + size_t asz = PRMAXARGVLEN, sz; + + /* + * Allocate a scratch buffer for collection of the process arguments. 
+ */ + args = kmem_alloc(asz, KM_SLEEP); + + ASSERT(pnp->pr_type == PR_CMDLINE); + + if ((error = prlock(pnp, ZNO)) != 0) { + kmem_free(args, asz); + return (error); + } + + if ((error = prreadcmdline(pnp->pr_common->prc_proc, args, asz, + &sz)) != 0) { + prunlock(pnp); + kmem_free(args, asz); + return (error); + } + + prunlock(pnp); + + error = pr_uioread(args, sz, uiop); + + kmem_free(args, asz); + + return (error); +} + +static int +pr_read_argv(prnode_t *pnp, uio_t *uiop) +{ + char *args; + int error; + size_t asz = PRMAXARGVLEN, sz; + + /* + * Allocate a scratch buffer for collection of the process arguments. + */ + args = kmem_alloc(asz, KM_SLEEP); + + ASSERT(pnp->pr_type == PR_ARGV); + + if ((error = prlock(pnp, ZNO)) != 0) { + kmem_free(args, asz); + return (error); + } + + if ((error = prreadargv(pnp->pr_common->prc_proc, args, asz, + &sz)) != 0) { + prunlock(pnp); + kmem_free(args, asz); + return (error); + } + + prunlock(pnp); + + error = pr_uioread(args, sz, uiop); + + kmem_free(args, asz); + + return (error); +} + +static int pr_read_as(prnode_t *pnp, uio_t *uiop) { int error; @@ -1913,6 +1990,8 @@ static int (*pr_read_function_32[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ + pr_read_cmdline, /* /proc/<pid>/cmdline */ pr_read_usage_32, /* /proc/<pid>/usage */ pr_read_lusage_32, /* /proc/<pid>/lusage */ pr_read_pagedata_32, /* /proc/<pid>/pagedata */ @@ -2841,6 +2920,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) #endif } +/* + * We make pr_write_psinfo_fname() somewhat simpler by asserting at compile + * time that PRFNSZ has the same definition as MAXCOMLEN. 
+ */ +#if PRFNSZ != MAXCOMLEN +#error PRFNSZ/MAXCOMLEN mismatch +#endif + +static int +pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop) +{ + char fname[PRFNSZ]; + int offset = offsetof(psinfo_t, pr_fname), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_fname); +#endif + + /* + * If this isn't a write to pr_fname (or if the size doesn't match + * PRFNSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRFNSZ) + return (0); + + if ((error = uiomove(fname, PRFNSZ, UIO_WRITE, uiop)) != 0) + return (error); + + fname[PRFNSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(fname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ); + + prunlock(pnp); + + return (0); +} + +/* + * We make pr_write_psinfo_psargs() somewhat simpler by asserting at compile + * time that PRARGSZ has the same definition as PSARGSZ. + */ +#if PRARGSZ != PSARGSZ +#error PRARGSZ/PSARGSZ mismatch +#endif + +static int +pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop) +{ + char psargs[PRARGSZ]; + int offset = offsetof(psinfo_t, pr_psargs), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_psargs); +#endif + + /* + * If this isn't a write to pr_psargs (or if the size doesn't match + * PRARGSZ) return. 
+ */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRARGSZ) + return (0); + + if ((error = uiomove(psargs, PRARGSZ, UIO_WRITE, uiop)) != 0) + return (error); + + psargs[PRARGSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(psargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ); + + prunlock(pnp); + + return (0); +} + +int +pr_write_psinfo(prnode_t *pnp, uio_t *uiop) +{ + int error; + + if ((error = pr_write_psinfo_fname(pnp, uiop)) != 0) + return (error); + + if ((error = pr_write_psinfo_psargs(pnp, uiop)) != 0) + return (error); + + return (0); +} + + /* Note we intentionally don't handle partial writes/updates. */ static int pr_write_lwpname(prnode_t *pnp, uio_t *uiop) @@ -2967,6 +3143,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) uiop->uio_resid = resid; return (error); + case PR_PSINFO: + return (pr_write_psinfo(pnp, uiop)); + case PR_LWPNAME: return (pr_write_lwpname(pnp, uiop)); @@ -3296,6 +3475,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, case PR_AUXV: vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t); break; + case PR_ARGV: + if ((p->p_flag & SSYS) || p->p_as == &kas) { + vap->va_size = PSARGSZ; + } else { + vap->va_size = PRMAXARGVLEN; + } + break; #if defined(__x86) case PR_LDT: mutex_exit(&p->p_lock); @@ -3418,6 +3604,7 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, #endif case PR_CTL: case PR_LWPCTL: + case PR_CMDLINE: default: vap->va_size = 0; break; @@ -3472,6 +3659,8 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: + case PR_CMDLINE: p = pr_p_lock(pnp); mutex_exit(&pr_pidlock); if (p == NULL) @@ -3557,6 +3746,8 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = { #if defined(__x86) pr_lookup_notdir, /* /proc/<pid>/ldt */ #endif + pr_lookup_notdir, /* /proc/<pid>/argv */ + pr_lookup_notdir, /* /proc/<pid>/cmdline 
*/ pr_lookup_notdir, /* /proc/<pid>/usage */ pr_lookup_notdir, /* /proc/<pid>/lusage */ pr_lookup_notdir, /* /proc/<pid>/pagedata */ @@ -4887,16 +5078,18 @@ prgetnode(vnode_t *dp, prnodetype_t type) pnp->pr_mode = 0600; /* read-write by owner only */ break; + case PR_PSINFO: case PR_LWPNAME: pnp->pr_mode = 0644; /* readable by all + owner can write */ break; - case PR_PSINFO: case PR_LPSINFO: case PR_LWPSINFO: case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: + case PR_CMDLINE: pnp->pr_mode = 0444; /* read-only by all */ break; @@ -5004,6 +5197,8 @@ static int (*pr_readdir_function[PR_NFILES])() = { #if defined(__x86) pr_readdir_notdir, /* /proc/<pid>/ldt */ #endif + pr_readdir_notdir, /* /proc/<pid>/argv */ + pr_readdir_notdir, /* /proc/<pid>/cmdline */ pr_readdir_notdir, /* /proc/<pid>/usage */ pr_readdir_notdir, /* /proc/<pid>/lusage */ pr_readdir_notdir, /* /proc/<pid>/pagedata */ @@ -5157,6 +5352,8 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp) case PR_PROCDIR: case PR_PSINFO: case PR_USAGE: + case PR_ARGV: + case PR_CMDLINE: break; default: continue; diff --git a/usr/src/uts/common/fs/smbsrv/smb_kshare.c b/usr/src/uts/common/fs/smbsrv/smb_kshare.c index 01d382fed7..056619d90b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_kshare.c +++ b/usr/src/uts/common/fs/smbsrv/smb_kshare.c @@ -351,6 +351,7 @@ smb_kshare_g_fini(void) kmem_cache_destroy(smb_kshare_cache_share); } + /* * A list of shares in nvlist format can be sent down * from userspace thourgh the IOCTL interface. 
The nvlist diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c index 7f56792f7d..af12a0c30b 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_server.c +++ b/usr/src/uts/common/fs/smbsrv/smb_server.c @@ -897,6 +897,22 @@ smb_server_enum(smb_ioc_svcenum_t *ioc) smb_svcenum_t *svcenum = &ioc->svcenum; smb_server_t *sv; int rc; + uint32_t buflen_adjusted; + + /* + * Reality check that the buffer-length insize the enum doesn't + * overrun the ioctl's total length. + * + * NOTE: Assume se_buf is at the end of smb_svcenum_t. + */ + buflen_adjusted = svcenum->se_buflen + + offsetof(smb_svcenum_t, se_buf) + sizeof (ioc->hdr); + if (buflen_adjusted < svcenum->se_buflen || /* Overflow check 1, */ + buflen_adjusted < offsetof(smb_svcenum_t, se_buf) || /* check 2, */ + buflen_adjusted < sizeof (ioc->hdr) || /* check 3. */ + buflen_adjusted > ioc->hdr.len) { + return (EINVAL); + } /* * Reality check that the buffer-length insize the enum doesn't diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index b1f74b993b..768a001d72 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -129,7 +129,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, { int error; - SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr)); ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD); @@ -279,7 +279,7 @@ so_connect(struct sonode *so, struct sockaddr *name, * This can happen if a non blocking operation caused an error. 
*/ - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -378,7 +378,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, break; } - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -487,7 +487,7 @@ so_sendmblk_impl(struct sonode *so, struct nmsghdr *msg, int fflag, error = EPIPE; break; } - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); @@ -622,7 +622,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr, { int error; - SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); if (so->so_filter_active == 0 || (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0) @@ -671,7 +671,7 @@ so_getsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_getsockopt(so, option_name, optval, optlenp, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr)); if ((so->so_filter_active == 0 || @@ -760,7 +760,7 @@ so_setsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_setsockopt(so, option_name, optval, optlen, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); /* X/Open requires this check */ @@ -845,7 +845,7 @@ so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * If there is a pending error, return error * This can happen if a non blocking operation caused an error. 
*/ - if (so->so_error != 0) { + if (so->so_error != 0 && (so->so_mode & SM_DEFERERR) == 0) { mutex_enter(&so->so_lock); error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index ab9c479af3..df159a122c 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -671,10 +671,15 @@ so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop, int more = 0; int error; ssize_t oobmark; + ssize_t copied = 0; sodirect_t *sodp = so->so_direct; + xuio_t *xuio = NULL; partial_read = B_FALSE; *mctlp = NULL; + if ((uiop->uio_extflg & UIO_XUIO) != 0) { + xuio = (xuio_t *)uiop; + } again: mutex_enter(&so->so_lock); again1: @@ -785,8 +790,6 @@ again1: * enabled socket, uio_resid can be 0. */ if (uiop->uio_resid >= 0) { - ssize_t copied = 0; - if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) { mutex_enter(&so->so_lock); ASSERT(uiop == (uio_t *)&sodp->sod_uioa); @@ -844,6 +847,18 @@ again1: } if (mp != NULL) { /* more data blocks in msg */ more |= MOREDATA; + + /* + * If requested, tally up remaining data along with the + * amount already copied. + */ + if (xuio != NULL && + xuio->xu_type == UIOTYPE_PEEKSIZE) { + xuio->xu_ext.xu_ps.xu_ps_set = B_TRUE; + xuio->xu_ext.xu_ps.xu_ps_size = + copied + msgdsize(mp); + } + if ((flags & (MSG_PEEK|MSG_TRUNC))) { if (flags & MSG_PEEK) { freemsg(mp); diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c index 1fa4efe59f..62a079f419 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter.c +++ b/usr/src/uts/common/fs/sockfs/sockfilter.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #include <sys/systm.h> @@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name, /* Module loaded OK, so there must be an ops vector */ ASSERT(ent->sofe_mod != NULL); + + /* + * Check again to confirm ATTACH is ok. See if the module + * is not SOF_ATT_SAFE after an unsafe operation has taken + * place. + */ + if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 && + so->so_state & SS_FILOP_UNSF) { + sof_instance_destroy(inst); + return (EINVAL); + } + inst->sofi_ops = &ent->sofe_mod->sofm_ops; SOF_STAT_ADD(inst, tot_active_attach, 1); @@ -1445,7 +1458,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * sof_register(version, name, ops, flags) * * Register a socket filter identified by name `name' and which should use - * the ops vector `ops' for event notification. `flags' should be set to 0. + * the ops vector `ops' for event notification. `flags' should be set to 0 + * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An + * unsafe filter is one that cannot be attached after any socket operation has + * occurred. This is the legacy default. A "safe" filter can be attached even + * after some basic initial socket operations have taken place. This set is + * currently bind, getsockname, getsockopt and setsockopt. The order in which + * a "safe" filter can be attached is more relaxed, and thus more flexible. * On success 0 is returned, otherwise an errno is returned. 
*/ int @@ -1453,14 +1472,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags) { sof_module_t *mod; - _NOTE(ARGUNUSED(flags)); - if (version != SOF_VERSION) return (EINVAL); mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP); mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); (void) strcpy(mod->sofm_name, name); + mod->sofm_flags = flags; mod->sofm_ops = *ops; mutex_enter(&sof_module_lock); diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h index e9a09bad88..e63831e172 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h +++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SOCKFS_SOCKFILTER_H @@ -51,6 +52,7 @@ typedef struct sof_kstat sof_kstat_t; struct sof_module { char *sofm_name; + int sofm_flags; sof_ops_t sofm_ops; uint_t sofm_refcnt; list_node_t sofm_node; diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 3262150f79..739d439851 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright 2015, Joyent, Inc. All rights reserved. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. @@ -420,8 +421,10 @@ sogetoff(mblk_t *mp, t_uscalar_t offset, * * The underlying filesystem VSOCK vnode has a v_stream pointer that * references the actual stream head (hence indirectly the actual sonode). + * + * This function is non-static so it can be used by brand emulation. 
*/ -static int +int so_ux_lookup(struct sonode *so, struct sockaddr_un *soun, int checkaccess, vnode_t **vpp) { diff --git a/usr/src/uts/common/fs/sockfs/socktpi_impl.h b/usr/src/uts/common/fs/sockfs/socktpi_impl.h index 6a515be122..24acb81a0a 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi_impl.h +++ b/usr/src/uts/common/fs/sockfs/socktpi_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SOCKFS_SOCKTPI_IMPL_H @@ -56,6 +57,8 @@ extern int sogetrderr(vnode_t *, int, int *); extern int sogetwrerr(vnode_t *, int, int *); extern int so_addr_verify(struct sonode *, const struct sockaddr *, socklen_t); +extern int so_ux_lookup(struct sonode *, struct sockaddr_un *, int, + vnode_t **); extern int so_ux_addr_xlate(struct sonode *, struct sockaddr *, socklen_t, int, void **, socklen_t *); extern void so_unix_close(struct sonode *); diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. */ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c index 06ef8dd7fd..b28ced7111 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c @@ -21,10 +21,9 @@ /* * Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/sysmacros.h> @@ -445,20 +444,7 @@ tdirenter( /* * Unmake the inode we just made. */ - rw_enter(&tp->tn_rwlock, RW_WRITER); - if ((tp->tn_type) == VDIR) { - ASSERT(tdp == NULL); - /* - * cleanup allocs made by tdirinit() - */ - tdirtrunc(tp); - } - mutex_enter(&tp->tn_tlock); - tp->tn_nlink = 0; - mutex_exit(&tp->tn_tlock); - gethrestime(&tp->tn_ctime); - rw_exit(&tp->tn_rwlock); - tmpnode_rele(tp); + tmpnode_cleanup(tp); tp = NULL; } } else if (tpp) { @@ -493,6 +479,7 @@ tdirdelete( enum dr_op op, struct cred *cred) { + struct tmount *tm; struct tdirent *tpdp; int error; size_t namelen; @@ -578,7 +565,8 @@ tdirdelete( */ namelen = strlen(tpdp->td_name) + 1; - tmp_memfree(tpdp, sizeof (struct tdirent) + namelen); + tm = TNTOTM(dir); + tmp_kmem_free(tm, tpdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; @@ -600,19 +588,27 @@ tdirdelete( * tdirinit is used internally to initialize a directory (dir) * with '.' and '..' 
entries without checking permissions and locking */ -void +int tdirinit( struct tmpnode *parent, /* parent of directory to initialize */ struct tmpnode *dir) /* the new directory */ { + struct tmount *tm; struct tdirent *dot, *dotdot; timestruc_t now; ASSERT(RW_WRITE_HELD(&parent->tn_rwlock)); ASSERT(dir->tn_type == VDIR); - dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE); - dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE); + tm = TNTOTM(parent); + dot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 2, KM_SLEEP); + if (dot == NULL) + return (ENOSPC); + dotdot = tmp_kmem_zalloc(tm, sizeof (struct tdirent) + 3, KM_SLEEP); + if (dotdot == NULL) { + tmp_kmem_free(tm, dot, sizeof (struct tdirent) + 2); + return (ENOSPC); + } /* * Initialize the entries @@ -663,6 +659,8 @@ tdirinit( dir->tn_size = 2 * sizeof (struct tdirent) + 5; /* dot and dotdot */ dir->tn_dirents = 2; dir->tn_nlink = 2; + + return (0); } @@ -674,6 +672,7 @@ tdirtrunc(struct tmpnode *dir) { struct tdirent *tdp; struct tmpnode *tp; + struct tmount *tm; size_t namelen; timestruc_t now; int isvattrdir, isdotdot, skip_decr; @@ -681,6 +680,8 @@ tdirtrunc(struct tmpnode *dir) ASSERT(RW_WRITE_HELD(&dir->tn_rwlock)); ASSERT(dir->tn_type == VDIR); + tm = TNTOTM(dir); + isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 
1 : 0; for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) { ASSERT(tdp->td_next != tdp); @@ -712,7 +713,7 @@ tdirtrunc(struct tmpnode *dir) tmpfs_hash_out(tdp); - tmp_memfree(tdp, sizeof (struct tdirent) + namelen); + tmp_kmem_free(tm, tdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; } @@ -965,6 +966,7 @@ tdiraddentry( enum de_op op, struct tmpnode *fromtp) { + struct tmount *tm; struct tdirent *tdp, *tpdp; size_t namelen, alloc_size; timestruc_t now; @@ -985,9 +987,10 @@ tdiraddentry( /* * Allocate and initialize directory entry */ + tm = TNTOTM(dir); namelen = strlen(name) + 1; alloc_size = namelen + sizeof (struct tdirent); - tdp = tmp_memalloc(alloc_size, 0); + tdp = tmp_kmem_zalloc(tm, alloc_size, KM_NOSLEEP_LAZY); if (tdp == NULL) return (ENOSPC); @@ -1087,7 +1090,10 @@ tdirmaketnode( ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) return (EOVERFLOW); type = va->va_type; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP); + if (tp == NULL) { + return (ENOSPC); + } tmpnode_init(tm, tp, va, cred); /* setup normal file/dir's extended attribute directory */ @@ -1149,8 +1155,13 @@ tdirmaketnode( if (va->va_mask & AT_MTIME) tp->tn_mtime = va->va_mtime; - if (op == DE_MKDIR) - tdirinit(dir, tp); + if (op == DE_MKDIR) { + int ret; + if ((ret = tdirinit(dir, tp)) != 0) { + tmpnode_cleanup(tp); + return (ret); + } + } *newnode = tp; return (0); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c index 8723631555..0c48c03a75 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/types.h> @@ -43,6 +43,7 @@ #include <sys/fs/tmpnode.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#include <vm/anon.h> #define KILOBYTE 1024 #define MEGABYTE (1024 * KILOBYTE) @@ -54,6 +55,80 @@ extern pgcnt_t swapfs_minfree; +void * +tmp_kmem_zalloc(struct tmount *tm, size_t size, int flag) +{ + void *buf; + zone_t *zone; + size_t pages; + + mutex_enter(&tm->tm_contents); + zone = tm->tm_vfsp->vfs_zone; + if (tm->tm_anonmem + size > tm->tm_anonmax || + tm->tm_anonmem + size < tm->tm_anonmem || + size + ptob(tmpfs_minfree) <= size || + !anon_checkspace(size + ptob(tmpfs_minfree), zone)) { + mutex_exit(&tm->tm_contents); + return (NULL); + } + + /* + * Only make anonymous memory reservations when a page boundary is + * crossed. This is necessary since the anon_resv functions rounds up + * to PAGESIZE internally. + */ + pages = btopr(tm->tm_allocmem + size); + pages -= btopr(tm->tm_allocmem); + if (pages > 0 && anon_try_resv_zone(ptob(pages), zone) == 0) { + mutex_exit(&tm->tm_contents); + return (NULL); + } + + tm->tm_allocmem += size; + tm->tm_anonmem += size; + mutex_exit(&tm->tm_contents); + + buf = kmem_zalloc(size, flag); + if (buf == NULL) { + mutex_enter(&tm->tm_contents); + ASSERT(tm->tm_anonmem > tm->tm_anonmem - size); + tm->tm_anonmem -= size; + if (pages > 0) { + /* + * Re-chasing the zone pointer is necessary since a + * forced umount could have been performed while the + * tm_contents lock was dropped during allocation. 
+ */ + anon_unresv_zone(ptob(pages), tm->tm_vfsp->vfs_zone); + } + mutex_exit(&tm->tm_contents); + } + + return (buf); +} + +void +tmp_kmem_free(struct tmount *tm, void *buf, size_t size) +{ + size_t pages; + + kmem_free(buf, size); + mutex_enter(&tm->tm_contents); + ASSERT(tm->tm_anonmem > tm->tm_anonmem - size); + tm->tm_anonmem -= size; + pages = btopr(tm->tm_allocmem); + tm->tm_allocmem -= size; + pages -= btopr(tm->tm_allocmem); + /* + * Like the tmp_kmem_zalloc case, only unreserve anonymous memory when + * a page boundary has been crossed. + */ + if (pages > 0) { + anon_unresv_zone(size, tm->tm_vfsp->vfs_zone); + } + mutex_exit(&tm->tm_contents); +} + int tmp_taccess(void *vtp, int mode, struct cred *cred) { @@ -99,42 +174,8 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, } /* - * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded - * or the 'musthave' flag is set. 'musthave' allocations should - * always be subordinate to normal allocations so that tmpfs_maxkmem - * can't be exceeded by more than a few KB. Example: when creating - * a new directory, the tmpnode is a normal allocation; if that - * succeeds, the dirents for "." and ".." are 'musthave' allocations. - */ -void * -tmp_memalloc(size_t size, int musthave) -{ - static time_t last_warning; - time_t now; - - if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem || - musthave) - return (kmem_zalloc(size, KM_SLEEP)); - - atomic_add_long(&tmp_kmemspace, -size); - now = gethrestime_sec(); - if (last_warning != now) { - last_warning = now; - cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit"); - } - return (NULL); -} - -void -tmp_memfree(void *cp, size_t size) -{ - kmem_free(cp, size); - atomic_add_long(&tmp_kmemspace, -size); -} - -/* - * Convert a string containing a number (number of bytes) to a pgcnt_t, - * containing the corresponding number of pages. 
On 32-bit kernels, the + * Convert a string containing a number (number of bytes) to a size_t, + * containing the corresponding number of bytes. On 32-bit kernels, the * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value * returned in 'maxpg' is at most ULONG_MAX. * @@ -152,7 +193,7 @@ tmp_memfree(void *cp, size_t size) * error. */ int -tmp_convnum(char *str, pgcnt_t *maxpg) +tmp_convnum(char *str, size_t *maxbytes) { u_longlong_t num = 0; #ifdef _LP64 @@ -160,6 +201,7 @@ tmp_convnum(char *str, pgcnt_t *maxpg) #else u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; #endif + size_t pages; char *c; const struct convchar { char *cc_char; @@ -250,13 +292,21 @@ valid_char: done: /* - * Since btopr() rounds up to page granularity, this round-up can - * cause an overflow only if 'num' is between (max_bytes - PAGESIZE) - * and (max_bytes). In this case the resulting number is zero, which - * is what we check for below. + * We've been given a size in bytes; however, we want to make sure that + * we have at least one page worth no matter what. Therefore we use + * btopr to round up. However, this may cause an overflow only if 'num' + * is between (max_bytes - PAGESIZE) and (max_bytes). In this case the + * resulting number is zero, which is what we check for below. Note, we + * require at least one page, so if pages is zero, well, it wasn't going + * to work anyways. */ - if ((*maxpg = (pgcnt_t)btopr(num)) == 0 && num != 0) + pages = btopr(num); + if (pages == 0) { return (EINVAL); + } + + *maxbytes = ptob(pages); + return (0); } diff --git a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c index 51e57b2611..13ea356924 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_tnode.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_tnode.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/types.h> @@ -64,21 +65,35 @@ tmp_resv( int pagecreate) /* call anon_resv if set */ { pgcnt_t pages = btopr(delta); + size_t pbytes = ptob(pages); zone_t *zone; ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); + /* - * pagecreate is set only if we actually need to call anon_resv - * to reserve an additional page of anonymous memory. - * Since anon_resv always reserves a page at a time, - * it should only get called when we know we're growing the - * file into a new page or filling a hole. + * pagecreate is set only if we actually need to call anon_resv to + * reserve an additional page of anonymous memory. Since anon_resv + * always reserves a page at a time, it should only get called when we + * know we're growing the file into a new page or filling a hole. This + * is why we transform delta into a number of pages. However, because we + * track bytes and not pages, we convert that back to a number of bytes + * that we allocate against. * - * Deny if trying to reserve more than tmpfs can allocate + * Deny if trying to reserve more than tmpfs can allocate, the + * allocation causes an overflow, or the delta round up overflowed. + * Note, that btopr rounds up, so we need to catch the unsigned + * overflow. Note, rounding up when we are within a page of SIZE_MAX is + * done by adding a page, overflowing, which will then be rounded back + * to zero. Hence the following check. 
*/ + if (pages == 0 && delta != 0) + return (1); + zone = tm->tm_vfsp->vfs_zone; - if (pagecreate && ((tm->tm_anonmem + pages > tm->tm_anonmax) || + if (pagecreate && ((tm->tm_anonmem + pbytes > tm->tm_anonmax) || + (tm->tm_anonmem + pbytes < tm->tm_anonmem) || + (ptob(pages + tmpfs_minfree) <= pbytes) || (!anon_checkspace(ptob(pages + tmpfs_minfree), zone)) || (anon_try_resv_zone(delta, zone) == 0))) { return (1); @@ -89,7 +104,7 @@ tmp_resv( */ if (pagecreate) { mutex_enter(&tm->tm_contents); - tm->tm_anonmem += pages; + tm->tm_anonmem += pbytes; mutex_exit(&tm->tm_contents); TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu", @@ -110,13 +125,27 @@ tmp_unresv( struct tmpnode *tp, size_t delta) { + size_t pages, pbytes; + ASSERT(RW_WRITE_HELD(&tp->tn_rwlock)); ASSERT(tp->tn_type == VREG); + /* + * If this is true, we have a grievous overflow bug and some size + * accounting has been messed with as having an amount to truncate at + * this size would imply that all of memory was used for this file. No + * matter how small the kernel, it will always need at least one page. + */ + pages = btopr(delta); + if (pages == 0 && delta != 0) + panic("tmpfs unsigned overflow detected"); + pbytes = ptob(pages); + anon_unresv_zone(delta, tm->tm_vfsp->vfs_zone); mutex_enter(&tm->tm_contents); - tm->tm_anonmem -= btopr(delta); + ASSERT(tm->tm_anonmem > tm->tm_anonmem - pbytes); + tm->tm_anonmem -= pbytes; mutex_exit(&tm->tm_contents); TRACE_2(TR_FAC_VM, TR_ANON_TMPFS, "anon tmpfs:%p %lu", tp, delta); @@ -154,6 +183,26 @@ tmpnode_growmap(struct tmpnode *tp, ulong_t newsize) } /* + * This is used to clean up a tmpnode that hasn't made it out the door. In other + * words, we allocated it and did a tmpnode_init; however, before it could get + * fully inserted into a directory, bad things happened and it failed. 
+ */ +void +tmpnode_cleanup(struct tmpnode *tp) +{ + rw_enter(&tp->tn_rwlock, RW_WRITER); + if ((tp->tn_type) == VDIR) { + tdirtrunc(tp); + } + mutex_enter(&tp->tn_tlock); + tp->tn_nlink = 0; + mutex_exit(&tp->tn_tlock); + gethrestime(&tp->tn_ctime); + rw_exit(&tp->tn_rwlock); + tmpnode_rele(tp); +} + +/* * Initialize a tmpnode and add it to file list under mount point. */ void @@ -232,7 +281,6 @@ tmpnode_trunc( { size_t oldsize = tp->tn_size; size_t delta; - struct vnode *vp = TNTOV(tp); timestruc_t now; int error = 0; @@ -316,7 +364,7 @@ tmpnode_trunc( /* Delete anon array for tmpnode */ ASSERT(tp->tn_nblocks == 0); ASSERT(anon_get_ptr(tp->tn_anon, 0) == NULL); - ASSERT(!vn_has_cached_data(vp)); + ASSERT(!vn_has_cached_data(TNTOV(tp))); anon_release(tp->tn_anon, tp->tn_asize); tp->tn_anon = NULL; diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index a7cf62cb99..24310fefe5 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -56,6 +56,15 @@ static int tmpfsfstype; /* + * tmpfs_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. With forced umount support, the + * filesystem module must not be allowed to go away before the last + * VFS_FREEVFS() call has been made. Since this is just an atomic counter, + * there's no need for locking. + */ +static uint32_t tmpfs_mountcount; + +/* * tmpfs vfs operations. 
*/ static int tmpfsinit(int, char *); @@ -65,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *); static int tmp_root(struct vfs *, struct vnode **); static int tmp_statvfs(struct vfs *, struct statvfs64 *); static int tmp_vget(struct vfs *, struct vnode **, struct fid *); +static void tmp_freevfs(vfs_t *vfsp); /* * Loadable module wrapper @@ -123,6 +133,14 @@ _fini() { int error; + /* + * If a forceably unmounted instance is still hanging around, we cannot + * allow the module to be unloaded because that would cause panics once + * the VFS framework decides it's time to call into VFS_FREEVFS(). + */ + if (tmpfs_mountcount) + return (EBUSY); + error = mod_remove(&modlinkage); if (error) return (error); @@ -141,14 +159,6 @@ _info(struct modinfo *modinfop) } /* - * The following are patchable variables limiting the amount of system - * resources tmpfs can use. - * - * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory - * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries) - * It is not determined by setting a hard limit but rather as a percentage of - * physical memory which is determined when tmpfs is first used in the system. - * * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for * the rest of the system. In other words, if the amount of free swap space * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs @@ -157,9 +167,7 @@ _info(struct modinfo *modinfop) * There is also a per mount limit on the amount of swap space * (tmount.tm_anonmax) settable via a mount option. 
*/ -size_t tmpfs_maxkmem = 0; size_t tmpfs_minfree = 0; -size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ static major_t tmpfs_major; static minor_t tmpfs_minor; @@ -178,6 +186,7 @@ tmpfsinit(int fstype, char *name) VFSNAME_ROOT, { .vfs_root = tmp_root }, VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs }, VFSNAME_VGET, { .vfs_vget = tmp_vget }, + VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs }, NULL, NULL }; int error; @@ -212,18 +221,12 @@ tmpfsinit(int fstype, char *name) tmpfs_minfree = btopr(TMPMINFREE); } - /* - * The maximum amount of space tmpfs can allocate is - * TMPMAXPROCKMEM percent of kernel memory - */ - if (tmpfs_maxkmem == 0) - tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); - if ((tmpfs_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); tmpfs_major = 0; } mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + tmpfs_mountcount = 0; return (0); } @@ -234,7 +237,7 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) struct tmpnode *tp; struct pathname dpn; int error; - pgcnt_t anonmax; + size_t anonmax; struct vattr rattr; int got_attrs; boolean_t mode_arg = B_FALSE; @@ -278,7 +281,18 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) if ((error = tmp_convnum(argstr, &anonmax)) != 0) goto out; } else { - anonmax = ULONG_MAX; + anonmax = SIZE_MAX; + } + + /* + * The "mode" mount argument allows the operator to override the + * permissions of the root of the tmpfs mount. 
+ */ + if (vfs_optionisset(vfsp, "mode", &argstr)) { + if ((error = tmp_convmode(argstr, &root_mode)) != 0) { + goto out; + } + mode_arg = B_TRUE; } /* @@ -311,7 +325,8 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) goto out; } - if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { + if ((tm = kmem_zalloc(sizeof (struct tmount), KM_NOSLEEP_LAZY)) == + NULL) { pn_free(&dpn); error = ENOMEM; goto out; @@ -343,17 +358,37 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) vfsp->vfs_bsize = PAGESIZE; vfsp->vfs_flag |= VFS_NOTRUNC; vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); - tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); + tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); (void) strcpy(tm->tm_mntpath, dpn.pn_path); /* + * Preemptively set vfs_zone before any of the tmp_kmem_* functions are + * called. That field is not populated until after a successful + * VFS_MOUNT when domount() sets vfsp metadata via vfs_add(). An + * accurate value is required for proper swap usage accounting. 
+ */ + ASSERT0(uap->flags & MS_REMOUNT); + ASSERT(vfsp->vfs_zone == NULL); + vfsp->vfs_zone = curproc->p_zone; + + /* * allocate and initialize root tmpnode structure */ bzero(&rattr, sizeof (struct vattr)); rattr.va_mode = (mode_t)(S_IFDIR | root_mode); rattr.va_type = VDIR; rattr.va_rdev = 0; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), KM_SLEEP); + if (tp == NULL) { + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + mutex_destroy(&tm->tm_contents); + mutex_destroy(&tm->tm_renamelck); + kmem_free(tm, sizeof (struct tmount)); + + pn_free(&dpn); + error = ENOMEM; + goto out; + } tmpnode_init(tm, tp, &rattr, cr); /* @@ -392,12 +427,34 @@ tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) tp->tn_nlink = 0; tm->tm_rootnode = tp; - tdirinit(tp, tp); + if (tdirinit(tp, tp) != 0) { + /* + * While we would normally let our VOP_INACTIVE function take + * care of cleaning up here, we're in a bit of a delicate + * situation, so we do so manually. While it's tempting to try + * and rely upon tmpfs_freevfs() and others, it's probably safer + * for the time to do this manually at the cost of duplication. 
+ */ + vn_invalid(TNTOV(tp)); + rw_destroy(&tp->tn_rwlock); + mutex_destroy(&tp->tn_tlock); + vn_free(TNTOV(tp)); + tmp_kmem_free(tm, tp, sizeof (struct tmpnode)); + + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + mutex_destroy(&tm->tm_contents); + mutex_destroy(&tm->tm_renamelck); + kmem_free(tm, sizeof (struct tmount)); + pn_free(&dpn); + error = ENOMEM; + goto out; + } rw_exit(&tp->tn_rwlock); pn_free(&dpn); error = 0; + atomic_inc_32(&tmpfs_mountcount); out: if (error == 0) @@ -413,36 +470,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) struct tmpnode *tnp, *cancel; struct vnode *vp; int error; + uint_t cnt; + int i; if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) return (error); - /* - * forced unmount is not supported by this file system - * and thus, ENOTSUP, is being returned. - */ - if (flag & MS_FORCE) - return (ENOTSUP); - mutex_enter(&tm->tm_contents); /* - * If there are no open files, only the root node should have - * a reference count. + * In the normal unmount case (non-forced unmount), if there are no + * open files, only the root node should have a reference count. + * * With tm_contents held, nothing can be added or removed. * There may be some dirty pages. To prevent fsflush from * disrupting the unmount, put a hold on each node while scanning. * If we find a previously referenced node, undo the holds we have * placed and fail EBUSY. + * + * However, in the case of a forced umount, things are a bit different. + * An additional VFS_HOLD is added for each outstanding VN_HOLD to + * ensure that the file system is not cleaned up (tmp_freevfs) until + * the last vfs hold is dropped. This happens in tmp_inactive as the + * vnodes are released. Also, we can't add an additional VN_HOLD in + * this case since that would prevent tmp_inactive from ever being + * called. Finally, we do need to drop the zone ref now (zone_rele_ref) + * so that the zone is not blocked waiting for the final file system + * cleanup. 
*/ tnp = tm->tm_rootnode; - if (TNTOV(tnp)->v_count > 1) { + + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + vfsp->vfs_flag |= VFS_UNMOUNTED; + /* Extra hold which we rele below when we drop the zone ref */ + VFS_HOLD(vfsp); + + for (i = 1; i < cnt; i++) + VFS_HOLD(vfsp); + + /* drop the mutex now because no one can find this mount */ + mutex_exit(&tm->tm_contents); + } else if (cnt > 1) { + mutex_exit(&vp->v_lock); mutex_exit(&tm->tm_contents); return (EBUSY); } + mutex_exit(&vp->v_lock); + /* + * Check for open files. An open file causes everything to unwind + * unless this is a forced umount. + */ for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { - if ((vp = TNTOV(tnp))->v_count > 0) { + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + for (i = 0; i < cnt; i++) + VFS_HOLD(vfsp); + + /* + * In the case of a forced umount don't add an + * additional VN_HOLD on the already held vnodes, like + * we do in the non-forced unmount case. If the + * cnt > 0, then the vnode already has at least one + * hold and we need tmp_inactive to get called when the + * last pre-existing hold on the node is released so + * that we can VFS_RELE the VFS holds we just added. + */ + if (cnt == 0) { + /* directly add VN_HOLD since have the lock */ + vp->v_count++; + } + + mutex_exit(&vp->v_lock); + + /* + * If the tmpnode has any pages associated with it + * (i.e. if it's a normal file with non-zero size), the + * tmpnode could still be discovered by pageout or + * fsflush via the page vnode pointers. To prevent this + * from interfering with the tmp_freevfs, truncate the + * tmpnode now. 
+ */ + if (tnp->tn_size != 0 && tnp->tn_type == VREG) { + rw_enter(&tnp->tn_rwlock, RW_WRITER); + rw_enter(&tnp->tn_contents, RW_WRITER); + + (void) tmpnode_trunc(tm, tnp, 0); + + rw_exit(&tnp->tn_contents); + rw_exit(&tnp->tn_rwlock); + + ASSERT(tnp->tn_size == 0); + ASSERT(tnp->tn_nblocks == 0); + } + } else if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); cancel = tm->tm_rootnode->tn_forw; while (cancel != tnp) { vp = TNTOV(cancel); @@ -452,14 +580,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) } mutex_exit(&tm->tm_contents); return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); } - VN_HOLD(vp); } - /* - * We can drop the mutex now because no one can find this mount - */ - mutex_exit(&tm->tm_contents); + if (flag & MS_FORCE) { + /* + * Drop the zone ref now since we don't know how long it will + * be until the final vfs_rele is called by tmp_inactive. + */ + if (vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, + ZONE_REF_VFS); + vfsp->vfs_zone = 0; + } + /* We can now drop the extra hold we added above. */ + VFS_RELE(vfsp); + } else { + /* + * For the non-forced case, we can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&tm->tm_contents); + } + + return (0); +} + +/* + * Implementation of VFS_FREEVFS() to support forced umounts. This is called by + * the vfs framework after umount and the last VFS_RELE, to trigger the release + * of any resources still associated with the given vfs_t. We only add + * additional VFS_HOLDs during the forced umount case, so this is normally + * called immediately after tmp_unmount. 
+ */ +void +tmp_freevfs(vfs_t *vfsp) +{ + struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); + struct tmpnode *tnp; + struct vnode *vp; /* * Free all kmemalloc'd and anonalloc'd memory associated with @@ -469,6 +633,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) * tmpnode_free which assumes that the directory entry has been * removed before the file. */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value. Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + /* * Remove all directory entries */ @@ -535,15 +709,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) ASSERT(tm->tm_mntpath); - tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); ASSERT(tm->tm_anonmem == 0); mutex_destroy(&tm->tm_contents); mutex_destroy(&tm->tm_renamelck); - tmp_memfree(tm, sizeof (struct tmount)); + kmem_free(tm, sizeof (struct tmount)); - return (0); + /* Allow _fini() to succeed now */ + atomic_dec_32(&tmpfs_mountcount); } /* @@ -605,18 +780,19 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * If tm_anonmax for this mount is less than the available swap space * (minus the amount tmpfs can't use), use that instead */ - if (blocks > tmpfs_minfree) + if (blocks > tmpfs_minfree && tm->tm_anonmax > tm->tm_anonmem) { sbp->f_bfree = MIN(blocks - tmpfs_minfree, - tm->tm_anonmax - tm->tm_anonmem); - else + btop(tm->tm_anonmax) - btopr(tm->tm_anonmem)); + } else { sbp->f_bfree = 0; + } sbp->f_bavail = sbp->f_bfree; /* * Total number of blocks is what's available plus what's been used */ - sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + tm->tm_anonmem); + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree + btopr(tm->tm_anonmem)); if (eff_zid != 
GLOBAL_ZONEUNIQID && zp->zone_max_swap_ctl != UINT64_MAX) { @@ -646,13 +822,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * available to tmpfs. This is fairly inaccurate since it doesn't * take into account the names stored in the directory entries. */ - if (tmpfs_maxkmem > tmp_kmemspace) - sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / - (sizeof (struct tmpnode) + sizeof (struct tdirent)); - else - sbp->f_ffree = 0; - - sbp->f_files = tmpfs_maxkmem / + sbp->f_ffree = sbp->f_files = ptob(availrmem) / (sizeof (struct tmpnode) + sizeof (struct tdirent)); sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); (void) cmpldev(&d32, vfsp->vfs_dev); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index a09f206d88..cbe19aefea 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2016 RackTop Systems. * Copyright (c) 2017 by Delphix. All rights reserved. @@ -586,6 +586,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support reading non-regular files */ @@ -615,6 +619,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. 
*/ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support writing to non-regular files */ @@ -788,8 +796,13 @@ tmp_setattr( rw_exit(&tp->tn_contents); rw_exit(&tp->tn_rwlock); - if (error == 0 && vap->va_size == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (vap->va_size == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } goto out1; } @@ -835,6 +848,9 @@ tmp_lookup( struct tmpnode *ntp = NULL; int error; + /* If the filesystem was umounted by force, return immediately. */ + if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); /* allow cd into @ dir */ if (flags & LOOKUP_XATTR) { @@ -853,6 +869,8 @@ tmp_lookup( rw_enter(&tp->tn_rwlock, RW_WRITER); if (tp->tn_xattrdp == NULL) { + int err; + if (!(flags & CREATE_XATTR_DIR)) { rw_exit(&tp->tn_rwlock); return (ENOENT); @@ -873,9 +891,13 @@ tmp_lookup( return (error); } - xdp = tmp_memalloc(sizeof (struct tmpnode), - TMP_MUSTHAVE); tm = VTOTM(dvp); + xdp = tmp_kmem_zalloc(tm, sizeof (struct tmpnode), + KM_SLEEP); + if (xdp == NULL) { + rw_exit(&tp->tn_rwlock); + return (ENOSPC); + } tmpnode_init(tm, xdp, &tp->tn_attr, NULL); /* * Fix-up fields unique to attribute directories. @@ -893,7 +915,16 @@ tmp_lookup( } xdp->tn_vnode->v_type = VDIR; xdp->tn_vnode->v_flag |= V_XATTRDIR; - tdirinit(tp, xdp); + if ((err = tdirinit(tp, xdp)) != 0) { + rw_exit(&tp->tn_rwlock); + /* + * This never got properly initialized so we can + * just clean it up. + */ + xdp->tn_vnode->v_flag &= V_XATTRDIR; + tmpnode_cleanup(tp); + return (err); + } tp->tn_xattrdp = xdp; } else { VN_HOLD(tp->tn_xattrdp->tn_vnode); @@ -1302,10 +1333,8 @@ tmp_rename( vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); /* * vnevent_rename_dest is called in tdirenter(). - * Notify the target dir if not same as source dir. 
*/ - if (ndvp != odvp) - vnevent_rename_dest_dir(ndvp, ct); + vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct); } done: @@ -1474,6 +1503,10 @@ tmp_readdir( int reclen; caddr_t outbuf; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (uiop->uio_loffset >= MAXOFF_T) { if (eofp) *eofp = 1; @@ -1607,12 +1640,12 @@ tmp_symlink( rw_exit(&parent->tn_rwlock); if (error) { - if (self) + if (self != NULL) tmpnode_rele(self); return (error); } len = strlen(tnm) + 1; - cp = tmp_memalloc(len, 0); + cp = tmp_kmem_zalloc(tm, len, KM_NOSLEEP_LAZY); if (cp == NULL) { tmpnode_rele(self); return (ENOSPC); @@ -1677,10 +1710,27 @@ top: * there's little to do -- just drop our hold. */ if (vp->v_count > 1 || tp->tn_nlink != 0) { - VN_RELE_LOCKED(vp); + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) { + /* + * Since the file system was forcibly unmounted, we can + * have a case (v_count == 1, tn_nlink != 0) where this + * file was open so we didn't add an extra hold on the + * file in tmp_unmount. We are counting on the + * interaction of the hold made in tmp_unmount and + * rele-ed in tmp_freevfs so we need to be sure we + * don't decrement in this case. 
+ */ + if (vp->v_count > 1) + VN_RELE_LOCKED(vp); + } else { + VN_RELE_LOCKED(vp); + } mutex_exit(&vp->v_lock); mutex_exit(&tp->tn_tlock); rw_exit(&tp->tn_rwlock); + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); return; } @@ -1705,7 +1755,7 @@ top: goto top; } if (tp->tn_type == VLNK) - tmp_memfree(tp->tn_symlink, tp->tn_size + 1); + tmp_kmem_free(tm, tp->tn_symlink, tp->tn_size + 1); } /* @@ -1739,7 +1789,11 @@ top: rw_destroy(&tp->tn_rwlock); mutex_destroy(&tp->tn_tlock); vn_free(TNTOV(tp)); - tmp_memfree(tp, sizeof (struct tmpnode)); + tmp_kmem_free(tm, tp, sizeof (struct tmpnode)); + + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); } /* ARGSUSED2 */ @@ -1861,6 +1915,10 @@ tmp_getapage( struct vnode *pvp; u_offset_t poff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (protp != NULL) *protp = PROT_ALL; again: @@ -2082,6 +2140,10 @@ tmp_putapage( u_offset_t offset; u_offset_t tmpoff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + ASSERT(PAGE_LOCKED(pp)); /* Kluster in tmp_klustsize chunks */ @@ -2342,8 +2404,13 @@ tmp_space( return (EFBIG); error = tmp_freesp(vp, bfp, flag); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } return (error); } diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c index c1e2c74a87..def046a0bf 100644 --- a/usr/src/uts/common/fs/udfs/udf_dir.c +++ b/usr/src/uts/common/fs/udfs/udf_dir.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. 
All rights reserved. */ #include <sys/types.h> @@ -562,9 +563,8 @@ out: namep, ctp); } - if (sdp != tdp) { - vnevent_rename_dest_dir(ITOV(tdp), ctp); - } + vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip), + namep, ctp); } /* diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c index 054056c63a..51ce9b28af 100644 --- a/usr/src/uts/common/fs/udfs/udf_vnops.c +++ b/usr/src/uts/common/fs/udfs/udf_vnops.c @@ -569,8 +569,11 @@ udf_setattr( goto update_inode; } - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } } /* * Change file access or modified times. @@ -1649,8 +1652,13 @@ udf_space( } else if ((error = convoff(vp, bfp, 0, offset)) == 0) { error = ud_freesp(vp, bfp, flag, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } return (error); diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c index 2be623f755..8aa961e340 100644 --- a/usr/src/uts/common/fs/ufs/ufs_vnops.c +++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c @@ -2084,8 +2084,13 @@ again: goto update_inode; } - if (error == 0 && vap->va_size) - vnevent_truncate(vp, ct); + if (error == 0) { + if (vap->va_size) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } if (ulp) { @@ -3610,12 +3615,7 @@ retry_firstlock: if (error == 0) { vnevent_rename_src(ITOV(sip), sdvp, snm, ct); - /* - * Notify the target directory of the rename event - * if source and target directories are not the same. 
- */ - if (sdvp != tdvp) - vnevent_rename_dest_dir(tdvp, ct); + vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct); } errout: @@ -4350,8 +4350,13 @@ ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, return (error); error = ufs_freesp(vp, bfp, flag, cr); - if (error == 0 && bfp->l_start == 0) - vnevent_truncate(vp, ct); + if (error == 0) { + if (bfp->l_start == 0) { + vnevent_truncate(vp, ct); + } else { + vnevent_resize(vp, ct); + } + } } else if (cmd == F_ALLOCSP) { error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_FALLOCATE_MASK); diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index 3cd2feebef..460d15bcbd 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -857,9 +857,11 @@ vfs_mountroot(void) for (p = practive; p != NULL; p = p->p_next) { ASSERT(p == &p0 || p->p_parent == &p0); + mutex_enter(&p->p_lock); PTOU(p)->u_cdir = rootdir; VN_HOLD(PTOU(p)->u_cdir); PTOU(p)->u_rdir = NULL; + mutex_exit(&p->p_lock); } mutex_exit(&pidlock); @@ -3883,6 +3885,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 4e73f7f6e6..953ee80471 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -25,6 +25,7 @@ * Copyright 2022 Spencer Evans-Cole. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 
*/ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -209,6 +210,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) @@ -849,6 +855,36 @@ vn_rele(vnode_t *vp) mutex_exit(&vp->v_lock); } +void +vn_phantom_rele(vnode_t *vp) +{ + mutex_enter(&vp->v_lock); + VERIFY3U(vp->v_count, >=, vp->v_phantom_count); + vp->v_phantom_count--; + DTRACE_PROBE1(vn__phantom_rele, vnode_t *, vp); + if (vp->v_count == 1) { + ASSERT0(vp->v_phantom_count); + mutex_exit(&vp->v_lock); + VOP_INACTIVE(vp, CRED(), NULL); + return; + } + VERIFY(vp->v_count > 0); + VN_RELE_LOCKED(vp); + mutex_exit(&vp->v_lock); +} + +/* + * Return the number of non-phantom holds. Things such as portfs will use + * phantom holds to prevent it from blocking filesystems from mounting over + * watched directories. + */ +uint_t +vn_count(vnode_t *vp) +{ + ASSERT(MUTEX_HELD(&vp->v_lock)); + return (vp->v_count - vp->v_phantom_count); +} + /* * Release a vnode referenced by the DNLC. Multiple DNLC references are treated * as a single reference, so v_count is not decremented until the last DNLC hold @@ -1130,7 +1166,20 @@ top: * Do remaining checks for FNOFOLLOW and FNOLINKS. */ if ((filemode & FNOFOLLOW) && vp->v_type == VLNK) { - error = ELOOP; + /* + * The __FLXPATH flag is a private interface for use by the lx + * brand in order to emulate open(O_NOFOLLOW|O_PATH) which, + * when a symbolic link is encountered, returns a file + * descriptor which references it. + * See uts/common/brand/lx/syscall/lx_open.c + * + * When this flag is set, VOP_OPEN() is not called (for a + * symlink, most filesystems will return ENOSYS anyway) + * and the link's vnode is returned to be linked to the + * file descriptor. 
+ */ + if ((filemode & __FLXPATH) == 0) + error = ELOOP; goto out; } if (filemode & FNOLINKS) { @@ -2441,6 +2490,7 @@ vn_reinit(vnode_t *vp) { vp->v_count = 1; vp->v_count_dnlc = 0; + vp->v_phantom_count = 0; vp->v_vfsp = NULL; vp->v_stream = NULL; vp->v_vfsmountedhere = NULL; @@ -2497,6 +2547,7 @@ vn_free(vnode_t *vp) */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); ASSERT(vp->v_count_dnlc == 0); + ASSERT0(vp->v_phantom_count); VERIFY(vp->v_path != NULL); if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); @@ -2587,6 +2638,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) if (vp == NULL || vp->v_femhead == NULL) { return; } + (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct); (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); } @@ -2601,12 +2653,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, } void -vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) +vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name, + caller_context_t *ct) { if (vp == NULL || vp->v_femhead == NULL) { return; } - (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); + (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct); } void @@ -2693,6 +2746,15 @@ vnevent_truncate(vnode_t *vp, caller_context_t *ct) (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct); } +void +vnevent_resize(vnode_t *vp, caller_context_t *ct) +{ + if (vp == NULL || vp->v_femhead == NULL) { + return; + } + (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct); +} + /* * Vnode accessors. 
*/ @@ -3468,14 +3530,58 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start = 0, lat; + ssize_t len; + int err; + + if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) && + vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (start != 0) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3487,14 +3593,63 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start = 0, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the 
"waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. + */ + if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) && + vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (start != 0) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c index b2004f3d42..b841a8f38e 100644 --- a/usr/src/uts/common/fs/zfs/abd.c +++ b/usr/src/uts/common/fs/zfs/abd.c @@ -171,7 +171,10 @@ int zfs_abd_scatter_min_size = 512 * 3; * it at runtime would cause ABD iteration to work incorrectly for ABDs which * were allocated with the old size, so a safeguard has been put in place which * will 
cause the machine to panic if you change it and try to access the data - * within a scattered ABD. + * within a scattered ABD. Note that tuning this value to be smaller than the + * page size can induce heavy fragmentation in the slab layer, which may itself + * result in more memory waste than is saved by the smaller chunk size -- and + * will induce more computational work in the slab layer. Tune with caution! */ size_t zfs_abd_chunk_size = 4096; diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index bf8b77f268..12b5872cdc 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -284,6 +284,7 @@ #include <sys/vdev.h> #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/zio_checksum.h> #include <sys/multilist.h> #include <sys/abd.h> @@ -349,7 +350,7 @@ int arc_grow_retry = 60; int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ -int zfs_arc_overflow_shift = 8; +int zfs_arc_overflow_shift = 3; /* shift of arc_c for calculating both min and max arc_p */ int arc_p_min_shift = 4; @@ -6112,6 +6113,14 @@ top: if (hash_lock != NULL) mutex_exit(hash_lock); + /* + * At this point, this read I/O has already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. 
+ */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); @@ -7168,6 +7177,10 @@ arc_init(void) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; + /* On larger-memory machines, we clamp the minimum at 1GB */ + if (zfs_arc_min == 0) + arc_c_min = MIN(arc_c_min, (1 << 30)); + if (zfs_arc_meta_min > 0) { arc_meta_min = zfs_arc_meta_min; } else { diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index f610268bf4..38c4a83cb1 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -1125,8 +1125,17 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); if (bonuslen < max_bonuslen) bzero(db->db.db_data, max_bonuslen); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + if (bonuslen) { + /* + * Absent byzantine on-disk corruption, we fully expect + * our bonuslen to be no more than max_bonuslen -- + * but we nonetheless explicitly clamp it on the bcopy() + * to prevent any on-disk corruption from becoming + * rampant in-kernel corruption. + */ + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + MIN(bonuslen, max_bonuslen)); + } DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index b7135df3fa..d91a48e2ca 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -22,7 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright 2016 RackTop Systems. 
* Copyright (c) 2014 Integros [integros.com] diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 53d5765bcb..6cb39d61a5 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -39,11 +39,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -213,6 +213,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 02cad5f98e..c3d24abb3d 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -43,6 +43,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include <sys/zfeature.h> #include <sys/policy.h> #include <sys/zfs_znode.h> @@ -1413,7 +1414,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, * locks are held. 
*/ txg_delay(dd->dd_pool, tx->tx_txg, - MSEC2NSEC(10), MSEC2NSEC(10)); + zfs_zone_txg_delay(), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index bc6f9aff77..d3901c6f79 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -44,6 +44,7 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> #include <sys/vdev_impl.h> #include <sys/metaslab_impl.h> #include <sys/bptree.h> @@ -905,7 +906,7 @@ dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) } ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; - ASSERT3U(dp->dp_dirty_total, >=, space); + VERIFY3U(dp->dp_dirty_total, >=, space); dsl_pool_dirty_delta(dp, -space); mutex_exit(&dp->dp_lock); } diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 68733f47c1..4828824b10 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -23,6 +23,7 @@ * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] + * Copyright 2019 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. */ @@ -71,6 +72,11 @@ int zfs_metaslab_sm_blksz_with_log = (1 << 17); int zfs_condense_pct = 200; /* + * Never condense any space map. This is for debugging/recovery only. + */ +int zfs_condense_never = 0; + +/* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. 
In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksize), so a metaslab might use the @@ -863,6 +869,7 @@ metaslab_group_activate(metaslab_group_t *mg) { metaslab_class_t *mc = mg->mg_class; metaslab_group_t *mgprev, *mgnext; + char kstat_name[KSTAT_STRLEN]; ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); @@ -887,6 +894,33 @@ metaslab_group_activate(metaslab_group_t *mg) mgprev->mg_next = mg; mgnext->mg_prev = mg; } + + /* Create a kstat to monitor the loading and unloading of metaslabs. */ + (void) snprintf(kstat_name, sizeof (kstat_name), "%llx", + (unsigned long long) mg->mg_vd->vdev_guid); + + mutex_init(&mg->mg_kstat_lock, NULL, MUTEX_DEFAULT, NULL); + if ((mg->mg_kstat = kstat_create("zfs_metaslab_group", 0, + kstat_name, "misc", KSTAT_TYPE_NAMED, + sizeof (metaslab_group_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + + metaslab_group_kstat_t *mg_kstat = kmem_zalloc( + sizeof (metaslab_group_kstat_t), KM_SLEEP); + kstat_named_init(&mg_kstat->mg_loads, "loads", + KSTAT_DATA_UINT64); + kstat_named_init(&mg_kstat->mg_unloads, "unloads", + KSTAT_DATA_UINT64); + kstat_named_init(&mg_kstat->mg_spa_name, "spa_name", + KSTAT_DATA_STRING); + kstat_named_setstr(&mg_kstat->mg_spa_name, + mg->mg_vd->vdev_spa->spa_name); + + mg->mg_kstat->ks_data = mg_kstat; + mg->mg_kstat->ks_lock = &mg->mg_kstat_lock; + kstat_install(mg->mg_kstat); + } + mc->mc_rotor = mg; } @@ -963,6 +997,14 @@ metaslab_group_passivate(metaslab_group_t *mg) mg->mg_prev = NULL; mg->mg_next = NULL; + + if (mg->mg_kstat != NULL) { + metaslab_group_kstat_t *data = mg->mg_kstat->ks_data; + + kstat_delete(mg->mg_kstat); + kmem_free(data, sizeof (metaslab_group_kstat_t)); + } + mutex_destroy(&mg->mg_kstat_lock); } boolean_t @@ -2400,6 +2442,7 @@ metaslab_load_impl(metaslab_t *msp) int metaslab_load(metaslab_t *msp) { + kstat_t *ksp; ASSERT(MUTEX_HELD(&msp->ms_lock)); /* @@ -2412,6 +2455,12 @@ metaslab_load(metaslab_t *msp) 
VERIFY(!msp->ms_loading); ASSERT(!msp->ms_condensing); + ksp = msp->ms_group->mg_kstat; + if (ksp != NULL) { + metaslab_group_kstat_t *mg_ksp = ksp->ks_data; + atomic_inc_64(&mg_ksp->mg_loads.value.ui64); + } + /* * We set the loading flag BEFORE potentially dropping the lock to * wait for an ongoing flush (see ms_flushing below). This way other @@ -4290,12 +4339,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If the metaslab is loaded and we've not tried to load or allocate - * from it in 'metaslab_unload_delay' txgs, then unload it. + * from it in 'metaslab_unload_delay' txgs, then we normally unload it. */ if (msp->ms_loaded && msp->ms_disabled == 0 && msp->ms_selected_txg + metaslab_unload_delay < txg) { - for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { VERIFY0(range_tree_space( msp->ms_allocating[(txg + t) & TXG_MASK])); @@ -4539,8 +4587,6 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); msp->ms_allocating_total += size; - /* Track the last successful allocation */ - msp->ms_alloc_txg = txg; metaslab_verify_space(msp, txg); } diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index d6e230fbb4..db3317e4cd 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -1961,6 +1961,12 @@ spa_check_for_missing_logs(spa_t *spa) if (idx > 0) { spa_load_failed(spa, "some log devices are missing"); vdev_dbgmsg_print_tree(rvd, 2); + + /* Save the timestamp of the last completed txg. 
*/ + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, + spa->spa_last_ubsync_txg_ts) == 0); + return (SET_ERROR(ENXIO)); } } else { @@ -1969,10 +1975,21 @@ spa_check_for_missing_logs(spa_t *spa) if (tvd->vdev_islog && tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + nvlist_t *rewind_info = fnvlist_alloc(); + spa_set_log_state(spa, SPA_LOG_CLEAR); spa_load_note(spa, "some log devices are " "missing, ZIL is dropped."); vdev_dbgmsg_print_tree(rvd, 2); + + VERIFY(nvlist_add_uint64(rewind_info, + ZPOOL_CONFIG_LOAD_TIME, + spa->spa_uberblock.ub_timestamp) == 0); + + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_INFO, + rewind_info) == 0); + break; } } diff --git a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h index bec7bdef2e..6adc8fa14e 100644 --- a/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/metaslab_impl.h @@ -283,8 +283,17 @@ struct metaslab_group { boolean_t mg_disabled_updating; kmutex_t mg_ms_disabled_lock; kcondvar_t mg_ms_disabled_cv; + + kstat_t *mg_kstat; + kmutex_t mg_kstat_lock; }; +typedef struct metaslab_group_kstat { + kstat_named_t mg_loads; + kstat_named_t mg_unloads; + kstat_named_t mg_spa_name; +} metaslab_group_kstat_t; + /* * This value defines the number of elements in the ms_lbas array. The value * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. 
@@ -491,7 +500,6 @@ struct metaslab { hrtime_t ms_unload_time; /* time last unloaded */ hrtime_t ms_selected_time; /* time last allocated from */ - uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ uint64_t ms_max_size; /* maximum allocatable size */ /* diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index d542368e7c..d760127ed9 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -151,6 +151,7 @@ struct vdev_queue { avl_tree_t vq_write_offset_tree; avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; + zoneid_t vq_last_zone_id; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; }; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..f1431b3f55 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern hrtime_t zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t, + avl_tree_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index d03106b942..7592614d6d 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -394,8 +394,14 @@ typedef int zio_pipe_stage_t(zio_t *zio); * the reexecute flags are protected by io_lock, modifiable by children, * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ -#define ZIO_REEXECUTE_NOW 0x01 -#define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_REEXECUTE_NOW 0x01 +#define ZIO_REEXECUTE_SUSPEND 0x02 +#define ZIO_REEXECUTE_NO_SUSPEND 0x04 + +#define ZIO_SHOULD_REEXECUTE(x) \ + ((x)->io_reexecute & ZIO_REEXECUTE_NOW || \ + ((x)->io_reexecute & ZIO_REEXECUTE_SUSPEND && \ + (((x)->io_reexecute & ZIO_REEXECUTE_NO_SUSPEND) == 0))) /* * The io_trim flags are used to specify the type of TRIM to perform. 
They @@ -465,6 +471,7 @@ struct zio { hrtime_t io_timestamp; hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). */ @@ -500,6 +507,7 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index a8670dcaa8..a99e581737 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -32,6 +32,7 @@ #include <sys/dsl_scan.h> #include <sys/zil.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * ZFS Transaction Groups @@ -535,6 +536,8 @@ txg_sync_thread(void *arg) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index cd5e80d769..228529d9fe 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -28,6 +28,7 @@ */ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_impl.h> @@ -165,6 +166,8 @@ vdev_disk_off_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data __unused) { vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; /* * Ignore events other than offline. 
@@ -764,6 +767,7 @@ static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; if (vd->vdev_reopening || dvd == NULL) return; @@ -1028,6 +1032,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* * In general we would expect ldi_strategy() to return non-zero only * because of programming errors, but we've also seen this fail shortly @@ -1044,6 +1050,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 4c6515c43d..b40126cac0 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. 
*/ /* @@ -35,6 +36,7 @@ #include <sys/zio.h> #include <sys/avl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/metaslab_impl.h> #include <sys/abd.h> @@ -145,7 +147,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 1; +uint32_t zfs_vdev_async_write_min_active = 3; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; @@ -274,6 +276,8 @@ vdev_queue_init(vdev_t *vd) vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + vq->vq_last_zone_id = 0; + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); @@ -318,6 +322,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_enqueue(zio); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -334,6 +339,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_dequeue(zio); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -732,7 +738,11 @@ again: search.io_timestamp = 0; search.io_offset = vq->vq_last_offset - 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); +#ifdef _KERNEL + zio = zfs_zone_schedule(vq, p, idx, tree); +#else zio = avl_nearest(tree, idx, AVL_AFTER); +#endif if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); @@ -890,9 +900,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) spa_t *spa = zio->io_spa; zio_priority_t oldpri = zio->io_priority; + zfs_zone_zio_dequeue(zio); 
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); zio->io_priority = priority; avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + zfs_zone_zio_enqueue(zio); mutex_enter(&spa->spa_iokstat_lock); ASSERT3U(spa->spa_queue_stats[oldpri].spa_queued, >, 0); diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index f479ea9f30..b74baf46ea 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -696,9 +696,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * Check permissions for special properties. */ switch (prop) { + case ZFS_PROP_DEDUP: case ZFS_PROP_ZONED: /* - * Disallow setting of 'zoned' from within a local zone. + * Disallow setting these properties from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); @@ -1022,6 +1023,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; + if (secpolicy_fs_import(cr) != 0) + return (set_errno(EPERM)); + if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); @@ -2162,7 +2166,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -2180,7 +2185,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -2207,11 +2213,24 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != (uintptr_t)NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + 
zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { - error = zfs_ioc_objset_stats_impl(zc, os); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); } @@ -2406,8 +2425,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != (uintptr_t)NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? ESRCH : error); @@ -2437,8 +2469,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -3148,6 +3182,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -3194,8 +3229,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -3204,13 +3240,15 @@ 
zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); @@ -6591,7 +6629,8 @@ error: static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void -zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, +zfs_ioctl_register_legacy(const char *name, zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { @@ -6602,6 +6641,7 @@ zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); + vec->zvec_name = name; vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; @@ -6645,7 +6685,7 @@ zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } @@ -6653,14 +6693,15 @@ static void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void 
-zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) +zfs_ioctl_register_pool_modify(const char *name, zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func) { - zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, + zfs_ioctl_register_legacy(name, ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } @@ -6668,7 +6709,7 @@ static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } @@ -6676,7 +6717,7 @@ static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(NULL, ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } @@ -6688,10 +6729,10 @@ zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) } static void -zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, - zfs_secpolicy_func_t *secpolicy) +zfs_ioctl_register_dataset_modify(const char *name, zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { - zfs_ioctl_register_legacy(ioc, func, secpolicy, + zfs_ioctl_register_legacy(name, ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } @@ -6838,34 +6879,35 @@ zfs_ioctl_init(void) /* IOCTLS that use the legacy function signature */ - zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, - zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); + zfs_ioctl_register_legacy("pool_freeze", ZFS_IOC_POOL_FREEZE, + zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, + POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); - 
zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, + zfs_ioctl_register_pool_modify("pool_scan", ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, + zfs_ioctl_register_pool_modify("pool_upgrade", ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, + zfs_ioctl_register_pool_modify("vdev_add", ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, + zfs_ioctl_register_pool_modify("vdev_remove", ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, + zfs_ioctl_register_pool_modify("vdev_set_state", ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, + zfs_ioctl_register_pool_modify("vdev_attach", ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, + zfs_ioctl_register_pool_modify("vdev_detach", ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, + zfs_ioctl_register_pool_modify("vdev_setpath", ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, + zfs_ioctl_register_pool_modify("vdev_setfru", ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, + zfs_ioctl_register_pool_modify("pool_set_props", ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); - zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, + zfs_ioctl_register_pool_modify("vdev_split", ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); - zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, + zfs_ioctl_register_pool_modify("pool_reguid", ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, @@ -6943,20 +6985,20 @@ zfs_ioctl_init(void) zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); - zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, 
zfs_ioc_set_prop, - zfs_secpolicy_none); - zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, - zfs_secpolicy_destroy); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, - zfs_secpolicy_rename); - zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, + zfs_ioctl_register_dataset_modify("set_prop", ZFS_IOC_SET_PROP, + zfs_ioc_set_prop, zfs_secpolicy_none); + zfs_ioctl_register_dataset_modify("destroy", ZFS_IOC_DESTROY, + zfs_ioc_destroy, zfs_secpolicy_destroy); + zfs_ioctl_register_dataset_modify("rename", ZFS_IOC_RENAME, + zfs_ioc_rename, zfs_secpolicy_rename); + zfs_ioctl_register_dataset_modify("recv", ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); - zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, - zfs_secpolicy_promote); - zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, + zfs_ioctl_register_dataset_modify("promote", ZFS_IOC_PROMOTE, + zfs_ioc_promote, zfs_secpolicy_promote); + zfs_ioctl_register_dataset_modify("inherit_prop", ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); - zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, - zfs_secpolicy_set_fsacl); + zfs_ioctl_register_dataset_modify("set_fsacl", ZFS_IOC_SET_FSACL, + zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); @@ -7333,7 +7375,32 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) nvlist_free(outnvl); } else { + spa_t *spa; + uint64_t orig_cookie = zc->zc_cookie; + error = vec->zvec_legacy_func(zc); + + if (error == 0 && vec->zvec_allow_log && + vec->zvec_name != NULL && + spa_open(zc->zc_name, &spa, FTAG) == 0) { + nvlist_t *lognv = NULL; + char *msg; + uint_t len = strlen(vec->zvec_name) + + strlen(zc->zc_name) + 128; + + msg = kmem_alloc(len, KM_SLEEP); + + lognv = fnvlist_alloc(); + (void) snprintf(msg, len, + "%s pool: %s cookie: %lu guid: %lx", 
vec->zvec_name, + zc->zc_name, orig_cookie, zc->zc_guid); + fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, msg); + + (void) spa_history_log_nvl(spa, lognv); + spa_close(spa, FTAG); + fnvlist_free(lognv); + kmem_free(msg, len); + } } out: diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 288dc93e3c..95a2be6239 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -23,7 +23,7 @@ * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. - * Copyright 2019 Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. * Copyright 2022 Oxide Computer Company @@ -1917,7 +1917,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) mutex_enter(&mvp->v_lock); if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + (vn_count(mvp) != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (SET_ERROR(EBUSY)); } @@ -2342,6 +2342,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + /* + * If we're doing a forced unmount on a dataset which still has + * references and is in a zone, then we need to cleanup the zone + * reference at this point or else the zone will never be able to + * shutdown. 
+ */ + if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS); + vfsp->vfs_zone = NULL; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 1ee01c9146..dd58b4a549 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -847,6 +847,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -901,17 +912,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else - uio_prefaultpages(MIN(n, max_blksz), uio); - - /* * If in append mode, set the io offset pointer to eof. 
*/ locked_range_t *lr; @@ -1147,9 +1147,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) break; ASSERT(tx_bytes == nbytes); n -= nbytes; - - if (!xuio && n > 0) - uio_prefaultpages(MIN(n, max_blksz), uio); } rangelock_exit(lr); @@ -3164,8 +3161,11 @@ top: return (err); } - if (vap->va_size == 0) + if (vap->va_size == 0) { vnevent_truncate(ZTOV(zp), ct); + } else { + vnevent_resize(ZTOV(zp), ct); + } } if (mask & (AT_ATIME|AT_MTIME) || @@ -4173,9 +4173,7 @@ top: if (error == 0) { vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); - /* notify the target dir if it is not the same as source dir */ - if (tdvp != sdvp) - vnevent_rename_dest_dir(tdvp, ct); + vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct); } out: if (zl != NULL) @@ -5265,8 +5263,13 @@ zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag, error = zfs_freesp(zp, off, len, flag, TRUE); - if (error == 0 && off == 0 && len == 0) - vnevent_truncate(ZTOV(zp), ct); + if (error == 0 && len == 0) { + if (off == 0) { + vnevent_truncate(ZTOV(zp), ct); + } else { + vnevent_resize(ZTOV(zp), ct); + } + } ZFS_EXIT(zfsvfs); return (error); diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c new file mode 100644 index 0000000000..f151595095 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_zone.c @@ -0,0 +1,1419 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018, Joyent, Inc. All rights reserved. + */ + +/* + * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to + * ZFS I/O resources for each zone. 
+ * + * I/O contention can be a major pain point on a multi-tenant system. A single + * zone can issue a stream of I/O operations, usually synchronous writes, which + * disrupt I/O performance for all other zones. This problem is further + * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG, + * a set of blocks which are atomically synced to disk. The process of + * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving + * out any pending read operations. + * + * There are two facets to this capability: the throttle and the scheduler. + * + * Throttle + * + * The requirements on the throttle are: + * + * 1) Ensure consistent and predictable I/O latency across all zones. + * 2) Sequential and random workloads have very different characteristics, + * so it is a non-starter to track IOPS or throughput. + * 3) A zone should be able to use the full disk bandwidth if no other zone + * is actively using the disk. + * + * The throttle has two components: one to track and account for each zone's + * I/O requests, and another to throttle each zone's operations when it + * exceeds its fair share of disk I/O. When the throttle detects that a zone is + * consuming more than is appropriate, each read or write system call is + * delayed by up to 100 microseconds, which we've found is sufficient to allow + * other zones to interleave I/O requests during those delays. + * + * Note: The throttle will delay each logical I/O (as opposed to the physical + * I/O which will likely be issued asynchronously), so it may be easier to + * think of the I/O throttle delaying each read/write syscall instead of the + * actual I/O operation. For each zone, the throttle tracks an ongoing average + * of read and write operations performed to determine the overall I/O + * utilization for each zone.
+ * + * The throttle calculates an I/O utilization metric for each zone using the + * following formula: + * + * (# of read syscalls) x (Average read latency) + + * (# of write syscalls) x (Average write latency) + * + * Once each zone has its utilization metric, the I/O throttle will compare I/O + * utilization across all zones, and if a zone has a higher-than-average I/O + * utilization, system calls from that zone are throttled. That is, if one + * zone has a much higher utilization, that zone's delay is increased by 5 + * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is + * already throttled and has a lower utilization than average, its delay will + * be lowered by 5 microseconds. + * + * The throttle calculation is driven by I/O activity, but since I/O does not + * happen at fixed intervals, timestamps are used to track when the last update + * was made and to drive recalculation. + * + * The throttle recalculates each zone's I/O usage and throttle delay (if any) + * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as + * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval. + * + * Scheduler + * + * The I/O scheduler manages the vdev queues – the queues of pending I/Os to + * issue to the disks. It only makes scheduling decisions for the two + * synchronous I/O queues (read & write). + * + * The scheduler maintains how many I/Os in the queue are from each zone, and + * if one zone has a disproportionately large number of I/Os in the queue, the + * scheduler will allow certain I/Os from the underutilized zones to be "bumped" + * and pulled from the middle of the queue. This bump allows zones with a small + * number of I/Os (so small they may not even be taken into account by the + * throttle) to complete quickly instead of waiting behind dozens of I/Os from + * other zones.
+ */ + +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zfs_zone.h> + +#ifndef _KERNEL + +/* + * Stubs for when compiling for user-land. + */ + +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ +} + +void +zfs_zone_zio_init(zio_t *zp) +{ +} + +void +zfs_zone_zio_start(zio_t *zp) +{ +} + +void +zfs_zone_zio_done(zio_t *zp) +{ +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ +} + +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ +} + +hrtime_t +zfs_zone_txg_delay() +{ + return (MSEC2NSEC(10)); +} + +#else + +/* + * The real code. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/atomic.h> +#include <sys/zio.h> +#include <sys/zone.h> +#include <sys/avl.h> +#include <sys/sdt.h> +#include <sys/ddi.h> + +/* + * The zone throttle delays read and write operations from certain zones based + * on each zone's IO utilization. Once a cycle (defined by zfs_zone_cycle_time + * below), the delays for each zone are recalculated based on the utilization + * over the previous window. + */ +boolean_t zfs_zone_delay_enable = B_TRUE; /* enable IO throttle */ +uint8_t zfs_zone_delay_step = 5; /* usec amnt to change delay */ +uint8_t zfs_zone_delay_ceiling = 100; /* usec delay max */ + +boolean_t zfs_zone_priority_enable = B_TRUE; /* enable IO priority */ + +/* + * For certain workloads, one zone may be issuing primarily sequential I/O and + * another primarily random I/O. The sequential I/O will complete much more + * quickly than the random I/O, driving the average system latency for those + * operations way down. As a result, the random I/O may be throttled back, even + * though the sequential I/O should be throttled to allow the random I/O more + * access to the disk. + * + * This tunable limits the discrepancy between the read and write system + * latency.
+ * If one becomes excessively high, this tunable prevents the I/O
+ * throttler from exacerbating the imbalance.
+ *
+ * The value is a ratio: when zone utilization is computed (see
+ * zfs_zone_wait_adjust()), neither average latency is allowed to exceed the
+ * other by more than this factor.
+ */
+uint_t		zfs_zone_rw_lat_limit = 10;
+
+/*
+ * The I/O throttle will only start delaying zones when it detects disk
+ * utilization has reached a certain level. This tunable controls the
+ * threshold at which the throttle will start delaying zones. When the number
+ * of vdevs is small, the calculation should correspond closely with the %b
+ * column from iostat -- but as the number of vdevs becomes large, it will
+ * correlate less and less to any single device (therefore making it a poor
+ * approximation for the actual I/O utilization on such systems). We
+ * therefore use our derived utilization conservatively: we know that low
+ * derived utilization does indeed correlate to low I/O use -- but that a high
+ * rate of derived utilization does not necessarily alone denote saturation;
+ * where we see a high rate of utilization, we also look for laggard I/Os to
+ * attempt to detect saturation.
+ *
+ * Both values are percentages and are compared against the disk utilization
+ * figure computed in zfs_zone_wait_adjust().
+ */
+uint_t		zfs_zone_util_threshold = 80;
+uint_t		zfs_zone_underutil_threshold = 60;
+
+/*
+ * There are three important tunables here: zfs_zone_laggard_threshold denotes
+ * the threshold at which an I/O is considered to be of notably high latency;
+ * zfs_zone_laggard_recent denotes the number of microseconds before the
+ * current time after which the last laggard is considered to be sufficiently
+ * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
+ * the microseconds before the current time before which the last laggard is
+ * considered to be sufficiently old to merit decreasing the throttle. The
+ * most important tunable of these three is the zfs_zone_laggard_threshold: in
+ * modeling data from a large public cloud, this tunable was found to have a
+ * much greater effect on the throttle than the two time-based thresholds.
+ * This must be set high enough to not result in spurious throttling, but not
+ * so high as to allow pathological I/O to persist in the system.
+ */
+uint_t		zfs_zone_laggard_threshold = 50000;	/* 50 ms */
+uint_t		zfs_zone_laggard_recent = 1000000;	/* 1000 ms */
+uint_t		zfs_zone_laggard_ancient = 5000000;	/* 5000 ms */
+
+/*
+ * Throughout this subsystem, our timestamps are in microseconds. Our system
+ * average cycle is one second or 1 million microseconds. Our zone counter
+ * update cycle is two seconds or 2 million microseconds. We use a longer
+ * duration for that cycle because some ops can see a little over two seconds of
+ * latency when they are being starved by another zone.
+ */
+uint_t		zfs_zone_sys_avg_cycle = 1000000;	/* 1 s */
+uint_t		zfs_zone_cycle_time = 2000000;		/* 2 s */
+
+/*
+ * How often the I/O throttle will reevaluate each zone's utilization, in
+ * microseconds. Default is 1/4 sec.
+ */
+uint_t		zfs_zone_adjust_time = 250000;		/* 250 ms */
+
+typedef struct {
+	hrtime_t	cycle_start;	/* usec time the current cycle began */
+	hrtime_t	cycle_lat;	/* sum of latencies seen this cycle */
+	hrtime_t	sys_avg_lat;	/* decayed average latency */
+	uint_t		cycle_cnt;	/* number of ops seen this cycle */
+} sys_lat_cycle_t;
+
+typedef struct {
+	hrtime_t	zi_now;		/* usec time of this adjustment pass */
+	uint_t		zi_avgrlat;	/* system average read latency */
+	uint_t		zi_avgwlat;	/* system average write latency */
+	uint64_t	zi_totpri;	/* sum of I/O priority, active zones */
+	uint64_t	zi_totutil;	/* sum of per-zone I/O utilization */
+	int		zi_active;	/* zones with non-zero utilization */
+	uint_t		zi_diskutil;	/* derived disk utilization (percent) */
+	boolean_t	zi_underutil;	/* I/O judged underutilized */
+	boolean_t	zi_overutil;	/* I/O judged overutilized */
+} zoneio_stats_t;
+
+static sys_lat_cycle_t	rd_lat;		/* system-wide read latency */
+static sys_lat_cycle_t	wr_lat;		/* system-wide write latency */
+
+/*
+ * Some basic disk stats to determine disk utilization. The utilization info
+ * for all disks on the system is aggregated into these values.
+ *
+ * Overall disk utilization for the current cycle is calculated as:
+ *
+ * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
+ * ----------------------------------------------
+ *    ((now - zfs_zone_last_checked) * 1000);
+ */
+kmutex_t	zfs_disk_lock;		/* protects the following: */
+uint_t		zfs_disk_rcnt;		/* Number of outstanding IOs */
+hrtime_t	zfs_disk_rtime = 0; /* cumulative sum of time performing IO */
+hrtime_t	zfs_disk_rlastupdate = 0; /* time of last IO start or finish */
+
+hrtime_t	zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
+/* time that we last updated per-zone throttle info */
+kmutex_t	zfs_last_check_lock;	/* protects zfs_zone_last_checked */
+hrtime_t	zfs_zone_last_checked = 0;
+hrtime_t	zfs_disk_last_laggard = 0; /* usec time of last laggard IO */
+
+/*
+ * Data used to keep track of how often txg sync is running.
+ */
+extern int	zfs_txg_timeout;
+static uint_t	txg_last_check;		/* time (sec) the rate was last taken */
+static uint_t	txg_cnt;		/* syncs seen in the current interval */
+static uint_t	txg_sync_rate;		/* rate from the previous interval */
+
+boolean_t	zfs_zone_schedule_enable = B_TRUE;	/* enable IO sched. */
+/*
+ * Threshold for when zio scheduling should kick in.
+ *
+ * This threshold is based on the zfs_vdev_sync_read_max_active value for the
+ * number of I/Os that can be pending on a device. If there are more than the
+ * max_active ops already queued up, beyond those already issued to the vdev,
+ * then use zone-based scheduling to get the next synchronous zio.
+ */
+uint32_t	zfs_zone_schedule_thresh = 10;
+
+/*
+ * On each pass of the scheduler we increment the zone's weight (up to this
+ * maximum). The weight is used by the scheduler to prevent starvation so
+ * that zones which haven't been able to do any IO over many iterations
+ * will max out their weight to this value.
+ */
+#define	SCHED_WEIGHT_MAX	20
+
+/*
+ * Tunables for delay throttling when TXG sync is occurring.
+ *
+ * If the zone is performing a write and we're doing above normal TXG syncing,
+ * then throttle for longer than normal.
+ * The zone's wait time is multiplied by the scale
+ * (zfs_zone_txg_throttle_scale).
+ */
+int		zfs_zone_txg_throttle_scale = 2;
+hrtime_t	zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
+
+/*
+ * Scratch state for one scheduling pass (see get_sched_pri_cb()).
+ */
+typedef struct {
+	int		zq_qdepth;	/* queue depth, pre-scaled by caller */
+	zio_priority_t	zq_queue;	/* which queue is being examined */
+	int		zq_priority;	/* best schedule priority so far */
+	int		zq_wt;		/* weight of the leading zone */
+	zoneid_t	zq_zoneid;	/* leading zone; 0 if none chosen */
+} zone_q_bump_t;
+
+/*
+ * This uses gethrtime() but returns a value in usecs.
+ *
+ * The macro argument is parenthesized so that an expression argument such as
+ * NANO_TO_MICRO(a - b) converts the whole expression rather than only its
+ * last operand.
+ */
+#define	GET_USEC_TIME		(gethrtime() / 1000)
+#define	NANO_TO_MICRO(x)	((x) / (NANOSEC / MICROSEC))
+
+/*
+ * Keep track of the zone's ZFS IOPs.
+ *
+ * See the comment on the zfs_zone_io_throttle function for which/how IOPs are
+ * accounted for.
+ *
+ * If the number of ops is >1 then we can just use that value. However,
+ * if the number of ops is <2 then we might have a zone which is trying to do
+ * IO but is not able to get any ops through the system. We don't want to lose
+ * track of this zone so we factor in its decayed count into the current count.
+ *
+ * Each cycle (zfs_zone_cycle_time) we want to update the decayed count.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last update
+ * was made. If it was more than one cycle ago, then we need to decay the
+ * historical count by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return a time delta indicating how far into the current cycle we are or 0
+ * if the last IO was more than a cycle ago. Note that 0 is also returned
+ * when unow equals the cycle start time exactly.
+ */
+static hrtime_t
+compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+	hrtime_t delta;
+	int	gen_cnt;
+
+	/*
+	 * Check if it's time to recompute a new zone count.
+	 * If we're still collecting data for the current cycle, report how
+	 * far into that cycle we are and leave the counters untouched.
+	 */
+	delta = unow - cp->cycle_start;
+	if (delta < zfs_zone_cycle_time)
+		return (delta);
+
+	/* A previous cycle is past, compute the new zone count. */
+
+	/*
+	 * Figure out how many generations we have to decay the historical
+	 * count, since multiple cycles may have elapsed since our last IO.
+	 * We depend on int rounding here.
+	 */
+	gen_cnt = (int)(delta / zfs_zone_cycle_time);
+
+	/* If more than 5 cycles since the last IO, reset count. */
+	if (gen_cnt > 5) {
+		cp->zone_avg_cnt = 0;
+	} else {
+		/* Update the count. */
+		int	i;
+
+		/*
+		 * If the zone did more than 1 IO, just use its current count
+		 * as the historical value, otherwise decay the historical
+		 * count and factor that into the new historical count. We
+		 * pick a threshold > 1 so that we don't lose track of IO due
+		 * to int rounding.
+		 */
+		if (cp->cycle_cnt > 1)
+			cp->zone_avg_cnt = cp->cycle_cnt;
+		else
+			cp->zone_avg_cnt = cp->cycle_cnt +
+			    (cp->zone_avg_cnt / 2);
+
+		/*
+		 * If more than one generation has elapsed since the last
+		 * update, decay the values further.
+		 */
+		for (i = 1; i < gen_cnt; i++)
+			cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
+	}
+
+	/* A new cycle begins. */
+	cp->cycle_start = unow;
+	cp->cycle_cnt = 0;
+
+	return (0);
+}
+
+/*
+ * Add IO op data to the zone.
+ *
+ * The counter matching op is rolled over to a new cycle if necessary and then
+ * incremented, all under the zone's zpers_zfs_lock. Nothing is recorded if
+ * the zone has no ZFS I/O data (zpers_zfsp is NULL) or op is not one of the
+ * three types handled below.
+ */
+static void
+add_zone_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+	zone_zfs_io_t *iop;
+
+	mutex_enter(&zpd->zpers_zfs_lock);
+	iop = zpd->zpers_zfsp;
+	if (iop == NULL) {
+		mutex_exit(&zpd->zpers_zfs_lock);
+		return;
+	}
+
+	switch (op) {
+	case ZFS_ZONE_IOP_READ:
+		(void) compute_historical_zone_cnt(unow, &iop->zpers_rd_ops);
+		iop->zpers_rd_ops.cycle_cnt++;
+		break;
+	case ZFS_ZONE_IOP_WRITE:
+		(void) compute_historical_zone_cnt(unow, &iop->zpers_wr_ops);
+		iop->zpers_wr_ops.cycle_cnt++;
+		break;
+	case ZFS_ZONE_IOP_LOGICAL_WRITE:
+		(void) compute_historical_zone_cnt(unow, &iop->zpers_lwr_ops);
+		iop->zpers_lwr_ops.cycle_cnt++;
+		break;
+	}
+	mutex_exit(&zpd->zpers_zfs_lock);
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * Recent activity should dominate the average, but once activity drops off
+ * or stops the average must fall quickly toward the new level.
+ *
+ * The decayed average is meant to be refreshed once per cycle
+ * (zfs_zone_sys_avg_cycle). Because this code only runs when IO happens, and
+ * IO is not periodic, a timestamp records when the last refresh took place.
+ * When more than one cycle has gone by, the average is decayed once more for
+ * every additional cycle in which no IO was seen.
+ *
+ * Returns B_TRUE when a new system average was computed, and B_FALSE when we
+ * are still inside an active cycle and there was nothing to do.
+ */
+static boolean_t
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+	hrtime_t elapsed = unow - cp->cycle_start;
+	int	ngen;
+
+	/* Still gathering data for the current cycle. */
+	if (elapsed < zfs_zone_sys_avg_cycle)
+		return (B_FALSE);
+
+	/*
+	 * At least one full cycle has gone by. Count the whole cycles that
+	 * elapsed; truncation by the integer division is intended.
+	 */
+	ngen = (int)(elapsed / zfs_zone_sys_avg_cycle);
+
+	if (ngen <= 5) {
+		int	gen;
+
+		/* Fold the finished cycle into the running average. */
+		cp->sys_avg_lat =
+		    (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
+
+		/* Halve once more for each extra idle generation. */
+		for (gen = ngen; gen > 1; gen--)
+			cp->sys_avg_lat /= 2;
+	} else {
+		/* Idle for more than 5 cycles: forget the old average. */
+		cp->sys_avg_lat = 0;
+	}
+
+	/* Start accumulating the next cycle. */
+	cp->cycle_start = unow;
+	cp->cycle_cnt = 0;
+	cp->cycle_lat = 0;
+
+	return (B_TRUE);
+}
+
+/*
+ * Record one physical read or write of latency lat in the matching
+ * system-wide latency cycle. Any other op type is ignored.
+ */
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+	sys_lat_cycle_t *cyc;
+
+	if (op == ZFS_ZONE_IOP_READ)
+		cyc = &rd_lat;
+	else if (op == ZFS_ZONE_IOP_WRITE)
+		cyc = &wr_lat;
+	else
+		return;
+
+	(void) compute_new_sys_avg(unow, cyc);
+	atomic_inc_uint(&cyc->cycle_cnt);
+	atomic_add_64((uint64_t *)&cyc->cycle_lat, (int64_t)lat);
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+	hrtime_t into_cycle = compute_historical_zone_cnt(unow, cp);
+
+	/*
+	 * A zero delta means there is no activity in the current cycle to
+	 * look at; the historical count is all we have.
+	 */
+	if (into_cycle == 0)
+		return (cp->zone_avg_cnt);
+
+	/*
+	 * Early in the cycle (before the half-way point) the current count
+	 * is topped up with half of the historical count.
+	 */
+	if (into_cycle < (zfs_zone_cycle_time / 2))
+		return (cp->cycle_cnt + (cp->zone_avg_cnt / 2));
+
+	/* Past the half-way point the current count stands on its own. */
+	return (cp->cycle_cnt);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static uint_t
+calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+	/*
+	 * A cycle just rolled over, so nothing has been accumulated in the
+	 * new one yet; the freshly computed historical average is the answer.
+	 */
+	if (compute_new_sys_avg(unow, cp))
+		return (cp->sys_avg_lat);
+
+	/*
+	 * Mid-cycle: blend the historical average with the current cycle's
+	 * data, giving the current data eight times the weight.
+	 */
+	DTRACE_PROBE3(zfs__zone__calc__wt__avg,
+	    uintptr_t, cp->sys_avg_lat,
+	    uintptr_t, cp->cycle_lat,
+	    uintptr_t, cp->cycle_cnt);
+
+	return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
+	    (1 + (cp->cycle_cnt * 8)));
+}
+
+/*
+ * Account for the current IOP on the zone and for the system as a whole.
+ * The latency parameter is in usecs.
+ */
+static void
+add_iop(zone_persist_t *zpd, hrtime_t unow, zfs_zone_iop_type_t op,
+    hrtime_t lat)
+{
+	/* Every op is charged to the zone. */
+	add_zone_iop(zpd, unow, op);
+
+	/* Logical writes do not contribute to the system latency. */
+	if (op == ZFS_ZONE_IOP_LOGICAL_WRITE)
+		return;
+
+	add_sys_iop(unow, op, lat);
+}
+
+/*
+ * Compute the zone's read, write and logical write op counts and hand them
+ * back through rops, wops and lwops. The return value is non-zero if the
+ * zone has issued any operation of any type, and 0 otherwise.
+ */
+static int
+get_zone_io_cnt(hrtime_t unow, zone_zfs_io_t *zpd, uint_t *rops, uint_t *wops,
+    uint_t *lwops)
+{
+	uint_t nread, nwrite, nlogical;
+
+	ASSERT3P(zpd, !=, NULL);
+
+	nread = calc_zone_cnt(unow, &zpd->zpers_rd_ops);
+	nwrite = calc_zone_cnt(unow, &zpd->zpers_wr_ops);
+	nlogical = calc_zone_cnt(unow, &zpd->zpers_lwr_ops);
+
+	*rops = nread;
+	*wops = nwrite;
+	*lwops = nlogical;
+
+	DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zpd,
+	    uintptr_t, nread, uintptr_t, nwrite, uintptr_t, nlogical);
+
+	/* Any set bit in any of the three counts means "active". */
+	return (nread | nwrite | nlogical);
+}
+
+/*
+ * Fetch the system-wide average read and write latencies, in usecs.
+ */
+static void
+get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
+{
+	uint_t rd = calc_avg_lat(unow, &rd_lat);
+	uint_t wr = calc_avg_lat(unow, &wr_lat);
+
+	/*
+	 * IO cannot really complete in zero time, and a zero latency would
+	 * hurt the accuracy of the throttling algorithm. When nothing has
+	 * been observed, substitute a lower bound of 1000 usecs; observed
+	 * non-zero latencies are used as they are.
+	 */
+	*rlat = (rd != 0) ? rd : 1000;
+	*wlat = (wr != 0) ? wr : 1000;
+
+	DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
+	    uintptr_t, *wlat);
+}
+
+/*
+ * Find disk utilization for each zone and average utilization for all active
+ * zones.
+ */
+static int
+zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
+{
+	zoneio_stats_t *sp = arg;
+	uint_t rops, wops, lwops;
+	zone_persist_t *zpd = &zone_pdata[zonep->zone_id];
+	zone_zfs_io_t *iop;
+
+	/*
+	 * Fetch the zone's I/O data under its lock and skip the zone if it
+	 * has none, as add_zone_iop() and get_sched_pri_cb() do, rather than
+	 * relying on an assertion that is compiled out of non-DEBUG kernels.
+	 * The global zone and zones with no recent ops are skipped too.
+	 */
+	mutex_enter(&zpd->zpers_zfs_lock);
+	iop = zpd->zpers_zfsp;
+	if (iop == NULL || zonep->zone_id == GLOBAL_ZONEID ||
+	    get_zone_io_cnt(sp->zi_now, iop, &rops, &wops, &lwops) == 0) {
+		mutex_exit(&zpd->zpers_zfs_lock);
+		return (0);
+	}
+
+	/*
+	 * Logical writes are weighted by the average write latency, the
+	 * same as physical writes.
+	 */
+	iop->zpers_io_util = (rops * sp->zi_avgrlat) + (wops * sp->zi_avgwlat) +
+	    (lwops * sp->zi_avgwlat);
+	sp->zi_totutil += iop->zpers_io_util;
+
+	if (iop->zpers_io_util > 0) {
+		sp->zi_active++;
+		sp->zi_totpri += iop->zpers_zfs_io_pri;
+	}
+
+	/*
+	 * sdt:::zfs-zone-utilization
+	 *
+	 *	arg0: zone ID
+	 *	arg1: read operations observed during time window
+	 *	arg2: physical write operations observed during time window
+	 *	arg3: logical write ops observed during time window
+	 *	arg4: calculated utilization given read and write ops
+	 *	arg5: I/O priority assigned to this zone
+	 */
+	DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
+	    uint_t, rops, uint_t, wops, uint_t, lwops,
+	    uint64_t, iop->zpers_io_util, uint16_t, iop->zpers_zfs_io_pri);
+
+	mutex_exit(&zpd->zpers_zfs_lock);
+
+	return (0);
+}
+
+/*
+ * Raise the zone's delay by one step, never going past the ceiling.
+ *
+ * Both the step and the ceiling are tunables, so the ceiling need not be a
+ * multiple of the step; the final increment is clamped to the ceiling
+ * instead of overshooting it.
+ */
+static void
+zfs_zone_delay_inc(zone_zfs_io_t *zpd)
+{
+	ASSERT3P(zpd, !=, NULL);
+
+	if (zpd->zpers_io_delay < zfs_zone_delay_ceiling) {
+		uint_t room = zfs_zone_delay_ceiling - zpd->zpers_io_delay;
+
+		if (room < zfs_zone_delay_step)
+			zpd->zpers_io_delay = zfs_zone_delay_ceiling;
+		else
+			zpd->zpers_io_delay += zfs_zone_delay_step;
+	}
+}
+
+/*
+ * Lower the zone's delay by one step, never going below zero.
+ *
+ * If the remaining delay is no larger than the step (possible once the step
+ * tunable has been changed), the delay is set to zero rather than being
+ * allowed to wrap around.
+ */
+static void
+zfs_zone_delay_dec(zone_zfs_io_t *zpd)
+{
+	ASSERT3P(zpd, !=, NULL);
+
+	if (zpd->zpers_io_delay > zfs_zone_delay_step)
+		zpd->zpers_io_delay -= zfs_zone_delay_step;
+	else
+		zpd->zpers_io_delay = 0;
+}
+
+/*
+ * For all zones "far enough" away from the average utilization, increase that
+ * zone's delay. Otherwise, reduce its delay.
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+	zoneio_stats_t *stats = arg;
+	zone_persist_t *zpd = &zone_pdata[zonep->zone_id];
+	zone_zfs_io_t *iop = zpd->zpers_zfsp;
+	uint_t share = 0;
+	uint8_t prev_delay;
+
+	ASSERT3P(iop, !=, NULL);
+
+	mutex_enter(&zpd->zpers_zfs_lock);
+	prev_delay = iop->zpers_io_delay;
+	iop->zpers_io_util_above_avg = 0;
+
+	/*
+	 * Work out this zone's fair share of the total utilization: weighted
+	 * by I/O priority when priorities are enabled and in use, otherwise
+	 * an even split between the active zones.
+	 */
+	if (zfs_zone_priority_enable && stats->zi_totpri > 0) {
+		share = (stats->zi_totutil * iop->zpers_zfs_io_pri) /
+		    stats->zi_totpri;
+	} else if (stats->zi_active > 0) {
+		share = stats->zi_totutil / stats->zi_active;
+	}
+
+	/*
+	 * A zone above its share while I/O is overutilized is flagged, and
+	 * its delay grows provided it is not the only active zone. A zone
+	 * below its share, or any zone while I/O is underutilized or at most
+	 * one zone is active, has its delay reduced.
+	 */
+	if (stats->zi_overutil && iop->zpers_io_util > share) {
+		iop->zpers_io_util_above_avg = 1;
+
+		if (stats->zi_active > 1)
+			zfs_zone_delay_inc(iop);
+	} else if (stats->zi_active <= 1 || stats->zi_underutil ||
+	    iop->zpers_io_util < share) {
+		zfs_zone_delay_dec(iop);
+	}
+
+	/*
+	 * sdt:::zfs-zone-throttle
+	 *
+	 *	arg0: zone ID
+	 *	arg1: old delay for this zone
+	 *	arg2: new delay for this zone
+	 *	arg3: calculated fair I/O utilization
+	 *	arg4: actual I/O utilization
+	 */
+	DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
+	    uintptr_t, prev_delay, uintptr_t, iop->zpers_io_delay,
+	    uintptr_t, share, uintptr_t, iop->zpers_io_util);
+
+	mutex_exit(&zpd->zpers_zfs_lock);
+
+	return (0);
+}
+
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
+ */
+static void
+zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
+{
+	zoneio_stats_t stats;
+	hrtime_t since_laggard;
+
+	(void) bzero(&stats, sizeof (stats));
+
+	stats.zi_now = unow;
+	get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
+
+	/*
+	 * Keep the two averages within a factor of zfs_zone_rw_lat_limit of
+	 * each other by pulling the larger one down.
+	 */
+	if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit) {
+		stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
+	} else if (stats.zi_avgwlat >
+	    stats.zi_avgrlat * zfs_zone_rw_lat_limit) {
+		stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
+	}
+
+	if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
+		return;
+
+	/*
+	 * Derive the disk utilization over the period that just ended. No
+	 * figure is available on the first pass or if no time has elapsed.
+	 */
+	if (zfs_disk_last_rtime != 0 && unow - last_checked > 0) {
+		stats.zi_diskutil =
+		    ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
+		    ((unow - last_checked) * 1000);
+	} else {
+		stats.zi_diskutil = 0;
+	}
+	zfs_disk_last_rtime = zfs_disk_rtime;
+
+	/* Time since the last laggard I/O, or 0 if it is not in the past. */
+	since_laggard = (unow > zfs_disk_last_laggard) ?
+	    unow - zfs_disk_last_laggard : 0;
+
+	/*
+	 * I/O performance is classed as overutilized, underutilized, or
+	 * neither; having a middle state keeps the throttle from porpoising.
+	 * The throttle is incremented when a zone uses more than its fair
+	 * share _and_ I/O is overutilized, and decremented when a zone uses
+	 * less than its fair share _or_ I/O is underutilized.
+	 */
+	stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold ||
+	    since_laggard > zfs_zone_laggard_ancient;
+
+	stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold &&
+	    since_laggard < zfs_zone_laggard_recent;
+
+	/*
+	 * sdt:::zfs-zone-stats
+	 *
+	 * Statistics observed over the last period:
+	 *
+	 *	arg0: average system read latency
+	 *	arg1: average system write latency
+	 *	arg2: number of active zones
+	 *	arg3: total I/O 'utilization' for all zones
+	 *	arg4: total I/O priority of all active zones
+	 *	arg5: calculated disk utilization
+	 */
+	DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat,
+	    uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active,
+	    uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri,
+	    uintptr_t, stats.zi_diskutil);
+
+	(void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats);
+}
+
+/*
+ * zone_walk() callback that computes a zone's IO schedule priority.
+ *
+ * Only zones with ops in the queue being examined are considered; of those,
+ * the zone whose calculation yields the highest schedule priority is
+ * remembered in the zone_q_bump_t passed as arg.
+ */
+static int
+get_sched_pri_cb(zone_t *zonep, void *arg)
+{
+	zone_q_bump_t *bump = arg;
+	zone_persist_t *zpd = &zone_pdata[zonep->zone_id];
+	zone_zfs_io_t *iop;
+	uint_t queued;
+	int score;
+
+	mutex_enter(&zpd->zpers_zfs_lock);
+	iop = zpd->zpers_zfsp;
+	if (iop != NULL) {
+		queued = iop->zpers_zfs_queued[bump->zq_queue];
+		if (queued == 0) {
+			/* Nothing waiting: the zone is not being starved. */
+			iop->zpers_zfs_weight = 0;
+		} else {
+			/*
+			 * The weight grows on every pass, up to
+			 * SCHED_WEIGHT_MAX, and is reset whenever an IO is
+			 * issued for the zone. A zone that has gone several
+			 * passes without any IO therefore ends up with the
+			 * maximum weight, which guards against starvation.
+			 */
+			if (iop->zpers_zfs_weight < SCHED_WEIGHT_MAX)
+				iop->zpers_zfs_weight++;
+
+			/*
+			 * The schedule priority is the inverse of the zone's
+			 * share of the queue, times the zone's configured
+			 * priority, times its weight. The caller has already
+			 * scaled the queue depth by 10 to limit the effect
+			 * of int rounding.
+			 *
+			 * Zones with few IOs queued are therefore preferred
+			 * unless another zone's configured priority or
+			 * accumulated weight pulls it ahead.
+			 */
+			score = (bump->zq_qdepth / queued) *
+			    iop->zpers_zfs_io_pri * iop->zpers_zfs_weight;
+
+			/* A strictly better score takes over the lead. */
+			if (score > bump->zq_priority) {
+				bump->zq_zoneid = zonep->zone_id;
+				bump->zq_priority = score;
+				bump->zq_wt = iop->zpers_zfs_weight;
+			}
+		}
+	}
+	mutex_exit(&zpd->zpers_zfs_lock);
+
+	return (0);
+}
+
+/*
+ * See if we need to bump a zone's zio to the head of the queue. This is only
+ * done on the two synchronous I/O queues (see the block comment on the
+ * zfs_zone_schedule function). We get the correct vdev_queue_class_t and
+ * queue depth from our caller.
+ *
+ * For single-threaded synchronous processes a zone cannot get more than
+ * 1 op into the queue at a time unless the zone is running multiple processes
+ * in parallel. This can cause an imbalance in performance if there are zones
+ * with many parallel processes (and ops in the queue) vs. other zones which
+ * are doing simple single-threaded processes, such as interactive tasks in the
+ * shell. These zones can get backed up behind a deep queue and their IO
+ * performance will appear to be very poor as a result. This can make the
+ * zone work badly for interactive behavior.
+ *
+ * The scheduling algorithm kicks in once we start to get a deeper queue.
+ * Once that occurs, we look at all of the zones to see which one calculates
+ * to the highest priority.
+ * We bump that zone's first zio to the head of the queue.
+ *
+ * A per-zone counter lets us see how many ops each zone has queued without
+ * searching the queue itself. That scales better, since the number of zones
+ * is expected to be on the order of 10-100 whereas the queue depth can be in
+ * the range of 50-2000. Moreover, the zio's in the queue carry only a zoneid,
+ * so scanning the queue would mean a zone lookup for every zio enqueued and
+ * a much higher overhead on each pass.
+ *
+ * Whenever no zone's zio can be selected, the zio at the head of the queue is
+ * returned instead.
+ */
+static zio_t *
+get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p,
+    avl_tree_t *tree)
+{
+	zone_q_bump_t qbump;
+	zio_t *head, *pick = NULL;
+	int skipped = 0;
+
+	/* The queue depth is scaled by 10 to limit int rounding problems. */
+	qbump.zq_queue = p;
+	qbump.zq_qdepth = qdepth * 10;
+	qbump.zq_priority = 0;
+	qbump.zq_zoneid = 0;
+	(void) zone_walk(get_sched_pri_cb, &qbump);
+
+	head = avl_first(tree);
+
+	/*
+	 * A zone ID of 0 means the walk did not choose a zone. Otherwise
+	 * look for the first queued zio that belongs to the chosen zone,
+	 * counting how many zio's are passed over on the way.
+	 */
+	if (qbump.zq_zoneid != 0) {
+		pick = head;
+		while (pick != NULL && pick->io_zoneid != qbump.zq_zoneid) {
+			skipped++;
+			pick = avl_walk(tree, pick, AVL_AFTER);
+		}
+	}
+
+	/* No zone chosen, or none of its zio's found: take the head. */
+	if (pick == NULL)
+		return (head);
+
+	/*
+	 * The probe fires only when the selection differs from the zio that
+	 * was already at the head of the queue.
+	 */
+	if (pick != head) {
+		DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, pick->io_zoneid,
+		    uint_t, skipped, int, qbump.zq_priority, int, qbump.zq_wt);
+	}
+
+	return (pick);
+}
+
+/*
+ * Add our zone ID to the zio so we can keep track of which zones are doing
+ * what, even when the current thread processing the zio is not associated
+ * with the zone (e.g. the kernel taskq which pushes out TX groups).
+ */
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+	/* Stamp the zio with the zone of the thread creating it. */
+	zp->io_zoneid = curzone->zone_id;
+}
+
+/*
+ * Track and throttle IO operations per zone. Called from:
+ *	- dmu_tx_count_write for (logical) write ops (both dataset and zvol
+ *	  writes go through this path)
+ *	- arc_read for read ops that miss the ARC (both dataset and zvol)
+ * For each operation, increment that zone's counter based on the type of
+ * operation, then delay the operation, if necessary.
+ *
+ * There are three basic ways that we can see write ops:
+ * 1) An application does write syscalls. Those ops go into a TXG which
+ *    we'll count here. Sometime later a kernel taskq thread (we'll see the
+ *    vdev IO as zone 0) will perform some number of physical writes to commit
+ *    the TXG to disk. Those writes are not associated with the zone which
+ *    made the write syscalls and the number of operations is not correlated
+ *    between the taskq and the zone. We only see logical writes in this
+ *    function, we see the physical writes in the zfs_zone_zio_start and
+ *    zfs_zone_zio_done functions.
+ * 2) An application opens a file with O_SYNC. Each write will result in
+ *    an operation which we'll see here plus a low-level vdev write from
+ *    that zone.
+ * 3) An application does write syscalls followed by an fsync(). We'll
+ *    count the writes going into a TXG here. We'll also see some number
+ *    (usually much smaller, maybe only 1) of low-level vdev writes from this
+ *    zone when the fsync is performed, plus some other low-level vdev writes
+ *    from the taskq in zone 0 (are these metadata writes?).
+ *
+ * 4) In addition to the above, there are misc. system-level writes, such as
+ *    writing out dirty pages to swap, or sync(2) calls, which will be handled
+ *    by the global zone and which we count but don't generally worry about.
+ *
+ * Because of the above, we can see writes twice; first because this function
+ * is always called by a zone thread for logical writes, but then we also will
+ * count the physical writes that are performed at a low level when they
+ * complete in zfs_zone_zio_done. Without this, it can look like a non-global
+ * zone never writes (case 1). Depending on when the TXG is synced, the counts
+ * may be in the same sample bucket or in a different one.
+ *
+ * Tracking read operations is simpler due to their synchronous semantics. The
+ * zfs_read function -- called as a result of a read(2) syscall -- will always
+ * retrieve the data to be read through arc_read and we only come into this
+ * function when we have an arc miss.
+ *
+ * Nothing is delayed when zfs_zone_delay_enable is clear, when the zone has
+ * no ZFS I/O data, or when the zone's I/O priority is zero.
+ */
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+	zoneid_t zid = curzone->zone_id;
+	zone_persist_t *zpd = &zone_pdata[zid];
+	zone_zfs_io_t *iop;
+	hrtime_t unow;
+	uint16_t wait;
+
+	unow = GET_USEC_TIME;
+
+	/*
+	 * Only bump the counter for logical writes here. The counters for
+	 * tracking physical IO operations are handled in zfs_zone_zio_done.
+	 * Logical writes carry no latency, hence the 0.
+	 */
+	if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) {
+		add_iop(zpd, unow, type, 0);
+	}
+
+	if (!zfs_zone_delay_enable)
+		return;
+
+	mutex_enter(&zpd->zpers_zfs_lock);
+	iop = zpd->zpers_zfsp;
+	if (iop == NULL) {
+		mutex_exit(&zpd->zpers_zfs_lock);
+		return;
+	}
+
+	/*
+	 * If the zone's I/O priority is set to zero, don't throttle that zone's
+	 * operations at all.
+	 */
+	if (iop->zpers_zfs_io_pri == 0) {
+		mutex_exit(&zpd->zpers_zfs_lock);
+		return;
+	}
+
+	/*
+	 * Handle periodically updating the per-zone I/O parameters.
+	 *
+	 * The first comparison is made without zfs_last_check_lock held, so
+	 * it is only a hint that an update may be due.
+	 */
+	if ((unow - zfs_zone_last_checked) > zfs_zone_adjust_time) {
+		hrtime_t last_checked;
+		boolean_t do_update = B_FALSE;
+
+		/*
+		 * Recheck under mutex. Only the thread that advances
+		 * zfs_zone_last_checked goes on to perform the update.
+		 */
+		mutex_enter(&zfs_last_check_lock);
+		last_checked = zfs_zone_last_checked;
+		if ((unow - last_checked) > zfs_zone_adjust_time) {
+			zfs_zone_last_checked = unow;
+			do_update = B_TRUE;
+		}
+		mutex_exit(&zfs_last_check_lock);
+
+		if (do_update) {
+			mutex_exit(&zpd->zpers_zfs_lock);
+
+			zfs_zone_wait_adjust(unow, last_checked);
+
+			mutex_enter(&zpd->zpers_zfs_lock);
+			iop = zpd->zpers_zfsp;
+			if (iop == NULL) {
+				mutex_exit(&zpd->zpers_zfs_lock);
+				return;
+			}
+		}
+	}
+
+	wait = iop->zpers_io_delay;
+	mutex_exit(&zpd->zpers_zfs_lock);
+
+	if (wait > 0) {
+		/*
+		 * If this is a write and we're doing above normal TXG
+		 * syncing, then throttle for longer than normal.
+		 */
+		if (type == ZFS_ZONE_IOP_LOGICAL_WRITE &&
+		    (txg_cnt > 1 || txg_sync_rate > 1))
+			wait *= zfs_zone_txg_throttle_scale;
+
+		/*
+		 * sdt:::zfs-zone-wait
+		 *
+		 *	arg0: zone ID
+		 *	arg1: type of IO operation
+		 *	arg2: time to delay (in us)
+		 */
+		DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zid,
+		    uintptr_t, type, uintptr_t, wait);
+
+		drv_usecwait(wait);
+
+		if (curzone->zone_vfs_stats != NULL) {
+			atomic_inc_64(&curzone->zone_vfs_stats->
+			    zv_delay_cnt.value.ui64);
+			atomic_add_64(&curzone->zone_vfs_stats->
+			    zv_delay_time.value.ui64, wait);
+		}
+	}
+}
+
+/*
+ * XXX Ignore the pool pointer parameter for now.
+ *
+ * Keep track to see if the TXG sync rate is running above the expected rate.
+ * If so, this implies that we are filling TXG's at a high rate due to a heavy
+ * write workload. We use this as input into the zone throttle.
+ *
+ * This function is called every 5 seconds (zfs_txg_timeout) under a normal
+ * write load. In this case, the sync rate is going to be 1.
+ * When there is a heavy write load, TXG's fill up fast and the sync thread
+ * will write the TXG more frequently (perhaps once a second). In this case
+ * the rate will be > 1. The sync rate is a lagging indicator since it can be
+ * up to 5 seconds old. We use the txg_cnt to keep track of the rate in the
+ * current 5 second interval and txg_sync_rate to keep track of the previous
+ * 5 second interval. In that way we don't have a period (1 or more seconds)
+ * where the txg_cnt == 0 and we cut back on throttling even though the rate
+ * is still high.
+ */
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+	uint_t now;
+
+	txg_cnt++;
+	now = (uint_t)(gethrtime() / NANOSEC);	/* whole seconds */
+	if ((now - txg_last_check) >= zfs_txg_timeout) {
+		/* The interval is over: publish its rate and start anew. */
+		txg_sync_rate = txg_cnt / 2;
+		txg_cnt = 0;
+		txg_last_check = now;
+	}
+}
+
+/*
+ * Return the TXG delay, in nanoseconds, for the current zone:
+ * zfs_zone_txg_delay_nsec if the zone was last flagged as being above its
+ * fair share of I/O utilization, 10 milliseconds if it was not, and 0 if the
+ * zone has no ZFS I/O data.
+ *
+ * Takes no arguments; spelled (void) so the definition is a real prototype
+ * rather than an old-style unspecified parameter list.
+ */
+hrtime_t
+zfs_zone_txg_delay(void)
+{
+	zone_persist_t *zpd = &zone_pdata[curzone->zone_id];
+	zone_zfs_io_t *iop;
+	uint8_t above;
+
+	mutex_enter(&zpd->zpers_zfs_lock);
+	iop = zpd->zpers_zfsp;
+	if (iop == NULL) {
+		mutex_exit(&zpd->zpers_zfs_lock);
+		return (0);
+	}
+
+	above = iop->zpers_io_util_above_avg;
+	mutex_exit(&zpd->zpers_zfs_lock);
+
+	if (above) {
+		return (zfs_zone_txg_delay_nsec);
+	}
+
+	return (MSEC2NSEC(10));
+}
+
+/*
+ * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline
+ * and is issued.
+ * Keep track of start time for latency calculation in zfs_zone_zio_done.
+ */
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+	zone_persist_t *zpd = &zone_pdata[zp->io_zoneid];
+	zone_zfs_io_t *iop;
+
+	/*
+	 * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for
+	 * an actual I/O operation. Ignore those operations as they relate to
+	 * throttling and scheduling.
+	 */
+	if (zp->io_type == ZIO_TYPE_IOCTL)
+		return;
+
+	mutex_enter(&zpd->zpers_zfs_lock);
+	iop = zpd->zpers_zfsp;
+	if (iop != NULL) {
+		if (zp->io_type == ZIO_TYPE_READ)
+			kstat_runq_enter(&iop->zpers_zfs_rwstats);
+		/* An IO was issued for this zone: reset its sched weight. */
+		iop->zpers_zfs_weight = 0;
+	}
+	mutex_exit(&zpd->zpers_zfs_lock);
+
+	mutex_enter(&zfs_disk_lock);
+	zp->io_dispatched = gethrtime();
+
+	/*
+	 * Busy time is only accumulated if at least one other IO was already
+	 * outstanding; otherwise the disks were idle since the last update.
+	 */
+	if (zfs_disk_rcnt++ != 0)
+		zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate);
+	zfs_disk_rlastupdate = zp->io_dispatched;
+	mutex_exit(&zfs_disk_lock);
+}
+
+/*
+ * Called from vdev_disk_io_done when an IO completes.
+ * Increment our counter for zone ops.
+ * Calculate the IO latency avg. for this zone.
+ */
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+	zone_persist_t *zpd;
+	zone_zfs_io_t *iop;
+	hrtime_t now, unow, udelta;
+
+	if (zp->io_type == ZIO_TYPE_IOCTL)
+		return;
+
+	if (zp->io_dispatched == 0)
+		return;
+
+	zpd = &zone_pdata[zp->io_zoneid];
+
+	now = gethrtime();
+	unow = NANO_TO_MICRO(now);
+	udelta = unow - NANO_TO_MICRO(zp->io_dispatched);
+
+	mutex_enter(&zpd->zpers_zfs_lock);
+	iop = zpd->zpers_zfsp;
+	if (iop != NULL) {
+		/*
+		 * To calculate the wsvc_t average, keep a cumulative sum of
+		 * all the wait time before each I/O was dispatched. Since most
+		 * writes are asynchronous, only track the wait time for
+		 * read I/Os.
+ */ + if (zp->io_type == ZIO_TYPE_READ) { + iop->zpers_zfs_rwstats.reads++; + iop->zpers_zfs_rwstats.nread += zp->io_size; + iop->zpers_zfs_rd_waittime += + zp->io_dispatched - zp->io_timestamp; + kstat_runq_exit(&iop->zpers_zfs_rwstats); + } else { + iop->zpers_zfs_rwstats.writes++; + iop->zpers_zfs_rwstats.nwritten += zp->io_size; + } + } + mutex_exit(&zpd->zpers_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + + if (udelta > zfs_zone_laggard_threshold) + zfs_disk_last_laggard = unow; + + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + add_iop(zpd, unow, zp->io_type == ZIO_TYPE_READ ? + ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + } + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid, + uintptr_t, zp->io_type, uintptr_t, udelta); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zio_priority_t p; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if (iop != NULL) { + ASSERT(iop->zpers_zfs_queued[p] > 0); + if (iop->zpers_zfs_queued[p] == 0) { + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + } else { + iop->zpers_zfs_queued[p]--; + } + } + mutex_exit(&zpd->zpers_zfs_lock); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zio_priority_t p; + zone_persist_t *zpd = &zone_pdata[zp->io_zoneid]; + zone_zfs_io_t *iop; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + mutex_enter(&zpd->zpers_zfs_lock); + iop = zpd->zpers_zfsp; + if 
(iop != NULL) { + iop->zpers_zfs_queued[p]++; + } + mutex_exit(&zpd->zpers_zfs_lock); +} + +/* + * Called from vdev_queue_io_to_issue. That function is where zio's are listed + * in FIFO order on one of the sync queues, then pulled off (by + * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling + * here to find a zone's zio deeper in the sync queue and issue that instead + * of simply doing FIFO. + * + * We only do zone-based zio scheduling for the two synchronous I/O queues + * (read & write). These queues are normally serviced in FIFO order but we + * may decide to move a zone's zio to the head of the line. A typical I/O + * load will be mostly synchronous reads and some asynchronous writes (which + * are scheduled differently due to transaction groups). There will also be + * some synchronous writes for those apps which want to ensure their data is on + * disk. We want to make sure that a zone with a single-threaded app (e.g. the + * shell) that is doing synchronous I/O (typically reads) isn't penalized by + * other zones which are doing lots of synchronous I/O because they have many + * running threads. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx, + avl_tree_t *tree) +{ + vdev_queue_class_t *vqc = &vq->vq_class[p]; + uint_t cnt; + zoneid_t last_zone; + zio_t *zio; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* Don't change the order on the LBA ordered queues. */ + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return (avl_nearest(tree, idx, AVL_AFTER)); + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + cnt = avl_numnodes(tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few zios in the queue then just issue the head. 
+ * If there are more than a few zios already queued up, then use + * scheduling to get the next zio. + */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zio = avl_nearest(tree, idx, AVL_AFTER); + else + zio = get_next_zio(vqc, cnt, p, tree); + + vq->vq_last_zone_id = zio->io_zoneid; + + /* + * Probe with 4 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, the zone that was associated + * with the next IO that is scheduled, and which queue (priority). + */ + DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone, + uint_t, zio->io_zoneid, uint_t, p); + + return (zio); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 450ccb94e5..7a15838338 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ @@ -106,8 +107,23 @@ boolean_t zil_nocacheflush = B_FALSE; * Limit SLOG write size per commit executed with synchronous priority. * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. + * + * The default upstream value for zil_slog_bulk is: + * uint64_t zil_slog_bulk = 768 * 1024; + * For SmartOS, we default to using a high value to essentially disable this + * behavior. + * + * Because the default value of this tunable forces some zil_commit writes down + * to io_priority ZIO_PRIORITY_ASYNC_WRITE, those zio's would be in the same + * zio pipeline queue as all of the async spa_sync zio's. This can lead to + * serious latency problems for the user-level application code because it is + * blocked on completion of the zil_commit. 
We see this when a spa_sync zio is + * running slow (e.g. when metaslab loading takes a long time in the + * zio_dva_allocate pipeline stage), thus delaying all zio's backed up in the + * ZIO_PRIORITY_ASYNC_WRITE queue. For SmartOS, we choose to keep all + * zil_commit zio's at ZIO_PRIORITY_SYNC_WRITE. */ -uint64_t zil_slog_bulk = 768 * 1024; +uint64_t zil_slog_bulk = 0x100000000ULL; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; @@ -3079,13 +3095,20 @@ zil_close(zilog_t *zilog) txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); mutex_exit(&zilog->zl_lock); - /* - * We need to use txg_wait_synced() to wait long enough for the - * ZIL to be clean, and to wait for all pending lwbs to be - * written out. - */ - if (txg != 0) + if (zilog_is_dirty(zilog)) { + /* + * If we're dirty, always wait for the current transaction -- + * our lwb_max_txg may be in the past. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (txg != 0) { + /* + * We need to use txg_wait_synced() to wait long enough for the + * ZIL to be clean, and to wait for all pending lwbs to be + * written out. + */ txg_wait_synced(zilog->zl_dmu_pool, txg); + } if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index f8a98f73f3..b32dffd79c 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2017, Intel Corporation. * Copyright 2020 Joyent, Inc. 
@@ -43,6 +44,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/zfs_zone.h> #include <sys/time.h> #include <sys/dsl_scan.h> #include <sys/metaslab_impl.h> @@ -765,6 +767,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_metaslab_class == NULL) zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) @@ -772,6 +775,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); @@ -4317,6 +4322,24 @@ zio_done(zio_t *zio) } } + /* + * When we have an error on a slog vdev, we must ensure that the + * zio is not suspended. Suspending the zio will cause dataset deletion + * or an attempt to remove the slog to hang. In both cases, the code + * might be trying to clean up the zil blocks on the slog, but because + * the slog is dead, the suspended zio causes this to hang indefinitely. + * The system properly switches over to using zils on regular storage + * when the slog dies. + * + * This is a reasonable point in the stack to detect that the vdev is + * a slog. The 'no_suspend' flag will propagate up to the logical zio + * via zio_notify_parent. + */ + if (zio->io_error && vd != NULL && vd->vdev_islog && + !vdev_accessible(vd, zio)) { + zio->io_reexecute |= ZIO_REEXECUTE_NO_SUSPEND; + } + if (zio->io_error && zio == lio) { /* * Determine whether zio should be reexecuted. 
This will @@ -4361,7 +4384,7 @@ zio_done(zio_t *zio) */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); - if ((zio->io_error || zio->io_reexecute) && + if ((zio->io_error || ZIO_SHOULD_REEXECUTE(zio)) && IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) zio_dva_unallocate(zio, zio->io_gang_tree, bp); @@ -4375,7 +4398,7 @@ zio_done(zio_t *zio) (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) zio->io_reexecute = 0; - if (zio->io_reexecute) { + if (ZIO_SHOULD_REEXECUTE(zio)) { /* * This is a logical I/O that wants to reexecute. * @@ -4446,7 +4469,7 @@ zio_done(zio_t *zio) } ASSERT(zio->io_child_count == 0); - ASSERT(zio->io_reexecute == 0); + ASSERT(!ZIO_SHOULD_REEXECUTE(zio)); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); /* diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 2bb311d28d..3d2a42aa46 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -82,6 +82,7 @@ #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include <sys/dbuf.h> #include <sys/dmu_tx.h> #include <sys/zfeature.h> @@ -140,6 +141,11 @@ typedef struct zvol_state { #define ZVOL_EXCL 0x4 #define ZVOL_WCE 0x8 +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * zvol maximum transfer in one DMU tx. 
*/ @@ -1342,6 +1348,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) zvol_state_t *zv; uint64_t volsize; int error = 0; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1360,6 +1369,14 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) smt_begin_unsafe(); + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + locked_range_t *lr = rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1369,6 +1386,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; + tot_bytes += bytes; error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); if (error) { /* convert checksum errors into IO errors */ @@ -1379,6 +1397,38 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } rangelock_exit(lr); + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += tot_bytes; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + 
atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + smt_end_unsafe(); return (error); @@ -1393,6 +1443,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; int error = 0; boolean_t sync; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1411,6 +1464,19 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) smt_begin_unsafe(); + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for zvol write operations. There's no + * actual wait queue for zvol operations. + */ + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1424,6 +1490,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; + tot_bytes += bytes; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -1443,8 +1510,40 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + smt_end_unsafe(); + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += tot_bytes; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + 
atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + return (error); } diff --git a/usr/src/uts/common/inet/bpf.h b/usr/src/uts/common/inet/bpf.h new file mode 100644 index 0000000000..e3eac799e5 --- /dev/null +++ b/usr/src/uts/common/inet/bpf.h @@ -0,0 +1,49 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _INET_BPF_H +#define _INET_BPF_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef _KERNEL + +#include <sys/types.h> + +/* + * Clone bpf_insn definition so that consumers don't need net/bpf.h to reason + * about struct sizing. 
+ */ +typedef struct ip_bpf_insn { + uint16_t code; + uint8_t jt; + uint8_t jf; + uint32_t k; +} ip_bpf_insn_t; + +extern uint32_t ip_bpf_filter(ip_bpf_insn_t *, uchar_t *, uint_t, uint_t); +extern boolean_t ip_bpf_validate(ip_bpf_insn_t *, uint_t); + + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_BPF_H */ diff --git a/usr/src/uts/common/io/bpf/bpf_filter.c b/usr/src/uts/common/inet/bpf_filter.c index db5b224a5e..5a9ba38da6 100644 --- a/usr/src/uts/common/io/bpf/bpf_filter.c +++ b/usr/src/uts/common/inet/bpf_filter.c @@ -38,6 +38,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/param.h> @@ -45,11 +46,12 @@ #include <sys/stream.h> #include <sys/byteorder.h> #include <sys/sdt.h> +#include <inet/bpf.h> +#include <net/bpf.h> #define EXTRACT_SHORT(p) BE_IN16(p) #define EXTRACT_LONG(p) BE_IN32(p) -#ifdef _KERNEL #define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr) #define mtod(_a, _t) ((_t)((_a)->b_rptr)) #define MINDEX(len, m, k) \ @@ -123,11 +125,7 @@ m_xhalf(mblk_t *m, uint32_t k, int *err) *err = 0; return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]); } -#else /* _KERNEL */ -#include <stdlib.h> -#endif /* !_KERNEL */ -#include <net/bpf.h> /* * Execute the filter program starting at pc on the packet p @@ -137,8 +135,8 @@ m_xhalf(mblk_t *m, uint32_t k, int *err) * packet is only in one mblk_t. * When buflen is 0, p is an mblk_t pointer. */ -uint_t -bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) +uint32_t +ip_bpf_filter(ip_bpf_insn_t *pc, uchar_t *p, uint_t wirelen, uint_t buflen) { uint32_t A, X, k; uint32_t mem[BPF_MEMWORDS]; @@ -147,7 +145,7 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) /* * No filter means accept all. 
*/ - return ((uint_t)-1); + return ((uint32_t)-1); A = 0; X = 0; --pc; @@ -165,10 +163,10 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) abort(); #endif case BPF_RET|BPF_K: - return ((uint_t)pc->k); + return (pc->k); case BPF_RET|BPF_A: - return ((uint_t)A); + return (A); case BPF_LD|BPF_W|BPF_ABS: k = pc->k; @@ -456,7 +454,6 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) /* NOTREACHED */ } -#ifdef _KERNEL /* * Return true if the 'fcode' is a valid filter program. * The constraints are that each jump be forward and to a valid @@ -468,14 +465,14 @@ bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) * The kernel needs to be able to verify an application's filter code. * Otherwise, a bogus program could easily crash the system. */ -int -bpf_validate(struct bpf_insn *f, int len) +boolean_t +ip_bpf_validate(ip_bpf_insn_t *f, uint_t len) { uint_t i, from; - struct bpf_insn *p; + ip_bpf_insn_t *p; if (len < 1 || len > BPF_MAXINSNS) - return (0); + return (B_FALSE); for (i = 0; i < len; ++i) { p = &f[i]; @@ -489,7 +486,7 @@ bpf_validate(struct bpf_insn *f, int len) switch (BPF_MODE(p->code)) { case BPF_MEM: if (p->k >= BPF_MEMWORDS) - return (0); + return (B_FALSE); break; case BPF_ABS: case BPF_IND: @@ -498,13 +495,13 @@ bpf_validate(struct bpf_insn *f, int len) case BPF_LEN: break; default: - return (0); + return (B_FALSE); } break; case BPF_ST: case BPF_STX: if (p->k >= BPF_MEMWORDS) - return (0); + return (B_FALSE); break; case BPF_ALU: switch (BPF_OP(p->code)) { @@ -522,10 +519,10 @@ bpf_validate(struct bpf_insn *f, int len) * Check for constant division by 0. 
*/ if (BPF_RVAL(p->code) == BPF_K && p->k == 0) - return (0); + return (B_FALSE); break; default: - return (0); + return (B_FALSE); } break; case BPF_JMP: @@ -549,17 +546,17 @@ bpf_validate(struct bpf_insn *f, int len) switch (BPF_OP(p->code)) { case BPF_JA: if (from + p->k < from || from + p->k >= len) - return (0); + return (B_FALSE); break; case BPF_JEQ: case BPF_JGT: case BPF_JGE: case BPF_JSET: if (from + p->jt >= len || from + p->jf >= len) - return (0); + return (B_FALSE); break; default: - return (0); + return (B_FALSE); } break; case BPF_RET: @@ -567,10 +564,9 @@ bpf_validate(struct bpf_insn *f, int len) case BPF_MISC: break; default: - return (0); + return (B_FALSE); } } return (BPF_CLASS(f[len - 1].code) == BPF_RET); } -#endif diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index c081c44a04..ebf2574363 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -1416,6 +1416,7 @@ typedef union ill_g_head_u { #define ILL_CAPAB_DLD 0x20 /* DLD capabilities */ #define ILL_CAPAB_DLD_POLL 0x40 /* Polling */ #define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */ +#define ILL_CAPAB_DLD_IPCHECK 0x100 /* Check if IPs are permitted */ /* * Per-ill Hardware Checksumming capbilities. @@ -1772,6 +1773,10 @@ typedef struct ill_s { * Used to save errors that occur during plumbing */ uint_t ill_ifname_pending_err; + /* + * Used to save errors that occur during binding + */ + uint_t ill_dl_bind_err; avl_node_t ill_avl_byppa; /* avl node based on ppa */ uint_t ill_mcast_nces; /* Number of NCEs that are multicast. 
*/ list_t ill_nce; /* pointer to nce_s list */ @@ -1938,6 +1943,7 @@ typedef struct ill_s { * ill_nd_lla_len ipsq + down ill only when ill is up * ill_phys_addr_pend ipsq + down ill only when ill is up * ill_ifname_pending_err ipsq ipsq + * ill_dl_bind_err ipsq ipsq * ill_avl_byppa ipsq, ill_g_lock write once * * ill_fastpath_list ill_lock ill_lock @@ -3580,6 +3586,8 @@ typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *); typedef boolean_t (*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t); +typedef boolean_t (*ip_mac_ipcheck_t)(void *, boolean_t, + in6_addr_t *); typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t); @@ -3632,6 +3640,12 @@ typedef struct ill_dld_direct_s { /* DLD provided driver Tx */ void *idd_tx_fctl_dh; /* mac_client_handle */ } ill_dld_direct_t; +/* IP - DLD direct function call to check if an IP is allowed */ +typedef struct ill_dld_ipcheck_s { + ip_mac_ipcheck_t idi_allowed_df; + void *idi_allowed_dh; +} ill_dld_ipcheck_t; + /* IP - DLD polling capability */ typedef struct ill_dld_poll_s { ill_rx_ring_t idp_ring_tbl[ILL_MAX_RINGS]; @@ -3643,6 +3657,7 @@ struct ill_dld_capab_s { void *idc_capab_dh; /* dld_str_t *dsp */ ill_dld_direct_t idc_direct; ill_dld_poll_t idc_poll; + ill_dld_ipcheck_t idc_ipcheck; }; /* diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index 7aac9b655a..eeec56b162 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -644,6 +645,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, case SO_REUSEADDR: *i1 = connp->conn_reuseaddr ? 
SO_REUSEADDR : 0; break; /* goto sizeof (int) option return */ + case SO_REUSEPORT: + *i1 = connp->conn_reuseport; + break; /* goto sizeof (int) option return */ case SO_TYPE: *i1 = connp->conn_so_type; break; /* goto sizeof (int) option return */ @@ -1214,8 +1218,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, ip_stack_t *ipst = connp->conn_netstack->netstack_ip; int error; - if (connp->conn_family != AF_INET) + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV4_VERSION) { + /* + * Allow certain IPv4 options to be set on an AF_INET6 socket + * if the connection is still IPv4. + */ + switch (name) { + case IP_TOS: + case T_IP_TOS: + case IP_TTL: + case IP_DONTFRAG: + break; + default: + return (EINVAL); + } + } else if (connp->conn_family != AF_INET) { return (EINVAL); + } ifindex = UINT_MAX; switch (name) { diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 57ee0c5585..46c791298a 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -81,6 +81,7 @@ #include <sys/tsol/tnet.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> #include <sys/disp.h> @@ -1018,6 +1019,12 @@ icmp_close_free(conn_t *connp) icmp->icmp_filter = NULL; } + if (icmp->icmp_bpf_len != 0) { + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + } + /* * Clear any fields which the kmem_cache constructor clears. * Only icmp_connp needs to be preserved. 
@@ -1971,6 +1978,104 @@ icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) return (err); } +static int +icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp) +{ + struct bpf_program prog; + ip_bpf_insn_t *insns = NULL; + unsigned int size; + +#ifdef _LP64 + if (get_udatamodel() != DATAMODEL_NATIVE) { + struct bpf_program32 *prog32; + + if (inlen != sizeof (struct bpf_program32)) { + return (EINVAL); + } + prog32 = (struct bpf_program32 *)invalp; + prog.bf_len = prog32->bf_len; + prog.bf_insns = (void *)(uint64_t)prog32->bf_insns; + } else +#endif + if (inlen == sizeof (struct bpf_program)) { + bcopy(invalp, &prog, sizeof (prog)); + } else { + return (EINVAL); + } + + if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) { + return (EINVAL); + } + size = prog.bf_len * sizeof (struct bpf_insn); + insns = kmem_alloc(size, KM_SLEEP); + if (copyin(prog.bf_insns, insns, size) != 0) { + kmem_free(insns, size); + return (EFAULT); + } + if (!ip_bpf_validate(insns, prog.bf_len)) { + kmem_free(insns, size); + return (EINVAL); + } + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len != 0) { + ASSERT(icmp->icmp_bpf_prog != NULL); + + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + } + icmp->icmp_bpf_len = size; + icmp->icmp_bpf_prog = insns; + rw_exit(&icmp->icmp_bpf_lock); + return (0); +} + +static int +icmp_detach_filter(icmp_t *icmp) +{ + int error; + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len == 0) { + ASSERT(icmp->icmp_bpf_prog == NULL); + error = ENOENT; + } else { + kmem_free(icmp->icmp_bpf_prog, + icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + error = 0; + } + rw_exit(&icmp->icmp_bpf_lock); + return (error); +} + +static boolean_t +icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira) +{ + boolean_t res; + uchar_t *buf = mp->b_rptr; + uint_t wirelen, len = MBLKL(mp); + + rw_enter(&icmp->icmp_bpf_lock, RW_READER); + if (icmp->icmp_bpf_len == 0) { + 
rw_exit(&icmp->icmp_bpf_lock); + return (B_FALSE); + } + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)buf; + + wirelen = ntohs(ipha->ipha_length); + } else { + ip6_t *ip6h = (ip6_t *)buf; + + wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len); + rw_exit(&icmp->icmp_bpf_lock); + + return (res); +} + /* * This routine sets socket options. */ @@ -2060,6 +2165,10 @@ icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, return (ENOBUFS); } break; + case SO_ATTACH_FILTER: + return (icmp_attach_filter(icmp, inlen, invalp)); + case SO_DETACH_FILTER: + return (icmp_detach_filter(icmp)); } break; @@ -2605,6 +2714,14 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) /* Initialize regardless of IP version */ ipps.ipp_fields = 0; + /* Apply socket filter, if needed */ + if (icmp->icmp_bpf_len != 0) { + if (icmp_eval_filter(icmp, mp, ira)) { + freemsg(mp); + return; + } + } + if (ira->ira_flags & IRAF_IS_IPV4) { ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); ASSERT(MBLKL(mp) >= sizeof (ipha_t)); diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index ff0310de0c..d65d3164d3 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -41,6 +42,7 @@ #include <netinet/ip_mroute.h> #include <inet/optcom.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> /* * Table of all known options handled on a ICMP protocol stack. 
@@ -86,6 +88,10 @@ opdes_t icmp_opt_arr[] = { 0 }, { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_ATTACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, + sizeof (struct bpf_program), 0 }, +{ SO_DETACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, 0, 0 }, + { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 6063fa01d2..704f152bb9 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -8235,7 +8235,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) conn_t *connp = NULL; t_uscalar_t paddrreq; mblk_t *mp_hw; - boolean_t success; boolean_t ioctl_aborted = B_FALSE; boolean_t log = B_TRUE; @@ -8335,7 +8334,8 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; mutex_exit(&ill->ill_lock); /* - * Something went wrong with the bind. We presumably + * Something went wrong with the bind. If this was the + * result of a DL_NOTE_REPLUMB, then we presumably * have an IOCTL hanging out waiting for completion. * Find it, take down the interface that was coming * up, and complete the IOCTL with the error noted. @@ -8352,6 +8352,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) (void) ipif_down(ipif, NULL, NULL); /* error is set below the switch */ + } else { + /* + * There's no pending IOCTL, so the bind was + * most likely started by ill_dl_up(). We save + * the error and let it take care of responding + * to the IOCTL. + */ + ill->ill_dl_bind_err = dlea->dl_unix_errno ? 
+ dlea->dl_unix_errno : ENXIO; } break; case DL_ENABMULTI_REQ: @@ -8475,55 +8484,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill); ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0); - /* - * Now bring up the resolver; when that is complete, we'll - * create IREs. Note that we intentionally mirror what - * ipif_up() would have done, because we got here by way of - * ill_dl_up(), which stopped ipif_up()'s processing. - */ - if (ill->ill_isv6) { - /* - * v6 interfaces. - * Unlike ARP which has to do another bind - * and attach, once we get here we are - * done with NDP - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) - err = ipif_up_done_v6(ipif); - } else if (ill->ill_net_type == IRE_IF_RESOLVER) { - /* - * ARP and other v4 external resolvers. - * Leave the pending mblk intact so that - * the ioctl completes in ip_rput(). - */ - if (connp != NULL) - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); - mutex_exit(&ill->ill_lock); - if (connp != NULL) - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - mp1 = ipsq_pending_mp_get(ipsq, &connp); - } else { - /* The conn has started closing */ - err = EINTR; - } - } else { - /* - * This one is complete. Reply to pending ioctl. - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - err = ipif_up_done(ipif); - } - - if ((err == 0) && (ill->ill_up_ipifs)) { + if (ill->ill_up_ipifs) { err = ill_up_ipifs(ill, q, mp1); if (err == EINPROGRESS) { freemsg(mp); @@ -8531,25 +8492,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - /* - * If we have a moved ipif to bring up, and everything has - * succeeded to this point, bring it up on the IPMP ill. 
- * Otherwise, leave it down -- the admin can try to bring it - * up by hand if need be. - */ - if (ill->ill_move_ipif != NULL) { - if (err != 0) { - ill->ill_move_ipif = NULL; - } else { - ipif = ill->ill_move_ipif; - ill->ill_move_ipif = NULL; - err = ipif_up(ipif, q, mp1); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - } - } break; case DL_NOTIFY_IND: { @@ -12621,6 +12563,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) struct iocblk *iocp = (struct iocblk *)mp->b_rptr; ip_ioctl_cmd_t *ipip = arg; ip_extract_func_t *extract_funcp; + ill_t *ill; cmd_info_t ci; int err; boolean_t entered_ipsq = B_FALSE; @@ -12742,6 +12685,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); /* + * We need to cache the ill_t that we're going to use as the argument + * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be + * blown away by calling ipi_func. + */ + ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill; + + /* * A return value of EINPROGRESS means the ioctl is * either queued and waiting for some reason or has * already completed. @@ -12749,9 +12699,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", - int, ipip->ipi_cmd, - ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill, - ipif_t *, ci.ci_ipif); + int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index cc67299a1b..2307837eb8 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -22,7 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1990 Mentat Inc. * Copyright (c) 2013 by Delphix. 
All rights reserved. - * Copyright (c) 2016, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -174,7 +174,7 @@ static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen, static int ill_alloc_ppa(ill_if_t *, ill_t *); static void ill_delete_interface_type(ill_if_t *); -static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); +static int ill_dl_up(ill_t *ill, ipif_t *ipif); static void ill_dl_down(ill_t *ill); static void ill_down(ill_t *ill); static void ill_down_ipifs(ill_t *, boolean_t); @@ -1380,6 +1380,36 @@ ill_capability_probe(ill_t *ill) ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; } +static boolean_t +ill_capability_wait(ill_t *ill) +{ + /* + * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can + * only be set by someone who is the writer. Since we + * drop-and-reacquire the squeue in this loop, we need to check for + * ILL_CONDEMNED, which if set means nothing can signal our capability + * condition variable. + */ + ASSERT(IAM_WRITER_ILL(ill)); + + while (ill->ill_capab_pending_cnt != 0 && + (ill->ill_state_flags & ILL_CONDEMNED) == 0) { + /* This may enable blocked callers of ill_capability_done(). */ + ipsq_exit(ill->ill_phyint->phyint_ipsq); + /* Pause a bit (1sec as coded; XXX confirm 1msec was not intended) before we re-enter the squeue. */ + delay(drv_usectohz(1000000)); + + /* + * If ipsq_enter() fails, someone set ILL_CONDEMNED + * while we dropped the squeue. Indicate such to the caller. + */ + if (!ipsq_enter(ill, B_FALSE, CUR_OP)) + return (B_FALSE); + } + + return ((ill->ill_state_flags & ILL_CONDEMNED) == 0); +} + void ill_capability_reset(ill_t *ill, boolean_t reneg) { @@ -1390,6 +1420,8 @@ ill_capability_reset(ill_t *ill, boolean_t reneg) ill->ill_dlpi_capab_state = reneg ? 
IDCS_RENEG : IDCS_RESET_SENT; + ASSERT(ill->ill_capab_reset_mp != NULL); + ill_capability_send(ill, ill->ill_capab_reset_mp); ill->ill_capab_reset_mp = NULL; /* @@ -2109,6 +2141,49 @@ ill_capability_lso_enable(ill_t *ill) } } +/* + * Check whether or not mac will prevent us from sending with a given IP + * address. This requires having the IPCHECK capability, which we should + * always be able to successfully negotiate, but if it's somehow missing + * then we just permit the caller to use the address, since mac does the + * actual enforcement and ip is just performing a courtesy check to help + * prevent users from unwittingly setting and attempting to use blocked + * addresses. + */ +static boolean_t +ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr) +{ + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0) + return (B_TRUE); + + ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck; + ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df; + return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr)); +} + +static void +ill_capability_ipcheck_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + ill_dld_ipcheck_t *idi = &idc->idc_ipcheck; + dld_capab_ipcheck_t spoof; + int rc; + + ASSERT(IAM_WRITER_ILL(ill)); + + bzero(&spoof, sizeof (spoof)); + if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + &spoof, DLD_ENABLE)) == 0) { + idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df; + idi->idi_allowed_dh = spoof.ipc_allowed_dh; + ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK; + } else { + cmn_err(CE_WARN, "warning: could not enable IPCHECK " + "capability, rc = %d\n", rc); + DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc); + } +} + static void ill_capability_dld_enable(ill_t *ill) { @@ -2121,6 +2196,8 @@ ill_capability_dld_enable(ill_t *ill) ill_capability_direct_enable(ill); ill_capability_poll_enable(ill); } + + ill_capability_ipcheck_enable(ill); ill_capability_lso_enable(ill); ill->ill_capabilities |= 
ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); @@ -2186,6 +2263,15 @@ ill_capability_dld_disable(ill_t *ill) NULL, DLD_DISABLE); } + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) { + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL); + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL); + + ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK; + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + NULL, DLD_DISABLE); + } + ill->ill_capabilities &= ~ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); } @@ -9676,7 +9762,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, in6_addr_t v6addr; boolean_t need_up = B_FALSE; ill_t *ill; - int i; ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -9751,20 +9836,9 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); } - /* - * verify that the address being configured is permitted by the - * ill_allowed_ips[] for the interface. 
- */ - if (ill->ill_allowed_ips_cnt > 0) { - for (i = 0; i < ill->ill_allowed_ips_cnt; i++) { - if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i], - &v6addr)) - break; - } - if (i == ill->ill_allowed_ips_cnt) { - pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr); - return (EPERM); - } + /* verify that the address being configured is permitted by mac */ + if (!ill_ipcheck_addr(ill, &v6addr)) { + return (EPERM); } /* * Even if there is no change we redo things just to rerun @@ -12704,6 +12778,12 @@ ill_dl_down(ill_t *ill) } ill->ill_unbind_mp = NULL; + + mutex_enter(&ill->ill_lock); + ill->ill_dl_up = 0; + ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); + mutex_exit(&ill->ill_lock); + if (mp != NULL) { ip1dbg(("ill_dl_down: %s (%u) for %s\n", dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr, @@ -12726,11 +12806,13 @@ ill_dl_down(ill_t *ill) ill_capability_dld_disable(ill); ill_capability_reset(ill, B_FALSE); ill_dlpi_send(ill, mp); + + /* + * Wait for the capability reset to finish. + * In this case, it doesn't matter WHY or HOW it finished. + */ + (void) ill_capability_wait(ill); } - mutex_enter(&ill->ill_lock); - ill->ill_dl_up = 0; - ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); - mutex_exit(&ill->ill_lock); } void @@ -12852,6 +12934,7 @@ void ill_capability_done(ill_t *ill) { ASSERT(ill->ill_capab_pending_cnt != 0); + ASSERT(IAM_WRITER_ILL(ill)); ill_dlpi_done(ill, DL_CAPABILITY_REQ); @@ -14480,7 +14563,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) * address/netmask etc cause a down/up dance, but * does not cause an unbind (DL_UNBIND) with the driver */ - return (ill_dl_up(ill, ipif, mp, q)); + if ((err = ill_dl_up(ill, ipif)) != 0) { + return (err); + } + } + + /* Reject bringing up interfaces with unusable IP addresses */ + if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) { + return (EPERM); } /* @@ -14593,24 +14683,22 @@ ill_delete_ires(ill_t *ill) /* * Perform a bind for the physical device. 
- * When the routine returns EINPROGRESS then mp has been consumed and - * the ioctl will be acked from ip_rput_dlpi. - * Allocate an unbind message and save it until ipif_down. + * + * When the routine returns successfully then dlpi has been bound and + * capabilities negotiated. An unbind message will have been allocated + * for later use in ipif_down. */ static int -ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) +ill_dl_up(ill_t *ill, ipif_t *ipif) { mblk_t *bind_mp = NULL; mblk_t *unbind_mp = NULL; - conn_t *connp; - boolean_t success; int err; DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(mp != NULL); /* * Make sure we have an IRE_MULTICAST in case we immediately @@ -14645,19 +14733,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) if (unbind_mp == NULL) goto bad; } - /* - * Record state needed to complete this operation when the - * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. - */ - connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; - ASSERT(connp != NULL || !CONN_Q(q)); - GRAB_CONN_LOCK(q); - mutex_enter(&ipif->ipif_ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); - mutex_exit(&ipif->ipif_ill->ill_lock); - RELEASE_CONN_LOCK(q); - if (!success) - goto bad; /* * Save the unbind message for ill_dl_down(); it will be consumed when @@ -14669,6 +14744,18 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) ill_dlpi_send(ill, bind_mp); /* Send down link-layer capabilities probe if not already done. */ ill_capability_probe(ill); + /* + * Wait for DLPI to be bound and the capability probe to finish. + * The call drops-and-reacquires the squeue. If it couldn't because + * ILL_CONDEMNED got set, bail. + */ + if (!ill_capability_wait(ill)) + return (ENXIO); + + /* DLPI failed to bind. 
Return the saved error */ + if (!ill->ill_dl_up) { + return (ill->ill_dl_bind_err); + } /* * Sysid used to rely on the fact that netboots set domainname @@ -14686,11 +14773,7 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) cmn_err(CE_WARN, "no cached dhcp response"); } - /* - * This operation will complete in ip_rput_dlpi with either - * a DL_BIND_ACK or DL_ERROR_ACK. - */ - return (EINPROGRESS); + return (0); bad: ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 13e961333c..b6565d9c1f 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -153,7 +153,7 @@ ip_squeue_create(pri_t pri) { squeue_t *sqp; - sqp = squeue_create(pri); + sqp = squeue_create(pri, B_TRUE); ASSERT(sqp != NULL); if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 34832d56e5..d47997a4aa 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. * Copyright 2022 Joyent, Inc. 
*/ @@ -871,67 +872,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) mutex_exit(&(connfp)->connf_lock); \ } -#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ - conn_t *pconnp = NULL, *nconnp; \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - nconnp = (connfp)->connf_head; \ - while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ - pconnp = nconnp; \ - nconnp = nconnp->conn_next; \ - } \ - if (pconnp != NULL) { \ - pconnp->conn_next = (connp); \ - (connp)->conn_prev = pconnp; \ - } else { \ - (connfp)->connf_head = (connp); \ - } \ - if (nconnp != NULL) { \ - (connp)->conn_next = nconnp; \ - nconnp->conn_prev = (connp); \ - } \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF(connp); \ - mutex_exit(&(connfp)->connf_lock); \ -} +/* + * When inserting bound or wildcard entries into the hash, ordering rules are + * used to facilitate timely and correct lookups. The order is as follows: + * 1. Entries bound to a specific address + * 2. Entries bound to INADDR_ANY + * 3. Entries bound to ADDR_UNSPECIFIED + * Entries in a category which share conn_lport (such as those using + * SO_REUSEPORT) will be ordered such that the newest inserted is first. 
+ */ -#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ - conn_t **list, *prev, *next; \ - boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - list = &(connfp)->connf_head; \ - prev = NULL; \ - while ((next = *list) != NULL) { \ - if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ - connp->conn_zoneid == next->conn_zoneid) { \ - (connp)->conn_next = next; \ - if (prev != NULL) \ - prev = next->conn_prev; \ - next->conn_prev = (connp); \ - break; \ - } \ - list = &next->conn_next; \ - prev = next; \ - } \ - (connp)->conn_prev = prev; \ - *list = (connp); \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF((connp)); \ - mutex_exit(&(connfp)->connf_lock); \ +void +ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp) +{ + conn_t *pconnp, *nconnp; + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + /* + * Walk through entries associated with the fanout until one is + * found which fulfills any of these conditions: + * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED + * 2. 
Listen port the same as connp + */ + if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) || + connp->conn_lport == nconnp->conn_lport) + break; + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } void ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + conn_t **list, *prev, *next; + conn_t *pconnp = NULL, *nconnp; + boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6); + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) && + isv4mapped && connp->conn_lport == nconnp->conn_lport) + break; + if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) && + (isv4mapped || + connp->conn_lport == nconnp->conn_lport)) + break; + + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } /* @@ -1037,9 +1062,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } else { - IPCL_HASH_INSERT_BOUND(connfp, 
connp); + ipcl_hash_insert_bound(connfp, connp); } } else { IPCL_HASH_INSERT_CONNECTED(connfp, connp); @@ -1208,9 +1233,9 @@ ipcl_bind_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (protocol == IPPROTO_RSVP) ill_set_inputfn_all(ipst); @@ -1222,9 +1247,9 @@ ipcl_bind_insert_v4(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { ASSERT(connp->conn_ipversion == IPV4_VERSION); @@ -1274,9 +1299,9 @@ ipcl_bind_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; @@ -1286,9 +1311,9 @@ ipcl_bind_insert_v6(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { sa_family_t addr_family; @@ -1419,9 +1444,9 @@ ipcl_conn_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - 
IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -1507,9 +1532,9 @@ ipcl_conn_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -2095,6 +2120,7 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) connp->conn_flags = IPCL_RAWIPCONN; connp->conn_proto = IPPROTO_ICMP; icmp->icmp_connp = connp; + rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL); rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); if (connp->conn_ixa == NULL) @@ -2119,6 +2145,7 @@ rawip_conn_destructor(void *buf, void *cdrarg) mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); rw_destroy(&connp->conn_ilg_lock); + rw_destroy(&icmp->icmp_bpf_lock); /* Can be NULL if constructor failed */ if (connp->conn_ixa != NULL) { diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 89968826b3..70cff374a4 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ /* @@ -299,7 +300,8 @@ struct conn_s { conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ conn_mcbc_bind : 1, /* Bound to multi/broadcast */ - conn_pad_to_bit_31 : 12; + conn_reuseport : 1, /* SO_REUSEPORT state */ + conn_pad_to_bit_31 : 11; boolean_t conn_blocked; /* conn is flow-controlled */ diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c index 104603d840..22f2d79d24 100644 --- a/usr/src/uts/common/inet/ipd/ipd.c +++ b/usr/src/uts/common/inet/ipd/ipd.c @@ -9,7 +9,7 @@ * http://www.illumos.org/license/CDDL. */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* @@ -222,7 +222,7 @@ typedef struct ipd_netstack { net_handle_t ipdn_v6hdl; /* IPv4 net handle */ int ipdn_hooked; /* are hooks registered */ hook_t *ipdn_v4in; /* IPv4 traffic in hook */ - hook_t *ipdn_v4out; /* IPv4 traffice out hook */ + hook_t *ipdn_v4out; /* IPv4 traffic out hook */ hook_t *ipdn_v6in; /* IPv6 traffic in hook */ hook_t *ipdn_v6out; /* IPv6 traffic out hook */ int ipdn_enabled; /* which perturbs are on */ @@ -613,7 +613,7 @@ ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay) /* * If ipd_check_hooks_failed, that must mean that we failed to set up * the hooks, so we are going to effectively zero out and fail the - * request to enable corruption. + * request to enable packet delays. */ if (rval != 0) ins->ipdn_delay = 0; diff --git a/usr/src/uts/common/inet/ipf/cfw.c b/usr/src/uts/common/inet/ipf/cfw.c new file mode 100644 index 0000000000..941aeac328 --- /dev/null +++ b/usr/src/uts/common/inet/ipf/cfw.c @@ -0,0 +1,659 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +/* IPF oddness for compilation in userland for IPF tests. */ +#if defined(KERNEL) || defined(_KERNEL) +#undef KERNEL +#undef _KERNEL +#define KERNEL 1 +#define _KERNEL 1 +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include "netinet/ip_compat.h" +#ifdef USE_INET6 +#include <netinet/icmp6.h> +#endif +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_auth.h" +#include "netinet/ipf_stack.h" +#ifdef IPFILTER_SCAN +#include "netinet/ip_scan.h" +#endif +#ifdef IPFILTER_SYNC +#include "netinet/ip_sync.h" +#endif +#include "netinet/ip_pool.h" +#include "netinet/ip_htable.h" +#ifdef IPFILTER_COMPILED +#include "netinet/ip_rules.h" +#endif +#if defined(_KERNEL) +#include <sys/sunddi.h> +#endif + +#include "netinet/ipf_cfw.h" +#include <sys/file.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/ddi.h> + +/* + * cfw == Cloud Firewall ==> routines for a global-zone data collector about + * ipf events for SmartOS. The only ones that CFW cares about are ones + * enforced by global-zone-controlled rulesets. + * + * The variable below is tied into the GZ-only ipf device /dev/ipfev, that + * flips this on when there is an open instance. This feature will also + * consume an fr_flag to have per-rule granularity. + */ +boolean_t ipf_cfwlog_enabled; + +/* + * Because ipf's test tools in $SRC/cmd insert all of these files, we need to + * stub out what we can vs. drag in even more headers and who knows what else. 
+ */ +#ifdef _KERNEL + +/* + * CFW event ring buffer. Remember, this is for ALL ZONES because only a + * global-zone event-reader will be consuming these. In other words, it's + * not something to instantiate per-netstack. + * + * We may want to get more sophisticated and performant (e.g. per-processor), + * but for now keep the ring buffer simple and stupid. + * Must be a power of 2, to be bitmaskable, and must be countable by a uint_t + * + * Resizeable, see ipf_cfw_ring_resize() below. + */ +#define IPF_CFW_DEFAULT_RING_BUFS 1024 +#define IPF_CFW_MIN_RING_BUFS 8 +#define IPF_CFW_MAX_RING_BUFS (1U << 31U) + +/* Assume C's init-to-zero is sufficient for these types... */ +static kmutex_t cfw_ringlock; +static kcondvar_t cfw_ringcv; + +static cfwev_t *cfw_ring; /* NULL by default. */ +static uint32_t cfw_ringsize; /* 0 by default, number of array elements. */ +static uint32_t cfw_ringmask; /* 0 by default. */ + +/* If these are equal, we're either empty or full. */ +static uint_t cfw_ringstart, cfw_ringend; +static boolean_t cfw_ringfull; /* Tell the difference here! */ +/* Bean-counters. */ +static uint64_t cfw_evreports; +static uint64_t cfw_evdrops; + +/* + * Place an event in the CFW event ring buffer. + * + * For now, be simple and drop the oldest event if we overflow. We may wish to + * selectively drop older events based on type in the future. + */ +static void +ipf_cfwev_report(cfwev_t *event) +{ + mutex_enter(&cfw_ringlock); + cfw_ring[cfw_ringend] = *event; + cfw_ringend++; + cfw_ringend &= cfw_ringmask; + if (cfw_ringfull) { + cfw_ringstart++; + cfw_ringstart &= cfw_ringmask; + ASSERT3U(cfw_ringstart, ==, cfw_ringend); + DTRACE_PROBE(ipf__cfw__evdrop); + cfw_evdrops++; + } else { + cfw_ringfull = (cfw_ringend == cfw_ringstart); + } + cfw_evreports++; + cv_broadcast(&cfw_ringcv); + mutex_exit(&cfw_ringlock); +} + +/* + * Provide access to multiple CFW events that can allow copying straight from + * the ring buffer up to userland. 
Requires a callback (which could call + * uiomove() directly, OR to a local still-in-kernel buffer) that must do the + * data copying-out. + * + * Callback function is of the form: + * + * uint_t cfw_many_cb(cfwev_t *evptr, int num_avail, void *cbarg); + * + * The function must return how many events got consumed, which MUST be <= the + * number available. The function must ALSO UNDERSTAND that cfw_ringlock is + * held and must not be released during this time. The function may be called + * more than once, if the available buffers wrap-around OR "block" is set and + * we don't have enough buffers. If any callback returns 0, exit the function + * with however many were consumed. + * + * This function, like the callback, returns the number of events *CONSUMED*. + * + * . . . + * + * Tunables for ipf_cfwev_consume_many(). + * + * If you wish to attempt to coalesce reads (to reduce the likelihood of one + * event at a time during high load) change the number of tries below to + * something not 0. Early experiments set this to 10. + * + * The wait between tries is in usecs in cfw_timeout_wait. The pessimal + * case for this is a timeout_wait-spaced trickle of one event at a time. + */ +uint_t cfw_timeout_tries = 0; +uint_t cfw_timeout_wait = 10000; /* 10ms wait. */ + +typedef struct uio_error_s { + struct uio *ue_uio; + int ue_error; +} uio_error_t; + +static uint_t +ipf_cfwev_consume_many(uint_t num_requested, boolean_t block, + cfwmanycb_t cfw_many_cb, void *cbarg) +{ + uint_t consumed = 0, cb_consumed, contig_size; + uint_t timeout_tries = cfw_timeout_tries; + boolean_t eintr = B_FALSE; + + mutex_enter(&cfw_ringlock); + + while (num_requested > 0) { + clock_t delta; + + /* Silly reality checks */ + ASSERT3U(cfw_ringstart, <, cfw_ringsize); + ASSERT3U(cfw_ringend, <, cfw_ringsize); + + if (cfw_ringstart > cfw_ringend || cfw_ringfull) { + /* We have from ringstart to the buffer's end. 
*/ + contig_size = cfw_ringsize - cfw_ringstart; + } else if (cfw_ringstart < cfw_ringend) { + /* We have no potential wrapping at this time. */ + contig_size = cfw_ringend - cfw_ringstart; + } else if (block && cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) { + /* Maybe something to consume now, try again. */ + continue; + } else { + /* Nothing (more) to consume, return! */ + eintr = (block && consumed == 0); + break; + } + + /* Fewer asked-for than what is contiguously available. */ + if (num_requested < contig_size) + contig_size = num_requested; + + cb_consumed = + cfw_many_cb(&(cfw_ring[cfw_ringstart]), contig_size, cbarg); + ASSERT3U(cb_consumed, <=, contig_size); + + cfw_ringstart += cb_consumed; + ASSERT3U(cfw_ringstart, <=, cfw_ringmask + 1); + cfw_ringstart &= cfw_ringmask; /* In case of wraparound. */ + consumed += cb_consumed; + cfw_ringfull = (cfw_ringfull && cb_consumed == 0); + if (cb_consumed < contig_size) { + /* + * Callback returned less than given. + * This is likely a uio error, but we have + * something. Get out of here. + */ + break; + } + ASSERT3U(cb_consumed, ==, contig_size); + num_requested -= contig_size; + + if (num_requested == 0) { + /* All done! */ + break; + } + + if (cfw_ringstart != cfw_ringend) { + /* + * We wrapped around the end of the buffer, and + * we have more available to fill our request. + */ + ASSERT0(cfw_ringstart); + ASSERT(!cfw_ringfull); + continue; + } + + /* + * We obtained some of the events we requested, but not all. + * Since we have nothing to consume, wait *a little* longer. + */ + if (timeout_tries == 0) + break; /* Don't bother... */ + delta = drv_usectohz(cfw_timeout_wait); + timeout_tries--; + + switch (cv_reltimedwait_sig(&cfw_ringcv, &cfw_ringlock, delta, + TR_CLOCK_TICK)) { + case 0: + /* + * Received signal! Return what we have OR if we have + * nothing, EINTR. + */ + DTRACE_PROBE1(ipf__cfw__timedsignal, int, consumed); + eintr = (consumed == 0); + num_requested = 0; + break; + case -1: + /* Time reached! 
Bail with what we got. */ + DTRACE_PROBE(ipf__cfw__timedexpired); + num_requested = 0; + break; + default: + /* Aha! We've got more! */ + DTRACE_PROBE(ipf__cfw__moredata); + break; + } + } + + mutex_exit(&cfw_ringlock); + if (eintr) + ((uio_error_t *)cbarg)->ue_error = EINTR; + return (consumed); +} + +/* + * SmartOS likes using the zone's debug id. Make sure we squirrel that away in + * the ipf netstack instance if it's not there. + */ +static inline zoneid_t +ifs_to_did(ipf_stack_t *ifs) +{ + if (ifs->ifs_zone_did == 0) { + zone_t *zone; + + /* + * We can't get the zone_did at initialization time because + * most zone data isn't readily available then, cement the did + * in place now. + */ + VERIFY3U(ifs->ifs_zone, !=, GLOBAL_ZONEID); + zone = zone_find_by_id(ifs->ifs_zone); + if (zone != NULL) { + ifs->ifs_zone_did = zone->zone_did; + zone_rele(zone); + } + /* Else we are either in shutdown or something weirder. */ + } + return (ifs->ifs_zone_did); +} + +/* + * ipf_block_cfwlog() + * + * Called by fr_check(). Record drop events for the global-zone data + * collector. Use rest-of-ipf-style names for the parameters. + */ +void +ipf_block_cfwlog(frentry_t *fr, fr_info_t *fin, ipf_stack_t *ifs) +{ + cfwev_t event = {0}; + + /* + * We need a rule. + * Capture failure by using dtrace on this function's entry. + * 'ipf_block_cfwlog:entry /arg0 == NULL/ { printf("GOTCHA!\n"); }' + */ + if (fr == NULL) + return; + + event.cfwev_type = CFWEV_BLOCK; + event.cfwev_length = sizeof (event); + /* + * IPF code elsewhere does the cheesy single-flag check, even though + * there are two flags in a rule (one for in, one for out). + */ + event.cfwev_direction = (fr->fr_flags & FR_INQUE) ? + CFWDIR_IN : CFWDIR_OUT; + + event.cfwev_protocol = fin->fin_p; + /* + * NOTE: fin_*port is in host/native order, and ICMP info is here too. 
+ */ + event.cfwev_sport = htons(fin->fin_sport); + event.cfwev_dport = htons(fin->fin_dport); + + switch (fin->fin_v) { + case IPV4_VERSION: + IN6_INADDR_TO_V4MAPPED(&fin->fin_src, &event.cfwev_saddr); + IN6_INADDR_TO_V4MAPPED(&fin->fin_dst, &event.cfwev_daddr); + break; + case IPV6_VERSION: + event.cfwev_saddr = fin->fin_src6.in6; + event.cfwev_daddr = fin->fin_dst6.in6; + break; + default: + /* We should never reach here, but mark it if we do. */ + DTRACE_PROBE1(ipf__cfw__frinfo__badipversion, frinfo_t *, fin); + return; + } + + /* + * uniqtime() is what ipf's GETKTIME() uses. + * If cfwev_tstamp needs to be sourced from elsewhere, fix that here. + */ + uniqtime(&event.cfwev_tstamp); + event.cfwev_zonedid = ifs_to_did(ifs); + event.cfwev_ruleid = fin->fin_rule; + memcpy(event.cfwev_ruleuuid, fr->fr_uuid, sizeof (uuid_t)); + + ipf_cfwev_report(&event); +} + +/* + * ipf_log_cfwlog() + * + * Twin of ipstate_log(), but records state events for the global-zone data + * collector. + */ +void +ipf_log_cfwlog(struct ipstate *is, uint_t type, ipf_stack_t *ifs) +{ + cfwev_t event = {0}; + + switch (type) { + case ISL_NEW: + case ISL_CLONE: + event.cfwev_type = CFWEV_BEGIN; + break; + case ISL_EXPIRE: + case ISL_FLUSH: + case ISL_REMOVE: + case ISL_KILLED: + case ISL_ORPHAN: + /* + * We don't care about session disappearances in CFW logging + * for now. (Possible future: CFWEV_END) + */ + return; + default: + event.cfwev_type = CFWEV_BLOCK; + break; + } + + /* + * IPF code elsewhere does the cheesy single-flag check, even though + * there are two flags in a rule (one for in, one for out). Follow + * suit here. + */ + event.cfwev_length = sizeof (event); + ASSERT(is->is_rule != NULL); + event.cfwev_direction = (is->is_rule->fr_flags & FR_INQUE) ? + CFWDIR_IN : CFWDIR_OUT; + event.cfwev_protocol = is->is_p; + switch (is->is_p) { + case IPPROTO_TCP: + case IPPROTO_UDP: + /* NOTE: is_*port is in network order. 
*/ + event.cfwev_sport = is->is_sport; + event.cfwev_dport = is->is_dport; + break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + /* Scribble the ICMP type in sport... */ + event.cfwev_sport = is->is_icmp.ici_type; + break; + /* Other protocols leave the event's port fields empty. */ + } + + switch(is->is_v) { + case IPV4_VERSION: + IN6_INADDR_TO_V4MAPPED(&is->is_src.in4, &event.cfwev_saddr); + IN6_INADDR_TO_V4MAPPED(&is->is_dst.in4, &event.cfwev_daddr); + break; + case IPV6_VERSION: + event.cfwev_saddr = is->is_src.in6; + event.cfwev_daddr = is->is_dst.in6; + break; + default: + /* Can't parse addresses if we don't know the version. Drop. */ + DTRACE_PROBE1(ipf__cfw__ipstate__badipversion, + struct ipstate *, is); + return; + } + + /* + * uniqtime() is what ipf's GETKTIME() uses. + * If cfwev_tstamp needs to be sourced from elsewhere, fix that here. + */ + uniqtime(&event.cfwev_tstamp); + event.cfwev_zonedid = ifs_to_did(ifs); + event.cfwev_ruleid = is->is_rulen; + memcpy(event.cfwev_ruleuuid, is->is_uuid, sizeof (uuid_t)); + + ipf_cfwev_report(&event); +} + +/* + * Callback routine we use for ipf_cfwev_consume_many(). + * Returning 0 means error indication. + */ +static uint_t +cfwlog_read_manycb(cfwev_t *evptr, uint_t num_avail, void *cbarg) +{ + uio_error_t *ue = (uio_error_t *)cbarg; + + ASSERT(MUTEX_HELD(&cfw_ringlock)); + + if (ue->ue_error != 0) + return (0); + + ue->ue_error = uiomove((caddr_t)evptr, num_avail * sizeof (*evptr), + UIO_READ, ue->ue_uio); + if (ue->ue_error != 0) + return (0); + + return (num_avail); +} + +/* + * Resize the CFW event ring buffer. + * + * The caller must ensure the new size is a power of 2 between + * IPF_CFW_{MIN,MAX}_RING_BUFS (inclusive) or the special values + * IPF_CFW_RING_ALLOCATE (first-time creation) or IPF_CFW_RING_DESTROY + * (netstack-unload destruction). + * + * Everything in the current ring will be destroyed (and reported as a drop) + * upon resize. 
+ */ +int +ipf_cfw_ring_resize(uint32_t newsize) +{ + ASSERT(MUTEX_HELD(&cfw_ringlock) || newsize == IPF_CFW_RING_ALLOCATE || + newsize == IPF_CFW_RING_DESTROY); + + if (newsize == IPF_CFW_RING_ALLOCATE) { + if (cfw_ring != NULL) + return (EBUSY); + newsize = IPF_CFW_DEFAULT_RING_BUFS; + /* Fall through to allocating a new ring buffer. */ + } else { + /* We may be called during error cleanup, so be liberal here. */ + if ((cfw_ring == NULL && newsize == IPF_CFW_RING_DESTROY) || + newsize == cfw_ringsize) { + return (0); + } + kmem_free(cfw_ring, cfw_ringsize * sizeof (cfwev_t)); + cfw_ring = NULL; + if (cfw_ringfull) { + cfw_evdrops += cfw_ringsize; + } else if (cfw_ringstart > cfw_ringend) { + cfw_evdrops += cfw_ringend + + (cfw_ringsize - cfw_ringstart); + } else { + cfw_evdrops += cfw_ringend - cfw_ringstart; + } + cfw_ringsize = cfw_ringmask = cfw_ringstart = cfw_ringend = 0; + cfw_ringfull = B_FALSE; + + if (newsize == IPF_CFW_RING_DESTROY) + return (0); + /* + * Keep the reports & drops around because if we're just + * resizing, we need to know what we lost. + */ + } + + ASSERT(ISP2(newsize)); + cfw_ring = kmem_alloc(newsize * sizeof (cfwev_t), KM_SLEEP); + /* KM_SLEEP means we always succeed. */ + cfw_ringsize = newsize; + cfw_ringmask = cfw_ringsize - 1; + + return (0); +} + +/* + * ioctl handler for /dev/ipfev. Only supports SIOCIPFCFWCFG (get data + * collector statistics and configuration), and SIOCIPFCFWNEWSZ (resize the + * event ring buffer). 
+ */ +/* ARGSUSED */ +int +ipf_cfwlog_ioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cp, + int *rp) +{ + ipfcfwcfg_t cfginfo; + int error; + + if (cmd != SIOCIPFCFWCFG && cmd != SIOCIPFCFWNEWSZ) + return (EIO); + + if (crgetzoneid(cp) != GLOBAL_ZONEID) + return (EACCES); + + error = COPYIN((caddr_t)data, (caddr_t)&cfginfo, sizeof (cfginfo)); + if (error != 0) + return (EFAULT); + + cfginfo.ipfcfwc_maxevsize = sizeof (cfwev_t); + mutex_enter(&cfw_ringlock); + cfginfo.ipfcfwc_evreports = cfw_evreports; + if (cmd == SIOCIPFCFWNEWSZ) { + uint32_t newsize = cfginfo.ipfcfwc_evringsize; + + /* Do ioctl parameter checking here, then call the resizer. */ + if (newsize < IPF_CFW_MIN_RING_BUFS || + newsize > IPF_CFW_MAX_RING_BUFS || !ISP2(newsize)) { + error = EINVAL; + } else { + error = ipf_cfw_ring_resize(cfginfo.ipfcfwc_evringsize); + } + } else { + error = 0; + } + /* Both cfw_evdrops and cfw_ringsize are affected by resize. */ + cfginfo.ipfcfwc_evdrops = cfw_evdrops; + cfginfo.ipfcfwc_evringsize = cfw_ringsize; + mutex_exit(&cfw_ringlock); + + if (error != 0) + return (error); + + error = COPYOUT((caddr_t)&cfginfo, (caddr_t)data, sizeof (cfginfo)); + if (error != 0) + return (EFAULT); + + return (0); +} + +/* + * Send events up via /dev/ipfev reads. Will return only complete events. + */ +/* ARGSUSED */ +int +ipf_cfwlog_read(dev_t dev, struct uio *uio, cred_t *cp) +{ + uint_t requested, consumed; + uio_error_t ue = {uio, 0}; + boolean_t block; + + if (uio->uio_resid == 0) + return (0); + if (uio->uio_resid < sizeof (cfwev_t)) + return (EINVAL); + + block = ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) == 0); + requested = uio->uio_resid / sizeof (cfwev_t); + + /* + * As stated earlier, ipf_cfwev_consume_many() takes a callback. + * The callback may be called multiple times before we return. + * The callback will execute uiomove(). 
+ */ + consumed = ipf_cfwev_consume_many(requested, block, cfwlog_read_manycb, + &ue); + ASSERT3U(consumed, <=, requested); + if (!block && consumed == 0 && ue.ue_error == 0) { + /* No data available. */ + ue.ue_error = EWOULDBLOCK; + } else if (ue.ue_error != 0 && ue.ue_error != EINTR) { + /* + * We had a problem that wasn't simply a + * case of cv_wait_sig() receiving a signal. + */ + DTRACE_PROBE1(ipf__cfw__uiodiscard, int, consumed); + mutex_enter(&cfw_ringlock); + cfw_evdrops += consumed; + mutex_exit(&cfw_ringlock); + } + return (ue.ue_error); +} + +#else /* _KERNEL */ + +/* Blank stubs to satisfy userland's test compilations. */ + +int +ipf_cfw_ring_resize(uint32_t a) +{ + return (0); +} + +void +ipf_log_cfwlog(struct ipstate *a, uint_t b, ipf_stack_t *c) +{ +} + +void +ipf_block_cfwlog(frentry_t *a, fr_info_t *b, ipf_stack_t *c) +{ +} + +#endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ipf/fil.c b/usr/src/uts/common/inet/ipf/fil.c index 78980be106..48fa6e7325 100644 --- a/usr/src/uts/common/inet/ipf/fil.c +++ b/usr/src/uts/common/inet/ipf/fil.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #if defined(KERNEL) || defined(_KERNEL) @@ -2588,6 +2588,9 @@ ipf_stack_t *ifs; } #endif + if (IFS_CFWLOG(ifs, fr) && FR_ISBLOCK(pass)) + ipf_block_cfwlog(fr, fin, ifs); + /* * The FI_STATE flag is cleared here so that calling fr_checkstate * will work when called from inside of fr_fastroute. Although diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index c9d5f03e13..0d34e0fce3 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ #if !defined(lint) @@ -85,6 +85,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); +static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, + void *)); static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t, @@ -116,7 +124,7 @@ u_long *ip_forwarding = NULL; #endif vmem_t *ipf_minor; /* minor number arena */ -void *ipf_state; /* DDI state */ +void *ipf_state; /* DDI state */ /* * GZ-controlled and per-zone stacks: @@ -141,28 +149,38 @@ void *ipf_state; /* DDI state */ */ /* IPv4 hook names */ -char *hook4_nicevents = "ipfilter_hook4_nicevents"; -char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz"; -char *hook4_in = "ipfilter_hook4_in"; -char *hook4_in_gz = "ipfilter_hook4_in_gz"; -char *hook4_out = "ipfilter_hook4_out"; -char *hook4_out_gz = "ipfilter_hook4_out_gz"; -char *hook4_loop_in = "ipfilter_hook4_loop_in"; -char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz"; -char *hook4_loop_out = "ipfilter_hook4_loop_out"; -char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz"; +char *hook4_nicevents = "ipfilter_hook4_nicevents"; +char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz"; +char *hook4_in = "ipfilter_hook4_in"; +char *hook4_in_gz = "ipfilter_hook4_in_gz"; +char *hook4_out = "ipfilter_hook4_out"; +char *hook4_out_gz = "ipfilter_hook4_out_gz"; +char *hook4_loop_in = "ipfilter_hook4_loop_in"; +char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz"; +char *hook4_loop_out = "ipfilter_hook4_loop_out"; +char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz"; /* 
IPv6 hook names */ -char *hook6_nicevents = "ipfilter_hook6_nicevents"; -char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz"; -char *hook6_in = "ipfilter_hook6_in"; -char *hook6_in_gz = "ipfilter_hook6_in_gz"; -char *hook6_out = "ipfilter_hook6_out"; -char *hook6_out_gz = "ipfilter_hook6_out_gz"; -char *hook6_loop_in = "ipfilter_hook6_loop_in"; -char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; -char *hook6_loop_out = "ipfilter_hook6_loop_out"; -char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +char *hook6_nicevents = "ipfilter_hook6_nicevents"; +char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz"; +char *hook6_in = "ipfilter_hook6_in"; +char *hook6_in_gz = "ipfilter_hook6_in_gz"; +char *hook6_out = "ipfilter_hook6_out"; +char *hook6_out_gz = "ipfilter_hook6_out_gz"; +char *hook6_loop_in = "ipfilter_hook6_loop_in"; +char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; +char *hook6_loop_out = "ipfilter_hook6_loop_out"; +char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; + +/* vnd IPv4/v6 hook names */ +char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; +char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; +char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; +char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; +char *hook4_vnd_out = "ipfilter_hookvndl3v4_out"; +char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; +char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; +char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; /* viona hook names */ char *hook_viona_in = "ipfilter_hookviona_in"; @@ -170,6 +188,39 @@ char *hook_viona_in_gz = "ipfilter_hookviona_in_gz"; char *hook_viona_out = "ipfilter_hookviona_out"; char *hook_viona_out_gz = "ipfilter_hookviona_out_gz"; +/* + * For VIONA. The net_{instance,protocol}_notify_register() functions only + * deal with per-callback-function granularity. We need two wrapper functions + * for GZ-controlled and per-zone instances. 
+ */ +static int +ipf_hook_instance_notify_gz(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy, const char *instance) +{ + return (ipf_hook_instance_notify(command, arg, netid, dummy, instance)); +} + +static int +ipf_hook_instance_notify_ngz(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy, const char *instance) +{ + return (ipf_hook_instance_notify(command, arg, netid, dummy, instance)); +} + +static int +ipf_hook_protocol_notify_gz(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy, const char *he_name) +{ + return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name)); +} + +static int +ipf_hook_protocol_notify_ngz(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy, const char *he_name) +{ + return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name)); +} + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. */ @@ -267,10 +318,36 @@ ipf_stack_t *ifs; } /* + * Remove VND hooks + */ + if (ifs->ifs_ipf_vndl3v4 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in); + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v4 = NULL; + } + + if (ifs->ifs_ipf_vndl3v6 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in); + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v6 = NULL; + } + + /* * Remove notification of viona hooks */ net_instance_notify_unregister(ifs->ifs_netid, - ipf_hook_instance_notify); + ifs->ifs_gz_controlled ? 
ipf_hook_instance_notify_gz : + ipf_hook_instance_notify_ngz); #undef UNDO_HOOK @@ -278,6 +355,10 @@ ipf_stack_t *ifs; * Normally, viona will unregister itself before ipldetach() is called, * so these will be no-ops, but out of caution, we try to make sure * we've removed any of our references. + * + * For now, the _gz and _ngz versions are both wrappers to what's + * below. Just call it directly, but if that changes fix here as + * well. */ (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, NH_PHYSICAL_IN); @@ -295,6 +376,10 @@ ipf_stack_t *ifs; * traced, we pass the same value the nethook framework would * pass, even though the callback does not currently use the * value. + * + * For now, the _gz and _ngz versions are both wrappers to + * what's below. Just call it directly, but if that changes + * fix here as well. */ (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr, NULL, Hn_VIONA); @@ -495,6 +580,49 @@ ipf_stack_t *ifs; } /* + * Add VND INET hooks + */ + ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET); + if (ifs->ifs_ipf_vndl3v4 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in, + hook4_vnd_in, hook4_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out, + hook4_vnd_out, hook4_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0); + if (!ifs->ifs_hookvndl3v4_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0); + if (!ifs->ifs_hookvndl3v4_physical_out) + goto hookup_failed; + + + /* + * VND INET6 hooks + */ + ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6); + if (ifs->ifs_ipf_vndl3v6 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in, + hook6_vnd_in, 
hook6_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out, + hook6_vnd_out, hook6_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0); + if (!ifs->ifs_hookvndl3v6_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0); + if (!ifs->ifs_hookvndl3v6_physical_out) + goto hookup_failed; + + /* * VIONA INET hooks. While the nethook framework allows us to register * hooks for events that haven't been registered yet, we instead * register and unregister our hooks in response to notifications @@ -504,9 +632,15 @@ ipf_stack_t *ifs; * is unloaded, the viona module cannot later re-register them if it * gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded * even on DEBUG kernels, they do not experience this issue. + * + * Today, the per-zone ones don't matter for a BHYVE-branded zone, BUT + * the ipf_hook_protocol_notify() function is GZ vs. per-zone aware. + * Employ two different versions of ipf_hook_instance_notify(), one for + * the GZ-controlled, and one for the per-zone one. */ - if (net_instance_notify_register(id, ipf_hook_instance_notify, - ifs) != 0) + if (net_instance_notify_register(id, ifs->ifs_gz_controlled ? + ipf_hook_instance_notify_gz : ipf_hook_instance_notify_ngz, ifs) != + 0) goto hookup_failed; /* @@ -688,6 +822,7 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, { ipf_stack_t *ifs = arg; int ret = 0; + const boolean_t gz = ifs->ifs_gz_controlled; /* We currently only care about viona hooks */ if (strcmp(instance, Hn_VIONA) != 0) @@ -705,14 +840,16 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, return (EPROTONOSUPPORT); ret = net_protocol_notify_register(ifs->ifs_ipf_viona, - ipf_hook_protocol_notify, ifs); + gz ? 
ipf_hook_protocol_notify_gz : + ipf_hook_protocol_notify_ngz, ifs); VERIFY(ret == 0 || ret == ESHUTDOWN); break; case HN_UNREGISTER: if (ifs->ifs_ipf_viona == NULL) break; VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona, - ipf_hook_protocol_notify)); + gz ? ipf_hook_protocol_notify_gz : + ipf_hook_protocol_notify_ngz)); VERIFY0(net_protocol_release(ifs->ifs_ipf_viona)); ifs->ifs_ipf_viona = NULL; break; @@ -821,6 +958,9 @@ int *rp; return ENXIO; unit = isp->ipfs_minor; + if (unit == IPL_LOGEV) + return (ipf_cfwlog_ioctl(dev, cmd, data, mode, cp, rp)); + zid = crgetzoneid(cp); if (cmd == SIOCIPFZONESET) { if (zid == GLOBAL_ZONEID) @@ -1129,14 +1269,14 @@ ipf_stack_t *ifs; { net_handle_t nif; - if (v == 4) - nif = ifs->ifs_ipf_ipv4; - else if (v == 6) - nif = ifs->ifs_ipf_ipv6; - else - return 0; - - return (net_phylookup(nif, name)); + if (v == 4) + nif = ifs->ifs_ipf_ipv4; + else if (v == 6) + nif = ifs->ifs_ipf_ipv6; + else + return 0; + + return (net_phylookup(nif, name)); } /* @@ -1161,11 +1301,35 @@ cred_t *cred; if (IPL_LOGMAX < min) return ENXIO; + /* Special-case ipfev: global-zone-open only. */ + if (min == IPL_LOGEV) { + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (ENXIO); + /* + * Else enable the CFW logging of events. + * NOTE: For now, we only allow one open at a time. + * Use atomic_cas to confirm/deny. And also for now, + * assume sizeof (boolean_t) == sizeof (uint_t). + * + * Per the *_{refrele,REFRELE}() in other parts of inet, + * ensure all loads/stores complete before calling cas. + * membar_exit() does this. + */ + membar_exit(); + if (atomic_cas_uint(&ipf_cfwlog_enabled, 0, 1) != 0) + return (EBUSY); + } + minor = (minor_t)(uintptr_t)vmem_alloc(ipf_minor, 1, VM_BESTFIT | VM_SLEEP); if (ddi_soft_state_zalloc(ipf_state, minor) != 0) { vmem_free(ipf_minor, (void *)(uintptr_t)minor, 1); + if (min == IPL_LOGEV) { + /* See above... 
*/ + membar_exit(); + VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1); + } return ENXIO; } @@ -1187,6 +1351,7 @@ int flags, otype; cred_t *cred; { minor_t min = getminor(dev); + ipf_devstate_t *isp; #ifdef IPFDEBUG cmn_err(CE_CONT, "iplclose(%x,%x,%x,%x)\n", dev, flags, otype, cred); @@ -1195,6 +1360,15 @@ cred_t *cred; if (IPL_LOGMAX < min) return ENXIO; + isp = ddi_get_soft_state(ipf_state, min); + if (isp != NULL && isp->ipfs_minor == IPL_LOGEV) { + /* + * Disable CFW logging. See iplopen() for details. + */ + membar_exit(); + VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1); + } + ddi_soft_state_free(ipf_state, min); vmem_free(ipf_minor, (void *)(uintptr_t)min, 1); @@ -1225,6 +1399,8 @@ cred_t *cp; return ENXIO; unit = isp->ipfs_minor; + if (unit == IPL_LOGEV) + return (ipf_cfwlog_read(dev, uio, cp)); /* * ipf_find_stack returns with a read lock on ifs_ipf_global @@ -1277,6 +1453,9 @@ cred_t *cp; return ENXIO; unit = isp->ipfs_minor; + if (unit == IPL_LOGEV) + return (EIO); /* ipfev doesn't support write yet. */ + /* * ipf_find_stack returns with a read lock on ifs_ipf_global */ @@ -2068,8 +2247,11 @@ frdest_t *fdp; return (-1); } - /* Check the src here, fin_ifp is the src interface. */ - if (!(fin->fin_flx & FI_GENERATED) && + /* + * If we're forwarding (vs. injecting), check the src here, fin_ifp is + * the src interface. + */ + if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) && !fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) { return (-1); } @@ -2138,8 +2320,8 @@ frdest_t *fdp; inj->ni_physical = net_routeto(net_data_p, sinp, NULL); } - /* we're checking the destination here */ - if (!(fin->fin_flx & FI_GENERATED) && + /* If we're forwarding (vs. injecting), check the destination here. 
*/ + if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) && !fr_forwarding_enabled(inj->ni_physical, net_data_p)) { goto bad_fastroute; } @@ -2355,6 +2537,42 @@ int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg, } /* ------------------------------------------------------------------------ */ +/* Function: ipf_hookvndl3v{4,6}_{in,out} */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The vnd hooks are private hooks to ON. They represent a layer 2 */ +/* datapath generally used to implement virtual machines. The driver sends */ +/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ +/* them is in the upper 16 bits while the remaining bits are the */ +/* traditional packet hook flags. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. */ +/* ------------------------------------------------------------------------ */ +/*ARGSUSED*/ +int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_in(token, info, arg); +} + +int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_in(token, info, arg); +} + +/*ARGSUSED*/ +int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_out(token, info, arg); +} + +int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_out(token, info, arg); +} + +/* ------------------------------------------------------------------------ */ /* Function: ipf_hookviona_{in,out} */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ @@ -3120,16 +3338,16 @@ fr_info_t *fin; /* both IP versions. The details are going to be explained here. */ /* */ /* The packet looks as follows: */ -/* xxx | IP hdr | IP payload ... 
| */ -/* ^ ^ ^ ^ */ -/* | | | | */ +/* xxx | IP hdr | IP payload ... | */ +/* ^ ^ ^ ^ */ +/* | | | | */ /* | | | fin_m->b_wptr = fin->fin_dp + fin->fin_dlen */ /* | | | */ /* | | `- fin_m->fin_dp (in case of IPv4 points to L4 header) */ /* | | */ /* | `- fin_m->b_rptr + fin_ipoff (fin_ipoff is most likely 0 in case */ /* | of loopback) */ -/* | */ +/* | */ /* `- fin_m->b_rptr - points to L2 header in case of physical NIC */ /* */ /* All relevant IP headers are pulled up into the first mblk. It happened */ diff --git a/usr/src/uts/common/inet/ipf/ip_log.c b/usr/src/uts/common/inet/ipf/ip_log.c index 584ee42d9a..b70e320def 100644 --- a/usr/src/uts/common/inet/ipf/ip_log.c +++ b/usr/src/uts/common/inet/ipf/ip_log.c @@ -8,7 +8,7 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <sys/param.h> @@ -373,9 +373,11 @@ u_int flags; if (fin->fin_fr != NULL) { ipfl.fl_loglevel = fin->fin_fr->fr_loglevel; ipfl.fl_logtag = fin->fin_fr->fr_logtag; + bcopy(fin->fin_fr->fr_uuid, ipfl.fl_uuid, sizeof (uuid_t)); } else { ipfl.fl_loglevel = 0xffff; ipfl.fl_logtag = FR_NOLOGTAG; + bzero(ipfl.fl_uuid, sizeof (uuid_t)); } if (fin->fin_nattag != NULL) bcopy(fin->fin_nattag, (void *)&ipfl.fl_nattag, diff --git a/usr/src/uts/common/inet/ipf/ip_state.c b/usr/src/uts/common/inet/ipf/ip_state.c index 184f8775b6..a45bcbfdaf 100644 --- a/usr/src/uts/common/inet/ipf/ip_state.c +++ b/usr/src/uts/common/inet/ipf/ip_state.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #if defined(KERNEL) || defined(_KERNEL) @@ -108,6 +108,7 @@ struct file; # include <sys/systm.h> # endif #endif +#include <sys/uuid.h> /* END OF INCLUDES */ @@ -1445,6 +1446,7 @@ u_int flags; is->is_sti.tqe_flags |= TQE_RULEBASED; } is->is_tag = fr->fr_logtag; + memcpy(is->is_uuid, fr->fr_uuid, sizeof (uuid_t)); is->is_ifp[(out << 1) + 1] = fr->fr_ifas[1]; is->is_ifp[(1 - out) << 1] = fr->fr_ifas[2]; @@ -1524,6 +1526,9 @@ u_int flags; if (ifs->ifs_ipstate_logging) ipstate_log(is, ISL_NEW, ifs); + if (IFS_CFWLOG(ifs, is->is_rule)) + ipf_log_cfwlog(is, ISL_NEW, ifs); + RWLOCK_EXIT(&ifs->ifs_ipf_state); fin->fin_rev = IP6_NEQ(&is->is_dst, &fin->fin_daddr); fin->fin_flx |= FI_STATE; @@ -2314,6 +2319,8 @@ u_32_t cmask; is->is_flags &= ~(SI_W_SPORT|SI_W_DPORT); if ((flags & SI_CLONED) && ifs->ifs_ipstate_logging) ipstate_log(is, ISL_CLONE, ifs); + if ((flags & SI_CLONED) && IFS_CFWLOG(ifs, is->is_rule)) + ipf_log_cfwlog(is, ISL_CLONE, ifs); } ret = -1; @@ -3397,6 +3404,15 @@ ipf_stack_t *ifs; if (ifs->ifs_ipstate_logging != 0 && why != 0) ipstate_log(is, why, ifs); + /* + * For now, ipf_log_cfwlog() copes with all "why" values. Strictly + * speaking, though, they all map to one event (CFWEV_END), which for + * now is not supported, hence no code calling ipf_log_cfwlog() like + * below: + * + * if (why != 0 && IFS_CFWLOG(ifs, is->is_rule)) + * ipf_log_cfwlog(is, why, ifs); + */ if (is->is_rule != NULL) { is->is_rule->fr_statecnt--; @@ -3931,7 +3947,6 @@ int flags; return rval; } - /* ------------------------------------------------------------------------ */ /* Function: ipstate_log */ /* Returns: Nil */ diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf index 6b36f9fdbf..f49e024a72 100644 --- a/usr/src/uts/common/inet/ipf/ipf.conf +++ b/usr/src/uts/common/inet/ipf/ipf.conf @@ -1,3 +1,8 @@ # # name="ipf" parent="pseudo" instance=0; + +# Increase the state table limits. 
fr_statemax should be ~70% of fr_statesize, +# and both should be prime numbers +fr_statesize=151007; +fr_statemax=113279; diff --git a/usr/src/uts/common/inet/ipf/netinet/Makefile b/usr/src/uts/common/inet/ipf/netinet/Makefile index cca3b48ac4..88f91e633f 100644 --- a/usr/src/uts/common/inet/ipf/netinet/Makefile +++ b/usr/src/uts/common/inet/ipf/netinet/Makefile @@ -1,16 +1,15 @@ # -#ident "%Z%%M% %I% %E% SMI" -# # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2019 Joyent, Inc. # # uts/common/inet/ipf/netinet/Makefile # # include global definitions include ../../../../../Makefile.master -HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h \ - ip_frag.h ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h +HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h ip_frag.h \ + ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h ipf_cfw.h ROOTDIRS= $(ROOT)/usr/include/netinet diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h index 4c3c5683b5..bb5ce7bd6c 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h +++ b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h @@ -8,7 +8,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019, Joyent, Inc. */ #ifndef __IP_FIL_H__ @@ -16,6 +16,7 @@ #include "netinet/ip_compat.h" #include <sys/zone.h> +#include <sys/uuid.h> #ifdef SOLARIS #undef SOLARIS @@ -115,6 +116,8 @@ #define SIOCDELFR SIOCRMAFR #define SIOCINSFR SIOCINAFR # define SIOCIPFZONESET _IOWR('r', 97, struct ipfzoneobj) +# define SIOCIPFCFWCFG _IOR('r', 98, struct ipfcfwcfg) +# define SIOCIPFCFWNEWSZ _IOWR('r', 99, struct ipfcfwcfg) /* * What type of table is getting flushed? 
@@ -600,6 +603,7 @@ typedef struct frentry { u_32_t fr_flags; /* per-rule flags && options (see below) */ u_32_t fr_logtag; /* user defined log tag # */ u_32_t fr_collect; /* collection number */ + uuid_t fr_uuid; /* user defined uuid */ u_int fr_arg; /* misc. numeric arg for rule */ u_int fr_loglevel; /* syslog log facility + priority */ u_int fr_age[2]; /* non-TCP timeouts */ @@ -728,6 +732,7 @@ typedef struct frentry { #define FR_NEWISN 0x400000 /* new ISN for outgoing TCP */ #define FR_NOICMPERR 0x800000 /* do not match ICMP errors in state */ #define FR_STATESYNC 0x1000000 /* synchronize state to slave */ +#define FR_CFWLOG 0x2000000 /* Global CFW logging enabled */ #define FR_NOMATCH 0x8000000 /* no match occured */ /* 0x10000000 FF_LOGPASS */ /* 0x20000000 FF_LOGBLOCK */ @@ -883,6 +888,7 @@ typedef struct ipflog { u_32_t fl_lflags; u_32_t fl_logtag; ipftag_t fl_nattag; + uuid_t fl_uuid; u_short fl_plen; /* extra data after hlen */ u_short fl_loglevel; /* syslog log level */ char fl_group[FR_GROUPLEN]; @@ -931,6 +937,7 @@ typedef struct ipflog { #define IPSYNC_NAME "/dev/ipsync" #define IPSCAN_NAME "/dev/ipscan" #define IPLOOKUP_NAME "/dev/iplookup" +#define IPFEV_NAME "/dev/ipfev" #define IPL_LOGIPF 0 /* Minor device #'s for accessing logs */ #define IPL_LOGNAT 1 @@ -939,8 +946,9 @@ typedef struct ipflog { #define IPL_LOGSYNC 4 #define IPL_LOGSCAN 5 #define IPL_LOGLOOKUP 6 -#define IPL_LOGCOUNT 7 -#define IPL_LOGMAX 7 +#define IPL_LOGEV 7 +#define IPL_LOGCOUNT 8 +#define IPL_LOGMAX 8 #define IPL_LOGSIZE (IPL_LOGMAX + 1) #define IPL_LOGALL -1 #define IPL_LOGNONE -2 @@ -1181,6 +1189,21 @@ typedef struct ipfzoneobj { char ipfz_zonename[ZONENAME_MAX]; /* zone to act on */ } ipfzoneobj_t; +/* ioctl to grab CFW logging parameters */ +typedef struct ipfcfwcfg { + /* CFG => Max event size, NEWSZ => ignored in, like CFG out. */ + uint32_t ipfcfwc_maxevsize; + /* + * CFG => Current ring size, + * NEWSZ => New ring size, must be 2^N for 3 <= N <= 31. 
+ */ + uint32_t ipfcfwc_evringsize; + /* CFG => Number of event reports, NEWSZ => ignored in, like CFG out. */ + uint64_t ipfcfwc_evreports; + /* CFG => Number of event drops, NEWSZ => ignored in, like CFG out. */ + uint64_t ipfcfwc_evdrops; +} ipfcfwcfg_t; + #if defined(_KERNEL) /* Set ipfs_zoneid to this if no zone has been set: */ #define IPFS_ZONE_UNSET -2 @@ -1560,6 +1583,23 @@ extern int ipllog __P((int, fr_info_t *, void **, size_t *, int *, int, ipf_stack_t *)); extern void fr_logunload __P((ipf_stack_t *)); +/* SmartOS single-FD global-zone state accumulator (see cfw.c) */ +extern boolean_t ipf_cfwlog_enabled; +struct ipstate; /* Ugggh. */ +extern void ipf_log_cfwlog __P((struct ipstate *, uint_t, ipf_stack_t *)); +extern void ipf_block_cfwlog __P((frentry_t *, fr_info_t *, ipf_stack_t *)); +#define IFS_CFWLOG(ifs, fr) ((ifs)->ifs_gz_controlled && ipf_cfwlog_enabled &&\ + fr != NULL && ((fr)->fr_flags & FR_CFWLOG)) +struct cfwev_s; /* See ipf_cfw.h */ +extern boolean_t ipf_cfwev_consume __P((struct cfwev_s *, boolean_t)); +/* See cfw.c's ipf_cfwev_consume_many() for details. */ +typedef uint_t (*cfwmanycb_t) __P((struct cfwev_s *, uint_t, void *)); +extern int ipf_cfwlog_read __P((dev_t, struct uio *, struct cred *)); +extern int ipf_cfwlog_ioctl __P((dev_t, int, intptr_t, int, cred_t *, int *)); +#define IPF_CFW_RING_ALLOCATE 0 +#define IPF_CFW_RING_DESTROY 1 +extern int ipf_cfw_ring_resize(uint32_t); + extern frentry_t *fr_acctpkt __P((fr_info_t *, u_32_t *)); extern int fr_copytolog __P((int, char *, int)); extern u_short fr_cksum __P((mb_t *, ip_t *, int, void *)); diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_state.h b/usr/src/uts/common/inet/ipf/netinet/ip_state.h index 4c605c1b89..ef315d5ef1 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ip_state.h +++ b/usr/src/uts/common/inet/ipf/netinet/ip_state.h @@ -8,11 +8,14 @@ * * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
+ * Copyright 2019 Joyent, Inc. */ #ifndef __IP_STATE_H__ #define __IP_STATE_H__ +#include <sys/uuid.h> + #if defined(__STDC__) || defined(__GNUC__) || defined(_AIX51) # define SIOCDELST _IOW('r', 61, struct ipfobj) #else @@ -66,6 +69,7 @@ typedef struct ipstate { /* in both directions */ u_32_t is_optmsk[2]; /* " " mask */ /* in both directions */ + uuid_t is_uuid; u_short is_sec; /* security options set */ u_short is_secmsk; /* " " mask */ u_short is_auth; /* authentication options set */ diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h new file mode 100644 index 0000000000..1972d2b3f7 --- /dev/null +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h @@ -0,0 +1,69 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef __IPF_CFW_H__ +#define __IPF_CFW_H__ + +#include <sys/types.h> +#include <inet/ip6.h> +#include <sys/uuid.h> + +/* Because ipf compiles this kernel file in userland testing... */ +#ifndef ASSERT3U +#define ASSERT3U(a, b, c) ASSERT(a ## b ## c); +#endif /* ASSERT3U */ + +/* + * CFW Event, which is emitted to a global-zone listener. The global-zone + * listener solves the one-fd-per-zone problem of using each zone's ipmon. + * + * These must be 64-bit aligned because they form an array in-kernel. There + * might be reserved fields to ensure that alignment. 
+ */ +#define CFWEV_BLOCK 1 +#define CFWEV_BEGIN 2 +#define CFWEV_END 3 +#define CFWDIR_IN 1 +#define CFWDIR_OUT 2 + +typedef struct cfwev_s { + uint16_t cfwev_type; /* BEGIN, END, BLOCK */ + uint16_t cfwev_length; /* in bytes, so capped to 65535 bytes */ + zoneid_t cfwev_zonedid; /* Pullable from ipf_stack_t. */ + + uint32_t cfwev_ruleid; /* Pullable from fr_info_t. */ + uint16_t cfwev_sport; /* Source port (network order) */ + uint16_t cfwev_dport; /* Dest. port (network order) */ + + uint8_t cfwev_protocol; /* IPPROTO_* */ + /* "direction" informs if src/dst are local/remote or remote/local. */ + uint8_t cfwev_direction; + uint8_t cfwev_reserved[6]; /* Ensures 64-bit alignment. */ + + in6_addr_t cfwev_saddr; /* IPv4 addresses are V4MAPPED. */ + in6_addr_t cfwev_daddr; + + /* + * Because of 'struct timeval' being different between 32-bit and + * 64-bit ABIs, this interface is only usable by 64-bit binaries. + */ + struct timeval cfwev_tstamp; + + uuid_t cfwev_ruleuuid; /* Pullable from fr_info_t. */ +} cfwev_t; + + + +#endif /* __IPF_CFW_H__ */ diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index 0ceea1e921..0b2a8d826f 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2018 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef __IPF_STACK_H__ @@ -46,6 +46,7 @@ struct ipf_stack { struct ipf_stack *ifs_gz_cont_ifs; netid_t ifs_netid; zoneid_t ifs_zone; + zoneid_t ifs_zone_did; boolean_t ifs_gz_controlled; /* ipf module */ @@ -126,6 +127,11 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookvndl3v4_in; + hook_t *ifs_ipfhookvndl3v6_in; + hook_t *ifs_ipfhookvndl3v4_out; + hook_t *ifs_ipfhookvndl3v6_out; + hook_t *ifs_ipfhookviona_in; hook_t *ifs_ipfhookviona_out; @@ -140,12 +146,18 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookvndl3v4_physical_in; + boolean_t ifs_hookvndl3v6_physical_in; + boolean_t ifs_hookvndl3v4_physical_out; + boolean_t ifs_hookvndl3v6_physical_out; boolean_t ifs_hookviona_physical_in; boolean_t ifs_hookviona_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_vndl3v4; + net_handle_t ifs_ipf_vndl3v6; net_handle_t ifs_ipf_viona; /* ip_auth.c */ @@ -305,6 +317,7 @@ struct ipf_stack { char *ifs_addmask_key; char *ifs_rn_zeros; char *ifs_rn_ones; + #ifdef KERNEL /* kstats for inbound and outbound */ kstat_t *ifs_kstatp[2]; diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c index c541f4dddc..5ccbfa3188 100644 --- a/usr/src/uts/common/inet/ipf/solaris.c +++ b/usr/src/uts/common/inet/ipf/solaris.c @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ /* @@ -116,7 +116,7 @@ static void ipf_stack_shutdown __P((const netid_t, void *)); static int ipf_property_g_update __P((dev_info_t *)); static char *ipf_devfiles[] = { IPL_NAME, IPNAT_NAME, IPSTATE_NAME, IPAUTH_NAME, IPSYNC_NAME, IPSCAN_NAME, - IPLOOKUP_NAME, NULL }; + IPLOOKUP_NAME, IPFEV_NAME, NULL }; extern void *ipf_state; /* DDI state */ extern vmem_t *ipf_minor; /* minor number arena */ @@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg) /* * Destroy things for ipf for one stack. */ -/* ARGSUSED */ static void ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs) { @@ -742,6 +741,9 @@ ddi_attach_cmd_t cmd; ipf_dev_info = dip; + if (ipf_cfw_ring_resize(IPF_CFW_RING_ALLOCATE) != 0) + goto attach_failed; + ipfncb = net_instance_alloc(NETINFO_VERSION); if (ipfncb == NULL) goto attach_failed; @@ -769,6 +771,7 @@ ddi_attach_cmd_t cmd; } attach_failed: + (void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY); ddi_remove_minor_node(dip, NULL); ddi_prop_remove_all(dip); ddi_soft_state_fini(&ipf_state); @@ -796,6 +799,7 @@ ddi_detach_cmd_t cmd; * framework guarantees we are not active with this devinfo * node in any other entry points at this time. */ + (void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY); ddi_prop_remove_all(dip); i = ddi_get_instance(dip); ddi_remove_minor_node(dip, NULL); diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h index 5a168523ee..85ca5ebdec 100644 --- a/usr/src/uts/common/inet/mib2.h +++ b/usr/src/uts/common/inet/mib2.h @@ -23,6 +23,7 @@ /* * Copyright (c) 1990 Mentat Inc. * Copyright (c) 2015, 2016 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ /* @@ -1400,6 +1401,8 @@ typedef struct tcpConnEntryInfo_s { /* round-trip time smoothed average (us) */ Gauge ce_rtt_sa; /* current rto (retransmit timeout) */ + Gauge ce_rtt_sd; + /* current rto (retransmit timeout) */ Gauge ce_rto; /* round-trip time count */ Gauge ce_rtt_cnt; diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h index 6fb72d1d08..ddb482db78 100644 --- a/usr/src/uts/common/inet/rawip_impl.h +++ b/usr/src/uts/common/inet/rawip_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -43,6 +44,7 @@ extern "C" { #include <inet/ip.h> #include <inet/optcom.h> #include <inet/tunables.h> +#include <inet/bpf.h> /* * ICMP stack instances @@ -84,6 +86,10 @@ typedef struct icmp_s { mblk_t *icmp_fallback_queue_head; mblk_t *icmp_fallback_queue_tail; struct sockaddr_storage icmp_delayed_addr; + + krwlock_t icmp_bpf_lock; /* protects icmp_bpf */ + ip_bpf_insn_t *icmp_bpf_prog; /* SO_ATTACH_FILTER bpf */ + uint_t icmp_bpf_len; } icmp_t; /* diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c new file mode 100644 index 0000000000..6e1171de46 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/datafilt.c @@ -0,0 +1,116 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved. + */ + +/* + * This file implements a socketfilter used to defer TCP connections. 
+ * To defer a connection means to delay the return of accept(3SOCKET) + * until at least one byte is ready to be read(2). This filter may be + * applied automatically or programmatically through the use of + * soconfig(1M) and setsockopt(3SOCKET). + */ + +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/socketvar.h> +#include <sys/sockfilter.h> +#include <sys/note.h> +#include <sys/taskq.h> + +#define DATAFILT_MODULE "datafilt" + +static struct modlmisc dataf_modlmisc = { + &mod_miscops, + "Kernel data-ready socket filter" +}; + +static struct modlinkage dataf_modlinkage = { + MODREV_1, + &dataf_modlmisc, + NULL +}; + +static sof_rval_t +dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph, + void *parg, struct sockaddr *laddr, socklen_t laddrlen, + struct sockaddr *faddr, socklen_t faddrlen, void **cookiep) +{ + _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen, + cookiep)); + return (SOF_RVAL_DEFER); +} + +static void +dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr) +{ + _NOTE(ARGUNUSED(handle, cookie, cr)); +} + +static mblk_t * +dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags, + size_t *lenp) +{ + _NOTE(ARGUNUSED(cookie, flags, lenp)); + + if (mp != NULL && MBLKL(mp) > 0) { + sof_newconn_ready(handle); + sof_bypass(handle); + } + + return (mp); +} + +static sof_ops_t dataf_ops = { + .sofop_attach_passive = dataf_attach_passive_cb, + .sofop_detach = dataf_detach_cb, + .sofop_data_in = dataf_data_in_cb +}; + +int +_init(void) +{ + int err; + + /* + * This module is safe to attach even after some preliminary socket + * setup calls have taken place. See the comment for SOF_ATT_SAFE. 
+ */ + err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops, + SOF_ATT_SAFE); + if (err != 0) + return (err); + if ((err = mod_install(&dataf_modlinkage)) != 0) + (void) sof_unregister(DATAFILT_MODULE); + + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = sof_unregister(DATAFILT_MODULE)) != 0) + return (err); + + return (mod_remove(&dataf_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&dataf_modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c index 586d7f06f8..76191e93b8 100644 --- a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c +++ b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -51,6 +51,7 @@ #include <sys/mac_client.h> #include <sys/mac_provider.h> #include <sys/mac_client_priv.h> +#include <inet/bpf.h> #include <netpacket/packet.h> @@ -448,7 +449,7 @@ pfp_packet(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t flag) buffer = (uchar_t *)mp; } rw_enter(&ps->ps_bpflock, RW_READER); - if (bpf_filter(ps->ps_bpf.bf_insns, buffer, + if (ip_bpf_filter((ip_bpf_insn_t *)ps->ps_bpf.bf_insns, buffer, hdr.mhi_pktsize, buflen) == 0) { rw_exit(&ps->ps_bpflock); ps->ps_stats.tp_drops++; @@ -1336,7 +1337,7 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name, const void *optval, socklen_t optlen) { struct bpf_program prog; - struct bpf_insn *fcode; + ip_bpf_insn_t *fcode; struct pfpsock *ps; struct sock_proto_props sopp; int error = 0; @@ -1370,10 +1371,10 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name, return (EFAULT); } - if (bpf_validate(fcode, (int)prog.bf_len)) { + if (ip_bpf_validate(fcode, prog.bf_len)) { rw_enter(&ps->ps_bpflock, RW_WRITER); pfp_release_bpf(ps); - 
ps->ps_bpf.bf_insns = fcode; + ps->ps_bpf.bf_insns = (struct bpf_insn *)fcode; ps->ps_bpf.bf_len = size; rw_exit(&ps->ps_bpflock); diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 9fa40eccb6..e65af832eb 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -61,6 +61,10 @@ * connection are processed on that squeue. The connection ("conn") to * squeue mapping is stored in "conn_t" member "conn_sqp". * + * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is + * false and it will not have an associated conn_t, which means many aspects of + * the system, such as polling and switching squeues will not be used. + * * Since the processing of the connection cuts across multiple layers * but still allows packets for different connnection to be processed on * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or @@ -241,7 +245,7 @@ squeue_init(void) } squeue_t * -squeue_create(pri_t pri) +squeue_create(pri_t pri, boolean_t isip) { squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP); @@ -256,11 +260,36 @@ squeue_create(pri_t pri) sqp->sq_enter = squeue_enter; sqp->sq_drain = squeue_drain; + sqp->sq_isip = isip; return (sqp); } /* + * We need to kill the threads and then clean up. We should VERIFY that + * polling is disabled so we don't have to worry about disassociating from + * MAC/IP/etc. + */ +void +squeue_destroy(squeue_t *sqp) +{ + kt_did_t worker, poll; + mutex_enter(&sqp->sq_lock); + VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | + SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT))); + worker = sqp->sq_worker->t_did; + poll = sqp->sq_poll_thr->t_did; + sqp->sq_state |= SQS_EXIT; + cv_signal(&sqp->sq_poll_cv); + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + + thread_join(poll); + thread_join(worker); + kmem_cache_free(squeue_cache, sqp); +} + +/* * Bind squeue worker thread to the specified CPU, given by CPU id. 
* If the CPU id value is -1, bind the worker thread to the value * specified in sq_bind field. If a thread is already bound to a @@ -380,18 +409,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } SQUEUE_DBG_CLEAR(sqp); - CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -407,7 +439,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * still be best to process a single queued * item if it matches the active connection. */ - if (sqp->sq_first != NULL) { + if (sqp->sq_first != NULL && sqp->sq_isip) { squeue_try_drain_one(sqp, connp); } @@ -423,7 +455,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, return; } } else { - if (ira != NULL) { + if (sqp->sq_isip == B_TRUE && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -496,7 +528,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, if (!(sqp->sq_state & SQS_REENTER) && (process_flag != SQ_FILL) && (sqp->sq_first == NULL) && (sqp->sq_run == curthread) && (cnt == 1) && - (connp->conn_on_sqp == B_FALSE)) { + (sqp->sq_isip == B_FALSE || + connp->conn_on_sqp == B_FALSE)) { sqp->sq_state |= SQS_REENTER; mutex_exit(&sqp->sq_lock); @@ -511,15 +544,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. 
More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -540,7 +579,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif - if (ira != NULL) { + if (sqp->sq_isip && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -658,7 +697,7 @@ again: mp->b_prev = NULL; /* Is there an ip_recv_attr_t to handle? */ - if (ip_recv_attr_is_mblk(mp)) { + if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) { mblk_t *attrmp = mp; ASSERT(attrmp->b_cont != NULL); @@ -683,20 +722,25 @@ again: /* - * Handle squeue switching. More details in the - * block comment at the top of the file + * Handle squeue switching. More details in the block comment at + * the top of the file. non-IP squeues cannot switch, as there + * is no conn_t. 
*/ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -925,6 +969,11 @@ squeue_polling_thread(squeue_t *sqp) cv_wait(async, lock); CALLB_CPR_SAFE_END(&cprinfo, lock); + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | SQS_POLL_THR_QUIESCED); if (ctl_state != 0) { @@ -950,6 +999,9 @@ squeue_polling_thread(squeue_t *sqp) (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + /* Only IP related squeues should reach this point */ + VERIFY(sqp->sq_isip == B_TRUE); + poll_again: sq_rx_ring = sqp->sq_rx_ring; sq_get_pkts = sq_rx_ring->rr_rx; @@ -1079,6 +1131,7 @@ squeue_worker_thr_control(squeue_t *sqp) ill_rx_ring_t *rx_ring; ASSERT(MUTEX_HELD(&sqp->sq_lock)); + VERIFY(sqp->sq_isip == B_TRUE); if (sqp->sq_state & SQS_POLL_RESTART) { /* Restart implies a previous quiesce. */ @@ -1190,6 +1243,11 @@ squeue_worker(squeue_t *sqp) for (;;) { for (;;) { + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + /* * If the poll thread has handed control to us * we need to break out of the wait. 
@@ -1286,6 +1344,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp) again: sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) { @@ -1374,6 +1433,7 @@ squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn) ASSERT(MUTEX_HELD(&sqp->sq_lock)); ASSERT((sqp->sq_state & SQS_PROC) == 0); ASSERT(sqp->sq_run == NULL); + ASSERT(sqp->sq_isip); VERIFY(mp != NULL); /* @@ -1440,6 +1500,9 @@ squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn) CONN_DEC_REF(connp); SQUEUE_DBG_CLEAR(sqp); + if (ira != NULL) + ira_cleanup(ira, B_TRUE); + done: mutex_enter(&sqp->sq_lock); sqp->sq_state &= ~(SQS_PROC); @@ -1451,6 +1514,7 @@ squeue_synch_exit(conn_t *connp, int flag) { squeue_t *sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); ASSERT(flag == SQ_NODRAIN || flag == SQ_PROCESS); mutex_enter(&sqp->sq_lock); diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 775c5abe6b..3ed2b7174a 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. @@ -137,6 +137,7 @@ typedef struct tcphdra_s { struct conn_s; struct tcp_listen_cnt_s; +struct tcp_rg_s; /* * Control structure for each open TCP stream, @@ -407,6 +408,13 @@ typedef struct tcp_s { struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */ struct tcp_s **tcp_ptpbhn; + /* + * Group of tcp_t entries bound to the same address and port via + * SO_REUSEPORT. The pointer itself is protected by tf_lock in the + * containing tcps_bind_fanout slot. 
+ */ + struct tcp_rg_s *tcp_rg_bind; + uint_t tcp_maxpsz_multiplier; uint32_t tcp_lso_max; /* maximum LSO payload */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 9348ea3d0f..427a6df274 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -961,8 +961,7 @@ void tcp_stop_lingering(tcp_t *tcp) { clock_t delta = 0; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; + conn_t *connp = tcp->tcp_connp; tcp->tcp_linger_tid = 0; if (tcp->tcp_state > TCPS_LISTEN) { @@ -990,7 +989,7 @@ tcp_stop_lingering(tcp_t *tcp) if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcps, tcp_detach_time_wait); + TCP_DBGSTAT(tcp->tcp_tcps, tcp_detach_time_wait); goto finish; } @@ -1429,6 +1428,21 @@ tcp_free(tcp_t *tcp) tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv); /* + * Destroy any association with SO_REUSEPORT group. + */ + if (tcp->tcp_rg_bind != NULL) { + /* + * This is only necessary for connections which enabled + * SO_REUSEPORT but were never bound. Such connections should + * be the one and only member of the tcp_rg_tp to which they + * have been associated. + */ + VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp)); + tcp_rg_destroy(tcp->tcp_rg_bind); + tcp->tcp_rg_bind = NULL; + } + + /* * If this is a non-STREAM socket still holding on to an upper * handle, release it. As a result of fallback we might also see * STREAMS based conns with upper handles, in which case there is @@ -2477,8 +2491,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) * Path MTU might have changed by either increase or decrease, so need to * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny * or negative MSS, since tcp_mss_set() will do it. + * + * Returns B_TRUE when the connection PMTU changes, otherwise B_FALSE. 
*/ -void +boolean_t tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) { uint32_t pmtu; @@ -2488,10 +2504,10 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) iaflags_t ixaflags; if (tcp->tcp_tcps->tcps_ignore_path_mtu) - return; + return (B_FALSE); if (tcp->tcp_state < TCPS_ESTABLISHED) - return; + return (B_FALSE); /* * Always call ip_get_pmtu() to make sure that IP has updated @@ -2511,13 +2527,13 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) * Nothing to change, so just return. */ if (mss == tcp->tcp_mss) - return; + return (B_FALSE); /* * Currently, for ICMP errors, only PMTU decrease is handled. */ if (mss > tcp->tcp_mss && decrease_only) - return; + return (B_FALSE); DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss); @@ -2552,6 +2568,7 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; } ixa->ixa_flags = ixaflags; + return (B_TRUE); } int @@ -3424,7 +3441,7 @@ tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, tcp_update_lso(tcp, connp->conn_ixa); break; case IXAN_PMTU: - tcp_update_pmtu(tcp, B_FALSE); + (void) tcp_update_pmtu(tcp, B_FALSE); break; case IXAN_ZCOPY: tcp_update_zcopy(tcp); @@ -3755,7 +3772,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) { tcp_stack_t *tcps; int i; - int error = 0; major_t major; size_t arrsz; @@ -3819,8 +3835,7 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcps->tcps_mibkp = tcp_kstat_init(stackid); major = mod_name_to_major(INET_NAME); - error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); - ASSERT(error == 0); + VERIFY0(ldi_ident_from_major(major, &tcps->tcps_ldi_ident)); tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c index 86242fc944..5c2e1e1932 100644 --- 
a/usr/src/uts/common/inet/tcp/tcp_bind.c +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -56,6 +57,7 @@ static uint32_t tcp_random_anon_port = 1; static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, cred_t *cr); static in_port_t tcp_get_next_priv_port(const tcp_t *); +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *); /* * Hash list insertion routine for tcp_t structures. Each hash bucket @@ -173,6 +175,16 @@ tcp_bind_hash_remove(tcp_t *tcp) ASSERT(lockp != NULL); mutex_enter(lockp); + + /* destroy any association with SO_REUSEPORT group */ + if (tcp->tcp_rg_bind != NULL) { + if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) { + /* Last one out turns off the lights */ + tcp_rg_destroy(tcp->tcp_rg_bind); + } + tcp->tcp_rg_bind = NULL; + } + if (tcp->tcp_ptpbhn) { tcpnext = tcp->tcp_bind_hash_port; if (tcpnext != NULL) { @@ -638,13 +650,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } /* - * If the "bind_to_req_port_only" parameter is set, if the requested port - * number is available, return it, If not return 0 + * If the "bind_to_req_port_only" parameter is set and the requested port + * number is available, return it (else return 0). * - * If "bind_to_req_port_only" parameter is not set and - * If the requested port number is available, return it. If not, return - * the first anonymous port we happen across. If no anonymous ports are - * available, return 0. addr is the requested local address, if any. + * If "bind_to_req_port_only" parameter is not set and the requested port + * number is available, return it. If not, return the first anonymous port we + * happen across. If no anonymous ports are available, return 0. 
* * In either case, when succeeding update the tcp_t to record the port number * and insert it in the bind hash table. @@ -664,6 +675,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int loopmax; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + boolean_t reuseport = connp->conn_reuseport; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -700,6 +712,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, tf_t *tbf; tcp_t *ltcp; conn_t *lconnp; + boolean_t attempt_reuse = B_FALSE; lport = htons(port); @@ -726,6 +739,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { boolean_t not_socket; boolean_t exclbind; + boolean_t addrmatch; lconnp = ltcp->tcp_connp; @@ -831,22 +845,35 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, &lconnp->conn_faddr_v6))) continue; + addrmatch = IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6); + + if (addrmatch && reuseport && bind_to_req_port_only && + (ltcp->tcp_state == TCPS_BOUND || + ltcp->tcp_state == TCPS_LISTEN)) { + /* + * This entry is bound to the exact same + * address and port. If SO_REUSEPORT is set on + * the calling socket, attempt to reuse this + * binding if it too had SO_REUSEPORT enabled + * when it was bound. + */ + attempt_reuse = (ltcp->tcp_rg_bind != NULL); + break; + } + if (!reuseaddr) { /* - * No socket option SO_REUSEADDR. - * If existing port is bound to - * a non-wildcard IP address - * and the requesting stream is - * bound to a distinct - * different IP addresses - * (non-wildcard, also), keep - * going. + * No socket option SO_REUSEADDR. If an + * existing port is bound to a non-wildcard IP + * address and the requesting stream is bound + * to a distinct different IP address + * (non-wildcard, also), keep going. 
*/ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( lconnp->conn_bound_addr_v6) && - !IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6)) + !addrmatch) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -861,27 +888,49 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * socket option SO_REUSEADDR is set on the * binding tcp_t. * - * If two streams are bound to - * same IP address or both addr - * and bound source are wildcards - * (INADDR_ANY), we want to stop - * searching. - * We have found a match of IP source - * address and source port, which is - * refused regardless of the - * SO_REUSEADDR setting, so we break. + * If two streams are bound to the same IP + * address or both addr and bound source are + * wildcards (INADDR_ANY), we want to stop + * searching. We have found a match of IP + * source address and source port, which is + * refused regardless of the SO_REUSEADDR + * setting, so we break. */ - if (IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6) && + if (addrmatch && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; } } - if (ltcp != NULL) { + if (ltcp != NULL && !attempt_reuse) { /* The port number is busy */ mutex_exit(&tbf->tf_lock); } else { + if (attempt_reuse) { + int err; + struct tcp_rg_s *rg; + + ASSERT(ltcp != NULL); + ASSERT(ltcp->tcp_rg_bind != NULL); + ASSERT(tcp->tcp_rg_bind != NULL); + ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind); + + err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp); + if (err != 0) { + mutex_exit(&tbf->tf_lock); + return (0); + } + /* + * Now that the newly-binding socket has joined + * the existing reuseport group on ltcp, it + * should clean up its own (empty) group. + */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = ltcp->tcp_rg_bind; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } + /* * This port is ours. 
Insert in fanout and mark as * bound to prevent others from getting the port @@ -946,3 +995,124 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, } while (++count < loopmax); return (0); } + +/* Max number of members in TCP SO_REUSEPORT group */ +#define TCP_RG_SIZE_MAX 64 +/* Step size when expanding members array */ +#define TCP_RG_SIZE_STEP 2 + + +tcp_rg_t * +tcp_rg_init(tcp_t *tcp) +{ + tcp_rg_t *rg; + rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP_LAZY); + if (rg == NULL) + return (NULL); + rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), KM_NOSLEEP_LAZY); + if (rg->tcprg_members == NULL) { + kmem_free(rg, sizeof (tcp_rg_t)); + return (NULL); + } + + mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL); + rg->tcprg_size = 2; + rg->tcprg_count = 1; + rg->tcprg_active = 1; + rg->tcprg_members[0] = tcp; + return (rg); +} + +void +tcp_rg_destroy(tcp_rg_t *rg) +{ + mutex_enter(&rg->tcprg_lock); + ASSERT(rg->tcprg_count == 0); + ASSERT(rg->tcprg_active == 0); + kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *)); + mutex_destroy(&rg->tcprg_lock); + kmem_free(rg, sizeof (struct tcp_rg_s)); +} + +static int +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp) +{ + mutex_enter(&rg->tcprg_lock); + + VERIFY(rg->tcprg_size > 0); + VERIFY(rg->tcprg_count <= rg->tcprg_size); + if (rg->tcprg_count != 0) { + cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred; + cred_t *newcred = tcp->tcp_connp->conn_cred; + + if (crgetuid(oldcred) != crgetuid(newcred) || + crgetzoneid(oldcred) != crgetzoneid(newcred)) { + mutex_exit(&rg->tcprg_lock); + return (EPERM); + } + } + + if (rg->tcprg_count == rg->tcprg_size) { + unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *); + unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP; + tcp_t **newmembers; + + if (newsize > TCP_RG_SIZE_MAX) { + mutex_exit(&rg->tcprg_lock); + return (EINVAL); + } + newmembers = kmem_zalloc(newsize * sizeof (tcp_t *), + KM_NOSLEEP_LAZY); + if (newmembers == NULL) { + 
mutex_exit(&rg->tcprg_lock); + return (ENOMEM); + } + bcopy(rg->tcprg_members, newmembers, oldalloc); + kmem_free(rg->tcprg_members, oldalloc); + rg->tcprg_members = newmembers; + rg->tcprg_size = newsize; + } + + rg->tcprg_members[rg->tcprg_count] = tcp; + rg->tcprg_count++; + rg->tcprg_active++; + + mutex_exit(&rg->tcprg_lock); + return (0); +} + +boolean_t +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp) +{ + int i; + boolean_t is_empty; + + mutex_enter(&rg->tcprg_lock); + for (i = 0; i < rg->tcprg_count; i++) { + if (rg->tcprg_members[i] == tcp) + break; + } + /* The item should be present */ + ASSERT(i < rg->tcprg_count); + /* Move the last member into this position */ + rg->tcprg_count--; + rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count]; + rg->tcprg_members[rg->tcprg_count] = NULL; + if (tcp->tcp_connp->conn_reuseport != 0) + rg->tcprg_active--; + is_empty = (rg->tcprg_count == 0); + mutex_exit(&rg->tcprg_lock); + return (is_empty); +} + +void +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active) +{ + mutex_enter(&rg->tcprg_lock); + if (is_active) { + rg->tcprg_active++; + } else { + rg->tcprg_active--; + } + mutex_exit(&rg->tcprg_lock); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index dd264528fc..22b0019a6a 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -5715,10 +5715,12 @@ noticmpv4: switch (icmph->icmph_code) { case ICMP_FRAGMENTATION_NEEDED: /* - * Update Path MTU, then try to send something out. + * Attempt to update path MTU and, if the MSS of the + * connection is altered, retransmit outstanding data. 
*/ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); + if (tcp_update_pmtu(tcp, B_TRUE)) { + tcp_rexmit_after_error(tcp); + } break; case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: @@ -5761,7 +5763,7 @@ noticmpv4: break; } break; - case ICMP_SOURCE_QUENCH: { + case ICMP_SOURCE_QUENCH: /* * use a global boolean to control * whether TCP should respond to ICMP_SOURCE_QUENCH. @@ -5786,7 +5788,6 @@ noticmpv4: } break; } - } freemsg(mp); } @@ -5839,10 +5840,12 @@ noticmpv6: switch (icmp6->icmp6_type) { case ICMP6_PACKET_TOO_BIG: /* - * Update Path MTU, then try to send something out. + * Attempt to update path MTU and, if the MSS of the connection + * is altered, retransmit outstanding data. */ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); + if (tcp_update_pmtu(tcp, B_TRUE)) { + tcp_rexmit_after_error(tcp); + } break; case ICMP6_DST_UNREACH: switch (icmp6->icmp6_code) { diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 8687b52d53..15e49ae070 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -67,7 +67,8 @@ opdes_t tcp_opt_arr[] = { { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, @@ -505,6 +506,104 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) } /* + * Set a TCP connection's participation in SO_REUSEPORT. This operation is + * performed under the protection of the squeue via tcp_setsockopt. 
+ * The manipulation of tcp_rg_bind, as part of this operation, is subject to + * these constraints: + * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport + * under the protection of the squeue. + * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be + * altered until such time as tcp_free() cleans up the connection. + * 3. A connection undergoing bind, which matches to a connection participating + * in port-reuse, will switch its tcp_rg_bind pointer when it joins the + * group of an existing connection in tcp_bindi(). + */ +static int +tcp_set_reuseport(conn_t *connp, boolean_t do_enable) +{ + tcp_t *tcp = connp->conn_tcp; + struct tcp_rg_s *rg; + + if (!IPCL_IS_NONSTR(connp)) { + if (do_enable) { + /* + * SO_REUSEPORT cannot be enabled on sockets which have + * fallen back to the STREAMS API. + */ + return (EINVAL); + } else { + /* + * A connection with SO_REUSEPORT enabled should be + * prevented from falling back to STREAMS mode via + * logic in tcp_fallback. It is legal, however, for + * fallen-back connections to affirm the disabled state + * of SO_REUSEPORT. + */ + ASSERT(connp->conn_reuseport == 0); + return (0); + } + } + if (tcp->tcp_state <= TCPS_CLOSED) { + return (EINVAL); + } + if (connp->conn_reuseport == 0 && do_enable) { + /* disabled -> enabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } else { + /* + * Connection state is not a concern when initially + * populating tcp_rg_bind. Setting it to non-NULL on a + * bound or listening connection would only mean that + * new reused-port binds become a possibility. 
+ */ + if ((rg = tcp_rg_init(tcp)) == NULL) { + return (ENOMEM); + } + tcp->tcp_rg_bind = rg; + } + connp->conn_reuseport = 1; + } else if (connp->conn_reuseport != 0 && !do_enable) { + /* enabled -> disabled */ + ASSERT(tcp->tcp_rg_bind != NULL); + if (tcp->tcp_state == TCPS_IDLE) { + /* + * If the connection has not been bound yet, discard + * the reuse group state. Since disabling SO_REUSEPORT + * on a bound socket will _not_ prevent others from + * reusing the port, the presence of tcp_rg_bind is + * used to determine reuse availability, not + * conn_reuseport. + * + * This allows proper behavior for examples such as: + * + * setsockopt(fd1, ... SO_REUSEPORT, &on_val...); + * bind(fd1, &myaddr, ...); + * setsockopt(fd1, ... SO_REUSEPORT, &off_val...); + * + * setsockopt(fd2, ... SO_REUSEPORT, &on_val...); + * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED + * + */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = NULL; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } else { + /* + * If a connection has been bound, it's no longer safe + * to manipulate tcp_rg_bind until connection clean-up + * during tcp_free. Just mark the member status of the + * connection as inactive. + */ + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } + connp->conn_reuseport = 0; + } + return (0); +} + +/* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. 
*/ @@ -674,6 +773,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } *outlenp = inlen; return (0); + case SO_REUSEPORT: + if (!checkonly) { + return (tcp_set_reuseport(connp, *i1 != 0)); + } + return (0); } break; case IPPROTO_TCP: @@ -1031,10 +1135,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; case IPPROTO_IP: - if (connp->conn_family != AF_INET) { - *outlenp = 0; - return (EINVAL); - } switch (name) { case IP_SEC_OPT: /* diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index 9b6c0daac3..32422be675 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -1029,6 +1029,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, } /* + * Do not allow fallback on connections making use of SO_REUSEPORT. + */ + if (tcp->tcp_rg_bind != NULL) { + freeb(stropt_mp); + freeb(ordrel_mp); + squeue_synch_exit(connp, SQ_NODRAIN); + return (EINVAL); + } + + /* * Both endpoints must be of the same type (either STREAMS or * non-STREAMS) for fusion to be enabled. So if we are fused, * we have to unfuse. diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c index e29c76a696..226467e167 100644 --- a/usr/src/uts/common/inet/tcp/tcp_stats.c +++ b/usr/src/uts/common/inet/tcp/tcp_stats.c @@ -21,8 +21,8 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. * Copyright (c) 2015, 2016 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
*/ @@ -131,9 +131,14 @@ tcp_set_conninfo(tcp_t *tcp, struct tcpConnEntryInfo_s *tcei, boolean_t ispriv) tcei->ce_rto = tcp->tcp_rto; tcei->ce_mss = tcp->tcp_mss; tcei->ce_state = tcp->tcp_state; - tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3); tcei->ce_rtt_sum = NSEC2USEC(tcp->tcp_rtt_sum); tcei->ce_rtt_cnt = tcp->tcp_rtt_cnt; + + /* tcp_rtt_sa is stored as 8 times the average RTT */ + tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3); + + /* tcp_rtt_sd is stored as 4 times the average RTTVAR */ + tcei->ce_rtt_sd = NSEC2USEC(tcp->tcp_rtt_sd >> 2); } /* diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c index 5793a7fd27..7d9b449392 100644 --- a/usr/src/uts/common/inet/tcp/tcp_timers.c +++ b/usr/src/uts/common/inet/tcp/tcp_timers.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 5669592cff..61af05f749 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls; * by setting it to 0. */ #define TCP_XMIT_LOWATER 4096 -#define TCP_XMIT_HIWATER 49152 +#define TCP_XMIT_HIWATER 128000 #define TCP_RECV_LOWATER 2048 -#define TCP_RECV_HIWATER 128000 +#define TCP_RECV_HIWATER 1048576 /* * Bind hash list size and has function. It has to be a power of 2 for @@ -395,6 +395,22 @@ typedef struct tcp_listen_cnt_s { uint32_t tlc_drop; } tcp_listen_cnt_t; +/* + * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT. 
+ * - tcprg_lock: Protects the other fields + * - tcprg_size: Allocated size (in entries) of tcprg_members array + * - tcprg_count: Count of occupied tcprg_members slots + * - tcprg_active: Count of members which still have SO_REUSEPORT set + * - tcprg_members: Connections associated with address/port group + */ +typedef struct tcp_rg_s { + kmutex_t tcprg_lock; + unsigned int tcprg_size; + unsigned int tcprg_count; + unsigned int tcprg_active; + tcp_t **tcprg_members; +} tcp_rg_t; + #define TCP_TLC_REPORT_INTERVAL (30 * MINUTES) #define TCP_DECR_LISTEN_CNT(tcp) \ @@ -678,7 +694,7 @@ extern int tcp_rwnd_set(tcp_t *, uint32_t); extern int tcp_set_destination(tcp_t *); extern void tcp_set_ws_value(tcp_t *); extern void tcp_stop_lingering(tcp_t *); -extern void tcp_update_pmtu(tcp_t *, boolean_t); +extern boolean_t tcp_update_pmtu(tcp_t *, boolean_t); extern mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); extern boolean_t tcp_zcopy_check(tcp_t *); extern void tcp_zcopy_notify(tcp_t *); @@ -695,6 +711,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *, int, boolean_t, boolean_t, boolean_t); extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *, boolean_t); +extern tcp_rg_t *tcp_rg_init(tcp_t *); +extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *); +extern void tcp_rg_destroy(tcp_rg_t *); +extern void tcp_rg_setactive(tcp_rg_t *, boolean_t); /* * Fusion related functions in tcp_fusion.c. diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 5d42a69fa2..4e208465f2 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -1671,6 +1671,11 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, *i1 = udp->udp_vxlanhash; mutex_exit(&connp->conn_lock); return (sizeof (int)); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_snd_to_conn ? 
1 : 0; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } } mutex_enter(&connp->conn_lock); @@ -1826,6 +1831,11 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, } /* Fully handled this option. */ return (0); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + udp->udp_snd_to_conn = onoff; + mutex_exit(&connp->conn_lock); + return (0); } break; } @@ -6096,10 +6106,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, else return (error); } - if (udp->udp_state == TS_DATA_XFER) { + + /* + * Check if we're allowed to send to a connection on which we've + * already called 'connect'. The posix spec. allows both behaviors but + * historically we've returned an error if already connected. The + * client can allow this via a sockopt. + */ + if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) { UDPS_BUMP_MIB(us, udpOutErrors); return (EISCONN); } + error = proto_verify_ip_addr(connp->conn_family, (struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) { diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index c8e7d79e47..9c05b8c876 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -294,7 +294,9 @@ opdes_t udp_opt_arr[] = { }, { UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, -{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 } +{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, +{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), + 0 } }; /* diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 0fc597ccf3..ef11973707 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -179,12 +179,12 @@ typedef struct udp_s { udp_issocket : 1, /* socket mode; sockfs is on top */ udp_nat_t_endpoint : 1, /* 
UDP_NAT_T_ENDPOINT option */ udp_rcvhdr : 1, /* UDP_RCVHDR option */ - udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */ /* Because there's only VXLAN, cheat */ /* and only use a single bit */ + udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */ - udp_pad_to_bit_31 : 28; + udp_pad_to_bit_31 : 27; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */ diff --git a/usr/src/uts/common/io/bpf/bpf_wrap.c b/usr/src/uts/common/io/bpf/bpf_wrap.c new file mode 100644 index 0000000000..6cbde58a20 --- /dev/null +++ b/usr/src/uts/common/io/bpf/bpf_wrap.c @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#include <sys/types.h> +#include <net/bpf.h> +#include <inet/bpf.h> + +/* + * With BPF filter validation and evaluation moved into the 'ip' module, these + * wrapper functions are provided to expose the original interface. 
+ */ + +uint_t +bpf_filter(struct bpf_insn *pc, uchar_t *p, uint_t wirelen, uint_t buflen) +{ + return ((uint_t)ip_bpf_filter((ip_bpf_insn_t *)pc, p, wirelen, buflen)); +} + +int +bpf_validate(struct bpf_insn *f, int len) +{ + return ((int)ip_bpf_validate((ip_bpf_insn_t *)f, (uint_t)len)); +} diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index dbcd9caea8..676fca1249 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -348,8 +348,8 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) if ((err = dls_devnet_hold_tmp(diap->dia_linkid, &dlh)) != 0) return (err); - if ((err = mac_perim_enter_by_macname( - dls_devnet_mac(dlh), &mph)) != 0) { + if ((err = mac_perim_enter_by_macname(dls_devnet_mac(dlh), + &mph)) != 0) { dls_devnet_rele_tmp(dlh); return (err); } @@ -361,7 +361,6 @@ drv_ioc_attr(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) } mac_sdu_get(dlp->dl_mh, NULL, &diap->dia_max_sdu); - dls_link_rele(dlp); mac_perim_exit(mph); dls_devnet_rele_tmp(dlh); @@ -703,7 +702,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, err = EACCES; goto done; } - err = dls_devnet_setzid(dlh, dzp->diz_zid); + err = dls_devnet_setzid(dlh, dzp->diz_zid, + dzp->diz_transient); } else { kprop->pr_perm_flags = MAC_PROP_PERM_RW; (*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh); @@ -877,7 +877,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) return (err); if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2, - dir->dir_link)) != 0) + dir->dir_link, dir->dir_zoneinit)) != 0) return (err); if (dir->dir_linkid2 == DATALINK_INVALID_LINKID) @@ -1332,10 +1332,13 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred, dls_link_t *dlp = NULL; dld_ioc_gettran_t *dgt = karg; - if ((ret = mac_perim_enter_by_linkid(dgt->dgt_linkid, &mph)) != 0) + if ((ret = dls_devnet_hold_tmp(dgt->dgt_linkid, 
&dlh)) != 0) goto done; - if ((ret = dls_devnet_hold_link(dgt->dgt_linkid, &dlh, &dlp)) != 0) + if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0) + goto done; + + if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; /* @@ -1354,13 +1357,14 @@ drv_ioc_gettran(void *karg, intptr_t arg, int mode, cred_t *cred, } done: - if (dlh != NULL && dlp != NULL) { - dls_devnet_rele_link(dlh, dlp); - } + if (dlp != NULL) + dls_link_rele(dlp); - if (mph != NULL) { + if (mph != NULL) mac_perim_exit(mph); - } + + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); return (ret); } @@ -1384,10 +1388,13 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred, if (dti->dti_nbytes != 256 || dti->dti_off != 0) return (EINVAL); - if ((ret = mac_perim_enter_by_linkid(dti->dti_linkid, &mph)) != 0) + if ((ret = dls_devnet_hold_tmp(dti->dti_linkid, &dlh)) != 0) goto done; - if ((ret = dls_devnet_hold_link(dti->dti_linkid, &dlh, &dlp)) != 0) + if ((ret = mac_perim_enter_by_macname(dls_devnet_mac(dlh), &mph)) != 0) + goto done; + + if ((ret = dls_link_hold(dls_devnet_mac(dlh), &dlp)) != 0) goto done; /* @@ -1407,13 +1414,14 @@ drv_ioc_readtran(void *karg, intptr_t arg, int mode, cred_t *cred, } done: - if (dlh != NULL && dlp != NULL) { - dls_devnet_rele_link(dlh, dlp); - } + if (dlp != NULL) + dls_link_rele(dlp); - if (mph != NULL) { + if (mph != NULL) mac_perim_exit(mph); - } + + if (dlh != NULL) + dls_devnet_rele_tmp(dlh); return (ret); } @@ -1510,7 +1518,6 @@ done: return (ret); } - /* * Note that ioctls that modify links have a NULL di_priv_func(), as * privileges can only be checked after we know the class of the link being diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index c60392f853..596147f4e9 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -42,7 +42,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_bind_req, 
proto_unbind_req, proto_promiscon_req, proto_promiscoff_req, proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req, proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req, - proto_notify_req, proto_passive_req; + proto_notify_req, proto_passive_req, proto_exclusive_req; static void proto_capability_advertise(dld_str_t *, mblk_t *); static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *); @@ -122,6 +122,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp) case DL_PASSIVE_REQ: proto_passive_req(dsp, mp); break; + case DL_EXCLUSIVE_REQ: + proto_exclusive_req(dsp, mp); + break; default: proto_req(dsp, mp); break; @@ -610,6 +613,10 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp) new_flags |= DLS_PROMISC_RX_ONLY; break; + case DL_PROMISC_FIXUPS: + new_flags |= DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -705,6 +712,14 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp) new_flags &= ~DLS_PROMISC_RX_ONLY; break; + case DL_PROMISC_FIXUPS: + if (!(dsp->ds_promisc & DLS_PROMISC_FIXUPS)) { + dl_err = DL_NOTENAB; + goto failed2; + } + new_flags &= ~DLS_PROMISC_FIXUPS; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -1305,7 +1320,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp) * If we've already become active by issuing an active primitive, * then it's too late to try to become passive. 
*/ - if (dsp->ds_passivestate == DLD_ACTIVE) { + if (dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE) { dl_err = DL_OUTSTATE; goto failed; } @@ -1359,12 +1375,20 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf, direct->di_rx_ch); - direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + if (direct->di_flags & DI_DIRECT_RAW) { + direct->di_tx_df = + (uintptr_t)str_mdata_raw_fastpath_put; + } else { + direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + } direct->di_tx_dh = dsp; direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify; direct->di_tx_cb_dh = dsp->ds_mch; @@ -1463,6 +1487,9 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: return (dld_capab_poll_enable(dsp, poll)); @@ -1473,12 +1500,34 @@ dld_capab_poll(dld_str_t *dsp, void *data, uint_t flags) } static int +dld_capab_ipcheck(dld_str_t *dsp, void *data, uint_t flags) +{ + dld_capab_ipcheck_t *ipc = data; + + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + switch (flags) { + case DLD_ENABLE: + ipc->ipc_allowed_df = (uintptr_t)mac_protect_check_addr; + ipc->ipc_allowed_dh = dsp->ds_mch; + return (0); + case DLD_DISABLE: + return (0); + } + + return (ENOTSUP); +} + +static int dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags) { dld_capab_lso_t *lso = data; ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + if (dsp->ds_sap == ETHERTYPE_IPV6) + return (ENOTSUP); + switch (flags) { case DLD_ENABLE: { mac_capab_lso_t mac_lso; @@ -1534,8 +1583,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) * completes. So we limit the check to DLD_ENABLE case. 
*/ if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) && - (!(dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) || - !check_mod_above(dsp->ds_rq, "ip"))) { + (((dsp->ds_sap != ETHERTYPE_IP && dsp->ds_sap != ETHERTYPE_IPV6) || + !check_mod_above(dsp->ds_rq, "ip")) && + !check_mod_above(dsp->ds_rq, "vnd"))) { return (ENOTSUP); } @@ -1564,6 +1614,10 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) err = dld_capab_lso(dsp, data, flags); break; + case DLD_CAPAB_IPCHECK: + err = dld_capab_ipcheck(dsp, data, flags); + break; + default: err = ENOTSUP; break; @@ -1625,10 +1679,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Direct capability negotiation interface between IP and DLD + * Direct capability negotiation interface between IP/VND and DLD. Note + * that for vnd we only allow the case where the media type is the + * native media type so we know that there are no transformations that + * would have to happen to the mac header that it receives. 
*/ - if ((dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) && - check_mod_above(dsp->ds_rq, "ip")) { + if (((dsp->ds_sap == ETHERTYPE_IP || dsp->ds_sap == ETHERTYPE_IPV6) && + check_mod_above(dsp->ds_rq, "ip")) || + (check_mod_above(dsp->ds_rq, "vnd") && + dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) { dld_capable = B_TRUE; subsize += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); @@ -1747,3 +1806,36 @@ dld_capabilities_disable(dld_str_t *dsp) if (dsp->ds_polling) (void) dld_capab_poll_disable(dsp, NULL); } + +static void +proto_exclusive_req(dld_str_t *dsp, mblk_t *mp) +{ + int ret = 0; + t_uscalar_t dl_err; + mac_perim_handle_t mph; + + if (dsp->ds_passivestate != DLD_UNINITIALIZED) { + dl_err = DL_OUTSTATE; + goto failed; + } + + if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) { + dl_err = DL_BADPRIM; + goto failed; + } + + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + ret = dls_exclusive_set(dsp, B_TRUE); + mac_perim_exit(mph); + + if (ret != 0) { + dl_err = DL_SYSERR; + goto failed; + } + + dsp->ds_passivestate = DLD_EXCLUSIVE; + dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ); + return; +failed: + dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)ret); +} diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c index 9f89165455..e9e98441b5 100644 --- a/usr/src/uts/common/io/dld/dld_str.c +++ b/usr/src/uts/common/io/dld/dld_str.c @@ -23,6 +23,10 @@ */ /* + * Copyright 2019 Joyent, Inc. + */ + +/* * Data-Link Driver */ @@ -857,6 +861,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid, return (mp); } +static boolean_t +i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp) +{ + mblk_t *mp = *mpp; + mblk_t *newmp; + uint_t pri, vid, dvid; + + dvid = mac_client_vid(dsp->ds_mch); + + /* + * Discard the packet if this is a VLAN stream but the VID in + * the packet is not correct. 
+ */ + vid = VLAN_ID(mhip->mhi_tci); + if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) + return (B_FALSE); + + /* + * Discard the packet if this packet is a tagged packet + * but both pri and VID are 0. + */ + pri = VLAN_PRI(mhip->mhi_tci); + if (mhip->mhi_istagged && !mhip->mhi_ispvid && pri == 0 && + vid == VLAN_ID_NONE) + return (B_FALSE); + + /* + * Update the priority bits to the per-stream priority if + * priority is not set in the packet. Update the VID for + * packets on a VLAN stream. + */ + pri = (pri == 0) ? dsp->ds_pri : 0; + if ((pri != 0) || (dvid != VLAN_ID_NONE)) { + if ((newmp = i_dld_ether_header_update_tag(mp, pri, + dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { + return (B_FALSE); + } + *mpp = newmp; + } + + return (B_TRUE); +} + +mac_tx_cookie_t +str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint, + uint16_t flag) +{ + boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); + mac_header_info_t mhi; + mac_tx_cookie_t cookie; + + if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0) + goto discard; + + if (is_ethernet) { + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) + goto discard; + } + + if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != (mac_tx_cookie_t)NULL) { + DLD_SETQFULL(dsp); + } + return (cookie); +discard: + /* TODO: bump kstat? */ + freemsg(mp); + return ((mac_tx_cookie_t)NULL); +} + + + /* * M_DATA put (IP fast-path mode) */ @@ -905,7 +980,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) mblk_t *bp, *newmp; size_t size; mac_header_info_t mhi; - uint_t pri, vid, dvid; uint_t max_sdu; /* @@ -951,38 +1025,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) goto discard; if (is_ethernet) { - dvid = mac_client_vid(dsp->ds_mch); - - /* - * Discard the packet if this is a VLAN stream but the VID in - * the packet is not correct. 
- */ - vid = VLAN_ID(mhi.mhi_tci); - if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) - goto discard; - - /* - * Discard the packet if this packet is a tagged packet - * but both pri and VID are 0. - */ - pri = VLAN_PRI(mhi.mhi_tci); - if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 && - vid == VLAN_ID_NONE) + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) goto discard; - - /* - * Update the priority bits to the per-stream priority if - * priority is not set in the packet. Update the VID for - * packets on a VLAN stream. - */ - pri = (pri == 0) ? dsp->ds_pri : 0; - if ((pri != 0) || (dvid != VLAN_ID_NONE)) { - if ((newmp = i_dld_ether_header_update_tag(mp, pri, - dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { - goto discard; - } - mp = newmp; - } } if (DLD_TX(dsp, mp, 0, 0) != 0) { diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index b26637203f..3fa65ef35d 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -250,7 +250,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) { int err = 0; uint32_t old_flags = dsp->ds_promisc; - const uint32_t option_flags = DLS_PROMISC_RX_ONLY; + const uint32_t option_flags = DLS_PROMISC_RX_ONLY | DLS_PROMISC_FIXUPS; uint32_t old_type = old_flags & ~option_flags; uint32_t new_type = new_flags & ~option_flags; mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL; @@ -274,6 +274,8 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) */ if (new_flags & DLS_PROMISC_RX_ONLY) mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP; + if (new_flags & DLS_PROMISC_FIXUPS) + mac_flags |= MAC_PROMISC_FLAGS_DO_FIXUPS; if (new_type == DLS_PROMISC_SAP) mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS; @@ -643,6 +645,22 @@ boolean_t dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, void **ds_rx_arg, boolean_t loopback) { + if (dsp->ds_promisc == 0) { + /* + * If there are active walkers of the mi_promisc_list when + * promiscuousness is disabled, ds_promisc will be 
cleared, + * but the DLS will remain on the mi_promisc_list until the + * walk is completed. If we do not recognize this case here, + * we won't properly execute the ds_promisc case in the common + * accept routine -- and we will potentially accept a packet + * that has originated with this DLS (which in turn can + * induce recursion and death by stack overflow). If + * ds_promisc is zero, we know that we are in this window -- + * and we refuse to accept the packet. + */ + return (B_FALSE); + } + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, loopback)); } @@ -673,7 +691,10 @@ dls_mac_active_set(dls_link_t *dlp) * Set the function to start receiving packets. */ mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp); + } else if (dlp->dl_exclusive == B_TRUE) { + return (EBUSY); } + dlp->dl_nactive++; return (0); } @@ -699,7 +720,11 @@ dls_active_set(dld_str_t *dsp) if (dsp->ds_passivestate == DLD_PASSIVE) return (0); - /* If we're already active, then there's nothing more to do. */ + if (dsp->ds_dlp->dl_exclusive == B_TRUE && + dsp->ds_passivestate != DLD_EXCLUSIVE) + return (EBUSY); + + /* If we're already active, we need to check the link's exclusivity */ if ((dsp->ds_nactive == 0) && ((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) { /* except for ENXIO all other errors are mapped to EBUSY */ @@ -708,7 +733,8 @@ dls_active_set(dld_str_t *dsp) return (err); } - dsp->ds_passivestate = DLD_ACTIVE; + dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ? + DLD_EXCLUSIVE : DLD_ACTIVE; dsp->ds_nactive++; return (0); } @@ -739,7 +765,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all) if (dsp->ds_nactive != 0) return; - ASSERT(dsp->ds_passivestate == DLD_ACTIVE); + ASSERT(dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE); dls_mac_active_clear(dsp->ds_dlp); + /* + * We verify below to ensure that no other part of DLS has mucked with + * our exclusive state. 
+ */ + if (dsp->ds_passivestate == DLD_EXCLUSIVE) + VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0); dsp->ds_passivestate = DLD_UNINITIALIZED; } + +int +dls_exclusive_set(dld_str_t *dsp, boolean_t enable) +{ + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + if (enable == B_FALSE) { + dsp->ds_dlp->dl_exclusive = B_FALSE; + return (0); + } + + if (dsp->ds_dlp->dl_nactive != 0) + return (EBUSY); + + dsp->ds_dlp->dl_exclusive = B_TRUE; + + return (0); +} diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index 4099d0b801..eee3569b10 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -686,6 +686,7 @@ i_dls_link_destroy(dls_link_t *dlp) dlp->dl_mnh = NULL; dlp->dl_unknowns = 0; dlp->dl_nonip_cnt = 0; + dlp->dl_exclusive = B_FALSE; kmem_cache_free(i_dls_link_cachep, dlp); } diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index 49e867a19e..90b65ab36a 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ /* * Copyright (c) 2016 by Delphix. All rights reserved. @@ -85,6 +86,14 @@ static door_handle_t dls_mgmt_dh = NULL; /* dls_devnet_t dd_flags */ #define DD_CONDEMNED 0x1 #define DD_IMPLICIT_IPTUN 0x2 /* Implicitly-created ip*.*tun* tunnel */ +#define DD_INITIALIZING 0x4 + +/* + * If the link is marked as initializing or condemned then it should + * not be visible outside of the DLS framework. + */ +#define DD_NOT_VISIBLE(flags) ( \ + (flags & (DD_CONDEMNED | DD_INITIALIZING)) != 0) /* * This structure is used to keep the <linkid, macname> mapping. 
@@ -108,13 +117,14 @@ typedef struct dls_devnet_s { zoneid_t dd_zid; /* current zone */ boolean_t dd_prop_loaded; taskqid_t dd_prop_taskid; + boolean_t dd_transient; /* link goes away when zone does */ } dls_devnet_t; static int i_dls_devnet_create_iptun(const char *, const char *, datalink_id_t *); static int i_dls_devnet_destroy_iptun(datalink_id_t); -static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t); -static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t); +static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t); +static int dls_devnet_unset(mac_handle_t, datalink_id_t *, boolean_t); /*ARGSUSED*/ static int @@ -134,9 +144,9 @@ i_dls_devnet_destructor(void *buf, void *arg) { dls_devnet_t *ddp = buf; - ASSERT(ddp->dd_ksp == NULL); - ASSERT(ddp->dd_ref == 0); - ASSERT(ddp->dd_tref == 0); + VERIFY(ddp->dd_ksp == NULL); + VERIFY(ddp->dd_ref == 0); + VERIFY(ddp->dd_tref == 0); mutex_destroy(&ddp->dd_mutex); cv_destroy(&ddp->dd_cv); } @@ -148,7 +158,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg) dls_devnet_t *ddp; if (dls_devnet_hold_tmp(linkid, &ddp) == 0) { - (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID); + /* + * Don't bother moving transient links back to the global zone + * since we will simply delete them in dls_devnet_unset. 
+ */ + if (!ddp->dd_transient) + (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); dls_devnet_rele_tmp(ddp); } return (0); @@ -529,6 +544,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = getzoneid(); if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, sizeof (retval))) == 0) { @@ -537,6 +553,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) return (err); } +int +dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid, + zoneid_t zid) +{ + dlmgmt_door_getlinkid_t getlinkid; + dlmgmt_getlinkid_retval_t retval; + int err; + + ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid()); + getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; + (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = zid; + + if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, + sizeof (retval))) == 0) { + *linkid = retval.lr_linkid; + } + return (err); +} + + datalink_id_t dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class, datalink_media_t dmedia, uint32_t flags) @@ -736,13 +773,24 @@ dls_devnet_stat_update(kstat_t *ksp, int rw) * Create the "link" kstats. 
*/ static void -dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid) +dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid) { kstat_t *ksp; + char *nm; + char kname[MAXLINKNAMELEN]; + + if (zoneid != newzoneid) { + ASSERT(zoneid == GLOBAL_ZONEID); + (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid, + ddp->dd_linkname); + nm = kname; + } else { + nm = ddp->dd_linkname; + } - if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid, + if (dls_stat_create("link", 0, nm, zoneid, dls_devnet_stat_update, (void *)(uintptr_t)ddp->dd_linkid, - &ksp) == 0) { + &ksp, newzoneid) == 0) { ASSERT(ksp != NULL); if (zoneid == ddp->dd_owner_zid) { ASSERT(ddp->dd_ksp == NULL); @@ -762,12 +810,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) { if (zoneid == ddp->dd_owner_zid) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } } else { if (ddp->dd_zone_ksp != NULL) { - kstat_delete(ddp->dd_zone_ksp); + dls_stat_delete(ddp->dd_zone_ksp); ddp->dd_zone_ksp = NULL; } } @@ -778,24 +826,38 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) * and create the new set using the new name. */ static void -dls_devnet_stat_rename(dls_devnet_t *ddp) +dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } - /* We can't rename a link while it's assigned to a non-global zone. */ + if (zoneinit && ddp->dd_zone_ksp != NULL) { + dls_stat_delete(ddp->dd_zone_ksp); + ddp->dd_zone_ksp = NULL; + } + /* + * We can't rename a link while it's assigned to a non-global zone + * unless we're first initializing the zone while readying it. + */ ASSERT(ddp->dd_zone_ksp == NULL); - dls_devnet_stat_create(ddp, ddp->dd_owner_zid); + dls_devnet_stat_create(ddp, ddp->dd_owner_zid, + (zoneinit ? 
ddp->dd_zid : ddp->dd_owner_zid)); + if (zoneinit) + dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid); } /* - * Associate a linkid with a given link (identified by macname) + * Associate the linkid with the link identified by macname. If this + * is called on behalf of a physical link then linkid may be + * DATALINK_INVALID_LINKID. Otherwise, if called on behalf of a + * virtual link, linkid must have a value. */ static int -dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, +dls_devnet_set(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid, dls_devnet_t **ddpp) { + const char *macname = mac_name(mh); dls_devnet_t *ddp = NULL; datalink_class_t class; int err; @@ -828,17 +890,41 @@ dls_devnet_set(const char *macname, datalink_id_t linkid, zoneid_t zoneid, } /* - * This might be a physical link that has already - * been created, but which does not have a linkid - * because dlmgmtd was not running when it was created. + * If we arrive here we know we are attempting to set + * the linkid on a physical link. A virtual link + * should never arrive here because it should never + * call this function without a linkid. Virtual links + * are created through dlgmtmd and thus we know + * dlmgmtd is alive to assign it a linkid (search for + * uses of dladm_create_datalink_id() to prove this to + * yourself); we don't have the same guarantee for a + * physical link which may perform an upcall for a + * linkid while dlmgmtd is down but will continue + * creating a devnet without the linkid (see + * softmac_create_datalink() to see how physical link + * creation works). That is why there is no entry in + * the id hash but there is one in the macname hash -- + * softmac couldn't acquire a linkid the first time it + * called this function. + * + * Because of the check above, we also know that + * ddp->dd_linkid is not set. Following this, the link + * must still be in the DD_INITIALIZING state because + * that flag is removed IFF dd_linkid is set. 
This is + * why we can ASSERT the DD_INITIALIZING flag below if + * the call to i_dls_devnet_setzid() fails. */ if (linkid == DATALINK_INVALID_LINKID || class != DATALINK_CLASS_PHYS) { err = EINVAL; goto done; } + + ASSERT(ddp->dd_flags & DD_INITIALIZING); + } else { ddp = kmem_cache_alloc(i_dls_devnet_cachep, KM_SLEEP); + ddp->dd_flags = DD_INITIALIZING; ddp->dd_tref = 0; ddp->dd_ref++; ddp->dd_owner_zid = zoneid; @@ -875,8 +961,19 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) { if (zoneid != GLOBAL_ZONEID && - (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0) - (void) dls_devnet_unset(macname, &linkid, B_TRUE); + (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE, + B_FALSE)) != 0) { + /* + * At this point the link is marked as + * DD_INITIALIZING -- there can be no + * outstanding temp refs and therefore no need + * to wait for them. + */ + ASSERT(ddp->dd_flags & DD_INITIALIZING); + (void) dls_devnet_unset(mh, &linkid, B_FALSE); + return (err); + } + /* * The kstat subsystem holds its own locks (rather perimeter) * before calling the ks_update (dls_devnet_stat_update) entry @@ -884,20 +981,35 @@ done: * lock hierarchy is kstat locks -> i_dls_devnet_lock. */ if (stat_create) - dls_devnet_stat_create(ddp, zoneid); + dls_devnet_stat_create(ddp, zoneid, zoneid); if (ddpp != NULL) *ddpp = ddp; + + mutex_enter(&ddp->dd_mutex); + if (linkid != DATALINK_INVALID_LINKID && !ddp->dd_prop_loaded && + ddp->dd_prop_taskid == TASKQID_INVALID) { + ddp->dd_prop_taskid = taskq_dispatch(system_taskq, + dls_devnet_prop_task, ddp, TQ_SLEEP); + } + mutex_exit(&ddp->dd_mutex); + } return (err); } /* - * Disassociate a linkid with a given link (identified by macname) - * This waits until temporary references to the dls_devnet_t are gone. + * Disassociate the linkid from the link identified by macname. If + * wait is B_TRUE, wait until all temporary refs are released and the + * prop task is finished. 
+ * + * If waiting then you SHOULD NOT call this from inside the MAC perim + * as deadlock will ensue. Otherwise, this function is safe to call + * from inside or outside the MAC perim. */ static int -dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) +dls_devnet_unset(mac_handle_t mh, datalink_id_t *id, boolean_t wait) { + const char *macname = mac_name(mh); dls_devnet_t *ddp; int err; mod_hash_val_t val; @@ -918,21 +1030,62 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) * deadlock. Return EBUSY if the asynchronous thread started for * property loading as part of the post attach hasn't yet completed. */ - ASSERT(ddp->dd_ref != 0); + VERIFY(ddp->dd_ref != 0); if ((ddp->dd_ref != 1) || (!wait && (ddp->dd_tref != 0 || ddp->dd_prop_taskid != 0))) { - mutex_exit(&ddp->dd_mutex); - rw_exit(&i_dls_devnet_lock); - return (EBUSY); + int zstatus = 0; + + /* + * There are a couple of alternatives that might be going on + * here; a) the zone is shutting down and it has a transient + * link assigned, in which case we want to clean it up instead + * of moving it back to the global zone, or b) its possible + * that we're trying to clean up an orphaned vnic that was + * delegated to a zone and which wasn't cleaned up properly + * when the zone went away. Check for either of these cases + * before we simply return EBUSY. 
+ * + * zstatus indicates which situation we are dealing with: + * 0 - means return EBUSY + * 1 - means case (a), cleanup transient link + * -1 - means case (b), orphained VNIC + */ + if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) { + zone_t *zp; + + if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) { + zstatus = -1; + } else { + if (ddp->dd_transient) { + zone_status_t s = zone_status_get(zp); + + if (s >= ZONE_IS_SHUTTING_DOWN) + zstatus = 1; + } + zone_rele(zp); + } + } + + if (zstatus == 0) { + mutex_exit(&ddp->dd_mutex); + rw_exit(&i_dls_devnet_lock); + return (EBUSY); + } + + /* + * We want to delete the link, reset ref to 1; + */ + if (zstatus == -1) + /* Log a warning, but continue in this case */ + cmn_err(CE_WARN, "clear orphaned datalink: %s\n", + ddp->dd_linkname); + ddp->dd_ref = 1; } ddp->dd_flags |= DD_CONDEMNED; ddp->dd_ref--; *id = ddp->dd_linkid; - if (ddp->dd_zid != GLOBAL_ZONEID) - (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); - /* * Remove this dls_devnet_t from the hash table. */ @@ -947,19 +1100,40 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait) } rw_exit(&i_dls_devnet_lock); + /* + * It is important to call i_dls_devnet_setzid() WITHOUT the + * i_dls_devnet_lock held. The setzid call grabs the MAC + * perim; thus causing DLS -> MAC lock ordering if performed + * with the i_dls_devnet_lock held. This forces consumers to + * grab the MAC perim before calling dls_devnet_unset() (the + * locking rules state MAC -> DLS order). By performing the + * setzid outside of the i_dls_devnet_lock consumers can + * safely call dls_devnet_unset() outside the MAC perim. + */ + if (ddp->dd_zid != GLOBAL_ZONEID) { + dls_devnet_stat_destroy(ddp, ddp->dd_zid); + (void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE, + B_FALSE); + } + if (wait) { /* * Wait until all temporary references are released. + * The holders of the tref need the MAC perim to + * perform their work and release the tref. 
To avoid + * deadlock, assert that the perim is never held here. */ + ASSERT0(MAC_PERIM_HELD(mh)); while ((ddp->dd_tref != 0) || (ddp->dd_prop_taskid != 0)) cv_wait(&ddp->dd_cv, &ddp->dd_mutex); } else { - ASSERT(ddp->dd_tref == 0 && - ddp->dd_prop_taskid == (taskqid_t)NULL); + VERIFY(ddp->dd_tref == 0); + VERIFY(ddp->dd_prop_taskid == (taskqid_t)NULL); } - if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) { dls_devnet_stat_destroy(ddp, ddp->dd_owner_zid); + } ddp->dd_prop_loaded = B_FALSE; ddp->dd_linkid = DATALINK_INVALID_LINKID; @@ -1019,8 +1193,8 @@ dls_devnet_hold_common(datalink_id_t linkid, dls_devnet_t **ddpp, } mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 0); - if (ddp->dd_flags & DD_CONDEMNED) { + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (ENOENT); @@ -1087,8 +1261,8 @@ dls_devnet_hold_by_dev(dev_t dev, dls_dl_handle_t *ddhp) return (ENOENT); } mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 0); - if (ddp->dd_flags & DD_CONDEMNED) { + VERIFY(ddp->dd_ref > 0); + if (DD_NOT_VISIBLE(ddp->dd_flags)) { mutex_exit(&ddp->dd_mutex); rw_exit(&i_dls_devnet_lock); return (ENOENT); @@ -1105,7 +1279,7 @@ void dls_devnet_rele(dls_devnet_t *ddp) { mutex_enter(&ddp->dd_mutex); - ASSERT(ddp->dd_ref > 1); + VERIFY(ddp->dd_ref > 1); ddp->dd_ref--; if ((ddp->dd_flags & DD_IMPLICIT_IPTUN) && ddp->dd_ref == 1) { mutex_exit(&ddp->dd_mutex); @@ -1117,7 +1291,7 @@ dls_devnet_rele(dls_devnet_t *ddp) } static int -dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) +dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid) { char drv[MAXLINKNAMELEN]; uint_t ppa; @@ -1127,7 +1301,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp) dls_dev_handle_t ddh; int err; - if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0) + if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0) return 
(dls_devnet_hold(linkid, ddpp)); /* @@ -1270,9 +1444,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp) * * This case does not change the <link name, linkid> mapping, so the link's * kstats need to be updated with using name associated the given id2. + * + * The zoneinit parameter is used to allow us to create a VNIC in the global + * zone which is assigned to a non-global zone. Since there is a race condition + * in the create process if two VNICs have the same name, we need to rename it + * after it has been assigned to the zone. */ int -dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) +dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link, + boolean_t zoneinit) { dls_dev_handle_t ddh = NULL; int err = 0; @@ -1317,10 +1497,12 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) } mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref > 1) { - mutex_exit(&ddp->dd_mutex); - err = EBUSY; - goto done; + if (!zoneinit) { + if (ddp->dd_ref > 1) { + mutex_exit(&ddp->dd_mutex); + err = EBUSY; + goto done; + } } mutex_exit(&ddp->dd_mutex); @@ -1331,7 +1513,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) /* rename mac client name and its flow if exists */ if ((err = mac_open(ddp->dd_mac, &mh)) != 0) goto done; - (void) mac_rename_primary(mh, link); + if (zoneinit) { + char tname[MAXLINKNAMELEN]; + + (void) snprintf(tname, sizeof (tname), "z%d_%s", + ddp->dd_zid, link); + (void) mac_rename_primary(mh, tname); + } else { + (void) mac_rename_primary(mh, link); + } mac_close(mh); goto done; } @@ -1398,7 +1588,7 @@ done: rw_exit(&i_dls_devnet_lock); if (err == 0) - dls_devnet_stat_rename(ddp); + dls_devnet_stat_rename(ddp, zoneinit); if (mph != NULL) mac_perim_exit(mph); @@ -1407,7 +1597,8 @@ done: } static int -i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) +i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop, + boolean_t 
transient) { int err; mac_perim_handle_t mph; @@ -1436,10 +1627,18 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) sizeof (retval)); if (err != 0) goto done; + + /* + * We set upcall_done only if the upcall is + * successful. This way, if dls_link_setzid() fails, + * we know another upcall must be done to reset the + * dlmgmtd state. + */ upcall_done = B_TRUE; } if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) { ddp->dd_zid = new_zoneid; + ddp->dd_transient = transient; devnet_need_rebuild = B_TRUE; } @@ -1454,7 +1653,7 @@ done: } int -dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) +dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient) { dls_devnet_t *ddp; int err; @@ -1476,7 +1675,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) refheld = B_TRUE; } - if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) { + if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) { if (refheld) dls_devnet_rele(ddp); return (err); @@ -1493,7 +1692,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) if (old_zid != GLOBAL_ZONEID) dls_devnet_stat_destroy(ddh, old_zid); if (new_zid != GLOBAL_ZONEID) - dls_devnet_stat_create(ddh, new_zid); + dls_devnet_stat_create(ddh, new_zid, new_zid); return (0); } @@ -1531,15 +1730,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid) * Access a vanity naming node. 
*/ int -dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp, + zoneid_t zid) { dls_devnet_t *ddp; dls_link_t *dlp; - zoneid_t zid = getzoneid(); + zoneid_t czid = getzoneid(); int err; mac_perim_handle_t mph; - if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) + if (czid != GLOBAL_ZONEID && czid != zid) + return (ENOENT); + + if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0) return (err); dls_devnet_prop_task_wait(ddp); @@ -1572,6 +1775,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) return (0); } +int +dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +{ + return (dls_devnet_open_in_zone(link, dhp, devp, getzoneid())); +} + /* * Close access to a vanity naming node. */ @@ -1628,13 +1837,32 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid) * we need to use the linkid to get the user name for the link * when we create the MAC client. */ - if ((err = dls_devnet_set(mac_name(mh), linkid, zoneid, &ddp)) == 0) { + if ((err = dls_devnet_set(mh, linkid, zoneid, &ddp)) == 0) { if ((err = dls_link_hold_create(mac_name(mh), &dlp)) != 0) { mac_perim_exit(mph); - (void) dls_devnet_unset(mac_name(mh), &linkid, B_TRUE); + (void) dls_devnet_unset(mh, &linkid, B_FALSE); return (err); } + + /* + * If dd_linkid is set then the link was successfully + * initialized. In this case we can remove the + * initializing flag and make the link visible to the + * rest of the system. + * + * If not set then we were called by softmac and it + * was unable to obtain a linkid for the physical link + * because dlmgmtd is down. In that case softmac will + * eventually obtain a linkid and call + * dls_devnet_recreate() to complete initialization. 
+ */ + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); + } + mac_perim_exit(mph); return (err); } @@ -1648,8 +1876,19 @@ dls_devnet_create(mac_handle_t mh, datalink_id_t linkid, zoneid_t zoneid) int dls_devnet_recreate(mac_handle_t mh, datalink_id_t linkid) { - ASSERT(linkid != DATALINK_INVALID_LINKID); - return (dls_devnet_set(mac_name(mh), linkid, GLOBAL_ZONEID, NULL)); + dls_devnet_t *ddp; + int err; + + VERIFY(linkid != DATALINK_INVALID_LINKID); + if ((err = dls_devnet_set(mh, linkid, GLOBAL_ZONEID, &ddp)) == 0) { + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); + } + + return (err); + } int @@ -1659,15 +1898,52 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) mac_perim_handle_t mph; *idp = DATALINK_INVALID_LINKID; - err = dls_devnet_unset(mac_name(mh), idp, wait); - if (err != 0 && err != ENOENT) + err = dls_devnet_unset(mh, idp, wait); + + /* + * We continue on in the face of ENOENT because the devnet + * unset and DLS link release are not atomic and we may have a + * scenario where there is no entry in i_dls_devnet_hash for + * the MAC name but there is an entry in i_dls_link_hash. For + * example, if the following occurred: + * + * 1. dls_devnet_unset() returns success, and + * + * 2. dls_link_rele_by_name() fails with ENOTEMPTY because + * flows still exist, and + * + * 3. dls_devnet_set() fails to set the zone id and calls + * dls_devnet_unset() -- leaving an entry in + * i_dls_link_hash but no corresponding entry in + * i_dls_devnet_hash. + * + * Even if #3 wasn't true the dls_devnet_set() may fail for + * different reasons in the future; the point is that it _can_ + * fail as part of its contract. 
We can't rely on it working + * so we must assume that these two pieces of state (devnet + * and link hashes), which should always be in sync, can get + * out of sync and thus even if we get ENOENT from the devnet + * hash we should still try to delete from the link hash just + * in case. + * + * We could prevent the ENOTEMPTY from dls_link_rele_by_name() + * by calling mac_disable() before calling + * dls_devnet_destroy() but that's not currently possible due + * to a long-standing bug. OpenSolaris 6791335: The semantics + * of mac_disable() were modified by Crossbow such that + * dls_devnet_destroy() needs to be called before + * mac_disable() can succeed. This is because of the implicit + * reference that dls has on the mac_impl_t. + */ + if (err != 0 && err != ENOENT) { return (err); + } mac_perim_enter_by_mh(mh, &mph); err = dls_link_rele_by_name(mac_name(mh)); - mac_perim_exit(mph); - if (err != 0) { + dls_devnet_t *ddp; + /* * XXX It is a general GLDv3 bug that dls_devnet_set() has to * be called to re-set the link when destroy fails. The @@ -1675,9 +1951,22 @@ dls_devnet_destroy(mac_handle_t mh, datalink_id_t *idp, boolean_t wait) * called from kernel context or from a zone other than that * which initially created the link. */ - (void) dls_devnet_set(mac_name(mh), *idp, crgetzoneid(CRED()), - NULL); + (void) dls_devnet_set(mh, *idp, crgetzoneid(CRED()), &ddp); + + /* + * You might think dd_linkid should always be set + * here, but in the case where dls_devnet_unset() + * returns ENOENT it will be DATALINK_INVALID_LINKID. + * Stay consistent with the rest of DLS and only + * remove the initializing flag if linkid is set. 
+ */ + mutex_enter(&ddp->dd_mutex); + if (ddp->dd_linkid != DATALINK_INVALID_LINKID) + ddp->dd_flags &= ~DD_INITIALIZING; + mutex_exit(&ddp->dd_mutex); } + + mac_perim_exit(mph); return (err); } diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 51e4be7260..82dceff278 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c +++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* @@ -30,30 +31,33 @@ #include <sys/dld_impl.h> #include <sys/mac_ether.h> -static mac_stat_info_t i_dls_si[] = { - { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32, - (uint64_t)LINK_STATE_UNKNOWN} -}; - -#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) +/* + * structure for link kstats + */ +typedef struct { + kstat_named_t dk_ifspeed; 
+ kstat_named_t dk_multircv; + kstat_named_t dk_brdcstrcv; + kstat_named_t dk_multixmt; + kstat_named_t dk_brdcstxmt; + kstat_named_t dk_norcvbuf; + kstat_named_t dk_ierrors; + kstat_named_t dk_noxmtbuf; + kstat_named_t dk_oerrors; + kstat_named_t dk_collisions; + kstat_named_t dk_rbytes; + kstat_named_t dk_ipackets; + kstat_named_t dk_obytes; + kstat_named_t dk_opackets; + kstat_named_t dk_rbytes64; + kstat_named_t dk_ipackets64; + kstat_named_t dk_obytes64; + kstat_named_t dk_opackets64; + kstat_named_t dk_link_state; + kstat_named_t dk_link_duplex; + kstat_named_t dk_unknowns; + kstat_named_t dk_zonename; +} dls_kstat_t; /* * Exported functions. @@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = { int dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - kstat_named_t *knp; - uint_t i; - uint64_t val; + dls_kstat_t *dkp = ksp->ks_data; if (rw != KSTAT_READ) return (EACCES); - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); - - switch (i_dls_si[i].msi_type) { - case KSTAT_DATA_UINT64: - knp->value.ui64 = val; - break; - case KSTAT_DATA_UINT32: - knp->value.ui32 = (uint32_t)val; - break; - default: - ASSERT(B_FALSE); - } - - knp++; - } + dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED); + dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIRCV); + dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTRCV); + dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIXMT); + dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTXMT); + dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NORCVBUF); + dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS); + dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NOXMTBUF); + dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS); + dkp->dk_collisions.value.ui32 = 
mac_stat_get(dlp->dl_mh, + MAC_STAT_COLLISIONS); + dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_LINK_STATE); /* * Ethernet specific kstat "link_duplex" */ if (dlp->dl_mip->mi_nativemedia != DL_ETHER) { - knp->value.ui32 = LINK_DUPLEX_UNKNOWN; + dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN; } else { - val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); - knp->value.ui32 = (uint32_t)val; + dkp->dk_link_duplex.value.ui32 = + (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); } - knp++; - knp->value.ui32 = dlp->dl_unknowns; + + dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns; return (0); } @@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) int dls_stat_create(const char *module, int instance, const char *name, zoneid_t zoneid, int (*update)(struct kstat *, int), void *private, - kstat_t **kspp) + kstat_t **kspp, zoneid_t newzoneid) { kstat_t *ksp; - kstat_named_t *knp; - uint_t i; + zone_t *zone; + dls_kstat_t *dkp; if ((ksp = kstat_create_zone(module, instance, name, "net", - KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) { + KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) { return (EINVAL); } ksp->ks_update = update; ksp->ks_private = private; + dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP); + if ((zone = 
zone_find_by_id(newzoneid)) != NULL) { + ksp->ks_data_size += strlen(zone->zone_name) + 1; + } - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - kstat_named_init(knp, i_dls_si[i].msi_name, - i_dls_si[i].msi_type); - knp++; + kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING); + + if (zone != NULL) { + kstat_named_setstr(&dkp->dk_zonename, zone->zone_name); + zone_rele(zone); } - kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32); - kstat_named_init(knp, 
"unknowns", KSTAT_DATA_UINT32); kstat_install(ksp); *kspp = ksp; return (0); } + +void +dls_stat_delete(kstat_t *ksp) +{ + void *data; + if (ksp != NULL) { + data = ksp->ks_data; + kstat_delete(ksp); + kmem_free(data, sizeof (dls_kstat_t)); + } +} diff --git a/usr/src/uts/common/io/dump.c b/usr/src/uts/common/io/dump.c index 4fd52e6448..f4d8c1cf2c 100644 --- a/usr/src/uts/common/io/dump.c +++ b/usr/src/uts/common/io/dump.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. * Delphix (c) 2012 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. */ @@ -46,6 +47,7 @@ #include <sys/conf.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#include <sys/random.h> static dev_info_t *dump_devi; @@ -141,16 +143,20 @@ dump_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp) *rvalp = dump_conflags; if (dumpvp && !(dumpvp->v_flag & VISSWAP)) *rvalp |= DUMP_EXCL; + mutex_exit(&dump_lock); break; case DIOCSETCONF: mutex_enter(&dump_lock); if (arg == DUMP_KERNEL || arg == DUMP_ALL || - arg == DUMP_CURPROC) - dump_conflags = arg; - else + arg == DUMP_CURPROC) { + dump_conflags = (dump_conflags & DUMP_STATE) | + (arg & DUMP_CONTENT); + } else { error = EINVAL; + } + mutex_exit(&dump_lock); break; @@ -181,6 +187,24 @@ dump_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred, int *rvalp) VN_RELE(vp); break; + case DIOCSCRYPTKEY: { + uint8_t key[DUMP_CRYPT_KEYLEN]; + uint8_t nonce[DUMP_CRYPT_NONCELEN]; + + if ((error = copyin((uint8_t *)arg, key, sizeof (key))) != 0) + break; + + (void) random_get_pseudo_bytes(nonce, sizeof (nonce)); + + mutex_enter(&dump_lock); + bcopy(key, dump_crypt_key, DUMP_CRYPT_KEYLEN); + bcopy(nonce, dump_crypt_nonce, DUMP_CRYPT_NONCELEN); + dump_conflags |= DUMP_ENCRYPT; /* a one-way trip */ + mutex_exit(&dump_lock); + + break; + } + case DIOCDUMP: mutex_enter(&dump_lock); if (dumpvp == NULL) diff --git a/usr/src/uts/common/io/eventfd.c 
b/usr/src/uts/common/io/eventfd.c index 32f875917f..efc1f9233f 100644 --- a/usr/src/uts/common/io/eventfd.c +++ b/usr/src/uts/common/io/eventfd.c @@ -141,37 +141,39 @@ eventfd_read(dev_t dev, uio_t *uio, cred_t *cr) * transitions from EVENTFD_VALMAX to a lower value. At all other * times, it is already considered writable by poll. */ - if (oval == EVENTFD_VALMAX) { + if (oval >= EVENTFD_VALMAX) { pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT); } return (err); } -/*ARGSUSED*/ static int -eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) +eventfd_post(eventfd_state_t *state, uint64_t val, boolean_t is_async, + boolean_t file_nonblock) { - eventfd_state_t *state; - minor_t minor = getminor(dev); - uint64_t val, oval; - int err; - - if (uio->uio_resid < sizeof (val)) - return (EINVAL); - - if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) - return (err); - - if (val > EVENTFD_VALMAX) - return (EINVAL); - - state = ddi_get_soft_state(eventfd_softstate, minor); + uint64_t oval; + boolean_t overflow = B_FALSE; mutex_enter(&state->efd_lock); while (val > EVENTFD_VALMAX - state->efd_value) { - if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + + /* + * When called from (LX) AIO, expectations about overflow and + * blocking are different than normal operation. If the + * incoming value would cause overflow, it is clamped to reach + * the overflow value exactly. This is added to the existing + * value without blocking. Any pollers of the eventfd will see + * POLLERR asserted when this occurs. + */ + if (is_async) { + val = EVENTFD_VALOVERFLOW - state->efd_value; + overflow = B_TRUE; + break; + } + + if (file_nonblock) { mutex_exit(&state->efd_lock); return (EAGAIN); } @@ -186,7 +188,7 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) } /* - * We now know that we can add the value without overflowing. + * We now know that we can safely add the value. 
*/ state->efd_value = (oval = state->efd_value) + val; @@ -200,10 +202,13 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) mutex_exit(&state->efd_lock); /* - * Notify pollers as well if the eventfd is now readable. + * Notify pollers as well if the eventfd has become readable or has + * transitioned into overflow. */ if (oval == 0) { pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN); + } else if (overflow && val != 0) { + pollwakeup(&state->efd_pollhd, POLLERR); } return (0); @@ -211,6 +216,29 @@ eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) /*ARGSUSED*/ static int +eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) +{ + eventfd_state_t *state; + boolean_t file_nonblock; + uint64_t val; + int err; + + if (uio->uio_resid < sizeof (val)) + return (EINVAL); + + if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) + return (err); + + if (val > EVENTFD_VALMAX) + return (EINVAL); + + file_nonblock = (uio->uio_fmode & (FNDELAY|FNONBLOCK)) != 0; + state = ddi_get_soft_state(eventfd_softstate, getminor(dev)); + return (eventfd_post(state, val, B_FALSE, file_nonblock)); +} + +/*ARGSUSED*/ +static int eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { @@ -228,6 +256,9 @@ eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, if (state->efd_value < EVENTFD_VALMAX) revents |= POLLWRNORM | POLLOUT; + if (state->efd_value == EVENTFD_VALOVERFLOW) + revents |= POLLERR; + *reventsp = revents & events; if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { *phpp = &state->efd_pollhd; @@ -244,17 +275,28 @@ eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) { eventfd_state_t *state; minor_t minor = getminor(dev); + uint64_t *valp; state = ddi_get_soft_state(eventfd_softstate, minor); switch (cmd) { - case EVENTFDIOC_SEMAPHORE: { + case EVENTFDIOC_SEMAPHORE: mutex_enter(&state->efd_lock); state->efd_semaphore ^= 1; mutex_exit(&state->efd_lock); + return (0); + case 
EVENTFDIOC_POST: + /* + * This ioctl is expected to be kernel-internal, used only by + * the AIO emulation in LX. + */ + if ((md & FKIOCTL) == 0) { + break; + } + valp = (uint64_t *)arg; + VERIFY(eventfd_post(state, *valp, B_TRUE, B_FALSE) == 0); return (0); - } default: break; diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c new file mode 100644 index 0000000000..03bb799499 --- /dev/null +++ b/usr/src/uts/common/io/gsqueue/gsqueue.c @@ -0,0 +1,608 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Serialization queues are a technique used in illumos to provide what's + * commonly known as a 'vertical' perimeter. The idea (described a bit in + * uts/common/inet/squeue.c) is to provide a means to make sure that message + * blocks (mblk_t) are processed in a specific order. Subsystems like ip and vnd + * consume these on different policies, ip on a conn_t basis, vnd on a per + * device basis, and use this to ensure that only one packet is being processed + * at a given time. + * + * Serialization queues were originally used by ip. As part of that + * implementation, many of the details of ip were baked into it. That includes + * things like conn_t, ip receive attributes, and the notion of sets. While an + * individual serialization queue, or gsqueue_t, is a useful level of + * abstraction, it isn't the basis on which most consumers want to manage them. + * Instead, we have the notion of a set of serialization queues.
These sets are + * DR (CPU Dynamic reconfiguration) aware, and allow consumers to have a + * gsqueue_t per CPU to fanout on without managing them all itself. In the + * original implementation, this existed, but they were heavily tied into the + * infrastructure of IP, and its notion of polling on the underlying MAC + * devices. + * + * The result of that past is a new interface to serialization queues and a + * similar, but slightly different, abstraction to sets of these + * (gsqueue_set_t). When designing this there are two different approaches that + * one could consider. The first is that the system has one gsqueue_set_t that + * the entire world shares, whether IP or some other consumer. The other is that + * every consumer has their own set. + * + * The trade offs between these two approaches are the pathological failure + * modes. There is no guarantee that any two consumers here are equivalent. In + * fact, they very likely have very different latency profiles. If they are + * being processed in the same queue, that can lead to very odd behaviors. More + * generally, if we have a series of processing functions from one consumer + * which are generally short, and another which are generally long, that'll + * cause undue latency that's harder to observe. If we instead take the approach + * that each consumer should have its own set that it fans out over then we + * won't end up with the problem that a given serialization queue will have + * multiple latency profiles, but instead we'll see cpu contention for the bound + * gsqueue_t worker thread. Keep in mind though, that only the gsqueue_t worker + * thread is bound and it is in fact possible for it to be processed by other + * threads on other CPUs. + * + * We've opted to go down the second path, so each consumer has its own + * independent set of serialization queues that it is bound over.
+ * + * Structure Hierarchies + * --------------------- + * + * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t + * encapsulates all the per-CPU gsqueue_t that exist in the form of + * gsqueue_cpu_t. The gsqueue_cpu_t has been designed such that it could + * accommodate more than one gsqueue_t, but today there is a one to one mapping. + * + * We maintain two different lists of gsqueue_cpu_t, the active and defunct + * sets. The active set is maintained in the array `gs_cpus`. There are NCPU + * entries available in `gs_cpus` with the total number of currently active cpus + * described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant. When + * there is no longer a need for a given binding (see the following section for + * more explanation on when this is the case) then we move the entry to the + * `gs_defunct` list which is just a list_t of gsqueue_cpu_t. + * + * In addition, each gsqueue_set_t can have a series of callbacks registered + * with it. These are described in the following section. Graphically, a given + * gsqueue_set_t looks roughly like the following: + * + * +---------------+ + * | gsqueue_set_t | + * +---------------+ + * | | | + * | | * . . . gs_cpus + * | | | + * | | | +-------------------------------------------------+ + * | | +--->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |... + * | | +-------------------------------------------------+ + * | | + * | * . . . gs_defunct + * | | + * | | +---------------+ +---------------+ +---------------+ + * | +--->| gsqueue_cpu_t |-->| gsqueue_cpu_t |-->| gsqueue_cpu_t |... + * | +---------------+ +---------------+ +---------------+ + * * . . . gs_cbs + * | + * | +--------------+ +--------------+ +--------------+ + * +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |... 
+ * +--------------+ +--------------+ +--------------+ + * + * CPU DR, gsqueue_t, and gsqueue_t + * -------------------------------- + * + * Recall, that every serialization queue (gsqueue_t or squeue_t) has a worker + * thread that may end up doing work. As part of supporting fanout, we have one + * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of + * this binding, we need to deal with CPU DR changes. + * + * The gsqueue driver maintains a single CPU DR callback that is used for the + * entire sub-system. We break down CPU DR events into three groups. Offline + * events, online events, and events we can ignore. When the first group occurs, + * we need to go through every gsqueue_set_t, find the gsqueue_cpu_t that + * corresponds to that processor id, and unbind all of its gsqueue_t's. It's + * rather important that we only unbind the gsqueue_t's and not actually destroy + * them. When this happens, they could very easily have data queued inside of + * them and it's unreasonable to just throw out everything in them at this + * point. The data remains intact and service continues uninterrupted. + * + * When we receive an online event, we do the opposite. We try to find a + * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid + * field intact) in the defunct list. If we find one, we remove it from the + * defunct list and add it to the active list as well as binding the gsqueue_t + * to the CPU in question. If we don't find one, then we create a new one. + * + * To deal with these kinds of situations, we allow a consumer to register + * callbacks for the gsqueue_t that they are interested in. These callbacks will + * fire whenever we are handling a topology change. The design of the callbacks + * is not that the user can take any administrative action during them, but + * rather set something for them to do asynchronously. It is illegal to make any + * calls into the gsqueue system while you are in a callback.
+ * + * Locking + * ------- + * + * The lock ordering here is fairly straightforward. Due to our use of CPU + * binding and the CPU DR callbacks, we have an additional lock to consider + * cpu_lock. Because of that, the following are the rules for locking: + * + * + * o If performing binding operations, you must grab cpu_lock. cpu_lock is + * also at the top of the order. + * + * o cpu_lock > gsqueue_lock > gsqueue_t`gs_lock > squeue_t`sq_lock + * If you need to take multiple locks, you must take the greatest + * (left-most) one first. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/stream.h> +#include <sys/modctl.h> +#include <sys/cpuvar.h> +#include <sys/list.h> +#include <sys/sysmacros.h> + +#include <sys/gsqueue.h> +#include <sys/squeue_impl.h> + +typedef struct gsqueue_cb { + struct gsqueue_cb *gcb_next; + gsqueue_cb_f gcb_func; + void *gcb_arg; +} gsqueue_cb_t; + +typedef struct gsqueue_cpu { + list_node_t gqc_lnode; + squeue_t *gqc_head; + processorid_t gqc_cpuid; +} gsqueue_cpu_t; + +struct gsqueue_set { + list_node_t gs_next; + pri_t gs_wpri; + kmutex_t gs_lock; + int gs_ncpus; + gsqueue_cpu_t **gs_cpus; + list_t gs_defunct; + gsqueue_cb_t *gs_cbs; +}; + +static kmutex_t gsqueue_lock; +static list_t gsqueue_list; +static kmem_cache_t *gsqueue_cb_cache; +static kmem_cache_t *gsqueue_cpu_cache; +static kmem_cache_t *gsqueue_set_cache; + +static gsqueue_cpu_t * +gsqueue_cpu_create(pri_t wpri, processorid_t cpuid) +{ + gsqueue_cpu_t *scp; + + scp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP); + + list_link_init(&scp->gqc_lnode); + scp->gqc_cpuid = cpuid; + scp->gqc_head = squeue_create(wpri, B_FALSE); + scp->gqc_head->sq_state = SQS_DEFAULT; + squeue_bind(scp->gqc_head, cpuid); + + return (scp); +} + +static void +gsqueue_cpu_destroy(gsqueue_cpu_t *scp) +{ + squeue_destroy(scp->gqc_head); + kmem_cache_free(gsqueue_cpu_cache, scp); +} + +gsqueue_set_t * +gsqueue_set_create(pri_t wpri) +{ + int 
i; + gsqueue_set_t *gssp; + + gssp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP); + gssp->gs_wpri = wpri; + gssp->gs_ncpus = 0; + + /* + * We're grabbing CPU lock. Once we let go of it we have to ensure all + * set up of the gsqueue_set_t is complete, as it'll be in there for the + * various CPU DR bits. + */ + mutex_enter(&cpu_lock); + + for (i = 0; i < NCPU; i++) { + gsqueue_cpu_t *scp; + cpu_t *cp = cpu_get(i); + if (cp != NULL && CPU_ACTIVE(cp) && + cp->cpu_flags & CPU_EXISTS) { + scp = gsqueue_cpu_create(wpri, cp->cpu_id); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + } + } + + /* Finally we can add it to our global list and be done */ + mutex_enter(&gsqueue_lock); + list_insert_tail(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + mutex_exit(&cpu_lock); + + return (gssp); +} + +void +gsqueue_set_destroy(gsqueue_set_t *gssp) +{ + int i; + gsqueue_cpu_t *scp; + + /* + * Go through and unbind all of the squeues while cpu_lock is held and + * move them to the defunct list. Once that's done, we don't need to do + * anything else with cpu_lock. 
+ */ + mutex_enter(&cpu_lock); + mutex_enter(&gsqueue_lock); + list_remove(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + + mutex_enter(&gssp->gs_lock); + + for (i = 0; i < gssp->gs_ncpus; i++) { + scp = gssp->gs_cpus[i]; + squeue_unbind(scp->gqc_head); + list_insert_tail(&gssp->gs_defunct, scp); + gssp->gs_cpus[i] = NULL; + } + gssp->gs_ncpus = 0; + + mutex_exit(&gssp->gs_lock); + mutex_exit(&cpu_lock); + + while ((scp = list_remove_head(&gssp->gs_defunct)) != NULL) { + gsqueue_cpu_destroy(scp); + } + + while (gssp->gs_cbs != NULL) { + gsqueue_cb_t *cbp; + + cbp = gssp->gs_cbs; + gssp->gs_cbs = cbp->gcb_next; + kmem_cache_free(gsqueue_cb_cache, cbp); + } + + ASSERT3U(gssp->gs_ncpus, ==, 0); + ASSERT3P(list_head(&gssp->gs_defunct), ==, NULL); + ASSERT3P(gssp->gs_cbs, ==, NULL); + kmem_cache_free(gsqueue_set_cache, gssp); +} + +gsqueue_t * +gsqueue_set_get(gsqueue_set_t *gssp, uint_t index) +{ + squeue_t *sqp; + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + scp = gssp->gs_cpus[index % gssp->gs_ncpus]; + sqp = scp->gqc_head; + mutex_exit(&gssp->gs_lock); + return ((gsqueue_t *)sqp); +} + +uintptr_t +gsqueue_set_cb_add(gsqueue_set_t *gssp, gsqueue_cb_f cb, void *arg) +{ + gsqueue_cb_t *cbp; + + cbp = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP); + cbp->gcb_func = cb; + cbp->gcb_arg = arg; + + mutex_enter(&gssp->gs_lock); + cbp->gcb_next = gssp->gs_cbs; + gssp->gs_cbs = cbp; + mutex_exit(&gssp->gs_lock); + return ((uintptr_t)cbp); +} + +int +gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id) +{ + gsqueue_cb_t *cbp, *prev; + mutex_enter(&gssp->gs_lock); + cbp = gssp->gs_cbs; + prev = NULL; + while (cbp != NULL) { + if ((uintptr_t)cbp != id) { + prev = cbp; + cbp = cbp->gcb_next; + continue; + } + + if (prev == NULL) { + gssp->gs_cbs = cbp->gcb_next; + } else { + prev->gcb_next = cbp->gcb_next; + } + + mutex_exit(&gssp->gs_lock); + kmem_cache_free(gsqueue_cb_cache, cbp); + return (0); + } + mutex_exit(&gssp->gs_lock); + return (-1); +} + +void 
+gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg, + int flags, uint8_t tag) +{ + squeue_t *sqp = (squeue_t *)gsp; + + ASSERT(mp->b_next == NULL); + ASSERT(mp->b_prev == NULL); + mp->b_queue = (queue_t *)func; + mp->b_prev = arg; + sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag); +} + +static void +gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online) +{ + gsqueue_cb_t *cbp; + + ASSERT(MUTEX_HELD(&gssp->gs_lock)); + cbp = gssp->gs_cbs; + while (cbp != NULL) { + cbp->gcb_func(gssp, (gsqueue_t *)sqp, cbp->gcb_arg, online); + cbp = cbp->gcb_next; + } + +} + +/* + * When we online a processor we need to go through and either bind a defunct + * squeue or create a new one. We'll try to reuse a gsqueue_cpu_t from the + * defunct list that used to be on that processor. If no such gsqueue_cpu_t + * exists, then we'll create a new one. We'd rather avoid taking over an + * existing defunct one that used to be on another CPU, as it's not unreasonable + * to believe that its CPU will come back. More CPUs are offlined and onlined by + * the administrator or by creating cpu sets than actually get offlined by FMA.
+ */ +static void +gsqueue_handle_online(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + for (scp = list_head(&gssp->gs_defunct); scp != NULL; + scp = list_next(&gssp->gs_defunct, scp)) { + if (scp->gqc_cpuid == id) { + list_remove(&gssp->gs_defunct, scp); + break; + } + } + + if (scp == NULL) { + scp = gsqueue_cpu_create(gssp->gs_wpri, id); + } else { + squeue_bind(scp->gqc_head, id); + } + + ASSERT(gssp->gs_ncpus < NCPU); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + gsqueue_notify(gssp, scp->gqc_head, B_TRUE); + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +static void +gsqueue_handle_offline(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + int i; + gsqueue_cpu_t *scp = NULL; + + mutex_enter(&gssp->gs_lock); + for (i = 0; i < gssp->gs_ncpus; i++) { + if (gssp->gs_cpus[i]->gqc_cpuid == id) { + scp = gssp->gs_cpus[i]; + break; + } + } + + if (scp != NULL) { + squeue_unbind(scp->gqc_head); + list_insert_tail(&gssp->gs_defunct, scp); + gssp->gs_cpus[i] = gssp->gs_cpus[gssp->gs_ncpus-1]; + gssp->gs_ncpus--; + gsqueue_notify(gssp, scp->gqc_head, B_FALSE); + } + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +/* ARGSUSED */ +static int +gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + cp = cpu_get(id); + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + case CPU_CPUPART_IN: + if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS) + gsqueue_handle_online(cp->cpu_id); + break; + case CPU_UNCONFIG: + case CPU_OFF: + case CPU_CPUPART_OUT: + 
gsqueue_handle_offline(id); + break; + default: + break; + } + + return (0); +} + + +/* ARGSUSED */ +static int +gsqueue_set_cache_construct(void *buf, void *arg, int kmflags) +{ + gsqueue_set_t *gssp = buf; + + gssp->gs_cpus = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags); + if (gssp->gs_cpus == NULL) + return (-1); + + mutex_init(&gssp->gs_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&gssp->gs_defunct, sizeof (gsqueue_cpu_t), + offsetof(gsqueue_cpu_t, gqc_lnode)); + gssp->gs_ncpus = 0; + gssp->gs_cbs = NULL; + + return (0); +} + +/* ARGSUSED */ +static void +gsqueue_set_cache_destruct(void *buf, void *arg) +{ + gsqueue_set_t *gssp = buf; + + kmem_free(gssp->gs_cpus, sizeof (gsqueue_cpu_t *) * NCPU); + gssp->gs_cpus = NULL; + list_destroy(&gssp->gs_defunct); + mutex_destroy(&gssp->gs_lock); +} + +static void +gsqueue_ddiinit(void) +{ + list_create(&gsqueue_list, sizeof (gsqueue_set_t), + offsetof(gsqueue_set_t, gs_next)); + mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL); + + gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache", + sizeof (gsqueue_cb_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache", + sizeof (gsqueue_cpu_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + gsqueue_set_cache = kmem_cache_create("squeue_set_cache", + sizeof (gsqueue_set_t), + 0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct, + NULL, NULL, NULL, 0); + + + mutex_enter(&cpu_lock); + register_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); +} + +static int +gsqueue_ddifini(void) +{ + mutex_enter(&gsqueue_lock); + if (list_is_empty(&gsqueue_list) == 0) { + mutex_exit(&gsqueue_lock); + return (EBUSY); + } + list_destroy(&gsqueue_list); + mutex_exit(&gsqueue_lock); + + mutex_enter(&cpu_lock); + /* Undo the callback registration made in gsqueue_ddiinit(). */ + unregister_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); + + kmem_cache_destroy(gsqueue_set_cache); + kmem_cache_destroy(gsqueue_cpu_cache); + kmem_cache_destroy(gsqueue_cb_cache); + +
mutex_destroy(&gsqueue_lock); + + return (0); +} + +static struct modlmisc gsqueue_modmisc = { + &mod_miscops, + "gsqueue" +}; + +static struct modlinkage gsqueue_modlinkage = { + MODREV_1, + &gsqueue_modmisc, + NULL +}; + +int +_init(void) +{ + int ret; + + gsqueue_ddiinit(); + if ((ret = mod_install(&gsqueue_modlinkage)) != 0) { + VERIFY(gsqueue_ddifini() == 0); + return (ret); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&gsqueue_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + if ((ret = gsqueue_ddifini()) != 0) + return (ret); + + if ((ret = mod_remove(&gsqueue_modlinkage)) != 0) + return (ret); + + return (0); +} diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c new file mode 100644 index 0000000000..67bf55f213 --- /dev/null +++ b/usr/src/uts/common/io/inotify.c @@ -0,0 +1,1559 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2020 Joyent, Inc. + * Copyright (c) 2015 The MathWorks, Inc. All rights reserved. + */ + +/* + * Support for the inotify facility, a Linux-borne facility for asynchronous + * notification of certain events on specified files or directories. 
Our + * implementation broadly leverages the file event monitoring facility, and + * would actually be quite straightforward were it not for a very serious + * blunder in the inotify interface: in addition to allowing for one to be + * notified on events on a particular file or directory, inotify also allows + * for one to be notified on certain events on files _within_ a watched + * directory -- even though those events have absolutely nothing to do with + * the directory itself. This leads to all sorts of madness because file + * operations are (of course) not undertaken on paths but rather on open + * files -- and the relationships between open files and the paths that resolve + * to those files are neither static nor isomorphic. We implement this + * concept by having _child watches_ when directories are watched with events + * in IN_CHILD_EVENTS. We add child watches when a watch on a directory is + * first added, and we modify those child watches dynamically as files are + * created, deleted, moved into or moved out of the specified directory. This + * mechanism works well, absent hard links. Hard links, unfortunately, break + * this rather badly, and the user is warned that watches on directories that + * have multiple directory entries referring to the same file may behave + * unexpectedly. 
+ */ + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/inotify.h> +#include <sys/fem.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/vfs_opreg.h> +#include <sys/vmem.h> +#include <sys/avl.h> +#include <sys/sysmacros.h> +#include <sys/cyclic.h> +#include <sys/filio.h> + +struct inotify_state; +struct inotify_kevent; + +typedef struct inotify_watch inotify_watch_t; +typedef struct inotify_state inotify_state_t; +typedef struct inotify_kevent inotify_kevent_t; + +struct inotify_watch { + kmutex_t inw_lock; /* lock protecting ref count */ + int inw_refcnt; /* reference count */ + uint8_t inw_zombie:1; /* boolean: is zombie */ + uint8_t inw_fired:1; /* boolean: fired one-shot */ + uint8_t inw_active:1; /* boolean: watch is active */ + uint8_t inw_orphaned:1; /* boolean: orphaned */ + kcondvar_t inw_cv; /* condvar for zombifier */ + uint32_t inw_mask; /* mask of watch */ + int32_t inw_wd; /* watch descriptor */ + vnode_t *inw_vp; /* underlying vnode */ + inotify_watch_t *inw_parent; /* parent, if a child */ + avl_node_t inw_byvp; /* watches by vnode */ + avl_node_t inw_bywd; /* watches by descriptor */ + avl_tree_t inw_children; /* children, if a parent */ + char *inw_name; /* name, if a child */ + list_node_t inw_orphan; /* orphan list */ + cred_t *inw_cred; /* cred, if orphaned */ + inotify_state_t *inw_state; /* corresponding state */ +}; + +struct inotify_kevent { + inotify_kevent_t *ine_next; /* next event in queue */ + struct inotify_event ine_event; /* event (variable size) */ +}; + +#define INOTIFY_EVENT_LENGTH(ev) \ + (sizeof (inotify_kevent_t) + (ev)->ine_event.len) + +struct inotify_state { + kmutex_t ins_lock; /* lock protecting state */ + avl_tree_t ins_byvp; /* watches by vnode */ + avl_tree_t ins_bywd; /* watches by descriptor */ + vmem_t *ins_wds; /* watch identifier arena */ + int ins_maxwatches; /* maximum number of watches */ + int ins_maxevents; /* maximum number of events */ + int ins_nevents; /* current # of events */ + 
int32_t ins_size; /* total size of events */ + inotify_kevent_t *ins_head; /* head of event queue */ + inotify_kevent_t *ins_tail; /* tail of event queue */ + pollhead_t ins_pollhd; /* poll head */ + kcondvar_t ins_cv; /* condvar for reading */ + list_t ins_orphans; /* orphan list */ + ddi_periodic_t ins_cleaner; /* cyclic for cleaning */ + inotify_watch_t *ins_zombies; /* zombie watch list */ + cred_t *ins_cred; /* creator's credentials */ + inotify_state_t *ins_next; /* next state on global list */ +}; + +/* + * Tunables (exported read-only in lx-branded zones via /proc). + */ +int inotify_maxwatches = 8192; /* max watches per instance */ +int inotify_maxevents = 16384; /* max events */ +int inotify_maxinstances = 128; /* max instances per user */ + +/* + * Internal global variables. + */ +static kmutex_t inotify_lock; /* lock protecting state */ +static dev_info_t *inotify_devi; /* device info */ +static fem_t *inotify_femp; /* FEM pointer */ +static vmem_t *inotify_minor; /* minor number arena */ +static void *inotify_softstate; /* softstate pointer */ +static inotify_state_t *inotify_state; /* global list of state */ + +static void inotify_watch_event(inotify_watch_t *, uint64_t, char *); +static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *); +static void inotify_watch_delete(inotify_watch_t *, uint32_t); +static void inotify_watch_remove(inotify_state_t *state, + inotify_watch_t *watch); + +static int +inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset, + cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) { + inotify_watch_event(watch, flag & FWRITE ?
+ IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL); + } + + return (rval); +} + +static int +inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, + int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct, + vsecattr_t *vsecp) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_create(vf, name, vap, excl, mode, + vpp, cr, flag, ct, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE, name); + } + + return (rval); +} + +static int +inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) { + inotify_watch_insert(watch, svp, tnm); + inotify_watch_event(watch, IN_CREATE, tnm); + } + + return (rval); +} + +static int +inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp, + cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_mkdir(vf, name, vap, vpp, cr, + ct, flags, vsecp)) == 0) { + inotify_watch_insert(watch, *vpp, name); + inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name); + } + + return (rval); +} + +static int +inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_open(vf, mode, cr, ct)) == 0) + inotify_watch_event(watch, IN_OPEN, NULL); + + return (rval); +} + +static int +inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_read(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_ACCESS, NULL); + + return (rval); +} + +static int +inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, + 
caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags); + inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL); + + return (rval); +} + +int +inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE, nm); + + return (rval); +} + +int +inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm); + + return (rval); +} + +static int +inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_setattr(vf, vap, flags, cr, ct)) == 0) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + return (rval); +} + +static int +inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_write(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_MODIFY, NULL); + + return (rval); +} + +static int +inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + + switch (vnevent) { + case VE_RENAME_SRC: + inotify_watch_event(watch, IN_MOVE_SELF, NULL); + inotify_watch_delete(watch, IN_MOVE_SELF); + break; + case VE_REMOVE: + /* + * Linux will apparently fire an IN_ATTRIB event when the link + * count changes (including when it drops to 0 on a remove). 
+ * This is merely somewhat odd; what is amazing is that this + * IN_ATTRIB event is not visible on an inotify watch on the + * parent directory. (IN_ATTRIB events are normally sent to + * watches on the parent directory). While it's hard to + * believe that this constitutes desired semantics, ltp + * unfortunately tests this case (if implicitly); in the name + * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are + * explicitly watching the file that has been removed. + */ + if (watch->inw_parent == NULL) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + /*FALLTHROUGH*/ + case VE_RENAME_DEST: + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_RMDIR: + /* + * It seems that IN_ISDIR should really be OR'd in here, but + * Linux doesn't seem to do that in this case; for the sake of + * bug-for-bug compatibility, we don't do it either. + */ + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_CREATE: + case VE_TRUNCATE: + case VE_RESIZE: + inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL); + break; + case VE_LINK: + inotify_watch_event(watch, IN_ATTRIB, NULL); + break; + case VE_RENAME_SRC_DIR: + inotify_watch_event(watch, IN_MOVED_FROM, name); + break; + case VE_RENAME_DEST_DIR: + if (name == NULL) + name = dvp->v_path; + + inotify_watch_insert(watch, dvp, name); + inotify_watch_event(watch, IN_MOVED_TO, name); + break; + case VE_SUPPORT: + case VE_MOUNTEDOVER: + case VE_PRE_RENAME_SRC: + case VE_PRE_RENAME_DEST: + case VE_PRE_RENAME_DEST_DIR: + break; + } + + return (vnext_vnevent(vf, vnevent, dvp, name, ct)); +} + +const fs_operation_def_t inotify_vnodesrc_template[] = { + VOPNAME_CLOSE, { .femop_close = inotify_fop_close }, + VOPNAME_CREATE, { .femop_create = inotify_fop_create }, + VOPNAME_LINK, { .femop_link = inotify_fop_link }, + VOPNAME_MKDIR, { .femop_mkdir = inotify_fop_mkdir }, + VOPNAME_OPEN, { .femop_open = 
inotify_fop_open }, + VOPNAME_READ, { .femop_read = inotify_fop_read }, + VOPNAME_READDIR, { .femop_readdir = inotify_fop_readdir }, + VOPNAME_REMOVE, { .femop_remove = inotify_fop_remove }, + VOPNAME_RMDIR, { .femop_rmdir = inotify_fop_rmdir }, + VOPNAME_SETATTR, { .femop_setattr = inotify_fop_setattr }, + VOPNAME_WRITE, { .femop_write = inotify_fop_write }, + VOPNAME_VNEVENT, { .femop_vnevent = inotify_fop_vnevent }, + NULL, NULL +}; + +static int +inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + if (lhs->inw_wd < rhs->inw_wd) + return (-1); + + if (lhs->inw_wd > rhs->inw_wd) + return (1); + + return (0); +} + +static int +inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs) +{ + uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp; + + if (lvp < rvp) + return (-1); + + if (lvp > rvp) + return (1); + + return (0); +} + +static void +inotify_watch_hold(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 0); + watch->inw_refcnt++; + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_release(inotify_watch_t *watch) +{ + mutex_enter(&watch->inw_lock); + VERIFY(watch->inw_refcnt > 1); + + if (--watch->inw_refcnt == 1 && watch->inw_zombie) { + /* + * We're down to our last reference; kick anyone that might be + * waiting. + */ + cv_signal(&watch->inw_cv); + } + + mutex_exit(&watch->inw_lock); +} + +static void +inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name) +{ + inotify_kevent_t *event, *tail; + inotify_state_t *state = watch->inw_state; + uint32_t wd = watch->inw_wd, cookie = 0, len; + boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE; + inotify_watch_t *source = watch; + + if (!(mask &= watch->inw_mask) || mask == IN_ISDIR) + return; + + if (watch->inw_parent != NULL) { + /* + * This is an event on the child; if this isn't a valid child + * event, return. 
Otherwise, we move our watch to be our + * parent (which we know is around because we have a hold on + * it) and continue. + */ + if (!(mask & IN_CHILD_EVENTS)) + return; + + name = watch->inw_name; + watch = watch->inw_parent; + wd = watch->inw_wd; + } + + if (!removal) { + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || + watch->inw_fired || !watch->inw_active) { + mutex_exit(&state->ins_lock); + return; + } + } else { + if (!watch->inw_active) + return; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + } + + /* + * If this is an operation on a directory and it's a child event + * (event if it's not on a child), we specify IN_ISDIR. + */ + if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS)) + mask |= IN_ISDIR; + + if (mask & (IN_MOVED_FROM | IN_MOVED_TO)) + cookie = (uint32_t)curthread->t_did; + + if (state->ins_nevents >= state->ins_maxevents) { + /* + * We're at our maximum number of events -- turn our event + * into an IN_Q_OVERFLOW event, which will be coalesced if + * it's already the tail event. + */ + mask = IN_Q_OVERFLOW; + wd = (uint32_t)-1; + cookie = 0; + len = 0; + } + + if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd && + tail->ine_event.mask == mask && tail->ine_event.cookie == cookie && + ((tail->ine_event.len == 0 && len == 0) || + (name != NULL && tail->ine_event.len != 0 && + strcmp(tail->ine_event.name, name) == 0))) { + /* + * This is an implicitly coalesced event; we're done. + */ + if (!removal) + mutex_exit(&state->ins_lock); + return; + } + + if (name != NULL) { + /* + * We are in the context of a file event monitoring operation, + * so the name length is bounded by the kernel. 
+ */ + len = strlen(name) + 1; + len = roundup(len, sizeof (struct inotify_event)); + } else { + len = 0; + } + + event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP); + event->ine_event.wd = wd; + event->ine_event.mask = (uint32_t)mask; + event->ine_event.cookie = cookie; + event->ine_event.len = len; + + if (name != NULL) + (void) strcpy(event->ine_event.name, name); + + if (tail != NULL) { + tail->ine_next = event; + } else { + VERIFY(state->ins_head == NULL); + state->ins_head = event; + cv_broadcast(&state->ins_cv); + } + + state->ins_tail = event; + state->ins_nevents++; + state->ins_size += sizeof (event->ine_event) + len; + + if (removal) + return; + + if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) { + /* + * If this is a one-shot, we need to remove the watch. (Note + * that this will recurse back into inotify_watch_event() to + * fire the IN_IGNORED event -- but with "removal" set.) + */ + watch->inw_fired = 1; + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN); +} + +/* + * Destroy a watch. By the time we're in here, the watch must have exactly + * one reference. + */ +static void +inotify_watch_destroy(inotify_watch_t *watch) +{ + VERIFY(MUTEX_HELD(&watch->inw_lock)); + + if (watch->inw_name != NULL) + kmem_free(watch->inw_name, strlen(watch->inw_name) + 1); + + kmem_free(watch, sizeof (inotify_watch_t)); +} + +static int +inotify_fem_install(vnode_t *vp, inotify_watch_t *watch) +{ + /* + * For vnodes that are devices (of type VCHR or VBLK), we silently + * refuse to actually install any event monitor. This is to avoid + * single-thread deadlock when both a special device vnode and its + * underlying real vnode are being watched: releasing the device + * vnode upon watch removal can induce an attribute update on the + * underlying vnode, which will bring us into inotify_watch_event() + * with our lock already held. 
While we could fail earlier and more + * explicitly in this case, we choose to keep with the Linux behavior + * on unwatchable entities and allow the watch but not generate any + * events for it. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (0); + + return (fem_install(vp, inotify_femp, watch, OPARGUNIQ, + (void (*)(void *))inotify_watch_hold, + (void (*)(void *))inotify_watch_release)); +} + +static int +inotify_fem_uninstall(vnode_t *vp, inotify_watch_t *watch) +{ + /* + * See inotify_fem_install(), above, for our rationale here. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (0); + + return (fem_uninstall(vp, inotify_femp, watch)); +} + +/* + * Zombify a watch. By the time we come in here, it must be true that the + * watch has already been fem_uninstall()'d -- the only reference should be + * in the state's data structure. If we can get away with freeing it, we'll + * do that -- but if the reference count is greater than one due to an active + * vnode operation, we'll put this watch on the zombie list on the state + * structure. + */ +static void +inotify_watch_zombify(inotify_watch_t *watch) +{ + inotify_state_t *state = watch->inw_state; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(!watch->inw_zombie); + + watch->inw_zombie = 1; + + if (watch->inw_parent != NULL) { + inotify_watch_release(watch->inw_parent); + } else { + avl_remove(&state->ins_byvp, watch); + avl_remove(&state->ins_bywd, watch); + vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1); + watch->inw_wd = -1; + } + + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + /* + * There are no operations in flight and there is no way + * for anyone to discover this watch -- we can destroy it. + */ + inotify_watch_destroy(watch); + } else { + /* + * There are operations in flight; we will need to enqueue + * this for later destruction. 
+ */ + watch->inw_parent = state->ins_zombies; + state->ins_zombies = watch; + mutex_exit(&watch->inw_lock); + } +} + +static inotify_watch_t * +inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent, + const char *name, vnode_t *vp, uint32_t mask) +{ + inotify_watch_t *watch; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + + watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP); + + watch->inw_vp = vp; + watch->inw_mask = mask; + watch->inw_state = state; + watch->inw_refcnt = 1; + + if (parent == NULL) { + watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds, + 1, VM_BESTFIT | VM_SLEEP); + avl_add(&state->ins_byvp, watch); + avl_add(&state->ins_bywd, watch); + + avl_create(&watch->inw_children, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + } else { + VERIFY(name != NULL); + inotify_watch_hold(parent); + watch->inw_mask &= IN_CHILD_EVENTS; + watch->inw_parent = parent; + + /* + * Copy the name. Note that when the name is user-specified, + * its length is bounded by the copyinstr() to be MAXPATHLEN + * (and regardless, we know by this point that it exists in + * our parent). + */ + watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); + (void) strcpy(watch->inw_name, name); + + avl_add(&parent->inw_children, watch); + } + + /* + * Add our monitor to the vnode. We must not have the watch lock held + * when we do this, as it will immediately hold our watch. + */ + err = inotify_fem_install(vp, watch); + + VERIFY(err == 0); + + return (watch); +} + +/* + * Remove a (non-child) watch. This is called from either synchronous context + * via inotify_rm_watch() or monitor context via either a vnevent or a + * one-shot. 
+ *
+ * The caller must hold the state's ins_lock and the watch must not be a
+ * child watch; both conditions are VERIFY'd. The event monitor is
+ * uninstalled from the watch and from each of its children, each child is
+ * zombified, an IN_IGNORED event is queued for the watch, and the watch
+ * itself is then zombified. IN_REMOVAL is passed along with IN_IGNORED
+ * because inotify_watch_event() must not try to take ins_lock again.
+ */
+static void
+inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch)
+{
+	inotify_watch_t *child;
+	int err;
+
+	VERIFY(MUTEX_HELD(&state->ins_lock));
+	VERIFY(watch->inw_parent == NULL);
+
+	err = inotify_fem_uninstall(watch->inw_vp, watch);
+	VERIFY(err == 0);
+
+	/*
+	 * If we have children, we're going to remove them all and set them
+	 * all to be zombies.
+	 */
+	while ((child = avl_first(&watch->inw_children)) != NULL) {
+		VERIFY(child->inw_parent == watch);
+		avl_remove(&watch->inw_children, child);
+
+		err = inotify_fem_uninstall(child->inw_vp, child);
+		VERIFY(err == 0);
+
+		/*
+		 * If this child watch has been orphaned, remove it from the
+		 * state's list of orphans. (The cred that was held when the
+		 * child was orphaned is released here as well.)
+		 */
+		if (child->inw_orphaned) {
+			list_remove(&state->ins_orphans, child);
+			crfree(child->inw_cred);
+		}
+
+		VN_PHANTOM_RELE(child->inw_vp);
+
+		/*
+		 * We're down (or should be down) to a single reference to
+		 * this child watch; it's safe to zombify it.
+		 */
+		inotify_watch_zombify(child);
+	}
+
+	inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL);
+	VN_PHANTOM_RELE(watch->inw_vp);
+
+	/*
+	 * It's now safe to zombify the watch -- we know that the only reference
+	 * can come from operations in flight.
+	 */
+	inotify_watch_zombify(watch);
+}
+
+/*
+ * Delete a watch. Should only be called from VOP context.
+ *
+ * "event" is the inotify event that prompted the deletion; the vnevent
+ * monitor in this file passes either IN_MOVE_SELF or IN_DELETE_SELF. This
+ * takes (and drops) the state's ins_lock, and does the following:
+ *
+ *  - Nothing, if the event is not IN_DELETE_SELF and the watch's mask has
+ *    no child event bits set, or if the watch is already a zombie.
+ *
+ *  - For a watch that is not a child: the entire watch is removed if the
+ *    event is IN_DELETE_SELF; otherwise nothing is done.
+ *
+ *  - For a child watch on IN_DELETE_SELF where the parent's mask lacks
+ *    IN_EXCL_UNLINK: the watch is put on the orphan list (if it isn't
+ *    there already) and is otherwise left in place.
+ *
+ *  - For any other child watch: the watch is taken off of the orphan list
+ *    if it is on it and, provided that the parent still has a child for
+ *    this vnode, it is removed from the parent, its event monitor is
+ *    uninstalled, its phantom hold on the vnode is released and it is
+ *    zombified.
+ */
+static void
+inotify_watch_delete(inotify_watch_t *watch, uint32_t event)
+{
+	inotify_state_t *state = watch->inw_state;
+	inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent;
+	int err;
+
+	if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS))
+		return;
+
+	mutex_enter(&state->ins_lock);
+
+	if (watch->inw_zombie) {
+		mutex_exit(&state->ins_lock);
+		return;
+	}
+
+	if ((parent = watch->inw_parent) == NULL) {
+		if (event == IN_DELETE_SELF) {
+			/*
+			 * If we're here because we're being deleted and we
+			 * are not a child watch, we need to delete the entire
+			 * watch, children and all.
+			 */
+			inotify_watch_remove(state, watch);
+		}
+
+		mutex_exit(&state->ins_lock);
+		return;
+	} else {
+		if (event == IN_DELETE_SELF &&
+		    !(parent->inw_mask & IN_EXCL_UNLINK)) {
+			/*
+			 * This is a child watch for a file that is being
+			 * removed and IN_EXCL_UNLINK has not been specified;
+			 * indicate that it is orphaned and add it to the list
+			 * of orphans. (This list will be checked by the
+			 * cleaning cyclic to determine when the watch has
+			 * become the only hold on the vnode, at which point
+			 * the watch can be zombified.) Note that we check
+			 * if the watch is orphaned before we orphan it: hard
+			 * links make it possible for VE_REMOVE to be called
+			 * multiple times on the same vnode. (!)
+			 */
+			if (!watch->inw_orphaned) {
+				watch->inw_orphaned = 1;
+				watch->inw_cred = CRED();
+				crhold(watch->inw_cred);
+				list_insert_head(&state->ins_orphans, watch);
+			}
+
+			mutex_exit(&state->ins_lock);
+			return;
+		}
+
+		if (watch->inw_orphaned) {
+			/*
+			 * If we're here, a file was orphaned and then later
+			 * moved -- which almost certainly means that hard
+			 * links are on the scene. We choose the orphan over
+			 * the move because we don't want to spuriously
+			 * drop events if we can avoid it.
+			 */
+			crfree(watch->inw_cred);
+			list_remove(&state->ins_orphans, watch);
+		}
+	}
+
+	if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) {
+		/*
+		 * This watch has already been deleted from the parent.
+		 */
+		mutex_exit(&state->ins_lock);
+		return;
+	}
+
+	avl_remove(&parent->inw_children, watch);
+	err = inotify_fem_uninstall(watch->inw_vp, watch);
+	VERIFY(err == 0);
+
+	VN_PHANTOM_RELE(watch->inw_vp);
+
+	/*
+	 * It's now safe to zombify the watch -- which won't actually delete
+	 * it as we know that the reference count is greater than 1.
+	 */
+	inotify_watch_zombify(watch);
+	mutex_exit(&state->ins_lock);
+}
+
+/*
+ * Insert a new child watch. Should only be called from VOP context when
+ * a child is created in a watched directory.
+ *
+ * "watch" is the watch on the directory, "vp" is the vnode of the new child
+ * and "name" is the child's name. This does nothing if the watch's mask
+ * has no child event bits set, if the watch is a zombie or is itself a
+ * child watch, if vp is NULL, or if the watch already has a child watch for
+ * vp. Otherwise, a phantom hold is taken on vp and a child watch carrying
+ * the parent's mask is added. The state's ins_lock is taken and dropped
+ * here.
+ */
+static void
+inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name)
+{
+	inotify_state_t *state = watch->inw_state;
+	inotify_watch_t cmp = { .inw_vp = vp };
+
+	if (!(watch->inw_mask & IN_CHILD_EVENTS))
+		return;
+
+	mutex_enter(&state->ins_lock);
+
+	if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) {
+		mutex_exit(&state->ins_lock);
+		return;
+	}
+
+	if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
+		mutex_exit(&state->ins_lock);
+		return;
+	}
+
+	VN_PHANTOM_HOLD(vp);
+	watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask);
+	VERIFY(watch != NULL);
+
+	mutex_exit(&state->ins_lock);
+}
+
+
+/*
+ * Add a watch on the specified vnode, or update the existing watch on it
+ * (INOTIFYIOC_ADD_WATCH). Only the IN_ALL_EVENTS and IN_MODIFIERS bits of
+ * "mask" are used, and the IN_UNMASKABLE bits are always set. If a watch
+ * on vp already exists, its mask is replaced -- or, if IN_MASK_ADD is
+ * specified, added to.
+ *
+ * Returns 0 with the watch descriptor stored in *wdp, or ENOSPC if a new
+ * watch is needed and the state already has ins_maxwatches watches.
+ */
+static int
+inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask,
+    int32_t *wdp)
+{
+	inotify_watch_t *watch, cmp = { .inw_vp = vp };
+	uint32_t set;
+
+	set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE;
+
+	/*
+	 * Lookup our vnode to determine if we already have a watch on it.
+	 */
+	mutex_enter(&state->ins_lock);
+
+	if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
+		/*
+		 * We don't have this watch; allocate a new one, provided that
+		 * we have fewer than our limit.
+		 */
+		if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) {
+			mutex_exit(&state->ins_lock);
+			return (ENOSPC);
+		}
+
+		VN_PHANTOM_HOLD(vp);
+		watch = inotify_watch_add(state, NULL, NULL, vp, set);
+		*wdp = watch->inw_wd;
+		mutex_exit(&state->ins_lock);
+
+		return (0);
+	}
+
+	VERIFY(!watch->inw_zombie);
+
+	if (!(mask & IN_MASK_ADD)) {
+		/*
+		 * Note that if we're resetting our event mask and we're
+		 * transitioning from an event mask that includes child events
+		 * to one that doesn't, there will be potentially some stale
+		 * child watches. This is basically fine: they won't fire,
+		 * and they will correctly be removed when the watch is
+		 * removed.
+		 */
+		watch->inw_mask = 0;
+	}
+
+	watch->inw_mask |= set;
+
+	*wdp = watch->inw_wd;
+
+	mutex_exit(&state->ins_lock);
+
+	return (0);
+}
+
+/*
+ * Add a child watch for the entry "name" in the watched directory "vp"
+ * (INOTIFYIOC_ADD_CHILD). The child watch takes on the mask of the watch
+ * on vp.
+ *
+ * Returns 0 if the child watch was added or was already present, EINVAL if
+ * the name contains a '/', ENXIO if there is no watch on vp, or the error
+ * from lookupnameat() if the name cannot be looked up.
+ */
+static int
+inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name)
+{
+	inotify_watch_t *watch, cmp = { .inw_vp = vp };
+	vnode_t *cvp;
+	int err;
+
+	/*
+	 * Verify that the specified child doesn't have a directory component
+	 * within it.
+	 */
+	if (strchr(name, '/') != NULL)
+		return (EINVAL);
+
+	/*
+	 * Lookup the underlying file. Note that this will succeed even if
+	 * we don't have permissions to actually read the file.
+	 */
+	if ((err = lookupnameat(name,
+	    UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) {
+		return (err);
+	}
+
+	/*
+	 * Use our vnode to find our watch, and then add our child watch to it.
+	 */
+	mutex_enter(&state->ins_lock);
+
+	if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) {
+		/*
+		 * This is unexpected -- it means that we don't have the
+		 * watch that we thought we had.
+		 */
+		mutex_exit(&state->ins_lock);
+		VN_RELE(cvp);
+		return (ENXIO);
+	}
+
+	/*
+	 * Now lookup the child vnode in the watch; we'll only add it if it
+	 * isn't already there.
+	 */
+	cmp.inw_vp = cvp;
+
+	if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) {
+		mutex_exit(&state->ins_lock);
+		VN_RELE(cvp);
+		return (0);
+	}
+
+	/* Trade the plain hold from lookupnameat() for a phantom hold */
+	VN_PHANTOM_HOLD(cvp);
+	VN_RELE(cvp);
+
+	watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask);
+	VERIFY(watch != NULL);
+	mutex_exit(&state->ins_lock);
+
+	return (0);
+}
+
+/*
+ * Remove the watch with the specified watch descriptor
+ * (INOTIFYIOC_RM_WATCH). Returns 0 on success, or EINVAL if the state has
+ * no watch with that descriptor.
+ */
+static int
+inotify_rm_watch(inotify_state_t *state, int32_t wd)
+{
+	inotify_watch_t *watch, cmp = { .inw_wd = wd };
+
+	mutex_enter(&state->ins_lock);
+
+	if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
+		mutex_exit(&state->ins_lock);
+		return (EINVAL);
+	}
+
+	inotify_watch_remove(state, watch);
+	mutex_exit(&state->ins_lock);
+
+	/*
+	 * Because removing a watch will generate an IN_IGNORED event (and
+	 * because inotify_watch_remove() won't alone induce a pollwakeup()),
+	 * we need to explicitly issue a pollwakeup().
+	 */
+	pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN);
+
+	return (0);
+}
+
+/*
+ * Mark the watch with the specified watch descriptor as active
+ * (INOTIFYIOC_ACTIVATE); inotify_watch_event() does not queue events for a
+ * watch until it has been marked active. Returns 0 on success, or EINVAL
+ * if the state has no watch with that descriptor.
+ */
+static int
+inotify_activate(inotify_state_t *state, int32_t wd)
+{
+	inotify_watch_t *watch, cmp = { .inw_wd = wd };
+
+	mutex_enter(&state->ins_lock);
+
+	if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) {
+		mutex_exit(&state->ins_lock);
+		return (EINVAL);
+	}
+
+	watch->inw_active = 1;
+
+	mutex_exit(&state->ins_lock);
+
+	return (0);
+}
+
+/*
+ * Called periodically as a cyclic to process the orphans and zombies.
+ */ +static void +inotify_clean(void *arg) +{ + inotify_state_t *state = arg; + inotify_watch_t *watch, *parent, *next, **prev; + cred_t *savecred; + int err; + + mutex_enter(&state->ins_lock); + + for (watch = list_head(&state->ins_orphans); + watch != NULL; watch = next) { + next = list_next(&state->ins_orphans, watch); + + VERIFY(!watch->inw_zombie); + VERIFY((parent = watch->inw_parent) != NULL); + + if (watch->inw_vp->v_count > 1) + continue; + + avl_remove(&parent->inw_children, watch); + err = inotify_fem_uninstall(watch->inw_vp, watch); + VERIFY(err == 0); + + list_remove(&state->ins_orphans, watch); + + /* + * For purposes of releasing the vnode, we need to switch our + * cred to be the cred of the orphaning thread (which we held + * at the time this watch was orphaned). + */ + savecred = curthread->t_cred; + curthread->t_cred = watch->inw_cred; + VN_PHANTOM_RELE(watch->inw_vp); + crfree(watch->inw_cred); + curthread->t_cred = savecred; + + inotify_watch_zombify(watch); + } + + prev = &state->ins_zombies; + + while ((watch = *prev) != NULL) { + mutex_enter(&watch->inw_lock); + + if (watch->inw_refcnt == 1) { + *prev = watch->inw_parent; + inotify_watch_destroy(watch); + continue; + } + + prev = &watch->inw_parent; + mutex_exit(&watch->inw_lock); + } + + mutex_exit(&state->ins_lock); +} + +/*ARGSUSED*/ +static int +inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state; + major_t major = getemajor(*devp); + minor_t minor = getminor(*devp); + int instances = 0; + char c[64]; + + if (minor != INOTIFYMNRN_INOTIFY) + return (ENXIO); + + mutex_enter(&inotify_lock); + + for (state = inotify_state; state != NULL; state = state->ins_next) { + if (state->ins_cred == cred_p) + instances++; + } + + if (instances >= inotify_maxinstances) { + mutex_exit(&inotify_lock); + return (EMFILE); + } + + minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1, + VM_BESTFIT | VM_SLEEP); + + if (ddi_soft_state_zalloc(inotify_softstate, minor) != 
DDI_SUCCESS) { + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + mutex_exit(&inotify_lock); + return (EINVAL); + } + + state = ddi_get_soft_state(inotify_softstate, minor); + *devp = makedevice(major, minor); + + crhold(cred_p); + state->ins_cred = cred_p; + state->ins_next = inotify_state; + inotify_state = state; + + (void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor); + state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1, + NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER); + + avl_create(&state->ins_bywd, + (int(*)(const void *, const void *))inotify_watch_cmpwd, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_bywd)); + + avl_create(&state->ins_byvp, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + + list_create(&state->ins_orphans, sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_orphan)); + + state->ins_maxwatches = inotify_maxwatches; + state->ins_maxevents = inotify_maxevents; + + mutex_exit(&inotify_lock); + + state->ins_cleaner = ddi_periodic_add(inotify_clean, + state, NANOSEC, DDI_IPL_0); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + inotify_state_t *state; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + int err = 0, nevents = 0; + size_t len; + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + while (state->ins_head == NULL) { + if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + mutex_exit(&state->ins_lock); + return (EAGAIN); + } + + if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) { + mutex_exit(&state->ins_lock); + return (EINTR); + } + } + + /* + * We have events and we have our lock; return as many as we can. 
+ */ + while ((event = state->ins_head) != NULL) { + len = sizeof (event->ine_event) + event->ine_event.len; + + if (uio->uio_resid < len) { + if (nevents == 0) + err = EINVAL; + break; + } + + nevents++; + + if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0) + break; + + VERIFY(state->ins_nevents > 0); + state->ins_nevents--; + + VERIFY(state->ins_size > 0); + state->ins_size -= len; + + if ((state->ins_head = event->ine_next) == NULL) { + VERIFY(event == state->ins_tail); + VERIFY(state->ins_nevents == 0); + state->ins_tail = NULL; + } + + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + mutex_exit(&state->ins_lock); + + return (err); +} + +static int +inotify_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + mutex_enter(&state->ins_lock); + + if (state->ins_head != NULL) { + *reventsp = events & (POLLRDNORM | POLLIN); + } else { + *reventsp = 0; + } + + if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { + *phpp = &state->ins_pollhd; + } + + mutex_exit(&state->ins_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + inotify_state_t *state; + minor_t minor = getminor(dev); + file_t *fp; + int rval; + + state = ddi_get_soft_state(inotify_softstate, minor); + + switch (cmd) { + case INOTIFYIOC_ADD_WATCH: { + inotify_addwatch_t addwatch; + file_t *fp; + + if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0) + return (EFAULT); + + if ((fp = getf(addwatch.inaw_fd)) == NULL) + return (EBADF); + + rval = inotify_add_watch(state, fp->f_vnode, + addwatch.inaw_mask, rv); + + releasef(addwatch.inaw_fd); + return (rval); + } + + case INOTIFYIOC_ADD_CHILD: { + inotify_addchild_t addchild; + char name[MAXPATHLEN]; + + if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0) + return (EFAULT); + + if 
(copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0) + return (EFAULT); + + if ((fp = getf(addchild.inac_fd)) == NULL) + return (EBADF); + + rval = inotify_add_child(state, fp->f_vnode, name); + + releasef(addchild.inac_fd); + return (rval); + } + + case INOTIFYIOC_RM_WATCH: + return (inotify_rm_watch(state, arg)); + + case INOTIFYIOC_ACTIVATE: + return (inotify_activate(state, arg)); + + case FIONREAD: { + int32_t size; + + mutex_enter(&state->ins_lock); + size = state->ins_size; + mutex_exit(&state->ins_lock); + + if (copyout(&size, (void *)arg, sizeof (size)) != 0) + return (EFAULT); + + return (0); + } + + default: + break; + } + + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ + inotify_state_t *state, **sp; + inotify_watch_t *watch, *zombies; + inotify_kevent_t *event; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(inotify_softstate, minor); + + if (state->ins_pollhd.ph_list != NULL) { + pollwakeup(&state->ins_pollhd, POLLERR); + pollhead_clean(&state->ins_pollhd); + } + + mutex_enter(&state->ins_lock); + + /* + * First, destroy all of our watches. + */ + while ((watch = avl_first(&state->ins_bywd)) != NULL) + inotify_watch_remove(state, watch); + + /* + * And now destroy our event queue. + */ + while ((event = state->ins_head) != NULL) { + state->ins_head = event->ine_next; + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + zombies = state->ins_zombies; + state->ins_zombies = NULL; + mutex_exit(&state->ins_lock); + + /* + * Now that our state lock is dropped, we can synchronously wait on + * any zombies. 
+ */ + while ((watch = zombies) != NULL) { + zombies = zombies->inw_parent; + + mutex_enter(&watch->inw_lock); + + while (watch->inw_refcnt > 1) + cv_wait(&watch->inw_cv, &watch->inw_lock); + + inotify_watch_destroy(watch); + } + + if (state->ins_cleaner != NULL) { + ddi_periodic_delete(state->ins_cleaner); + state->ins_cleaner = NULL; + } + + mutex_enter(&inotify_lock); + + /* + * Remove our state from our global list, and release our hold on + * the cred. + */ + for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next)) + VERIFY(*sp != NULL); + + *sp = (*sp)->ins_next; + crfree(state->ins_cred); + vmem_destroy(state->ins_wds); + + ddi_soft_state_free(inotify_softstate, minor); + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + + mutex_exit(&inotify_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + mutex_enter(&inotify_lock); + + if (ddi_soft_state_init(&inotify_softstate, + sizeof (inotify_state_t), 0) != 0) { + cmn_err(CE_NOTE, "/dev/inotify failed to create soft state"); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "inotify", S_IFCHR, + INOTIFYMNRN_INOTIFY, DDI_PSEUDO, 0) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node"); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (fem_create("inotify_fem", + inotify_vnodesrc_template, &inotify_femp) != 0) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state"); + ddi_remove_minor_node(devi, NULL); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + inotify_devi = devi; + + inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE, + UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int 
+inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_SUSPEND: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&inotify_lock); + fem_free(inotify_femp); + vmem_destroy(inotify_minor); + + ddi_remove_minor_node(inotify_devi, NULL); + inotify_devi = NULL; + + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)inotify_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} + +static struct cb_ops inotify_cb_ops = { + inotify_open, /* open */ + inotify_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + inotify_read, /* read */ + nodev, /* write */ + inotify_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + inotify_poll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops inotify_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + inotify_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + inotify_attach, /* attach */ + inotify_detach, /* detach */ + nodev, /* reset */ + &inotify_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "inotify support", /* name of module */ + &inotify_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); 
+} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/io/inotify.conf b/usr/src/uts/common/io/inotify.conf new file mode 100644 index 0000000000..ce9da6180f --- /dev/null +++ b/usr/src/uts/common/io/inotify.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014 Joyent, Inc. All rights reserved. +# + +name="inotify" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c index 86c46bee44..9966641df4 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c @@ -284,7 +284,7 @@ static adapter_info_t ixgbe_82599eb_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -315,7 +315,7 @@ static adapter_info_t ixgbe_X540_cap = { 128, /* default number of rx queues */ 64, /* maximum number of rx groups */ 1, /* minimum number of rx groups */ - 1, /* default number of rx groups */ + 32, /* default number of rx groups */ 128, /* maximum number of tx queues */ 1, /* minimum number of tx queues */ 8, /* default number of tx queues */ @@ -2049,6 +2049,7 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, void *arg1, void *arg2) { ixgbe_t *ixgbe = (ixgbe_t *)arg1; + 
int prev = ixgbe->intr_cnt; switch (cbaction) { /* IRM callback */ @@ -2062,7 +2063,8 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, if (ixgbe_intr_adjust(ixgbe, cbaction, count) != DDI_SUCCESS) { ixgbe_error(ixgbe, - "IRM CB: Failed to adjust interrupts"); + "IRM CB: Failed to adjust interrupts [%d %d %d]", + cbaction, count, prev); goto cb_fail; } break; diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index c71297f0de..311fae6719 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -22,7 +22,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent, Inc. + * Copyright 2017 Joyent, Inc. * Copyright 2022 Garrett D'Amore */ diff --git a/usr/src/uts/common/io/ksyms.c b/usr/src/uts/common/io/ksyms.c index 74e71ed7e8..759b524186 100644 --- a/usr/src/uts/common/io/ksyms.c +++ b/usr/src/uts/common/io/ksyms.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -219,6 +220,14 @@ ksyms_open(dev_t *devp, int flag, int otyp, struct cred *cred) char *addr; void *hptr = NULL; ksyms_buflist_hdr_t hdr; + + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); + bzero(&hdr, sizeof (struct ksyms_buflist_hdr)); list_create(&hdr.blist, PAGESIZE, offsetof(ksyms_buflist_t, buflist_node)); diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 41d3ee5fe1..4ce359f87b 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -159,7 +159,7 @@ * perimeter) across a call to any other layer from the mac layer. 
The call to * any other layer could be via mi_* entry points, classifier entry points into * the driver or via upcall pointers into layers above. The mac perimeter may - * be acquired or held only in the down direction, for e.g. when calling into + * be acquired or held only in the down direction, e.g. when calling into * a mi_* driver enty point to provide atomicity of the operation. * * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across @@ -208,7 +208,7 @@ * number whenever the ring's stop routine is invoked. * See comments in mac_rx_ring(); * - * R17 Similarly mi_stop is another synchronization point and the driver must + * R17. Similarly mi_stop is another synchronization point and the driver must * ensure that all upcalls are done and there won't be any future upcall * before returning from mi_stop. * @@ -228,7 +228,7 @@ * * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind] * - * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename] + * mac perim -> i_dls_devnet_lock [dls_devnet_rename] * * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac * client to driver. 
In the case of clients that explictly use the mac provided diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index 94ad441a65..952b4c844b 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -3349,6 +3349,11 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mac_cb_info_t *mcbi; int rc; + if ((flags & MAC_PROMISC_FLAGS_NO_COPY) && + (flags & MAC_PROMISC_FLAGS_DO_FIXUPS)) { + return (EINVAL); + } + i_mac_perim_enter(mip); if ((rc = mac_start((mac_handle_t)mip)) != 0) { @@ -3395,6 +3400,7 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, mpip->mpi_strip_vlan_tag = ((flags & MAC_PROMISC_FLAGS_VLAN_TAG_STRIP) != 0); mpip->mpi_no_copy = ((flags & MAC_PROMISC_FLAGS_NO_COPY) != 0); + mpip->mpi_do_fixups = ((flags & MAC_PROMISC_FLAGS_DO_FIXUPS) != 0); mcbi = &mip->mi_promisc_cb_info; mutex_enter(mcbi->mcbi_lockp); @@ -4105,13 +4111,33 @@ mac_promisc_dispatch_one(mac_promisc_impl_t *mpip, mblk_t *mp, { mblk_t *mp_next; - if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag) { + if (!mpip->mpi_no_copy || mpip->mpi_strip_vlan_tag || + (mpip->mpi_do_fixups && local)) { mblk_t *mp_copy; mp_copy = copymsg(mp); if (mp_copy == NULL) return; + /* + * The consumer has requested we emulate HW offloads + * for host-local packets. + */ + if (mpip->mpi_do_fixups && local) { + /* + * Remember that copymsg() doesn't copy + * b_next, so we are only passing a single + * packet to mac_hw_emul(). Also keep in mind + * that mp_copy will become an mblk chain if + * the argument is an LSO message. 
+ */ + mac_hw_emul(&mp_copy, NULL, NULL, + MAC_HWCKSUM_EMUL | MAC_LSO_EMUL); + + if (mp_copy == NULL) + return; + } + if (mpip->mpi_strip_vlan_tag) { mp_copy = mac_strip_vlan_tag_chain(mp_copy); if (mp_copy == NULL) @@ -4320,16 +4346,15 @@ mac_info_get(const char *name, mac_info_t *minfop) /* * To get the capabilities that MAC layer cares about, such as rings, factory * mac address, vnic or not, it should directly invoke this function. If the - * link is part of a bridge, then the only "capability" it has is the inability - * to do zero copy. + * link is part of a bridge, then the link is unable to do zero copy. */ boolean_t i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) { mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_bridge_link != NULL) { - return (cap == MAC_CAPAB_NO_ZCOPY); + if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) { + return (B_TRUE); } else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) { boolean_t res; diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c index 17959ac48d..cff1f884b9 100644 --- a/usr/src/uts/common/io/mac/mac_protect.c +++ b/usr/src/uts/common/io/mac/mac_protect.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved. */ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. 
@@ -209,7 +209,7 @@ typedef struct slaac_addr { } slaac_addr_t; static void start_txn_cleanup_timer(mac_client_impl_t *); -static boolean_t allowed_ips_set(mac_resource_props_t *, uint32_t); +static boolean_t dynamic_method_set(mac_protect_t *, uint32_t); #define BUMP_STAT(m, s) (m)->mci_misc_stat.mms_##s++ @@ -580,8 +580,7 @@ intercept_dhcpv4_outbound(mac_client_impl_t *mcip, ipha_t *ipha, uchar_t *end) if (get_dhcpv4_info(ipha, end, &dh4) != 0) return (B_TRUE); - /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */ - if (allowed_ips_set(mrp, IPV4_VERSION)) + if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV4)) return (B_FALSE); if (get_dhcpv4_option(dh4, end, CD_DHCP_TYPE, &opt, &opt_len) != 0 || @@ -1310,8 +1309,7 @@ intercept_dhcpv6_outbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end) if (get_dhcpv6_info(ip6h, end, &dh6) != 0) return (B_TRUE); - /* ip_nospoof/allowed-ips and DHCP are mutually exclusive by default */ - if (allowed_ips_set(mrp, IPV6_VERSION)) + if (!dynamic_method_set(&mrp->mrp_protect, MPT_DYN_DHCPV6)) return (B_FALSE); /* @@ -1517,6 +1515,10 @@ intercept_ra_inbound(mac_client_impl_t *mcip, ip6_t *ip6h, uchar_t *end, { struct nd_opt_hdr *opt; int len, optlen; + mac_protect_t *protect = &MCIP_RESOURCE_PROPS(mcip)->mrp_protect; + + if (!dynamic_method_set(protect, MPT_DYN_SLAAC)) + return; if (ip6h->ip6_hlim != 255) { DTRACE_PROBE1(invalid__hoplimit, uint8_t, ip6h->ip6_hlim); @@ -1755,6 +1757,7 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect, if (*addr == INADDR_ANY) return (B_TRUE); + /* If any specific addresses or subnets are allowed, check them */ for (i = 0; i < protect->mp_ipaddrcnt; i++) { mac_ipaddr_t *v4addr = &protect->mp_ipaddrs[i]; @@ -1775,14 +1778,19 @@ ipnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *protect, return (B_TRUE); } } - return (protect->mp_ipaddrcnt == 0 ? 
- check_dhcpv4_dyn_ip(mcip, *addr) : B_FALSE); + + if (dynamic_method_set(protect, MPT_DYN_DHCPV4)) { + return (check_dhcpv4_dyn_ip(mcip, *addr)); + } + + return (B_FALSE); } static boolean_t ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, in6_addr_t *addr) { + boolean_t slaac_enabled, dhcpv6_enabled; uint_t i; /* @@ -1793,7 +1801,7 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, IN6_ARE_ADDR_EQUAL(&mcip->mci_v6_local_addr, addr))) return (B_TRUE); - + /* If any specific addresses or subnets are allowed, check them */ for (i = 0; i < protect->mp_ipaddrcnt; i++) { mac_ipaddr_t *v6addr = &protect->mp_ipaddrs[i]; @@ -1804,12 +1812,15 @@ ipnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *protect, return (B_TRUE); } - if (protect->mp_ipaddrcnt == 0) { - return (check_slaac_ip(mcip, addr) || - check_dhcpv6_dyn_ip(mcip, addr)); - } else { - return (B_FALSE); - } + slaac_enabled = dynamic_method_set(protect, MPT_DYN_SLAAC); + if (slaac_enabled && check_slaac_ip(mcip, addr)) + return (B_TRUE); + + dhcpv6_enabled = dynamic_method_set(protect, MPT_DYN_DHCPV6); + if (dhcpv6_enabled && check_dhcpv6_dyn_ip(mcip, addr)) + return (B_TRUE); + + return (B_FALSE); } /* @@ -2025,6 +2036,9 @@ dhcpnospoof_check_cid(mac_protect_t *p, uchar_t *cid, uint_t cidlen) bcmp(dcid->dc_id, cid, cidlen) == 0) return (B_TRUE); } + + DTRACE_PROBE3(missing__cid, mac_protect_t *, p, + uchar_t *, cid, uint_t, cidlen); return (B_FALSE); } @@ -2046,6 +2060,12 @@ dhcpnospoof_check_v4(mac_client_impl_t *mcip, mac_protect_t *p, bcmp(mcip->mci_unicast->ma_addr, dh4->chaddr, maclen) != 0) { return (B_FALSE); } + + /* Everything after here is checking the Client Identifier */ + if (p->mp_allcids == MPT_TRUE) { + return (B_TRUE); + } + if (get_dhcpv4_option(dh4, end, CD_CLIENT_ID, &cid, &optlen) == 0) cidlen = optlen; @@ -2082,6 +2102,11 @@ dhcpnospoof_check_v6(mac_client_impl_t *mcip, mac_protect_t *p, mtype == DHCPV6_MSG_RECONFIGURE) return (B_TRUE); + /* 
Everything after here is checking the Client Identifier */ + if (p->mp_allcids == MPT_TRUE) { + return (B_TRUE); + } + d6o = get_dhcpv6_option(&dh6[1], end - (uchar_t *)&dh6[1], NULL, DHCPV6_OPT_CLIENTID, &cidlen); if (d6o == NULL || (uchar_t *)d6o + cidlen > end) @@ -2159,7 +2184,6 @@ dhcpnospoof_check(mac_client_impl_t *mcip, mac_protect_t *protect, return (0); fail: - /* increment dhcpnospoof stat here */ freemsg(nmp); return (err); } @@ -2487,6 +2511,11 @@ mac_protect_validate(mac_resource_props_t *mrp) if ((err = validate_cids(p)) != 0) return (err); + if (p->mp_allcids != MPT_FALSE && p->mp_allcids != MPT_TRUE && + p->mp_allcids != MPT_RESET) { + return (EINVAL); + } + return (0); } @@ -2554,6 +2583,16 @@ mac_protect_update(mac_resource_props_t *new, mac_resource_props_t *curr) cp->mp_cidcnt = 0; } } + if (np->mp_allcids == MPT_RESET) { + cp->mp_allcids = MPT_FALSE; + } else if (np->mp_allcids != 0) { + cp->mp_allcids = MPT_TRUE; + } + if (np->mp_dynamic == MPT_RESET) { + cp->mp_dynamic = 0; + } else if (np->mp_dynamic != 0) { + cp->mp_dynamic = np->mp_dynamic; + } } void @@ -2597,15 +2636,50 @@ mac_protect_fini(mac_client_impl_t *mcip) } static boolean_t -allowed_ips_set(mac_resource_props_t *mrp, uint32_t af) +dynamic_method_set(mac_protect_t *mpt, uint32_t method) +{ + if (mpt->mp_dynamic != 0) { + return ((mpt->mp_dynamic & method) != 0); + } else { + return (mpt->mp_ipaddrcnt == 0); + } +} + +boolean_t +mac_protect_check_addr(mac_client_handle_t mch, boolean_t isv6, + in6_addr_t *v6addr) { - int i; + mac_perim_handle_t perim; + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; - for (i = 0; i < mrp->mrp_protect.mp_ipaddrcnt; i++) { - if (mrp->mrp_protect.mp_ipaddrs[i].ip_version == af) - return (B_TRUE); + mac_perim_enter_by_mh(mh, &perim); + + mac_resource_props_t *mrp = MCIP_RESOURCE_PROPS(mcip); + mac_protect_t *p; + boolean_t allowed; + + ASSERT(mrp != NULL); + + p = &mrp->mrp_protect; + + /* If mac 
protection/ipnospoof isn't enabled, return true */ + if ((mrp->mrp_mask & MRP_PROTECT) == 0 || + (p->mp_types & MPT_IPNOSPOOF) == 0) { + allowed = B_TRUE; + goto done; } - return (B_FALSE); + + if (isv6) { + allowed = ipnospoof_check_v6(mcip, p, v6addr); + } else { + in_addr_t *v4addr = &V4_PART_OF_V6((*v6addr)); + allowed = ipnospoof_check_v4(mcip, p, v4addr); + } + +done: + mac_perim_exit(perim); + return (allowed); } mac_protect_t * diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c index dbb5c0a914..e1151565a6 100644 --- a/usr/src/uts/common/io/mac/mac_stat.c +++ b/usr/src/uts/common/io/mac/mac_stat.c @@ -391,8 +391,8 @@ i_mac_stat_create(void *handle, const char *modname, const char *statname, kstat_t *ksp; kstat_named_t *knp; - ksp = kstat_create(modname, 0, statname, "net", - KSTAT_TYPE_NAMED, count, 0); + ksp = kstat_create_zone(modname, 0, statname, "net", + KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return (NULL); @@ -949,9 +949,9 @@ mac_driver_stat_create(mac_impl_t *mip) major_t major = getmajor(mip->mi_phy_dev); count = MAC_MOD_NKSTAT + MAC_NKSTAT + mip->mi_type->mt_statcount; - ksp = kstat_create((const char *)ddi_major_to_name(major), + ksp = kstat_create_zone((const char *)ddi_major_to_name(major), getminor(mip->mi_phy_dev) - 1, MAC_KSTAT_NAME, - MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0); + MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return; diff --git a/usr/src/uts/common/io/mem.c b/usr/src/uts/common/io/mem.c index cde4fab94f..f61de9d5eb 100644 --- a/usr/src/uts/common/io/mem.c +++ b/usr/src/uts/common/io/mem.c @@ -223,10 +223,19 @@ mmopen(dev_t *devp, int flag, int typ, struct cred *cred) case M_NULL: case M_ZERO: case M_FULL: + /* standard devices */ + break; + case M_MEM: case M_KMEM: case M_ALLKMEM: - /* standard devices */ + /* + * These devices should never be visible in a zone, but if they + * somehow do get created we refuse to allow the zone to use + 
* them. + */ + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (EACCES); break; default: diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf index cfda434e23..6c585c6a42 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.conf +++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf @@ -13,3 +13,11 @@ # Fast-Path specific flag. Default is "yes". # mrsas-enable-fp="yes"; +flow_control="dmult" queue="qsort" tape="sctp"; + +# MSI specific flag. To enable MSI modify the flag value to "yes" +mrsas-enable-msi="yes"; + +# Fast-Path specific flag. To enable Fast-Path modify the flag value to "yes" +mrsas-enable-fp="yes"; + diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE new file mode 100644 index 0000000000..187088ff34 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2014, Thales UK Limited + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..cde8b65b37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +NFAST CRYPTO ACCELERATOR DRIVER diff --git a/usr/src/uts/common/io/nfp/autoversion.h b/usr/src/uts/common/io/nfp/autoversion.h new file mode 100644 index 0000000000..b9021942b2 --- /dev/null +++ b/usr/src/uts/common/io/nfp/autoversion.h @@ -0,0 +1,21 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* AUTOGENERATED - DO NOT EDIT */ +#ifndef AUTOVERSION_H +#define AUTOVERSION_H + +#define VERSION_RELEASEMAJOR 2 +#define VERSION_RELEASEMINOR 26 +#define VERSION_RELEASEPATCH 40 +#define VERSION_NO "2.26.40cam999" +#define VERSION_COMPNAME "nfdrv" + +#endif diff --git a/usr/src/uts/common/io/nfp/drvlist.c b/usr/src/uts/common/io/nfp/drvlist.c new file mode 100644 index 0000000000..a04b1fd5b0 --- /dev/null +++ b/usr/src/uts/common/io/nfp/drvlist.c @@ -0,0 +1,19 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include "nfp_common.h" +#include "nfp_cmd.h" + +const nfpcmd_dev *nfp_drvlist[] = { + &i21285_cmddev, + &i21555_cmddev, + NULL +}; + diff --git a/usr/src/uts/common/io/nfp/hostif.c b/usr/src/uts/common/io/nfp/hostif.c new file mode 100644 index 0000000000..684be703ea --- /dev/null +++ b/usr/src/uts/common/io/nfp/hostif.c @@ -0,0 +1,1192 @@ +/* + +hostif.c: nFast PCI driver for Solaris 2.5, 2.6, 2.7 and 2.8 + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + 
+06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +??/??/2001 jsh added support for solaris 2.8 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv +12/02/2002 jsh added high level interrupt support + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" + +#include "nfp.h" + +/* mapped memory attributes, no-swap endianess (done in higher level) */ +static struct ddi_device_acc_attr nosw_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC +}; + +/* dma attributes */ +static ddi_dma_attr_t dma_attrs = { + DMA_ATTR_V0, /* version number */ + (uint64_t)0x0, /* low address */ + (uint64_t)0xffffffff, /* high address */ + (uint64_t)0xffffff, /* DMA counter max */ + (uint64_t)0x1, /* alignment */ + 0x0c, /* burst sizes */ + 0x1, /* minimum transfer size */ + (uint64_t)0x3ffffff, /* maximum transfer size */ + (uint64_t)0x7fff, /* maximum segment size */ + 1, /* no scatter/gather lists */ + 1, /* granularity */ + 0 /* DMA flags */ +}; + +/* + * Debug message control + * Debug Levels: + * 0 = no messages + * 1 = Errors + * 2 = Subroutine calls & control flow + * 3 = I/O Data (verbose!) 
+ * Can be set with adb or in the /etc/system file with + * "set nfp:nfp_debug=<value>" + */ + +int nfp_debug= 1; + +static void *state_head; /* opaque handle top of state structs */ + +static int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp); +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp); +static int nfp_release_dev( dev_info_t *dip ); + +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_strategy(struct buf *bp); + +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp); +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static void nfp_wrtimeout (void *pdev); +static void nfp_rdtimeout (void *pdev); + +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result); +static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); + +static void nfp_read_complete_final(nfp_dev *pdev, int ok); +static void nfp_write_complete_final(nfp_dev *pdev, int ok); + +/* nfp file ops --------------------------------------------------- */ + +static struct cb_ops nfp_cb_ops = { + nfp_open, + nfp_close, + nodev, /* no nfp_strategy */ + nodev, /* no print routine */ + nodev, /* no dump routine */ + nfp_read, + nfp_write, + nfp_ioctl, + nodev, /* no devmap routine */ + nodev, /* no mmap routine */ + nodev, /* no segmap routine */ + nfp_chpoll, + ddi_prop_op, + 0, /* not a STREAMS driver, no cb_str routine */ + D_NEW | D_MP | EXTRA_CB_FLAGS, /* must be safe for multi-thread/multi-processor */ + CB_REV, + nodev, /* aread */ + nodev /* awrite */ +}; + +static struct dev_ops nfp_ops = { + DEVO_REV, /* DEVO_REV indicated by manual */ + 0, /* device reference count */ + nfp_getinfo, + nulldev, /* identify */ + nulldev, /* probe */ + nfp_attach, + nfp_detach, + nodev, /* 
device reset routine */ + &nfp_cb_ops, + (struct bus_ops *)0, /* bus operations */ +}; + +extern struct mod_ops mod_driverops; +static struct modldrv modldrv = { + &mod_driverops, + NFP_DRVNAME, + &nfp_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* MODREV_1 indicated by manual */ + (void *)&modldrv, + NULL, /* termination of list of linkage structures */ +}; + +/* interface resource allocation */ + +int nfp_alloc_pci_push( nfp_dev *pdev ) { + /* allocate resources needed for PCI Push, + * if not already allocated. + * return True if successful + */ + nfp_err ret; + uint_t cookie_count; + size_t real_length; + + if(!pdev->read_buf) { + /* allocate read buffer */ + pdev->read_buf = kmem_zalloc( NFP_READBUF_SIZE, KM_NOSLEEP ); + } + if(!pdev->read_buf) { + nfp_log( NFP_DBG1, "nfp_attach: kmem_zalloc read buffer failed"); + pdev->read_buf = NULL; + return 0; + } + + if(!pdev->rd_dma_ok) { + /* allocate dma handle for read buffer */ + ret = ddi_dma_alloc_handle( pdev->dip, + &dma_attrs, + DDI_DMA_DONTWAIT, + NULL, + &pdev->read_dma_handle ); + if( ret != DDI_SUCCESS ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_alloc_handle failed (%d)", + ret ); + return 0; + } + + /* Allocate the memory for dma transfers */ + ret = ddi_dma_mem_alloc(pdev->read_dma_handle, NFP_READBUF_SIZE, &nosw_attr, + DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL, + (caddr_t*)&pdev->read_buf, &real_length, &pdev->acchandle); + if (ret != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_alloc_pci_push: ddi_dma_mem_alloc failed (%d)", ret); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + + ret = ddi_dma_addr_bind_handle( pdev->read_dma_handle, + NULL, /* kernel address space */ + (caddr_t)pdev->read_buf, real_length, + DDI_DMA_READ | DDI_DMA_CONSISTENT, /* dma flags */ + DDI_DMA_DONTWAIT, NULL, + &pdev->read_dma_cookie, &cookie_count ); + if( ret != DDI_DMA_MAPPED ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_addr_bind_handle failed (%d)", + ret); + 
ddi_dma_mem_free(&pdev->acchandle); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + if( cookie_count > 1 ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: error:" + " ddi_dma_addr_bind_handle wants %d transfers", + cookie_count); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + pdev->rd_dma_ok = 1; + } + return pdev->rd_dma_ok; +} + +void nfp_free_pci_push( nfp_dev *pdev ) { + /* free resources allocated to PCI Push */ + if( pdev->rd_dma_ok ) { + (void) ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + pdev->rd_dma_ok = 0; + } + if( pdev->read_buf ) { + kmem_free( pdev->read_buf, NFP_READBUF_SIZE ); + pdev->read_buf = NULL; + } +} + +/* include definition of nfp_set_ifvers() */ +#define nfp_ifvers NFDEV_IF_PCI_PUSH +#include "nfp_ifvers.c" +#undef nfp_ifvers + +/*--------------------*/ +/* nfp_isr */ +/*--------------------*/ + +static u_int nfp_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + nfp_err ne; + int handled; + + nfp_log( NFP_DBG3, "nfp_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + + /* The isr needs to be mutex'ed - an SMP can call us while we're still + * running! 
+ */ + mutex_enter(&pdev->low_mutex); + ne= pdev->cmddev->isr( pdev->common.cmdctx, &handled ); + mutex_exit(&pdev->low_mutex); + + if( !ne && handled ) + return DDI_INTR_CLAIMED; + if (ne) + nfp_log( NFP_DBG1, "nfp_isr: failed"); + else + nfp_log( NFP_DBG3, "nfp_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + +/* + * Soft interrupt handler: consumes the high_read/high_write flags that + * nfp_read_complete()/nfp_write_complete() post under high_mutex and runs + * the matching *_complete_final() routine for each flag that was set. + * Returns DDI_INTR_CLAIMED if either flag was pending. + */ +static u_int nfp_soft_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + int rd, wr; + + nfp_log( NFP_DBG3, "nfp_soft_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_soft_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + rd= wr= 0; + + /* + * Sample and clear both flags under a single hold of high_mutex; the + * lock is dropped exactly once, after both have been examined. + */ + mutex_enter(&pdev->high_mutex); + if(pdev->high_read) { + pdev->high_read= 0; + rd= 1; + } + if(pdev->high_write) { + pdev->high_write= 0; + wr= 1; + } + mutex_exit(&pdev->high_mutex); + + if(rd) { + nfp_log( NFP_DBG3, "nfp_soft_isr: read done"); + nfp_read_complete_final(pdev, pdev->rd_ok); + } + if(wr) { + nfp_log( NFP_DBG3, "nfp_soft_isr: write done"); + nfp_write_complete_final(pdev, pdev->wr_ok); + } + if( rd || wr ) + return DDI_INTR_CLAIMED; + + nfp_log( NFP_DBG2, "nfp_soft_isr: unclaimed"); + return DDI_INTR_UNCLAIMED; +} + + +/*-------------------------*/ +/* nfp_read */ +/*-------------------------*/ + +void nfp_read_complete(nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_read_complete: entering"); + + if(pdev->high_intr) { + nfp_log(NFP_DBG2, "nfp_read_complete: high_intr"); + mutex_enter(&pdev->high_mutex); + nfp_log(NFP_DBG3, "nfp_read_complete: high_mutex entered"); + if(pdev->high_read) + nfp_log(NFP_DBG1, "nfp_read_complete: high_read allread set!"); + pdev->high_read= 1; + pdev->rd_ok= ok; + nfp_log(NFP_DBG3, "nfp_read_complete: exiting high_mutex"); + mutex_exit(&pdev->high_mutex); + ddi_trigger_softintr(pdev->soft_int_id); + } else + nfp_read_complete_final( pdev, ok ); + nfp_log( NFP_DBG2,"nfp_read_complete: exiting"); +} + +static void nfp_read_complete_final(nfp_dev *pdev, int ok) { + nfp_log( 
NFP_DBG2,"nfp_read_complete_final: entering"); + if(pdev->rdtimeout) + (void) untimeout(pdev->rdtimeout); + if(!pdev->rd_outstanding) { + nfp_log( NFP_DBG1,"nfp_read_complete_final: !pdev->rd_outstanding"); + } + nfp_log( NFP_DBG2,"nfp_read_complete_final: pdev->rd_outstanding=0, ok %d", ok); + mutex_enter(&pdev->isr_mutex); + pdev->rd_outstanding= 0; + pdev->rd_ready= 1; + pdev->rd_ok= ok; + cv_broadcast(&pdev->rd_cv); + mutex_exit(&pdev->isr_mutex); + pollwakeup (&pdev->pollhead, POLLRDNORM); + nfp_log( NFP_DBG2,"nfp_read_complete_final: exiting"); +} + +static void nfp_rdtimeout( void *pdev_in ) +{ + nfp_dev *pdev= (nfp_dev *)pdev_in; + + nfp_log( NFP_DBG1, "nfp_rdtimeout: read timed out"); + + if (!pdev) { + nfp_log( NFP_DBG1, "nfp_rdtimeout: NULL pdev." ); + return; + } + pdev->rdtimeout= 0; + nfp_read_complete_final(pdev, 0); +} + +/* + * read(9E) entry point: checks that the minor has soft state, then hands the + * request to nfp_strategy() through physio() as a B_READ transfer. Returns + * ENODEV when no soft state exists, otherwise physio()'s result. + */ +/* ARGSUSED */ +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp) { + int ret; + nfp_log( NFP_DBG2, "nfp_read: entered" ); + /* Fail only when the soft state is missing, as nfp_write() does. */ + if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) { + nfp_log( NFP_DBG1, "nfp_read: unable to get nfp_dev"); + return (ENODEV); + } + nfp_log( NFP_DBG2, "nfp_read: about to physio." 
); + ret = physio(nfp_strategy, (struct buf *)0, dev, B_READ, minphys, uiop ); + if(ret) + nfp_log( NFP_DBG1, "nfp_read: physio returned %x.", ret ); + return ret; +} + +/*-------------------------*/ +/* nfp_write */ +/*-------------------------*/ + +void nfp_write_complete( nfp_dev *pdev, int ok) { + nfp_log( NFP_DBG2,"nfp_write_complete: entering"); + + if(pdev->high_intr) { + mutex_enter(&pdev->high_mutex); + if(pdev->high_write) + nfp_log(NFP_DBG1, "nfp_write_complete: high_write allread set!"); + pdev->high_write= 1; + pdev->wr_ok= ok; + mutex_exit(&pdev->high_mutex); + ddi_trigger_softintr(pdev->soft_int_id); + } else + nfp_write_complete_final( pdev, ok ); + nfp_log( NFP_DBG2,"nfp_write_complete: exiting"); +} + +static void nfp_write_complete_final( nfp_dev *pdev, int ok) { + struct buf *local_wr_bp; + nfp_log( NFP_DBG2,"nfp_write_complete_final: entering"); + if(pdev->wrtimeout) + (void) untimeout(pdev->wrtimeout); + + if (!pdev->wr_bp) { + nfp_log( NFP_DBG2, "nfp_write_complete_final: write: wr_bp == NULL." ); + return; + } + + bp_mapout(pdev->wr_bp); + pdev->wr_bp->b_resid = ok ? 0 : pdev->wr_bp->b_bcount; + /* Make sure we set wr_ready before calling biodone to avoid a race */ + pdev->wr_ready = 1; + bioerror(pdev->wr_bp, ok ? 0 : ENXIO); + local_wr_bp = pdev->wr_bp; + pdev->wr_bp = 0; + biodone(local_wr_bp); + nfp_log( NFP_DBG2, "nfp_write_complete_final: isr_mutex extited"); + pollwakeup (&pdev->pollhead, POLLWRNORM); + + nfp_log( NFP_DBG2, "nfp_write_complete_final: leaving"); +} + +static void nfp_wrtimeout( void *pdev_in ) +{ + nfp_dev *pdev= (nfp_dev *)pdev_in; + + nfp_log( NFP_DBG1, "nfp_wrtimeout: write timed out"); + + if (!pdev) { + nfp_log( NFP_DBG1, "nfp_wrtimeout: NULL pdev." ); + return; + } + pdev->wrtimeout= 0; + nfp_write_complete_final(pdev, 0); +} + +/* ARGSUSED */ +static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp) { + int ret; + nfp_log( NFP_DBG2, "nfp_write: entered." 
); + if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) { + nfp_log( NFP_DBG1, "nfp_chread: unable to get nfp_dev."); + return (ENODEV); + } + nfp_log( NFP_DBG2, "nfp_write: about to physio." ); + ret = physio(nfp_strategy, (struct buf *)0, dev, B_WRITE, minphys, uiop ); + if(ret) + nfp_log( NFP_DBG1, "nfp_write: physio returned %x.", ret ); + return ret; +} + +/*-------------------------*/ +/* nfp_strategy */ +/*-------------------------*/ + +#define NFP_STRAT_ERR(thebp,err,txt) \ + nfp_log( NFP_DBG1, "nfp_strategy: " txt ".\n"); \ + (thebp)->b_resid = (thebp)->b_bcount; \ + bioerror ((thebp), err); \ + biodone ((thebp)); + +static int nfp_strategy(struct buf *bp) { + register struct nfp_dev *pdev; + nfp_err ne; + + nfp_log( NFP_DBG2, "nfp_strategy: entered." ); + if (!(pdev = ddi_get_soft_state(state_head, getminor(bp->b_edev)))) { + NFP_STRAT_ERR (bp, ENXIO, "unable to get nfp_dev"); + return (0); + } + + if (bp->b_flags & B_READ) { + int count; + /* read */ + if (!pdev->rd_ready) { + NFP_STRAT_ERR (bp,ENXIO,"read called when not ready"); + return (0); + } + pdev->rd_ready=0; + pdev->rd_pending = 0; + if( !pdev->rd_ok) { + NFP_STRAT_ERR (bp,ENXIO,"read failed"); + return (0); + } + /* copy data from module */ + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + nfp_log( NFP_DBG3, "nfp_strategy: copying kernel read buffer"); + if( ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL) != DDI_SUCCESS ) + { + NFP_STRAT_ERR(bp,ENXIO,"ddi_dma_sync(read_dma_handle) failed"); + return (0); + } + /* LINTED: alignment */ + count= *(unsigned int *)(pdev->read_buf+4); + count= FROM_LE32_MEM(&count); + nfp_log( NFP_DBG3, "nfp_strategy: read count %d", count); + if(count<0 || count>bp->b_bcount) { + NFP_STRAT_ERR(bp,ENXIO,"bad read byte count from device"); + nfp_log( NFP_DBG1, "nfp_strategy: bad read byte count (%d) from device", count); + return (0); + } + bp_mapin (bp); + bcopy( pdev->read_buf + 8, bp->b_un.b_addr, count ); + bp_mapout (bp); + } else { + bp_mapin 
(bp); + ne= pdev->cmddev->read_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx, &count ); + bp_mapout (bp); + if( ne != NFP_SUCCESS) { + NFP_STRAT_ERR (bp,nfp_oserr(ne),"read_block failed"); + return (0); + } + } + bioerror(bp, 0); + bp->b_resid = 0; + biodone (bp); + } else { + /* write */ + if (!pdev->wr_ready) { + NFP_STRAT_ERR (bp,ENXIO,"write called when not ready"); + return (0); + } + if (pdev->wr_bp) { + NFP_STRAT_ERR (bp,ENXIO,"wr_bp != NULL"); + return (0); + } + pdev->wrtimeout= timeout(nfp_wrtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + pdev->wr_bp = bp; + pdev->wr_ready = 0; + bp_mapin (bp); + ne= pdev->cmddev->write_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx); + if( ne != NFP_SUCCESS ) { + bp_mapout (bp); + (void) untimeout(pdev->wrtimeout); + pdev->wr_bp = 0; + pdev->wr_ready = 1; + NFP_STRAT_ERR (bp,nfp_oserr(ne),"write failed"); + return (0); + } + } + nfp_log( NFP_DBG2, "nfp_strategy: leaving"); + + return (0); +} + + +/*--------------------*/ +/* poll / select */ +/*--------------------*/ + +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) { + nfp_dev *pdev; + short revents; + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_chpoll: unable to get nfp_dev"); + *reventsp=0; + return (0); + } + nfp_log( NFP_DBG2, "nfp_chpoll: entered %x", events); + + revents=0; + if (events&POLLWRNORM) { + if (pdev->wr_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: write ready"); + revents|=POLLWRNORM; + } + } + + if (events&POLLRDNORM) { + if (pdev->rd_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: read ready"); + revents|=POLLRDNORM; + } + } + + if (!revents && !anyyet) { + *phpp=&pdev->pollhead; + } + *reventsp=revents; + + nfp_log( NFP_DBG2, "nfp_chpoll: leaving"); + return (0); +} + + +/*--------------------*/ +/* ioctl */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int 
mode, cred_t *credp, int *rvalp) {
  register struct nfp_dev *pdev;

  nfp_log( NFP_DBG2, "nfp_ioctl: entered." );

  if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) {
    nfp_log( NFP_DBG1, "nfp_ioctl: unable to get nfp dev.");
    return (ENXIO);
  }

  switch (cmd) {
  /*
   * NFDEV_IOCTL_ENQUIRY: copy out the bus and slot number decoded from the
   * first word of the "reg" property.  Both stay at their all-ones
   * defaults if the property cannot be read.
   */
  case NFDEV_IOCTL_ENQUIRY:
    {
      long *outp = NULL;
      int outlen = 0;
      nfdev_enquiry_str enq_data;

      /*
       * The whole structure is copied to userland, so zero it first.
       * Padding or fields not assigned below would otherwise carry
       * kernel stack contents.
       */
      bzero(&enq_data, sizeof(enq_data));
      enq_data.busno = (unsigned int)-1;
      enq_data.slotno = (unsigned char)-1;

      /* get our bus and slot num */
      /*
       * Only DDI_PROP_SUCCESS fills in outp/outlen.  The old test
       * ("!= DDI_PROP_NOT_FOUND") treated every other failure code as
       * success and then read both uninitialised.
       */
      if (ddi_getlongprop (DDI_DEV_T_NONE,
          pdev->dip, 0, "reg",
          (caddr_t)&outp, &outlen) == DDI_PROP_SUCCESS) {
        nfp_log( NFP_DBG2, "ddi_getlongprop('reg') ok." );
        if( outlen >= (int)sizeof(*outp) ) {
          enq_data.busno = ((*outp)>>16) & 0xff;
          enq_data.slotno = ((*outp)>>11) & 0x1f;
          nfp_log( NFP_DBG2, "busno %d, slotno %d.",
                enq_data.busno, enq_data.slotno );
        }
        /*
         * ddi_getlongprop() allocates the buffer and the caller must free
         * it; it used to leak on every ENQUIRY call.
         */
        kmem_free(outp, outlen);
      } else
        nfp_log( NFP_DBG1, "ddi_getlongprop('reg') failed." );

      if( ddi_copyout( (char *)&enq_data, (void *)arg, sizeof(enq_data), mode ) != 0 ) {
        nfp_log( NFP_DBG1, "ddi_copyout() failed." );
        return EFAULT;
      }
    }
    break;

  case NFDEV_IOCTL_ENSUREREADING:
    {
      unsigned int addr, len;
      nfp_err ret;
      if( ddi_copyin( (void *)arg, (char *)&len, sizeof(unsigned int), mode ) != 0 ) {
        nfp_log( NFP_DBG1, "ddi_copyin() failed."
); + return (EFAULT); + } + /* signal a read to the module */ + nfp_log( NFP_DBG2, "nfp_ioctl: signalling read request to module, len = %x.", len ); + if (len>8192) { + nfp_log( NFP_DBG1, "nfp_ioctl: len >8192 = %x.", len ); + return EINVAL; + } + if (pdev->rd_outstanding==1) { + nfp_log( NFP_DBG1, "nfp_ioctl: not about to call read with read outstanding."); + return EIO; + } + + addr= 0; + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + if( len > NFP_READBUF_SIZE ) { + nfp_log( NFP_DBG1, "nfp_ioctl: len > NFP_READBUF_SIZE = %x.", len ); + return EINVAL; + } + addr= pdev->read_dma_cookie.dmac_address; + } + + pdev->rd_outstanding = 1; + nfp_log( NFP_DBG2,"nfp_ioctl: pdev->rd_outstanding=1"); + + /* setup timeout timer */ + pdev->rdtimeout= timeout(nfp_rdtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + + nfp_log( NFP_DBG2, "nfp_ioctl: read request"); + ret = pdev->cmddev->ensure_reading(addr, len, pdev->common.cmdctx); + if ( ret != NFP_SUCCESS ) { + (void) untimeout(pdev->rdtimeout); + pdev->rdtimeout = 0; + pdev->rd_outstanding = 0; + nfp_log( NFP_DBG1, "nfp_ioctl : cmddev->ensure_reading failed "); + return nfp_oserr( ret ); + } + } + break; + + case NFDEV_IOCTL_PCI_IFVERS: + { + int vers; + + nfp_log( NFP_DBG2, "nfp_ioctl: NFDEV_IOCTL_PCI_IFVERS"); + + if( ddi_copyin( (void *)arg, (char *)&vers, sizeof(vers), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + + if( pdev->rd_outstanding ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d as read outstanding", vers); + return EIO; + } + + nfp_set_ifvers(pdev, vers); + if( pdev->ifvers != vers ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d", vers); + return EIO; + } + } + break; + + case NFDEV_IOCTL_STATS: + { + if( ddi_copyout( (char *)&(pdev->common.stats), + (void *)arg, + sizeof(nfdev_stats_str), + mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." 
); + return EFAULT; + } + } + break; + + default: + nfp_log( NFP_DBG1, "nfp_ioctl: unknown ioctl." ); + return EINVAL; + } + + return 0; +} + +/*-------------------------*/ +/* nfp_open */ +/*-------------------------*/ + +/* ARGSUSED */ +int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + nfp_err ret; + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "entered nfp_open." ); + + pdev = (nfp_dev *)ddi_get_soft_state(state_head, getminor(*dev)); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_open: unable to get nfp dev."); + return (ENODEV); + } + + if( otyp != OTYP_CHR ) { + nfp_log( NFP_DBG1, "nfp_open: not opened as character device"); + return (EINVAL); + } + + mutex_enter(&pdev->busy_mutex); + + if (pdev->busy) { + mutex_exit(&pdev->busy_mutex); + nfp_log( NFP_DBG1, "nfp_open: device busy"); + return EBUSY; + } + pdev->busy= 1; + mutex_exit(&pdev->busy_mutex); + + /* use oldest possible interface until told otherwise */ + pdev->ifvers= NFDEV_IF_STANDARD; + nfp_log( NFP_DBG3, "nfp_open: setting ifvers %d", pdev->ifvers); + pdev->rd_ready= 0; /* drop any old data */ + + ret = pdev->cmddev->open(pdev->common.cmdctx); + if( ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, "nfp_open : cmddev->open failed "); + return nfp_oserr( ret ); + } + + nfp_log( NFP_DBG2, "nfp_open: done"); + + return 0; +} + +/*--------------------*/ +/* nfp_close */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp) { + nfp_dev *pdev; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_close: entered"); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor(dev)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + mutex_enter(&pdev->isr_mutex); + if(pdev->rd_outstanding) { + int lbolt, err; + nfp_get_lbolt(&lbolt, err); + if(!err) + (void) cv_timedwait(&pdev->rd_cv, &pdev->isr_mutex, lbolt + (NFP_TIMEOUT_SEC * drv_usectohz(1000000)) ); + } + 
mutex_exit(&pdev->isr_mutex); + ret = pdev->cmddev->close(pdev->common.cmdctx); + if (ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, " nfp_close : cmddev->close failed"); + return nfp_oserr( ret ); + } + + mutex_enter(&pdev->busy_mutex); + pdev->busy= 0; + mutex_exit(&pdev->busy_mutex); + + return 0; +} + +/**************************************************************************** + + nfp driver config + + ****************************************************************************/ + +/*-------------------------*/ +/* nfp_getinfo */ +/*-------------------------*/ + +/* ARGSUSED */ +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { + int error; + nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_getinfo: entered" ); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor((dev_t)arg)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if (pdev == NULL) { + *result = NULL; + error = DDI_FAILURE; + } else { + /* + * don't need to use a MUTEX even though we are + * accessing our instance structure; dev->dip + * never changes. + */ + *result = pdev->dip; + error = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)getminor((dev_t)arg); + error = DDI_SUCCESS; + break; + default: + *result = NULL; + error = DDI_FAILURE; + } + + nfp_log( NFP_DBG2, "nfp_getinfo: leaving." 
); + return (error); +} + +/*-------------------------*/ +/* nfp_release */ +/*-------------------------*/ + +static int nfp_release_dev( dev_info_t *dip ) { + nfp_dev *pdev; + int instance, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_release_dev: entering" ); + + instance = ddi_get_instance(dip); + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if (pdev) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing device" ); + + nfp_free_pci_push(pdev); + + if( pdev->cmddev ) { + nfp_log( NFP_DBG3, "nfp_release_dev: destroying cmd dev" ); + ret = pdev->cmddev->destroy(pdev->common.cmdctx); + if (ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, " nfp_release_dev : cmddev->destroy failed "); + return nfp_oserr( ret ); + } + } + + if(pdev->high_iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing high and soft irq" ); + ddi_remove_softintr(pdev->soft_int_id); + ddi_remove_intr(pdev->dip, 0, pdev->high_iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + mutex_destroy( &pdev->high_mutex ); + } else if(pdev->iblock_cookie) { + nfp_log( NFP_DBG3, "nfp_release_dev: removing irq" ); + ddi_remove_intr(pdev->dip, 0, pdev->iblock_cookie); + mutex_destroy( &pdev->busy_mutex ); + cv_destroy( &pdev->rd_cv ); + mutex_destroy( &pdev->isr_mutex ); + } + if(pdev->low_iblock_cookie) { + ddi_remove_intr(pdev->dip, 0, pdev->low_iblock_cookie); + mutex_destroy( &pdev->low_mutex); + } + + for(i=0;i<6;i++) { + if( pdev->common.extra[i] ) { + nfp_log( NFP_DBG3, "nfp_release_dev: unmapping BAR %d", i ); + ddi_regs_map_free ((ddi_acc_handle_t *)&pdev->common.extra[i]); + } + } + + ddi_remove_minor_node(dip, NULL); + + if (pdev->conf_handle) + pci_config_teardown( &pdev->conf_handle ); + + ddi_soft_state_free(state_head, instance); + } + nfp_log( NFP_DBG2, "nfp_release: finished" ); + + return DDI_SUCCESS; +} + + +/*-------------------------*/ +/* nfp_attach */ +/*-------------------------*/ + +static 
int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { + int instance; + nfp_dev *pdev = NULL; + int intres; + uint16_t device, vendor, sub_device, sub_vendor; + long *outp; + nfpcmd_dev const *cmddev; + int index, i; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_attach: entered." ); + + if (cmd != DDI_ATTACH) { + nfp_log( NFP_DBG1, "nfp_attach: bad command." ); + goto bailout; + } + + instance = ddi_get_instance(dip); + + if (ddi_soft_state_zalloc(state_head, instance) != 0) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_soft_state_zalloc() failed." ); + goto bailout; + } + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_attach: cannot find dev."); + return ENODEV; + } + pdev->dip = dip; + + /* map in pci config registers */ + if (pci_config_setup(dip, &pdev->conf_handle)) { + nfp_log( NFP_DBG1, "nfp_attach: pci_config_setup() failed." ); + goto bailout; + } + + /* find out what we have got */ + vendor= PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_VENID ); + device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_DEVID ); + sub_vendor = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBVENID ); + sub_device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBSYSID ); + + index= 0; + while( (cmddev = nfp_drvlist[index++]) != NULL ) { + if( cmddev->vendorid == vendor && + cmddev->deviceid == device && + cmddev->sub_vendorid == sub_vendor && + cmddev->sub_deviceid == sub_device ) + break; + } + if( !cmddev ) { + nfp_log( NFP_DBG1, "nfp_attach: unknonw device." 
); + goto bailout; + } + + /* map BARs */ + for( i=0; i<6; i++ ) { + if( cmddev->bar_sizes[i] ) { + off_t size; + if( ddi_dev_regsize(dip, i+1, &size) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_dev_regsize() failed for BAR %d", i ); + goto bailout; + } + if( size < (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) ) { + nfp_log( NFP_DBG1, "nfp_attach: BAR %d too small %x (%x)", i, size, (cmddev->bar_sizes[i] & ~0xF) ); + goto bailout; + } + if (ddi_regs_map_setup(dip, i+1, (caddr_t *)&pdev->common.bar[i], + 0, cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK, &nosw_attr, (ddi_acc_handle_t *)&pdev->common.extra[i] )) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_regs_map_setup() failed for BAR %d", i ); + goto bailout; + } + nfp_log( NFP_DBG3, "nfp_attach: BAR[%d] mapped to %x (%x)", i, pdev->common.bar[i], size ); + } + } + + pdev->read_buf = NULL; + pdev->rd_dma_ok = 0; + + /* attach to minor node */ + if (ddi_create_minor_node(dip, "nfp", S_IFCHR, instance, (char *)cmddev->name, 0) == DDI_FAILURE) { + ddi_remove_minor_node(dip, NULL); + nfp_log( NFP_DBG1, "nfp_attach: ddi_create_minor_node() failed." ); + goto bailout; + } + + pdev->wr_ready = 1; + pdev->rd_ready = 0; + pdev->rd_pending = 0; + pdev->rd_outstanding = 0; + pdev->busy=0; + pdev->cmddev= cmddev; + + ret = pdev->cmddev->create(&pdev->common); + if( ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: failed to create command device"); + goto bailout; + } + pdev->common.dev= pdev; + + if (ddi_intr_hilevel(dip, 0) != 0){ + nfp_log( NFP_DBG2, "nfp_attach: high-level interrupt"); + if( ddi_get_iblock_cookie(dip, 0, &pdev->high_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(high) failed." ); + goto bailout; + } + if( ddi_get_iblock_cookie(dip, 0, &pdev->low_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(low) failed." 
); + goto bailout; + } + mutex_init(&pdev->high_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->high_iblock_cookie); + mutex_init(&pdev->low_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->low_iblock_cookie); + if (ddi_add_intr(dip, 0, NULL, + NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr(high) failed." ); + goto bailout; + } + if( ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_HIGH, + &pdev->iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(soft) failed." ); + goto bailout; + } + mutex_init(&pdev->isr_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->iblock_cookie); + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &pdev->soft_int_id, + &pdev->iblock_cookie, NULL, + nfp_soft_isr, (caddr_t)pdev) != DDI_SUCCESS) + goto bailout; + pdev->high_intr= 1; + } else { + nfp_log( NFP_DBG2, "nfp_attach: low-level interrupt"); + + if (ddi_get_iblock_cookie (dip, 0, &pdev->iblock_cookie)) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie() failed." ); + goto bailout; + } + + mutex_init(&pdev->isr_mutex, "nfp isr mutex", MUTEX_DRIVER, (void *)pdev->iblock_cookie); + + if (ddi_add_intr(dip, 0, NULL, + (ddi_idevice_cookie_t *)NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr() failed." ); + goto bailout; + } + } + mutex_init(&pdev->busy_mutex, "nfp busy mutex", MUTEX_DRIVER, NULL ); + cv_init(&pdev->rd_cv, "nfp read condvar", CV_DRIVER, NULL ); + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &intres) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "nfp_attach: ddi_getlongprop('reg') ok." ); + if( intres > 0 ) { + nfp_log( NFP_DBG1, "nfp_attach: found PCI nfast bus %x slot %x.", + ((*outp)>>16) & 0xff, ((*outp)>>11) & 0x1f ); + } + } + + nfp_log( NFP_DBG2, "nfp_attach: attach succeeded." 
); + return DDI_SUCCESS; + +bailout: + (void) nfp_release_dev( dip ); + + return DDI_FAILURE; +} + +/*-------------------------*/ +/* nfp_detach */ +/*-------------------------*/ + +/* + * When our driver is unloaded, nfp_detach cleans up and frees the resources + * we allocated in nfp_attach. + */ +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + (void) nfp_release_dev(dip); + + return (DDI_SUCCESS); +} + +/*-------------------------*/ +/* _init */ +/*-------------------------*/ + +int _init(void) { + register int error; + + nfp_log( NFP_DBG2, "_init: entered" ); + + if ((error = ddi_soft_state_init(&state_head, sizeof (struct nfp_dev), 1)) != 0) { + nfp_log( NFP_DBG1, "_init: soft_state_init() failed" ); + return (error); + } + + if ((error = mod_install(&modlinkage)) != 0) { + nfp_log( NFP_DBG1, "_init: mod_install() failed" ); + ddi_soft_state_fini(&state_head); + } + + nfp_log( NFP_DBG2, "_init: leaving" ); + return (error); +} + +/*-------------------------*/ +/* _info */ +/*-------------------------*/ + +int _info(struct modinfo *modinfop) { + nfp_log( NFP_DBG2, "_info: entered" ); + + return (mod_info(&modlinkage, modinfop)); +} + +/*-------------------------*/ +/* _fini */ +/*-------------------------*/ + +int _fini(void) { + int status; + + nfp_log( NFP_DBG2, "_fini: entered" ); + + if ((status = mod_remove(&modlinkage)) != 0) { + nfp_log( NFP_DBG2, "_fini: mod_remove() failed." 
); + return (status); + } + + ddi_soft_state_fini(&state_head); + + nfp_log( NFP_DBG2, "_fini: leaving" ); + + return (status); +} + diff --git a/usr/src/uts/common/io/nfp/i21285.c b/usr/src/uts/common/io/nfp/i21285.c new file mode 100644 index 0000000000..f51a09188d --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.c @@ -0,0 +1,310 @@ +/* + +i21285.c: nCipher PCI HSM intel/digital 21285 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "i21285.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* create ------------------------------------------------------- */ + +static nfp_err i21285_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + nfp_log( NFP_DBG2, "i21285_create: enable doorbell"); + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, DOORBELL_ENABLE | POSTLIST_ENABLE); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21285_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, DOORBELL_DISABLE | POSTLIST_DISABLE ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + 
+/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_open( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21285_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + unsigned int doorbell; + unsigned int tmp32; + + nfp_log( NFP_DBG3, "i21285_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_isr: NULL pdev"); + return NFP_ENODEV; + } + + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 
1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + nfp_log( NFP_DBG1, "i21285_isr: unexpected interrupt %x", doorbell ); + TO_LE32_IO( &tmp32, 0xffff & doorbell ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + } + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + } + return 0; +} + +/* write ------------------------------------------------------- */ + +static nfp_err i21285_write( const char *block, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_write: NULL pdev"); + return NFP_ENODEV; + } + + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ MEMBAR ]= %x\n", cdev->bar[ MEMBAR ]); + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ IOBAR ]= %x\n", cdev->bar[ IOBAR ]); + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_write: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_write: length not written"); + return NFP_EIO; + } + + TO_LE32_IO( &tmp32, NFAST_INT_HOST_WRITE_REQUEST); + + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log( 
NFP_DBG2, "i21285_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21285_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21285_read: entered, len %d", len); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_read: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_read: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if(ne) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21285_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if( ne ) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_to_user_from_dev failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21285_read: done"); + *rcount= count; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21285_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure reading -------------------------------------------------- */ + +static nfp_err i21285_ensure_reading( unsigned int addr, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[2]; + unsigned int tmp32; + nfp_err ne; + + nfp_log( NFP_DBG2, "i21285_ensure_reading: entered"); + + if(addr) { + nfp_log( NFP_DBG2, "i21285_ensure_reading: bad addr"); + return -NFP_EINVAL; + } + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: null BAR[%d]", 
MEMBAR ); + return NFP_ENXIO; + } + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + TO_LE32_MEM( &hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM( &hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: len not written"); + return NFP_EIO; + }; + TO_LE32_IO( &tmp32, NFAST_INT_HOST_READ_REQUEST ); + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + return NFP_SUCCESS; +} + +/* command device structure ------------------------------------- */ + + +const nfpcmd_dev i21285_cmddev = { + "nCipher Gen 1 PCI", + PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285, + PCI_VENDOR_ID_NCIPHER, PCI_DEVICE_ID_NFAST_GEN1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21285_create, + i21285_destroy, + i21285_open, + i21285_close, + i21285_isr, + i21285_write, + i21285_read, + i21285_chupdate, + i21285_ensure_reading, + 0, /* no debug */ +}; + diff --git a/usr/src/uts/common/io/nfp/i21285.h b/usr/src/uts/common/io/nfp/i21285.h new file mode 100644 index 0000000000..4ea1d853ec --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.h @@ -0,0 +1,43 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_I21285_H +#define NFP_I21285_H + +#ifndef PCI_VENDOR_ID_DEC +#define PCI_VENDOR_ID_DEC 0x1011 +#endif +#ifndef 
PCI_DEVICE_ID_DEC_21285 +#define PCI_DEVICE_ID_DEC_21285 0x1065 +#endif +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_DEVICE_ID_NFAST_GEN1 +#define PCI_DEVICE_ID_NFAST_GEN1 0x0100 +#endif + +#define I21285_OFFSET_DOORBELL 0x60 +#define I21285_OFFSET_INTERRUPT_MASK 0x34 + +#define DOORBELL_ENABLE 0x0 +#define DOORBELL_DISABLE 0x4 + +#define POSTLIST_ENABLE 0x0 +#define POSTLIST_DISABLE 0x8 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x80 +#define MEMSIZE 0x100000 + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555.c b/usr/src/uts/common/io/nfp/i21555.c new file mode 100644 index 0000000000..82024dc800 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.c @@ -0,0 +1,423 @@ +/* + +i21555.c: nCipher PCI HSM intel 21555 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "i21555.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* started ------------------------------------------------------ + * + * Check that device is ready to talk, by checking that + * the i21555 has master enabled on its secondary interface + */ + +static nfp_err i21555_started( nfp_cdev *pdev ) { + unsigned int tmp32; +#ifdef CONFIGSPACE_DEBUG + unsigned int reg32[64]; + int i; +#endif + nfp_err ne; + + nfp_log( NFP_DBG2, "i21555_started: entered"); + +#ifdef CONFIGSPACE_DEBUG + /* Suck up all the registers */ + for (i=0; i < 64; i++) { + ne = nfp_config_inl( pdev, i*4, &reg32[i] ); + } + + for (i=0; i < 16; i++) { + int j = i * 4; + nfp_log( NFP_DBG3, "i21555 config reg %2x: %08x %08x %08x %08x", j*4, + reg32[j], reg32[j+1], reg32[j+2], reg32[j+3]); + } +#endif + + ne = nfp_config_inl( pdev, I21555_CFG_SEC_CMD_STATUS, &tmp32 ); + if (ne) { + 
/* succeed if PCI config reads are not implemented */ + if (ne == NFP_EUNKNOWN) + return NFP_SUCCESS; + nfp_log( NFP_DBG1, "i21555_started: nfp_config_inl failed"); + return ne; + } + + tmp32= FROM_LE32_IO(&tmp32) & 0xffff; + + if ( tmp32 & CFG_CMD_MASTER ) { + nfp_log( NFP_DBG3, "i21555_started: Yes %x", tmp32); + return NFP_SUCCESS; + } else { + nfp_log( NFP_DBG1, "i21555_started: device not started yet %x", tmp32); + return NFP_ESTARTING; + } +} + +/* create ------------------------------------------------------- */ + +static nfp_err i21555_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + nfp_log( NFP_DBG2, "i21555_create: enable doorbell"); + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_ENABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21555_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_DISABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_open( void * ctx ) { + + nfp_log( NFP_DBG2, "i21555_open: entered"); + + return NFP_SUCCESS; +} + 
+/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21555_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21555_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + nfp_err ne; + unsigned short doorbell; + unsigned short tmp16; + + nfp_log( NFP_DBG3, "i21555_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_isr: NULL pdev"); + return NFP_ENODEV; + } + + pdev->stats.isr++; + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_isr: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + /* This interrupt may not be from our module, so check that it actually is + * us before handling it. + */ + ne = i21555_started( pdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_isr: i21555_started failed"); + } + return ne; + } + + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + pdev->stats.isr_write++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + pdev->stats.isr_read++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 
1 : 0 );
+      nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0);
+    }
+
+    if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED |
+                     NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) {
+      /* bits we do not service: acknowledge them so the loop can terminate */
+      TO_LE16_IO(&tmp16,doorbell);
+      nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 );
+      nfp_log( NFP_DBG1, "i21555_isr: unexpected interrupt %x", doorbell );
+    }
+    /* re-sample the doorbell; loop until it reads 0 (or 0xffff) */
+    doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET);
+    doorbell= FROM_LE16_IO(&doorbell);
+  }
+  nfp_log( NFP_DBG3, "i21555_isr: exiting");
+  return 0;
+}
+
+/* write ------------------------------------------------------- */
+
+/*
+ * Submit one job block to the device.
+ *
+ *   block  buffer handed to nfp_copy_from_user_to_dev()
+ *   len    number of bytes in block
+ *   ctx    the nfp_cdev for the device
+ *
+ * The data is copied to NFPCI_JOBS_WR_DATA in MEMBAR, then an 8 byte
+ * little-endian header { NFPCI_JOB_CONTROL, len } is written to
+ * NFPCI_JOBS_WR_CONTROL.  The length word is read back to check it
+ * landed, and only then is the secondary doorbell rung with the
+ * write-request bit.
+ *
+ * stats.write_fail is bumped on entry and taken back on success, so
+ * every early return is counted as a failure.
+ *
+ * Returns NFP_SUCCESS, NFP_ENODEV (NULL ctx), NFP_ENOMEM (IOBAR not
+ * mapped), NFP_EIO (length read-back mismatch), or whatever
+ * i21555_started() or the copy helpers return.
+ */
+static nfp_err i21555_write( const char *block, int len, void *ctx) {
+  nfp_cdev *cdev;
+  unsigned int hdr[2];
+  nfp_err ne;
+  unsigned short tmp16;
+  unsigned int tmp32;
+
+  nfp_log( NFP_DBG2, "i21555_write: entered");
+
+  cdev= (nfp_cdev *)ctx;
+  if(!cdev) {
+    nfp_log( NFP_DBG1, "i21555_write: NULL cdev");
+    return NFP_ENODEV;
+  }
+
+  cdev->stats.write_fail++;
+
+  if(!cdev->bar[ IOBAR ]) {
+    nfp_log( NFP_DBG1, "i21555_write: null BAR[%d]", IOBAR );
+    return NFP_ENOMEM;
+  }
+
+  ne = i21555_started( cdev );
+  if (ne) {
+    if (ne != NFP_ESTARTING) {
+      nfp_log( NFP_DBG1, "i21555_write: i21555_started failed");
+    }
+    return ne;
+  }
+
+  /* bar[] holds pointers: print them with %p, not %x */
+  nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ MEMBAR ]= %p", (void *)cdev->bar[ MEMBAR ]);
+  nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ IOBAR ]= %p", (void *)cdev->bar[ IOBAR ]);
+  nfp_log( NFP_DBG3, "i21555_write: block len %d", len );
+  ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_user_to_dev failed");
+    return ne;
+  }
+  TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+  TO_LE32_MEM(&hdr[1], len);
+  ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_write: nfp_copy_to_dev failed");
+    return ne;
+  }
+
+  ne= nfp_copy_from_dev( cdev, MEMBAR,
NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_write: length not written"); + return NFP_EIO; + } + TO_LE16_IO(&tmp16, NFAST_INT_HOST_WRITE_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.write_fail--; + cdev->stats.write_block++; + cdev->stats.write_byte += len; + + nfp_log( NFP_DBG2, "i21555_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21555_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21555_read: entered"); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_read: NULL pdev"); + return NFP_ENODEV; + } + + cdev->stats.read_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_read: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21555_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_to_user failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21555_read: done"); + *rcount= count; + cdev->stats.read_fail--; + cdev->stats.read_block++; + cdev->stats.read_byte += len; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21555_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure 
reading -------------------------------------------------- */
+
+/*
+ * Ask the device to produce a reply of up to len bytes.
+ *
+ *   addr  when non-zero, placed in the header as the third word and
+ *         the request is flagged NFPCI_JOB_CONTROL_PCI_PUSH;
+ *         when zero a plain NFPCI_JOB_CONTROL request is issued
+ *   len   reply length requested
+ *   ctx   the nfp_cdev for the device
+ *
+ * The little-endian header (12 bytes with addr, 8 without) is written
+ * to NFPCI_JOBS_RD_CONTROL, the length word is read back from
+ * NFPCI_JOBS_RD_LENGTH to check it landed, and then the secondary
+ * doorbell is rung with the read-request bit.
+ *
+ * stats.ensure_fail is bumped on entry and taken back on success.
+ */
+static nfp_err i21555_ensure_reading( unsigned int addr, int len, void *ctx ) {
+  nfp_cdev *cdev;
+  unsigned int hdr[3];
+  unsigned short tmp16;
+  unsigned int tmp32;
+  nfp_err ne;
+  int hdr_len;
+
+  nfp_log( NFP_DBG2, "i21555_ensure_reading: entered");
+
+  cdev= (nfp_cdev *)ctx;
+  if(!cdev) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: NULL pdev");
+    return NFP_ENODEV;
+  }
+
+  cdev->stats.ensure_fail++;
+
+  if(!cdev->bar[ IOBAR ]) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: null BAR[%d]", IOBAR );
+    return NFP_ENOMEM;
+  }
+
+  ne = i21555_started( cdev );
+  if (ne) {
+    if (ne != NFP_ESTARTING) {
+      nfp_log( NFP_DBG1, "i21555_ensure_reading: i21555_started failed");
+    }
+    return ne;
+  }
+
+  /* bar[] holds pointers: print them with %p, not %x */
+  nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ MEMBAR ]= %p", (void *)cdev->bar[ MEMBAR ]);
+  nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ IOBAR ]= %p", (void *)cdev->bar[ IOBAR ]);
+  if(addr) {
+    nfp_log( NFP_DBG3, "i21555_ensure_reading: new format, addr %x", addr);
+    TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL_PCI_PUSH);
+    TO_LE32_MEM(&hdr[1], len);
+    TO_LE32_MEM(&hdr[2], addr);
+    hdr_len= 12;
+  } else {
+    TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL);
+    TO_LE32_MEM(&hdr[1], len);
+    hdr_len= 8;
+  }
+  ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, hdr_len);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_to_dev failed");
+    return ne;
+  }
+
+  /* read the length word back to confirm the header reached the device */
+  ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4);
+  if (ne) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_from_dev failed");
+    return ne;
+  }
+
+  TO_LE32_MEM(&tmp32, len);
+
+  if ( hdr[0] != tmp32 ) {
+    nfp_log( NFP_DBG1, "i21555_ensure_reading: len not written");
+    return NFP_EIO;
+  }
+  TO_LE16_IO( &tmp16, NFAST_INT_HOST_READ_REQUEST >> 16);
+  nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16);
+
+  cdev->stats.ensure_fail--;
+  cdev->stats.ensure++;
+
+  return NFP_SUCCESS;
+}
+
+/* command 
device structure ------------------------------------- */ + +const nfpcmd_dev i21555_cmddev = { + "nCipher Gen 2 PCI", + PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_21555, + PCI_VENDOR_ID_NCIPHER, PCI_SUBSYSTEM_ID_NFAST_REV1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE_JOBS, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21555_create, + i21555_destroy, + i21555_open, + i21555_close, + i21555_isr, + i21555_write, + i21555_read, + i21555_chupdate, + i21555_ensure_reading, + i21555_debug, +}; diff --git a/usr/src/uts/common/io/nfp/i21555.h b/usr/src/uts/common/io/nfp/i21555.h new file mode 100644 index 0000000000..d8f3965938 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.h @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef I21555_H +#define I21555_H + +#ifndef PCI_VENDOR_ID_INTEL +#define PCI_VENDOR_ID_INTEL 0x8086 +#endif + +#ifndef PCI_DEVICE_ID_INTEL_21555 +#define PCI_DEVICE_ID_INTEL_21555 0xb555 +#endif + +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_SUBSYSTEM_ID_NFAST_REV1 +#define PCI_SUBSYSTEM_ID_NFAST_REV1 0x0100 +#endif + +#define I21555_OFFSET_DOORBELL_PRI_SET 0x9C +#define I21555_OFFSET_DOORBELL_SEC_SET 0x9E +#define I21555_OFFSET_DOORBELL_PRI_CLEAR 0x98 + +#define I21555_OFFSET_DOORBELL_PRI_SET_MASK 0xA4 +#define I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK 0xA0 + +#define I21555_DOORBELL_PRI_ENABLE 0x0000 +#define I21555_DOORBELL_PRI_DISABLE 0xFFFF + +#define I21555_CFG_SEC_CMD_STATUS 0x44 + +#define CFG_CMD_MASTER 0x0004 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x100 + +extern nfp_err i21555_debug( int cmd, void *ctx ); + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555d.c b/usr/src/uts/common/io/nfp/i21555d.c new file mode 100644 index 0000000000..183ace8275 --- /dev/null +++ 
b/usr/src/uts/common/io/nfp/i21555d.c @@ -0,0 +1,28 @@ +/* + +i21555d.c: nCipher PCI HSM intel 21555 debug ioctl + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +15/05/2002 jsh Original, does nothing + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "i21555.h" + +/* ARGSUSED */ +nfp_err i21555_debug( int cmd, void *ctx) { + nfp_log( NFP_DBG1, "i21555_debug: entered"); + + return NFP_EUNKNOWN; +} diff --git a/usr/src/uts/common/io/nfp/nfdev-common.h b/usr/src/uts/common/io/nfp/nfdev-common.h new file mode 100644 index 0000000000..8a97bf2c63 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-common.h @@ -0,0 +1,141 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ +/** \file nfdev-common.h + * + * \brief nFast device driver (not generic SCSI) ioctl struct definition file + * include NFDEV-$(system) for ioctl number definitions + * + * 1998.07.13 jsh Started + * + * + */ + +#ifndef NFDEV_COMMON_H +#define NFDEV_COMMON_H + +/** + * Result of the ENQUIRY ioctl. + */ +typedef struct nfdev_enquiry_str { + unsigned int busno; /**< Which bus is the PCI device on. */ + unsigned char slotno; /**< Which slot is the PCI device in. */ + unsigned char reserved[3]; /**< for consistant struct alignment */ +} nfdev_enquiry_str; + +/** + * Result of the STATS ioctl. + */ +typedef struct nfdev_stats_str { + unsigned long isr; /**< Count interrupts. */ + unsigned long isr_read; /**< Count read interrupts. */ + unsigned long isr_write; /**< Count write interrupts. */ + unsigned long write_fail; /**< Count write failures. */ + unsigned long write_block; /**< Count blocks written. */ + unsigned long write_byte; /**< Count bytes written. 
*/ + unsigned long read_fail; /**< Count read failures. */ + unsigned long read_block; /**< Count blocks read. */ + unsigned long read_byte; /**< Count bytes read. */ + unsigned long ensure_fail; /**< Count read request failures. */ + unsigned long ensure; /**< Count read requests. */ +} nfdev_stats_str; + +/** + * Input to the CONTROL ioctl. + */ +typedef struct nfdev_control_str { + unsigned control; /**< Control flags. */ +} nfdev_control_str; + +/** Control bit indicating host supports MOI control */ +#define NFDEV_CONTROL_HOST_MOI 0x0001 + +/** Index of control bits indicating desired mode + * + * Desired mode follows the M_ModuleMode enumeration. + */ +#define NFDEV_CONTROL_MODE_SHIFT 1 + +/** Detect a backwards-compatible control value + * + * Returns true if the requested control value "makes no difference", and so + * the failure of an attempt to set it is uninteresting. + */ +#define NFDEV_CONTROL_HARMLESS(c) ((c) <= 1) + +/** + * Result of the STATUS ioctl. + */ +typedef struct nfdev_status_str { + unsigned status; /**< Status flags. */ + char error[8]; /**< Error string. */ +} nfdev_status_str; + +/** Monitor firmware supports MOI control and error reporting */ +#define NFDEV_STATUS_MONITOR_MOI 0x0001 + +/** Application firmware supports MOI control and error reporting */ +#define NFDEV_STATUS_APPLICATION_MOI 0x0002 + +/** Application firmware running and supports error reporting */ +#define NFDEV_STATUS_APPLICATION_RUNNING 0x0004 + +/** HSM failed + * + * Consult error[] for additional information. + */ +#define NFDEV_STATUS_FAILED 0x0008 + +/** Standard PCI interface. */ +#define NFDEV_IF_STANDARD 0x01 + +/** PCI interface with results pushed from device + * via DMA. + */ +#define NFDEV_IF_PCI_PUSH 0x02 + +/* platform independent base ioctl numbers */ + +/** Enquiry ioctl. + * \return nfdev_enquiry_str describing the attached device. */ +#define NFDEV_IOCTL_NUM_ENQUIRY 0x01 +/** Channel Update ioctl. 
+ * \deprecated */ +#define NFDEV_IOCTL_NUM_CHUPDATE 0x02 +/** Ensure Reading ioctl. + * Signal a read request to the device. + * \param (unsigned int) Length of data to be read. + */ +#define NFDEV_IOCTL_NUM_ENSUREREADING 0x03 +/** Device Count ioctl. + * Not implemented on all platforms. + * \return (int) the number of attached devices. */ +#define NFDEV_IOCTL_NUM_DEVCOUNT 0x04 +/** Internal Debug ioctl. + * Not implemented in release drivers. */ +#define NFDEV_IOCTL_NUM_DEBUG 0x05 +/** PCI Interface Version ioctl. + * \param (int) Maximum PCI interface version + * supported by the user of the device. */ +#define NFDEV_IOCTL_NUM_PCI_IFVERS 0x06 +/** Statistics ioctl. + * \return nfdev_stats_str holding the driver's counters. */ +#define NFDEV_IOCTL_NUM_STATS 0x07 + +/** Module control ioctl + * \param (nfdev_control_str) Value to write to HSM control register + */ +#define NFDEV_IOCTL_NUM_CONTROL 0x08 + +/** Module state ioctl + * \return (nfdev_status_str) Values read from HSM status/error registers + */ +#define NFDEV_IOCTL_NUM_STATUS 0x09 + +#endif diff --git a/usr/src/uts/common/io/nfp/nfdev-solaris.h b/usr/src/uts/common/io/nfp/nfdev-solaris.h new file mode 100644 index 0000000000..923b902e46 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-solaris.h @@ -0,0 +1,37 @@ +/* + +nfdev-solaris.h: nFast solaris specific device ioctl interface. 
+ +(C) Copyright nCipher Corporation Ltd 1998-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +14/07/1998 jsh Original + +*/ + +#ifndef NFDEV_SOLARIS_H +#define NFDEV_SOLARIS_H + +#include "nfdev-common.h" + +#define NFDEV_IOCTL_TYPE ('n'<<8) + +#define NFDEV_IOCTL_ENQUIRY ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENQUIRY ) +#define NFDEV_IOCTL_ENSUREREADING ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENSUREREADING ) +#define NFDEV_IOCTL_DEVCOUNT ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEVCOUNT ) +#define NFDEV_IOCTL_DEBUG ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEBUG ) +#define NFDEV_IOCTL_PCI_IFVERS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_PCI_IFVERS ) +#define NFDEV_IOCTL_STATS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_STATS ) + +#endif /* NFDEV_SOLARIS_H */ diff --git a/usr/src/uts/common/io/nfp/nfp.h b/usr/src/uts/common/io/nfp/nfp.h new file mode 100644 index 0000000000..9704f04fbc --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp.h @@ -0,0 +1,113 @@ +/* + +nfp.h: nFast PCI driver for Solaris 2.5, 2.6 and 2.7 + +(C) Copyright nCipher Corporation Ltd 2001-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv + +*/ + +#ifndef NFP_H +#define NFP_H + +#ifndef _KERNEL +#error Hello? 
this is a driver, please compile with -D_KERNEL +#endif + +#if ( CH_KERNELVER < 260 ) +typedef int ioctlptr_t; +typedef unsigned short uint16_t; +#define DDI_GET32 ddi_getl +#define DDI_PUT32 ddi_putl +#define DDI_GET16 ddi_getw +#define DDI_PUT16 ddi_putw +#define DDI_REP_GET8 ddi_rep_getb +#define DDI_REP_PUT8 ddi_rep_putb +#define DDI_REP_GET32 ddi_rep_getl +#define DDI_REP_PUT32 ddi_rep_putl +#define PCI_CONFIG_GET16 pci_config_getw +#else /* ( CH_KERNELVER >= 260 ) */ +typedef intptr_t ioctlptr_t; +#define DDI_GET32 ddi_get32 +#define DDI_PUT32 ddi_put32 +#define DDI_GET16 ddi_get16 +#define DDI_PUT16 ddi_put16 +#define DDI_REP_GET8 ddi_rep_get8 +#define DDI_REP_PUT8 ddi_rep_put8 +#define DDI_REP_GET32 ddi_rep_get32 +#define DDI_REP_PUT32 ddi_rep_put32 +#define PCI_CONFIG_GET16 pci_config_get16 +#endif + +#if ( CH_KERNELVER < 270 ) +typedef int nfp_timeout_t; +#define EXTRA_CB_FLAGS 0 +#define VSXPRINTF(s, n, format, ap) vsprintf (s, format, ap) +#else /* ( CH_KERNELVER >= 270 ) */ +typedef timeout_id_t nfp_timeout_t; +#define EXTRA_CB_FLAGS D_64BIT +#define VSXPRINTF(s, n, format, ap) vsnprintf(s, n, format, ap) +#endif + +typedef struct nfp_dev { + int rd_ok; + int wr_ok; + + int ifvers; + + /* for PCI push read interface */ + unsigned char *read_buf; + ddi_dma_handle_t read_dma_handle; + ddi_dma_cookie_t read_dma_cookie; + + ddi_acc_handle_t acchandle; + + int rd_dma_ok; + + nfp_timeout_t wrtimeout; + nfp_timeout_t rdtimeout; + + struct buf *wr_bp; + int wr_ready; + int rd_ready; + int rd_pending; + int rd_outstanding; + kcondvar_t rd_cv; + + struct pollhead pollhead; + dev_info_t *dip; + + ddi_iblock_cookie_t high_iblock_cookie; /* for mutex */ + ddi_iblock_cookie_t low_iblock_cookie; /* for mutex */ + kmutex_t high_mutex; + kmutex_t low_mutex; + int high_intr; + ddi_softintr_t soft_int_id; + int high_read; + int high_write; + + ddi_iblock_cookie_t iblock_cookie; /* for mutex */ + kmutex_t isr_mutex; + + kmutex_t busy_mutex; + int busy; + + 
ddi_acc_handle_t conf_handle; + + nfp_cdev common; + const nfpcmd_dev *cmddev; +} nfp_dev; + +extern struct nfp_dev *nfp_dev_list[]; + +#endif /* NFP_H */ diff --git a/usr/src/uts/common/io/nfp/nfp_cmd.h b/usr/src/uts/common/io/nfp/nfp_cmd.h new file mode 100644 index 0000000000..db8af0b2f9 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_cmd.h @@ -0,0 +1,68 @@ +/* + +nfp_cmd.h: nCipher PCI HSM command driver decalrations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFPCMD_H +#define NFPCMD_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* read and write called with userspace buffer */ + +typedef struct nfpcmd_dev { + const char *name; + unsigned short vendorid, deviceid, + sub_vendorid, sub_deviceid; + unsigned int bar_sizes[6]; /* includes IO bit */ + unsigned int flags; + nfp_err (*create)(struct nfp_cdev *pdev); + nfp_err (*destroy)(void * ctx); + nfp_err (*open)(void * ctx); + nfp_err (*close)(void * ctx); + nfp_err (*isr)(void *ctx, int *handled); + nfp_err (*write_block)( const char *ublock, int len, void *ctx ); + nfp_err (*read_block)( char *ublock, int len, void *ctx, int *rcount); + nfp_err (*channel_update)( char *data, int len, void *ctx); + nfp_err (*ensure_reading)( unsigned int addr, int len, void *ctx ); + nfp_err (*debug)( int cmd, void *ctx); +} nfpcmd_dev; + +#define NFP_CMD_FLG_NEED_IOBUF 0x1 + +/* list of all supported drivers ---------------------------------------- */ + +extern const nfpcmd_dev *nfp_drvlist[]; + +extern const nfpcmd_dev i21285_cmddev; +extern const nfpcmd_dev i21555_cmddev; +extern const nfpcmd_dev bcm5820_cmddev; + +#ifndef PCI_BASE_ADDRESS_SPACE_IO +#define PCI_BASE_ADDRESS_SPACE_IO 0x1 +#endif + +#define NFP_MAXDEV 16 + + +#define NFP_MEMBAR_MASK ~0xf +#define NFP_IOBAR_MASK ~0x3 +/* + This masks off the bottom bits 
of the PCI_CSR_BAR which signify that the + BAR is an IO BAR rather than a MEM BAR +*/ + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_common.h b/usr/src/uts/common/io/nfp/nfp_common.h new file mode 100644 index 0000000000..d1d2100fea --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_common.h @@ -0,0 +1,68 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_COMMON_H +#define NFP_COMMON_H + +#include <sys/types.h> +#include <sys/conf.h> + +typedef uint32_t UINT32; +typedef uint8_t BYTE; + +#define DEFINE_NFPCI_PACKED_STRUCTS +#include "nfpci.h" +#include "nfdev-solaris.h" + +typedef int oserr_t; + +#if CH_BIGENDIAN + +/* Big Endian Sparc */ + +#define SWP32(x) \ +( (((unsigned int)(x)>>24)&0xff) | (((unsigned int)(x)>>8)&0xff00) | (((unsigned int)(x)<<8)&0xff0000) | (((unsigned int)(x)<<24)&0xff000000) ) + +#define SWP16(x) ( (((x)>>8)&0xff) | (((x)<<8)&0xff00) ) + +#define FROM_LE32_IO(x) SWP32(*x) +#define TO_LE32_IO(x,y) *x=SWP32(y) + +#define FROM_LE32_MEM(x) SWP32(*x) +#define TO_LE32_MEM(x,y) *x=SWP32(y) + +#define FROM_LE16_IO(x) SWP16(*x) +#define TO_LE16_IO(x,y) *x=SWP16(y) + +#else + +/* Little Endian x86 */ + +#define FROM_LE32_IO(x) (*x) +#define TO_LE32_IO(x,y) (*x=y) + +#define FROM_LE32_MEM(x) (*x) +#define TO_LE32_MEM(x,y) (*x=y) + +#define FROM_LE16_IO(x) (*x) +#define TO_LE16_IO(x,y) (*x=y) + +#endif /* !CH_BIGENDIAN */ + +#include <sys/types.h> + +#if CH_KERNELVER == 260 +#define nfp_get_lbolt( lbolt, err ) err= drv_getparm( LBOLT, lbolt ) +#else +#define nfp_get_lbolt( lbolt, err ) { *lbolt= ddi_get_lbolt(); err= 0; } +#endif + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_error.h b/usr/src/uts/common/io/nfp/nfp_error.h new file mode 100644 index 0000000000..d64cb78fd4 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_error.h @@ -0,0 +1,48 @@ +/* + +nfp_error.h: nCipher 
PCI HSM error handling + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +05/12/2001 jsh Original + +*/ + +#ifndef NFP_ERROR_H +#define NFP_ERROR_H + +#include "nfp_common.h" + +#define NFP_SUCCESS 0x0 +#define NFP_EFAULT 0x1 +#define NFP_ENOMEM 0x2 +#define NFP_EINVAL 0x3 +#define NFP_EIO 0x4 +#define NFP_ENXIO 0x5 +#define NFP_ENODEV 0x6 +#define NFP_EINTR 0x7 +#define NFP_ESTARTING 0x8 +#define NFP_EAGAIN 0x9 +#define NFP_EUNKNOWN 0x100 + +typedef int nfp_err; + +extern oserr_t nfp_oserr( nfp_err nerr ); +extern nfp_err nfp_error( oserr_t oerr ); + +#define nfr( x) \ + return nfp_error((x)) + +#define nfer(x, fn, msg) \ + { oserr_t err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return nfp_error(err); } } + +#define er(x, fn, msg ) \ +{ nfp_err err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return err; } } + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_hostif.h b/usr/src/uts/common/io/nfp/nfp_hostif.h new file mode 100644 index 0000000000..3e7d8187e5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_hostif.h @@ -0,0 +1,54 @@ +/* + +nfp_hostif.h: nCipher PCI HSM host interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_HOSTIF_H +#define NFP_HOSTIF_H + +#include "nfdev-common.h" + +struct nfp_dev; + +/* common device structure */ + +typedef struct nfp_cdev { + unsigned char *bar[6]; + void *extra[6]; + + int busno; + int slotno; + + void *cmdctx; + + char *iobuf; + + struct nfp_dev* dev; + + struct nfdev_stats_str stats; + +} nfp_cdev; + +/* callbacks from command drivers -------------------------------------- */ + +void nfp_read_complete( struct nfp_dev *pdev, int ok); +void 
nfp_write_complete( struct nfp_dev *pdev, int ok); + +#define NFP_READ_MAX (8 * 1024) +#define NFP_READBUF_SIZE (NFP_READ_MAX + 8) +#define NFP_TIMEOUT_SEC 10 + +#define NFP_DRVNAME "nCipher nFast PCI driver" + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_ifvers.c b/usr/src/uts/common/io/nfp/nfp_ifvers.c new file mode 100644 index 0000000000..807b4f24c5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_ifvers.c @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* + * nfp_ifervs.c - common pci interface versioning + * + * uses: + * + * int pdev->ifvers + * device interface version + * + * int nfp_ifvers + * interface version limit + * + * int nfp_alloc_pci_push( nfp_dev *pdev ) + * allocates resources needed for PCI Push, + * if not already allocated, and return True if successful + * + * void nfp_free_pci_push( nfp_dev *pdev ) { + * frees any resources allocated to PCI Push + */ + +void nfp_set_ifvers( nfp_dev *pdev, int vers ) { + if( nfp_ifvers != 0 && vers > nfp_ifvers ) { + nfp_log( NFP_DBG2, + "nfp_set_ifvers: can't set ifvers %d" + " as nfp_ifvers wants max ifvers %d", + vers, nfp_ifvers); + return; + } + if( vers >= NFDEV_IF_PCI_PUSH ) { + if(!nfp_alloc_pci_push(pdev)) { + nfp_log( NFP_DBG1, + "nfp_set_ifvers: can't set ifvers %d" + " as resources not available", + vers); + return; + } + } else { + nfp_free_pci_push(pdev); + } + pdev->ifvers= vers; + nfp_log( NFP_DBG3, "nfp_set_ifvers: setting ifvers %d", vers); +} diff --git a/usr/src/uts/common/io/nfp/nfp_osif.h b/usr/src/uts/common/io/nfp/nfp_osif.h new file mode 100644 index 0000000000..17ffe469ce --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_osif.h @@ -0,0 +1,105 @@ +/* + +nfp_osif.h: nCipher PCI HSM OS interface declarations + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales 
e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +10/10/2001 jsh Original + +*/ + +#ifndef NFP_OSIF_H +#define NFP_OSIF_H + +#include "nfp_hostif.h" +#include "nfp_error.h" + +/* general typedefs ----------------------------------------------- */ + +typedef volatile unsigned int reg32; +typedef volatile unsigned short reg16; +typedef volatile unsigned char reg8; + +/* sempaphores, mutexs and events --------------------------------- */ + +#if 0 +extern nfp_err nfp_sema_init( nfp_sema *sema, int initial); +extern void nfp_sema_destroy( nfp_sema *sema ); +extern void nfp_sema_post( nfp_sema *sema ); +extern void nfp_sema_wait( nfp_sema *sema ); +extern int nfp_sema_wait_sig( nfp_sema *sema ); + +extern nfp_err nfp_mutex_init( nfp_mutex *mutex ); +extern void nfp_mutex_destroy( nfp_mutex *mutex ); +extern void nfp_mutex_enter( nfp_mutex *mutex ); +extern void nfp_mutex_exit( nfp_mutex *mutex ); + +extern nfp_err nfp_event_init( nfp_event *event ); +extern void nfp_event_destroy( nfp_event *event ); +extern void nfp_event_set( nfp_event *event ); +extern void nfp_event_clear( nfp_event *event ); +extern void nfp_event_wait( nfp_event *event ); +extern void nfp_event_wait_sig( nfp_event *event ); + +#endif + +/* timeouts ------------------------------------------------------ */ + +extern void nfp_sleep( int ms ); + +/* memory handling ----------------------------------------------- */ + +#define KMALLOC_DMA 0 +#define KMALLOC_CACHED 1 + +extern void *nfp_kmalloc( int size, int flags ); +extern void *nfp_krealloc( void *ptr, int size, int flags ); +extern void nfp_kfree( void * ); + +/* config space access ------------------------------------------------ */ + +/* return Little Endian 32 bit config register */ +extern nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ); + +/* io space access ------------------------------------------------ */ + +extern unsigned int nfp_inl( nfp_cdev *pdev, int bar, int 
offset ); +extern unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ); +extern void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ); +extern void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ); + +/* user and device memory space access ---------------------------- */ + +/* NB these 2 functions are not guarenteed to be re-entrant for a given device */ +extern nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len); +extern nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len); + +extern nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len ); +extern nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len ); + +extern nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len ); +extern nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len); + +/* debug ------------------------------------------------------------ */ + +#define NFP_DBG1 1 +#define NFP_DBGE NFP_DBG1 +#define NFP_DBG2 2 +#define NFP_DBG3 3 +#define NFP_DBG4 4 + +#ifdef STRANGE_VARARGS +extern void nfp_log(); +#else +extern void nfp_log( int severity, const char *format, ...); +#endif + +extern int nfp_debug; + +#endif diff --git a/usr/src/uts/common/io/nfp/nfpci.h b/usr/src/uts/common/io/nfp/nfpci.h new file mode 100644 index 0000000000..793f5995e6 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfpci.h @@ -0,0 +1,171 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* +* +* NFPCI.H - nFast PCI interface definition file +* +* +* +* 1998.06.09 IH Started +* +* The interface presented by nFast PCI devices consists of: +* +* A region of shared RAM used for data transfer & control information +* A doorbell interrupt register, so both sides can give 
each other interrupts +* A number of DMA channels for transferring data +*/ + +#ifndef NFPCI_H +#define NFPCI_H + +/* Sizes of some regions */ +#define NFPCI_RAM_MINSIZE 0x00100000 +/* This is the minimum size of shared RAM. In future it may be possible to + negotiate larger sizes of shared RAM or auto-detect how big it is */ +#define NFPCI_RAM_MINSIZE_JOBS 0x00020000 /* standard jobs only */ +#define NFPCI_RAM_MINSIZE_KERN 0x00040000 /* standard and kernel jobs */ + +/* Offsets within shared memory space. + The following main regions are: + jobs input area + jobs output area + kernel jobs input area + kernel output area +*/ + +#define NFPCI_OFFSET_JOBS 0x00000000 +#define NFPCI_OFFSET_JOBS_WR 0x00000000 +#define NFPCI_OFFSET_JOBS_RD 0x00010000 +#define NFPCI_OFFSET_KERN 0x00020000 +#define NFPCI_OFFSET_KERN_WR 0x00020000 +#define NFPCI_OFFSET_KERN_RD 0x00030000 + +/* Interrupts, defined by bit position in doorbell register */ + +/* Interrupts from device to host */ +#define NFAST_INT_DEVICE_WRITE_OK 0x00000001 +#define NFAST_INT_DEVICE_WRITE_FAILED 0x00000002 +#define NFAST_INT_DEVICE_READ_OK 0x00000004 +#define NFAST_INT_DEVICE_READ_FAILED 0x00000008 +#define NFAST_INT_DEVICE_KERN_WRITE_OK 0x00000010 +#define NFAST_INT_DEVICE_KERN_WRITE_FAILED 0x00000020 +#define NFAST_INT_DEVICE_KERN_READ_OK 0x00000040 +#define NFAST_INT_DEVICE_KERN_READ_FAILED 0x00000080 + +/* Interrupts from host to device */ +#define NFAST_INT_HOST_WRITE_REQUEST 0x00010000 +#define NFAST_INT_HOST_READ_REQUEST 0x00020000 +#define NFAST_INT_HOST_DEBUG 0x00040000 +#define NFAST_INT_HOST_KERN_WRITE_REQUEST 0x00080000 +#define NFAST_INT_HOST_KERN_READ_REQUEST 0x00100000 + +/* Ordinary job submission ------------------------ */ + +/* The NFPCI_OFFSET_JOBS_WR and NFPCI_OFFSET_JOBS_RD regions are defined + by the following (byte) address offsets... 
*/ + +#define NFPCI_OFFSET_CONTROL 0x0 +#define NFPCI_OFFSET_LENGTH 0x4 +#define NFPCI_OFFSET_DATA 0x8 +#define NFPCI_OFFSET_PUSH_ADDR 0x8 + +#define NFPCI_JOBS_WR_CONTROL (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_WR_LENGTH (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_WR_DATA (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_JOBS_WR_LEN (0x0000FFF8) + +#define NFPCI_JOBS_RD_CONTROL (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_JOBS_RD_LENGTH (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_JOBS_RD_DATA (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_JOBS_RD_PUSH_ADDR (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_JOBS_RD_LEN (0x000FFF8) + +/* Kernel inferface job submission ---------------- */ + +#define NFPCI_KERN_WR_CONTROL (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_WR_LENGTH (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_WR_DATA (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_DATA) +#define NFPCI_MAX_KERN_WR_LEN (0x0000FFF8) + +#define NFPCI_KERN_RD_CONTROL (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_CONTROL) +#define NFPCI_KERN_RD_LENGTH (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_LENGTH) +#define NFPCI_KERN_RD_DATA (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_DATA) +/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */ +#define NFPCI_KERN_RD_ADDR (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_PUSH_ADDR) +#define NFPCI_MAX_KERN_RD_LEN (0x000FFF8) + +#ifdef DEFINE_NFPCI_PACKED_STRUCTS +typedef struct +{ + UINT32 controlword; + UINT32 length; /* length of data to follow */ + union { + BYTE data[1]; + UINT32 addr; + } uu; +} + NFPCI_JOBS_BLOCK; +#endif + + +#define NFPCI_JOB_CONTROL 0x00000001 +#define NFPCI_JOB_CONTROL_PCI_PUSH 0x00000002 +/* + The 'Control' word is analogous to the SCSI read/write address; + 1 = standard push/pull IO + 2 = push/push IO + + To 
submit a block of job data, the host: + - sets the (32-bit, little-endian) word at NFPCI_JOBS_WR_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_WR_LENGTH to the length of the data + - copies the data to NFPCI_JOBS_WR_DATA + - sets interrupt NFAST_INT_HOST_WRITE_REQUEST in the doorbell register + - awaits the NFAST_INT_DEVICE_WRITE_OK (or _FAILED) interrupts back + + To read a block of jobs back, the host: + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from NFPCI_JOBS_RD_DATA; the module will set the word at + NFPCI_JOBS_RD_LENGTH to its actual length. + + Optionally the host can request the PCI read data to be pushed to host PCI mapped ram: + - allocates a contiguous PCI addressable buffer for a NFPCI_JOBS_BLOCK of max + size NFPCI_MAX_JOBS_RD_LEN (or NFPCI_MAX_KERN_RD_LEN) + 8 + - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL_PCI_PUSH + - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data + - sets the word at NFPCI_JOBS_RD_PUSH_ADDR to be the host PCI address of + the buffer + - sets interrupt NFAST_INT_HOST_READ_REQUEST + - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt + - reads the data from the buffer at NFPCI_OFFSET_DATA in the buffer. The + module will set NFPCI_OFFSET_LENGTH to the actual length. 
+*/ + +#define NFPCI_SCRATCH_CONTROL 0 + +#define NFPCI_SCRATCH_CONTROL_HOST_MOI (1<<0) +#define NFPCI_SCRATCH_CONTROL_MODE_SHIFT 1 +#define NFPCI_SCRATCH_CONTROL_MODE_MASK (3<<NFPCI_SCRATCH_CONTROL_MODE_SHIFT) + +#define NFPCI_SCRATCH_STATUS 1 + +#define NFPCI_SCRATCH_STATUS_MONITOR_MOI (1<<0) +#define NFPCI_SCRATCH_STATUS_APPLICATION_MOI (1<<1) +#define NFPCI_SCRATCH_STATUS_APPLICATION_RUNNING (1<<2) +#define NFPCI_SCRATCH_STATUS_ERROR (1<<3) + +#define NFPCI_SCRATCH_ERROR_LO 2 +#define NFPCI_SCRATCH_ERROR_HI 3 + +#endif diff --git a/usr/src/uts/common/io/nfp/osif.c b/usr/src/uts/common/io/nfp/osif.c new file mode 100644 index 0000000000..fba62f9a37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/osif.c @@ -0,0 +1,184 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" +#include "nfp.h" +#include "autoversion.h" + +/* config space access ---------------------------------- */ + +nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ) { + unsigned int tmp32; + if ( !pdev || !pdev->dev || !pdev->dev->conf_handle ) + return NFP_ENODEV; + +/* pci_config_get32() does byte swapping, so put back to LE */ + tmp32 = pci_config_get32( pdev->dev->conf_handle, offset ); + TO_LE32_IO(res, tmp32); + + return NFP_SUCCESS; +} + +/* user space memory access ---------------------------------- */ + +nfp_err nfp_copy_from_user( char 
*kbuf, const char *ubuf, int len) { + bcopy(ubuf, kbuf, len); + return 0; +} + +nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len) { + bcopy(kbuf, ubuf, len); + return 0; +} + +nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying from kernel mem */ + return nfp_copy_to_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len) { + /* dirty hack on Solaris, as we are called from strategy we are, in fact, copying to kernel mem */ + return nfp_copy_from_dev( cdev, bar, offset, ubuf, len ); +} + +nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_GET8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR); + else + /* LINTED: alignment */ + DDI_REP_GET32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR); + return NFP_SUCCESS; +} + +nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len) { + if( len & 0x3 || offset & 0x3 ) + DDI_REP_PUT8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR ); + else + /* LINTED: alignment */ + DDI_REP_PUT32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR ); + return NFP_SUCCESS; +} + +/* pci io space access --------------------------------------- */ + +unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inl: addr %x", (uintptr_t) pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET32( pdev->extra[bar], (uint32_t *)(pdev->bar[bar] + offset) ); +} + +unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ) { + nfp_log( NFP_DBG3, "nfp_inw: addr %x", (uintptr_t) 
pdev->bar[bar] + offset); + /* LINTED: alignment */ + return DDI_GET16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset) ); +} + +void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT32( pdev->extra[bar], (uint32_t *)(pdev->bar[ bar ] + offset), data ); +} + +void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ) { + nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data); + /* LINTED: alignment */ + DDI_PUT16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset), data ); +} + +/* logging ---------------------------------------------------- */ + +void nfp_log( int level, const char *fmt, ...) +{ + auto char buf[256]; + va_list ap; + + switch (level) { + case NFP_DBG4: if (nfp_debug < 4) break; + /*FALLTHROUGH*/ + case NFP_DBG3: if (nfp_debug < 3) break; + /*FALLTHROUGH*/ + case NFP_DBG2: if (nfp_debug < 2) break; + /*FALLTHROUGH*/ + case NFP_DBG1: if (nfp_debug < 1) break; + /*FALLTHROUGH*/ + default: + va_start(ap, fmt); + (void) vsnprintf(buf, 256, fmt, ap); + va_end(ap); + cmn_err(CE_CONT, "!" 
VERSION_COMPNAME " " VERSION_NO ": %s\n", buf); + break; + } +} + +struct errstr { + int oserr; + nfp_err nferr; +}; + + +static struct errstr errtab[] = { + { EFAULT, NFP_EFAULT }, + { ENOMEM, NFP_ENOMEM }, + { EINVAL, NFP_EINVAL }, + { EIO, NFP_EIO }, + { ENXIO, NFP_ENXIO }, + { ENODEV, NFP_ENODEV }, + { EINVAL, NFP_EUNKNOWN }, + { 0, 0 } +}; + +nfp_err nfp_error( int oserr ) +{ + struct errstr *perr; + if(!oserr) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->oserr == oserr) + return perr->nferr; + perr++; + } + return NFP_EUNKNOWN; +} + +int nfp_oserr( nfp_err nferr ) +{ + struct errstr *perr; + if(nferr == NFP_SUCCESS) + return 0; + perr= errtab; + while(perr->nferr) { + if(perr->nferr == nferr) + return perr->oserr; + perr++; + } + return EIO; +} diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c index 51bb472c97..288f17ccb8 100644 --- a/usr/src/uts/common/io/overlay/overlay.c +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -1108,7 +1108,8 @@ out: mutex_enter(&odd->odd_lock); overlay_io_done(odd, OVERLAY_F_IN_TX); mutex_exit(&odd->odd_lock); - return (mp_chain); + freemsgchain(mp_chain); + return (NULL); } /* ARGSUSED */ diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c index de669ce645..d847beb1c2 100644 --- a/usr/src/uts/common/io/overlay/overlay_mux.c +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ -351,8 +351,16 @@ overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp) /* * It'd be nice to be able to use MSG_MBLK_QUICKRELE, unfortunately, * that isn't actually supported by UDP at this time. + * + * Send with MSG_DONTWAIT to indicate clogged UDP sockets upstack. + */ + ret = ksocket_sendmblk(mux->omux_ksock, hdr, MSG_DONTWAIT, &mp, kcred); + /* + * NOTE: ksocket_sendmblk() may send partial packets downstack, + * returning what's not sent in &mp (i.e. mp pre-call might be a + * b_cont of mp post-call). 
We can't hold up this message (it's a + * datagram), so we drop, and let the caller cope. */ - ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred); if (ret != 0) freemsg(mp); diff --git a/usr/src/uts/common/io/physmem.c b/usr/src/uts/common/io/physmem.c index 665c9eff6c..9aaf58fb7b 100644 --- a/usr/src/uts/common/io/physmem.c +++ b/usr/src/uts/common/io/physmem.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ @@ -807,6 +808,13 @@ physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp) int ret; static int msg_printed = 0; + /* + * This device should never be visible in a zone, but if it somehow + * does get created we refuse to allow the zone to use it. + */ + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EACCES); + if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) { return (EINVAL); } diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf index 42248e93d6..08affec609 100644 --- a/usr/src/uts/common/io/pseudo.conf +++ b/usr/src/uts/common/io/pseudo.conf @@ -22,8 +22,7 @@ # # Copyright 2003 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# -# ident "%Z%%M% %I% %E% SMI" +# Copyright 2014 Joyent, Inc. All rights reserved. # # This file is private to the pseudonex driver. It should not be edited. # @@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0; # /pseudo; it has as its children the zone console pseudo nodes. # name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons"; + +# +# zfdnex is an alias for pseudo; this node is instantiated as a child of +# /pseudo; it has as its children the zone fd pseudo nodes. 
+# +name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd"; diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c index 8127d54594..21d641992f 100644 --- a/usr/src/uts/common/io/ptm.c +++ b/usr/src/uts/common/io/ptm.c @@ -451,6 +451,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp) return (0); } +static boolean_t +ptmptsopencb(ptmptsopencb_arg_t arg) +{ + struct pt_ttys *ptmp = (struct pt_ttys *)arg; + boolean_t rval; + + PT_ENTER_READ(ptmp); + rval = (ptmp->pt_nullmsg != NULL); + PT_EXIT_READ(ptmp); + return (rval); +} + /* * The wput procedure will only handle ioctl and flush messages. */ @@ -587,6 +599,41 @@ ptmwput(queue_t *qp, mblk_t *mp) miocack(qp, mp, 0, 0); break; } + case PTMPTSOPENCB: + { + mblk_t *dp; /* ioctl reply data */ + ptmptsopencb_t *ppocb; + + /* only allow the kernel to invoke this ioctl */ + if (iocp->ioc_cr != kcred) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* we don't support transparent ioctls */ + ASSERT(iocp->ioc_count != TRANSPARENT); + if (iocp->ioc_count == TRANSPARENT) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* allocate a response message */ + dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED); + if (dp == NULL) { + miocnak(qp, mp, 0, EAGAIN); + break; + } + + /* initialize the ioctl results */ + ppocb = (ptmptsopencb_t *)dp->b_rptr; + ppocb->ppocb_func = ptmptsopencb; + ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp; + + /* send the reply data */ + mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0); + qreply(qp, mp); + break; + } } break; diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg Binary files differnew file mode 100644 index 0000000000..b932ffaa7c --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/iwarp_sm.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg 
b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg Binary files differnew file mode 100644 index 0000000000..9421ecc0db --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full-36.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png Binary files differnew file mode 100644 index 0000000000..4b8a66761a --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-full.png diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png Binary files differnew file mode 100644 index 0000000000..3254fbdc3b --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/qlogic-logo.png diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg Binary files differnew file mode 100644 index 0000000000..7bb0dbf21b --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/documentation/pictures/reg_access.jpg diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin Binary files differnew file mode 100644 index 0000000000..43014fd8ea --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values.bin diff --git a/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin Binary files differnew file mode 100644 index 0000000000..9524eb4a63 --- /dev/null +++ b/usr/src/uts/common/io/qede/579xx/drivers/ecore/ecore_init_values_zipped.bin diff 
--git a/usr/src/uts/common/io/qede/qede_list.h b/usr/src/uts/common/io/qede/qede_list.h index 2350cb4117..656d2a915f 100644 --- a/usr/src/uts/common/io/qede/qede_list.h +++ b/usr/src/uts/common/io/qede/qede_list.h @@ -176,4 +176,3 @@ qede_list_splice_tail(qede_list_t *list, #define QEDE_LIST_FOR_EACH_ENTRY_SAFE OSAL_LIST_FOR_EACH_ENTRY_SAFE #endif /* !_QEDE_LIST_H */ - diff --git a/usr/src/uts/common/io/qede/qede_version.h b/usr/src/uts/common/io/qede/qede_version.h index 43584f95f0..0ee38b4338 100644 --- a/usr/src/uts/common/io/qede/qede_version.h +++ b/usr/src/uts/common/io/qede/qede_version.h @@ -42,4 +42,3 @@ #define REVVERSION 25 #endif /* !_QEDE_VERSION_H */ - diff --git a/usr/src/uts/common/io/random.c b/usr/src/uts/common/io/random.c index d79b86362c..a50bbcceec 100644 --- a/usr/src/uts/common/io/random.c +++ b/usr/src/uts/common/io/random.c @@ -291,6 +291,9 @@ rnd_write(dev_t dev, struct uio *uiop, cred_t *credp) if ((error = uiomove(buf, bytes, UIO_WRITE, uiop)) != 0) return (error); + if (crgetzone(credp) != global_zone) + continue; + switch (devno) { case DEVRANDOM: if ((error = random_add_entropy(buf, bytes, 0)) != 0) diff --git a/usr/src/uts/common/io/rsm/rsm.c b/usr/src/uts/common/io/rsm/rsm.c index b49d5b735a..d9d40c83fd 100644 --- a/usr/src/uts/common/io/rsm/rsm.c +++ b/usr/src/uts/common/io/rsm/rsm.c @@ -22,8 +22,8 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2012 Milan Jurik. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2017 Joyent, Inc. + * Copyright (c) 2016 by Delphix. All rights reserved. 
*/ diff --git a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c index ead7433aef..d3ae7fb6c7 100644 --- a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c +++ b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c @@ -10783,6 +10783,7 @@ ahci_em_ioctl_set(ahci_ctl_t *ahci_ctlp, intptr_t arg) } task->aelta_ctl = ahci_ctlp; + task->aelta_port = set.aiems_port; task->aelta_port = (uint8_t)set.aiems_port; task->aelta_op = set.aiems_op; task->aelta_state = set.aiems_leds; diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mpt_sas.conf b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mpt_sas.conf index c6e017655e..721e66c276 100644 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mpt_sas.conf +++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mpt_sas.conf @@ -21,7 +21,7 @@ # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# +# Copyright 2019 Joyent, Inc. # # @@ -49,3 +49,8 @@ ddi-vhci-class="scsi_vhci"; # name="mpt_sas" parent="/pci@7c0/pci@0/pci@9" unit-address="0" mpxio-disable="yes"; # mpxio-disable="no"; + +# +# Command/target timeout checking should be done at a 1-second granularity. +# +scsi-watchdog-tick=1; diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index a759c735a3..2d25026a9a 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -3011,9 +3011,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc) * according to the successful response to the page * 0x2A mode sense request. */ - scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, - "sd_set_mmc_caps: Mode Sense returned " - "invalid block descriptor length\n"); + /* + * The following warning occurs due to the KVM CD-ROM + * mishandling the multi-media commands. Ignore it. 
+ * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, + * "sd_set_mmc_caps: Mode Sense returned " + * "invalid block descriptor length\n"); + */ kmem_free(buf, BUFLEN_MODE_CDROM_CAP); return; } @@ -3917,19 +3921,78 @@ static int sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen) { struct scsi_inquiry *sd_inq; - int rval = SD_SUCCESS; + int rval = SD_SUCCESS; + char *p; + int chk_vidlen = 0, chk_pidlen = 0; + int has_tail = 0; + static const int VSZ = sizeof (sd_inq->inq_vid); + static const int PSZ = sizeof (sd_inq->inq_pid); ASSERT(un != NULL); sd_inq = un->un_sd->sd_inq; ASSERT(id != NULL); /* - * We use the inq_vid as a pointer to a buffer containing the - * vid and pid and use the entire vid/pid length of the table - * entry for the comparison. This works because the inq_pid - * data member follows inq_vid in the scsi_inquiry structure. + * We would like to use the inq_vid as a pointer to a buffer + * containing the vid and pid and use the entire vid/pid length of + * the table entry for the comparison. However, this does not work + * because, while the inq_pid data member follows inq_vid in the + * scsi_inquiry structure, we do not control the contents of this + * buffer, and some broken devices violate SPC 4.3.1 and return + * fields with null bytes in them. + */ + chk_vidlen = MIN(VSZ, idlen); + p = id + chk_vidlen - 1; + while (*p == ' ' && chk_vidlen > 0) { + --p; + --chk_vidlen; + } + + /* + * If it's all spaces, check the whole thing. + */ + if (chk_vidlen == 0) + chk_vidlen = MIN(VSZ, idlen); + + if (idlen > VSZ) { + chk_pidlen = idlen - VSZ; + p = id + idlen - 1; + while (*p == ' ' && chk_pidlen > 0) { + --p; + --chk_pidlen; + } + if (chk_pidlen == 0) + chk_pidlen = MIN(PSZ, idlen - VSZ); + } + + /* + * There's one more thing we need to do here. If the user specified + * an ID with trailing spaces, we need to make sure the inquiry + * vid/pid has only spaces or NULs after the check length; otherwise, it + * can't match. 
*/ - if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) { + if (idlen > chk_vidlen && chk_vidlen < VSZ) { + for (p = sd_inq->inq_vid + chk_vidlen; + p < sd_inq->inq_vid + VSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) { + for (p = sd_inq->inq_pid + chk_pidlen; + p < sd_inq->inq_pid + PSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + + if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 || + (idlen > VSZ && + strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) { /* * The user id string is compared to the inquiry vid/pid * using a case insensitive comparison and ignoring @@ -6142,7 +6205,7 @@ sdpower(dev_info_t *devi, int component, int level) time_t intvlp; struct pm_trans_data sd_pm_tran_data; uchar_t save_state = SD_STATE_NORMAL; - int sval; + int sval, tursval = 0; uchar_t state_before_pm; sd_ssc_t *ssc; int last_power_level = SD_SPINDLE_UNINIT; @@ -6426,13 +6489,26 @@ sdpower(dev_info_t *devi, int component, int level) * a deadlock on un_pm_busy_cv will occur. */ if (SD_PM_IS_IO_CAPABLE(un, level)) { - sval = sd_send_scsi_TEST_UNIT_READY(ssc, + tursval = sd_send_scsi_TEST_UNIT_READY(ssc, SD_DONT_RETRY_TUR | SD_BYPASS_PM); - if (sval != 0) + if (tursval != 0) sd_ssc_assessment(ssc, SD_FMT_IGNORE); } - if (un->un_f_power_condition_supported) { + /* + * We've encountered certain classes of drives that pass a TUR, but fail + * the START STOP UNIT when using power conditions, or worse leave the + * drive in an unusable state despite passing SSU. Strictly speaking, + * for SPC-4 or greater, no additional actions are required to make the + * drive operational when a TUR passes. If we have something that + * matches this condition, we continue on and presume the drive is + * successfully powered on. 
+ */ + if (un->un_f_power_condition_supported && + SD_SCSI_VERS_IS_GE_SPC_4(un) && SD_PM_IS_IO_CAPABLE(un, level) && + level == SD_SPINDLE_ACTIVE && tursval == 0) { + sval = 0; + } else if (un->un_f_power_condition_supported) { char *pm_condition_name[] = {"STOPPED", "STANDBY", "IDLE", "ACTIVE"}; SD_TRACE(SD_LOG_IO_PM, un, @@ -6452,6 +6528,7 @@ sdpower(dev_info_t *devi, int component, int level) sd_ssc_assessment(ssc, SD_FMT_STATUS_CHECK); else sd_ssc_assessment(ssc, SD_FMT_IGNORE); + } /* Command failed, check for media present. */ @@ -30373,7 +30450,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) if (SD_PM_CAPABLE_IS_UNDEFINED(pm_cap)) { un->un_f_log_sense_supported = TRUE; if (!un->un_f_power_condition_disabled && - SD_INQUIRY(un)->inq_ansi == 6) { + SD_SCSI_VERS_IS_GE_SPC_4(un)) { un->un_f_power_condition_supported = TRUE; } } else { @@ -30391,7 +30468,7 @@ sd_set_unit_attributes(struct sd_lun *un, dev_info_t *devi) /* SD_PM_CAPABLE_IS_TRUE case */ un->un_f_pm_supported = TRUE; if (!un->un_f_power_condition_disabled && - SD_PM_CAPABLE_IS_SPC_4(pm_cap)) { + (SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap))) { un->un_f_power_condition_supported = TRUE; } diff --git a/usr/src/uts/common/io/signalfd.c b/usr/src/uts/common/io/signalfd.c index 46d616dd79..4dce53e22c 100644 --- a/usr/src/uts/common/io/signalfd.c +++ b/usr/src/uts/common/io/signalfd.c @@ -107,6 +107,7 @@ #include <sys/schedctl.h> #include <sys/id_space.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/disp.h> #include <sys/taskq_impl.h> @@ -459,6 +460,9 @@ consume_signal(k_sigset_t set, uio_t *uio, boolean_t block) lwp->lwp_extsig = 0; mutex_exit(&p->p_lock); + if (PROC_IS_BRANDED(p) && BROP(p)->b_sigfd_translate) + BROP(p)->b_sigfd_translate(infop); + /* Convert k_siginfo into external, datamodel independent, struct. 
*/ bzero(ssp, sizeof (*ssp)); ssp->ssi_signo = infop->si_signo; diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c new file mode 100644 index 0000000000..d36608efcd --- /dev/null +++ b/usr/src/uts/common/io/vnd/frameio.c @@ -0,0 +1,465 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +/* + * Frame I/O utility functions + */ + +#include <sys/frameio.h> + +#include <sys/file.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/inttypes.h> + +static kmem_cache_t *frameio_cache; + +int +frameio_init(void) +{ + frameio_cache = kmem_cache_create("frameio_cache", + sizeof (frameio_t) + sizeof (framevec_t) * FRAMEIO_NVECS_MAX, + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (frameio_cache == NULL) + return (1); + + return (0); +} + +void +frameio_fini(void) +{ + if (frameio_cache != NULL) + kmem_cache_destroy(frameio_cache); +} + +frameio_t * +frameio_alloc(int kmflags) +{ + return (kmem_cache_alloc(frameio_cache, kmflags)); +} + +void +frameio_free(frameio_t *fio) +{ + kmem_cache_free(frameio_cache, fio); +} + +/* + * Ensure that we don't see any garbage in the framevecs that we're nominally + * supposed to work with. Specifically we want to make sure that the buflen and + * the address are not zero. 
+ */ +static int +frameio_hdr_check_vecs(frameio_t *fio) +{ + int i; + for (i = 0; i < fio->fio_nvecs; i++) + if (fio->fio_vecs[i].fv_buf == NULL || + fio->fio_vecs[i].fv_buflen == 0) + return (EINVAL); + + return (0); +} + +/* + * We have to copy in framevec32_t's. To work around the data model issues and + * trying not to copy memory we first copy in the framevec32_t data into the + * standard fio_vec space. Next we work backwards copying a given framevec32_t + * to a temporary framevec_t and then overwrite the frameio_t's data. Note that + * it is important that we do this in reverse so as to ensure that we don't + * clobber data as the framevec_t is larger than the framevec32_t. + */ +static int +frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr) +{ + framevec32_t *vec32p; + framevec_t fv; + int i; + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + + if (ddi_copyin(addr, vec32p, sizeof (framevec32_t) * fio->fio_nvecs, + 0) != 0) + return (EFAULT); + + for (i = fio->fio_nvecs - 1; i >= 0; i--) { + fv.fv_buf = (void *)(uintptr_t)vec32p[i].fv_buf; + fv.fv_buflen = vec32p[i].fv_buflen; + fv.fv_actlen = vec32p[i].fv_actlen; + fio->fio_vecs[i].fv_buf = fv.fv_buf; + fio->fio_vecs[i].fv_buflen = fv.fv_buflen; + fio->fio_vecs[i].fv_actlen = fv.fv_actlen; + } + + return (frameio_hdr_check_vecs(fio)); +} + +/* + * Copy in a frame io header into fio with space for up to max_vecs. If the + * frameio contains more vectors than that, EOVERFLOW is returned. mode should + * contain information about the datamodel. + */ +int +frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode) +{ + int model = ddi_model_convert_from(mode & FMODELS); + int cpf = mode & FKIOCTL ? FKIOCTL : 0; + size_t fsize = model == DDI_MODEL_ILP32 ? + sizeof (frameio32_t) : sizeof (frameio_t); + + /* + * The start of the header is the same in all data models for the + * current version. 
+ */ + if (ddi_copyin(addr, fio, fsize, cpf) != 0) + return (EFAULT); + + if (fio->fio_version != FRAMEIO_VERSION_ONE) + return (EINVAL); + + if (fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0) + return (EINVAL); + + if (fio->fio_nvpf == 0) + return (EINVAL); + + if (fio->fio_nvecs % fio->fio_nvpf != 0) + return (EINVAL); + + if (fio->fio_nvecs > max_vecs) + return (EOVERFLOW); + + addr = (void *)((uintptr_t)addr + fsize); + if (model == DDI_MODEL_ILP32) { + if (cpf != 0) + return (EINVAL); + return (frameio_hdr_copyin_ilp32(fio, addr)); + } + + if (ddi_copyin(addr, &fio->fio_vecs[0], + sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0) + return (EFAULT); + + return (frameio_hdr_check_vecs(fio)); +} + +static mblk_t * +frameio_allocb(size_t sz) +{ + mblk_t *mp; + + mp = allocb(sz, 0); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_DATA; + return (mp); +} + +static int +framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf) +{ + mblk_t *mp; + cpf = cpf != 0 ? FKIOCTL : 0; + + mp = frameio_allocb(fv->fv_buflen); + + if (mp == NULL) { + freemsg(mp); + return (EAGAIN); + } + + if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen, + cpf) != 0) { + freemsg(mp); + return (EFAULT); + } + + mp->b_wptr += fv->fv_buflen; + *mpp = mp; + return (0); +} + +/* + * Read a set of frame vectors that make up a single message boundary and return + * that as a single message in *mpp that consists of multiple data parts. + */ +static int +frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf) +{ + int nparts = fio->fio_nvpf; + int part, error; + mblk_t *mp; + + *mpp = NULL; + cpf = cpf != 0 ? 
FKIOCTL : 0; + + /* + * Construct the initial frame + */ + for (part = 0; part < nparts; part++) { + error = framevec_mblk_read(fv, &mp, cpf); + if (error != 0) { + freemsg(*mpp); + return (error); + } + + if (*mpp == NULL) + *mpp = mp; + else + linkb(*mpp, mp); + fv++; + } + + return (0); +} + +/* + * Read data from a series of frameio vectors into a message block chain. A + * given frameio request has a number of discrete messages divided into + * individual vectors based on fio->fio_nvcspframe. Each discrete message will + * be constructed into a message block chain pointed to by b_next. + * + * If we get an EAGAIN while trying to construct a given message block what we + * return depends on what else we've done so far. If we have succesfully + * completed at least one message then we free everything else we've done so + * far and return that. If no messages have been completed we return EAGAIN. If + * instead we encounter a different error, say EFAULT, then all of the fv_actlen + * entries values are undefined. + */ +int +frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf) +{ + int error = ENOTSUP; + int nframes = fio->fio_nvecs / fio->fio_nvpf; + int frame; + framevec_t *fv; + mblk_t *mp, *bmp = NULL; + + /* + * Protect against bogus kernel subsystems. + */ + VERIFY(fio->fio_nvecs > 0); + VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0); + + *mpp = NULL; + cpf = cpf != 0 ? FKIOCTL : 0; + + fv = &fio->fio_vecs[0]; + for (frame = 0; frame < nframes; frame++) { + error = frameio_mblk_read(fio, fv, &mp, cpf); + if (error != 0) + goto failed; + + if (bmp != NULL) + bmp->b_next = mp; + else + *mpp = mp; + bmp = mp; + } + + *nvecs = nframes; + return (0); +failed: + /* + * On EAGAIN we've already taken care of making sure that we have no + * leftover messages, eg. they were never linked in. 
+ */ + if (error == EAGAIN) { + if (frame != 0) + error = 0; + if (nvecs != NULL) + *nvecs = frame; + ASSERT(*mpp != NULL); + } else { + for (mp = *mpp; mp != NULL; mp = bmp) { + bmp = mp->b_next; + freemsg(mp); + } + if (nvecs != NULL) + *nvecs = 0; + *mpp = NULL; + } + return (error); +} + +size_t +frameio_frame_length(frameio_t *fio, framevec_t *fv) +{ + int i; + size_t len = 0; + + for (i = 0; i < fio->fio_nvpf; i++, fv++) + len += fv->fv_buflen; + + return (len); +} + +/* + * Write a portion of an mblk to the current. + */ +static int +framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff, + size_t foff, int cpf) +{ + ASSERT(len <= MBLKL(mp) - moff); + ASSERT(len <= fv->fv_buflen - fv->fv_actlen); + cpf = cpf != 0 ? FKIOCTL : 0; + + if (ddi_copyout(mp->b_rptr + moff, (caddr_t)fv->fv_buf + foff, len, + cpf) != 0) + return (EFAULT); + fv->fv_actlen += len; + + return (0); +} + +/* + * Because copying this out to the user might fail we don't want to update the + * b_rptr in case we need to copy it out again. 
+ */ +static int +framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf) +{ + int err; + size_t msize, blksize, len, moff, foff; + + msize = msgsize(mp); + if (msize > frameio_frame_length(fio, fv)) + return (EOVERFLOW); + + moff = 0; + foff = 0; + blksize = MBLKL(mp); + fv->fv_actlen = 0; + while (msize != 0) { + len = MIN(blksize, fv->fv_buflen - fv->fv_actlen); + err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf); + if (err != 0) + return (err); + + msize -= len; + blksize -= len; + moff += len; + foff += len; + + if (blksize == 0 && msize != 0) { + mp = mp->b_cont; + ASSERT(mp != NULL); + moff = 0; + blksize = MBLKL(mp); + } + + if (fv->fv_buflen == fv->fv_actlen && msize != 0) { + fv++; + fv->fv_actlen = 0; + foff = 0; + } + } + + return (0); +} + +int +frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map, + mblk_t *mp, int *nwrite, int cpf) +{ + int mcount = 0; + int ret = 0; + + if (map != MAP_BLK_FRAME) + return (EINVAL); + + while (mp != NULL && mcount < fio->fio_nvecs) { + ret = framevec_map_blk(fio, &fio->fio_vecs[mcount], mp, cpf); + if (ret != 0) + break; + mcount += fio->fio_nvpf; + mp = mp->b_next; + } + + if (ret != 0 && mcount == 0) { + if (nwrite != NULL) + *nwrite = 0; + return (ret); + } + + if (nwrite != NULL) + *nwrite = mcount / fio->fio_nvpf; + + return (0); +} + +/* + * Copy out nframes worth of frameio header data back to userland. 
+ */ +int +frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode) +{ + int i; + int model = ddi_model_convert_from(mode & FMODELS); + framevec32_t *vec32p; + framevec32_t f; + + if (fio->fio_nvecs / fio->fio_nvpf < nframes) + return (EINVAL); + + fio->fio_nvecs = nframes * fio->fio_nvpf; + + if (model == DDI_MODEL_NONE) { + if (ddi_copyout(fio, addr, + sizeof (frameio_t) + fio->fio_nvecs * sizeof (framevec_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); + } + + ASSERT(model == DDI_MODEL_ILP32); + + vec32p = (framevec32_t *)&fio->fio_vecs[0]; + for (i = 0; i < fio->fio_nvecs; i++) { + f.fv_buf = (caddr32_t)(uintptr_t)fio->fio_vecs[i].fv_buf; + if (fio->fio_vecs[i].fv_buflen > UINT_MAX || + fio->fio_vecs[i].fv_actlen > UINT_MAX) + return (EOVERFLOW); + f.fv_buflen = fio->fio_vecs[i].fv_buflen; + f.fv_actlen = fio->fio_vecs[i].fv_actlen; + vec32p[i].fv_buf = f.fv_buf; + vec32p[i].fv_buflen = f.fv_buflen; + vec32p[i].fv_actlen = f.fv_actlen; + } + + if (ddi_copyout(fio, addr, + sizeof (frameio32_t) + fio->fio_nvecs * sizeof (framevec32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +void +frameio_mark_consumed(frameio_t *fio, int nframes) +{ + int i; + + ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes); + for (i = 0; i < nframes * fio->fio_nvpf; i++) + fio->fio_vecs[i].fv_actlen = fio->fio_vecs[i].fv_buflen; +} diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c new file mode 100644 index 0000000000..8c05c8aee0 --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.c @@ -0,0 +1,5857 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ +/* + * Copyright 2019 Joyent, Inc. + */ +/* + * vnd - virtual (machine) networking datapath + * + * vnd's purpose is to provide a highly performant data path for Layer 2 network + * traffic and exist side by side with an active IP netstack, each servicing + * different datalinks. vnd provides many of the same capabilities as the + * current TCP/IP stack does and some specific to layer two. Specifically: + * + * o Use of the DLD fastpath + * o Packet capture hooks + * o Ability to use hardware capabilities + * o Useful interfaces for handling multiple frames + * + * The following image shows where vnd fits into today's networking stack: + * + * +---------+----------+----------+ + * | libdlpi | libvnd | libsocket| + * +---------+----------+----------+ + * | · · VFS | + * | VFS · VFS +----------+ + * | · | sockfs | + * +---------+----------+----------+ + * | | VND | IP | + * | +----------+----------+ + * | DLD/DLS | + * +-------------------------------+ + * | MAC | + * +-------------------------------+ + * | GLDv3 | + * +-------------------------------+ + * + * ----------------------------------------- + * A Tale of Two Devices - DDI Device Basics + * ----------------------------------------- + * + * vnd presents itself to userland as a character device; however, it also is a + * STREAMS device so that it can interface with dld and the rest of the + * networking stack. Users never interface with the STREAMS devices directly and + * they are purely an implementation detail of vnd. Opening the STREAMS device + * requires kcred and as such userland cannot interact with it or push it onto + * the stream head. + * + * The main vnd character device, /dev/vnd/ctl, is a self-cloning device. Every + * clone gets its own minor number; however, minor nodes are not created in the + * devices tree for these instances. In this state a user may do two different + * things. 
They may issue ioctls that affect global state or they may issue + * ioctls that try to attach it to a given datalink. Once a minor device has + * been attached to a datalink, all operations on it are scoped to that context, + * therefore subsequent global operations are not permitted. + * + * A given device can be linked into the /devices and /dev name space via a link + * ioctl. That ioctl causes a minor node to be created in /devices and then it + * will also appear under /dev/vnd/ due to vnd's sdev plugin. This is similar + * to, but simpler than, IP's persistence mechanism. + * + * --------------------- + * Binding to a datalink + * --------------------- + * + * Datalinks are backed by the dld (datalink device) and dls (datalink services) + * drivers. These drivers provide a STREAMS device for datalinks on the system + * which are exposed through /dev/net. Userland generally manipulates datalinks + * through libdlpi. When an IP interface is being plumbed up what actually + * happens is that someone does a dlpi_open(3DLPI) of the underlying datalink + * and then pushes on the ip STREAMS module with an I_PUSH ioctl. Modules may + * then negotiate with dld and dls to obtain access to various capabilities + * and fast paths via a series of STREAMS messages. + * + * In vnd, we do the same thing, but we leave our STREAMS module as an + * implementation detail of the system. We don't want users to be able to + * arbitrarily push the vnd STREAMS module onto any stream, so we explicitly + * require kcred to manipulate it. Thus, when a user issues a request to attach a + * datalink to a minor instance of the character device, that vnd minor instance + * itself does a layered open (ldi_open_by_name(9F)) of the specified datalink. + * vnd does that open using the passed in credentials from the ioctl, not kcred. + * This ensures that users who don't have permission to open the device + * cannot. Once that's been opened, we push on the vnd streams module. 
+ * + * Once the vnd STREAMS instance has been created for this device, eg. the + * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl + * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices. + * This association begins the STREAMS device's initialization. We start up an + * asynchronous state machine that takes care of all the different aspects of + * plumbing up the device with dld and dls and enabling the MAC fast path. We + * need to guarantee to consumers of the character device that by the time their + * ioctl returns, the data path has been fully initialized. + * + * The state progression is fairly linear. There are two general steady states. + * The first is VNS_S_ONLINE, which means that everything is jacked up and good + * to go. The alternative is VNS_S_ZOMBIE, which means that the streams device + * encountered an error or we have finished tearing it down and the character + * device can clean it up. The following is our state progression and the + * meaning of each state: + * + * | + * | + * V + * +---------------+ + * | VNS_S_INITIAL | This is our initial state. Every + * +---------------+ vnd STREAMS device starts here. + * | While in this state, only dlpi + * | M_PROTO and M_IOCTL messages can be + * | sent or received. All STREAMS based + * | data messages are dropped. + * | We transition out of this state by + * | sending a DL_INFO_REQ to obtain + * | information about the underlying + * | link. + * v + * +-----------------+ + * +--<-| VNS_S_INFO_SENT | In this state, we verify and + * | +-----------------+ record information about the + * | | underlying device. If the device is + * | | not suitable, eg. not of type + * v | DL_ETHER, then we immediately + * | | become a ZOMBIE. To leave this + * | | state we request exclusive active + * | | access to the device via + * v | DL_EXCLUSIVE_REQ. 
+ * | v + * | +----------------------+ + * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether + * | +----------------------+ or not we were able to obtain + * | | | exclusive access to the device. If + * | | | we were not able to, then we leave, + * v | | as that means that something like + * | | | IP is already plumbed up on top of + * | | | the datalink. We leave this state + * | | | by progressing through to the + * | | | appropriate DLPI primitive, either + * v | | DLPI_ATTACH_REQ or DLPI_BIND_REQ + * | | | depending on the style of the + * | | | datalink. + * | | v + * | | +-------------------+ + * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were + * | | +-------------------+ able to perform a standard DLPI + * | | | attach and if so, go ahead and + * v | | send a DLPI_BIND_REQ. + * | v v + * | +-------------------+ + * +--<-| VNS_S_BIND_SENT | In this state we see the result of + * | +-------------------+ our attempt to bind to PPA 0 of the + * v | underlying device. Because we're + * | | trying to be a layer two datapath, + * | | the specific attachment point isn't + * | | too important as we're going to + * v | have to enable promiscuous mode. We + * | | transition out of this by sending + * | | our first of three promiscuous mode + * | | requests. + * v v + * | +------------------------+ + * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we + * | +------------------------+ were able to enable promiscuous + * | | mode at the physical level. We + * | | transition out of this by enabling + * | | multicast and broadcast promiscuous + * v | mode. + * | v + * | +--------------------------+ + * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ have enabled DL_PROMISC_MULTI and + * v | move onto the second promiscuous + * | | mode request. 
+ * | v + * | +----------------------------+ + * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we + * | +----------------------------+ enabled RX_ONLY promiscuous mode. + * | | We specifically do this as we don't + * v | want to receive our own traffic + * | | that we'll send out. We leave this + * | | state by enabling the final flag + * | | DL_PROMISC_FIXUPS. + * | v + * | +--------------------------+ + * +--<-| VNS_S_FIXUP_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ enabled FIXUP promiscuous mode. + * | | We specifically do this as we need + * v | to ensure that traffic which is + * | | received by being looped back to us + * | | correctly has checksums fixed. We + * | | leave this state by requesting the + * | | dld/dls capabilities that we can + * v | process. + * | v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of + * | +--------------------+ capabilities that dld advertised + * | | and enable the ones that currently + * v | support for use. See the section + * | | later on regarding capabilities + * | | for more information. We leave this + * | | state by sending an enable request. + * v v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability + * | +--------------------+ initialization. Once finished, we + * | | transition to the next state. If + * v | the dld fast path is not available, + * | | we become a zombie. + * | v + * | +--------------+ + * | | VNS_S_ONLINE | This is a vnd STREAMS device's + * | +--------------+ steady state. It will normally + * | | reside in this state while it is in + * | | active use. It will only transition + * v | to the next state when the STREAMS + * | | device is closed by the character + * | | device. In this state, all data + * | | flows over the dld fast path. 
+ * | v + * | +---------------------+ + * +--->| VNS_S_SHUTTING_DOWN | This vnd state takes care of + * | +---------------------+ disabling capabilities and + * | | flushing all data. At this point + * | | any additional data that we receive + * | | will be dropped. We leave this + * v | state by trying to remove multicast + * | | promiscuity. + * | | + * | v + * | +---------------------------------+ + * +-->| VNS_S_MULTICAST_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------------+ successfully removed multicast + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. We leave this state by trying + * | | to disable SAP level promiscuous + * | | mode. + * | v + * | +---------------------------+ + * +-->| VNS_S_SAP_PROMISCOFF_SENT | In this state, we check if we have + * | +---------------------------+ successfully removed SAP level + * | | promiscuous mode. If we have + * | | failed, we still carry on but only + * | | warn. Note that we don't worry + * | | about either of + * | | DL_PROMISC_FIXUPS or + * | | DL_PROMISC_RX_ONLY. If these are + * | | the only two entries left, then we + * | | should have anything that MAC is + * | | doing for us at this point, + * | | therefore it's safe for us to + * | | proceed to unbind, which is how we + * | | leave this state via a + * | v DL_UNBIND_REQ. + * | +-------------------+ + * +--->| VNS_S_UNBIND_SENT | Here, we check how the unbind + * | +-------------------+ request went. Regardless of its + * | | success, we always transition to + * | | a zombie state. + * | v + * | +--------------+ + * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS + * +--------------+ device is waiting to finish being + * reaped. Because we have no more + * ways to receive data it should be + * safe to destroy all remaining data + * structures. + * + * If the stream association fails for any reason the state machine reaches + * VNS_S_ZOMBIE. 
A more detailed vnd_errno_t will propagate back through the + * STREAMS ioctl to the character device. That will fail the user ioctl and + * propagate the vnd_errno_t back to userland. If, on the other hand, the + * association succeeds, then the vnd STREAMS device will be fully plumbed up + * and ready to transmit and receive message blocks. Consumers will be able to + * start using the other cbops(9E) entry points once the attach has fully + * finished, which will occur after the original user attach ioctl to the + * character device returns. + * + * It's quite important that we end up sending the full series of STREAMS + * messages when tearing down. While it's tempting to say that we should just + * rely on the STREAMS device being closed to properly ensure that we have no + * more additional data, that's not sufficient due to our use of direct + * callbacks. DLS does not ensure that by the time we change the direct + * callback (vnd_mac_input) that all callers to it will have been quiesced. + * However, it does guarantee that if we disable promiscuous mode ourselves and + * we turn off the main data path via DL_UNBIND_REQ that it will work. + * Therefore, we make sure to do this ourselves rather than letting DLS/DLD do + * it as part of tearing down the STREAMS device. This ensures that we'll + * quiesce all data before we destroy our data structures and thus we should + * eliminate the race in changing the data function. + * + * -------------------- + * General Architecture + * -------------------- + * + * There are several different devices and structures in the vnd driver. There + * is a per-netstack component, pieces related to the character device that + * consumers see, the internal STREAMS device state, and the data queues + * themselves. The following ASCII art picture describes their relationships and + * some of the major pieces of data that contain them. These are not exhaustive, + * e.g. synchronization primitives are left out. 
+ * + * +----------------+ +-----------------+ + * | global | | global | + * | device list | | netstack list | + * | vnd_dev_list | | vnd_nsd_list | + * +----------------+ +-----------------+ + * | | + * | v + * | +-------------------+ +-------------------+ + * | | per-netstack data | ---> | per-netstack data | --> ... + * | | vnd_pnsd_t | | vnd_pnsd_t | + * | | | +-------------------+ + * | | | + * | | nestackid_t ---+----> Netstack ID + * | | vnd_pnsd_flags_t -+----> Status flags + * | | zoneid_t ---+----> Zone ID for this netstack + * | | hook_family_t ---+----> VND IPv4 Hooks + * | | hook_family_t ---+----> VND IPv6 Hooks + * | | list_t ----+ | + * | +------------+------+ + * | | + * | v + * | +------------------+ +------------------+ + * | | character device | ---> | character device | -> ... + * +---------->| vnd_dev_t | | vnd_dev_t | + * | | +------------------+ + * | | + * | minor_t ---+--> device minor number + * | ldi_handle_t ---+--> handle to /dev/net/%datalink + * | vnd_dev_flags_t -+--> device flags, non blocking, etc. 
+ * | char[] ---+--> name if linked + * | vnd_str_t * -+ | + * +--------------+---+ + * | + * v + * +-------------------------+ + * | STREAMS device | + * | vnd_str_t | + * | | + * | vnd_str_state_t ---+---> State machine state + * | gsqueue_t * ---+---> mblk_t Serialization queue + * | vnd_str_stat_t ---+---> per-device kstats + * | vnd_str_capab_t ---+----------------------------+ + * | vnd_data_queue_t ---+ | | + * | vnd_data_queue_t -+ | | v + * +-------------------+-+---+ +---------------------+ + * | | | Stream capabilities | + * | | | vnd_str_capab_t | + * | | | | + * | | supported caps <--+-- vnd_capab_flags_t | + * | | dld cap handle <--+-- void * | + * | | direct tx func <--+-- vnd_dld_tx_t | + * | | +---------------------+ + * | | + * +----------------+ +-------------+ + * | | + * v v + * +-------------------+ +-------------------+ + * | Read data queue | | Write data queue | + * | vnd_data_queue_t | | vnd_data_queue_t | + * | | | | + * | size_t ----+--> Current size | size_t ----+--> Current size + * | size_t ----+--> Max size | size_t ----+--> Max size + * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head + * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail + * +-------------------+ +-------------------+ + * + * + * Globally, we maintain two lists. One list contains all of the character + * device soft states. The other maintains a list of all our netstack soft + * states. Each netstack maintains a list of active devices that have been + * associated with a datalink in its netstack. + * + * Recall that a given minor instance of the character device exists in one of + * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node, + * or it can be associated with a given datalink. When minor instances are in + * the former state, they do not exist in a given vnd_pnsd_t's list of devices. + * As part of attaching to a datalink, the given vnd_dev_t will be inserted into + * the appropriate vnd_pnsd_t. 
In addition, this will cause a STREAMS device, a + * vnd_str_t, to be created and associated to a vnd_dev_t. + * + * The character device, and its vnd_dev_t, is the interface to the rest of the + * system. The vnd_dev_t keeps track of various aspects like whether various + * operations, such as read, write and the frameio ioctls, are considered + * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for + * keeping track of things like the name of the device, if any, in /dev. The + * vnd_str_t, on the other hand manages aspects like buffer sizes and the actual + * data queues. However, ioctls that manipulate these properties all go through + * the vnd_dev_t to its associated vnd_str_t. + * + * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One + * for frames to transmit (write queue) and one for frames received (read + * queue). These data queues have a maximum size and attempting to add data + * beyond that maximum size will result in data being dropped. The sizes are + * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits + * in those buffers or has a reservation in those buffers while they are in vnd + * and waiting to be consumed by the user or by mac. + * + * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the + * available, negotiated, and currently active features. + * + * ---------------------- + * Data Path and gsqueues + * ---------------------- + * + * There's a lot of plumbing in vnd to get to the point where we can send data, + * but vnd's bread and butter is the data path, so it's worth diving into it in + * more detail. Data enters and exits the system from two ends. + * + * The first end is the vnd consumer. This comes in the form of read and write + * system calls as well as the frame I/O ioctls. The read and write system calls + * operate on a single frame at a time. 
Think of a frame as a single message + * that has come in off the wire, which may itself comprise multiple mblk_t's + * linked together in the kernel. readv(2) and writev(2) have the same + * limitations as read(2) and write(2). We enforce this as the system is + * required to fill up every uio(9S) buffer before moving onto the next one. + * This means that if you have a MTU sized buffer and two frames come in which + * are less than half of the MTU they must fill up the given iovec. Even if we + * didn't want to do this, we have no way of informing the supplier of the + * iovecs that they were only partially filled or where one frame ends and + * another begins. That's life, as such we have frame I/O which solves this + * problem. It allows for multiple frames to be consumed as well as for frames + * to be broken down into multiple vector components. + * + * The second end is the mac direct calls. As part of negotiating capabilities + * via dld, we give mac a function of ours to call when packets are received + * [vnd_mac_input()] and a callback to indicate that flow has been restored + * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can + * transmit data with. As part of the contract with mac, mac is allowed to flow + * control us by returning a cookie to the transmit function. When that happens, + * all outbound traffic is halted until our callback function is called and we + * can schedule drains. + * + * It's worth looking at these in further detail. We'll start with the rx path. + * + * + * | + * * . . . packets from gld + * | + * v + * +-------------+ + * | mac | + * +-------------+ + * | + * v + * +-------------+ + * | dld | + * +-------------+ + * | + * * . . . dld direct callback + * | + * v + * +---------------+ + * | vnd_mac_input | + * +---------------+ + * | + * v + * +---------+ +-------------+ + * | dropped |<--*---------| vnd_hooks | + * | by | . +-------------+ + * | hooks | . drop probe | + * +---------+ kstat bump * . 
. . Do we have free + * | buffer space? + * | + * no . | . yes + * . + . + * +---*--+------*-------+ + * | | + * * . . drop probe * . . recv probe + * | kstat bump | kstat bump + * v | + * +---------+ * . . fire pollin + * | freemsg | v + * +---------+ +-----------------------+ + * | vnd_str_t`vns_dq_read | + * +-----------------------+ + * ^ ^ + * +----------+ | | +---------+ + * | read(9E) |-->-+ +--<--| frameio | + * +----------+ +---------+ + * + * The rx path is rather linear. Packets come into us from mac. We always run + * them through the various hooks, and if they come out of that, we inspect the + * read data queue. If there is not enough space for a packet, we drop it. + * Otherwise, we append it to the data queue, and fire read notifications + * targeting anyone polling or doing blocking I/O on this device. Those + * consumers then drain the head of the data queue. + * + * The tx path is more complicated due to mac flow control. After any call into + * mac, we may have to potentially suspend writes and buffer data for an + * arbitrary amount of time. As such, we need to carefully track the total + * amount of outstanding data so that we don't waste kernel memory. This is + * further complicated by the fact that mac will asynchronously tell us when our + * flow has been resumed. + * + * For data to be able to enter the system, it needs to be able to take a + * reservation from the write data queue. Once the reservation has been + * obtained, we enter the gsqueue so that we can actually append it. We use + * gsqueues (serialization queues) to ensure that packets are manipulated in + * order as we deal with the draining and appending packets. We also leverage + * its worker thread to help us do draining after mac has restored our flow. 
+ * + * The following image describes the flow: + * + * +-----------+ +--------------+ +-------------------------+ +------+ + * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done | + * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+ + * +-----------+ +--------------+ . +-------------------------+ + * | ^ . + * | | . reserve space from gsqueue + * | | | + * queue . . . * | space v + * full | * . . . avail +------------------------+ + * v | | vnd_squeue_tx_append() | + * +--------+ +------------+ +------------------------+ + * | EAGAIN |<--*------| Non-block? |<-+ | + * +--------+ . +------------+ | v + * . yes v | wait +--------------+ + * no . .* * . . for | append chain | + * +----+ space | to outgoing | + * | mblk chain | + * from gsqueue +--------------+ + * | | + * | +-------------------------------------------------+ + * | | + * | | yes . . . + * v v . + * +-----------------------+ +--------------+ . +------+ + * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done | + * +-----------------------+ +--------------+ +------+ + * | | + * +---------------------------------|---------------------+ + * | | tx | + * | no . . * queue . . * + * | flow controlled . | empty * . fire pollout + * | . v | if mblk_t's + * +-------------+ . +---------------------+ | sent + * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+ + * | flags | +---------------------+ | + * +-------------+ More data | | | More data | + * and limit ^ v * . . and limit ^ + * not reached . . 
* | | reached | + * +----+ | | + * v | + * +----------+ +-------------+ +---------------------------+ + * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with | + * | control | | block flags | | vnd_squeue_tx_drain() and | + * | callback | +-------------+ | GSQUEUE_FILL flag, iff | + * +----------+ | not already scheduled | + * +---------------------------+ + * + * The final path taken for a given write(9E)/frameio ioctl depends on whether + * or not the vnd_dev_t is non-blocking. That controls the initial path of + * trying to take a reservation in write data queue. If the device is in + * non-blocking mode, we'll return EAGAIN when there is not enough space + * available, otherwise, the calling thread blocks on the data queue. + * + * Today when we call into vnd_squeue_tx_drain() we will not try to drain the + * entire queue, as that could be quite large and we don't want to necessarily + * keep the thread that's doing the drain until it's been finished. Not only + * could more data be coming in, but the draining thread could be a userland + * thread that has more work to do. We have two limits today. There is an upper + * bound on the total amount of data and the total number of mblk_t chains. If + * we hit either limit, then we will schedule another drain in the gsqueue and + * go from there. + * + * It's worth taking some time to describe how we interact with gsqueues. vnd + * has a gsqueue_set_t for itself. It's important that it has its own set, as + * the profile of work that vnd does is different from other sub-systems in the + * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue. + * Unlike TCP/IP which uses an gsqueue for per TCP connection, we end up + * maintaining one for a given device. 
Because of that, we want to use a + * pseudo-random one to try and spread out the load, and picking one at random + * is likely to be just as good as any fancy algorithm we might come up with, + * especially as any two devices could have radically different transmit + * profiles. + * + * While some of the write path may seem complicated, it does allow us to + * maintain an important property. Once we have acknowledged a write(9E) or + * frameio ioctl, we will not drop the packet, excepting something like ipf via + * the firewall hooks. + * + * There is one other source of flow control that can exist in the system which + * is in the form of a barrier. The barrier is an internal mechanism used for + * ensuring that a gsqueue is drained for a given device. We use this as part + * of tearing down. Specifically we disable the write path so nothing new can be + * inserted into the gsqueue and then insert a barrier block. Once the barrier + * block comes out of the gsqueue, then we know that nothing else in the gsqueue + * could still refer to the vnd_str_t that is being destroyed. + * + * --------------------- + * vnd, zones, netstacks + * --------------------- + * + * vnd devices are scoped to datalinks and datalinks are scoped to a netstack. + * Because of that, vnd is also a netstack module. It registers with the + * netstack sub-system and receives callbacks every time a netstack is created, + * shut down, and destroyed. The netstack callbacks drive the creation and + * destruction of the vnd_pnsd_t structures. + * + * Recall from the earlier architecture diagrams that every vnd device is scoped + * to a netstack and known about by a given vnd_pnsd_t. When that netstack is + * torn down, we also tear down any vnd devices that are hanging around. When + * the netstack is torn down, we know that any zones that are scoped to that + * netstack are being shut down and have no processes remaining. 
This is going + * to be the case whether they are shared or exclusive stack zones. We have to + * perform a careful dance. + * + * There are two different callbacks that happen on tear down, the first is a + * shutdown callback, the second is a destroy callback. When the shutdown + * callback is fired we need to prepare for the netstack to go away and ensure + * that nothing can continue to persist itself. + * + * More specifically, when we get notice of a stack being shutdown we first + * remove the netstack from the global netstack list to ensure that no one new + * can come in and find the netstack and get a reference to it. After that, we + * notify the neti hooks that they're going away. Once that's all done, we get + * to the heart of the matter. + * + * When shutting down there could be any number of outstanding contexts that + * have a reference on the vnd_pnsd_t and on the individual links. However, we + * know that no one new will be able to find the vnd_pnsd_t. To account for + * things that have existing references we mark the vnd_pnsd_t`vpnd_flags with + * VND_NS_CONDEMNED. This is checked by code paths that wish to append a device + * to the netstack's list. If this is set, then they must not append to it. + * Once this is set, we know that the netstack's list of devices can never grow, + * only shrink. + * + * Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that + * the container for the device is being destroyed and that we should not allow + * additional references to the device to be created, whether via open, or + * linking. The presence of this bit also allows things like the list ioctl and + * sdev to know not to consider its existence. At the conclusion of this being + * set, we know that no one else should be able to obtain a new reference to the + * device. + * + * Once that has been set for all devices, we go through and remove any existing + * links that have been established in sdev. 
Because doing that may cause the + * final reference for the device to be dropped, which still has a reference to + * the netstack, we have to restart our walk due to dropped locks. We know that + * this walk will eventually complete because the device cannot be relinked and + * no new devices will be attached in this netstack due to VND_NS_CONDEMNED. + * Once that's finished, the shutdown callback returns. + * + * When we reach the destroy callback, we simply wait for references on the + * netstack to disappear. Because the zone has been shut down, all processes in + * it that have open references have been terminated and reaped. Any threads + * that are newly trying to reference it will fail. However, there is one thing + * that can halt this that we have no control over, which is the global zone + * holding open a reference to the device. In this case the zone halt will hang + * in vnd_stack_destroy. Once the last reference is dropped we finish destroying + * the netinfo hooks and free the vnd_pnsd_t. + * + * ---- + * sdev + * ---- + * + * vnd registers a sdev plugin which allows it to dynamically fill out /dev/vnd + * for both the global and non-global zones. In any given zone we always supply + * a control node via /dev/vnd/ctl. This is the self-cloning node. Each zone + * will also have an entry per-link in that zone under /dev/vnd/%datalink, eg. + * if a link was named net0, there would be a /dev/vnd/net0. The global zone can + * also see every link for every zone, ala /dev/net, under + * /dev/vnd/zone/%zonename/%datalink, eg. if a zone named 'turin' had a vnd + * device named net0, the global zone would have /dev/vnd/zone/turin/net0. + * + * The sdev plugin has three interfaces that it supplies back to sdev. One is to + * validate that a given node is still valid. The next is a callback from sdev + * to say that it is no longer using the node. The third and final one is from + * sdev where it asks us to fill a directory. 
All of the heavy lifting is done + * in directory filling and in validation. We opt not to maintain a reference on + * the device while there is an sdev node present. This makes the removal of + * nodes much simpler and most of the possible failure modes shouldn't cause any + * real problems. For example, the open path has to handle both dev_t's which no + * longer exist and which are no longer linked. + * + * ----- + * hooks + * ----- + * + * Like IP, vnd sends all L3 packets through its firewall hooks. Currently vnd + * provides these for L3 IP and IPv6 traffic. Each netstack provides these hooks + * in a minimal fashion. While we will allow traffic to be filtered through the + * hooks, we do not provide means for packet injection or additional inspection + * at this time. There are a total of four different events created: + * + * o IPv4 physical in + * o IPv4 physical out + * o IPv6 physical in + * o IPv6 physical out + * + * --------------- + * Synchronization + * --------------- + * + * To make our synchronization simpler, we've put more effort into making the + * metadata/setup paths do more work. That work allows the data paths to make + * assumptions around synchronization that simplify the general case. Each major + * structure, the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t is + * annotated with the protection that its members receive. The following + * annotations are used: + * + * A Atomics; these values are only modified using atomic operations. + * Currently this only applies to kstat values. + * E Existence; no lock is needed to access this member, it does not + * change while the structure is valid. + * GL Global Lock; these members are protected by the global + * vnd_dev_lock. + * L Locked; access to the member is controlled by a lock that is in + * the structure. + * NSL netstack lock; this member is protected by the containing + * netstack. This only applies to the vnd_dev_t`vdd_nslink. 
+ * X This member is special, and is discussed in this section. + * + * In addition to locking, we also have reference counts on the vnd_dev_t and + * the vnd_pnsd_t. The reference counts describe the lifetimes of the structures. + * With rare exception, once a reference count is decremented, the consumer + * should not assume that the data is valid any more. The only exception to this + * is the case where we're removing an extant reference count from a link into + * /devices or /dev. Reference counts are obtained on these structures as a part + * of looking them up. + * + * # Global Lock Ordering + * ###################### + * + * The following is the order that you must take locks in vnd: + * + * 1) vnd`vnd_dev_lock + * 2) vnd_pnsd_t`vpnd_lock + * 3) vnd_dev_t`vdd_lock + * 4) vnd_str_t`vns_lock + * 5) vnd_data_queue_t`vdq_lock + * + * One must adhere to the following rules: + * + * o You must acquire a lower numbered lock before a higher numbered lock. + * o It is NOT legal to hold two locks of the same level concurrently, eg. you + * can not hold two different vnd_dev_t's vdd_lock at the same time. + * o You may release locks in any order. + * o If you release a lock, you must honor the locking rules before acquiring + * it again. + * o You should not hold any locks when calling any of the rele functions. + * + * # Special Considerations + * ######################## + * + * While most of the locking is what's expected, it's worth going into the + * special nature that a few members hold. Today, only two structures have + * special considerations: the vnd_dev_t and the vnd_str_t. All members with + * special considerations have an additional annotation that describes how you + * should interact with them. + * + * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is + * attached or in the process of attaching. If the code path that goes through + * requires an attached vnd_dev_t, eg. 
the data path and tear down path, then it + * is always legal to dereference that member without a lock held. When they are + * added to the system, they should be done under the vdd_lock and done as part + * of setting the VND_D_ATTACH_INFLIGHT flag. These should not change during the + * lifetime of the vnd_dev_t. + * + * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it + * always exists as it is a part of the structure. The only time that it's valid + * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag + * set or during tear down. Outside of those paths which are naturally + * serialized, there is no explicit locking around the member. + * + * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not + * initially set as part of creating the structure, but are set as part of + * responding to the association ioctl. Anything in the data path or metadata + * path that requires association may assume that they exist, as we do not kick + * off the state machine until they're set. + * + * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The + * members are designed to be used as part of various operations with the + * gsqueues. A lock isn't needed to use them, but to work with them, the + * appropriate flag in the vnd_str_t`vns_flags must have been set by the current + * thread. Otherwise, it is always fair game to refer to their addresses. Their + * contents are ignored by vnd, but some members are manipulated by the gsqueue + * subsystem. 
+ */
+
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/modctl.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/ddi.h>
+#include <sys/ethernet.h>
+#include <sys/stropts.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/ksynch.h>
+#include <sys/taskq_impl.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/dlpi.h>
+#include <sys/cred.h>
+#include <sys/id_space.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+#include <sys/policy.h>
+#include <sys/sunldi.h>
+#include <sys/cred.h>
+#include <sys/strsubr.h>
+#include <sys/poll.h>
+#include <sys/neti.h>
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+#include <sys/vlan.h>
+#include <sys/dld.h>
+#include <sys/mac_client.h>
+#include <sys/netstack.h>
+#include <sys/fs/sdev_plugin.h>
+#include <sys/kstat.h>
+#include <sys/atomic.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/gsqueue.h>
+#include <sys/smt.h>
+
+#include <inet/ip.h>
+#include <inet/ip6.h>
+
+#include <sys/vnd.h>
+
+/*
+ * Globals
+ *
+ * vnd_dev_cache backs vnd_dev_t allocations and vnd_minors is the id space
+ * that their minor numbers are returned to (see vnd_dev_free()). vnd_sqset is
+ * the gsqueue set that vnd keeps for itself, as described in the big theory
+ * statement. vnd_dev_list and vnd_nsd_list are the global lists that
+ * vnd_dev_lookup() and vnd_nsd_lookup() walk.
+ */
+static dev_info_t *vnd_dip;
+static taskq_t *vnd_taskq;
+static kmem_cache_t *vnd_str_cache;
+static kmem_cache_t *vnd_dev_cache;
+static kmem_cache_t *vnd_pnsd_cache;
+static id_space_t *vnd_minors;
+static int vnd_list_init = 0;
+static sdev_plugin_hdl_t vnd_sdev_hdl;
+static gsqueue_set_t *vnd_sqset;
+
+static kmutex_t vnd_dev_lock;
+static list_t vnd_dev_list;	/* Protected by the vnd_dev_lock */
+static list_t vnd_nsd_list;	/* Protected by the vnd_dev_lock */
+
+/*
+ * STREAMs ioctls
+ *
+ * The STREAMs ioctls are internal to vnd. No one should be seeing them, as such
+ * they aren't a part of the header file.
+ *
+ * VND_STRIOC is the common base value: the characters 'v', 'n' and 'd' occupy
+ * the upper three bytes and 0x80 the low byte. Individual commands are formed
+ * by or'ing a small command number into it.
+ */
+#define	VND_STRIOC	(('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80)
+
+/*
+ * Private ioctl to associate a given streams instance with a minor instance of
+ * the character device.
+ */
+#define	VND_STRIOC_ASSOCIATE	(VND_STRIOC | 0x1)
+
+typedef struct vnd_strioc_associate {
+	minor_t		vsa_minor;	/* minor device node */
+	netstackid_t	vsa_nsid;	/* netstack id */
+	vnd_errno_t	vsa_errno;	/* errno */
+} vnd_strioc_associate_t;
+
+typedef enum vnd_strioc_state {
+	VSS_UNKNOWN = 0,
+	VSS_COPYIN = 1,
+	VSS_COPYOUT = 2,
+} vnd_strioc_state_t;
+
+typedef struct vnd_strioc {
+	vnd_strioc_state_t vs_state;	/* NOTE(review): presumably which */
+					/* copy phase is pending -- confirm */
+	caddr_t		vs_addr;	/* NOTE(review): presumably the user */
+					/* address being copied -- confirm */
+} vnd_strioc_t;
+
+/*
+ * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extant tags. Though
+ * really, overlap is at the end of the day, inevitable.
+ */
+#define	VND_SQUEUE_TAG_TX_DRAIN		0x42
+#define	VND_SQUEUE_TAG_MAC_FLOW_CONTROL	0x43
+#define	VND_SQUEUE_TAG_VND_WRITE	0x44
+#define	VND_SQUEUE_TAG_ND_FRAMEIO_WRITE	0x45
+#define	VND_SQUEUE_TAG_STRBARRIER	0x46
+
+/*
+ * vnd reserved names. These are names which are reserved by vnd and thus
+ * shouldn't be used by some external program. "ctl" is the control node
+ * (/dev/vnd/ctl) and "zone" is the last component of VND_SDEV_ZROOT. The
+ * array is NULL-terminated.
+ */
+static char *vnd_reserved_names[] = {
+	"ctl",
+	"zone",
+	NULL
+};
+
+/*
+ * vnd's DTrace probe macros
+ *
+ * DTRACE_VND* are all for a stable provider. We also have an unstable internal
+ * set of probes for reference count manipulation.
+ *
+ * Note that every macro below already ends its expansion with a semicolon, so
+ * an invocation written as `DTRACE_VND4(...);' expands to the probe statement
+ * followed by an empty statement. Be careful when using them as the sole body
+ * of an unbraced if/else.
+ */
+#define	DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3)	\
+	DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3);
+
+#define	DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \
+	DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
+	    type4, arg4);
+
+#define	DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3,	\
+	    type4, arg4, type5, arg5)					\
+	DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \
+	    type4, arg4, type5, arg5);
+
+#define	DTRACE_VND_REFINC(vdp) \
+	DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+#define	DTRACE_VND_REFDEC(vdp) \
+	DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref);
+
+
+/*
+ * Tunables
+ */
+size_t vnd_vdq_default_size = 1024 * 64;	/* 64 KB */
+size_t vnd_vdq_hard_max = 1024 * 1024 * 4;	/* 4 MB */
+
+/*
+ * These numbers are designed as per-device tunables that are applied when a new
+ * vnd device is attached. They're a rough stab at what may be a reasonable
+ * amount of work to do in one burst in an squeue.
+ */
+size_t vnd_flush_burst_size = 1520 * 10;	/* 10 1500 MTU packets */
+size_t vnd_flush_nburst = 10;			/* 10 frames */
+
+/*
+ * Constants related to our sdev plugins
+ */
+#define	VND_SDEV_NAME	"vnd"
+#define	VND_SDEV_ROOT	"/dev/vnd"
+#define	VND_SDEV_ZROOT	"/dev/vnd/zone"
+
+/*
+ * vnd relies on privileges, not mode bits to limit access. As such, device
+ * files are read-write to everyone.
+ */
+#define	VND_SDEV_MODE	(S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | \
+	S_IROTH | S_IWOTH)
+
+/*
+ * Statistic macros
+ *
+ * VND_STAT_INC atomically adds val to the named 64-bit kstat field in the
+ * vnd_str_t's vns_ksdata. The VND_LATENCY_* values are the named durations
+ * expressed in nanoseconds (presumably for comparison against hrtime_t
+ * deltas -- confirm against the users). Note that VND_LATENCY_10S does not fit
+ * in a 32-bit int.
+ */
+#define	VND_STAT_INC(vsp, field, val) \
+	atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val)
+#define	VND_LATENCY_1MS		1000000
+#define	VND_LATENCY_10MS	10000000
+#define	VND_LATENCY_100MS	100000000
+#define	VND_LATENCY_1S		1000000000
+#define	VND_LATENCY_10S		10000000000
+
+/*
+ * Constants for vnd hooks
+ *
+ * vnd_bcast_addr is the all-ones Ethernet broadcast address. vnd_ipv4_mcast
+ * (01:00:5E) and vnd_ipv6_mcast (33:33) are the leading bytes of Ethernet
+ * destination addresses used for IPv4 and IPv6 multicast respectively; only
+ * the first *_MCAST_LEN bytes of an address are meant to be compared to them.
+ */
+static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+#define	IPV4_MCAST_LEN	3
+static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E };
+#define	IPV6_MCAST_LEN	2
+static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 };
+
+/*
+ * vnd internal data structures and types
+ */
+
+struct vnd_str;
+struct vnd_dev;
+struct vnd_pnsd;
+
+/*
+ * As part of opening the device stream we need to properly communicate with our
+ * underlying stream. This is a bit of an asynchronous dance and we need to
+ * properly work with dld to get everything set up. We have to initiate the
+ * conversation with dld and as such we keep track of our state here.
+ */
+typedef enum vnd_str_state {
+	VNS_S_INITIAL = 0,
+	VNS_S_INFO_SENT,	/* set by vnd_st_sinfo() with DL_INFO_REQ */
+	VNS_S_EXCLUSIVE_SENT,
+	VNS_S_ATTACH_SENT,
+	VNS_S_BIND_SENT,
+	VNS_S_SAP_PROMISC_SENT,
+	VNS_S_MULTI_PROMISC_SENT,
+	VNS_S_RX_ONLY_PROMISC_SENT,
+	VNS_S_FIXUP_PROMISC_SENT,
+	VNS_S_CAPAB_Q_SENT,
+	VNS_S_CAPAB_E_SENT,
+	VNS_S_ONLINE,
+	VNS_S_SHUTTING_DOWN,
+	VNS_S_MULTICAST_PROMISCOFF_SENT,
+	VNS_S_SAP_PROMISCOFF_SENT,
+	VNS_S_UNBIND_SENT,
+	VNS_S_ZOMBIE
+} vnd_str_state_t;
+
+typedef enum vnd_str_flags {
+	VNS_F_NEED_ZONE = 0x1,
+	VNS_F_TASKQ_DISPATCHED = 0x2,
+	VNS_F_CONDEMNED = 0x4,
+	VNS_F_FLOW_CONTROLLED = 0x8,
+	VNS_F_DRAIN_SCHEDULED = 0x10,	/* presumably guards vns_drainblk */
+	VNS_F_BARRIER = 0x20,		/* presumably guards vns_barrierblk */
+	VNS_F_BARRIER_DONE = 0x40
+} vnd_str_flags_t;
+
+typedef enum vnd_capab_flags {
+	VNS_C_HCKSUM = 0x1,
+	VNS_C_DLD = 0x2,
+	VNS_C_DIRECT = 0x4,
+	VNS_C_HCKSUM_BADVERS = 0x8
+} vnd_capab_flags_t;
+
+/*
+ * Definitions to interact with direct callbacks
+ *
+ * vnd_rx_t is the receive callback shape; vnd_mac_drop_input() is one
+ * function with this signature.
+ */
+typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *,
+    mac_header_info_t *);
+typedef uintptr_t vnd_mac_cookie_t;
+/* DLD Direct capability function */
+typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t);
+/* DLD Direct tx function */
+typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t);
+/* DLD Direct function to set flow control callback */
+typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t),
+    void *);
+/* DLD Direct function to see if flow controlled still */
+typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t);
+
+/*
+ * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of.
+ * Each vsc_*_f member is a DLD direct function pointer of the matching
+ * vnd_dld_*_t type and is paired with the opaque vsc_*_hdl pointer that sits
+ * next to it (presumably the first argument to pass to that function --
+ * confirm against the capability negotiation code).
+ */
+typedef struct vnd_str_capab {
+	vnd_capab_flags_t	vsc_flags;
+	t_uscalar_t		vsc_hcksum_opts;
+	vnd_dld_cap_t		vsc_capab_f;
+	void			*vsc_capab_hdl;
+	vnd_dld_tx_t		vsc_tx_f;
+	void			*vsc_tx_hdl;
+	vnd_dld_set_fcb_t	vsc_set_fcb_f;
+	void			*vsc_set_fcb_hdl;
+	vnd_dld_is_fc_t		vsc_is_fc_f;
+	void			*vsc_is_fc_hdl;
+	vnd_mac_cookie_t	vsc_fc_cookie;
+	void			*vsc_tx_fc_hdl;
+} vnd_str_capab_t;
+
+/*
+ * The vnd_data_queue is a simple construct for storing a series of messages in
+ * a queue. Messages are linked through b_next from vdq_head to vdq_tail.
+ * vdq_cur counts the bytes charged to the queue, which includes both queued
+ * messages and outstanding reservations (see vnd_dq_reserve()); it is never
+ * allowed to grow past vdq_max by vnd_dq_push() or vnd_dq_reserve().
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_data_queue {
+	struct vnd_str	*vdq_vns;		/* E */
+	kmutex_t	vdq_lock;
+	kcondvar_t	vdq_ready;		/* Uses vdq_lock */
+	ssize_t		vdq_max;		/* L */
+	ssize_t		vdq_cur;		/* L */
+	mblk_t		*vdq_head;		/* L */
+	mblk_t		*vdq_tail;		/* L */
+} vnd_data_queue_t;
+
+typedef struct vnd_str_stat {
+	kstat_named_t	vks_rbytes;
+	kstat_named_t	vks_rpackets;
+	kstat_named_t	vks_obytes;
+	kstat_named_t	vks_opackets;
+	kstat_named_t	vks_nhookindrops;	/* see vnd_drop_hook_in() */
+	kstat_named_t	vks_nhookoutdrops;	/* see vnd_drop_hook_out() */
+	kstat_named_t	vks_ndlpidrops;		/* see vnd_drop_ctl() */
+	kstat_named_t	vks_ndataindrops;	/* see vnd_drop_in() */
+	kstat_named_t	vks_ndataoutdrops;	/* see vnd_drop_out() */
+	kstat_named_t	vks_tdrops;		/* bumped by every dropper */
+	kstat_named_t	vks_linkname;
+	kstat_named_t	vks_zonename;
+	kstat_named_t	vks_nmacflow;
+	kstat_named_t	vks_tmacflow;
+	kstat_named_t	vks_mac_flow_1ms;
+	kstat_named_t	vks_mac_flow_10ms;
+	kstat_named_t	vks_mac_flow_100ms;
+	kstat_named_t	vks_mac_flow_1s;
+	kstat_named_t	vks_mac_flow_10s;
+} vnd_str_stat_t;
+
+/*
+ * vnd stream structure
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ *
+ * vns_dlpi_inc is the head of a b_next-linked list of incoming DLPI messages
+ * maintained by vnd_dlpi_inc_push() and vnd_dlpi_inc_pop(). vns_ksdata is
+ * the kstat data updated through VND_STAT_INC().
+ */
+typedef struct vnd_str {
+	kmutex_t	vns_lock;
+	kcondvar_t	vns_cancelcv;		/* Uses vns_lock */
+	kcondvar_t	vns_barriercv;		/* Uses vns_lock */
+	kcondvar_t	vns_stcv;		/* Uses vns_lock */
+	vnd_str_state_t	vns_state;		/* L */
+	vnd_str_state_t	vns_laststate;		/* L */
+	vnd_errno_t	vns_errno;		/* L */
+	vnd_str_flags_t	vns_flags;		/* L */
+	vnd_str_capab_t	vns_caps;		/* L */
+	taskq_ent_t	vns_tqe;		/* L */
+	vnd_data_queue_t vns_dq_read;		/* E */
+	vnd_data_queue_t vns_dq_write;		/* E */
+	mblk_t		*vns_dlpi_inc;		/* L */
+	queue_t		*vns_rq;		/* E */
+	queue_t		*vns_wq;		/* E */
+	queue_t		*vns_lrq;		/* E */
+	t_uscalar_t	vns_dlpi_style;		/* L */
+	t_uscalar_t	vns_minwrite;		/* L */
+	t_uscalar_t	vns_maxwrite;		/* L */
+	hrtime_t	vns_fclatch;		/* L */
+	hrtime_t	vns_fcupdate;		/* L */
+	kstat_t		*vns_kstat;		/* E */
+	gsqueue_t	*vns_squeue;		/* E */
+	mblk_t		vns_drainblk;		/* E + X */
+	mblk_t		vns_barrierblk;		/* E + X */
+	vnd_str_stat_t	vns_ksdata;		/* A */
+	size_t		vns_nflush;		/* L */
+	size_t		vns_bsize;		/* L */
+	struct vnd_dev	*vns_dev;		/* E + X */
+	struct vnd_pnsd	*vns_nsd;		/* E + X */
+} vnd_str_t;
+
+typedef enum vnd_dev_flags {
+	VND_D_ATTACH_INFLIGHT	= 0x001,
+	VND_D_ATTACHED		= 0x002,
+	VND_D_LINK_INFLIGHT	= 0x004,
+	VND_D_LINKED		= 0x008,
+	VND_D_CONDEMNED		= 0x010, /* set by vnd_dev_rele() at ref 0 */
+	VND_D_ZONE_DYING	= 0x020, /* see "vnd, zones, netstacks" */
+	VND_D_OPENED		= 0x040
+} vnd_dev_flags_t;
+
+/*
+ * This represents the data associated with a minor device instance.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ *
+ * vdd_link places the device on the global vnd_dev_list and vdd_nslink on its
+ * netstack's vpnd_dev_list. vdd_ref is the reference count manipulated by
+ * vnd_dev_lookup(), vnd_dev_ref() and vnd_dev_rele().
+ */
+typedef struct vnd_dev {
+	kmutex_t	vdd_lock;
+	list_node_t	vdd_link;			/* GL */
+	list_node_t	vdd_nslink;			/* NSL */
+	int		vdd_ref;			/* L */
+	vnd_dev_flags_t	vdd_flags;			/* L */
+	minor_t		vdd_minor;			/* E */
+	dev_t		vdd_devid;			/* E */
+	ldi_ident_t	vdd_ldiid;			/* E */
+	ldi_handle_t	vdd_ldih;			/* X */
+	cred_t		*vdd_cr;			/* X */
+	vnd_str_t	*vdd_str;			/* L */
+	struct pollhead	vdd_ph;				/* E */
+	struct vnd_pnsd *vdd_nsd;			/* E + X */
+	char		vdd_datalink[VND_NAMELEN];	/* L */
+	char		vdd_lname[VND_NAMELEN];		/* L */
+} vnd_dev_t;
+
+typedef enum vnd_pnsd_flags {
+	VND_NS_CONDEMNED	= 0x1
+} vnd_pnsd_flags_t;
+
+/*
+ * Per netstack data structure.
+ *
+ * vpnd_ref is the reference count taken by vnd_nsd_lookup() and
+ * vnd_nsd_ref() and dropped by vnd_nsd_rele(); vpnd_ref_change is
+ * broadcast by vnd_nsd_ref() and vnd_nsd_rele() after they change it.
+ *
+ * See synchronization section of the big theory statement for member
+ * annotations.
+ */
+typedef struct vnd_pnsd {
+	list_node_t	vpnd_link;	/* protected by global dev lock */
+	zoneid_t	vpnd_zid;		/* E */
+	netstackid_t	vpnd_nsid;		/* E */
+	boolean_t	vpnd_hooked;		/* E */
+	net_handle_t	vpnd_neti_v4;		/* E */
+	hook_family_t	vpnd_family_v4;		/* E */
+	hook_event_t	vpnd_event_in_v4;	/* E */
+	hook_event_t	vpnd_event_out_v4;	/* E */
+	hook_event_token_t vpnd_token_in_v4;	/* E */
+	hook_event_token_t vpnd_token_out_v4;	/* E */
+	net_handle_t	vpnd_neti_v6;		/* E */
+	hook_family_t	vpnd_family_v6;		/* E */
+	hook_event_t	vpnd_event_in_v6;	/* E */
+	hook_event_t	vpnd_event_out_v6;	/* E */
+	hook_event_token_t vpnd_token_in_v6;	/* E */
+	hook_event_token_t vpnd_token_out_v6;	/* E */
+	kmutex_t	vpnd_lock;	/* Protects remaining members */
+	kcondvar_t	vpnd_ref_change;	/* Uses vpnd_lock */
+	int		vpnd_ref;		/* L */
+	vnd_pnsd_flags_t vpnd_flags;		/* L */
+	list_t		vpnd_dev_list;		/* L */
+} vnd_pnsd_t;
+
+static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *);
+
+/*
+ * Drop function signature.
+ *
+ * A dropper takes the stream the message belonged to, the message itself
+ * (which may be NULL), and a human readable reason. The vnd_drop_* functions
+ * below all follow the same pattern: fire a DTrace probe, free the message if
+ * it is non-NULL, and then bump both a counter specific to the kind of drop
+ * and the total drop counter, vks_tdrops. vnd_drop_panic() is the exception;
+ * it exists for paths where a drop must never happen.
+ */
+typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *);
+
+static void
+vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_ndlpidrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_ndataindrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_ndataoutdrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	/* Shares the drop__in probe with vnd_drop_in(); only the stat differs */
+	DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_nhookindrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+static void
+vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	/* Shares the drop__out probe with vnd_drop_out() */
+	DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *,
+	    mp, const char *, reason);
+	if (mp != NULL) {
+		freemsg(mp);
+	}
+	VND_STAT_INC(vsp, vks_nhookoutdrops, 1);
+	VND_STAT_INC(vsp, vks_tdrops, 1);
+}
+
+/* ARGSUSED */
+static void
+vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason)
+{
+	panic("illegal vnd drop");
+}
+
+/* ARGSUSED */
+static void
+vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain,
+    mac_header_info_t *mhip)
+{
+	mblk_t *mp;
+
+	/* Walk the b_next chain, dropping each message individually. */
+	while (mp_chain != NULL) {
+		mp = mp_chain;
+		mp_chain = mp->b_next;	/* grab the next before mp is freed */
+		vnd_drop_hook_in(vsp, mp, "stream not associated");
+	}
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup(netstackid_t nsid)
+{
+	vnd_pnsd_t *nsp;
+
+	mutex_enter(&vnd_dev_lock);
+	for (nsp = list_head(&vnd_nsd_list); nsp != NULL;
+	    nsp = list_next(&vnd_nsd_list, nsp)) {
+		if (nsp->vpnd_nsid == nsid) {
+			mutex_enter(&nsp->vpnd_lock);
+			VERIFY(nsp->vpnd_ref >= 0);
+			nsp->vpnd_ref++;	/* hold for the caller */
+			mutex_exit(&nsp->vpnd_lock);
+			break;
+		}
+	}
+	mutex_exit(&vnd_dev_lock);
+	return (nsp);	/* NULL if the walk found no match */
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zid(zoneid_t zid)
+{
+	netstack_t *ns;
+	vnd_pnsd_t *nsp;
+	ns = netstack_find_by_zoneid(zid);
+	if (ns == NULL)
+		return (NULL);
+	nsp = vnd_nsd_lookup(ns->netstack_stackid);
+	netstack_rele(ns);	/* only the vnd_pnsd_t hold is returned */
+	return (nsp);
+}
+
+static vnd_pnsd_t *
+vnd_nsd_lookup_by_zonename(char *zname)
+{
+	zone_t *zonep;
+	vnd_pnsd_t *nsp;
+
+	zonep = zone_find_by_name(zname);
+	if (zonep == NULL)
+		return (NULL);
+
+	nsp = vnd_nsd_lookup_by_zid(zonep->zone_id);
+	zone_rele(zonep);	/* only the vnd_pnsd_t hold is returned */
+	return (nsp);
+}
+
+static void
+vnd_nsd_ref(vnd_pnsd_t *nsp)
+{
+	mutex_enter(&nsp->vpnd_lock);
+	/*
+	 * This can only be used on something that has been obtained through
+	 * some other means. As such, the caller should already have a reference
+	 * before adding another one. This function should not be used as a
+	 * means of creating the initial reference.
+	 */
+	VERIFY(nsp->vpnd_ref > 0);
+	nsp->vpnd_ref++;
+	mutex_exit(&nsp->vpnd_lock);
+	cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+static void
+vnd_nsd_rele(vnd_pnsd_t *nsp)
+{
+	mutex_enter(&nsp->vpnd_lock);
+	VERIFY(nsp->vpnd_ref > 0);
+	nsp->vpnd_ref--;
+	mutex_exit(&nsp->vpnd_lock);
+	/* Wake anyone waiting on the count, e.g. the netstack destroy path */
+	cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+static vnd_dev_t *
+vnd_dev_lookup(minor_t m)
+{
+	vnd_dev_t *vdp;
+	mutex_enter(&vnd_dev_lock);
+	for (vdp = list_head(&vnd_dev_list); vdp != NULL;
+	    vdp = list_next(&vnd_dev_list, vdp)) {
+		if (vdp->vdd_minor == m) {
+			mutex_enter(&vdp->vdd_lock);
+			VERIFY(vdp->vdd_ref > 0);
+			vdp->vdd_ref++;		/* hold for the caller */
+			DTRACE_VND_REFINC(vdp);
+			mutex_exit(&vdp->vdd_lock);
+			break;
+		}
+	}
+	mutex_exit(&vnd_dev_lock);
+	return (vdp);	/* NULL if no device has minor m */
+}
+
+static void
+vnd_dev_free(vnd_dev_t *vdp)
+{
+	/*
+	 * When the STREAM exists we need to go through and make sure
+	 * communication gets torn down. As part of closing the stream, we
+	 * guarantee that nothing else should be able to enter the stream layer
+	 * at this point. That means no one should be able to call
+	 * read(),write() or one of the frameio ioctls.
+	 */
+	if (vdp->vdd_flags & VND_D_ATTACHED) {
+		(void) ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+		crfree(vdp->vdd_cr);
+		vdp->vdd_cr = NULL;
+
+		/*
+		 * We have to remove ourselves from our parents list now. It is
+		 * really quite important that we have already set the condemned
+		 * flag here so that our containing netstack basically knows
+		 * that we're on the way down and knows not to wait for us. It's
+		 * also important that we do that before we put a rele on the
+		 * the device as that is the point at which it will check again.
+		 */
+		mutex_enter(&vdp->vdd_nsd->vpnd_lock);
+		list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp);
+		mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+		vnd_nsd_rele(vdp->vdd_nsd);
+		vdp->vdd_nsd = NULL;
+	}
+	ASSERT(vdp->vdd_flags & VND_D_CONDEMNED);
+	id_free(vnd_minors, vdp->vdd_minor);	/* minor may now be reused */
+	mutex_destroy(&vdp->vdd_lock);
+	kmem_cache_free(vnd_dev_cache, vdp);
+}
+
+static void
+vnd_dev_ref(vnd_dev_t *vdp)
+{
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_ref > 0);	/* caller must already hold a ref */
+	vdp->vdd_ref++;
+	DTRACE_VND_REFINC(vdp);
+	mutex_exit(&vdp->vdd_lock);
+}
+
+/*
+ * As part of releasing the hold on this we may tear down a given vnd_dev_t As
+ * such we need to make sure that we grab the list lock first before grabbing
+ * the vnd_dev_t's lock to ensure proper lock ordering.
+ */
+static void
+vnd_dev_rele(vnd_dev_t *vdp)
+{
+	mutex_enter(&vnd_dev_lock);
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_ref > 0);
+	vdp->vdd_ref--;
+	DTRACE_VND_REFDEC(vdp);
+	if (vdp->vdd_ref > 0) {
+		mutex_exit(&vdp->vdd_lock);
+		mutex_exit(&vnd_dev_lock);
+		return;
+	}
+
+	/*
+	 * Once we've removed this from the list below, we can go ahead and
+	 * drop the list lock. No one else can find this device and reference
+	 * it. As its reference count is zero, it by definition does not have
+	 * any remaining entries in /devices that could lead someone back to
+	 * this.
+	 */
+	vdp->vdd_flags |= VND_D_CONDEMNED;
+	list_remove(&vnd_dev_list, vdp);
+	mutex_exit(&vdp->vdd_lock);
+	mutex_exit(&vnd_dev_lock);
+
+	/* Both locks are dropped before the device is torn down and freed */
+	vnd_dev_free(vdp);
+}
+
+/*
+ * Insert a message block chain if there's space, otherwise drop it. Return one
+ * so someone who was waiting for data would now end up having found it. eg.
+ * caller should consider a broadcast.
+ */ +static int +vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved, + vnd_dropper_f dropf) +{ + size_t msize; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + if (reserved == B_FALSE) { + msize = msgsize(mp); + if (vqp->vdq_cur + msize > vqp->vdq_max) { + dropf(vqp->vdq_vns, mp, "buffer full"); + return (0); + } + vqp->vdq_cur += msize; + } + + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + vqp->vdq_head = mp; + vqp->vdq_tail = mp; + } else { + vqp->vdq_tail->b_next = mp; + vqp->vdq_tail = mp; + } + + return (1); +} + +/* + * Remove a message message block chain. If the amount of space in the buffer + * has changed we return 1. We have no way of knowing whether or not there is + * enough space overall for a given writer who is blocked, so we always end up + * having to return true and thus tell consumers that they should consider + * signalling. + */ +static int +vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp) +{ + size_t msize; + mblk_t *mp; + + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(mpp != NULL); + if (vqp->vdq_head == NULL) { + ASSERT(vqp->vdq_tail == NULL); + *mpp = NULL; + return (0); + } + + mp = vqp->vdq_head; + msize = msgsize(mp); + + vqp->vdq_cur -= msize; + if (mp->b_next == NULL) { + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + /* + * We can't be certain that this is always going to be zero. + * Someone may have basically taken a reservation of space on + * the data queue, eg. claimed spae but not yet pushed it on + * yet. + */ + ASSERT(vqp->vdq_cur >= 0); + } else { + vqp->vdq_head = mp->b_next; + ASSERT(vqp->vdq_cur > 0); + } + mp->b_next = NULL; + *mpp = mp; + return (1); +} + +/* + * Reserve space in the queue. This will bump up the size of the queue and + * entitle the user to push something on later without bumping the space. 
+ */ +static int +vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size >= 0); + + if (size == 0) + return (0); + + if (size + vqp->vdq_cur > vqp->vdq_max) + return (0); + + vqp->vdq_cur += size; + return (1); +} + +static void +vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size) +{ + ASSERT(MUTEX_HELD(&vqp->vdq_lock)); + ASSERT(size > 0); + ASSERT(size <= vqp->vdq_cur); + + vqp->vdq_cur -= size; +} + +static void +vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf) +{ + mblk_t *mp, *next; + + mutex_enter(&vqp->vdq_lock); + for (mp = vqp->vdq_head; mp != NULL; mp = next) { + next = mp->b_next; + mp->b_next = NULL; + dropf(vqp->vdq_vns, mp, "vnd_dq_flush"); + } + vqp->vdq_cur = 0; + vqp->vdq_head = NULL; + vqp->vdq_tail = NULL; + mutex_exit(&vqp->vdq_lock); +} + +static boolean_t +vnd_dq_is_empty(vnd_data_queue_t *vqp) +{ + boolean_t ret; + + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head == NULL) + ret = B_TRUE; + else + ret = B_FALSE; + mutex_exit(&vqp->vdq_lock); + + return (ret); +} + +/* + * Get a network uint16_t from the message and translate it into something the + * host understands. + */ +static int +vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out) +{ + size_t mpsize; + uint8_t *bp; + + mpsize = msgsize(mp); + /* Check for overflow */ + if (off + sizeof (uint16_t) > mpsize) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + + /* + * Data is in network order. Note the second byte of data might be in + * the next mp. + */ + bp = mp->b_rptr + off; + *out = *bp << 8; + if (off + 1 == mpsize) { + mp = mp->b_cont; + bp = mp->b_rptr; + } else { + bp++; + } + + *out |= *bp; + return (0); +} + +/* + * Given an mblk chain find the mblk and address of a particular offset. 
+ */ +static int +vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp) +{ + size_t mpsize; + + if (off >= msgsize(mp)) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + *mpp = mp; + *offp = (uintptr_t)mp->b_rptr + off; + + return (0); +} + +/* + * Fetch the destination mac address. Set *dstp to that mac address. If the data + * is not contiguous in the first mblk_t, fill in datap and set *dstp to it. + */ +static int +vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap) +{ + int i; + + if (MBLKL(mp) >= ETHERADDRL) { + *dstpp = mp->b_rptr; + return (0); + } + + *dstpp = datap; + for (i = 0; i < ETHERADDRL; i += 2, datap += 2) { + if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0) + return (1); + } + + return (0); +} + +static int +vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4, + hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6, + hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop) +{ + uint16_t etype; + hook_pkt_event_t info; + size_t offset, mblen; + uint8_t *dstp; + uint8_t dstaddr[6]; + hook_event_t he; + hook_event_token_t het; + net_handle_t neti; + + /* + * Before we can ask if we're interested we have to do enough work to + * determine the ethertype. + */ + + /* Byte 12 is either the VLAN tag or the ethertype */ + if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + if (etype == ETHERTYPE_VLAN) { + /* Actual ethertype is another four bytes in */ + if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) { + ddrop(vsp, *mpp, + "packet has incomplete ethernet vlan header"); + *mpp = NULL; + return (1); + } + offset = sizeof (struct ether_vlan_header); + } else { + offset = sizeof (struct ether_header); + } + + /* + * At the moment we only hook on the kinds of things that the IP module + * would normally. 
+ */ + if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6) + return (0); + + if (etype == ETHERTYPE_IP) { + neti = netiv4; + he = hev4; + het = hetv4; + } else { + neti = netiv6; + he = hev6; + het = hetv6; + } + + if (!he.he_interested) + return (0); + + + if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + /* + * Now that we know we're interested, we have to do some additional + * sanity checking for IPF's sake, ala ip_check_length(). Specifically + * we need to check to make sure that the remaining packet size, + * excluding MAC, is at least the size of an IP header. + */ + mblen = msgsize(*mpp); + if ((etype == ETHERTYPE_IP && + mblen - offset < IP_SIMPLE_HDR_LENGTH) || + (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) { + ddrop(vsp, *mpp, "packet has invalid IP header"); + *mpp = NULL; + return (1); + } + + info.hpe_protocol = neti; + info.hpe_ifp = (phy_if_t)vsp; + info.hpe_ofp = (phy_if_t)vsp; + info.hpe_mp = mpp; + info.hpe_flags = 0; + + if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0) + info.hpe_flags |= HPE_BROADCAST; + else if (etype == ETHERTYPE_IP && + bcmp(vnd_ipv4_mcast, vnd_bcast_addr, IPV4_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + else if (etype == ETHERTYPE_IPV6 && + bcmp(vnd_ipv6_mcast, vnd_bcast_addr, IPV6_MCAST_LEN) == 0) + info.hpe_flags |= HPE_MULTICAST; + + if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb, + (uintptr_t *)&info.hpe_hdr) != 0) { + ddrop(vsp, *mpp, "packet too small -- " + "unable to find payload"); + *mpp = NULL; + return (1); + } + + if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) { + hdrop(vsp, *mpp, "drooped by hooks"); + return (1); + } + + return (0); +} + +/* + * This should not be used for DL_INFO_REQ. 
+ */ +static mblk_t * +vnd_dlpi_alloc(size_t len, t_uscalar_t prim) +{ + mblk_t *mp; + mp = allocb(len, BPRI_MED); + if (mp == NULL) + return (NULL); + + mp->b_datap->db_type = M_PROTO; + mp->b_wptr = mp->b_rptr + len; + bzero(mp->b_rptr, len); + ((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim; + + return (mp); +} + +static void +vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp) +{ + mblk_t **mpp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + ASSERT(mp->b_next == NULL); + mpp = &vsp->vns_dlpi_inc; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + *mpp = mp; +} + +static mblk_t * +vnd_dlpi_inc_pop(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vsp->vns_dlpi_inc; + if (mp != NULL) { + VERIFY(mp->b_next == NULL || mp->b_next != mp); + vsp->vns_dlpi_inc = mp->b_next; + mp->b_next = NULL; + } + return (mp); +} + +static int +vnd_st_sinfo(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_info_req_t *dlir; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), + BPRI_HI); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + vsp->vns_state = VNS_S_INFO_SENT; + cv_broadcast(&vsp->vns_stcv); + + mp->b_datap->db_type = M_PCPROTO; + dlir = (dl_info_req_t *)mp->b_rptr; + mp->b_wptr = (uchar_t *)&dlir[1]; + dlir->dl_primitive = DL_INFO_REQ; + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_info(vnd_str_t *vsp) +{ + dl_info_ack_t *dlia; + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + dlia = (dl_info_ack_t *)mp->b_rptr; + vsp->vns_dlpi_style = dlia->dl_provider_style; + vsp->vns_minwrite = dlia->dl_min_sdu; + vsp->vns_maxwrite = dlia->dl_max_sdu; + + /* + * At this time we only support DL_ETHER devices. + */ + if (dlia->dl_mac_type != DL_ETHER) { + freemsg(mp); + vsp->vns_errno = VND_E_NOTETHER; + return (1); + } + + /* + * Because vnd operates on entire packets, we need to manually account + * for the ethernet header information. 
We add the size of the + * ether_vlan_header to account for this, regardless if it is using + * vlans or not. + */ + vsp->vns_maxwrite += sizeof (struct ether_vlan_header); + + freemsg(mp); + return (0); +} + +static int +vnd_st_sexclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_EXCLUSIVE_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + return (0); +} + +static int +vnd_st_exclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_exclusive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_EXCLUSIVE_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_exclusive: got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_DLEXCL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +/* + * Send down a DLPI_ATTACH_REQ. 
+ */ +static int +vnd_st_sattach(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0; + vsp->vns_state = VNS_S_ATTACH_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_attach(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_ATTACH_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_attach: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_ATTACHFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_sbind(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_bind_req_t *dbrp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), + DL_BIND_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + dbrp = (dl_bind_req_t *)(mp->b_rptr); + dbrp->dl_sap = 0; + dbrp->dl_service_mode = DL_CLDLS; + + vsp->vns_state = VNS_S_BIND_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_bind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + + if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); 
+ } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_BINDFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + mblk_t *mp; + dl_promiscon_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + dprp = (dl_promiscon_req_t *)mp->b_rptr; + dprp->dl_level = type; + + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_promisc(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promisc"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_PROMISCON_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promisc: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_PROMISCFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_scapabq(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_CAPAB_Q_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +/* ARGSUSED */ +static void +vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + int signal = 0; + mblk_t *mp; + vnd_pnsd_t *nsp = vsp->vns_nsd; + + ASSERT(vsp != NULL); + ASSERT(mp_chain != NULL); + + for (mp = mp_chain; mp != NULL; 
mp = mp_chain) { + uint16_t vid; + mp_chain = mp->b_next; + mp->b_next = NULL; + + /* + * If we were operating in a traditional dlpi context then we + * would have enabled DLIOCRAW and rather than the fast path, we + * would come through dld_str_rx_raw. That function does two + * things that we have to consider doing ourselves. The first is + * that it adjusts the b_rptr back to account for dld bumping us + * past the mac header. It also tries to account for cases where + * mac provides an illusion of the mac header. Fortunately, dld + * only allows the fastpath when the media type is the same as + * the native type. Therefore all we have to do here is adjust + * the b_rptr. + */ + ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize); + mp->b_rptr -= mhip->mhi_hdrsize; + vid = VLAN_ID(mhip->mhi_tci); + if (mhip->mhi_istagged && vid != VLAN_ID_NONE) { + /* + * This is an overlapping copy. Do not use bcopy(9F). + */ + (void) memmove(mp->b_rptr + 4, mp->b_rptr, 12); + mp->b_rptr += 4; + } + + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4, + nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6, + nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0) + continue; + + VND_STAT_INC(vsp, vks_rpackets, 1); + VND_STAT_INC(vsp, vks_rbytes, msgsize(mp)); + DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + mutex_enter(&vsp->vns_dq_read.vdq_lock); + signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE, + vnd_drop_in); + mutex_exit(&vsp->vns_dq_read.vdq_lock); + } + + if (signal != 0) { + cv_broadcast(&vsp->vns_dq_read.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM); + } + +} + +static void +vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff) +{ + VND_STAT_INC(vsp, vks_nmacflow, 1); + VND_STAT_INC(vsp, vks_tmacflow, diff); + if (diff >= VND_LATENCY_1MS) + VND_STAT_INC(vsp, vks_mac_flow_1ms, 1); + if (diff >= VND_LATENCY_10MS) + VND_STAT_INC(vsp, 
vks_mac_flow_10ms, 1); + if (diff >= VND_LATENCY_100MS) + VND_STAT_INC(vsp, vks_mac_flow_100ms, 1); + if (diff >= VND_LATENCY_1S) + VND_STAT_INC(vsp, vks_mac_flow_1s, 1); + if (diff >= VND_LATENCY_10S) + VND_STAT_INC(vsp, vks_mac_flow_10s, 1); +} + +/* + * This is a callback from MAC that indicates that we are allowed to send + * packets again. + */ +static void +vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie) +{ + vnd_str_t *vsp = arg; + hrtime_t now; + + mutex_enter(&vsp->vns_lock); + now = gethrtime(); + + /* + * Check for the case that we beat vnd_squeue_tx_one to the punch. + * There's also an additional case here that we got notified because + * we're sharing a device that ran out of tx descriptors, even though it + * wasn't because of us. + */ + if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) { + vsp->vns_fcupdate = now; + mutex_exit(&vsp->vns_lock); + return; + } + + ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED); + ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie); + vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = (vnd_mac_cookie_t)NULL; + vsp->vns_fclatch = 0; + DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t, + vsp->vns_dq_write.vdq_cur, uintptr_t, cookie); + /* + * If someone has asked to flush the squeue and thus inserted a barrier, + * than we shouldn't schedule a drain. 
+ */ + if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) { + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk, + vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, + VND_SQUEUE_TAG_MAC_FLOW_CONTROL); + } + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0); +} + +static void +vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0); +} + +static int +vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc) +{ + int ret; + dld_capab_direct_t d; + mac_perim_handle_t mph; + vnd_str_capab_t *c = &vsp->vns_caps; + + bzero(&d, sizeof (d)); + d.di_rx_cf = (uintptr_t)rxfunc; + d.di_rx_ch = vsp; + d.di_flags = DI_DIRECT_RAW; + + vnd_mac_enter(vsp, &mph); + + /* + * If we're coming in here for a second pass, we need to make sure that + * we remove an existing flow control notification callback, otherwise + * we'll create a duplicate that will remain with garbage data. 
+ */ + if (c->vsc_tx_fc_hdl != NULL) { + ASSERT(c->vsc_set_fcb_hdl != NULL); + (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL, + c->vsc_tx_fc_hdl); + c->vsc_tx_fc_hdl = NULL; + } + + if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl, + DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) { + c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df; + c->vsc_tx_hdl = d.di_tx_dh; + c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df; + c->vsc_set_fcb_hdl = d.di_tx_cb_dh; + c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df; + c->vsc_is_fc_hdl = d.di_tx_fctl_dh; + c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, + vnd_mac_flow_control, vsp); + c->vsc_flags |= VNS_C_DIRECT; + ret = 0; + } else { + vsp->vns_errno = VND_E_DIRECTFAIL; + ret = 1; + } + vnd_mac_exit(vsp, mph); + return (ret); +} + +static int +vnd_st_capabq(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_capability_ack_t *cap; + dl_capability_sub_t *subp; + dl_capab_hcksum_t *hck; + dl_capab_dld_t *dld; + unsigned char *rp; + int ret = 0; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + + rp = mp->b_rptr; + cap = (dl_capability_ack_t *)rp; + if (cap->dl_sub_length == 0) + goto done; + + /* Don't try to process something too big */ + if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_CAPACKINVAL; + ret = 1; + goto done; + } + + rp += cap->dl_sub_offset; + + while (cap->dl_sub_length > 0) { + subp = (dl_capability_sub_t *)rp; + /* Sanity check something crazy from down below */ + if (subp->dl_length + sizeof (dl_capability_sub_t) > + cap->dl_sub_length) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_SUBCAPINVAL; + ret = 1; + goto done; + } + + switch (subp->dl_cap) { + case DL_CAPAB_HCKSUM: + hck = (dl_capab_hcksum_t *)(rp + + sizeof (dl_capability_sub_t)); + if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) { + vsp->vns_caps.vsc_flags |= 
VNS_C_HCKSUM_BADVERS; + break; + } + if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM; + vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags; + break; + case DL_CAPAB_DLD: + dld = (dl_capab_dld_t *)(rp + + sizeof (dl_capability_sub_t)); + if (dld->dld_version != DLD_CURRENT_VERSION) { + vsp->vns_errno = VND_E_DLDBADVERS; + ret = 1; + goto done; + } + if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_DLD; + vsp->vns_caps.vsc_capab_f = + (vnd_dld_cap_t)dld->dld_capab; + vsp->vns_caps.vsc_capab_hdl = + (void *)dld->dld_capab_handle; + /* + * At this point in time, we have to set up a direct + * function that drops all input. This validates that + * we'll be able to set up direct input and that we can + * easily switch it earlier to the real data function + * when we've plumbed everything up. 
+ */ + if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) { + /* vns_errno set by vnd_dld_cap_enable */ + ret = 1; + goto done; + } + break; + default: + /* Ignore unsupported cap */ + break; + } + + rp += sizeof (dl_capability_sub_t) + subp->dl_length; + cap->dl_sub_length -= sizeof (dl_capability_sub_t) + + subp->dl_length; + } + +done: + /* Make sure we enabled direct callbacks */ + if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) { + vsp->vns_errno = VND_E_DIRECTNOTSUP; + ret = 1; + } + + freemsg(mp); + return (ret); +} + +static void +vnd_st_sonline(vnd_str_t *vsp) +{ + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + vsp->vns_state = VNS_S_ONLINE; + cv_broadcast(&vsp->vns_stcv); +} + +static void +vnd_st_shutdown(vnd_str_t *vsp) +{ + mac_perim_handle_t mph; + vnd_str_capab_t *vsc = &vsp->vns_caps; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * At this point in time we know that there is no one transmitting as + * our final reference has been torn down and that vnd_s_close inserted + * a barrier to validate that everything is flushed. 
+ */ + if (vsc->vsc_flags & VNS_C_DIRECT) { + vnd_mac_enter(vsp, &mph); + vsc->vsc_flags &= ~VNS_C_DIRECT; + (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL, + vsc->vsc_tx_fc_hdl); + vsc->vsc_tx_fc_hdl = NULL; + (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT, + NULL, DLD_DISABLE); + vnd_mac_exit(vsp, mph); + } +} + +static boolean_t +vnd_st_spromiscoff(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + boolean_t ret = B_TRUE; + mblk_t *mp; + dl_promiscoff_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCOFF_REQ); + if (mp == NULL) { + cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " + "promiscoff request"); + ret = B_FALSE; + goto next; + } + + dprp = (dl_promiscoff_req_t *)mp->b_rptr; + dprp->dl_level = type; + + putnext(vsp->vns_wq, mp); +next: + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + return (ret); +} + +static void +vnd_st_promiscoff(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * Unlike other cases where we guard against the incoming packet being + * NULL, during tear down we try to keep driving and therefore we may + * have gotten here due to an earlier failure, so there's nothing to do. 
+ */ + mp = vnd_dlpi_inc_pop(vsp); + if (mp == NULL) + return; + + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promiscoff"); + return; + } + + if (cprim != DL_PROMISCOFF_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promiscoff: Got ack/nack for wrong primitive"); + return; + } + + if (prim == DL_ERROR_ACK) { + cmn_err(CE_WARN, "!failed to disable promiscuos mode during " + "vnd teardown"); + } +} + +static boolean_t +vnd_st_sunbind(vnd_str_t *vsp) +{ + mblk_t *mp; + boolean_t ret = B_TRUE; + + mp = vnd_dlpi_alloc(sizeof (dl_unbind_req_t), DL_UNBIND_REQ); + if (mp == NULL) { + cmn_err(CE_NOTE, "!vnd failed to allocate mblk_t for " + "unbind request"); + ret = B_FALSE; + goto next; + } + + putnext(vsp->vns_wq, mp); +next: + vsp->vns_state = VNS_S_UNBIND_SENT; + cv_broadcast(&vsp->vns_stcv); + return (ret); +} + +static void +vnd_st_unbind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + /* + * Unlike other cases where we guard against the incoming packet being + * NULL, during tear down we try to keep driving and therefore we may + * have gotten here due to an earlier failure, so there's nothing to do. + */ + mp = vnd_dlpi_inc_pop(vsp); + if (mp == NULL) + goto next; + + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_unbind"); + goto next; + } + + if (cprim != DL_UNBIND_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_unbind: Got ack/nack for wrong primitive"); + goto next; + } + + if (prim == DL_ERROR_ACK) { + cmn_err(CE_WARN, "!failed to unbind stream during vnd " + "teardown"); + } + +next: + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); +} + +/* + * Perform state transitions. 
This is a one way shot down the flow chart + * described in the big theory statement. + */ +static void +vnd_str_state_transition(void *arg) +{ + boolean_t died = B_FALSE; + vnd_str_t *vsp = arg; + mblk_t *mp; + + mutex_enter(&vsp->vns_lock); + if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL && + vsp->vns_state != VNS_S_SHUTTING_DOWN)) { + mutex_exit(&vsp->vns_lock); + return; + } + + /* + * When trying to shut down, or unwinding from a failed enabling, rather + * than immediately entering the ZOMBIE state, we may instead opt to try + * and enter the next state in the progression. This is especially + * important when trying to tear everything down. + */ +loop: + DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp, + vnd_str_state_t, vsp->vns_state); + switch (vsp->vns_state) { + case VNS_S_INITIAL: + VERIFY(vsp->vns_dlpi_inc == NULL); + if (vnd_st_sinfo(vsp) != 0) + died = B_TRUE; + break; + case VNS_S_INFO_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_info(vsp) == 0) { + if (vnd_st_sexclusive(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_EXCLUSIVE_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_exclusive(vsp) == 0) { + if (vsp->vns_dlpi_style == DL_STYLE2) { + if (vnd_st_sattach(vsp) != 0) + died = B_TRUE; + } else { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } + } else { + died = B_TRUE; + } + break; + case VNS_S_ATTACH_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_attach(vsp) == 0) { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_BIND_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_bind(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_SAP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI, + VNS_S_MULTI_PROMISC_SENT) != 0) + 
died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_MULTI_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY, + VNS_S_RX_ONLY_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_RX_ONLY_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_FIXUPS, + VNS_S_FIXUP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_FIXUP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_scapabq(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_CAPAB_Q_SENT: + if (vnd_st_capabq(vsp) != 0) + died = B_TRUE; + else + vnd_st_sonline(vsp); + break; + case VNS_S_SHUTTING_DOWN: + vnd_st_shutdown(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_MULTI, + VNS_S_MULTICAST_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_MULTICAST_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_spromiscoff(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISCOFF_SENT) == B_FALSE) + goto loop; + break; + case VNS_S_SAP_PROMISCOFF_SENT: + vnd_st_promiscoff(vsp); + if (vnd_st_sunbind(vsp) == B_FALSE) + goto loop; + break; + case VNS_S_UNBIND_SENT: + vnd_st_unbind(vsp); + break; + case VNS_S_ZOMBIE: + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vsp received data as a zombie"); + break; + default: + panic("vnd_str_t entered an unknown state"); + } + + if (died == B_TRUE) { + ASSERT(vsp->vns_errno != VND_E_SUCCESS); + vsp->vns_laststate = vsp->vns_state; + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); + } + + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_dlpi_taskq_dispatch(void *arg) +{ + vnd_str_t *vsp = arg; + int run = 1; + + while (run != 0) { + vnd_str_state_transition(vsp); + mutex_enter(&vsp->vns_lock); + if (vsp->vns_flags & 
VNS_F_CONDEMNED || + vsp->vns_dlpi_inc == NULL) { + run = 0; + vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED; + } + if (vsp->vns_flags & VNS_F_CONDEMNED) + cv_signal(&vsp->vns_cancelcv); + mutex_exit(&vsp->vns_lock); + } +} + +/* ARGSUSED */ +static int +vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getptmue(net_handle_t neti) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + size_t nelem, net_ifaddr_t type[], void *storage) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + zoneid_t *zid) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + uint64_t *flags) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_phylookup(net_handle_t neti, const char *name) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static lif_if_t +vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +vnd_neti_route(net_handle_t neti, struct sockaddr *address, + struct sockaddr *next) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static int +vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +/* ARGSUSED */ +static int +vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +static net_protocol_t vnd_neti_info_v4 = { + NETINFO_VERSION, + NHF_VND_INET, + vnd_neti_getifname, + vnd_neti_getmtu, + 
vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + +static net_protocol_t vnd_neti_info_v6 = { + NETINFO_VERSION, + NHF_VND_INET6, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + + +static int +vnd_netinfo_init(vnd_pnsd_t *nsp) +{ + nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v4); + ASSERT(nsp->vpnd_neti_v4 != NULL); + + nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v6); + ASSERT(nsp->vpnd_neti_v6 != NULL); + + nsp->vpnd_family_v4.hf_version = HOOK_VERSION; + nsp->vpnd_family_v4.hf_name = "vnd_inet"; + + if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) { + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_family_v6.hf_version = HOOK_VERSION; + nsp->vpnd_family_v6.hf_name = "vnd_inet6"; + + if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) { + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v4.he_version = HOOK_VERSION; + nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v4.he_flags = 0; + nsp->vpnd_event_in_v4.he_interested = B_FALSE; + + nsp->vpnd_token_in_v4 = 
net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + if (nsp->vpnd_token_in_v4 == NULL) { + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v6.he_flags = 0; + nsp->vpnd_event_in_v6.he_interested = B_FALSE; + + nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + if (nsp->vpnd_token_in_v6 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v4.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v4.he_flags = 0; + nsp->vpnd_event_out_v4.he_interested = B_FALSE; + + nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_out_v4); + if (nsp->vpnd_token_out_v4 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) 
net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v6.he_flags = 0; + nsp->vpnd_event_out_v6.he_interested = B_FALSE; + + nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_out_v6); + if (nsp->vpnd_token_out_v6 == NULL) { + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + (void) net_event_shutdown(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_event_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + (void) net_family_unregister(nsp->vpnd_neti_v4, + &nsp->vpnd_family_v4); + (void) net_family_unregister(nsp->vpnd_neti_v6, + &nsp->vpnd_family_v6); + (void) net_protocol_unregister(nsp->vpnd_neti_v4); + (void) net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + return (0); +} + +static void +vnd_netinfo_shutdown(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); +} + +static void 
+vnd_netinfo_fini(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v4); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v6); + VERIFY(ret == 0); +} + +/* ARGSUSED */ +static void +vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + + VERIFY(bmp == &vsp->vns_barrierblk); + mutex_enter(&vsp->vns_lock); + VERIFY(vsp->vns_flags & VNS_F_BARRIER); + VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE)); + vsp->vns_flags |= VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * For better or worse, we have to broadcast here as we could have a + * thread that's blocked for completion as well as one that's blocked + * waiting to do a barrier itself. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * This is a data barrier for the stream while it is in fastpath mode. It blocks + * and ensures that there is nothing else in the squeue. 
+ */ +static void +vnd_strbarrier(vnd_str_t *vsp) +{ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_flags & VNS_F_BARRIER) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags |= VNS_F_BARRIER; + mutex_exit(&vsp->vns_lock); + + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk, + vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER); + + mutex_enter(&vsp->vns_lock); + while (!(vsp->vns_flags & VNS_F_BARRIER_DONE)) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags &= ~VNS_F_BARRIER; + vsp->vns_flags &= ~VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * We have to broadcast in case anyone is waiting for the barrier + * themselves. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * Based on the type of message that we're dealing with we're going to want to + * do one of several things. Basically if it looks like it's something we know + * about, we should probably handle it in one of our transition threads. + * Otherwise, we should just simply putnext. + */ +static int +vnd_s_rput(queue_t *q, mblk_t *mp) +{ + t_uscalar_t prim; + int dispatch = 0; + vnd_str_t *vsp = q->q_ptr; + + switch (DB_TYPE(mp)) { + case M_PROTO: + case M_PCPROTO: + if (MBLKL(mp) < sizeof (t_uscalar_t)) { + vnd_drop_ctl(vsp, mp, "PROTO message too short"); + break; + } + + prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; + if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) { + vnd_drop_ctl(vsp, mp, + "recieved an unsupported dlpi DATA req"); + break; + } + + /* + * Enqueue the entry and fire off a taskq dispatch. 
+ */ + mutex_enter(&vsp->vns_lock); + vnd_dlpi_inc_push(vsp, mp); + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + dispatch = 1; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + } + mutex_exit(&vsp->vns_lock); + if (dispatch != 0) + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, + vsp, 0, &vsp->vns_tqe); + break; + case M_DATA: + vnd_drop_in(vsp, mp, "M_DATA via put(9E)"); + break; + default: + putnext(vsp->vns_rq, mp); + } + return (0); +} + +/* ARGSUSED */ +static void +vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp) +{ + int error; + vnd_strioc_t *visp; + + if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE || + iocp->ioc_count != TRANSPARENT) { + error = EINVAL; + goto nak; + } + + /* + * All streams ioctls that we support must use kcred as a means to + * distinguish that this is a layered open by the kernel as opposed to + * one by a user who has done an I_PUSH of the module. + */ + if (iocp->ioc_cr != kcred) { + error = EPERM; + goto nak; + } + + if (mp->b_cont == NULL) { + error = EAGAIN; + goto nak; + } + + visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP); + ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t)); + visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr; + visp->vs_state = VSS_COPYIN; + + mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL); + qreply(q, mp); + + return; + +nak: + if (mp->b_cont != NULL) { + freemsg(mp->b_cont); + mp->b_cont = NULL; + } + + iocp->ioc_error = error; + mp->b_datap->db_type = M_IOCNAK; + iocp->ioc_count = 0; + qreply(q, mp); +} + +static void +vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + vnd_str_state_t state; + struct copyreq *crp; + vnd_strioc_associate_t *vss; + vnd_dev_t *vdp = NULL; + vnd_pnsd_t *nsp = NULL; + char iname[2*VND_NAMELEN]; + zone_t *zone; + vnd_strioc_t *visp; + + visp = (vnd_strioc_t *)csp->cp_private; + + /* If it's not ours, it's not our problem */ + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + 
putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA"); + } + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYIN failed"); + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* Data is sitting for us in b_cont */ + if (mp->b_cont == NULL || + MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) { + kmem_free(visp, sizeof (vnd_strioc_t)); + miocnak(q, mp, 0, EINVAL); + return; + } + + vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr; + vdp = vnd_dev_lookup(vss->vsa_minor); + if (vdp == NULL) { + vss->vsa_errno = VND_E_NODEV; + goto nak; + } + + nsp = vnd_nsd_lookup(vss->vsa_nsid); + if (nsp == NULL) { + vss->vsa_errno = VND_E_NONETSTACK; + goto nak; + } + + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) { + mutex_exit(&vsp->vns_lock); + vss->vsa_errno = VND_E_ASSOCIATED; + goto nak; + } + + vsp->vns_nsd = nsp; + vsp->vns_flags &= ~VNS_F_NEED_ZONE; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + mutex_exit(&vsp->vns_lock); + + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, + &vsp->vns_tqe); + + + /* At this point we need to wait until we have transitioned to ONLINE */ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + state = vsp->vns_state; + mutex_exit(&vsp->vns_lock); + + if (state == VNS_S_ZOMBIE) { + vss->vsa_errno = vsp->vns_errno; + goto nak; + } + + mutex_enter(&vdp->vdd_lock); + mutex_enter(&vsp->vns_lock); + VERIFY(vdp->vdd_str == NULL); + /* + * Now initialize the remaining kstat properties and let's go ahead and + * create it. 
+ */ + (void) snprintf(iname, sizeof (iname), "z%d_%d", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor); + vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net", + KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); + if (vsp->vns_kstat == NULL) { + vss->vsa_errno = VND_E_KSTATCREATE; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + vdp->vdd_str = vsp; + vsp->vns_dev = vdp; + + /* + * Now, it's time to do the las thing that can fail, changing out the + * input function. After this we know that we can receive data, so we + * should make sure that we're ready. + */ + if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) { + vss->vsa_errno = VND_E_DIRECTFAIL; + vdp->vdd_str = NULL; + vsp->vns_dev = NULL; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + + zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid); + ASSERT(zone != NULL); + vsp->vns_kstat->ks_data = &vsp->vns_ksdata; + /* Account for zone name */ + vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1; + /* Account for eventual link name */ + vsp->vns_kstat->ks_data_size += VND_NAMELEN; + kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name); + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + zone_rele(zone); + kstat_install(vsp->vns_kstat); + + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + + /* + * Note that the vnd_str_t does not keep a permanent hold on the + * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what + * the nestack goes through to take care of everything. + */ + vss->vsa_errno = VND_E_SUCCESS; +nak: + if (vdp != NULL) + vnd_dev_rele(vdp); + if (nsp != NULL) + vnd_nsd_rele(nsp); + /* + * Change the copyin request to a copyout. Note that we can't use + * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's + * okay, as the copyin vs. copyout is basically the same. 
+ */ + DB_TYPE(mp) = M_COPYOUT; + visp->vs_state = VSS_COPYOUT; + crp = (struct copyreq *)mp->b_rptr; + crp->cq_private = (void *)visp; + crp->cq_addr = visp->vs_addr; + crp->cq_size = sizeof (vnd_strioc_associate_t); + qreply(q, mp); +} + +static void +vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + ASSERT(csp->cp_private != NULL); + kmem_free(csp->cp_private, sizeof (vnd_strioc_t)); + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "uknown cmd for M_IOCDATA"); + } + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYOUT failed"); + return; + } + + /* Ack and let's be done with it all */ + miocack(q, mp, 0, 0); +} + +static int +vnd_s_wput(queue_t *q, mblk_t *mp) +{ + vnd_str_t *vsp = q->q_ptr; + struct copyresp *crp; + vnd_strioc_state_t vstate; + vnd_strioc_t *visp; + + switch (DB_TYPE(mp)) { + case M_IOCTL: + vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr); + return (0); + case M_IOCDATA: + crp = (struct copyresp *)mp->b_rptr; + ASSERT(crp->cp_private != NULL); + visp = (vnd_strioc_t *)crp->cp_private; + vstate = visp->vs_state; + ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT); + if (vstate == VSS_COPYIN) + vnd_striocdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + else + vnd_stroutdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + return (0); + default: + break; + } + if (q->q_next != NULL) + putnext(q, mp); + else + vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput"); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp) +{ + vnd_str_t *vsp; + uint_t rand; + + if (q->q_ptr != NULL) + return (EINVAL); + + if (!(sflag & MODOPEN)) + return (ENXIO); + + if (credp != kcred) + return (EPERM); + + vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP); + 
bzero(vsp, sizeof (*vsp)); + mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL); + vsp->vns_state = VNS_S_INITIAL; + + mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_enter(&vnd_dev_lock); + vsp->vns_dq_read.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_read.vdq_vns = vsp; + vsp->vns_dq_write.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_write.vdq_vns = vsp; + mutex_exit(&vnd_dev_lock); + vsp->vns_rq = q; + vsp->vns_wq = WR(q); + q->q_ptr = WR(q)->q_ptr = vsp; + vsp->vns_flags = VNS_F_NEED_ZONE; + vsp->vns_nflush = vnd_flush_nburst; + vsp->vns_bsize = vnd_flush_burst_size; + + (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); + vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand); + + /* + * We create our kstat and initialize all of its fields now, but we + * don't install it until we actually do the zone association so we can + * get everything. 
+ */ + kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms, + "flowcontrol_100ms", KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s", + KSTAT_DATA_UINT64); + qprocson(q); + /* + * Now that we've called qprocson, grab the lower module for making sure + * that we don't have any pass through modules. 
+ */ + vsp->vns_lrq = RD(vsp->vns_wq->q_next); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_s_close(queue_t *q, int flag, cred_t *credp) +{ + vnd_str_t *vsp; + mblk_t *mp; + + VERIFY(WR(q)->q_next != NULL); + + vsp = q->q_ptr; + ASSERT(vsp != NULL); + + /* + * We need to transition ourselves down. This means that we have a few + * important different things to do in the process of tearing down our + * input and output buffers, making sure we've drained the current + * squeue, and disabling the fast path. Before we disable the fast path, + * we should make sure the squeue is drained. Because we're in streams + * close, we know that no packets can come into us from userland, but we + * can receive more. As such, the following is the exact order of things + * that we do: + * + * 1) flush the vns_dq_read + * 2) Insert the drain mblk + * 3) When it's been received, tear down the fast path by kicking + * off the state machine. + * 4) One final flush of both the vns_dq_read,vns_dq_write + */ + + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_strbarrier(vsp); + mutex_enter(&vsp->vns_lock); + vsp->vns_state = VNS_S_SHUTTING_DOWN; + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, + 0, &vsp->vns_tqe); + } + while (vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + mutex_exit(&vsp->vns_lock); + + qprocsoff(q); + mutex_enter(&vsp->vns_lock); + vsp->vns_flags |= VNS_F_CONDEMNED; + while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) + cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock); + + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vnd_s_close"); + mutex_exit(&vsp->vns_lock); + + q->q_ptr = NULL; + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out); + mutex_destroy(&vsp->vns_dq_read.vdq_lock); + mutex_destroy(&vsp->vns_dq_write.vdq_lock); + + if (vsp->vns_kstat != NULL) + 
kstat_delete(vsp->vns_kstat); + mutex_destroy(&vsp->vns_lock); + cv_destroy(&vsp->vns_stcv); + cv_destroy(&vsp->vns_barriercv); + cv_destroy(&vsp->vns_cancelcv); + kmem_cache_free(vnd_str_cache, vsp); + + return (0); +} + +static vnd_mac_cookie_t +vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp) +{ + hrtime_t txtime; + vnd_mac_cookie_t vc; + + VND_STAT_INC(vsp, vks_opackets, 1); + VND_STAT_INC(vsp, vks_obytes, msgsize(mp)); + DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + /* Actually tx now */ + txtime = gethrtime(); + vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, + mp, 0, MAC_DROP_ON_NO_DESC); + + /* + * We need to check two different conditions before we immediately set + * the flow control lock. The first thing that we need to do is verify + * that this is an instance of hard flow control, so to say. The flow + * control callbacks won't always fire in cases where we still get a + * cookie returned. The explicit check for flow control will guarantee + * us that we'll get a subsequent notification callback. + * + * The second case comes about because we do not hold the + * vnd_str_t`vns_lock across calls to tx, we need to determine if a flow + * control notification already came across for us in a different thread + * calling vnd_mac_flow_control(). To deal with this, we record a + * timestamp every time that we change the flow control state. We grab + * txtime here before we transmit because that guarantees that the + * hrtime_t of the call to vnd_mac_flow_control() will be after txtime. + * + * If the flow control notification beat us to the punch, the value of + * vns_fcupdate will be larger than the value of txtime, and we should + * just record the statistics. However, if we didn't beat it to the + * punch (txtime > vns_fcupdate), then we know that it's safe to wait + * for a notification. 
+ */ + if (vc != (vnd_mac_cookie_t)NULL) { + hrtime_t diff; + + if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl, + vc) == 0) + return ((vnd_mac_cookie_t)NULL); + mutex_enter(&vsp->vns_lock); + diff = vsp->vns_fcupdate - txtime; + if (diff > 0) { + mutex_exit(&vsp->vns_lock); + vnd_mac_flow_control_stat(vsp, diff); + return ((vnd_mac_cookie_t)NULL); + } + vsp->vns_flags |= VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = vc; + vsp->vns_fclatch = txtime; + vsp->vns_fcupdate = txtime; + DTRACE_VND3(flow__blocked, vnd_str_t *, vsp, + uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc); + mutex_exit(&vsp->vns_lock); + } + + return (vc); +} + +/* ARGSUSED */ +static void +vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) +{ + mblk_t *mp; + int nmps; + size_t mptot, nflush, bsize; + boolean_t blocked, empty; + vnd_data_queue_t *vqp; + vnd_str_t *vsp = arg; + + mutex_enter(&vsp->vns_lock); + /* + * We either enter here via an squeue or via vnd_squeue_tx_append(). In + * the former case we need to mark that there is no longer an active + * user of the drain block. + */ + if (drain_mp != NULL) { + VERIFY(drain_mp == &vsp->vns_drainblk); + VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED); + vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED; + } + + /* + * If we're still flow controlled or under a flush barrier, nothing to + * do. + */ + if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) { + mutex_exit(&vsp->vns_lock); + return; + } + + nflush = vsp->vns_nflush; + bsize = vsp->vns_bsize; + mutex_exit(&vsp->vns_lock); + + /* + * We're potentially going deep into the networking layer; make sure the + * guest can't run concurrently. 
+ */ + smt_begin_unsafe(); + + nmps = 0; + mptot = 0; + blocked = B_FALSE; + vqp = &vsp->vns_dq_write; + while (nmps < nflush && mptot <= bsize) { + mutex_enter(&vqp->vdq_lock); + if (vnd_dq_pop(vqp, &mp) == 0) { + mutex_exit(&vqp->vdq_lock); + break; + } + mutex_exit(&vqp->vdq_lock); + + nmps++; + mptot += msgsize(mp); + if (vnd_squeue_tx_one(vsp, mp) != (vnd_mac_cookie_t)NULL) { + blocked = B_TRUE; + break; + } + } + + smt_end_unsafe(); + + empty = vnd_dq_is_empty(&vsp->vns_dq_write); + + /* + * If the queue is not empty, we're not blocked, and there isn't a drain + * scheduled, put it into the squeue with the drain block and + * GSQUEUE_FILL. + */ + if (blocked == B_FALSE && empty == B_FALSE) { + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) { + mblk_t *mp = &vsp->vns_drainblk; + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, + mp, vnd_squeue_tx_drain, vsp, + GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN); + } + mutex_exit(&vsp->vns_lock); + } + + /* + * If we drained some amount of data, we need to signal the data queue. + */ + if (nmps > 0) { + cv_broadcast(&vsp->vns_dq_write.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT); + } +} + +/* ARGSUSED */ +static void +vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + vnd_data_queue_t *vqp = &vsp->vns_dq_write; + vnd_pnsd_t *nsp = vsp->vns_nsd; + size_t len = msgsize(mp); + + /* + * Before we append this packet, we should run it through the firewall + * rules. + */ + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6, + nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out, + vnd_drop_out) != 0) { + /* + * Because we earlier reserved space for this packet and it's + * not making the cut, we need to go through and unreserve that + * space. 
Also note that the message block will likely be freed + * by the time we return from vnd_hook so we cannot rely on it. + */ + mutex_enter(&vqp->vdq_lock); + vnd_dq_unreserve(vqp, len); + mutex_exit(&vqp->vdq_lock); + return; + } + + /* + * We earlier reserved space for this packet. So for now simply append + * it and call drain. We know that no other drain can be going on right + * now thanks to the squeue. + */ + mutex_enter(&vqp->vdq_lock); + (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic); + mutex_exit(&vqp->vdq_lock); + vnd_squeue_tx_drain(vsp, NULL, NULL, NULL); +} + +/* + * We need to see if this is a valid name of sorts for us. That means a few + * things. First off, we can't assume that what we've been given has actually + * been null terminated. More importantly, that it's a valid name as far as + * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We + * further constrain ourselves to simply alphanumeric characters and a few + * additional ones, ':', '-', and '_'. 
+ */ +static int +vnd_validate_name(const char *buf, size_t buflen) +{ + int i, len; + + /* First make sure a null terminator exists */ + for (i = 0; i < buflen; i++) + if (buf[i] == '\0') + break; + len = i; + if (i == 0 || i == buflen) + return (0); + + for (i = 0; i < len; i++) + if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' && + buf[i] != '_') + return (0); + + return (1); +} + +static int +vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag) +{ + vnd_ioc_attach_t via; + vnd_strioc_associate_t vss; + vnd_pnsd_t *nsp; + zone_t *zonep; + zoneid_t zid; + char buf[2*VND_NAMELEN]; + int ret, rp; + + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0) + return (EFAULT); + via.via_errno = VND_E_SUCCESS; + + if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + + /* + * Only the global zone can request to create a device in a different + * zone. + */ + zid = crgetzoneid(credp); + if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 && + zid != via.via_zoneid) { + via.via_errno = VND_E_PERM; + ret = EIO; + goto errcopyout; + } + + if (via.via_zoneid == -1) + via.via_zoneid = zid; + + /* + * Establish the name we'll use now. We want to be extra paranoid about + * the device we're opening so check that now. 
+ */ + if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) { + zonep = zone_find_by_id(via.via_zoneid); + if (zonep == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name, + via.via_name) >= sizeof (buf)) { + zone_rele(zonep); + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s", + zonep->zone_name, via.via_name); + zone_rele(zonep); + zonep = NULL; + } else { + if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >= + sizeof (buf)) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name); + } + + /* + * If our zone is dying then the netstack will have been removed from + * this list. + */ + nsp = vnd_nsd_lookup_by_zid(via.via_zoneid); + if (nsp == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + + /* + * Note we set the attached handle even though we haven't actually + * finished the process of attaching the ldi handle. + */ + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) { + mutex_exit(&vdp->vdd_lock); + vnd_nsd_rele(nsp); + via.via_errno = VND_E_ATTACHED; + ret = EIO; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT; + ASSERT(vdp->vdd_cr == NULL); + crhold(credp); + vdp->vdd_cr = credp; + ASSERT(vdp->vdd_nsd == NULL); + vdp->vdd_nsd = nsp; + mutex_exit(&vdp->vdd_lock); + + /* + * Place an additional hold on the vnd_pnsd_t as we go through and do + * all of the rest of our work. This will be the hold that we keep for + * as long as this thing is attached. 
+ */ + vnd_nsd_ref(nsp); + + ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr, + &vdp->vdd_ldih, vdp->vdd_ldiid); + if (ret != 0) { + if (ret == ENODEV) + via.via_errno = VND_E_NODATALINK; + goto err; + } + + /* + * Unfortunately the I_PUSH interface doesn't allow us a way to detect + * whether or not we're coming in from a layered device. We really want + * to make sure that a normal user can't push on our streams module. + * Currently the only idea I have for this is to make sure that the + * credp is kcred which is really terrible. + */ + ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL, + kcred, &rp); + if (ret != 0) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + via.via_errno = VND_E_STRINIT; + ret = EIO; + goto err; + } + + vss.vsa_minor = vdp->vdd_minor; + vss.vsa_nsid = nsp->vpnd_nsid; + + ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss, + FKIOCTL, kcred, &rp); + if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + if (ret == 0) { + via.via_errno = vss.vsa_errno; + ret = EIO; + } + goto err; + } + + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + + /* + * There's a chance that our netstack was condemned while we've had a + * hold on it. As such we need to check and if so, error out. 
+ */ + if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) { + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + ret = EIO; + via.via_errno = VND_E_NOZONE; + goto err; + } + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_str != NULL); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + vdp->vdd_flags |= VND_D_ATTACHED; + (void) strlcpy(vdp->vdd_datalink, via.via_name, + sizeof (vdp->vdd_datalink)); + list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(nsp); + + return (0); + +err: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + vdp->vdd_nsd = NULL; + mutex_exit(&vdp->vdd_lock); + + /* + * We have two holds to drop here. One for our original reference and + * one for the hold this operation would have represented. + */ + vnd_nsd_rele(nsp); + vnd_nsd_rele(nsp); +errcopyout: + if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0) + ret = EFAULT; + + return (ret); +} + +static int +vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret = 0; + vnd_ioc_link_t vil; + char mname[2*VND_NAMELEN]; + char **c; + vnd_dev_t *v; + zoneid_t zid; + + /* Not anyone can link something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0) + return (EFAULT); + + if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + c = vnd_reserved_names; + while (*c != NULL) { + if (strcmp(vil.vil_name, *c) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + c++; + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOTATTACHED; + goto errcopyout; + } + + if 
(vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto errcopyout; + } + + if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKED; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_LINK_INFLIGHT; + zid = vdp->vdd_nsd->vpnd_zid; + mutex_exit(&vdp->vdd_lock); + + if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >= + sizeof (mname)) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + mutex_enter(&vnd_dev_lock); + for (v = list_head(&vnd_dev_list); v != NULL; + v = list_next(&vnd_dev_list, v)) { + if (!(v->vdd_flags & VND_D_LINKED)) + continue; + + if (v->vdd_nsd->vpnd_zid == zid && + strcmp(v->vdd_lname, vil.vil_name) == 0) { + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKEXISTS; + goto error; + } + } + + /* + * We set the name and mark ourselves attached while holding the list + * lock to ensure that no other user can mistakingly find our name. + */ + (void) snprintf(mname, sizeof (mname), "z%d:%s", zid, + vil.vil_name); + mutex_enter(&vdp->vdd_lock); + + /* + * Because we dropped our lock, we need to double check whether or not + * the zone was marked as dying while we were here. If it hasn't, then + * it's safe for us to link it in. 
+ */ + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto error; + } + + (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname)); + if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + ret = EIO; + vil.vil_errno = VND_E_MINORNODE; + } else { + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_flags |= VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + ret = 0; + } + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + if (ret == 0) { + /* + * Add a reference to represent that this device is linked into + * the file system name space to ensure that it doesn't + * disappear. + */ + vnd_dev_ref(vdp); + return (0); + } + +error: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_lname[0] = '\0'; + mutex_exit(&vdp->vdd_lock); + +errcopyout: + if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0) + ret = EFAULT; + return (ret); +} + +/* + * Common unlink function. This is used both from the ioctl path and from the + * netstack shutdown path. The caller is required to hold the mutex on the + * vnd_dev_t, but they basically will have it relinquished for them. The only + * thing the caller is allowed to do afterward is to potentially rele the + * vnd_dev_t if they have their own hold. Note that only the ioctl path has its + * own hold. 
+ */ +static void +vnd_dev_unlink(vnd_dev_t *vdp) +{ + char mname[2*VND_NAMELEN]; + + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + (void) snprintf(mname, sizeof (mname), "z%d:%s", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname); + ddi_remove_minor_node(vnd_dip, mname); + vdp->vdd_lname[0] = '\0'; + vdp->vdd_flags &= ~VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + mutex_exit(&vdp->vdd_lock); + + /* + * This rele corresponds to the reference that we took in + * vnd_ioctl_link. + */ + vnd_dev_rele(vdp); +} + +static int +vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret; + zoneid_t zid; + vnd_ioc_unlink_t viu; + + /* Not anyone can unlink something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + zid = crgetzoneid(credp); + + if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0) + return (EFAULT); + + viu.viu_errno = VND_E_SUCCESS; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_NOTLINKED; + goto err; + } + VERIFY(vdp->vdd_flags & VND_D_ATTACHED); + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_PERM; + goto err; + } + + /* vnd_dev_unlink releases the vdp mutex for us */ + vnd_dev_unlink(vdp); + ret = 0; +err: + if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + 
mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vdp->vdd_str->vns_dq_read.vdq_max = (size_t)vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +/* + * Copy out the current limit (vdq_max) of the read data queue. Fails with EIO + * and VND_E_NOTATTACHED in vib_errno when the device is not attached. The + * reply is zeroed before use so that every byte handed to ddi_copyout() is + * defined and no uninitialized kernel stack contents reach the caller. + */ +static int +vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + bzero(&vib, sizeof (vib)); + vib.vib_errno = VND_E_SUCCESS; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +/* + * Copy out the global hard limit on data queue sizes (vnd_vdq_hard_max), read + * under vnd_dev_lock. The reply is zeroed first so that vib_errno and every + * other byte copied out is well defined. + */ +/* ARGSUSED */ +static int +vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + vnd_ioc_buf_t vib; + + bzero(&vib, sizeof (vib)); + vib.vib_errno = VND_E_SUCCESS; + + mutex_enter(&vnd_dev_lock); + vib.vib_size = vnd_vdq_hard_max; + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (0); +} + +/* + * Copy out the current limit (vdq_max) of the write data queue. Fails with + * EIO and VND_E_NOTATTACHED in vib_errno when the device is not attached. The + * reply is zeroed before use so that no uninitialized kernel stack contents + * are copied out to the caller. + */ +static int +vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + bzero(&vib, sizeof (vib)); + vib.vib_errno = VND_E_SUCCESS; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max; +
mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + mutex_exit(&vdp->vdd_str->vns_lock); + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vdp->vdd_str->vns_dq_write.vdq_max = (size_t)vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min) +{ + vnd_ioc_buf_t vib; + + vib.vib_errno = 0; + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED) { + mutex_enter(&vdp->vdd_str->vns_lock); + if (min == B_TRUE) + vib.vib_size = vdp->vdd_str->vns_minwrite; + else + vib.vib_size = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + } else { + vib.vib_errno = VND_E_NOTATTACHED; + } + mutex_exit(&vdp->vdd_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_frameio_read(vnd_dev_t 
*vdp, intptr_t addr, int mode) +{ + int ret, nonblock, nwrite; + frameio_t *fio; + vnd_data_queue_t *vqp; + mblk_t *mp; + + fio = frameio_alloc(KM_NOSLEEP_LAZY); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr, + mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EWOULDBLOCK); + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EINTR); + } + } + } + + ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head, + &nwrite, mode & FKIOCTL); + if (ret != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (ret); + } + + ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode); + if (ret != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (ret); + } + + while (nwrite > 0) { + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); + nwrite--; + } + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + + return (0); +} + +static int +vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode) +{ + frameio_t *fio; + int ret, nonblock, nframes, i, nread; + size_t maxwrite, minwrite, total, flen; + mblk_t *mp_chain, *mp, *nmp; + vnd_data_queue_t *vqp; + + fio = frameio_alloc(KM_NOSLEEP_LAZY); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + 
mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + /* + * Make sure no single frame is larger than we can accept. + */ + mutex_enter(&vdp->vdd_str->vns_lock); + minwrite = vdp->vdd_str->vns_minwrite; + maxwrite = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + + /* + * fio_vecs holds consecutive frames of fio_nvpf vectors each (see the + * indexing in the loop below), so the number of frames is the total + * vector count divided by the vectors per frame. Getting this ratio + * backwards yields zero frames for any multi-frame request, which + * skips the size checks and reserves no queue space. + * NOTE(review): assumes frameio_hdr_copyin() rejects fio_nvpf == 0 -- + * confirm. + */ + nframes = fio->fio_nvecs / fio->fio_nvpf; + total = 0; + for (i = 0; i < nframes; i++) { + flen = frameio_frame_length(fio, + &fio->fio_vecs[i*fio->fio_nvpf]); + if (flen < minwrite || flen > maxwrite) { + frameio_free(fio); + return (ERANGE); + } + total += flen; + } + + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, total) == 0) { + if (nonblock != 0) { + frameio_free(fio); + mutex_exit(&vqp->vdq_lock); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * We've reserved our space, let's copyin and go from here. + */ + ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL); + if (ret != 0) { + frameio_free(fio); + vnd_dq_unreserve(vqp, total); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + return (ret); + } + + for (mp = mp_chain; mp != NULL; mp = nmp) { + nmp = mp->b_next; + mp->b_next = NULL; + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + } + + /* + * Update the frameio structure to indicate that we wrote those frames.
+ */ + frameio_mark_consumed(fio, nread); + ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode); + frameio_free(fio); + + return (ret); +} + +static int +vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode) +{ + const char *link; + uint32_t vers = 1; + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + /* + * Copy all of the members out to userland. + */ + if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (vdp->vdd_flags & VND_D_LINKED) + link = vdp->vdd_lname; + else + link = "<anonymous>"; + if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink, + sizeof (arg->vii_datalink), mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone, + sizeof (zoneid_t), mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +static int +vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode) +{ + vnd_ioc_list_t vl; + vnd_ioc_list32_t vl32; + zoneid_t zid; + vnd_dev_t *vdp; + vnd_ioc_info_t *vip; + int found, cancopy, ret; + + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + vl.vl_nents = vl32.vl_nents; + vl.vl_actents = vl32.vl_actents; + vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents; + } else { + if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t), + mode & FKIOCTL) != 0) + return (EFAULT); + } + + cancopy = vl.vl_nents; + vip = vl.vl_ents; + found = 0; + zid = crgetzoneid(credp); + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) && + (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) { + found++; + if (cancopy > 0) { + 
ret = vnd_ioctl_list_copy_info(vdp, vip, mode); + if (ret != 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return (ret); + } + cancopy--; + vip++; + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents, + sizeof (uint_t), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + + +/* ARGSUSED */ +static int +vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int ret; + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + ASSERT(m != 0); + + /* + * Make sure no one has come in on an ioctl from the strioc case. + */ + if ((cmd & VND_STRIOC) == VND_STRIOC) + return (ENOTTY); + + /* + * Like close, seems like if this minor isn't found, it's a programmer + * error somehow. + */ + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + switch (cmd) { + case VND_IOC_ATTACH: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_attach(vdp, arg, credp, mode); + break; + case VND_IOC_LINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_link(vdp, arg, credp, mode); + break; + case VND_IOC_UNLINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_unlink(vdp, arg, credp, mode); + break; + case VND_IOC_GETRXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_getrxbuf(vdp, arg, mode); + break; + case VND_IOC_SETRXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_setrxbuf(vdp, arg, mode); + break; + case VND_IOC_GETTXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettxbuf(vdp, arg, mode); + break; + case VND_IOC_SETTXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_settxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMAXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + if (crgetzoneid(credp) != GLOBAL_ZONEID) { + ret = EPERM; + break; + } + ret = 
vnd_ioctl_getmaxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMINTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE); + break; + case VND_IOC_GETMAXTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE); + break; + case VND_IOC_FRAMEIO_READ: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_frameio_read(vdp, arg, mode); + break; + case VND_IOC_FRAMEIO_WRITE: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_frameio_write(vdp, arg, mode); + break; + case VND_IOC_LIST: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_list(arg, credp, mode); + break; + default: + ret = ENOTTY; + break; + } + + vnd_dev_rele(vdp); + return (ret); +} + +static int +vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + vnd_dev_t *vdp; + minor_t m; + zoneid_t zid; + + if (flag & (FEXCL | FNDELAY)) + return (ENOTSUP); + + if (otyp & OTYP_BLK) + return (ENOTSUP); + + zid = crgetzoneid(credp); + m = getminor(*devp); + + /* + * If we have an open of a non-zero instance then we need to look that + * up in our list of entries. + */ + if (m != 0) { + + /* + * We don't check for rawaccess globally as a user could be + * doing a list ioctl on the control node which doesn't require + * this privilege. + */ + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENOENT); + + /* + * We need to check to make sure that the user is allowed to + * open this node. At this point it should be an attached handle + * as that's all we're allowed to access. 
+ */ + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (EBUSY); + } + + if (!(vdp->vdd_flags & VND_D_OPENED)) { + vdp->vdd_flags |= VND_D_OPENED; + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + } + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + + return (0); + } + + if (flag & FEXCL) + return (ENOTSUP); + + /* + * We need to clone ourselves and set up new a state. + */ + vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP); + bzero(vdp, sizeof (vnd_dev_t)); + + if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) { + kmem_cache_free(vnd_dev_cache, vdp); + return (EINVAL); + } + + vdp->vdd_minor = id_alloc(vnd_minors); + mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL); + list_link_init(&vdp->vdd_link); + vdp->vdd_ref = 1; + *devp = makedevice(getmajor(*devp), vdp->vdd_minor); + vdp->vdd_devid = *devp; + DTRACE_VND_REFINC(vdp); + vdp->vdd_flags |= VND_D_OPENED; + + mutex_enter(&vnd_dev_lock); + list_insert_head(&vnd_dev_list, vdp); + mutex_exit(&vnd_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +vnd_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + if (m == 0) + return (ENXIO); + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_flags & VND_D_OPENED); + vdp->vdd_flags &= ~VND_D_OPENED; + mutex_exit(&vdp->vdd_lock); + + /* Remove the hold from the previous open. 
*/ + vnd_dev_rele(vdp); + + /* And now from lookup */ + vnd_dev_rele(vdp); + return (0); +} + +/* ARGSUSED */ +static int +vnd_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error = 0; + size_t mpsize; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + mblk_t *mp = NULL; + offset_t u_loffset; + + /* + * If we have more than one uio we refuse to do anything. That's for + * frameio. + */ + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + error = EWOULDBLOCK; + goto err; + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + error = EINTR; + goto err; + } + } + } + + /* Ensure our buffer is big enough */ + mp = vqp->vdq_head; + ASSERT(mp != NULL); + mpsize = msgsize(mp); + if (mpsize > uiop->uio_resid) { + error = EOVERFLOW; + goto err; + } + + u_loffset = uiop->uio_loffset; + while (mp != NULL) { + if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) { + error = EFAULT; + uiop->uio_loffset = u_loffset; + mp = NULL; + goto err; + } + mpsize -= MBLKL(mp); + mp = mp->b_cont; + } + ASSERT(mpsize == 0); + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); +err: + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + + return (error); +} + +/* ARGSUSED */ +static int +vnd_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error; + vnd_dev_t *vdp; + mblk_t *mp; + ssize_t iosize, origsize; + vnd_data_queue_t *vqp; + + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + 
mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + mutex_enter(&vdp->vdd_str->vns_lock); + if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite || + uiop->uio_resid < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + vnd_dev_rele(vdp); + return (ERANGE); + } + mutex_exit(&vdp->vdd_str->vns_lock); + VERIFY(vdp->vdd_str != NULL); + + /* + * Reserve space in the data queue if we can. If we can't, block or + * return EAGAIN. If we can, go and squeue_enter. + */ + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * Now that we've reserved the space, try to allocate kernel space for + * and copy in the block. To take care of all this we use the + * strmakedata subroutine for now. + */ + origsize = iosize = uiop->uio_resid; + error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0, + &mp); + + /* + * strmakedata() will return an error or it may only consume a portion + * of the data. 
+ */ + if (error != 0 || uiop->uio_resid != 0) { + vnd_dq_unreserve(vqp, origsize); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + vnd_dev_rele(vdp); + return (ENOSR); + } + + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + + vnd_dev_rele(vdp); + return (0); +} + +static int +vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + short ready = 0; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + if ((events & POLLIN) || (events & POLLRDNORM)) { + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head != NULL) + ready |= events & (POLLIN | POLLRDNORM); + mutex_exit(&vqp->vdq_lock); + } + + if (events & POLLOUT) { + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_cur != vqp->vdq_max) + ready |= POLLOUT; + mutex_exit(&vqp->vdq_lock); + } + + if ((ready == 0 && !anyyet) || (events & POLLET)) { + *phpp = &vdp->vdd_ph; + } + *reventsp = ready; + vnd_dev_rele(vdp); + return (0); +} + +/* ARGSUSED */ +static void * +vnd_stack_init(netstackid_t stackid, netstack_t *ns) +{ + vnd_pnsd_t *nsp; + + nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP); + bzero(nsp, sizeof (*nsp)); + nsp->vpnd_nsid = stackid; + nsp->vpnd_zid = netstackid_to_zoneid(stackid); + nsp->vpnd_flags = 0; + mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_nslink)); + if (vnd_netinfo_init(nsp) == 0) + nsp->vpnd_hooked = B_TRUE; + + mutex_enter(&vnd_dev_lock); + list_insert_tail(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + return (nsp); +} + +/* ARGSUSED 
*/ +static void +vnd_stack_shutdown(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + vnd_dev_t *vdp; + + ASSERT(nsp != NULL); + /* + * After shut down no one should be able to find their way to this + * netstack again. + */ + mutex_enter(&vnd_dev_lock); + list_remove(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + /* + * Make sure hooks know that they're going away. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_shutdown(nsp); + + /* + * Now we need to go through and notify each zone that they are in + * teardown phase. See the big theory statement section on vnd, zones, + * netstacks, and sdev for more information about this. + */ + mutex_enter(&nsp->vpnd_lock); + nsp->vpnd_flags |= VND_NS_CONDEMNED; + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_CONDEMNED)) + vdp->vdd_flags |= VND_D_ZONE_DYING; + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + /* + * Next we remove all the links as we know nothing new can be added to + * the list and that none of the extent devices can obtain additional + * links. + */ +restart: + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_CONDEMNED) || + !(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + continue; + } + + /* + * We drop our lock here and restart afterwards. Note that as + * part of unlinking we end up doing a rele of the vnd_dev_t. If + * this is the final hold on the vnd_dev_t then it might try and + * remove itself. Our locking rules requires not to be holding + * any locks when we call any of the rele functions. + * + * Note that the unlink function requires holders to call into + * it with the vnd_dev_t->vdd_lock held and will take care of it + * for us. 
Because we don't have a hold on it, we're done at + * this point. + */ + mutex_exit(&nsp->vpnd_lock); + /* Forcibly unlink */ + vnd_dev_unlink(vdp); + goto restart; + } + mutex_exit(&nsp->vpnd_lock); +} + +/* ARGSUSED */ +static void +vnd_stack_destroy(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + + ASSERT(nsp != NULL); + + /* + * Now that we've unlinked everything we just have to hang out for + * it to finish exiting. Now that it's no longer the kernel itself + * that's doing this we just need to wait for our reference count to + * equal zero and then we're free. If the global zone is holding open a + * reference to a vnd device for another zone, that's bad, but there's + * nothing much we can do. See the section on 'vnd, zones, netstacks' in + * the big theory statement for more information. + */ + mutex_enter(&nsp->vpnd_lock); + while (nsp->vpnd_ref != 0) + cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock); + mutex_exit(&nsp->vpnd_lock); + + /* + * During shutdown we removed ourselves from the list and now we have no + * more references so we can safely say that there is nothing left and + * destroy everything that we had sitting around. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_fini(nsp); + + mutex_destroy(&nsp->vpnd_lock); + list_destroy(&nsp->vpnd_dev_list); + kmem_cache_free(vnd_pnsd_cache, nsp); +} + +/* + * Convert a node with a name of the form /dev/vnd/zone/%zonename and + * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack. 
+ */ +static vnd_pnsd_t * +vnd_sdev_ctx_to_ns(sdev_ctx_t ctx) +{ + enum vtype vt; + const char *path = sdev_ctx_path(ctx); + char *zstart, *dup; + size_t duplen; + vnd_pnsd_t *nsp; + + vt = sdev_ctx_vtype(ctx); + ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0); + + if (vt == VDIR) { + zstart = strrchr(path, '/'); + ASSERT(zstart != NULL); + zstart++; + return (vnd_nsd_lookup_by_zonename(zstart)); + } + + ASSERT(vt == VCHR); + + dup = strdup(path); + duplen = strlen(dup) + 1; + zstart = strrchr(dup, '/'); + *zstart = '\0'; + zstart--; + zstart = strrchr(dup, '/'); + zstart++; + nsp = vnd_nsd_lookup_by_zonename(zstart); + kmem_free(dup, duplen); + + return (nsp); +} + +static sdev_plugin_validate_t +vnd_sdev_validate_dir(sdev_ctx_t ctx) +{ + vnd_pnsd_t *nsp; + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0) + return (SDEV_VTOR_VALID); + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) { + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + return (SDEV_VTOR_VALID); + } + + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (SDEV_VTOR_INVALID); + vnd_nsd_rele(nsp); + + return (SDEV_VTOR_VALID); +} + +static sdev_plugin_validate_t +vnd_sdev_validate(sdev_ctx_t ctx) +{ + enum vtype vt; + vnd_dev_t *vdp; + minor_t minor; + + vt = sdev_ctx_vtype(ctx); + if (vt == VDIR) + return (vnd_sdev_validate_dir(ctx)); + ASSERT(vt == VCHR); + + if (strcmp("ctl", sdev_ctx_name(ctx)) == 0) + return (SDEV_VTOR_VALID); + + if (sdev_ctx_minor(ctx, &minor) != 0) + return (SDEV_VTOR_STALE); + + vdp = vnd_dev_lookup(minor); + if (vdp == NULL) + return (SDEV_VTOR_STALE); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED) || + (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return 
(SDEV_VTOR_STALE); + } + + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_VALID); +} + +/* + * This function is a no-op. sdev never has holds on our devices as they can go + * away at any time and specfs has to deal with that fact. + */ +/* ARGSUSED */ +static void +vnd_sdev_inactive(sdev_ctx_t ctx) +{ +} + +static int +vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx) +{ + int ret; + vnd_dev_t *vdp; + + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_LINKED) && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + ret = sdev_plugin_mknod(ctx, vdp->vdd_lname, + VND_SDEV_MODE, vdp->vdd_devid); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&nsp->vpnd_lock); + vnd_nsd_rele(nsp); + return (ret); + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + return (0); +} + +static int +vnd_sdev_filldir_root(sdev_ctx_t ctx) +{ + zoneid_t zid; + vnd_pnsd_t *nsp; + int ret; + + zid = getzoneid(); + nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid)); + ASSERT(nsp != NULL); + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + if (ret != 0) + return (ret); + + /* + * Checking the zone id is not sufficient as the global zone could be + * reaching down into a non-global zone's mounted /dev. + */ + if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) { + ret = sdev_plugin_mkdir(ctx, "zone"); + if (ret != 0 && ret != EEXIST) + return (ret); + } + + /* + * Always add a reference to the control node. There's no need to + * reference it since it always exists and is always what we clone from. 
+ */ + ret = sdev_plugin_mknod(ctx, "ctl", VND_SDEV_MODE, + makedevice(ddi_driver_major(vnd_dip), 0)); + if (ret != 0 && ret != EEXIST) + return (ret); + + return (0); +} + +static int +vnd_sdev_filldir_zroot(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + zone_t *zonep; + + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + mutex_enter(&nsp->vpnd_lock); + if (list_is_empty(&nsp->vpnd_dev_list)) { + mutex_exit(&nsp->vpnd_lock); + continue; + } + mutex_exit(&nsp->vpnd_lock); + zonep = zone_find_by_id(nsp->vpnd_zid); + /* + * This zone must be being torn down, so skip it. + */ + if (zonep == NULL) + continue; + ret = sdev_plugin_mkdir(ctx, zonep->zone_name); + zone_rele(zonep); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vnd_dev_lock); + return (ret); + } + } + mutex_exit(&vnd_dev_lock); + return (0); +} + +static int +vnd_sdev_filldir(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + + ASSERT(sdev_ctx_vtype(ctx) == VDIR); + if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_root(ctx)); + + if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_zroot(ctx)); + + ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx), + strlen(VND_SDEV_ZROOT)) == 0); + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (0); + + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + + return (ret); +} + +static sdev_plugin_ops_t vnd_sdev_ops = { + SDEV_PLUGIN_VERSION, + SDEV_PLUGIN_SUBDIR, + vnd_sdev_validate, + vnd_sdev_filldir, + vnd_sdev_inactive +}; + +static int +vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int errp = 0; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Only allow one instance. 
+ */ + if (vnd_dip != NULL) + return (DDI_FAILURE); + + vnd_dip = dip; + if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) != + DDI_SUCCESS) { + vnd_dip = NULL; + return (DDI_FAILURE); + } + + if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, + DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { + ddi_remove_minor_node(vnd_dip, NULL); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops, + &errp); + if (vnd_sdev_hdl == (sdev_plugin_hdl_t)NULL) { + ddi_remove_minor_node(vnd_dip, NULL); + ddi_prop_remove_all(vnd_dip); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_PRIORITY); + + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&vnd_dev_lock); + if (!list_is_empty(&vnd_dev_list)) { + mutex_exit(&vnd_dev_lock); + return (DDI_FAILURE); + } + mutex_exit(&vnd_dev_lock); + + return (DDI_FAILURE); +} + +/* ARGSUSED */ +static int +vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vnd_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + + + +static void +vnd_ddi_fini(void) +{ + netstack_unregister(NS_VND); + if (vnd_taskq != NULL) + taskq_destroy(vnd_taskq); + if (vnd_str_cache != NULL) + kmem_cache_destroy(vnd_str_cache); + if (vnd_dev_cache != NULL) + kmem_cache_destroy(vnd_dev_cache); + if (vnd_pnsd_cache != NULL) + kmem_cache_destroy(vnd_pnsd_cache); + if (vnd_minors != NULL) + id_space_destroy(vnd_minors); + if (vnd_list_init != 0) { + list_destroy(&vnd_nsd_list); + list_destroy(&vnd_dev_list); + mutex_destroy(&vnd_dev_lock); + vnd_list_init = 0; + } + frameio_fini(); 
+} + +static int +vnd_ddi_init(void) +{ + if (frameio_init() != 0) + return (DDI_FAILURE); + + vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_str_cache == NULL) { + frameio_fini(); + return (DDI_FAILURE); + } + vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_dev_cache == NULL) { + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache", + sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_pnsd_cache == NULL) { + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0); + if (vnd_taskq == NULL) { + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX); + if (vnd_minors == NULL) { + taskq_destroy(vnd_taskq); + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&vnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_link)); + list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t), + offsetof(vnd_pnsd_t, vpnd_link)); + vnd_list_init = 1; + + netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown, + vnd_stack_destroy); + + return (DDI_SUCCESS); +} + +static struct module_info vnd_minfo = { + 0, /* module id */ + "vnd", /* module name */ + 1, /* smallest packet size */ + INFPSZ, /* largest packet size (infinite) */ + 1, /* high watermark */ + 0 /* low watermark */ +}; + +static struct qinit vnd_r_qinit = { + vnd_s_rput, + NULL, + vnd_s_open, + vnd_s_close, + NULL, + 
&vnd_minfo, + NULL +}; + +static struct qinit vnd_w_qinit = { + vnd_s_wput, + NULL, + NULL, + NULL, + NULL, + &vnd_minfo, + NULL +}; + +static struct streamtab vnd_strtab = { + &vnd_r_qinit, + &vnd_w_qinit, + NULL, + NULL +}; + + +static struct cb_ops vnd_cb_ops = { + vnd_open, /* open */ + vnd_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + vnd_read, /* read */ + vnd_write, /* write */ + vnd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + vnd_chpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops vnd_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + vnd_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + vnd_attach, /* attach */ + vnd_detach, /* detach */ + nodev, /* reset */ + &vnd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +static struct modldrv vnd_modldrv = { + &mod_driverops, + "Virtual Networking Datapath Driver", + &vnd_dev_ops +}; + +static struct fmodsw vnd_fmodfsw = { + "vnd", + &vnd_strtab, + D_NEW | D_MP +}; + +static struct modlstrmod vnd_modlstrmod = { + &mod_strmodops, + "Virtual Networking Datapath Driver", + &vnd_fmodfsw +}; + +static struct modlinkage vnd_modlinkage = { + MODREV_1, + &vnd_modldrv, + &vnd_modlstrmod, + NULL +}; + +int +_init(void) +{ + int error; + + /* + * We need to do all of our global initialization in init as opposed to + * attach and detach. The problem here is that because vnd can be used + * from a stream context while being detached, we can not rely on having + * run attach to create everything, alas. so it goes in _init, just like + * our friend ip. 
+ */ + if ((error = vnd_ddi_init()) != DDI_SUCCESS) + return (error); + error = mod_install((&vnd_modlinkage)); + if (error != 0) + vnd_ddi_fini(); + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vnd_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&vnd_modlinkage); + if (error == 0) + vnd_ddi_fini(); + return (error); +} diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf new file mode 100644 index 0000000000..65872e1ddf --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014, Joyent, Inc. All rights reserved. +# + +name="vnd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c new file mode 100644 index 0000000000..46a9e435cd --- /dev/null +++ b/usr/src/uts/common/io/zfd.c @@ -0,0 +1,1157 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. All rights reserved. + */ + +/* + * Zone File Descriptor Driver. 
+ * + * This driver is derived from the zcons driver which is in turn derived from + * the pts/ptm drivers. The purpose is to expose file descriptors within the + * zone which are connected to zoneadmd and used for logging or an interactive + * connection to a process within the zone. + * + * Its implementation is straightforward. Each instance of the driver + * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd + * uses these devices unidirectionally to provide stdin, stdout and stderr to + * the process within the zone. + * + * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd, + * using the devctl framework; thus the driver does not need to maintain any + * sort of "admin" node. + * + * The driver shuttles I/O from master side to slave side and back. In a break + * from the pts/ptm semantics, if one side is not open, I/O directed towards + * it will simply be discarded. This is so that if zoneadmd is not holding the + * master side fd open (i.e. it has died somehow), processes in the zone do not + * experience any errors and I/O to the fd does not cause the process to hang. + * + * The driver can also act as a multiplexer so that data written to the + * slave side within the zone is also redirected back to another zfd device + * inside the zone for consumption (i.e. it can be read). The intention is + * that a logging process within the zone can consume data that is being + * written by an application onto the primary stream. This is essentially + * a tee off of the primary stream into a log stream. This tee can also be + * configured to be flow controlled via an ioctl. Flow control happens on the + * primary stream and is used to ensure that the log stream receives all of + * the messages off the primary stream when consumption of the data off of + * the log stream gets behind. 
Configuring for flow control implies that the + * application writing to the primary stream will be blocked when the log + * consumer gets behind. Note that closing the log stream (e.g. when the zone + * halts) will cause the loss of all messages queued in the stream. + * + * The zone's zfd device configuration is driven by zoneadmd and a zone mode. + * The mode, which is controlled by the zone attribute "zlog-mode", is somewhat + * of a misnomer since its purpose has evolved. The attribute can have a + * variety of values, but the lowest two positions are used to control how many + * zfd devices are created inside the zone and if the primary stream is a tty. + * + * Here is a summary of how the 4 modes control what zfd devices are created + * and how they're used: + * + * t-: 1 stdio zdev (0) configured as a tty + * --: 3 stdio zdevs (0, 1, 2), not configured as a tty + * tn: 1 stdio zdev (0) configured as a tty, 1 additional zdev (1) + * -n: 3 stdio zdevs (0, 1, 2), not tty, 2 additional zdevs (3, 4) + * + * With the 't' flag set, stdin/out/err is multiplexed onto a single full-duplex + * stream which is configured as a tty. That is, ptem, ldterm and ttcompat are + * autopushed onto the stream when the slave side is opened. There is only a + * single zfd dev (0) needed for the primary stream. + * + * When the 'n' flag is set, it is assumed that output logging will be done + * within the zone itself. In this configuration 1 or 2 additional zfd devices, + * depending on tty mode ('t' flag) are created within the zone. An application + * can then configure the zfd streams driver into a multiplexer. Output from + * the stdout/stderr zfd(s) will be teed into the corresponding logging zfd(s) + * within the zone.
+ * + * The following is a diagram of how this works for a '-n' configuration: + * + * + * zoneadmd (for zlogin -I stdout) + * GZ: ^ + * | + * -------------------------- + * ^ + * NGZ: | + * app >1 -> zfd1 -> zfd3 -> logger (for logger to consume app's stdout) + * + * There would be a similar path for the app's stderr into zfd4 for the logger + * to consume stderr. + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/cred.h> +#include <sys/ddi.h> +#include <sys/debug.h> +#include <sys/devops.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kstr.h> +#include <sys/modctl.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/stream.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/zfd.h> +#include <sys/vnode.h> +#include <sys/fs/snode.h> +#include <sys/zone.h> +#include <sys/sdt.h> + +static kmutex_t zfd_mux_lock; + +static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int zfd_attach(dev_info_t *, ddi_attach_cmd_t); +static int zfd_detach(dev_info_t *, ddi_detach_cmd_t); + +static int zfd_open(queue_t *, dev_t *, int, int, cred_t *); +static int zfd_close(queue_t *, int, cred_t *); +static int zfd_wput(queue_t *, mblk_t *); +static int zfd_rsrv(queue_t *); +static int zfd_wsrv(queue_t *); + +/* + * The instance number is encoded in the dev_t in the minor number; the lowest + * bit of the minor number is used to track the master vs. slave side of the + * fd. The rest of the bits in the minor number are the instance. + */ +#define ZFD_MASTER_MINOR 0 +#define ZFD_SLAVE_MINOR 1 + +#define ZFD_INSTANCE(x) (getminor((x)) >> 1) +#define ZFD_NODE(x) (getminor((x)) & 0x01) + +/* + * This macro converts a zfd_state_t pointer to the associated slave minor + * node's dev_t. 
+ */ +#define ZFD_STATE_TO_SLAVEDEV(x) \ + (makedevice(ddi_driver_major((x)->zfd_devinfo), \ + (minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR))) + +int zfd_debug = 0; +#define DBG(a) if (zfd_debug) cmn_err(CE_NOTE, a) +#define DBG1(a, b) if (zfd_debug) cmn_err(CE_NOTE, a, b) + +/* + * ZFD Pseudo Terminal Module: stream data structure definitions, + * based on zcons. + */ +static struct module_info zfd_info = { + 0x20FD, /* ZOFD - 8445 */ + "zfd", + 0, /* min packet size */ + INFPSZ, /* max packet size - infinity */ + 2048, /* high water */ + 128 /* low water */ +}; + +static struct qinit zfd_rinit = { + NULL, + zfd_rsrv, + zfd_open, + zfd_close, + NULL, + &zfd_info, + NULL +}; + +static struct qinit zfd_winit = { + zfd_wput, + zfd_wsrv, + NULL, + NULL, + NULL, + &zfd_info, + NULL +}; + +static struct streamtab zfd_tab_info = { + &zfd_rinit, + &zfd_winit, + NULL, + NULL +}; + +#define ZFD_CONF_FLAG (D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL) + +/* + * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops) + */ +DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \ + nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \ + ddi_quiesce_not_needed); + +/* + * Module linkage information for the kernel. 
+ */ + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module (this is a pseudo driver) */ + "Zone FD driver", /* description of module */ + &zfd_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +typedef enum { + ZFD_NO_MUX, + ZFD_PRIMARY_STREAM, + ZFD_LOG_STREAM +} zfd_mux_type_t; + +typedef struct zfd_state { + dev_info_t *zfd_devinfo; /* instance info */ + queue_t *zfd_master_rdq; /* GZ read queue */ + queue_t *zfd_slave_rdq; /* in-zone read queue */ + int zfd_state; /* ZFD_STATE_MOPEN, ZFD_STATE_SOPEN */ + int zfd_tty; /* ZFD_MAKETTY - strm mods will push */ + boolean_t zfd_is_flowcon; /* primary stream flow stopped */ + boolean_t zfd_allow_flowcon; /* use flow control */ + zfd_mux_type_t zfd_muxt; /* state type: none, primary, log */ + struct zfd_state *zfd_inst_pri; /* log state's primary ptr */ + struct zfd_state *zfd_inst_log; /* primary state's log ptr */ +} zfd_state_t; + +#define ZFD_STATE_MOPEN 0x01 +#define ZFD_STATE_SOPEN 0x02 + +static void *zfd_soft_state; + +/* + * List of STREAMS modules that are autopushed onto a slave instance when its + * opened, but only if the ZFD_MAKETTY ioctl has first been received by the + * master. 
+ */ +static char *zfd_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +int +_init(void) +{ + int err; + + if ((err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t), + 0)) != 0) { + return (err); + } + + if ((err = mod_install(&modlinkage)) != 0) + ddi_soft_state_fini(zfd_soft_state); + + mutex_init(&zfd_mux_lock, NULL, MUTEX_DEFAULT, NULL); + return (err); +} + + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modlinkage)) != 0) { + return (err); + } + + ddi_soft_state_fini(&zfd_soft_state); + mutex_destroy(&zfd_mux_lock); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static int +zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + char masternm[ZFD_NAME_LEN], slavenm[ZFD_NAME_LEN]; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(zfd_soft_state, instance) != DDI_SUCCESS) + return (DDI_FAILURE); + + (void) snprintf(masternm, sizeof (masternm), "%s%d", ZFD_MASTER_NAME, + instance); + (void) snprintf(slavenm, sizeof (slavenm), "%s%d", ZFD_SLAVE_NAME, + instance); + + /* + * Create the master and slave minor nodes. 
+ */ + if ((ddi_create_minor_node(dip, slavenm, S_IFCHR, + instance << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) || + (ddi_create_minor_node(dip, masternm, S_IFCHR, + instance << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)) { + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + return (DDI_FAILURE); + } + + VERIFY((zfds = ddi_get_soft_state(zfd_soft_state, instance)) != NULL); + zfds->zfd_devinfo = dip; + zfds->zfd_tty = 0; + zfds->zfd_muxt = ZFD_NO_MUX; + zfds->zfd_inst_log = NULL; + return (DDI_SUCCESS); +} + +static int +zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (DDI_FAILURE); + + if ((zfds->zfd_state & ZFD_STATE_MOPEN) || + (zfds->zfd_state & ZFD_STATE_SOPEN)) { + DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip); + return (DDI_FAILURE); + } + + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + + return (DDI_SUCCESS); +} + +/* + * zfd_getinfo() + * getinfo(9e) entrypoint. + */ +/*ARGSUSED*/ +static int +zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + zfd_state_t *zfds; + int instance = ZFD_INSTANCE((dev_t)arg); + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if ((zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) + return (DDI_FAILURE); + *result = zfds->zfd_devinfo; + return (DDI_SUCCESS); + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)instance; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Return the equivalent queue from the other side of the relationship. + * e.g.: given the slave's write queue, return the master's write queue. 
+ */ +static queue_t * +zfd_switch(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq) + return (zfds->zfd_slave_rdq); + else if (OTHERQ(qp) == zfds->zfd_master_rdq && zfds->zfd_slave_rdq + != NULL) + return (OTHERQ(zfds->zfd_slave_rdq)); + else if (qp == zfds->zfd_slave_rdq) + return (zfds->zfd_master_rdq); + else if (OTHERQ(qp) == zfds->zfd_slave_rdq && zfds->zfd_master_rdq + != NULL) + return (OTHERQ(zfds->zfd_master_rdq)); + else + return (NULL); +} + +/* + * For debugging and outputting messages. Returns the name of the side of + * the relationship associated with this queue. + */ +static const char * +zfd_side(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq || + OTHERQ(qp) == zfds->zfd_master_rdq) { + return ("master"); + } + ASSERT(qp == zfds->zfd_slave_rdq || OTHERQ(qp) == zfds->zfd_slave_rdq); + return ("slave"); +} + +/*ARGSUSED*/ +static int +zfd_master_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + + /* + * Enforce exclusivity on the master side; the only consumer should + * be the zoneadmd for the zone. + */ + if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0) + return (EBUSY); + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_master_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_MOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + qprocson(rqp); + + /* + * Following qprocson(), the master side is fully plumbed into the + * STREAM and may send/receive messages. 
Setting zfds->zfd_master_rdq + * will allow the slave to send messages to us (the master). + * This cannot occur before qprocson() because the master is not + * ready to process them until that point. + */ + zfds->zfd_master_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + if (oflag & FNOCTTY) + sop->so_flags = SO_HIWAT | SO_LOWAT; + else + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/*ARGSUSED*/ +static int +zfd_slave_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + /* + * The slave side can be opened as many times as needed. + */ + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds)); + return (0); + } + + /* A log stream is read-only */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && + (oflag & (FREAD | FWRITE)) != FREAD) + return (EINVAL); + + if (zfds->zfd_tty == 1) { + major_t major; + minor_t minor; + minor_t lastminor; + uint_t anchorindex; + + /* + * Set up sad(7D) so that the necessary STREAMS modules will + * be in place. A wrinkle is that 'ptem' must be anchored + * in place (see streamio(7i)) because we always want the + * fd to have terminal semantics. 
+ */ + minor = + ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR; + major = ddi_driver_major(zfds->zfd_devinfo); + lastminor = 0; + anchorindex = 1; + if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor, + &anchorindex, zfd_mods) != 0) { + DBG("zfd_slave_open(): kstr_autopush() failed\n"); + return (EIO); + } + } + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_slave_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_SOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + + qprocson(rqp); + + /* + * Must follow qprocson(), since we aren't ready to process until then. + */ + zfds->zfd_slave_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/* + * open(9e) entrypoint; checks sflag, and rejects anything unordinary. 
+ */ +static int +zfd_open(queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + int instance = ZFD_INSTANCE(*devp); + int ret; + zfd_state_t *zfds; + + if (sflag != 0) + return (EINVAL); + + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (ENXIO); + + switch (ZFD_NODE(*devp)) { + case ZFD_MASTER_MINOR: + ret = zfd_master_open(zfds, rqp, devp, oflag, sflag, credp); + break; + case ZFD_SLAVE_MINOR: + ret = zfd_slave_open(zfds, rqp, devp, oflag, sflag, credp); + /* + * If we just opened the log stream and flow control has + * been enabled, we want to make sure the primary stream can + * start flowing. + */ + if (ret == 0 && zfds->zfd_muxt == ZFD_LOG_STREAM && + zfds->zfd_inst_pri->zfd_allow_flowcon) { + zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE; + if (zfds->zfd_inst_pri->zfd_master_rdq != NULL) + qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq)); + } + break; + default: + ret = ENXIO; + break; + } + + return (ret); +} + +/* + * close(9e) entrypoint. 
+ */ +/*ARGSUSED1*/ +static int +zfd_close(queue_t *rqp, int flag, cred_t *credp) +{ + queue_t *wqp; + mblk_t *bp; + zfd_state_t *zfds; + major_t major; + minor_t minor; + + zfds = (zfd_state_t *)rqp->q_ptr; + + if (rqp == zfds->zfd_master_rdq) { + DBG("Closing master side"); + + zfds->zfd_master_rdq = NULL; + zfds->zfd_state &= ~ZFD_STATE_MOPEN; + + /* + * qenable slave side write queue so that it can flush + * its messages as master's read queue is going away + */ + if (zfds->zfd_slave_rdq != NULL) { + qenable(WR(zfds->zfd_slave_rdq)); + } + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + } else if (rqp == zfds->zfd_slave_rdq) { + + DBG("Closing slave side"); + zfds->zfd_state &= ~ZFD_STATE_SOPEN; + zfds->zfd_slave_rdq = NULL; + + wqp = WR(rqp); + while ((bp = getq(wqp)) != NULL) { + if (zfds->zfd_master_rdq != NULL) + putnext(zfds->zfd_master_rdq, bp); + else if (bp->b_datap->db_type == M_IOCTL) + miocnak(wqp, bp, 0, 0); + else + freemsg(bp); + } + + /* + * Qenable master side write queue so that it can flush its + * messages as slaves's read queue is going away. + */ + if (zfds->zfd_master_rdq != NULL) + qenable(WR(zfds->zfd_master_rdq)); + + /* + * Qenable primary stream if necessary. + */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && + zfds->zfd_inst_pri->zfd_allow_flowcon) { + zfds->zfd_inst_pri->zfd_is_flowcon = B_FALSE; + if (zfds->zfd_inst_pri->zfd_master_rdq != NULL) + qenable(RD(zfds->zfd_inst_pri->zfd_master_rdq)); + } + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + if (zfds->zfd_tty == 1) { + /* + * Clear the sad configuration so that reopening + * doesn't fail to set up sad configuration. 
+ */ + major = ddi_driver_major(zfds->zfd_devinfo); + minor = ddi_get_instance(zfds->zfd_devinfo) << 1 | + ZFD_SLAVE_MINOR; + (void) kstr_autopush(CLR_AUTOPUSH, &major, &minor, + NULL, NULL, NULL); + } + } + + return (0); +} + +static void +handle_mflush(queue_t *qp, mblk_t *mp) +{ + mblk_t *nmp; + DBG1("M_FLUSH on %s side", zfd_side(qp)); + + if (*mp->b_rptr & FLUSHW) { + DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp)); + flushq(qp, FLUSHDATA); + *mp->b_rptr &= ~FLUSHW; + if ((*mp->b_rptr & FLUSHR) == 0) { + /* + * FLUSHW only. Change to FLUSHR and putnext other side, + * then we are done. + */ + *mp->b_rptr |= FLUSHR; + if (zfd_switch(RD(qp)) != NULL) { + putnext(zfd_switch(RD(qp)), mp); + return; + } + } else if ((zfd_switch(RD(qp)) != NULL) && + (nmp = copyb(mp)) != NULL) { + /* + * It is a FLUSHRW; we copy the mblk and send + * it to the other side, since we still need to use + * the mblk in FLUSHR processing, below. + */ + putnext(zfd_switch(RD(qp)), nmp); + } + } + + if (*mp->b_rptr & FLUSHR) { + DBG("qreply(qp) turning FLUSHR around\n"); + qreply(qp, mp); + return; + } + freemsg(mp); +} + +/* + * Evaluate the various conditionals to determine if we're teeing into a log + * stream and if the primary stream should be flow controlled. This function + * can set the zfd_is_flowcon flag as a side effect. + * + * When teeing with flow control, we always queue the teed msg here and if + * the queue is getting full, we set zfd_is_flowcon. The primary stream will + * always queue when zfd_is_flowcon and will also not be served when + * zfd_is_flowcon is set. This causes backpressure on the primary stream + * until the teed queue can drain. 
+ */ +static void +zfd_tee_handler(zfd_state_t *zfds, unsigned char type, mblk_t *mp) +{ + queue_t *log_qp; + zfd_state_t *log_zfds; + mblk_t *lmp; + + if (zfds->zfd_muxt != ZFD_PRIMARY_STREAM) + return; + + if (type != M_DATA) + return; + + log_zfds = zfds->zfd_inst_log; + if (log_zfds == NULL) + return; + + ASSERT(log_zfds->zfd_muxt == ZFD_LOG_STREAM); + + if ((log_zfds->zfd_state & ZFD_STATE_SOPEN) == 0) { + if (zfds->zfd_allow_flowcon) + zfds->zfd_is_flowcon = B_TRUE; + return; + } + + /* The zfd_slave_rdq is null until the log dev is opened in the zone */ + log_qp = RD(log_zfds->zfd_slave_rdq); + DTRACE_PROBE2(zfd__tee__check, void *, log_qp, void *, zfds); + + if (!zfds->zfd_allow_flowcon) { + /* + * We're not supposed to tee with flow control and the tee is + * full so we skip teeing into the log stream. + */ + if ((log_qp->q_flag & QFULL) != 0) + return; + } + + /* + * Tee the message into the log stream. + */ + lmp = dupmsg(mp); + if (lmp == NULL) { + if (zfds->zfd_allow_flowcon) + zfds->zfd_is_flowcon = B_TRUE; + return; + } + + if (log_qp->q_first == NULL && bcanputnext(log_qp, lmp->b_band)) { + putnext(log_qp, lmp); + } else { + if (putq(log_qp, lmp) == 0) { + /* The logger queue is full, free the msg. */ + freemsg(lmp); + } + /* + * If we're supposed to tee with flow control and the tee is + * over the high water mark then we want the primary stream to + * stop flowing. We'll stop queueing the primary stream after + * the log stream has drained. + */ + if (zfds->zfd_allow_flowcon && + log_qp->q_count > log_qp->q_hiwat) { + zfds->zfd_is_flowcon = B_TRUE; + } + } +} + +/* + * wput(9E) is symmetric for master and slave sides, so this handles both + * without splitting the codepath. (The only exception to this is the + * processing of zfd ioctls, which is restricted to the master side.) + * + * zfd_wput() looks at the other side; if there is no process holding that + * side open, it frees the message. 
This prevents processes from hanging + * if no one is holding open the fd. Otherwise, it putnext's high + * priority messages, putnext's normal messages if possible, and otherwise + * enqueues the messages; in the case that something is enqueued, wsrv(9E) + * will take care of eventually shuttling I/O to the other side. + * + * When configured as a multiplexer, then anything written to the stream + * from inside the zone is also teed off to the corresponding log stream + * for consumption within the zone (i.e. the log stream can be read, but never + * written to, by an application inside the zone). + */ +static int +zfd_wput(queue_t *qp, mblk_t *mp) +{ + unsigned char type = mp->b_datap->db_type; + zfd_state_t *zfds; + struct iocblk *iocbp; + boolean_t must_queue = B_FALSE; + + ASSERT(qp->q_ptr); + + DBG1("entering zfd_wput, %s side", zfd_side(qp)); + + /* + * Process zfd ioctl messages if qp is the master side's write queue. + */ + zfds = (zfd_state_t *)qp->q_ptr; + + if (type == M_IOCTL) { + iocbp = (struct iocblk *)(void *)mp->b_rptr; + + switch (iocbp->ioc_cmd) { + case ZFD_MAKETTY: + zfds->zfd_tty = 1; + miocack(qp, mp, 0, 0); + return (0); + case ZFD_EOF: + if (zfds->zfd_slave_rdq != NULL) + (void) putnextctl(zfds->zfd_slave_rdq, + M_HANGUP); + miocack(qp, mp, 0, 0); + return (0); + case ZFD_HAS_SLAVE: + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + miocack(qp, mp, 0, 0); + } else { + miocack(qp, mp, 0, ENOTTY); + } + return (0); + case ZFD_MUX: { + /* + * Setup the multiplexer configuration for the two + * streams. + * + * We expect to be called on the stream that will + * become the log stream and be passed one data block + * with the minor number of the slave side of the + * primary stream. 
+ */ + int to; + int instance; + zfd_state_t *prim_zfds; + + if (iocbp->ioc_count != TRANSPARENT || + mp->b_cont == NULL) { + miocack(qp, mp, 0, EINVAL); + return (0); + } + + /* Get the primary slave minor device number */ + to = *(int *)mp->b_cont->b_rptr; + instance = ZFD_INSTANCE(to); + + if ((prim_zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) { + miocack(qp, mp, 0, EINVAL); + return (0); + } + + /* Disallow changing primary/log once set. */ + mutex_enter(&zfd_mux_lock); + if (zfds->zfd_muxt != ZFD_NO_MUX || + prim_zfds->zfd_muxt != ZFD_NO_MUX) { + mutex_exit(&zfd_mux_lock); + miocack(qp, mp, 0, EINVAL); + return (0); + } + + zfds->zfd_muxt = ZFD_LOG_STREAM; + zfds->zfd_inst_pri = prim_zfds; + prim_zfds->zfd_muxt = ZFD_PRIMARY_STREAM; + prim_zfds->zfd_inst_log = zfds; + mutex_exit(&zfd_mux_lock); + DTRACE_PROBE2(zfd__mux__link, void *, prim_zfds, + void *, zfds); + + miocack(qp, mp, 0, 0); + return (0); + } + case ZFD_MUX_FLOWCON: { + /* + * We expect this ioctl to be issued against the + * log stream. We don't use the primary stream since + * there can be other streams modules pushed onto that + * stream which would interfere with the ioctl. 
+ */ + int val; + zfd_state_t *prim_zfds; + + if (iocbp->ioc_count != TRANSPARENT || + mp->b_cont == NULL) { + miocack(qp, mp, 0, EINVAL); + return (0); + } + + if (zfds->zfd_muxt != ZFD_LOG_STREAM) { + miocack(qp, mp, 0, EINVAL); + return (0); + } + prim_zfds = zfds->zfd_inst_pri; + + /* Get the flow control setting */ + val = *(int *)mp->b_cont->b_rptr; + if (val != 0 && val != 1) { + miocack(qp, mp, 0, EINVAL); + return (0); + } + + prim_zfds->zfd_allow_flowcon = (boolean_t)val; + if (!prim_zfds->zfd_allow_flowcon) + prim_zfds->zfd_is_flowcon = B_FALSE; + + DTRACE_PROBE1(zfd__mux__flowcon, void *, prim_zfds); + miocack(qp, mp, 0, 0); + return (0); + } + default: + break; + } + } + + /* if on the write side, may need to tee */ + if (zfds->zfd_slave_rdq != NULL && qp == WR(zfds->zfd_slave_rdq)) { + /* tee output to any attached log stream */ + zfd_tee_handler(zfds, type, mp); + + /* high-priority msgs are not subject to flow control */ + if (zfds->zfd_is_flowcon && type == M_DATA) + must_queue = B_TRUE; + } + + if (zfd_switch(RD(qp)) == NULL) { + DBG1("wput to %s side (no one listening)", zfd_side(qp)); + switch (type) { + case M_FLUSH: + handle_mflush(qp, mp); + break; + case M_IOCTL: + miocnak(qp, mp, 0, 0); + break; + default: + freemsg(mp); + break; + } + return (0); + } + + if (type >= QPCTL) { + DBG1("(hipri) wput, %s side", zfd_side(qp)); + switch (type) { + case M_READ: /* supposedly from ldterm? */ + DBG("zfd_wput: tossing M_READ\n"); + freemsg(mp); + break; + case M_FLUSH: + handle_mflush(qp, mp); + break; + default: + /* + * Put this to the other side. + */ + ASSERT(zfd_switch(RD(qp)) != NULL); + putnext(zfd_switch(RD(qp)), mp); + break; + } + DBG1("done (hipri) wput, %s side", zfd_side(qp)); + return (0); + } + + /* + * If the primary stream has been stopped for flow control then + * enqueue the msg, otherwise only putnext if there isn't already + * something in the queue. If we don't do this then things would wind + * up out of order. 
+ */ + if (!must_queue && qp->q_first == NULL && + bcanputnext(RD(zfd_switch(qp)), mp->b_band)) { + putnext(RD(zfd_switch(qp)), mp); + } else { + /* + * zfd_wsrv expects msgs queued on the primary queue. Those + * will be handled by zfd_wsrv after zfd_rsrv performs the + * qenable on the proper queue. + */ + (void) putq(qp, mp); + } + + DBG1("done wput, %s side", zfd_side(qp)); + return (0); +} + +/* + * Read server + * + * For primary stream: + * Under normal execution rsrv(9E) is symmetric for master and slave, so + * zfd_rsrv() can handle both without splitting up the codepath. We do this by + * enabling the write side of the partner. This triggers the partner to send + * messages queued on its write side to this queue's read side. + * + * For log stream: + * Internally we've queued up the msgs that we've teed off to the log stream + * so when we're invoked we need to pass these along. + */ +static int +zfd_rsrv(queue_t *qp) +{ + zfd_state_t *zfds; + zfds = (zfd_state_t *)qp->q_ptr; + + /* + * log stream server + */ + if (zfds->zfd_muxt == ZFD_LOG_STREAM && zfds->zfd_slave_rdq != NULL) { + queue_t *log_qp; + mblk_t *mp; + + log_qp = RD(zfds->zfd_slave_rdq); + + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + zfd_state_t *pzfds = zfds->zfd_inst_pri; + + while ((mp = getq(qp)) != NULL) { + if (bcanputnext(log_qp, mp->b_band)) { + putnext(log_qp, mp); + } else { + (void) putbq(log_qp, mp); + break; + } + } + + if (log_qp->q_count < log_qp->q_lowat) { + DTRACE_PROBE(zfd__flow__on); + pzfds->zfd_is_flowcon = B_FALSE; + if (pzfds->zfd_master_rdq != NULL) + qenable(RD(pzfds->zfd_master_rdq)); + } + } else { + /* No longer open, drain the queue */ + while ((mp = getq(qp)) != NULL) { + freemsg(mp); + } + flushq(qp, FLUSHALL); + } + return (0); + } + + /* + * Care must be taken here, as either of the master or slave side + * qptr could be NULL. 
+ */ + ASSERT(qp == zfds->zfd_master_rdq || qp == zfds->zfd_slave_rdq); + if (zfd_switch(qp) == NULL) { + DBG("zfd_rsrv: other side isn't listening\n"); + return (0); + } + qenable(WR(zfd_switch(qp))); + return (0); +} + +/* + * Write server + * + * This routine is symmetric for master and slave, so it handles both without + * splitting up the codepath. + * + * If there are messages on this queue that can be sent to the other, send + * them via putnext(). Else, if queued messages cannot be sent, leave them + * on this queue. + */ +static int +zfd_wsrv(queue_t *qp) +{ + queue_t *swq; + mblk_t *mp; + zfd_state_t *zfds = (zfd_state_t *)qp->q_ptr; + + ASSERT(zfds != NULL); + + /* + * Partner has no read queue, so take the data, and throw it away. + */ + if (zfd_switch(RD(qp)) == NULL) { + DBG("zfd_wsrv: other side isn't listening"); + while ((mp = getq(qp)) != NULL) { + if (mp->b_datap->db_type == M_IOCTL) + miocnak(qp, mp, 0, 0); + else + freemsg(mp); + } + flushq(qp, FLUSHALL); + return (0); + } + + swq = RD(zfd_switch(qp)); + + /* + * while there are messages on this write queue... + */ + while (!zfds->zfd_is_flowcon && (mp = getq(qp)) != NULL) { + /* + * Due to the way zfd_wput is implemented, we should never + * see a high priority control message here. + */ + ASSERT(mp->b_datap->db_type < QPCTL); + + if (bcanputnext(swq, mp->b_band)) { + putnext(swq, mp); + } else { + (void) putbq(qp, mp); + break; + } + } + return (0); +} diff --git a/usr/src/uts/common/klm/klmmod.c b/usr/src/uts/common/klm/klmmod.c index 1546b3f67b..8a714ec686 100644 --- a/usr/src/uts/common/klm/klmmod.c +++ b/usr/src/uts/common/klm/klmmod.c @@ -12,6 +12,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2017 Joyent, Inc. 
*/ /* @@ -279,6 +280,10 @@ lm_svc(struct lm_svc_args *args) rfs4_lease_time = args->grace; } + if (args->n_v4_only == -1) { + g->nlm_v4_only = B_TRUE; + } + mutex_exit(&g->lock); err = nlm_svc_starting(g, fp, netid, &knc); mutex_enter(&g->lock); diff --git a/usr/src/uts/common/klm/mapfile-mod b/usr/src/uts/common/klm/mapfile-mod index 0debe6d986..b7789d81fd 100644 --- a/usr/src/uts/common/klm/mapfile-mod +++ b/usr/src/uts/common/klm/mapfile-mod @@ -11,6 +11,7 @@ # # Copyright 2011 Nexenta Systems, Inc. All rights reserved. +# Copyright 2017 Joyent, Inc. # @@ -49,6 +50,11 @@ SYMBOL_SCOPE { nlm_frlock; nlm_register_lock_locally; nlm_shrlock; +# These four functions are available for use within a branded zone. + nlm_nsm_clnt_init; + nlm_netbuf_to_netobj; + sm_mon_1; + sm_unmon_1; local: *; diff --git a/usr/src/uts/common/klm/nlm_dispatch.c b/usr/src/uts/common/klm/nlm_dispatch.c index a0ca2a56c4..8fa9940eae 100644 --- a/usr/src/uts/common/klm/nlm_dispatch.c +++ b/usr/src/uts/common/klm/nlm_dispatch.c @@ -11,6 +11,7 @@ /* * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. All rights reserved. */ /* @@ -412,13 +413,13 @@ nlm_prog_3_dtable[] = { 0, 0 }, - { /* 16: not used */ - NLM_SVC_FUNC(0), - (xdrproc_t)0, - (xdrproc_t)0, + { /* 16: Linux NLMPROC_NSM_NOTIFY (same handling as NLM_SM_NOTIFY1) */ + NLM_SVC_FUNC(nlm_sm_notify1_2_svc), + (xdrproc_t)xdr_nlm_sm_status, + (xdrproc_t)xdr_void, NULL, 0, - 0 }, + NLM_DISP_NOREMOTE }, { /* 17: NLM_SM_NOTIFY1 */ NLM_SVC_FUNC(nlm_sm_notify1_2_svc), diff --git a/usr/src/uts/common/klm/nlm_impl.c b/usr/src/uts/common/klm/nlm_impl.c index bab08acdae..cbba11f6ed 100644 --- a/usr/src/uts/common/klm/nlm_impl.c +++ b/usr/src/uts/common/klm/nlm_impl.c @@ -28,6 +28,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2017 Joyent, Inc. All rights reserved. 
*/ /* @@ -57,6 +58,7 @@ #include <sys/queue.h> #include <sys/bitmap.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <netinet/in.h> #include <rpc/rpc.h> @@ -202,6 +204,12 @@ static struct nlm_knc nlm_netconfigs[] = { /* (g) */ }; /* + * NLM functions which can be called by a brand hook. + */ +void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *); +void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *); + +/* * NLM misc. function */ static void nlm_copy_netbuf(struct netbuf *, struct netbuf *); @@ -210,8 +218,6 @@ static void nlm_kmem_reclaim(void *); static void nlm_pool_shutdown(void); static void nlm_suspend_zone(struct nlm_globals *); static void nlm_resume_zone(struct nlm_globals *); -static void nlm_nsm_clnt_init(CLIENT *, struct nlm_nsm *); -static void nlm_netbuf_to_netobj(struct netbuf *, int *, netobj *); /* * NLM thread functions @@ -1847,6 +1853,12 @@ nlm_host_unmonitor(struct nlm_globals *g, struct nlm_host *host) return; host->nh_flags &= ~NLM_NH_MONITORED; + + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_rpc_statd != NULL) { + ZBROP(curzone)->b_rpc_statd(SM_UNMON, g, host); + return; + } + stat = nlm_nsm_unmon(&g->nlm_nsm, host->nh_name); if (stat != RPC_SUCCESS) { NLM_WARN("NLM: Failed to contact statd, stat=%d\n", stat); @@ -1885,6 +1897,11 @@ nlm_host_monitor(struct nlm_globals *g, struct nlm_host *host, int state) host->nh_flags |= NLM_NH_MONITORED; mutex_exit(&host->nh_lock); + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_rpc_statd != NULL) { + ZBROP(curzone)->b_rpc_statd(SM_MON, g, host); + return; + } + /* * Before we begin monitoring the host register the network address * associated with this hostname. 
@@ -2361,6 +2378,13 @@ nlm_svc_starting(struct nlm_globals *g, struct file *fp, VERIFY(g->run_status == NLM_ST_STARTING); VERIFY(g->nlm_gc_thread == NULL); + if (g->nlm_v4_only) { + NLM_WARN("Zone %d has no rpcbind, NLM is v4 only", getzoneid()); + bzero(&g->nlm_nsm, sizeof (struct nlm_nsm)); + g->nlm_nsm.ns_addr_handle = (void *)-1; + goto v4_only; + } + error = nlm_nsm_init_local(&g->nlm_nsm); if (error != 0) { NLM_ERR("Failed to initialize NSM handler " @@ -2397,6 +2421,7 @@ nlm_svc_starting(struct nlm_globals *g, struct file *fp, "(rpcerr=%d)\n", stat); goto shutdown_lm; } +v4_only: g->grace_threshold = ddi_get_lbolt() + SEC_TO_TICK(g->grace_period); @@ -2520,7 +2545,9 @@ nlm_svc_stopping(struct nlm_globals *g) ASSERT(TAILQ_EMPTY(&g->nlm_slocks)); - nlm_nsm_fini(&g->nlm_nsm); + /* If started with rpcbind (the normal case) */ + if (g->nlm_nsm.ns_addr_handle != (void *)-1) + nlm_nsm_fini(&g->nlm_nsm); g->lockd_pid = 0; g->run_status = NLM_ST_DOWN; } @@ -2814,14 +2841,14 @@ nlm_cprresume(void) rw_exit(&lm_lck); } -static void +void nlm_nsm_clnt_init(CLIENT *clnt, struct nlm_nsm *nsm) { (void) clnt_tli_kinit(clnt, &nsm->ns_knc, &nsm->ns_addr, 0, NLM_RPC_RETRIES, zone_kcred()); } -static void +void nlm_netbuf_to_netobj(struct netbuf *addr, int *family, netobj *obj) { /* LINTED pointer alignment */ diff --git a/usr/src/uts/common/klm/nlm_impl.h b/usr/src/uts/common/klm/nlm_impl.h index e59ea540e3..9caae1a8c7 100644 --- a/usr/src/uts/common/klm/nlm_impl.h +++ b/usr/src/uts/common/klm/nlm_impl.h @@ -30,6 +30,7 @@ /* * Copyright 2019 Nexenta by DDN, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ /* @@ -459,6 +460,7 @@ struct nlm_globals { int cn_idle_tmo; /* (z) */ int grace_period; /* (z) */ int retrans_tmo; /* (z) */ + boolean_t nlm_v4_only; /* (z) */ zoneid_t nlm_zoneid; /* (c) */ kmutex_t clean_lock; /* (c) */ TAILQ_ENTRY(nlm_globals) nlm_link; /* (g) */ diff --git a/usr/src/uts/common/mapfiles/ddi.mapfile b/usr/src/uts/common/mapfiles/ddi.mapfile index a9f4f2d730..507768a223 100644 --- a/usr/src/uts/common/mapfiles/ddi.mapfile +++ b/usr/src/uts/common/mapfiles/ddi.mapfile @@ -194,6 +194,9 @@ SYMBOL_SCOPE { mutex_tryenter { FLAGS = EXTERN }; nochpoll { FLAGS = EXTERN }; nodev { FLAGS = EXTERN }; + nvlist_add_string { FLAGS = EXTERN }; + nvlist_alloc { FLAGS = EXTERN }; + nvlist_free { FLAGS = EXTERN }; nulldev { FLAGS = EXTERN }; nvlist_add_string { FLAGS = EXTERN }; nvlist_alloc { FLAGS = EXTERN }; diff --git a/usr/src/uts/common/netinet/udp.h b/usr/src/uts/common/netinet/udp.h index fb9f8a0976..74cff75d43 100644 --- a/usr/src/uts/common/netinet/udp.h +++ b/usr/src/uts/common/netinet/udp.h @@ -1,6 +1,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* @@ -34,6 +35,7 @@ struct udphdr { #define UDP_RCVHDR 0x0102 /* for internal use only */ #define UDP_NAT_T_ENDPOINT 0x0103 /* for internal use only */ #define UDP_SRCPORT_HASH 0x0104 /* for internal use only */ +#define UDP_SND_TO_CONNECTED 0x0105 /* for internal use only */ /* * Hash definitions for UDP_SRCPORT_HASH that effectively tell UDP how to go diff --git a/usr/src/uts/common/nfs/nfssys.h b/usr/src/uts/common/nfs/nfssys.h index e9a2746017..7d2401856c 100644 --- a/usr/src/uts/common/nfs/nfssys.h +++ b/usr/src/uts/common/nfs/nfssys.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
*/ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -122,13 +123,20 @@ struct nfs_revauth_args32 { enum lm_fmly { LM_INET, LM_INET6, LM_LOOPBACK }; enum lm_proto { LM_TCP, LM_UDP }; +/* + * The 'n_v4_only' member was formerly called 'debug'. This member is not used + * in the kernel. To avoid a new version of this user/kernel interface + * structure, the member was renamed in a binary compatible way. It is now used + * by the user-level code to indicate that the zone is not running + * rpcbind/rpc.statd and that only NFSv4 locking is needed. + */ struct lm_svc_args { int version; /* keep this first */ int fd; enum lm_fmly n_fmly; /* protocol family */ enum lm_proto n_proto; /* protocol */ dev_t n_rdev; /* device ID */ - int debug; /* debugging level */ + int n_v4_only; /* NFSv4 locking only */ time_t timout; /* client handle life (asynch RPCs) */ int grace; /* secs in grace period */ time_t retransmittimeout; /* retransmission interval */ @@ -141,7 +149,7 @@ struct lm_svc_args32 { enum lm_fmly n_fmly; /* protocol family */ enum lm_proto n_proto; /* protocol */ dev32_t n_rdev; /* device ID */ - int32_t debug; /* debugging level */ + int32_t n_v4_only; /* NFSv4 locking only */ time32_t timout; /* client handle life (asynch RPCs) */ int32_t grace; /* secs in grace period */ time32_t retransmittimeout; /* retransmission interval */ diff --git a/usr/src/uts/common/os/acct.c b/usr/src/uts/common/os/acct.c index e598e0d08d..891c4e0836 100644 --- a/usr/src/uts/common/os/acct.c +++ b/usr/src/uts/common/os/acct.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -47,6 +48,7 @@ #include <sys/time.h> #include <sys/msacct.h> #include <sys/zone.h> +#include <sys/brand.h> /* * Each zone has its own accounting settings (on or off) and associated @@ -373,7 +375,7 @@ acct_compress(ulong_t t) * On exit, write a record on the accounting file. */ void -acct(char st) +acct(int st) { struct vnode *vp; struct cred *cr; @@ -402,6 +404,21 @@ acct(char st) * This only gets called from exit after all lwp's have exited so no * cred locking is needed. */ + + /* If there is a brand-specific hook, use it instead */ + if (ZONE_IS_BRANDED(curzone) && ZBROP(curzone)->b_acct_out != NULL) { + ZBROP(curzone)->b_acct_out(vp, st); + mutex_exit(&ag->aclock); + return; + } + + /* + * The 'st' status value was traditionally masked this way by our + * caller, but we now accept the unmasked value for brand handling. + * Zones not using the brand hook mask the status here. + */ + st &= 0xff; + p = curproc; ua = PTOU(p); bcopy(ua->u_comm, ag->acctbuf.ac_comm, sizeof (ag->acctbuf.ac_comm)); diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index 68b699630a..fa3555a82a 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -46,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -54,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -311,46 +312,115 @@ brand_unregister_zone(struct brand *bp) mutex_exit(&brand_list_lock); } -void -brand_setbrand(proc_t *p) +int +brand_setbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called 
from exec(), when we know the process - * is single-threaded. + * Process branding occurs during fork() and exec(). When it happens + * during fork(), the LWP count will always be 0 since branding is + * performed as part of getproc(), before LWPs have been associated. + * The same is not true during exec(), where a multi-LWP process may + * undergo branding just prior to gexec(). This is to ensure + * exec-related brand hooks are available. While it may seem + * complicated to brand a multi-LWP process, the two possible outcomes + * simplify things: + * + * 1. The exec() succeeds: LWPs besides the caller will be killed and + * any further branding will occur in a single-LWP context. + * 2. The exec() fails: The process will be promptly unbranded since + * the hooks are no longer needed. + * + * To prevent inconsistent brand state from being encountered during + * the exec(), LWPs beyond the caller which are associated with this + * process must be held temporarily. They will be released either when + * they are killed in the exec() success, or when the brand is cleared + * after exec() failure. */ - ASSERT(p->p_tlist == p->p_tlist->t_forw); + if (lwps_ok) { + /* + * We've been called from an exec() context, where tolerating + * the existence of multiple LWPs during branding is necessary. + */ + VERIFY(p == curproc); + VERIFY(p->p_tlist != NULL); + if (p->p_tlist != p->p_tlist->t_forw) { + /* + * Multiple LWPs are present. Hold all but the caller. + */ + if (!holdlwps(SHOLDFORK1)) { + return (-1); + } + } + } else { + /* + * Processes branded during fork() should not have LWPs at all.
+ */ + VERIFY(p->p_tlist == NULL); + } + + if (bp->b_data_size > 0) { + brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); + } + + mutex_enter(&p->p_lock); + ASSERT(!PROC_IS_BRANDED(p)); p->p_brand = bp; + p->p_brand_data = brand_data; ASSERT(PROC_IS_BRANDED(p)); BROP(p)->b_setbrand(p); + mutex_exit(&p->p_lock); + return (0); } void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p, boolean_t lwps_ok) { brand_t *bp = p->p_zone->zone_brand; - klwp_t *lwp = NULL; - ASSERT(bp != NULL); - ASSERT(!no_lwps || (p->p_tlist == NULL)); + void *brand_data; - /* - * If called from exec_common() or proc_exit(), - * we know the process is single-threaded. - * If called from fork_fail, p_tlist is NULL. - */ - if (!no_lwps) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - lwp = p->p_tlist->t_lwp; - } + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); + VERIFY(PROC_IS_BRANDED(p)); - ASSERT(PROC_IS_BRANDED(p)); - BROP(p)->b_proc_exit(p, lwp); + if (BROP(p)->b_clearbrand != NULL) + BROP(p)->b_clearbrand(p, lwps_ok); + + mutex_enter(&p->p_lock); p->p_brand = &native_brand; + brand_data = p->p_brand_data; + p->p_brand_data = NULL; + + if (lwps_ok) { + VERIFY(p == curproc); + /* + * A process with multiple LWPs is being de-branded after + * failing an exec. The other LWPs were held as part of the + * procedure, so they must be resumed now. + */ + if (p->p_tlist != NULL && p->p_tlist != p->p_tlist->t_forw) { + continuelwps(p); + } + } else { + /* + * While clearing the brand, it's ok for one LWP to be present. + * This happens when a native binary is executed inside a + * branded zone, since the brand will be removed during the + * course of a successful exec. 
+ */ + VERIFY(p->p_tlist == NULL || p->p_tlist == p->p_tlist->t_forw); + } + mutex_exit(&p->p_lock); + + if (brand_data != NULL) { + kmem_free(brand_data, bp->b_data_size); + } } #if defined(__sparcv9) @@ -484,7 +554,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (ENOSYS); /* For all other operations this must be a branded process. */ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -601,16 +671,16 @@ restoreexecenv(struct execenv *ep, stack_t *sp) /*ARGSUSED*/ int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, - intpdata_t *idatap, int level, size_t *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + intpdata_t *idatap, int level, size_t *execsz, int setid, + caddr_t exec_file, cred_t *cred, int *brand_action, struct brand *pbrand, + char *bname, char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -620,7 +690,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -637,12 +706,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -726,7 +793,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, 
&env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -734,7 +801,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -745,6 +812,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -762,7 +833,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -778,16 +849,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -795,7 +870,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -935,9 +1010,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. 
* @@ -1079,55 +1154,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. - */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1146,5 +1197,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/contract.c b/usr/src/uts/common/os/contract.c index 9e498dc1c7..e4b1db84e1 100644 --- a/usr/src/uts/common/os/contract.c +++ b/usr/src/uts/common/os/contract.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* * Copyright (c) 2017 by Delphix. All rights reserved. @@ -290,7 +291,10 @@ contract_ctor(contract_t *ct, ct_type_t *type, ct_template_t *tmpl, void *data, avl_index_t where; klwp_t *curlwp = ttolwp(curthread); - ASSERT(author == curproc); + /* + * It's possible that author is not curproc if the zone is creating + * a new process as a child of zsched. + */ mutex_init(&ct->ct_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ct->ct_reflock, NULL, MUTEX_DEFAULT, NULL); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d56484ac34..3df98d0f6d 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -125,6 +125,7 @@ remove_core_file(char *fp, enum core_types core_type) /* * Determine what rootvp to use. */ + mutex_enter(&curproc->p_lock); if (core_type == CORE_PROC) { rootvp = (PTOU(curproc)->u_rdir == NULL ? curproc->p_zone->zone_rootvp : PTOU(curproc)->u_rdir); @@ -140,6 +141,7 @@ remove_core_file(char *fp, enum core_types core_type) VN_HOLD(startvp); if (rootvp != rootdir) VN_HOLD(rootvp); + mutex_exit(&curproc->p_lock); if ((error = lookuppnvp(&pn, NULL, NO_FOLLOW, &dvp, &vp, rootvp, startvp, CRED())) != 0) { pn_free(&pn); diff --git a/usr/src/uts/common/os/cpu.c b/usr/src/uts/common/os/cpu.c index 075bb6e70a..6a86dbb8cb 100644 --- a/usr/src/uts/common/os/cpu.c +++ b/usr/src/uts/common/os/cpu.c @@ -112,7 +112,7 @@ cpu_t *cpu_list; /* list of all CPUs */ cpu_t *clock_cpu_list; /* used by clock to walk CPUs */ cpu_t *cpu_active; /* list of active CPUs */ cpuset_t cpu_active_set; /* cached set of active CPUs */ -static cpuset_t cpu_available; /* set of available CPUs */ +cpuset_t cpu_available; /* set of available CPUs */ cpuset_t cpu_seqid_inuse; /* which cpu_seqids are in use */ cpu_t **cpu_seq; /* ptrs to CPUs, indexed by seq_id */ diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 
3e1df330b7..5e909667de 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -730,6 +730,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? (zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index 8faa8fea8c..2433c504fc 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/dumpsubr.c b/usr/src/uts/common/os/dumpsubr.c index 484b2042e2..868ed9e5c4 100644 --- a/usr/src/uts/common/os/dumpsubr.c +++ b/usr/src/uts/common/os/dumpsubr.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2018 Nexenta Systems, Inc. All rights reserved. 
*/ @@ -75,6 +75,7 @@ #include <sys/cpu.h> #include <bzip2/bzlib.h> +#include <crypto/chacha/chacha.h> #define ONE_GIG (1024 * 1024 * 1024UL) @@ -112,6 +113,8 @@ int dump_timeout = 120; /* timeout for dumping pages */ int dump_timeleft; /* portion of dump_timeout remaining */ int dump_ioerr; /* dump i/o error */ int dump_check_used; /* enable check for used pages */ +uint8_t dump_crypt_key[DUMP_CRYPT_KEYLEN]; /* dump encryption key */ +uint8_t dump_crypt_nonce[DUMP_CRYPT_NONCELEN]; /* dump nonce */ char *dump_stack_scratch; /* scratch area for saving stack summary */ /* @@ -357,6 +360,7 @@ typedef struct dumpsync { hrtime_t iotime; /* time spent writing nwrite bytes */ hrtime_t iowait; /* time spent waiting for output */ hrtime_t iowaitts; /* iowait timestamp */ + hrtime_t crypt; /* time spent encrypting */ perpage_t perpage; /* metrics */ perpage_t perpagets; int dumpcpu; /* master cpu */ @@ -435,6 +439,7 @@ typedef struct dumpbuf { char *cur; /* dump write pointer */ char *start; /* dump buffer address */ char *end; /* dump buffer end */ + char *scratch; /* scratch buffer */ size_t size; /* size of dumpbuf in bytes */ size_t iosize; /* best transfer size for device */ } dumpbuf_t; @@ -493,11 +498,16 @@ dumpbuf_resize(void) if (new_size <= old_size) return; /* no need to reallocate buffer */ - new_buf = kmem_alloc(new_size, KM_SLEEP); + /* + * Allocate thrice the size of buffer to allow for space for the stream + * and its ciphertext should encryption be enabled (or become so). 
+ */ + new_buf = kmem_alloc(new_size * 3, KM_SLEEP); dumpbuf.size = new_size; dumpbuf.start = new_buf; dumpbuf.end = new_buf + new_size; - kmem_free(old_buf, old_size); + dumpbuf.scratch = dumpbuf.end + new_size; + kmem_free(old_buf, old_size * 3); } /* @@ -1125,9 +1135,16 @@ dumphdr_init(void) dumphdr->dump_pagesize = PAGESIZE; dumphdr->dump_utsname = utsname; (void) strcpy(dumphdr->dump_platform, platform); + + /* + * Allocate our buffer, assuring enough room for encryption + * should it become configured. + */ dumpbuf.size = dumpbuf_iosize(maxphys); - dumpbuf.start = kmem_alloc(dumpbuf.size, KM_SLEEP); + dumpbuf.start = kmem_alloc(dumpbuf.size * 3, KM_SLEEP); dumpbuf.end = dumpbuf.start + dumpbuf.size; + dumpbuf.scratch = dumpbuf.end + dumpbuf.size; + dumpcfg.pids = kmem_alloc(v.v_proc * sizeof (pid_t), KM_SLEEP); dumpcfg.helpermap = kmem_zalloc(BT_SIZEOFMAP(NCPU), KM_SLEEP); LOCK_INIT_HELD(&dumpcfg.helper_lock); @@ -1317,6 +1334,41 @@ dumpfini(void) dumppath = NULL; } +static void +dumpvp_encrypt(size_t size) +{ + size_t nelems = size / sizeof (uint64_t), i; + uint64_t *start = (uint64_t *)dumpbuf.start; + uint64_t *stream = (uint64_t *)dumpbuf.end; + uint64_t *crypt = (uint64_t *)dumpbuf.scratch; + uint64_t ctr = dumpbuf.vp_off >> DUMP_CRYPT_BLOCKSHIFT; + hrtime_t ts = gethrtime(); + offset_t dumpoff = dumpbuf.vp_off; + chacha_ctx_t ctx; + + /* + * Our size should be 64-bit aligned and our offset must be aligned + * to our crypto blocksize. 
+ */ + ASSERT(!(size & (sizeof (uint64_t) - 1))); + ASSERT(!(dumpbuf.vp_off & ((1 << DUMP_CRYPT_BLOCKSHIFT) - 1))); + + chacha_keysetup(&ctx, dump_crypt_key, DUMP_CRYPT_KEYLEN * 8, 0); + chacha_ivsetup(&ctx, dump_crypt_nonce, (uint8_t *)&ctr); + + for (i = 0; i < nelems; i++) { + stream[i] = dumpoff; + dumpoff += sizeof (uint64_t); + } + + chacha_encrypt_bytes(&ctx, (uint8_t *)stream, (uint8_t *)crypt, size); + + for (i = 0; i < nelems; i++) + start[i] ^= crypt[i]; + + dumpsync.crypt += gethrtime() - ts; +} + static offset_t dumpvp_flush(void) { @@ -1328,6 +1380,17 @@ dumpvp_flush(void) dump_ioerr = ENOSPC; dumpbuf.vp_off = dumpbuf.vp_limit; } else if (size != 0) { + /* + * If our dump is encrypted and this is neither the initial + * dump header nor the terminal dump header and metrics, + * encrypt the buffer before writing it. + */ + if ((dump_conflags & DUMP_ENCRYPT) && + dumpbuf.vp_off > dumphdr->dump_start && + dumpbuf.vp_off < dumpbuf.vp_limit - DUMP_OFFSET) { + dumpvp_encrypt(size); + } + iotime = gethrtime(); dumpsync.iowait += iotime - dumpsync.iowaitts; if (panicstr) @@ -2618,6 +2681,7 @@ dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) P("Dump I/O rate MBS,%d.%02d\n", iorate / 100, iorate % 100); P("..total bytes,%lld\n", (u_longlong_t)ds->nwrite); P("..total nsec,%lld\n", (u_longlong_t)ds->iotime); + P("..crypt nsec,%lld\n", (u_longlong_t)ds->crypt); P("dumpbuf.iosize,%ld\n", dumpbuf.iosize); P("dumpbuf.size,%ld\n", dumpbuf.size); @@ -2658,6 +2722,29 @@ dumpsys_metrics(dumpsync_t *ds, char *buf, size_t size) } #endif /* COLLECT_METRICS */ +CTASSERT(DUMP_CRYPT_HMACLEN <= sizeof (struct utsname)); + +/* + * Mark the dump as encrypted and calculate our (crude) HMAC based on the + * dump_utsname. (The purpose of the HMAC is to merely allow for incorrect + * keys to be quickly rejected.) 
+ */ +void +dumpsys_crypt(dumphdr_t *dumphdr, dump_crypt_t *dcrypt) +{ + chacha_ctx_t ctx; + + dumphdr->dump_flags |= DF_ENCRYPTED; + bcopy(dump_crypt_nonce, dcrypt->dump_crypt_nonce, DUMP_CRYPT_NONCELEN); + dcrypt->dump_crypt_algo = DUMP_CRYPT_ALGO_CHACHA20; + + chacha_keysetup(&ctx, dump_crypt_key, DUMP_CRYPT_KEYLEN * 8, 0); + chacha_ivsetup(&ctx, dump_crypt_nonce, NULL); + + chacha_encrypt_bytes(&ctx, (uint8_t *)&dumphdr->dump_utsname, + (uint8_t *)&dcrypt->dump_crypt_hmac, DUMP_CRYPT_HMACLEN); +} + /* * Dump the system. */ @@ -2679,6 +2766,7 @@ dumpsys(void) dumpmlw_t mlw; dumpcsize_t datatag; dumpdatahdr_t datahdr; + dump_crypt_t dcrypt; if (dumpvp == NULL || dumphdr == NULL) { uprintf("skipping system dump - no dump device configured\n"); @@ -2733,6 +2821,9 @@ dumpsys(void) /* Make sure nodename is current */ bcopy(utsname.nodename, dumphdr->dump_utsname.nodename, SYS_NMLN); + if (dump_conflags & DUMP_ENCRYPT) + dumpsys_crypt(dumphdr, &dcrypt); + /* * If this is a live dump, try to open a VCHR vnode for better * performance. We must take care to flush the buffer cache @@ -2999,11 +3090,19 @@ dumpsys(void) */ dumpbuf.vp_off = dumphdr->dump_start; dumpvp_write(dumphdr, sizeof (dumphdr_t)); + + if (dump_conflags & DUMP_ENCRYPT) + dumpvp_write(&dcrypt, sizeof (dump_crypt_t)); + (void) dumpvp_flush(); dumpbuf.vp_limit = dumpvp_size; dumpbuf.vp_off = dumpbuf.vp_limit - DUMP_OFFSET; dumpvp_write(dumphdr, sizeof (dumphdr_t)); + + if (dump_conflags & DUMP_ENCRYPT) + dumpvp_write(&dcrypt, sizeof (dump_crypt_t)); + dumpvp_write(&datahdr, sizeof (dumpdatahdr_t)); dumpvp_write(dumpcfg.cbuf[0].buf, datahdr.dump_metrics); diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index d663f27ca0..0f9e4ea6dd 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -24,7 +24,7 @@ */ /* Copyright (c) 1988 AT&T */ -/* All Rights Reserved */ +/* All Rights Reserved */ /* * Copyright 2019 Joyent, Inc. 
* Copyright 2022 Oxide Computer Company @@ -102,6 +102,7 @@ uint_t auxv_hwcap32_3 = 0; /* 32-bit version of auxv_hwcap3 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * These are consumed within the specific exec modules, but are defined here @@ -268,8 +269,10 @@ exec_common(const char *fname, const char **argp, const char **envp, * only if the pathname does not contain a "/" the resolved path * points to a file in the current working (attribute) directory. */ - if ((p->p_user.u_cdir->v_flag & V_XATTRDIR) != 0 && + mutex_enter(&p->p_lock); + if ((PTOU(p)->u_cdir->v_flag & V_XATTRDIR) != 0 && strchr(resolvepn.pn_path, '/') == NULL) { + mutex_exit(&p->p_lock); if (dir != NULL) VN_RELE(dir); error = EACCES; @@ -278,6 +281,7 @@ exec_common(const char *fname, const char **argp, const char **envp, VN_RELE(vp); goto out; } + mutex_exit(&p->p_lock); bzero(exec_file, MAXCOMLEN+1); (void) strncpy(exec_file, pn.pn_path, MAXCOMLEN); @@ -325,14 +329,43 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) - brand_setbrand(p); + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + + /* + * Process branding may fail if multiple LWPs are present and + * holdlwps() cannot complete successfully. 
+ */ + error = brand_setbrand(p, B_TRUE); + + if (error == 0 && BROP(p)->b_lwpdata_alloc != NULL) { + brand_data = BROP(p)->b_lwpdata_alloc(p); + if (brand_data == NULL) { + error = 1; + } + } + + if (error == 0) { + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } else { + VN_RELE(vp); + if (dir != NULL) { + VN_RELE(dir); + } + pn_free(&resolvepn); + goto fail; + } + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p, B_TRUE); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -364,7 +397,7 @@ exec_common(const char *fname, const char **argp, const char **envp, /* * Clear contract template state */ - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_TRUE); /* * Save the directory in which we found the executable for expanding @@ -388,6 +421,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. */ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -447,8 +482,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. 
*/ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); brand_clearbrand(p, B_FALSE); + } setregs(&args); @@ -572,7 +609,7 @@ gexec( size_t *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -893,8 +930,14 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) + + /* + * If legacy ptrace is enabled, generate the SIGTRAP. + */ + if (pp->p_proc_flag & P_PR_PTRACE) { psignal(pp, SIGTRAP); + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1558,6 +1601,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1594,6 +1658,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. @@ -1650,7 +1715,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) } } argc = (int *)(args->stk_base + args->stk_size) - args->stk_offp; - args->arglen = args->stk_strp - args->stk_base; + args->argstrlen = args->stk_strp - args->stk_base; + + const char *envstr = args->stk_strp; /* * Add environ[] strings to the stack. 
@@ -1672,12 +1739,15 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) envp += ptrsize; } } + + args->envstrlen = args->stk_strp - envstr; args->na = (int *)(args->stk_base + args->stk_size) - args->stk_offp; args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as AT_RANDOM + * array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1690,6 +1760,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. + */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1746,46 +1830,53 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) */ if (stk_putptr(args, usp, (char *)(uintptr_t)argc)) return (-1); + usp += ptrsize; /* - * Add argc space (ptrsize) to usp and record argv for /proc. + * For the benefit of /proc, record the user address of the argv[] array + * as well as the start of the argv string space (argv[0]). */ - up->u_argv = (uintptr_t)(usp += ptrsize); + up->u_argv = (uintptr_t)usp; + up->u_argvstrs = (uintptr_t)(&ustrp[*(offp - 1)]); + up->u_argvstrsize = args->argstrlen; /* - * Put the argv[] pointers on the stack. + * Put the argv[] pointers on the stack, including a NULL terminator. 
*/ for (i = 0; i < argc; i++, usp += ptrsize) if (stk_putptr(args, usp, &ustrp[*--offp])) return (-1); + usp += ptrsize; /* * Copy arguments to u_psargs. */ - pslen = MIN(args->arglen, PSARGSZ) - 1; + pslen = MIN(args->argstrlen, PSARGSZ) - 1; for (i = 0; i < pslen; i++) up->u_psargs[i] = (kstrp[i] == '\0' ? ' ' : kstrp[i]); while (i < PSARGSZ) up->u_psargs[i++] = '\0'; /* - * Add space for argv[]'s NULL terminator (ptrsize) to usp and - * record envp for /proc. + * For the benefit of /proc, record the user address of the envp[] array + * as well as the start of the envp string space (envp[0]). */ - up->u_envp = (uintptr_t)(usp += ptrsize); + up->u_envp = (uintptr_t)usp; + up->u_envstrs = (uintptr_t)(&ustrp[*(offp - 1)]); + up->u_envstrsize = args->envstrlen; /* - * Put the envp[] pointers on the stack. + * Put the envp[] pointers on the stack, including a NULL terminator. */ for (i = 0; i < envc; i++, usp += ptrsize) if (stk_putptr(args, usp, &ustrp[*--offp])) return (-1); + usp += ptrsize; /* - * Add space for envp[]'s NULL terminator (ptrsize) to usp and - * remember where the stack ends, which is also where auxv begins. + * Remember where the stack ends, which is also where auxv begins. */ - args->stackend = usp += ptrsize; + args->stackend = usp; /* * Put all the argv[], envp[], and auxv strings on the stack. @@ -1796,7 +1887,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array. 
*/ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1809,6 +1900,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1821,6 +1917,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1964,6 +2065,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) @@ -2059,7 +2163,7 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) delete_itimer_realprof(); if (AU_AUDITING()) - audit_exec(args->stk_base, args->stk_base + args->arglen, + audit_exec(args->stk_base, args->stk_base + args->argstrlen, args->na - args->ne, args->ne, args->pfcred); /* diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index 5a9355ae9f..7ccf9b3221 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -141,11 +141,32 @@ rexit(int rval) } /* + * Bump the init_restarts kstat and let interested parties know about the + * restart. 
+ */ +static void +restart_init_notify(zone_t *zone) +{ + nvlist_t *nvl = NULL; + + zone->zone_proc_init_restarts++; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0 && + nvlist_add_uint32(nvl, ZONE_CB_RESTARTS, + zone->zone_proc_init_restarts) == 0) { + zone_sysevent_publish(zone, ZONE_EVENT_INIT_CLASS, + ZONE_EVENT_INIT_RESTART_SC, nvl); + } + + nvlist_free(nvl); +} + +/* * Called by proc_exit() when a zone's init exits, presumably because * it failed. As long as the given zone is still in the "running" * state, we will re-exec() init, but first we need to reset things * which are usually inherited across exec() but will break init's - * assumption that it is being exec()'d from a virgin process. Most + * assumption that it is being exec()'d from a virgin process. Most * importantly this includes closing all file descriptors (exec only * closes those marked close-on-exec) and resetting signals (exec only * resets handled signals, and we need to clear any signals which @@ -234,7 +255,7 @@ restart_init(int what, int why) siginfofree(lwp->lwp_curinfo); lwp->lwp_curinfo = NULL; } - lwp_ctmpl_clear(lwp); + lwp_ctmpl_clear(lwp, B_FALSE); /* * Reset both the process root directory and the current working @@ -286,6 +307,8 @@ restart_init(int what, int why) ASSERT(p == curproc); (void) freectty(B_TRUE); + restart_init_notify(p->p_zone); + /* * Now exec() the new init(8) on top of the current process. If we * succeed, the caller will treat this like a successful system call. @@ -320,7 +343,7 @@ exit(int why, int what) /* * If proc_exit() fails, then some other lwp in the process * got there first. We just have to call lwp_exit() to allow - * the other lwp to finish exiting the process. Otherwise we're + * the other lwp to finish exiting the process. Otherwise we're * restarting init, and should return. 
*/ if (proc_exit(why, what) != 0) { @@ -333,7 +356,7 @@ exit(int why, int what) /* * Set the SEXITING flag on the process, after making sure /proc does - * not have it locked. This is done in more places than proc_exit(), + * not have it locked. This is done in more places than proc_exit(), * so it is a separate function. */ void @@ -380,8 +403,9 @@ zone_init_exit(zone_t *z, int why, int what) */ if (!z->zone_restart_init) { /* - * The zone has been set up to halt when init exits. + * The zone has been setup to halt when init exits. */ + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); z->zone_proc_initpid = -1; return (B_FALSE); @@ -421,6 +445,7 @@ zone_init_exit(zone_t *z, int why, int what) (void) zone_kadmin(A_REBOOT, 0, NULL, zone_kcred()); } + z->zone_init_status = wstat(why, what); z->zone_proc_initpid = -1; return (B_FALSE); } @@ -441,14 +466,16 @@ zone_init_exit(zone_t *z, int why, int what) /* * No restart modifiers on the zone, attempt to restart init. */ - if (restart_init(what, why) == 0) + if (restart_init(what, why) == 0) { return (B_TRUE); + } } /* * The restart failed, or the criteria for a restart are not met; * the zone will shut down. */ + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, zone_kcred()); z->zone_proc_initpid = -1; return (B_FALSE); @@ -483,7 +510,7 @@ proc_exit(int why, int what) /* * Stop and discard the process's lwps except for the current one, - * unless some other lwp beat us to it. If exitlwps() fails then + * unless some other lwp beat us to it. If exitlwps() fails then * return and the calling lwp will call (or continue in) lwp_exit(). 
*/ proc_is_exiting(p); @@ -501,19 +528,6 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); - - /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data - */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } - /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. @@ -527,6 +541,32 @@ proc_exit(int why, int what) return (0); } + /* + * Delay firing probes (and performing brand cleanup) until after the + * zone_proc_initpid check. Cases which result in zone shutdown or + * restart via zone_kadmin eventually result in a call back to + * proc_exit. + */ + DTRACE_PROC(lwp__exit); + DTRACE_PROC1(exit, int, why); + + /* + * Will perform any brand specific proc exit processing. Since this + * is always the last lwp, will also perform lwp exit/free and proc + * exit. Brand data will be freed when the process is reaped. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_proc_exit(p); + /* + * To ensure that b_proc_exit has access to brand-specific data + * contained by the one remaining lwp, call the freelwp hook as + * the last part of this clean-up process. 
+ */ + BROP(p)->b_freelwp(lwp); + lwp_detach_brand_hdlrs(lwp); + } + lwp_pcb_exit(); /* @@ -693,7 +733,7 @@ proc_exit(int why, int what) semexit(p); rv = wstat(why, what); - acct(rv & 0xff); + acct(rv); exacct_commit_proc(p, rv); /* @@ -786,10 +826,22 @@ proc_exit(int why, int what) if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; + pid_t zone_initpid = 1; + struct proc *zoneinitp = NULL; boolean_t setzonetop = B_FALSE; - if (!INGLOBALZONE(curproc)) - setzonetop = B_TRUE; + if (!INGLOBALZONE(curproc)) { + zone_initpid = curproc->p_zone->zone_proc_initpid; + + ASSERT(MUTEX_HELD(&pidlock)); + zoneinitp = prfind(zone_initpid); + if (zoneinitp != NULL) { + initp = zoneinitp; + } else { + zone_initpid = 1; + setzonetop = B_TRUE; + } + } pgdetach(p); @@ -801,7 +853,8 @@ proc_exit(int why, int what) */ delete_ns(q->p_parent, q); - q->p_ppid = 1; + q->p_ppid = zone_initpid; + q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); @@ -959,7 +1012,7 @@ proc_exit(int why, int what) * curthread's proc pointer is changed to point to the 'sched' * process for the corresponding zone, except in the case when * the exiting process is in fact a zsched instance, in which - * case the proc pointer is set to p0. We do so, so that the + * case the proc pointer is set to p0. We do so, so that the * process still points at the right zone when we call the VN_RELE() * below. * @@ -975,8 +1028,50 @@ proc_exit(int why, int what) mutex_exit(&p->p_lock); if (!evaporate) { - p->p_pidflag &= ~CLDPEND; - sigcld(p, sqp); + /* + * The brand specific code only happens when the brand has a + * function to call in place of sigcld and the parent of the + * exiting process is not the global zone init. If the parent + * is the global zone init, then the process was reparented, + * and we don't want brand code delivering possibly strange + * signals to init. 
Also, init is not branded, so any brand + * specific exit data will not be picked up by init anyway. + */ + if (PROC_IS_BRANDED(p) && + BROP(p)->b_exit_with_sig != NULL && + p->p_ppid != 1) { + /* + * The code for _fini that could unload the brand_t + * blocks until the count of zones using the module + * reaches zero. Zones decrement the refcount on their + * brands only after all user tasks in that zone have + * exited and been waited on. The decrement on the + * brand's refcount happen in zone_destroy(). That + * depends on zone_shutdown() having been completed. + * zone_shutdown() includes a call to zone_empty(), + * where the zone waits for itself to reach the state + * ZONE_IS_EMPTY. This state is only set in either + * zone_shutdown(), when there are no user processes as + * the zone enters this function, or in + * zone_task_rele(). zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above. + */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -1001,7 +1096,7 @@ proc_exit(int why, int what) /* * task_rele() may ultimately cause the zone to go away (or * may cause the last user process in a zone to go away, which - * signals zsched to go away). So prior to this call, we must + * signals zsched to go away). So prior to this call, we must * no longer point at zsched. 
*/ t->t_procp = &p0; @@ -1055,10 +1150,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -1086,7 +1180,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. */ mutex_enter(&pidlock); @@ -1106,10 +1201,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; - proc_gone = 0; + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } + + if (pp->p_child == NULL) { + goto no_real_children; + } + } + + /* + * Look for interesting children in the newstate list. 
+ */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -1117,6 +1239,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1161,12 +1288,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! */ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1235,11 +1366,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. */ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1258,7 +1390,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. 
*/ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1354,6 +1486,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p, B_FALSE); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index c25564d85f..f6179cf301 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,8 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. + * Copyright 2017, Joyent Inc. + * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -488,7 +489,7 @@ free_afd(afd_t *afd) /* called below and from thread_free() */ afd->a_fd[i] = -1; } -static void +void set_active_fd(int fd) { afd_t *afd = &curthread->t_activefd; @@ -958,7 +959,22 @@ closef(file_t *fp) vp = fp->f_vnode; - error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); + /* + * The __FLXPATH flag is a private interface for use by the lx + * brand in order to emulate open(O_NOFOLLOW|O_PATH) which, + * when a symbolic link is encountered, returns a file + * descriptor which references it. 
+ * See uts/common/brand/lx/syscall/lx_open.c + * + * When this flag is set, VOP_OPEN() will not have been called when + * this file descriptor was opened, and VOP_CLOSE() should not be + * called here (for a symlink, most filesystems would return ENOSYS + * anyway) + */ + if (fp->f_flag2 & (__FLXPATH >> 16)) + error = 0; + else + error = VOP_CLOSE(vp, flag, count, offset, fp->f_cred, NULL); if (count > 1) { mutex_exit(&fp->f_tlock); @@ -1118,7 +1134,7 @@ falloc(vnode_t *vp, int flag, file_t **fpp, int *fdp) mutex_enter(&fp->f_tlock); fp->f_count = 1; fp->f_flag = (ushort_t)flag; - fp->f_flag2 = (flag & (FSEARCH|FEXEC)) >> 16; + fp->f_flag2 = (flag & (FSEARCH|FEXEC|__FLXPATH)) >> 16; fp->f_vnode = vp; fp->f_offset = 0; fp->f_audit_data = 0; diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index 1caa0b9b7b..183e1f4333 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -84,6 +84,7 @@ static int64_t cfork(int, int, int); static int getproc(proc_t **, pid_t, uint_t); #define GETPROC_USER 0x0 #define GETPROC_KERNEL 0x1 +#define GETPROC_ZSCHED 0x2 static void fork_fail(proc_t *); static void forklwp_fail(proc_t *); @@ -706,7 +707,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp, B_FALSE); } } @@ -755,7 +756,7 @@ forklwp_fail(proc_t *p) kmem_free(t->t_door, sizeof (door_data_t)); t->t_door = NULL; } - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); /* * Remove the thread from the all threads list. @@ -792,6 +793,9 @@ extern struct as kas; /* * fork a kernel process. 
+ * + * Passing a pid argument of -1 indicates that the new process should be + * launched as a child of 'zsched' within the zone. */ int newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, @@ -810,6 +814,7 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; ASSERT(pid != 1); + ASSERT(pid >= 0); if (getproc(&p, pid, GETPROC_KERNEL) < 0) return (EAGAIN); @@ -853,8 +858,18 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, rctl_set_t *init_set; task_t *tk, *tk_old; klwp_t *lwp; + boolean_t pzsched = B_FALSE; + int flag = GETPROC_USER; + + /* Handle a new user-level thread as child of zsched. */ + if (pid < 0) { + VERIFY(curzone != global_zone); + flag = GETPROC_ZSCHED; + pzsched = B_TRUE; + pid = 0; + } - if (getproc(&p, pid, GETPROC_USER) < 0) + if (getproc(&p, pid, flag) < 0) return (EAGAIN); /* * init creates a new task, distinct from the task @@ -915,7 +930,8 @@ newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct, } t = lwptot(lwp); - ctp = contract_process_fork(sys_process_tmpl, p, curproc, + ctp = contract_process_fork(sys_process_tmpl, p, + (pzsched ? curproc->p_zone->zone_zsched : curproc), B_FALSE); ASSERT(ctp != NULL); if (ct != NULL) @@ -956,7 +972,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) return (-1); /* no point in starting new processes */ - pp = (flags & GETPROC_KERNEL) ? &p0 : curproc; + if (flags & GETPROC_ZSCHED) { + pp = curproc->p_zone->zone_zsched; + } else { + pp = (flags & GETPROC_KERNEL) ? 
&p0 : curproc; + } task = pp->p_task; proj = task->tk_proj; zone = pp->p_zone; @@ -1017,6 +1037,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1084,9 +1107,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1171,6 +1191,18 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + /* + * The only reason why process branding should fail is when + * the procedure is complicated by multiple LWPs on the scene. + * With an LWP count of 0, this newly allocated process has no + * reason to fail branding. + */ + VERIFY0(brand_setbrand(cp, B_FALSE)); + + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); @@ -1188,6 +1220,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) */ fcnt_add(P_FINFO(pp), 1); + mutex_enter(&pp->p_lock); if (PTOU(pp)->u_cdir) { VN_HOLD(PTOU(pp)->u_cdir); } else { @@ -1201,6 +1234,7 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) VN_HOLD(PTOU(pp)->u_rdir); if (PTOU(pp)->u_cwd) refstr_hold(PTOU(pp)->u_cwd); + mutex_exit(&pp->p_lock); /* * copy the parent's uarea. diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index da53bce24e..6e2d3c403c 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -21,7 +21,7 @@ /* * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. 
- * Copyright 2017 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ /* @@ -55,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -570,6 +571,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -585,8 +600,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -595,9 +608,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -638,7 +650,7 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, #define RANDOMIZABLE_MAPPING(addr, flags) (((flags & MAP_FIXED) == 0) && \ !(((flags & MAP_ALIGN) == 0) && (addr != 0) && aslr_respect_mmap_hint)) -static int +int smmap_common(caddr_t *addrp, size_t len, int prot, int flags, struct file *fp, offset_t pos) { @@ -771,8 +783,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. 
@@ -780,10 +790,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/ipc.c b/usr/src/uts/common/os/ipc.c index 86cb867da8..bf917ef716 100644 --- a/usr/src/uts/common/os/ipc.c +++ b/usr/src/uts/common/os/ipc.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1217,6 +1218,23 @@ ipc_remove(ipc_service_t *service, kipc_perm_t *perm) (IPC_ZONE_USAGE(perm, service) == 0))); } +/* + * Perform actual IPC_RMID, either via ipc_rmid or due to a delayed *_RMID. + */ +void +ipc_rmsvc(ipc_service_t *service, kipc_perm_t *perm) +{ + ASSERT(service->ipcs_count > 0); + ASSERT(MUTEX_HELD(&service->ipcs_lock)); + + ipc_remove(service, perm); + mutex_exit(&service->ipcs_lock); + + /* perform any per-service removal actions */ + service->ipcs_rmid(perm); + + ipc_rele(service, perm); +} /* * Common code to perform an IPC_RMID. Returns an errno value on @@ -1247,13 +1265,7 @@ ipc_rmid(ipc_service_t *service, int id, cred_t *cr) /* * Nothing can fail from this point on. */ - ipc_remove(service, perm); - mutex_exit(&service->ipcs_lock); - - /* perform any per-service removal actions */ - service->ipcs_rmid(perm); - - ipc_rele(service, perm); + ipc_rmsvc(service, perm); return (0); } diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index 394235f26c..4d2c1e6c10 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2017, Joyent, Inc. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright 2018, Joyent, Inc. diff --git a/usr/src/uts/common/os/kstat_fr.c b/usr/src/uts/common/os/kstat_fr.c index 93c04cff8d..b09b2d3558 100644 --- a/usr/src/uts/common/os/kstat_fr.c +++ b/usr/src/uts/common/os/kstat_fr.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -198,6 +198,9 @@ struct { kstat_named_t pagesfree; kstat_named_t pageslocked; kstat_named_t pagestotal; + kstat_named_t lowmemscan; + kstat_named_t zonecapscan; + kstat_named_t nthrottle; } system_pages_kstat = { { "physmem", KSTAT_DATA_ULONG }, { "nalloc", KSTAT_DATA_ULONG }, @@ -219,6 +222,9 @@ struct { { "pagesfree", KSTAT_DATA_ULONG }, { "pageslocked", KSTAT_DATA_ULONG }, { "pagestotal", KSTAT_DATA_ULONG }, + { "low_mem_scan", KSTAT_DATA_ULONG }, + { "zone_cap_scan", KSTAT_DATA_ULONG }, + { "n_throttle", KSTAT_DATA_ULONG }, }; static int header_kstat_update(kstat_t *, int); @@ -912,6 +918,9 @@ system_pages_kstat_update(kstat_t *ksp, int rw) system_pages_kstat.pageslocked.value.ul = (ulong_t)(availrmem_initial - availrmem); system_pages_kstat.pagestotal.value.ul = (ulong_t)total_pages; + system_pages_kstat.lowmemscan.value.ul = (ulong_t)low_mem_scan; + system_pages_kstat.zonecapscan.value.ul = (ulong_t)zone_cap_scan; + system_pages_kstat.nthrottle.value.ul = (ulong_t)n_throttle; /* * pp_kernel represents total pages used by the kernel since the * startup. 
This formula takes into account the boottime kernel diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index b5f41d93f9..6a922343e7 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -23,6 +23,8 @@ * Copyright 2020 Oxide Computer Company * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2022 Joyent, Inc. + * Copyright 2022 MNX Cloud, Inc. */ #include <sys/types.h> @@ -260,8 +262,11 @@ log_init(void) #ifdef LEGACY_BANNER printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + /* + * Note: In the future this should be 2022-20XX, and delete this + * comment when we don't need it anymore + */ + printf("Copyright 2022 MNX Cloud, Inc.\n"); #else bootbanner_print(log_bootbanner_print, KM_SLEEP); #endif diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index 5e3b1ec949..f487760e68 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. 
*/ #include <sys/param.h> @@ -56,6 +56,8 @@ #include <sys/lgrp.h> #include <sys/rctl.h> #include <sys/contract_impl.h> +#include <sys/contract/process.h> +#include <sys/contract/process_impl.h> #include <sys/cpc_impl.h> #include <sys/sdt.h> #include <sys/cmn_err.h> @@ -114,7 +116,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -282,6 +284,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -629,18 +644,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. - */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -653,7 +656,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -695,8 +697,27 @@ grow: t->t_post_sys = 1; /* + * Perform lwp branding + * + * The b_initlwp hook is _not_ allowed to drop p->p_lock as it must be + * continuously held between when the tidhash is sized and when the lwp + * is inserted into it. Operations requiring p->p_lock to be + * temporarily dropped can be performed in b_initlwp_post. 
+ */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + + /* * Insert the new thread into the list of all threads. */ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -717,6 +738,13 @@ grow: lep->le_start = t->t_start; lwp_hash_in(p, lep, p->p_tidhash, p->p_tidhash_sz, 1); + /* + * Complete lwp branding + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_initlwp_post != NULL) { + BROP(p)->b_initlwp_post(lwp); + } + lwp_fp_init(lwp); if (state == TS_RUN) { @@ -754,8 +782,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -828,8 +857,27 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) int i; for (i = 0; i < ct_ntypes; i++) { - dst->lwp_ct_active[i] = ctmpl_dup(src->lwp_ct_active[i]); + ct_template_t *tmpl = src->lwp_ct_active[i]; + + /* + * If the process contract template is setup to be preserved + * across exec, then if we're forking, perform an implicit + * template_clear now. This ensures that future children of + * this child will remain in the same contract unless they're + * explicitly setup differently. We know we're forking if the + * two LWPs belong to different processes. + */ + if (i == CTT_PROCESS && tmpl != NULL) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if (dst->lwp_procp != src->lwp_procp && + (ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + tmpl = NULL; + } + + dst->lwp_ct_active[i] = ctmpl_dup(tmpl); dst->lwp_ct_latest[i] = NULL; + } } @@ -837,21 +885,33 @@ lwp_ctmpl_copy(klwp_t *dst, klwp_t *src) * Clear an LWP's contract template state. 
*/ void -lwp_ctmpl_clear(klwp_t *lwp) +lwp_ctmpl_clear(klwp_t *lwp, boolean_t is_exec) { ct_template_t *tmpl; int i; for (i = 0; i < ct_ntypes; i++) { - if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { - ctmpl_free(tmpl); - lwp->lwp_ct_active[i] = NULL; - } - if (lwp->lwp_ct_latest[i] != NULL) { contract_rele(lwp->lwp_ct_latest[i]); lwp->lwp_ct_latest[i] = NULL; } + + if ((tmpl = lwp->lwp_ct_active[i]) != NULL) { + /* + * If we're exec-ing a new program and the process + * contract template is setup to be preserved across + * exec, then don't clear it. + */ + if (is_exec && i == CTT_PROCESS) { + ctmpl_process_t *ctp = tmpl->ctmpl_data; + + if ((ctp->ctp_params & CT_PR_KEEP_EXEC) != 0) + continue; + } + + ctmpl_free(tmpl); + lwp->lwp_ct_active[i] = NULL; + } } } @@ -892,13 +952,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -942,6 +995,18 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + prbarrier(p); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. 
* Otherwise null out the lwp's le_thread pointer in the lwp @@ -1096,7 +1161,7 @@ lwp_cleanup(void) } kpreempt_enable(); - lwp_ctmpl_clear(ttolwp(t)); + lwp_ctmpl_clear(ttolwp(t), B_FALSE); } int diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 148916d4d8..c57f8a7d2c 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -159,7 +159,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); - int brand_action; + int brand_action = EBA_NONE; if (args == NULL) args = ""; @@ -289,7 +289,15 @@ exec_init(const char *initpath, const char *args) */ sigemptyset(&curthread->t_hold); - brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; + /* + * Only instruct exec_common to brand the process if necessary. It is + * possible that the init process is already properly branded due to the + * proc_exit -> restart_init -> exec_init call chain. + */ + if (ZONE_IS_BRANDED(p->p_zone) && + p->p_brand != p->p_zone->zone_brand) { + brand_action = EBA_BRAND; + } again: error = exec_common((const char *)exec_fnamep, (const char **)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/mem_config.c b/usr/src/uts/common/os/mem_config.c index 4c4e78578b..fd74dd3092 100644 --- a/usr/src/uts/common/os/mem_config.c +++ b/usr/src/uts/common/os/mem_config.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -1638,7 +1639,7 @@ delthr_get_freemem(struct mem_handle *mhp) * Put pressure on pageout. 
*/ page_needfree(free_get); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); mutex_enter(&mhp->mh_mutex); (void) cv_reltimedwait(&mhp->mh_cv, &mhp->mh_mutex, diff --git a/usr/src/uts/common/os/mmapobj.c b/usr/src/uts/common/os/mmapobj.c index d85df39a62..819d32116d 100644 --- a/usr/src/uts/common/os/mmapobj.c +++ b/usr/src/uts/common/os/mmapobj.c @@ -1367,10 +1367,15 @@ calc_loadable(Ehdr *ehdrp, caddr_t phdrbase, int nphdrs, size_t *len, } if (num_segs++ == 0) { /* - * The p_vaddr of the first PT_LOAD segment - * must either be NULL or within the first - * page in order to be interpreted. - * Otherwise, its an invalid file. + * While ELF doesn't specify the meaning of + * p_vaddr for PT_LOAD segments in ET_DYN + * objects, we mandate that it is either NULL or + * (to accommodate some historical binaries) + * within the first page. (Note that there + * exist non-native ET_DYN objects that violate + * this constraint that we nonetheless must be + * able to execute; see the ET_DYN handling in + * mapelfexec() for details.)
*/ if (e_type == ET_DYN && ((caddr_t)((uintptr_t)vaddr & diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index 37389a6e4d..d48be19c71 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -113,6 +113,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { @@ -521,6 +533,19 @@ sprunlock(proc_t *p) mutex_exit(&p->p_lock); } +/* + * Undo effects of sprlock but without dropping p->p_lock + */ +void +sprunprlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; +} + void pid_init(void) { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 0e4bd2c73d..b3f01cfab2 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -57,6 +57,7 @@ #include <sys/mntent.h> #include <sys/contract_impl.h> #include <sys/dld_ioc.h> +#include <sys/brand.h> /* * There are two possible layers of privilege routines and two possible @@ -1275,6 +1276,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner) void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { + proc_t *p = curproc; + + /* + * Allow the brand to override this behaviour. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. 
+ */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2123,6 +2140,13 @@ secpolicy_meminfo(const cred_t *cr) } int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2639,3 +2663,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index 186aafc460..05979dd236 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -377,6 +381,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/rctl.c b/usr/src/uts/common/os/rctl.c index 81a1b5454a..8f52f4ef3a 100644 --- a/usr/src/uts/common/os/rctl.c +++ b/usr/src/uts/common/os/rctl.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. 
*/ #include <sys/atomic.h> @@ -194,6 +195,8 @@ id_space_t *rctl_ids; kmem_cache_t *rctl_cache; /* kmem cache for rctl structures */ kmem_cache_t *rctl_val_cache; /* kmem cache for rctl values */ +extern rctl_hndl_t rc_process_maxlockedmem; + kmutex_t rctl_lists_lock; rctl_dict_entry_t *rctl_lists[RC_MAX_ENTITY + 1]; @@ -2870,12 +2873,12 @@ rctl_init(void) * rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, * int chargeproc) * - * Increments the amount of locked memory on a project, and - * zone. If proj is non-NULL the project must be held by the - * caller; if it is NULL the proj and zone of proc_t p are used. - * If chargeproc is non-zero, then the charged amount is cached - * on p->p_locked_mem so that the charge can be migrated when a - * process changes projects. + * Increments the amount of locked memory on a process, project, and + * zone. If 'proj' is non-NULL, the project must be held by the + * caller; if it is NULL, the project and zone of process 'p' are used. + * If 'chargeproc' is non-zero, then the charged amount is added + * to p->p_locked_mem. This is also used so that the charge can be + * migrated when a process changes projects. 
* * Return values * 0 - success @@ -2893,6 +2896,7 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, ASSERT(p != NULL); ASSERT(MUTEX_HELD(&p->p_lock)); + if (proj != NULL) { projp = proj; zonep = proj->kpj_zone; @@ -2936,11 +2940,23 @@ rctl_incr_locked_mem(proc_t *p, kproject_t *proj, rctl_qty_t inc, } } - zonep->zone_locked_mem += inc; - projp->kpj_data.kpd_locked_mem += inc; if (chargeproc != 0) { + /* Check for overflow */ + if ((p->p_locked_mem + inc) < p->p_locked_mem) { + ret = EAGAIN; + goto out; + } + if (rctl_test_entity(rc_process_maxlockedmem, p->p_rctls, p, + &e, inc, 0) & RCT_DENY) { + ret = EAGAIN; + goto out; + } + p->p_locked_mem += inc; } + + zonep->zone_locked_mem += inc; + projp->kpj_data.kpd_locked_mem += inc; out: mutex_exit(&zonep->zone_mem_lock); return (ret); diff --git a/usr/src/uts/common/os/rctl_proc.c b/usr/src/uts/common/os/rctl_proc.c index 9b7324fe7b..c62540d2b4 100644 --- a/usr/src/uts/common/os/rctl_proc.c +++ b/usr/src/uts/common/os/rctl_proc.c @@ -21,6 +21,7 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
*/ #include <sys/types.h> @@ -32,6 +33,7 @@ #include <sys/port_kernel.h> #include <sys/signal.h> #include <sys/var.h> +#include <sys/policy.h> #include <sys/vmparam.h> #include <sys/machparam.h> @@ -66,6 +68,7 @@ rctl_hndl_t rc_process_semmsl; rctl_hndl_t rc_process_semopm; rctl_hndl_t rc_process_portev; rctl_hndl_t rc_process_sigqueue; +rctl_hndl_t rc_process_maxlockedmem; /* * process.max-cpu-time / RLIMIT_CPU @@ -212,6 +215,26 @@ static rctl_ops_t proc_vmem_ops = { }; /* + * process.max-locked-memory + */ +/*ARGSUSED*/ +static int +proc_maxlockedmem_test(struct rctl *r, struct proc *p, rctl_entity_p_t *e, + struct rctl_val *rv, rctl_qty_t i, uint_t f) +{ + if (secpolicy_lock_memory(CRED()) == 0) + return (0); + return ((p->p_locked_mem + i) > rv->rcv_value); +} + +static rctl_ops_t proc_maxlockedmem_ops = { + rcop_no_action, + rcop_no_usage, + rcop_no_set, + proc_maxlockedmem_test +}; + +/* * void rctlproc_default_init() * * Overview @@ -383,6 +406,11 @@ rctlproc_init(void) rctl_add_default_limit("process.max-sigqueue-size", _SIGQUEUE_SIZE_PRIVILEGED, RCPRIV_PRIVILEGED, RCTL_LOCAL_DENY); + rc_process_maxlockedmem = rctl_register("process.max-locked-memory", + RCENTITY_PROCESS, RCTL_GLOBAL_LOWERABLE | RCTL_GLOBAL_DENY_ALWAYS | + RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_BYTES, + ULONG_MAX, UINT32_MAX, &proc_maxlockedmem_ops); + /* * Place minimal set of controls on "sched" process for inheritance by * processes created via newproc(). diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index dba962fa63..a54ab28751 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -643,16 +647,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. */ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/shm.c b/usr/src/uts/common/os/shm.c index 8f98fcb3f0..d0611eb9bb 100644 --- a/usr/src/uts/common/os/shm.c +++ b/usr/src/uts/common/os/shm.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -319,6 +320,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) size_t share_size; struct shm_data ssd; uintptr_t align_hint; + long curprot; /* * Pick a share pagesize to use, if (!isspt(sp)). 
@@ -453,6 +455,7 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } } + curprot = sp->shm_opts & SHM_PROT_MASK; if (!isspt(sp)) { error = sptcreate(size, &segspt, sp->shm_amp, prot, flags, share_szc); @@ -462,8 +465,8 @@ shmat(int shmid, caddr_t uaddr, int uflags, uintptr_t *rvp) } sp->shm_sptinfo->sptas = segspt->s_as; sp->shm_sptseg = segspt; - sp->shm_sptprot = prot; - } else if ((prot & sp->shm_sptprot) != sp->shm_sptprot) { + sp->shm_opts = (sp->shm_opts & ~SHM_PROT_MASK) | prot; + } else if ((prot & curprot) != curprot) { /* * Ensure we're attaching to an ISM segment with * fewer or equal permissions than what we're @@ -748,6 +751,23 @@ shmctl(int shmid, int cmd, void *arg) } break; + /* Stage segment for removal, but don't remove until last detach */ + case SHM_RMID: + if ((error = secpolicy_ipc_owner(cr, (kipc_perm_t *)sp)) != 0) + break; + + /* + * If attached, just mark it as a pending remove, otherwise + * we must perform the normal ipc_rmid now. + */ + if ((sp->shm_perm.ipc_ref - 1) > 0) { + sp->shm_opts |= SHM_RM_PENDING; + } else { + mutex_exit(lock); + return (ipc_rmid(shm_svc, shmid, cr)); + } + break; + default: error = EINVAL; break; @@ -778,6 +798,23 @@ shm_detach(proc_t *pp, segacct_t *sap) sp->shm_ismattch--; sp->shm_dtime = gethrestime_sec(); sp->shm_lpid = pp->p_pid; + if ((sp->shm_opts & SHM_RM_PENDING) != 0 && + sp->shm_perm.ipc_ref == 2) { + /* + * If this is the last detach of the segment across the whole + * system then now we can perform the delayed IPC_RMID. + * The ipc_ref count has 1 for the original 'get' and one for + * each 'attach' (see 'stat' handling in shmctl). + */ + sp->shm_opts &= ~SHM_RM_PENDING; + mutex_enter(&shm_svc->ipcs_lock); + ipc_rmsvc(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ + ASSERT(!MUTEX_HELD(&shm_svc->ipcs_lock)); + ASSERT(((kipc_perm_t *)sp)->ipc_ref > 0); + + /* Lock was dropped, need to retake it for following rele. 
*/ + (void) ipc_lock(shm_svc, sp->shm_perm.ipc_id); + } ipc_rele(shm_svc, (kipc_perm_t *)sp); /* Drops lock */ kmem_free(sap, sizeof (segacct_t)); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 453b1f22d4..67a93581dd 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2017, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> #include <sys/signalfd.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -148,6 +149,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, klwp_t *lwp, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, lwp, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -159,12 +175,13 @@ signal_is_blocked(kthread_t *t, int sig) * the signal is not being accepted via sigwait() */ static int -sig_discardable(proc_t *p, int sig) +sig_discardable(proc_t *p, kthread_t *tp, int sig) { kthread_t *t = p->p_tlist; + klwp_t *lwp = (tp == NULL) ? NULL : tp->t_lwp; return (t == NULL || /* if zombie or ... 
*/ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, lwp, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -200,7 +217,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -297,7 +314,7 @@ sigtoproc(proc_t *p, kthread_t *t, int sig) } } - if (sig_discardable(p, sig)) { + if (sig_discardable(p, t, sig)) { DTRACE_PROC3(signal__discard, kthread_t *, p->p_tlist, proc_t *, p, int, sig); return; @@ -497,7 +514,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, lwp, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. @@ -623,6 +640,28 @@ issig_forreal(void) } /* + * The brand hook name 'b_issig_stop' is a misnomer. + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + int r; + + /* + * The brand hook will return 0 if it would like + * us to drive on, -1 if we should restart + * the loop to check other conditions, or 1 if we + * should terminate the loop. + */ + r = BROP(p)->b_issig_stop(p, lwp); + if (r < 0) { + continue; + } else if (r > 0) { + break; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. 
* Do not want to go back to loop here since this is a special @@ -656,7 +695,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, lwp, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -708,7 +747,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -722,7 +761,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, lwp, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -954,6 +993,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. + */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1065,7 +1114,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1171,6 +1220,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. 
+ */ if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1192,7 +1248,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1327,7 +1383,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, lwp, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1771,9 +1827,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child proc's + * initpid can be 1 for zlogin. */ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1804,6 +1863,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals.
+ */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); @@ -2115,7 +2183,7 @@ sigaddqa(proc_t *p, kthread_t *t, sigqueue_t *sigqp) ASSERT(MUTEX_HELD(&p->p_lock)); ASSERT(sig >= 1 && sig < NSIG); - if (sig_discardable(p, sig)) + if (sig_discardable(p, t, sig)) siginfofree(sigqp); else sigaddqins(p, t, sigqp); @@ -2141,7 +2209,7 @@ sigaddq(proc_t *p, kthread_t *t, k_siginfo_t *infop, int km_flags) * blocking the signal (it *could* change it's mind while * the signal is pending) then don't bother creating one. */ - if (!sig_discardable(p, sig) && + if (!sig_discardable(p, t, sig) && (sigismember(&p->p_siginfo, sig) || (curproc->p_ct_process != p->p_ct_process) || (sig == SIGCLD && SI_FROMKERNEL(infop))) && diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index c137a498d1..90a9ea6f0f 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -78,6 +78,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <sys/ptms.h> #include <sys/limits.h> #include <c2/audit.h> @@ -3267,6 +3268,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index fdd0c06aee..f2b91365d9 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -26,6 +26,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2018 Joyent, Inc. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. * Copyright 2018 Joyent, Inc. 
* Copyright 2022 Garrett D'Amore diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index 30cc5744c2..7c094a0f20 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -5822,6 +5822,12 @@ ddi_ffs(long mask) return (ffs(mask)); } +int +ddi_ffsll(long long mask) +{ + return (ffs(mask)); +} + /* * Find last bit set. Take mask and clear * all but the most significant bit, and @@ -5833,8 +5839,14 @@ ddi_ffs(long mask) int ddi_fls(long mask) { + return (ddi_flsll(mask)); +} + +int +ddi_flsll(long long mask) +{ while (mask) { - long nx; + long long nx; if ((nx = (mask & (mask - 1))) == 0) break; diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index ab12de3935..41f421c505 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -23,6 +23,7 @@ * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2018, Joyent, Inc. 
* Copyright 2020 Oxide Computer Company */ @@ -62,8 +63,7 @@ struct mmaplf32a; int access(char *, int); int alarm(int); int auditsys(struct auditcalls *, rval_t *); -int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, - uintptr_t); +int64_t brandsys(int, uintptr_t, uintptr_t, uintptr_t, uintptr_t); intptr_t brk(caddr_t); int chdir(char *); int chmod(char *, int); @@ -645,7 +645,7 @@ struct sysent sysent[NSYSCALL] = SYSENT_NOSYS(), SYSENT_C("llseek", llseek32, 4)), /* 176 */ SYSENT_LOADABLE(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE(), /* kaio */ /* 179 */ SYSENT_LOADABLE(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1000,7 +1000,7 @@ struct sysent sysent32[NSYSCALL] = /* 174 */ SYSENT_CI("pwrite", pwrite32, 4), /* 175 */ SYSENT_C("llseek", llseek32, 4), /* 176 */ SYSENT_LOADABLE32(), /* inst_sync */ - /* 177 */ SYSENT_CI("brandsys", brandsys, 6), + /* 177 */ SYSENT_CI("brandsys", brandsys, 5), /* 178 */ SYSENT_LOADABLE32(), /* kaio */ /* 179 */ SYSENT_LOADABLE32(), /* cpc */ /* 180 */ SYSENT_CI("lgrpsys", lgrpsys, 3), @@ -1092,18 +1092,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1111,7 
+1113,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1125,14 +1128,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1144,7 +1148,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1152,7 +1157,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1166,14 +1172,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) 
(*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1201,5 +1208,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index c78a545360..f587430625 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -82,6 +82,7 @@ timer_lock(proc_t *p, itimer_t *it) * waiters. p_lock must be held on entry; it will not be dropped by * timer_unlock(). */ +/* ARGSUSED */ static void timer_unlock(proc_t *p, itimer_t *it) { @@ -139,7 +140,7 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) it->it_backend->clk_timer_delete(it); - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portev) { port_kevent_t *pev; @@ -201,20 +202,20 @@ timer_delete_locked(proc_t *p, timer_t tid, itimer_t *it) static itimer_t * timer_grab(proc_t *p, timer_t tid) { - itimer_t **itp, *it; + itimer_t *it; if (tid < 0) { return (NULL); } mutex_enter(&p->p_lock); - - if ((itp = p->p_itimer) == NULL || tid >= p->p_itimer_sz || - (it = itp[tid]) == NULL) { + if (p->p_itimer == NULL || tid >= p->p_itimer_sz || + (it = p->p_itimer[tid]) == NULL) { mutex_exit(&p->p_lock); return (NULL); } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (it->it_lock & ITLK_REMOVE) { @@ -236,7 +237,7 @@ timer_grab(proc_t *p, timer_t tid) * should not be held on entry; timer_release() will acquire p_lock but * will drop it before returning. */ -static void +void timer_release(proc_t *p, itimer_t *it) { mutex_enter(&p->p_lock); @@ -249,7 +250,7 @@ timer_release(proc_t *p, itimer_t *it) * p_lock should not be held on entry; timer_delete_grabbed() will acquire * p_lock, but will drop it before returning. 
*/ -static void +void timer_delete_grabbed(proc_t *p, timer_t tid, itimer_t *it) { mutex_enter(&p->p_lock); @@ -464,6 +465,9 @@ timer_fire(itimer_t *it) it->it_pending = 1; port_send_event((port_kevent_t *)it->it_portev); mutex_exit(&it->it_mutex); + } else if (it->it_flags & IT_CALLBACK) { + it->it_cb_func(it); + ASSERT(MUTEX_NOT_HELD(&it->it_mutex)); } else if (it->it_flags & IT_SIGNAL) { it->it_pending = 1; mutex_exit(&it->it_mutex); @@ -580,85 +584,27 @@ done: return (B_TRUE); } +/* + * Setup a timer + * + * This allocates an itimer_t (including a timer_t ID and slot in the process), + * wires it up according to the provided sigevent, and associates it with the + * desired clock backend. Upon successful completion, the timer will be + * locked, preventing it from being armed via timer_settime() or deleted via + * timer_delete(). This gives the caller a chance to perform any last minute + * manipulations (such as configuring the IT_CALLBACK functionality and/or + * copying the timer_t out to userspace) before using timer_release() to unlock + * it or timer_delete_grabbed() to delete it. + */ int -timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) +timer_setup(clock_backend_t *backend, struct sigevent *evp, port_notify_t *pnp, + itimer_t **itp, timer_t *tidp) { - struct sigevent ev; proc_t *p = curproc; - clock_backend_t *backend; + int error = 0; itimer_t *it; sigqueue_t *sigq; - cred_t *cr = CRED(); - int error = 0; - timer_t i; - port_notify_t tim_pnevp; - port_kevent_t *pkevp = NULL; - - if ((backend = CLOCK_BACKEND(clock)) == NULL) - return (set_errno(EINVAL)); - - if (evp != NULL) { - /* - * short copyin() for binary compatibility - * fetch oldsigevent to determine how much to copy in. 
- */ - if (get_udatamodel() == DATAMODEL_NATIVE) { - if (copyin(evp, &ev, sizeof (struct oldsigevent))) - return (set_errno(EFAULT)); - - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, - sizeof (port_notify_t))) - return (set_errno(EFAULT)); - } -#ifdef _SYSCALL32_IMPL - } else { - struct sigevent32 ev32; - port_notify32_t tim_pnevp32; - - if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) - return (set_errno(EFAULT)); - ev.sigev_notify = ev32.sigev_notify; - ev.sigev_signo = ev32.sigev_signo; - /* - * See comment in sigqueue32() on handling of 32-bit - * sigvals in a 64-bit kernel. - */ - ev.sigev_value.sival_int = ev32.sigev_value.sival_int; - if (ev.sigev_notify == SIGEV_PORT || - ev.sigev_notify == SIGEV_THREAD) { - if (copyin((void *)(uintptr_t) - ev32.sigev_value.sival_ptr, - (void *)&tim_pnevp32, - sizeof (port_notify32_t))) - return (set_errno(EFAULT)); - tim_pnevp.portnfy_port = - tim_pnevp32.portnfy_port; - tim_pnevp.portnfy_user = - (void *)(uintptr_t)tim_pnevp32.portnfy_user; - } -#endif - } - switch (ev.sigev_notify) { - case SIGEV_NONE: - break; - case SIGEV_SIGNAL: - if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) - return (set_errno(EINVAL)); - break; - case SIGEV_THREAD: - case SIGEV_PORT: - break; - default: - return (set_errno(EINVAL)); - } - } else { - /* - * Use the clock's default sigevent (this is a structure copy). - */ - ev = backend->clk_default; - } + timer_t tid; /* * We'll allocate our sigqueue now, before we grab p_lock. 
@@ -674,29 +620,25 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) mutex_init(&it->it_mutex, NULL, MUTEX_DEFAULT, NULL); mutex_enter(&p->p_lock); - if (!timer_get_id(p, &i)) { + if (!timer_get_id(p, &tid)) { mutex_exit(&p->p_lock); - kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); return (set_errno(EAGAIN)); } - ASSERT(i < p->p_itimer_sz && p->p_itimer[i] == NULL); + ASSERT(tid < p->p_itimer_sz && p->p_itimer[tid] == NULL); /* * If we develop other notification mechanisms, this will need * to call into (yet another) backend. */ - sigq->sq_info.si_signo = ev.sigev_signo; - if (evp == NULL) - sigq->sq_info.si_value.sival_int = i; - else - sigq->sq_info.si_value = ev.sigev_value; + sigq->sq_info.si_signo = evp->sigev_signo; + sigq->sq_info.si_value = evp->sigev_value; sigq->sq_info.si_code = SI_TIMER; sigq->sq_info.si_pid = p->p_pid; sigq->sq_info.si_ctid = PRCTID(p); sigq->sq_info.si_zoneid = getzoneid(); - sigq->sq_info.si_uid = crgetruid(cr); + sigq->sq_info.si_uid = crgetruid(CRED()); sigq->sq_func = timer_signal; sigq->sq_next = NULL; sigq->sq_backptr = it; @@ -704,9 +646,12 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) it->it_backend = backend; it->it_lock = ITLK_LOCKED; - if (ev.sigev_notify == SIGEV_THREAD || - ev.sigev_notify == SIGEV_PORT) { + if (evp->sigev_notify == SIGEV_THREAD || + evp->sigev_notify == SIGEV_PORT) { int port; + port_kevent_t *pkevp = NULL; + + ASSERT(pnp != NULL); /* * This timer is programmed to use event port notification when @@ -726,7 +671,7 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) */ it->it_flags |= IT_PORT; - port = tim_pnevp.portnfy_port; + port = pnp->portnfy_port; /* associate timer as event source with the port */ error = port_associate_ksource(port, PORT_SOURCE_TIMER, @@ -736,7 +681,7 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); 
kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* allocate an event structure/slot */ @@ -748,21 +693,21 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) mutex_exit(&p->p_lock); kmem_cache_free(clock_timer_cache, it); kmem_free(sigq, sizeof (sigqueue_t)); - return (set_errno(error)); + return (error); } /* initialize event data */ - port_init_event(pkevp, i, tim_pnevp.portnfy_user, + port_init_event(pkevp, tid, pnp->portnfy_user, timer_port_callback, it); it->it_portev = pkevp; it->it_portfd = port; } else { - if (ev.sigev_notify == SIGEV_SIGNAL) + if (evp->sigev_notify == SIGEV_SIGNAL) it->it_flags |= IT_SIGNAL; } /* Populate the slot now that the timer is prepped. */ - p->p_itimer[i] = it; + p->p_itimer[tid] = it; mutex_exit(&p->p_lock); /* @@ -775,17 +720,8 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) it->it_lwp = ttolwp(curthread); it->it_proc = p; - if (copyout(&i, tid, sizeof (timer_t)) != 0) { - error = EFAULT; - goto err; - } - - /* - * If we're here, then we have successfully created the timer; we - * just need to release the timer and return. - */ - timer_release(p, it); - + *itp = it; + *tidp = tid; return (0); err: @@ -796,11 +732,115 @@ err: * impossible for a removal to be pending. */ ASSERT(!(it->it_lock & ITLK_REMOVE)); - timer_delete_grabbed(p, i, it); + timer_delete_grabbed(p, tid, it); + + return (error); +} + + +int +timer_create(clockid_t clock, struct sigevent *evp, timer_t *tidp) +{ + int error = 0; + proc_t *p = curproc; + clock_backend_t *backend; + struct sigevent ev; + itimer_t *it; + timer_t tid; + port_notify_t tim_pnevp; + + if ((backend = CLOCK_BACKEND(clock)) == NULL) + return (set_errno(EINVAL)); + + if (evp != NULL) { + /* + * short copyin() for binary compatibility + * fetch oldsigevent to determine how much to copy in. 
+ */ + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(evp, &ev, sizeof (struct oldsigevent))) + return (set_errno(EFAULT)); + + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin(ev.sigev_value.sival_ptr, &tim_pnevp, + sizeof (port_notify_t))) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + } else { + struct sigevent32 ev32; + port_notify32_t tim_pnevp32; - return (set_errno(error)); + if (copyin(evp, &ev32, sizeof (struct oldsigevent32))) + return (set_errno(EFAULT)); + ev.sigev_notify = ev32.sigev_notify; + ev.sigev_signo = ev32.sigev_signo; + /* + * See comment in sigqueue32() on handling of 32-bit + * sigvals in a 64-bit kernel. + */ + ev.sigev_value.sival_int = ev32.sigev_value.sival_int; + if (ev.sigev_notify == SIGEV_PORT || + ev.sigev_notify == SIGEV_THREAD) { + if (copyin((void *)(uintptr_t) + ev32.sigev_value.sival_ptr, + (void *)&tim_pnevp32, + sizeof (port_notify32_t))) + return (set_errno(EFAULT)); + tim_pnevp.portnfy_port = + tim_pnevp32.portnfy_port; + tim_pnevp.portnfy_user = + (void *)(uintptr_t)tim_pnevp32.portnfy_user; + } +#endif + } + switch (ev.sigev_notify) { + case SIGEV_NONE: + break; + case SIGEV_SIGNAL: + if (ev.sigev_signo < 1 || ev.sigev_signo >= NSIG) + return (set_errno(EINVAL)); + break; + case SIGEV_THREAD: + case SIGEV_PORT: + break; + default: + return (set_errno(EINVAL)); + } + } else { + /* + * Use the clock's default sigevent (this is a structure copy). + */ + ev = backend->clk_default; + } + + if ((error = timer_setup(backend, &ev, &tim_pnevp, &it, &tid)) != 0) { + return (set_errno(error)); + } + + /* + * Populate si_value with the timer ID if no sigevent was passed in. 
+ */ + if (evp == NULL) { + it->it_sigq->sq_info.si_value.sival_int = tid; + } + + if (copyout(&tid, tidp, sizeof (timer_t)) != 0) { + timer_delete_grabbed(p, tid, it); + return (set_errno(EFAULT)); + } + + /* + * If we're here, then we have successfully created the timer; we + * just need to release the timer and return. + */ + timer_release(p, it); + + return (0); } + int timer_gettime(timer_t tid, itimerspec_t *val) { @@ -923,17 +963,20 @@ timer_lwpexit(void) uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } for (i = 0; i < p->p_itimer_sz; i++) { - if ((it = itp[i]) == NULL) + if ((it = p->p_itimer[i]) == NULL) { continue; + } + /* This may drop p_lock temporarily. */ timer_lock(p, it); if ((it->it_lock & ITLK_REMOVE) || it->it_lwp != lwp) { @@ -967,17 +1010,19 @@ timer_lwpbind() uint_t i; proc_t *p = curproc; klwp_t *lwp = ttolwp(curthread); - itimer_t *it, **itp; + itimer_t *it; ASSERT(MUTEX_HELD(&p->p_lock)); - if ((itp = p->p_itimer) == NULL) + if (p->p_itimer == NULL) { return; + } for (i = 0; i < p->p_itimer_sz; i++) { - if ((it = itp[i]) == NULL) + if ((it = p->p_itimer[i]) == NULL) continue; + /* This may drop p_lock temporarily. */ timer_lock(p, it); if (!(it->it_lock & ITLK_REMOVE) && it->it_lwp == lwp) { @@ -1068,7 +1113,7 @@ timer_close_port(void *arg, int port, pid_t pid, int lastclose) for (tid = 0; tid < timer_max; tid++) { if ((it = timer_grab(p, tid)) == NULL) continue; - if (it->it_portev) { + if (it->it_flags & IT_PORT) { mutex_enter(&it->it_mutex); if (it->it_portfd == port) { port_kevent_t *pev; diff --git a/usr/src/uts/common/os/vm_pageout.c b/usr/src/uts/common/os/vm_pageout.c index b65a6cea2f..0cfcf80d62 100644 --- a/usr/src/uts/common/os/vm_pageout.c +++ b/usr/src/uts/common/os/vm_pageout.c @@ -27,6 +27,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -63,6 +64,7 @@ #include <sys/callb.h> #include <sys/mem_cage.h> #include <sys/time.h> +#include <sys/zone.h> #include <sys/stdbool.h> #include <vm/hat.h> @@ -239,15 +241,22 @@ pgcnt_t lotsfree = 0; pgcnt_t needfree = 0; pgcnt_t throttlefree = 0; pgcnt_t pageout_reserve = 0; +pri_t pageout_pri; pgcnt_t deficit; pgcnt_t nscan; pgcnt_t desscan; +/* kstats */ +uint64_t low_mem_scan; +uint64_t zone_cap_scan; + +#define MAX_PSCAN_THREADS 16 + /* - * Values for min_pageout_nsec, max_pageout_nsec and pageout_nsec are the - * number of nanoseconds in each wakeup cycle that gives the equivalent of some - * underlying %CPU duty cycle. + * Values for min_pageout_nsec, max_pageout_nsec, pageout_nsec and + * zone_pageout_nsec are the number of nanoseconds in each wakeup cycle + * that gives the equivalent of some underlying %CPU duty cycle. * * min_pageout_nsec: * nanoseconds/wakeup equivalent of min_percent_cpu. @@ -259,15 +268,31 @@ pgcnt_t desscan; * Number of nanoseconds budgeted for each wakeup cycle. * Computed each time around by schedpaging(). * Varies between min_pageout_nsec and max_pageout_nsec, - * depending on memory pressure. + * depending on memory pressure or zones over their cap. + * + * zone_pageout_nsec: + * Number of nanoseconds budgeted for each cycle when a zone + * is over its memory cap. If this is zero, then the value + * of max_pageout_nsec is used instead. */ static hrtime_t min_pageout_nsec; static hrtime_t max_pageout_nsec; static hrtime_t pageout_nsec; +static hrtime_t zone_pageout_nsec; -static uint_t reset_hands; +static boolean_t reset_hands[MAX_PSCAN_THREADS]; #define PAGES_POLL_MASK 1023 +#define SCHEDPAGING_HZ 4 + +/* + * despagescanners: + * The desired number of page scanner threads. The value can be set in + * /etc/system or tuned directly with 'mdb -kw'.
The system will bring + * the actual number of threads into line with the desired number. If set + * to an invalid value, the system will correct the setting. + */ +uint_t despagescanners = 0; /* * pageout_sample_lim: @@ -293,26 +318,29 @@ static uint_t reset_hands; * pageout_scanner(), which then sets this value once per system boot after * enough samples have been recorded (pageout_sample_cnt). Once set, this * new value is used for fastscan and handspreadpages. - * - * sample_start, sample_end: - * The hrtime at which the last pageout_scanner() sample began and ended. */ typedef hrtime_t hrrate_t; static uint64_t pageout_sample_lim = 4; static uint64_t pageout_sample_cnt = 0; static pgcnt_t pageout_sample_pages = 0; +static hrtime_t pageout_sample_etime = 0; static hrrate_t pageout_rate = 0; static pgcnt_t pageout_new_spread = 0; -static hrtime_t pageout_cycle_nsec; -static hrtime_t sample_start, sample_end; -static hrtime_t pageout_sample_etime = 0; +/* True if the page scanner is first starting up */ +#define PAGE_SCAN_STARTUP (pageout_sample_cnt < pageout_sample_lim) + +/* The current number of page scanner threads */ +static uint_t n_page_scanners = 1; +/* The number of page scanner threads that are actively scanning. */ +static uint_t pageouts_running; /* * Record number of times a pageout_scanner() wakeup cycle finished because it * timed out (exceeded its CPU budget), rather than because it visited - * its budgeted number of pages. + * its budgeted number of pages. This is only done when scanning under low + * free memory conditions, not when scanning for zones over their cap. 
*/ uint64_t pageout_timeouts = 0; @@ -356,9 +384,10 @@ static struct clockinit { pgcnt_t ci_fastscan; pgcnt_t ci_slowscan; pgcnt_t ci_handspreadpages; + uint_t ci_despagescanners; } clockinit = { .ci_init = false }; -static pgcnt_t +static inline pgcnt_t clamp(pgcnt_t value, pgcnt_t minimum, pgcnt_t maximum) { if (value < minimum) { @@ -381,6 +410,83 @@ tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval) } /* + * Local boolean to control scanning when zones are over their cap. Avoids + * accessing the zone_num_over_cap variable except within schedpaging(), which + * only runs periodically. This is here only to reduce our access to + * zone_num_over_cap, since it is already accessed a lot during paging, and + * the page scanner accesses the zones_over variable on each page during a + * scan. There is no lock needed for zone_num_over_cap since schedpaging() + * doesn't modify the variable, it only cares if the variable is 0 or non-0. + */ +static boolean_t zones_over = B_FALSE; + +/* + * On large memory systems, multiple instances of the page scanner are run, + * each responsible for a separate region of memory. This speeds up page + * invalidation under low memory conditions. + * + * despagescanners can be set in /etc/system or via mdb and it will + * be used as a guide for how many page scanners to create; the value + * will be adjusted if it is not sensible. Otherwise, the number of + * page scanners is determined dynamically based on handspreadpages. + */ +static void +recalc_pagescanners(void) +{ + pgcnt_t sz; + uint_t des; + + /* If the initial calibration has not been done, take no action. */ + if (pageout_new_spread == 0) + return; + + /* + * If the desired number of scanners is set in /etc/system + * then try to use it. 
+ */ + if (despagescanners == 0 && clockinit.ci_despagescanners != 0) + despagescanners = clockinit.ci_despagescanners; + + if (despagescanners != 0) { + /* + * We have a desired number of page scanners, either from + * /etc/system or set via mdb. Try and use it (it will be + * clamped below). + */ + des = despagescanners; + } else { + /* + * Calculate the number of desired scanners based on the + * system's memory size. + * + * A 64GiB region size is used as the basis for calculating how + * many scanner threads should be created. For systems with up + * to 64GiB of RAM, a single thread is used; for very large + * memory systems the threads are limited to MAX_PSCAN_THREADS. + */ + sz = btop(64ULL << 30); + + if (sz > looppages) { + des = 1; + } else { + pgcnt_t tmp = sz; + + for (des = 1; tmp < looppages; des++) + tmp += sz; + } + } + + /* + * clamp the number of scanners so that we are under MAX_PSCAN_THREADS + * and so that each scanner covers at least 10% more than + * handspreadpages. + */ + des = clamp(des, 1, + looppages / (handspreadpages + handspreadpages / 10)); + despagescanners = clamp(des, 1, MAX_PSCAN_THREADS); +} + +/* * Set up the paging constants for the clock algorithm used by * pageout_scanner(), and by the virtual memory system overall. 
See the * comments at the top of this file for more information about the threshold @@ -394,7 +500,6 @@ tune(pgcnt_t initval, pgcnt_t initval_ceiling, pgcnt_t defval) void setupclock(void) { - pgcnt_t defval; bool half = (pageout_threshold_style == 1); bool recalc = true; @@ -423,6 +528,7 @@ setupclock(void) clockinit.ci_fastscan = fastscan; clockinit.ci_slowscan = slowscan; clockinit.ci_handspreadpages = handspreadpages; + clockinit.ci_despagescanners = despagescanners; /* * The first call does not trigger a recalculation, only @@ -604,7 +710,7 @@ setupclock(void) } /* - * Handspreadpages is distance (in pages) between front and back + * Handspreadpages is the distance (in pages) between front and back * pageout daemon hands. The amount of time to reclaim a page * once pageout examines it increases with this distance and * decreases as the scan rate rises. It must be < the amount @@ -640,12 +746,31 @@ setupclock(void) } /* - * If we have been called to recalculate the parameters, set a flag to - * re-evaluate the clock hand pointers. + * Establish the minimum and maximum length of time to be spent + * scanning pages per wakeup, limiting the scanner duty cycle. The + * input percentage values (0-100) must be converted to a fraction of + * the number of nanoseconds in a second of wall time, then further + * scaled down by the number of scanner wakeups in a second. */ - if (recalc) { - reset_hands = 1; - } + min_pageout_nsec = MAX(1, + NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ); + max_pageout_nsec = MAX(min_pageout_nsec, + NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ); + + /* + * If not called for recalculation, return and skip the remaining + * steps. + */ + if (!recalc) + return; + + /* + * Set a flag to re-evaluate the clock hand positions. 
+ */ + for (uint_t i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; + + recalc_pagescanners(); } /* @@ -659,9 +784,8 @@ setupclock(void) * in its next pass; schedpaging() sets this value based on the amount of * currently available memory. */ -#define SCHEDPAGING_HZ 4 -static kmutex_t pageout_mutex; /* held while pageout or schedpaging running */ +static kmutex_t pageout_mutex; /* * Pool of available async pageout putpage requests. @@ -689,9 +813,9 @@ static bool pageout_pushing = false; static uint64_t pageout_pushcount = 0; static uint64_t pageout_pushcount_seen = 0; -static int async_list_size = 256; /* number of async request structs */ +static int async_list_size = 8192; /* number of async request structs */ -static void pageout_scanner(void); +static void pageout_scanner(void *); /* * If a page is being shared more than "po_share" times @@ -721,24 +845,17 @@ schedpaging(void *arg) kcage_cageout_wakeup(); if (mutex_tryenter(&pageout_mutex)) { - /* pageout() not running */ + + if (pageouts_running != 0) + goto out; + + /* No pageout scanner threads running. */ nscan = 0; vavail = freemem - deficit; if (pageout_new_spread != 0) vavail -= needfree; - if (vavail < 0) - vavail = 0; - if (vavail > lotsfree) - vavail = lotsfree; + vavail = clamp(vavail, 0, lotsfree); - /* - * Fix for 1161438 (CRS SPR# 73922). All variables - * in the original calculation for desscan were 32 bit signed - * ints. As freemem approaches 0x0 on a system with 1 Gig or - * more of memory, the calculation can overflow. When this - * happens, desscan becomes negative and pageout_scanner() - * stops paging out. 
- */ if (needfree > 0 && pageout_new_spread == 0) { /* * If we've not yet collected enough samples to @@ -764,14 +881,92 @@ schedpaging(void *arg) pageout_nsec = min_pageout_nsec + (lotsfree - vavail) * (max_pageout_nsec - min_pageout_nsec) / nz(lotsfree); - if (freemem < lotsfree + needfree || - pageout_sample_cnt < pageout_sample_lim) { + DTRACE_PROBE2(schedpage__calc, pgcnt_t, desscan, hrtime_t, + pageout_nsec); + + if (pageout_new_spread != 0 && despagescanners != 0 && + despagescanners != n_page_scanners) { + /* + * We have finished the pagescan initialisation and the + * desired number of page scanners has changed, either + * because initialisation just finished, because of a + * memory DR, or because despagescanners has been + * modified on the fly (i.e. by mdb). + */ + uint_t i, curr_nscan = n_page_scanners; + + /* Re-validate despagescanners */ + recalc_pagescanners(); + + n_page_scanners = despagescanners; + + for (i = 0; i < MAX_PSCAN_THREADS; i++) + reset_hands[i] = B_TRUE; + + /* If we need more scanners, start them now. */ + if (n_page_scanners > curr_nscan) { + for (i = curr_nscan; i < n_page_scanners; i++) { + (void) lwp_kernel_create(proc_pageout, + pageout_scanner, + (void *)(uintptr_t)i, TS_RUN, + pageout_pri); + } + } + + /* + * If the number of scanners has decreased, trigger a + * wakeup so that the excess threads will terminate. + */ + if (n_page_scanners < curr_nscan) { + WAKE_PAGEOUT_SCANNER(); + } + } + + zones_over = B_FALSE; + + if (PAGE_SCAN_STARTUP) { /* - * Either we need more memory, or we still need to - * measure the average scan rate. Wake the scanner. + * We still need to measure the rate at which the + * system is able to scan pages of memory. Each of + * these initial samples is a scan of as much system + * memory as practical, regardless of whether or not we + * are experiencing memory pressure. 
*/ - DTRACE_PROBE(pageout__cv__signal); - cv_signal(&proc_pageout->p_cv); + desscan = total_pages; + pageout_nsec = max_pageout_nsec; + + DTRACE_PROBE(schedpage__wake__sample); + WAKE_PAGEOUT_SCANNER(); + } else if (freemem < lotsfree + needfree) { + /* + * We need more memory. + */ + low_mem_scan++; + + DTRACE_PROBE(schedpage__wake__low); + WAKE_PAGEOUT_SCANNER(); + } else if (zone_num_over_cap > 0) { + /* + * One of more zones are over their cap. + */ + + /* No page limit */ + desscan = total_pages; + + /* + * Increase the scanning CPU% to the max. This implies + * 80% of one CPU/sec if the scanner can run each + * opportunity. Can also be tuned via setting + * zone_pageout_nsec in /etc/system or with mdb. + */ + pageout_nsec = (zone_pageout_nsec != 0) ? + zone_pageout_nsec : max_pageout_nsec; + + zones_over = B_TRUE; + zone_cap_scan++; + + DTRACE_PROBE(schedpage__wake__zone); + WAKE_PAGEOUT_SCANNER(); } else { /* * There are enough free pages, no need to @@ -784,6 +979,7 @@ schedpaging(void *arg) po_share >>= 1; } } +out: mutex_exit(&pageout_mutex); } @@ -812,37 +1008,39 @@ uint_t dopageout = 1; /* * The page out daemon, which runs as process 2. * - * As long as there are at least lotsfree pages, - * this process is not run. When the number of free - * pages stays in the range desfree to lotsfree, - * this daemon runs through the pages in the loop - * at a rate determined in schedpaging(). Pageout manages - * two hands on the clock. The front hand moves through - * memory, clearing the reference bit, - * and stealing pages from procs that are over maxrss. - * The back hand travels a distance behind the front hand, - * freeing the pages that have not been referenced in the time - * since the front hand passed. If modified, they are pushed to - * swap before being freed. + * The daemon treats physical memory as a circular array of pages and scans + * the pages using a 'two-handed clock' algorithm. 
The front hand moves + * through the pages, clearing the reference bit. The back hand travels a + * distance (handspreadpages) behind the front hand, freeing the pages that + * have not been referenced in the time since the front hand passed. If + * modified, they are first written to their backing store before being + * freed. + * + * In order to make page invalidation more responsive on machines with + * larger memory, multiple pageout_scanner threads may be created. In this + * case, each thread is given a segment of the memory "clock face" so that + * memory can be reclaimed more quickly. * - * There are 2 threads that act on behalf of the pageout process. - * One thread scans pages (pageout_scanner) and frees them up if - * they don't require any VOP_PUTPAGE operation. If a page must be - * written back to its backing store, the request is put on a list - * and the other (pageout) thread is signaled. The pageout thread - * grabs VOP_PUTPAGE requests from the list, and processes them. - * Some filesystems may require resources for the VOP_PUTPAGE - * operations (like memory) and hence can block the pageout - * thread, but the scanner thread can still operate. There is still - * no guarantee that memory deadlocks cannot occur. + * As long as there are at least lotsfree pages and no zones are over their + * cap, pageout_scanner threads are not run. When they run because free + * memory is low (case a), all pages are considered for pageout. When + * they run because a zone is over its cap (case b), only pages belonging + * to a zone over its cap are considered for pageout. * - * For now, this thing is in very rough form. + * There are multiple threads that act on behalf of the pageout process. A + * set of threads scan pages (pageout_scanner) and frees them up if they + * don't require any VOP_PUTPAGE operation. If a page must be written back + * to its backing store, the request is put on a list and the other + * (pageout) thread is signaled.
The pageout thread grabs VOP_PUTPAGE + * requests from the list, and processes them. Some filesystems may require + * resources for the VOP_PUTPAGE operations (like memory) and hence can + * block the pageout thread, but the scanner thread can still operate. + * There is still no guarantee that memory deadlocks cannot occur. */ void pageout() { struct async_reqs *arg; - pri_t pageout_pri; int i; pgcnt_t max_pushes; callb_cpr_t cprinfo; @@ -873,11 +1071,12 @@ pageout() push_req[i].a_next = &push_req[i + 1]; } - pageout_pri = curthread->t_pri; + pageout_pri = curthread->t_pri - 1; - /* Create the pageout scanner thread. */ - (void) lwp_kernel_create(proc_pageout, pageout_scanner, NULL, TS_RUN, - pageout_pri - 1); + /* Create the first pageout scanner thread. */ + (void) lwp_kernel_create(proc_pageout, pageout_scanner, + (void *)0, /* this is instance 0, not NULL */ + TS_RUN, pageout_pri); /* * kick off pageout scheduler. @@ -912,6 +1111,8 @@ pageout() pageout_pushing = true; mutex_exit(&push_lock); + DTRACE_PROBE(pageout__push); + if (VOP_PUTPAGE(arg->a_vp, (offset_t)arg->a_off, arg->a_len, arg->a_flags, arg->a_cred, NULL) == 0) { pushes++; @@ -934,14 +1135,19 @@ pageout() * Kernel thread that scans pages looking for ones to free */ static void -pageout_scanner(void) +pageout_scanner(void *a) { - struct page *fronthand, *backhand; + struct page *fronthand, *backhand, *fronthandstart; + struct page *regionstart, *regionend; uint_t laps; callb_cpr_t cprinfo; - pgcnt_t nscan_limit; + pgcnt_t nscan_cnt, tick; pgcnt_t pcount; - bool sampling; + bool bhwrapping, fhwrapping; + hrtime_t sample_start, sample_end; + uint_t inst = (uint_t)(uintptr_t)a; + + VERIFY3U(inst, <, MAX_PSCAN_THREADS); CALLB_CPR_INIT(&cprinfo, &pageout_mutex, callb_generic_cpr, "poscan"); mutex_enter(&pageout_mutex); @@ -951,113 +1157,153 @@ pageout_scanner(void) * the right point on the assumption that after one circuit things * will have settled down, and restarts shouldn't be that often. 
*/ + reset_hands[inst] = B_TRUE; - /* - * Set the two clock hands to be separated by a reasonable amount, - * but no more than 360 degrees apart. - */ - backhand = page_first(); - if (handspreadpages >= total_pages) { - fronthand = page_nextn(backhand, total_pages - 1); - } else { - fronthand = page_nextn(backhand, handspreadpages); - } - - /* - * Establish the minimum and maximum length of time to be spent - * scanning pages per wakeup, limiting the scanner duty cycle. The - * input percentage values (0-100) must be converted to a fraction of - * the number of nanoseconds in a second of wall time, then further - * scaled down by the number of scanner wakeups in a second: - */ - min_pageout_nsec = MAX(1, - NANOSEC * min_percent_cpu / 100 / SCHEDPAGING_HZ); - max_pageout_nsec = MAX(min_pageout_nsec, - NANOSEC * max_percent_cpu / 100 / SCHEDPAGING_HZ); + pageouts_running++; + mutex_exit(&pageout_mutex); loop: cv_signal_pageout(); + mutex_enter(&pageout_mutex); + pageouts_running--; CALLB_CPR_SAFE_BEGIN(&cprinfo); cv_wait(&proc_pageout->p_cv, &pageout_mutex); CALLB_CPR_SAFE_END(&cprinfo, &pageout_mutex); + pageouts_running++; + mutex_exit(&pageout_mutex); /* - * Check if pageout has been disabled for debugging purposes: + * Check if pageout has been disabled for debugging purposes. */ if (!dopageout) { goto loop; } /* - * One may reset the clock hands for debugging purposes. Hands will - * also be reset if memory is added to or removed from the system. + * One may reset the clock hands and scanned region for debugging + * purposes. Hands will also be reset on first thread startup, if + * the number of scanning threads (n_page_scanners) changes, or if + * memory is added to, or removed from, the system. */ - if (reset_hands) { - reset_hands = 0; + if (reset_hands[inst]) { + struct page *first; + + reset_hands[inst] = B_FALSE; + + if (inst >= n_page_scanners) { + /* + * The desired number of page scanners has been + * reduced and this instance is no longer wanted. 
+ * Exit the lwp. + */ + VERIFY3U(inst, !=, 0); + DTRACE_PROBE1(pageout__exit, uint_t, inst); + mutex_enter(&pageout_mutex); + pageouts_running--; + mutex_exit(&pageout_mutex); + mutex_enter(&curproc->p_lock); + lwp_exit(); + /* NOTREACHED */ + } + + first = page_first(); + + /* + * Each scanner thread gets its own sector of the memory + * clock face. + */ + pgcnt_t span, offset; - backhand = page_first(); - if (handspreadpages >= total_pages) { - fronthand = page_nextn(backhand, total_pages - 1); + span = looppages / n_page_scanners; + VERIFY3U(span, >, handspreadpages); + + offset = inst * span; + regionstart = page_nextn(first, offset); + if (inst == n_page_scanners - 1) { + /* The last instance goes up to the last page */ + regionend = page_nextn(first, looppages - 1); } else { - fronthand = page_nextn(backhand, handspreadpages); + regionend = page_nextn(regionstart, span - 1); } + + backhand = regionstart; + fronthand = page_nextn(backhand, handspreadpages); + tick = 1; + + bhwrapping = fhwrapping = B_FALSE; + + DTRACE_PROBE4(pageout__reset, uint_t, inst, + pgcnt_t, regionstart, pgcnt_t, regionend, + pgcnt_t, fronthand); } + /* + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. + */ CPU_STATS_ADDQ(CPU, vm, pgrrun, 1); /* * Keep track of the number of times we have scanned all the way around - * the loop: + * the loop on this wakeup. */ laps = 0; - DTRACE_PROBE(pageout__start); - /* * Track the number of pages visited during this scan so that we can * periodically measure our duty cycle. */ + nscan_cnt = 0; pcount = 0; - if (pageout_sample_cnt < pageout_sample_lim) { - /* - * We need to measure the rate at which the system is able to - * scan pages of memory. Each of these initial samples is a - * scan of all system memory, regardless of whether or not we - * are experiencing memory pressure. 
- */ - nscan_limit = total_pages; - sampling = true; - } else { - nscan_limit = desscan; - sampling = false; - } + DTRACE_PROBE5(pageout__start, uint_t, inst, pgcnt_t, desscan, + hrtime_t, pageout_nsec, page_t *, backhand, page_t *, fronthand); + + /* + * Record the initial position of the front hand for this cycle so + * that we can detect when the hand wraps around. + */ + fronthandstart = fronthand; sample_start = gethrtime(); /* * Scan the appropriate number of pages for a single duty cycle. */ - while (nscan < nscan_limit) { + while (nscan_cnt < desscan) { checkpage_result_t rvfront, rvback; - if (!sampling && freemem >= lotsfree + needfree) { + /* + * Only scan while at least one of these is true: + * 1) one or more zones is over its cap + * 2) there is not enough free memory + * 3) during page scan startup when determining sample data + */ + if (!PAGE_SCAN_STARTUP && freemem >= lotsfree + needfree && + !zones_over) { /* * We are not sampling and enough memory has become * available that scanning is no longer required. */ + DTRACE_PROBE1(pageout__memfree, uint_t, inst); break; } + DTRACE_PROBE2(pageout__loop, uint_t, inst, pgcnt_t, pcount); + /* * Periodically check to see if we have exceeded the CPU duty * cycle for a single wakeup. */ if ((pcount & PAGES_POLL_MASK) == PAGES_POLL_MASK) { + hrtime_t pageout_cycle_nsec; + pageout_cycle_nsec = gethrtime() - sample_start; if (pageout_cycle_nsec >= pageout_nsec) { - ++pageout_timeouts; + if (!zones_over) + atomic_inc_64(&pageout_timeouts); + DTRACE_PROBE1(pageout__timeout, uint_t, inst); break; } } @@ -1076,7 +1322,8 @@ loop: ++pcount; /* - * Protected by pageout_mutex instead of cpu_stat_lock: + * This CPU kstat is only incremented here and we're obviously + * on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, scan, 1); @@ -1084,26 +1331,48 @@ loop: * Don't include ineligible pages in the number scanned. 
*/ if (rvfront != CKP_INELIGIBLE || rvback != CKP_INELIGIBLE) { - nscan++; + nscan_cnt++; } - backhand = page_next(backhand); - fronthand = page_next(fronthand); + if (bhwrapping) { + backhand = regionstart; + bhwrapping = B_FALSE; + } else { + backhand = page_nextn(backhand, tick); + if (backhand == regionend) + bhwrapping = B_TRUE; + } + + if (fhwrapping) { + fronthand = regionstart; + fhwrapping = B_FALSE; + } else { + fronthand = page_nextn(fronthand, tick); + if (fronthand == regionend) + fhwrapping = B_TRUE; + } /* - * The front hand has wrapped around to the first page in the - * loop. + * The front hand has wrapped around during this wakeup. */ - if (fronthand == page_first()) { + if (fronthand == fronthandstart) { laps++; - DTRACE_PROBE1(pageout__hand__wrap, uint_t, laps); + DTRACE_PROBE2(pageout__hand__wrap, uint_t, inst, + uint_t, laps); /* - * Protected by pageout_mutex instead of cpu_stat_lock: + * This CPU kstat is only incremented here and we're + * obviously on this CPU, so no lock. */ CPU_STATS_ADDQ(CPU, vm, rev, 1); - if (laps > 1) { + /* + * then when we wraparound memory we want to try to + * reclaim more pages. + * If scanning only because zones are over their cap, + * then wrapping is common and we simply keep going. + */ + if (laps > 1 && freemem < lotsfree + needfree) { /* * Extremely unlikely, but it happens. * We went around the loop at least once @@ -1122,21 +1391,30 @@ loop: } sample_end = gethrtime(); + atomic_add_long(&nscan, nscan_cnt); - DTRACE_PROBE1(pageout__end, uint_t, laps); + DTRACE_PROBE4(pageout__end, uint_t, inst, uint_t, laps, + pgcnt_t, nscan_cnt, pgcnt_t, pcount) + /* + * The global variables used below are only modified by this thread and + * only during initial scanning when there is a single page scanner + * thread running. 
+ */ if (pageout_new_spread == 0) { - if (pageout_sample_cnt < pageout_sample_lim) { + VERIFY3U(inst, ==, 0); + + if (PAGE_SCAN_STARTUP) { /* * Continue accumulating samples until we have enough - * to get a reasonable value for average scan rate: + * to get a reasonable value for average scan rate. */ pageout_sample_pages += pcount; pageout_sample_etime += sample_end - sample_start; ++pageout_sample_cnt; } - if (pageout_sample_cnt >= pageout_sample_lim) { + if (!PAGE_SCAN_STARTUP) { /* * We have enough samples, set the spread. */ @@ -1222,6 +1500,7 @@ checkpage(struct page *pp, pageout_hand_t whichhand) int isfs = 0; int isexec = 0; int pagesync_flag; + zoneid_t zid = ALL_ZONES; /* * Skip pages: @@ -1264,6 +1543,21 @@ checkpage(struct page *pp, pageout_hand_t whichhand) return (CKP_INELIGIBLE); } + if (zones_over) { + ASSERT(pp->p_zoneid == ALL_ZONES || + pp->p_zoneid >= 0 && pp->p_zoneid <= MAX_ZONEID); + if (pp->p_zoneid == ALL_ZONES || + zone_pdata[pp->p_zoneid].zpers_over == 0) { + /* + * Cross-zone shared page, or zone not over it's cap. + * Leave the page alone. 
+ */ + page_unlock(pp); + return (CKP_INELIGIBLE); + } + zid = pp->p_zoneid; + } + /* * Maintain statistics for what we are freeing */ @@ -1371,6 +1665,11 @@ recheck: VN_RELE(vp); return (CKP_NOT_FREED); } + if (isfs) { + zone_pageout_stat(zid, ZPO_DIRTY); + } else { + zone_pageout_stat(zid, ZPO_ANONDIRTY); + } return (CKP_FREED); } @@ -1397,8 +1696,10 @@ recheck: } else { CPU_STATS_ADD_K(vm, fsfree, 1); } + zone_pageout_stat(zid, ZPO_FS); } else { CPU_STATS_ADD_K(vm, anonfree, 1); + zone_pageout_stat(zid, ZPO_ANON); } return (CKP_FREED); diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index 7d2b89408a..933834aee9 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1629,7 +1629,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index a398830833..fa841df9ff 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -106,14 +106,16 @@ * removed from the list of active zones. zone_destroy() returns, and * the zone can be recreated. * - * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor - * callbacks are executed, and all memory associated with the zone is - * freed. + * ZONE_IS_FREE (internal state): All references have been dropped and + * the zone_t is no longer in the zone_active nor zone_deathrow lists. + * The zone_t is in the process of being freed. 
This state exists + * only for publishing a sysevent to indicate that the zone by this + * name can be booted again. * - * Threads can wait for the zone to enter a requested state by using - * zone_status_wait() or zone_status_timedwait() with the desired - * state passed in as an argument. Zone state transitions are - * uni-directional; it is not possible to move back to an earlier state. + * Threads can wait for the zone to enter a requested state (other than + * ZONE_IS_FREE) by using zone_status_wait() or zone_status_timedwait() + * with the desired state passed in as an argument. Zone state transitions + * are uni-directional; it is not possible to move back to an earlier state. * * * Zone-Specific Data: @@ -252,6 +254,8 @@ #include <sys/cpucaps.h> #include <vm/seg.h> #include <sys/mac.h> +#include <sys/rt.h> +#include <sys/fx.h> /* * This constant specifies the number of seconds that threads waiting for @@ -312,6 +316,7 @@ static id_space_t *zoneid_space; * 'global_zone'. */ zone_t zone0; +zone_zfs_io_t zone0_zp_zfs; zone_t *global_zone = NULL; /* Set when the global zone is initialized */ /* @@ -327,8 +332,8 @@ static list_t zone_active; static list_t zone_deathrow; static kmutex_t zone_deathrow_lock; -/* number of zones is limited by virtual interface limit in IP */ -uint_t maxzones = 8192; +/* This can be dynamically reduced if various subsystems hit internal limits. 
*/ +uint_t maxzones = MAX_ZONES; /* Event channel to sent zone state change notifications */ evchan_t *zone_event_chan; @@ -350,6 +355,7 @@ const char *zone_status_table[] = { ZONE_EVENT_SHUTTING_DOWN, /* down */ ZONE_EVENT_SHUTTING_DOWN, /* dying */ ZONE_EVENT_UNINITIALIZED, /* dead */ + ZONE_EVENT_FREE, /* free */ }; /* @@ -372,8 +378,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -389,6 +399,7 @@ static int zone_remove_datalink(zoneid_t, datalink_id_t); static int zone_list_datalink(zoneid_t, int *, datalink_id_t *); static int zone_set_network(zoneid_t, zone_net_data_t *); static int zone_get_network(zoneid_t, zone_net_data_t *); +static void zone_status_set(zone_t *, zone_status_t); typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t); @@ -419,8 +430,72 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; + +/* + * "zone_pdata" is an array indexed by zoneid. It is used to store "persistent" + * data which can be referenced independently of the zone_t structure. 
This + * data falls into two categories; + * 1) pages and RSS data associated with processes inside a zone + * 2) in-flight ZFS I/O data + * + * Each member of zone_persist_t stores the zone's current page usage, its page + * limit, a flag indicating if the zone is over its physical memory cap and + * various page-related statistics. The zpers_over flag is the interface for + * the page scanner to use when reclaiming pages for zones that are over their + * cap. The zone_persist_t structure also includes a mutex and a reference to a + * zone_zfs_io_t structure used for tracking the zone's ZFS I/O data. + * + * All zone physical memory cap data is stored in this array instead of within + * the zone structure itself. This is because zone structures come and go, but + * paging-related work can be asynchronous to any particular zone. In, + * particular: + * 1) Page scanning to reclaim pages occurs from a kernel thread that is not + * associated with any zone. + * 2) Freeing segkp pages can occur long after the zone which first + * instantiated those pages has gone away. + * We want to be able to account for pages/zone without constantly having to + * take extra locks and finding the relevant zone structure, particularly during + * page scanning. + * + * The page scanner can run when "zone_num_over_cap" is non-zero. It can + * do a direct lookup of a zoneid into the "zone_pdata" array to determine + * if that zone is over its cap. + * + * There is no locking for the page scanner to perform these two checks. + * We cannot have the page scanner blocking normal paging activity for + * running processes. Because the physical memory cap is a soft cap, it is + * fine for the scanner to simply read the current state of the counter and + * the zone's zpers_over entry in the array. The scanner should never modify + * either of these items. Internally the entries and the counter are managed + * with the "zone_physcap_lock" mutex as we add/remove mappings to pages. 
We + * take care to ensure that we only take the zone_physcap_lock mutex when a + * zone is transitioning over/under its physical memory cap. + * + * The "zone_incr_capped" and "zone_decr_capped" functions are used to manage + * the "zone_pdata" array and associated counter. + * + * The zone_persist_t structure tracks the zone's physical cap and phyiscal + * usage in terms of pages. These values are currently defined as uint32. Thus, + * the maximum number of pages we can track is a UINT_MAX-1 (4,294,967,295) + * since UINT_MAX means the zone's RSS is unlimited. Assuming a 4k page size, a + * zone's maximum RSS is limited to 17.5 TB and twice that with an 8k page size. + * In the future we may need to expand these counters to 64-bit, but for now + * we're using 32-bit to conserve memory, since this array is statically + * allocated within the kernel based on the maximum number of zones supported. + * + * With respect to the zone_zfs_io_t referenced by the zone_persist_t, under + * a heavy I/O workload, the "zonehash_lock" would become extremely hot if we + * had to continuously find the zone structure associated with an I/O that has + * just completed. To avoid that overhead, we track the I/O data within the + * zone_zfs_io_t instead. We can directly access that data without having to + * lookup the full zone_t structure. + */ +uint_t zone_num_over_cap; +zone_persist_t zone_pdata[MAX_ZONES]; +static kmutex_t zone_physcap_lock; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1379,6 +1454,127 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. 
+ */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + rctl_qty_t r = 0; + + ASSERT(MUTEX_HELD(&p->p_lock)); + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + r = (rctl_qty_t)zp->zpers_zfsp->zpers_zfs_io_pri; + mutex_exit(&zp->zpers_zfs_lock); + + return (r); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + zone_persist_t *zp; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. 
+ */ + zp = &zone_pdata[zone->zone_id]; + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp != NULL) + zp->zpers_zfsp->zpers_zfs_io_pri = (uint16_t)nv; + mutex_exit(&zp->zpers_zfs_lock); + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1705,6 +1901,57 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_persist_t *zp = &zone_pdata[p->p_zone->zone_id]; + + ASSERT(MUTEX_HELD(&p->p_lock)); + q = ptob(zp->zpers_pg_cnt); + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zoneid_t zid; + uint_t pg_val; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + zid = e->rcep_p.zone->zone_id; + if (nv == UINT64_MAX) { + pg_val = UINT32_MAX; + } else { + uint64_t pages = btop(nv); + + /* + * Return from RCTLOP_SET is always ignored so just clamp an + * out-of-range value to our largest "limited" value. 
+ */ + if (pages >= UINT32_MAX) { + pg_val = UINT32_MAX - 1; + } else { + pg_val = (uint_t)pages; + } + } + zone_pdata[zid].zpers_pg_limit = pg_val; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1798,6 +2045,21 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = ptob(zp->zpers_pg_cnt); + zk->zk_value.value.ui64 = ptob(zp->zpers_pg_limit); + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1826,7 +2088,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1851,16 +2113,200 @@ zone_kstat_create_common(zone_t *zone, char *name, return (ksp); } +static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rcnt.value.ui64 = kiop->rcnt; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wcnt.value.ui64 = kiop->wcnt; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rcnt, "rcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", 
KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wcnt, "wcnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + zone_persist_t *zp = &zone_pdata[zone->zone_id]; + + if (rw == KSTAT_WRITE) + return (EACCES); + + mutex_enter(&zp->zpers_zfs_lock); + if (zp->zpers_zfsp == NULL) { + zzp->zz_nread.value.ui64 = 0; + zzp->zz_reads.value.ui64 = 0; + zzp->zz_rtime.value.ui64 = 0; + zzp->zz_rlentime.value.ui64 = 0; + zzp->zz_nwritten.value.ui64 = 0; + zzp->zz_writes.value.ui64 = 0; + zzp->zz_waittime.value.ui64 = 0; + } else { + kstat_io_t *kiop = &zp->zpers_zfsp->zpers_zfs_rwstats; + + /* + * Extract the ZFS statistics from the kstat_io_t structure + * used by kstat_runq_enter() and related functions. Since the + * I/O throttle counters are updated directly by the ZFS layer, + * there's no need to copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + zzp->zz_waittime.value.ui64 = + zp->zpers_zfsp->zpers_zfs_rd_waittime; + } + mutex_exit(&zp->zpers_zfs_lock); + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} static int zone_mcap_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; 
zone_mcap_kstat_t *zmp = ksp->ks_data; + zone_persist_t *zp; if (rw == KSTAT_WRITE) return (EACCES); + zp = &zone_pdata[zone->zone_id]; + + zmp->zm_rss.value.ui64 = ptob(zp->zpers_pg_cnt); + zmp->zm_phys_cap.value.ui64 = ptob(zp->zpers_pg_limit); + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zp->zpers_nover; +#ifndef DEBUG + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_out); +#else + zmp->zm_pagedout.value.ui64 = ptob(zp->zpers_pg_fsdirty + + zp->zpers_pg_fs + zp->zpers_pg_anon + zp->zpers_pg_anondirty); +#endif zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; @@ -1893,6 +2339,12 @@ zone_mcap_kstat_create(zone_t *zone) /* The kstat "name" field is not large enough for a full zonename */ kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); @@ -1942,9 +2394,12 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_mfseglim.value.ui32 = zone->zone_mfseglim; + zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; + zmp->zm_init_restarts.value.ui32 = 
zone->zone_proc_init_restarts; zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time; return (0); @@ -1985,9 +2440,13 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_mfseglim, "mapfail_seglim", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_nested_intp, "nested_interp", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_init_restarts, "init_restarts", + KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64); ksp->ks_update = zone_misc_kstat_update; @@ -2000,13 +2459,25 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { zone->zone_mcap_stats = kmem_zalloc( sizeof (zone_mcap_kstat_t), KM_SLEEP); @@ -2038,8 +2509,15 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); 
zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof (zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); zone_kstat_delete_common(&zone->zone_mcap_ksp, sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, @@ -2101,8 +2579,12 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; + zone_pdata[0].zpers_zfsp = &zone0_zp_zfs; + zone_pdata[0].zpers_zfsp->zpers_zfs_io_pri = 1; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), @@ -2209,6 +2691,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2250,6 +2747,20 @@ zone_init(void) 
rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority'. + */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2260,6 +2771,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2283,6 +2799,7 @@ zone_init(void) zone0.zone_restart_init = B_TRUE; zone0.zone_reboot_on_init_exit = B_FALSE; zone0.zone_restart_init_0 = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2364,6 +2881,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2379,6 +2898,9 @@ zone_free(zone_t *zone) */ cpucaps_zone_remove(zone); + /* Clear physical memory capping data. 
*/ + bzero(&zone_pdata[zone->zone_id], sizeof (zone_persist_t)); + ASSERT(zone->zone_cpucap == NULL); /* remove from deathrow list */ @@ -2392,8 +2914,30 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. + */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); + /* + * This zone_t can no longer inhibit creation of another zone_t + * with the same name or debug ID. Generate a sysevent so that + * userspace tools know it is safe to carry on. + */ + mutex_enter(&zone_status_lock); + zone_status_set(zone, ZONE_IS_FREE); + mutex_exit(&zone_status_lock); + cpu_uarray_free(zone->zone_ustate); if (zone->zone_rootvp != NULL) @@ -2438,11 +2982,17 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); - ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && - status >= zone_status_get(zone)); + ASSERT((status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE || + status == ZONE_IS_FREE) && status >= zone_status_get(zone)); + + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || @@ -2451,12 +3001,14 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || 
nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG (void) printf( "Failed to allocate and send zone state change event.\n"); +#else + /* EMPTY */ #endif } nvlist_free(nvl); @@ -2476,6 +3028,38 @@ zone_status_get(zone_t *zone) return (zone->zone_status); } +/* + * Publish a zones-related sysevent for purposes other than zone state changes. + * While it is unfortunate that zone_event_chan is associated with + * "com.sun:zones:status" (rather than "com.sun:zones") state changes should be + * the only ones with class "status" and subclass "change". + */ +void +zone_sysevent_publish(zone_t *zone, const char *class, const char *subclass, + nvlist_t *ev_nvl) +{ + nvlist_t *nvl = NULL; + timestruc_t now; + uint64_t t; + + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + + if (nvlist_dup(ev_nvl, &nvl, KM_SLEEP) != 0 || + nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_ZONEID, zone->zone_id) != 0 || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) != 0 || + sysevent_evc_publish(zone_event_chan, class, subclass, "sun.com", + "kernel", nvl, EVCH_SLEEP) != 0) { +#ifdef DEBUG + (void) printf("Failed to allocate and send zone misc event.\n"); +#else + /* EMPTY */ +#endif + } + nvlist_free(nvl); +} + static int zone_set_bootargs(zone_t *zone, const char *zone_bootargs) { @@ -2529,9 +3113,14 @@ zone_set_brand(zone_t *zone, const char *brand) return (EINVAL); } - /* set up the brand specific data */ + /* + * Set up the brand specific data. + * Note that it's possible that the hook has to drop the + * zone_status_lock and reacquire it before returning so we can't + * assume the lock has been held the entire time.
+ */ zone->zone_brand = bp; - ZBROP(zone)->b_init_brand_data(zone); + ZBROP(zone)->b_init_brand_data(zone, &zone_status_lock); mutex_exit(&zone_status_lock); return (0); @@ -2604,18 +3193,6 @@ zone_set_initname(zone_t *zone, const char *zone_initname) } static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) -{ - uint64_t mcap; - int err = 0; - - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; - - return (err); -} - -static int zone_set_sched_class(zone_t *zone, const char *new_class) { char sched_class[PC_CLNMSZ]; @@ -3022,6 +3599,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state. @@ -3768,6 +4351,17 @@ zone_start_init(void) */ z->zone_proc_initpid = p->p_pid; + if (z->zone_setup_app_contract == B_TRUE) { + /* + * Normally a process cannot modify its own contract, but we're + * just starting the zone's init process and its contract is + * always initialized from the sys_process_tmpl template, so + * this is the simplest way to setup init's contract to kill + * the process if any other process in the contract exits. + */ + p->p_ct_process->conp_ev_fatal |= CT_PR_EV_EXIT; + } + /* * We maintain zone_boot_err so that we can return the cause of the * failure back to the caller of the zone_boot syscall. @@ -3796,9 +4390,54 @@ zone_start_init(void) lwp_exit(); } } else { + id_t cid = curthread->t_cid; + if (zone_status_get(z) == ZONE_IS_BOOTING) zone_status_set(z, ZONE_IS_RUNNING); mutex_exit(&zone_status_lock); + + mutex_enter(&class_lock); + ASSERT(cid < loaded_classes); + if (strcmp(sclass[cid].cl_name, "FX") == 0 && + z->zone_fixed_hipri) { + /* + * If the zone is using FX then by default all + * processes start at the lowest priority and stay + * there. 
We provide a mechanism for the zone to + * indicate that it should run at "high priority". In + * this case we setup init to run at the highest FX + * priority (which is one level higher than the + * non-fixed scheduling classes can use). + */ + pcparms_t pcparms; + + pcparms.pc_cid = cid; + ((fxkparms_t *)pcparms.pc_clparms)->fx_upri = FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_uprilim = + FXMAXUPRI; + ((fxkparms_t *)pcparms.pc_clparms)->fx_cflags = + FX_DOUPRILIM | FX_DOUPRI; + + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + (void) parmsset(&pcparms, curthread); + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + } else if (strcmp(sclass[cid].cl_name, "RT") == 0) { + /* + * zsched always starts the init lwp at priority + * minclsyspri - 1. This priority gets set in t_pri and + * is invalid for RT, but RT never uses t_pri. However + * t_pri is used by procfs, so we always see processes + * within an RT zone with an invalid priority value. + * We fix that up now. + */ + curthread->t_pri = RTGPPRIO0; + } + mutex_exit(&class_lock); + /* cause the process to return to userland. 
*/ lwp_rtt(); } @@ -3839,7 +4478,11 @@ zsched(void *arg) bcopy("zsched", PTOU(pp)->u_comm, sizeof ("zsched")); PTOU(pp)->u_argc = 0; PTOU(pp)->u_argv = 0; + PTOU(pp)->u_argvstrs = 0; + PTOU(pp)->u_argvstrsize = 0; PTOU(pp)->u_envp = 0; + PTOU(pp)->u_envstrs = 0; + PTOU(pp)->u_envstrsize = 0; PTOU(pp)->u_commpagep = 0; closeall(P_FINFO(pp)); @@ -4284,8 +4927,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4404,7 +5048,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4476,6 +5120,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zone->zone_id = zoneid; + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4485,6 +5130,7 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_restart_init = B_TRUE; zone->zone_reboot_on_init_exit = B_FALSE; zone->zone_restart_init_0 = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4551,8 +5197,13 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_max_swap_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - 
zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + + zone_pdata[zoneid].zpers_zfsp = + kmem_zalloc(sizeof (zone_zfs_io_t), KM_SLEEP); + zone_pdata[zoneid].zpers_zfsp->zpers_zfs_io_pri = 1; zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP); @@ -4561,6 +5212,13 @@ zone_create(const char *zone_name, const char *zone_root, */ zone->zone_rctls = NULL; + /* + * Ensure page count is 0 (in case zoneid has wrapped). + * Initialize physical memory cap as unlimited. + */ + zone_pdata[zoneid].zpers_pg_cnt = 0; + zone_pdata[zoneid].zpers_pg_limit = UINT32_MAX; + if ((error = parse_rctls(rctlbuf, rctlbufsz, &rctls)) != 0) { zone_free(zone); return (zone_create_error(error, 0, extended_error)); @@ -4709,8 +5367,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. 
*/ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4849,6 +5507,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4859,7 +5518,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5188,6 +5856,7 @@ zone_destroy(zoneid_t zoneid) zone_status_t status; clock_t wait_time; boolean_t log_refcounts; + zone_persist_t *zp; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -5221,6 +5890,12 @@ zone_destroy(zoneid_t zoneid) zone_hold(zone); mutex_exit(&zonehash_lock); + zp = &zone_pdata[zoneid]; + mutex_enter(&zp->zpers_zfs_lock); + kmem_free(zp->zpers_zfsp, sizeof (zone_zfs_io_t)); + zp->zpers_zfsp = NULL; + mutex_exit(&zp->zpers_zfs_lock); + /* * wait for zsched to exit */ @@ -5610,14 +6285,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5681,6 +6348,23 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; + case ZONE_ATTR_SCHED_FIXEDHI: + size = sizeof (boolean_t); + if (bufsize > size) + bufsize = size; + 
+ if (buf != NULL && copyout(&zone->zone_fixed_hipri, buf, + bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5712,10 +6396,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * No attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID) { return (set_errno(EINVAL)); } @@ -5728,11 +6411,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) mutex_exit(&zonehash_lock); /* - * At present most attributes can only be set on non-running, + * At present attributes can only be set on non-running, * non-global zones. */ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5765,9 +6448,6 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_SECFLAGS: err = zone_set_secflags(zone, (psecflags_t *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); - break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); break; @@ -5795,6 +6475,22 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_APP_SVC_CT: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_setup_app_contract = (boolean_t)buf; + err = 0; + } + break; + case ZONE_ATTR_SCHED_FIXEDHI: + if (bufsize != sizeof (boolean_t)) { + err = EINVAL; + } else { + zone->zone_fixed_hipri = (boolean_t)buf; + err = 0; + } + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) err = ZBROP(zone)->b_setattr(zone, attr, buf, bufsize); @@ -6493,6 +7189,7 
@@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6503,7 +7200,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6604,6 +7301,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6781,7 +7479,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise @@ -6844,16 +7542,15 @@ zone_shutdown_global(void) } /* - * Returns true if the named dataset is visible in the current zone. + * Returns true if the named dataset is visible in the specified zone. * The 'write' parameter is set to 1 if the dataset is also writable. 
*/ int -zone_dataset_visible(const char *dataset, int *write) +zone_dataset_visible_inzone(zone_t *zone, const char *dataset, int *write) { static int zfstype = -1; zone_dataset_t *zd; size_t len; - zone_t *zone = curproc->p_zone; const char *name = NULL; vfs_t *vfsp = NULL; @@ -6921,7 +7618,8 @@ zone_dataset_visible(const char *dataset, int *write) vfs_list_read_lock(); vfsp = zone->zone_vfslist; do { - ASSERT(vfsp); + if (vfsp == NULL) + break; if (vfsp->vfs_fstype == zfstype) { name = refstr_value(vfsp->vfs_resource); @@ -6958,6 +7656,18 @@ zone_dataset_visible(const char *dataset, int *write) } /* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + zone_t *zone = curproc->p_zone; + + return (zone_dataset_visible_inzone(zone, dataset, write)); +} + +/* * zone_find_by_any_path() - * * kernel-private routine similar to zone_find_by_path(), but which @@ -7059,6 +7769,27 @@ zone_add_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; zone_t *thiszone; + /* + * Only the GZ may add a datalink to a zone's list. + */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may add a + * datalink to a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * When links exist in the GZ, they aren't added to the GZ's + * zone_dl_list. We must enforce this because link_activate() + * depends on zone_check_datalink() returning only NGZs. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((thiszone = zone_find_by_id(zoneid)) == NULL) return (set_errno(ENXIO)); @@ -7091,6 +7822,26 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) zone_t *zone; int err = 0; + /* + * Only the GZ may remove a datalink from a zone's list. 
+ */ + if (getzoneid() != GLOBAL_ZONEID) + return (set_errno(EPERM)); + + /* + * Only a process with the datalink config priv may remove a + * datalink from a zone's list. + */ + if (secpolicy_dl_config(CRED()) != 0) + return (set_errno(EPERM)); + + /* + * If we can't add a datalink to the GZ's zone_dl_list then we + * certainly can't remove them either. + */ + if (zoneid == GLOBAL_ZONEID) + return (set_errno(EINVAL)); + if ((zone = zone_find_by_id(zoneid)) == NULL) return (set_errno(EINVAL)); @@ -7108,25 +7859,63 @@ zone_remove_datalink(zoneid_t zoneid, datalink_id_t linkid) } /* - * Using the zoneidp as ALL_ZONES, we can lookup which zone has been assigned - * the linkid. Otherwise we just check if the specified zoneidp has been - * assigned the supplied linkid. + * + * This function may be used in two ways: + * + * 1. to get the zoneid of the zone this link is under, or + * + * 2. to verify that the link is under a specific zone. + * + * The first use is achieved by passing a zoneid of ALL_ZONES. The + * function then iterates the datalink list of every zone on the + * system until it finds the linkid. If the linkid is found then the + * function returns 0 and zoneidp is updated. Otherwise, ENXIO is + * returned and zoneidp is not modified. The use of ALL_ZONES is + * limited to callers in the GZ to prevent leaking information to + * NGZs. If an NGZ passes ALL_ZONES its query is implicitly changed + * to the second type in the list above. + * + * The second use is achieved by passing a specific zoneid. The GZ can + * use this to verify a link is under a particular zone. An NGZ can + * use this to verify a link is under itself. But an NGZ cannot use + * this to determine if a link is under some other zone as that would + * result in information leakage. If the link exists under the zone + * then 0 is returned. Otherwise, ENXIO is returned.
*/ int zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) { zone_t *zone; + zoneid_t zoneid = *zoneidp; + zoneid_t caller = getzoneid(); int err = ENXIO; - if (*zoneidp != ALL_ZONES) { - if ((zone = zone_find_by_id(*zoneidp)) != NULL) { - if (zone_dl_exists(zone, linkid)) + /* + * Only the GZ may enquire about all zones; an NGZ may only + * enquire about itself. + */ + if (zoneid == ALL_ZONES && caller != GLOBAL_ZONEID) + zoneid = caller; + + if (zoneid != caller && caller != GLOBAL_ZONEID) + return (err); + + if (zoneid != ALL_ZONES) { + if ((zone = zone_find_by_id(zoneid)) != NULL) { + if (zone_dl_exists(zone, linkid)) { + /* + * We need to set this in case an NGZ + * passes ALL_ZONES. + */ + *zoneidp = zoneid; err = 0; + } zone_rele(zone); } return (err); } + ASSERT(caller == GLOBAL_ZONEID); mutex_enter(&zonehash_lock); for (zone = list_head(&zone_active); zone != NULL; zone = list_next(&zone_active, zone)) { @@ -7137,6 +7926,7 @@ zone_check_datalink(zoneid_t *zoneidp, datalink_id_t linkid) } } mutex_exit(&zonehash_lock); + return (err); } @@ -7157,6 +7947,12 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) zone_dl_t *zdl; datalink_id_t *idptr = idarray; + /* + * Only the GZ or the owning zone may look at the datalink list. + */ + if ((getzoneid() != GLOBAL_ZONEID) && (getzoneid() != zoneid)) + return (set_errno(EPERM)); + if (copyin(nump, &dlcount, sizeof (dlcount)) != 0) return (set_errno(EFAULT)); if ((zone = zone_find_by_id(zoneid)) == NULL) @@ -7182,6 +7978,13 @@ zone_list_datalink(zoneid_t zoneid, int *nump, datalink_id_t *idarray) mutex_exit(&zone->zone_lock); zone_rele(zone); + /* + * Prevent returning negative nump values -- we should never + * have this many links anyways. + */ + if (num > INT_MAX) + return (set_errno(EOVERFLOW)); + /* Increased or decreased, caller should be notified.
*/ if (num != dlcount) { if (copyout(&num, nump, sizeof (num)) != 0) @@ -7395,3 +8198,231 @@ done: else return (0); } + +static void +zone_incr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + + /* See if over (unlimited is UINT32_MAX), or already marked that way. */ + if (zp->zpers_pg_cnt <= zp->zpers_pg_limit || zp->zpers_over == 1) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck setting under mutex */ + if (zp->zpers_pg_cnt > zp->zpers_pg_limit && zp->zpers_over == 0) { + zp->zpers_over = 1; + zp->zpers_nover++; + zone_num_over_cap++; + DTRACE_PROBE1(zone__over__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * We want some hysteresis when the zone is going under its cap so that we're + * not continuously toggling page scanning back and forth by a single page + * around the cap. Using ~1% of the zone's page limit seems to be a good + * quantity. This table shows some various zone memory caps and the number of + * pages (assuming a 4k page size). Given this, we choose to shift the page + * limit by 7 places to get a hysteresis that is slightly less than 1%. + * + * cap pages pages 1% shift7 shift7 + * 128M 32768 0x0008000 327 256 0x00100 + * 512M 131072 0x0020000 1310 1024 0x00400 + * 1G 262144 0x0040000 2621 2048 0x00800 + * 4G 1048576 0x0100000 10485 8192 0x02000 + * 8G 2097152 0x0200000 20971 16384 0x04000 + * 16G 4194304 0x0400000 41943 32768 0x08000 + * 32G 8388608 0x0800000 83886 65536 0x10000 + * 64G 16777216 0x1000000 167772 131072 0x20000 + */ +static void +zone_decr_capped(zoneid_t zid) +{ + zone_persist_t *zp = &zone_pdata[zid]; + uint32_t adjusted_limit; + + /* + * See if under, or already marked that way. There is no need to + * check for an unlimited cap (zpers_pg_limit == UINT32_MAX) + * since we'll never set zpers_over in zone_incr_capped(). 
+ */ + if (zp->zpers_over == 0 || zp->zpers_pg_cnt >= zp->zpers_pg_limit) { + return; + } + + adjusted_limit = zp->zpers_pg_limit - (zp->zpers_pg_limit >> 7); + + /* Recheck, accounting for our hysteresis. */ + if (zp->zpers_pg_cnt >= adjusted_limit) { + return; + } + + mutex_enter(&zone_physcap_lock); + /* Recheck under mutex. */ + if (zp->zpers_pg_cnt < adjusted_limit && zp->zpers_over == 1) { + zp->zpers_over = 0; + ASSERT(zone_num_over_cap > 0); + zone_num_over_cap--; + DTRACE_PROBE1(zone__under__pcap, zoneid_t, zid); + } + mutex_exit(&zone_physcap_lock); +} + +/* + * For zone_add_page() and zone_rm_page(), access to the page we're touching is + * controlled by our caller's locking. + * On x86 our callers already did: ASSERT(x86_hm_held(pp)) + * On SPARC our callers already did: ASSERT(sfmmu_mlist_held(pp)) + */ +void +zone_add_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + ASSERT(!PP_ISFREE(pp)); + + zid = curzone->zone_id; + if (pp->p_zoneid == zid) { + /* Another mapping to this page for this zone, do nothing */ + return; + } + + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = page_get_pagecnt(pp->p_szc); + } + + if (pp->p_share == 0) { + /* First mapping to this page. */ + pp->p_zoneid = zid; + zp = &zone_pdata[zid]; + ASSERT(zp->zpers_pg_cnt + pcnt < UINT32_MAX); + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, pcnt); + zone_incr_capped(zid); + return; + } + + if (pp->p_zoneid != ALL_ZONES) { + /* + * The page is now being shared across a different zone. + * Decrement the original zone's usage. 
+ */ + zid = pp->p_zoneid; + pp->p_zoneid = ALL_ZONES; + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + } +} + +void +zone_rm_page(page_t *pp) +{ + uint_t pcnt; + zone_persist_t *zp; + zoneid_t zid; + + /* Skip pages in segkmem, etc. (KV_KVP, ...) */ + if (PP_ISKAS(pp)) + return; + + zid = pp->p_zoneid; + if (zid == ALL_ZONES || pp->p_share != 0) + return; + + /* This is the last mapping to the page for a zone. */ + if (pp->p_szc == 0) { + pcnt = 1; + } else { + /* large page */ + pcnt = (int64_t)page_get_pagecnt(pp->p_szc); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + if (zp->zpers_pg_cnt > 0) { + atomic_add_32((uint32_t *)&zp->zpers_pg_cnt, -pcnt); + } + zone_decr_capped(zid); + pp->p_zoneid = ALL_ZONES; +} + +void +zone_pageout_stat(int zid, zone_pageout_op_t op) +{ + zone_persist_t *zp; + + if (zid == ALL_ZONES) + return; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + +#ifndef DEBUG + atomic_add_64(&zp->zpers_pg_out, 1); +#else + switch (op) { + case ZPO_DIRTY: + atomic_add_64(&zp->zpers_pg_fsdirty, 1); + break; + case ZPO_FS: + atomic_add_64(&zp->zpers_pg_fs, 1); + break; + case ZPO_ANON: + atomic_add_64(&zp->zpers_pg_anon, 1); + break; + case ZPO_ANONDIRTY: + atomic_add_64(&zp->zpers_pg_anondirty, 1); + break; + default: + cmn_err(CE_PANIC, "Invalid pageout operator %d", op); + break; + } +#endif +} + +/* + * Return the zone's physical memory cap and current free memory (in pages). + */ +void +zone_get_physmem_data(int zid, pgcnt_t *memcap, pgcnt_t *free) +{ + zone_persist_t *zp; + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + zp = &zone_pdata[zid]; + + /* + * If memory or swap limits are set on the zone, use those, otherwise + * use the system values. physmem and freemem are also in pages. 
+ */ + if (zp->zpers_pg_limit == UINT32_MAX) { + *memcap = physmem; + *free = freemem; + } else { + int64_t freemem; + + *memcap = (pgcnt_t)zp->zpers_pg_limit; + freemem = zp->zpers_pg_limit - zp->zpers_pg_cnt; + if (freemem > 0) { + *free = (pgcnt_t)freemem; + } else { + *free = (pgcnt_t)0; + } + } +} diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 70c8342377..94922f459b 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -259,6 +259,7 @@ CHKHDRS= \ flock.h \ flock_impl.h \ fork.h \ + frameio.h \ fss.h \ fsspriocntl.h \ fsid.h \ @@ -284,6 +285,7 @@ CHKHDRS= \ idmap.h \ ieeefp.h \ id_space.h \ + inotify.h \ instance.h \ int_const.h \ int_fmtio.h \ @@ -496,6 +498,7 @@ CHKHDRS= \ rctl_impl.h \ rds.h \ reboot.h \ + refhash.h \ refstr.h \ refstr_impl.h \ resource.h \ @@ -661,6 +664,8 @@ CHKHDRS= \ vmem.h \ vmem_impl.h \ vmsystm.h \ + vnd.h \ + vnd_errno.h \ vnic.h \ vnic_impl.h \ vnode.h \ @@ -678,6 +683,7 @@ CHKHDRS= \ watchpoint.h \ winlockio.h \ zcons.h \ + zfd.h \ zone.h \ xti_inet.h \ xti_osi.h \ @@ -839,14 +845,14 @@ FSHDRS= \ autofs.h \ decomp.h \ dv_node.h \ - sdev_impl.h \ - sdev_plugin.h \ fifonode.h \ hsfs_isospec.h \ hsfs_node.h \ hsfs_rrip.h \ hsfs_spec.h \ hsfs_susp.h \ + hyprlofs.h \ + hyprlofs_info.h \ lofs_info.h \ lofs_node.h \ mntdata.h \ @@ -856,6 +862,8 @@ FSHDRS= \ pc_label.h \ pc_node.h \ pxfs_ki.h \ + sdev_impl.h \ + sdev_plugin.h \ snode.h \ swapnode.h \ tmp.h \ diff --git a/usr/src/uts/common/sys/acct.h b/usr/src/uts/common/sys/acct.h index f00884681b..e01ad61025 100644 --- a/usr/src/uts/common/sys/acct.h +++ b/usr/src/uts/common/sys/acct.h @@ -22,6 +22,7 @@ /* * Copyright 2014 Garrett D'Amore <garrett@damore.org> * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -88,7 +89,7 @@ extern int acct(const char *); #if defined(_KERNEL) -void acct(char); +void acct(int); int sysacct(char *); struct vnode; diff --git a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h index e591320025..203b884cd3 100644 --- a/usr/src/uts/common/sys/auxv.h +++ b/usr/src/uts/common/sys/auxv.h @@ -79,6 +79,9 @@ typedef struct { #define AT_FLAGS 8 /* processor flags */ #define AT_ENTRY 9 /* a.out entry point */ +/* First introduced on Linux */ +#define AT_RANDOM 25 /* address of 16 random bytes */ + /* * These relate to the original PPC ABI document; Linux reused * the values for other things (see below), so disambiguation of @@ -91,19 +94,18 @@ typedef struct { * These are the values from LSB 1.3, the first five are also described * in the draft amd64 ABI. * - * At the time of writing, Solaris doesn't place any of these values into - * the aux vector, except AT_CLKTCK which is placed on the aux vector for - * lx branded processes; also, we do similar things via AT_SUN_ values. + * At the time of writing, illumos doesn't place any of these values into the + * aux vector, except where noted. We do similar things via AT_SUN_ values. * * AT_NOTELF 10 program is not ELF? - * AT_UID 11 real user id - * AT_EUID 12 effective user id - * AT_GID 13 real group id - * AT_EGID 14 effective group id + * AT_UID 11 real user id (provided in LX) + * AT_EUID 12 effective user id (provided in LX) + * AT_GID 13 real group id (provided in LX) + * AT_EGID 14 effective group id (provided in LX) * * AT_PLATFORM 15 * AT_HWCAP 16 - * AT_CLKTCK 17 c.f. _SC_CLK_TCK + * AT_CLKTCK 17 c.f. 
_SC_CLK_TCK (provided in LX) * AT_FPUCW 18 * * AT_DCACHEBSIZE 19 (moved from 10) @@ -111,6 +113,16 @@ typedef struct { * AT_UCACHEBSIZE 21 (moved from 12) * * AT_IGNOREPPC 22 + * + * On Linux: + * AT_* values 18 through 22 are reserved + * AT_SECURE 23 secure mode boolean (provided in LX) + * AT_BASE_PLATFORM 24 string identifying real platform, may + * differ from AT_PLATFORM. + * AT_HWCAP2 26 extension of AT_HWCAP + * AT_EXECFN 31 filename of program + * AT_SYSINFO 32 + * AT_SYSINFO_EHDR 33 The vDSO location */ /* @@ -190,6 +202,8 @@ extern uint_t getisax(uint32_t *, uint_t); #define AT_SUN_BRAND_AUX1 2020 #define AT_SUN_BRAND_AUX2 2021 #define AT_SUN_BRAND_AUX3 2022 +#define AT_SUN_BRAND_AUX4 2025 +#define AT_SUN_BRAND_NROOT 2024 /* * Aux vector for comm page diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h index a2feda573d..bace1142f9 100644 --- a/usr/src/uts/common/sys/brand.h +++ b/usr/src/uts/common/sys/brand.h @@ -103,28 +103,106 @@ struct brand_mach_ops; struct intpdata; struct execa; +/* + * Common structure to define hooks for brand operation. 
+ * + * Required Fields: + * b_init_brand_data - Setup zone brand data during zone_setbrand + * b_free_brand_data - Free zone brand data during zone_destroy + * b_brandsys - Syscall handler for brandsys + * b_setbrand - Initialize process brand data + * b_getattr - Get brand-custom zone attribute + * b_setattr - Set brand-custom zone attribute + * b_copy_procdata - Copy process brand data during fork + * b_proc_exit - Perform process brand exit processing + * b_exec - Reset branded process state on exec + * b_lwp_setrval - Set return code for forked child + * b_initlwp - Initialize lwp brand data (cannot drop p->p_lock) + * b_forklwp - Copy lwp brand data during fork + * b_freelwp - Free lwp brand data + * b_lwpexit - Perform lwp-specific brand exit processing + * b_elfexec - Load and execute ELF binary + * b_sigset_native_to_brand - Convert sigset native->brand + * b_sigset_brand_to_native - Convert sigset brand->native + * b_nsig - Maximum signal number + * b_sendsig - Update process state after sendsig + * + * Optional Fields: + * b_lwpdata_alloc - Speculatively allocate data for use in b_initlwp + * b_lwpdata_free - Free data allocated by b_lwpdata_alloc if errors occur + * during lwp creation before b_initlwp could be called.
+ * b_initlwp_post - Complete lwp branding (can temporarily drop p->p_lock) + * b_exit_with_sig - Instead of sending SIGCLD, exit with custom behavior + * b_psig_to_proc - Custom additional behavior during psig + * b_wait_filter - Filter processes from being matched by waitid + * b_native_exec - Provide interpreter path prefix for executables + * b_ptrace_exectrap - Custom behavior for legacy ptrace traps + * b_map32limit - Specify alternate limit for MAP_32BIT mappings + * b_stop_notify - Hook process stop events + * b_waitid_helper - Generate synthetic results for waitid + * b_sigcld_repost - Post synthetic SIGCLD signals + * b_issig_stop - Alter/suppress signal delivery during issig + * b_sig_ignorable - Disallow discarding of signals + * b_savecontext - Alter context during savecontext + * b_restorecontext - Alter context during restorecontext + * b_sendsig_stack - Override stack used for signal delivery + * b_setid_clear - Override setid_clear behavior + * b_pagefault - Trap pagefault events + * b_intp_parse_arg - Controls interpreter argument handling (allow 1 or all) + * b_clearbrand - Perform any actions necessary when clearing the brand. 
+ * b_rpc_statd - Upcall to rpc.statd running within the zone + * b_acct_out - Output properly formatted accounting record + */ struct brand_ops { - void (*b_init_brand_data)(zone_t *); + void (*b_init_brand_data)(zone_t *, kmutex_t *); void (*b_free_brand_data)(zone_t *); int (*b_brandsys)(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t); void (*b_setbrand)(struct proc *); int (*b_getattr)(zone_t *, int, void *, size_t *); int (*b_setattr)(zone_t *, int, void *, size_t); void (*b_copy_procdata)(struct proc *, struct proc *); - void (*b_proc_exit)(struct proc *, klwp_t *); + void (*b_proc_exit)(struct proc *); void (*b_exec)(); void (*b_lwp_setrval)(klwp_t *, int, int); - int (*b_initlwp)(klwp_t *); + void *(*b_lwpdata_alloc)(struct proc *); + void (*b_lwpdata_free)(void *); + void (*b_initlwp)(klwp_t *, void *); + void (*b_initlwp_post)(klwp_t *); void (*b_forklwp)(klwp_t *, klwp_t *); void (*b_freelwp)(klwp_t *); void (*b_lwpexit)(klwp_t *); int (*b_elfexec)(struct vnode *, struct execa *, struct uarg *, - struct intpdata *, int, size_t *, int, caddr_t, struct cred *, - int); + struct intpdata *, int, size_t *, int, caddr_t, struct cred *, + int *); void (*b_sigset_native_to_brand)(sigset_t *); void (*b_sigset_brand_to_native)(sigset_t *); + void (*b_sigfd_translate)(k_siginfo_t *); int b_nsig; + void (*b_exit_with_sig)(proc_t *, sigqueue_t *); + boolean_t (*b_wait_filter)(proc_t *, proc_t *); + boolean_t (*b_native_exec)(uint8_t, const char **); + uint32_t (*b_map32limit)(proc_t *); + void (*b_stop_notify)(proc_t *, klwp_t *, ushort_t, ushort_t); + int (*b_waitid_helper)(idtype_t, id_t, k_siginfo_t *, int, + boolean_t *, int *); + int (*b_sigcld_repost)(proc_t *, sigqueue_t *); + int (*b_issig_stop)(proc_t *, klwp_t *); + boolean_t (*b_sig_ignorable)(proc_t *, klwp_t *, int); + void (*b_savecontext)(ucontext_t *); +#if defined(_SYSCALL32_IMPL) + void (*b_savecontext32)(ucontext32_t *); +#endif + void 
(*b_restorecontext)(ucontext_t *); + caddr_t (*b_sendsig_stack)(int); + void (*b_sendsig)(int); + int (*b_setid_clear)(vattr_t *vap, cred_t *cr); + int (*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type, + enum seg_rw); + boolean_t b_intp_parse_arg; + void (*b_clearbrand)(proc_t *, boolean_t); + void (*b_rpc_statd)(int, void *, void *); + void (*b_acct_out)(struct vnode *, int); }; /* @@ -135,6 +213,7 @@ typedef struct brand { char *b_name; struct brand_ops *b_ops; struct brand_mach_ops *b_machops; + size_t b_data_size; } brand_t; extern brand_t native_brand; @@ -165,7 +244,7 @@ extern brand_t *brand_register_zone(struct brand_attr *); extern brand_t *brand_find_name(char *); extern void brand_unregister_zone(brand_t *); extern int brand_zone_count(brand_t *); -extern void brand_setbrand(proc_t *); +extern int brand_setbrand(proc_t *, boolean_t); extern void brand_clearbrand(proc_t *, boolean_t); /* @@ -178,17 +257,16 @@ extern int brand_solaris_cmd(int, uintptr_t, uintptr_t, uintptr_t, extern void brand_solaris_copy_procdata(proc_t *, proc_t *, struct brand *); extern int brand_solaris_elfexec(vnode_t *, execa_t *, uarg_t *, - intpdata_t *, int, size_t *, int, caddr_t, cred_t *, int, - struct brand *, char *, char *, char *, char *, char *); + intpdata_t *, int, size_t *, int, caddr_t, cred_t *, int *, + struct brand *, char *, char *, char *); extern void brand_solaris_exec(struct brand *); extern int brand_solaris_fini(char **, struct modlinkage *, struct brand *); extern void brand_solaris_forklwp(klwp_t *, klwp_t *, struct brand *); extern void brand_solaris_freelwp(klwp_t *, struct brand *); -extern int brand_solaris_initlwp(klwp_t *, struct brand *); +extern void brand_solaris_initlwp(klwp_t *, struct brand *); extern void brand_solaris_lwpexit(klwp_t *, struct brand *); -extern void brand_solaris_proc_exit(struct proc *, klwp_t *, - struct brand *); +extern void brand_solaris_proc_exit(struct proc *, struct brand *); extern void 
brand_solaris_setbrand(proc_t *, struct brand *); #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h index e20e0e0c35..b6b5c20e44 100644 --- a/usr/src/uts/common/sys/buf.h +++ b/usr/src/uts/common/sys/buf.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. * * Copyright 2017 RackTop Systems. */ @@ -188,6 +189,7 @@ struct biostats { #define B_STARTED 0x2000000 /* io:::start probe called for buf */ #define B_ABRWRITE 0x4000000 /* Application based recovery active */ #define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */ +#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */ /* * There is some confusion over the meaning of B_FREE and B_INVAL and what @@ -200,6 +202,12 @@ struct biostats { * between the sole use of these two flags. In both cases, IO will be done * if the page is not yet committed to storage. * + * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is + * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no + * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then + * the mapping for the page is only invalidated for the current process. + * In this case, the page is not destroyed unless this was the final mapping. + * * In order to discard pages without writing them back, (B_INVAL | B_TRUNC) * should be used. * diff --git a/usr/src/uts/common/sys/contract/process.h b/usr/src/uts/common/sys/contract/process.h index 21cf94dcf9..2c70d7c9f1 100644 --- a/usr/src/uts/common/sys/contract/process.h +++ b/usr/src/uts/common/sys/contract/process.h @@ -21,13 +21,12 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_CONTRACT_PROCESS_H #define _SYS_CONTRACT_PROCESS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/contract.h> #include <sys/time.h> @@ -55,7 +54,8 @@ typedef struct cont_process cont_process_t; #define CT_PR_NOORPHAN 0x2 /* kill when contract is abandoned */ #define CT_PR_PGRPONLY 0x4 /* only kill process group on fatal errors */ #define CT_PR_REGENT 0x8 /* automatically detach inherited contracts */ -#define CT_PR_ALLPARAM 0xf +#define CT_PR_KEEP_EXEC 0x10 /* preserve template accross exec */ +#define CT_PR_ALLPARAM 0x1f /* * ctr_ev_* flags diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h index 6063ff4380..6bc042108c 100644 --- a/usr/src/uts/common/sys/cpucaps.h +++ b/usr/src/uts/common/sys/cpucaps.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_H @@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *); */ extern int cpucaps_project_set(kproject_t *, rctl_qty_t); extern int cpucaps_zone_set(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t); /* * Get current CPU usage for a project/zone. */ extern rctl_qty_t cpucaps_project_get(kproject_t *); extern rctl_qty_t cpucaps_zone_get(zone_t *); +extern rctl_qty_t cpucaps_zone_get_base(zone_t *); +extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *); /* * Scheduling class hooks into CPU caps framework. diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h index 95afd21827..2cd4ed644d 100644 --- a/usr/src/uts/common/sys/cpucaps_impl.h +++ b/usr/src/uts/common/sys/cpucaps_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_CPUCAPS_IMPL_H @@ -66,8 +67,12 @@ typedef struct cpucap { waitq_t cap_waitq; /* waitq for capped threads */ kstat_t *cap_kstat; /* cpucaps specific kstat */ int64_t cap_gen; /* zone cap specific */ + hrtime_t cap_chk_value; /* effective CPU usage cap */ hrtime_t cap_value; /* scaled CPU usage cap */ hrtime_t cap_usage; /* current CPU usage */ + hrtime_t cap_base; /* base CPU for burst */ + u_longlong_t cap_burst_limit; /* max secs (in tics) for a burst */ + u_longlong_t cap_bursting; /* # of ticks currently bursting */ disp_lock_t cap_usagelock; /* protects cap_usage above */ /* * Per cap statistics. @@ -75,6 +80,7 @@ typedef struct cpucap { hrtime_t cap_maxusage; /* maximum cap usage */ u_longlong_t cap_below; /* # of ticks spend below the cap */ u_longlong_t cap_above; /* # of ticks spend above the cap */ + u_longlong_t cap_above_base; /* # of ticks spent above the base */ } cpucap_t; /* diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index 7b153a3e9e..24adbb7418 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -390,6 +390,8 @@ extern cpu_core_t cpu_core[]; #define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */ #define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ +/* Note: inside ifdef: _KERNEL || _KMEMUSER || _BOOT */ + /* * Macros for manipulating sets of CPUs as a bitmap. 
Note that this * bitmap may vary in size depending on the maximum CPU id a specific @@ -512,6 +514,7 @@ extern struct cpu **cpu_seq; /* indexed by sequential CPU id */ extern cpu_t *cpu_list; /* list of CPUs */ extern cpu_t *cpu_active; /* list of active CPUs */ extern cpuset_t cpu_active_set; /* cached set of active CPUs */ +extern cpuset_t cpu_available; /* cached set of available CPUs */ extern int ncpus; /* number of CPUs present */ extern int ncpus_online; /* number of CPUs not quiesced */ extern int ncpus_intr_enabled; /* nr of CPUs taking I/O intrs */ diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h index fb79dfecde..1f938132e0 100644 --- a/usr/src/uts/common/sys/cred.h +++ b/usr/src/uts/common/sys/cred.h @@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *); extern gid_t crgetrgid(const cred_t *); extern gid_t crgetsgid(const cred_t *); extern zoneid_t crgetzoneid(const cred_t *); +extern zoneid_t crgetzonedid(const cred_t *); extern projid_t crgetprojid(const cred_t *); extern cred_t *crgetmapped(const cred_t *); diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h index f5c990e7c0..2178ad1f0d 100644 --- a/usr/src/uts/common/sys/dktp/dadk.h +++ b/usr/src/uts/common/sys/dktp/dadk.h @@ -65,6 +65,8 @@ struct dadk { kstat_t *dad_errstats; /* error stats */ kmutex_t dad_cmd_mutex; int dad_cmd_count; + uint32_t dad_err_cnt; /* number of recent errors */ + hrtime_t dad_last_log; /* time of last error log */ }; #define DAD_SECSIZ dad_phyg.g_secsiz diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index de7ac46db4..b73d22249a 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -192,6 +192,7 @@ typedef struct dld_ioc_rename { datalink_id_t dir_linkid1; datalink_id_t dir_linkid2; char dir_link[MAXLINKNAMELEN]; + boolean_t dir_zoneinit; } dld_ioc_rename_t; /* @@ -204,6 +205,7 @@ typedef struct dld_ioc_rename { typedef struct dld_ioc_zid { zoneid_t diz_zid; 
datalink_id_t diz_linkid; + boolean_t diz_transient; } dld_ioc_zid_t; /* @@ -356,6 +358,7 @@ typedef struct dld_ioc_led { #define DLD_CAPAB_POLL 0x00000002 #define DLD_CAPAB_PERIM 0x00000003 #define DLD_CAPAB_LSO 0x00000004 +#define DLD_CAPAB_IPCHECK 0x00000005 #define DLD_ENABLE 0x00000001 #define DLD_DISABLE 0x00000002 @@ -382,6 +385,7 @@ typedef struct dld_ioc_led { */ typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t); +#define DI_DIRECT_RAW 0x1 /* * Direct Tx/Rx capability. */ @@ -406,8 +410,16 @@ typedef struct dld_capab_direct_s { /* flow control "can I put on a ring" callback */ uintptr_t di_tx_fctl_df; /* canput-like callback */ void *di_tx_fctl_dh; + + /* flags that control our behavior */ + uint_t di_flags; } dld_capab_direct_t; +typedef struct dld_capab_ipcheck_s { + uintptr_t ipc_allowed_df; + void *ipc_allowed_dh; +} dld_capab_ipcheck_t; + /* * Polling/softring capability. */ diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index 035eea893a..336fa9cb67 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -53,7 +53,8 @@ typedef enum { typedef enum { DLD_UNINITIALIZED, DLD_PASSIVE, - DLD_ACTIVE + DLD_ACTIVE, + DLD_EXCLUSIVE } dld_passivestate_t; /* @@ -256,6 +257,8 @@ extern void dld_str_rx_unitdata(void *, mac_resource_handle_t, extern void dld_str_notify_ind(dld_str_t *); extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *, uintptr_t, uint16_t); +extern mac_tx_cookie_t str_mdata_raw_fastpath_put(dld_str_t *, mblk_t *, + uintptr_t, uint16_t); extern int dld_flow_ctl_callb(dld_str_t *, uint64_t, int (*func)(), void *); diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index e9ac27cddd..e71a55ab84 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -109,6 +109,7 @@ typedef struct dl_ipnetinfo { #define DL_PASSIVE_REQ 0x114 /* Allow access to aggregated link */ #define DL_INTR_MODE_REQ 0x115 /* Request 
Rx processing in INTR mode */ #define DL_NOTIFY_CONF 0x116 /* Notification from upstream */ +#define DL_EXCLUSIVE_REQ 0x117 /* Make bind active */ /* * Primitives used for Connectionless Service @@ -391,6 +392,7 @@ typedef struct dl_ipnetinfo { #define DL_PROMISC_SAP 0x02 /* promiscuous mode at sap level */ #define DL_PROMISC_MULTI 0x03 /* promiscuous mode for multicast */ #define DL_PROMISC_RX_ONLY 0x04 /* above only enabled for rx */ +#define DL_PROMISC_FIXUPS 0x05 /* above will be fixed up */ /* * DLPI notification codes for DL_NOTIFY_REQ primitives. @@ -1085,6 +1087,13 @@ typedef struct { } dl_intr_mode_req_t; /* + * DL_EXCLUSIVE_REQ, M_PROTO type + */ +typedef struct { + t_uscalar_t dl_primitive; +} dl_exclusive_req_t; + +/* * CONNECTION-ORIENTED SERVICE PRIMITIVES */ @@ -1506,6 +1515,7 @@ union DL_primitives { dl_control_ack_t control_ack; dl_passive_req_t passive_req; dl_intr_mode_req_t intr_mode_req; + dl_exclusive_req_t exclusive_req; }; #define DL_INFO_REQ_SIZE sizeof (dl_info_req_t) @@ -1574,6 +1584,7 @@ union DL_primitives { #define DL_CONTROL_ACK_SIZE sizeof (dl_control_ack_t) #define DL_PASSIVE_REQ_SIZE sizeof (dl_passive_req_t) #define DL_INTR_MODE_REQ_SIZE sizeof (dl_intr_mode_req_t) +#define DL_EXCLUSIVE_REQ_SIZE sizeof (dl_exclusive_req_t) #ifdef _KERNEL /* diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index cd3749dc21..0c5ffb0dd7 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_DLS_H @@ -86,6 +87,7 @@ typedef struct dls_link_s dls_link_t; #define DLS_PROMISC_MULTI 0x00000002 #define DLS_PROMISC_PHYS 0x00000004 #define DLS_PROMISC_RX_ONLY 0x00000008 +#define DLS_PROMISC_FIXUPS 0x00000010 extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *); extern void dls_close(dld_str_t *); @@ -107,11 +109,13 @@ extern void str_notify(void *, mac_notify_type_t); extern int dls_devnet_open(const char *, dls_dl_handle_t *, dev_t *); +extern int dls_devnet_open_in_zone(const char *, + dls_dl_handle_t *, dev_t *, zoneid_t); extern void dls_devnet_close(dls_dl_handle_t); extern boolean_t dls_devnet_rebuild(); extern int dls_devnet_rename(datalink_id_t, datalink_id_t, - const char *); + const char *, boolean_t); extern int dls_devnet_create(mac_handle_t, datalink_id_t, zoneid_t); extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *, @@ -129,7 +133,7 @@ extern uint16_t dls_devnet_vid(dls_dl_handle_t); extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t); extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *); extern int dls_devnet_phydev(datalink_id_t, dev_t *); -extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t); +extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t); extern zoneid_t dls_devnet_getzid(dls_dl_handle_t); extern zoneid_t dls_devnet_getownerzid(dls_dl_handle_t); extern boolean_t dls_devnet_islinkvisible(datalink_id_t, zoneid_t); @@ -143,6 +147,8 @@ extern int dls_mgmt_update(const char *, uint32_t, boolean_t, extern int dls_mgmt_get_linkinfo(datalink_id_t, char *, datalink_class_t *, uint32_t *, uint32_t *); extern int dls_mgmt_get_linkid(const char *, datalink_id_t *); +extern int dls_mgmt_get_linkid_in_zone(const char *, + datalink_id_t *, zoneid_t); extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t, datalink_media_t, uint32_t); extern int dls_devnet_macname2linkid(const char *, diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 
cd13a41413..329f8dd08e 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_DLS_IMPL_H @@ -62,6 +63,7 @@ struct dls_link_s { /* Protected by */ uint_t dl_zone_ref; link_tagmode_t dl_tagmode; /* atomic */ uint_t dl_nonip_cnt; /* SL */ + uint_t dl_exclusive; /* SL */ }; typedef struct dls_head_s { @@ -97,7 +99,8 @@ extern void dls_create_str_kstats(dld_str_t *); extern int dls_stat_update(kstat_t *, dls_link_t *, int); extern int dls_stat_create(const char *, int, const char *, zoneid_t, int (*)(struct kstat *, int), void *, - kstat_t **); + kstat_t **, zoneid_t); +extern void dls_stat_delete(kstat_t *); extern int dls_devnet_open_by_dev(dev_t, dls_link_t **, dls_dl_handle_t *); @@ -129,6 +132,7 @@ extern void dls_mgmt_init(void); extern void dls_mgmt_fini(void); extern int dls_mgmt_get_phydev(datalink_id_t, dev_t *); +extern int dls_exclusive_set(dld_str_t *, boolean_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h index b60e53b267..6fec277991 100644 --- a/usr/src/uts/common/sys/dls_mgmt.h +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015, Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ #ifndef _DLS_MGMT_H @@ -114,10 +114,14 @@ typedef uint64_t datalink_media_t; #define DLMGMT_CMD_BASE 128 /* - * Indicate the link mapping is active or persistent + * Indicate if the link mapping is active, persistent, or transient. A + * transient link is an active link with a twist -- it is an active + * link which is destroyed along with the zone rather than reassigned + * to the GZ. 
*/ #define DLMGMT_ACTIVE 0x01 #define DLMGMT_PERSIST 0x02 +#define DLMGMT_TRANSIENT 0x04 /* upcall argument */ typedef struct dlmgmt_door_arg { @@ -168,6 +172,7 @@ typedef struct dlmgmt_door_getname { typedef struct dlmgmt_door_getlinkid { int ld_cmd; char ld_link[MAXLINKNAMELEN]; + zoneid_t ld_zoneid; } dlmgmt_door_getlinkid_t; typedef struct dlmgmt_door_getnext_s { diff --git a/usr/src/uts/common/sys/dumpadm.h b/usr/src/uts/common/sys/dumpadm.h index 616828bb2b..8ca10ff3c5 100644 --- a/usr/src/uts/common/sys/dumpadm.h +++ b/usr/src/uts/common/sys/dumpadm.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_DUMPADM_H @@ -44,11 +45,13 @@ extern "C" { #define DIOCSETUUID (DDIOC | 0x17) #define DIOCGETUUID (DDIOC | 0x18) #define DIOCRMDEV (DDIOC | 0x19) +#define DIOCSCRYPTKEY (DDIOC | 0x1a) /* * Kernel-controlled dump state flags for dump_conflags */ #define DUMP_EXCL 0x00000001 /* dedicated dump device (not swap) */ +#define DUMP_ENCRYPT 0x00000002 /* encrypt dump */ #define DUMP_STATE 0x0000ffff /* the set of all kernel flags */ /* diff --git a/usr/src/uts/common/sys/dumphdr.h b/usr/src/uts/common/sys/dumphdr.h index 2019f60a5d..aa2fbde7a5 100644 --- a/usr/src/uts/common/sys/dumphdr.h +++ b/usr/src/uts/common/sys/dumphdr.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _SYS_DUMPHDR_H @@ -60,6 +61,22 @@ extern "C" { sizeof (summary_dump_t) + 1024), \ DUMP_OFFSET)) /* summary save area */ +#define DUMP_CRYPT_KEYLEN 32 /* byte len for crypto key */ +#define DUMP_CRYPT_NONCELEN 8 /* byte len for nonce */ +#define DUMP_CRYPT_HMACLEN 64 /* byte len for HMAC */ +#define DUMP_CRYPT_BLOCKSHIFT 6 /* 64-byte blocks */ + +#define DUMP_CRYPT_ALGO_NONE 0 /* dump not encrypted */ +#define DUMP_CRYPT_ALGO_CHACHA20 1 /* ChaCha20 */ + +#if DUMP_OFFSET & ((1 << DUMP_CRYPT_BLOCKSHIFT) - 1) +#error DUMP_OFFSET not DUMP_CRYPT_BLOCKSHIFT aligned +#endif + +#if DUMP_LOGSIZE & ((1 << DUMP_CRYPT_BLOCKSHIFT) - 1) +#error DUMP_LOGSIZE not DUMP_CRYPT_BLOCKSHIFT aligned +#endif + typedef struct dumphdr { uint32_t dump_magic; /* magic number */ uint32_t dump_version; /* version number */ @@ -86,12 +103,22 @@ typedef struct dumphdr { } dumphdr_t; /* + * If DF_ENCRYPTED is set, this header will be found after the dumphdr. + */ +typedef struct dump_crypt { + uint8_t dump_crypt_algo; /* encryption algorithm */ + uint8_t dump_crypt_hmac[DUMP_CRYPT_HMACLEN]; /* HMAC for crypto key */ + uint8_t dump_crypt_nonce[DUMP_CRYPT_NONCELEN]; /* encryption none */ +} dump_crypt_t; + +/* * Values for dump_flags */ #define DF_VALID 0x00000001 /* Dump is valid (savecore clears) */ #define DF_COMPLETE 0x00000002 /* All pages present as configured */ #define DF_LIVE 0x00000004 /* Dump was taken on a live system */ #define DF_COMPRESSED 0x00000008 /* Dump is compressed */ +#define DF_ENCRYPTED 0x00000010 /* Dump is encrypted */ #define DF_KERNEL 0x00010000 /* Contains kernel pages only */ #define DF_ALL 0x00020000 /* Contains all pages */ #define DF_CURPROC 0x00040000 /* Contains kernel + cur proc pages */ @@ -175,6 +202,8 @@ extern u_offset_t dumpvp_size; extern struct dumphdr *dumphdr; extern int dump_conflags; extern char *dumppath; +extern uint8_t dump_crypt_key[DUMP_CRYPT_KEYLEN]; +extern uint8_t dump_crypt_nonce[DUMP_CRYPT_NONCELEN]; extern int dump_timeout; extern 
int dump_timeleft; diff --git a/usr/src/uts/common/sys/elf.h b/usr/src/uts/common/sys/elf.h index 1f290c282c..556a49c60f 100644 --- a/usr/src/uts/common/sys/elf.h +++ b/usr/src/uts/common/sys/elf.h @@ -501,6 +501,11 @@ typedef struct { #define PT_GNU_STACK 0x6474e551 /* Indicates stack executability */ #define PT_GNU_RELRO 0x6474e552 /* Read-only after relocation */ +/* + * Linux specific program headers not even used by Linux (!!) + */ +#define PT_PAX_FLAGS 0x65041580 /* PaX flags (see below) */ + #define PT_LOSUNW 0x6ffffffa #define PT_SUNWBSS 0x6ffffffa /* Sun Specific segment (unused) */ #define PT_SUNWSTACK 0x6ffffffb /* describes the stack segment */ @@ -516,6 +521,45 @@ typedef struct { #define PF_W 0x2 #define PF_X 0x1 +/* + * PaX is a regrettable series of never-integrated Linux patches for a + * facility to provide additional protections on memory pages for purposes of + * increasing security, and for allowing binaries to demand (or refuse) those + * protections via the PT_PAX_FLAGS program header. (Portents of its + * rudderless existence, "PaX" is a term of indefinite origin written by an + * unknown group of people.) This facility is unfortunate in any number of + * ways, and was largely obviated by the broad adoption of non-executable + * stacks at any rate -- but it lives on in binaries that continue to mark + * themselves to explicitly refuse the (never-integrated, now-obviated) + * facility. One might cringe that PaX overloads the meaning of the p_flags + * to specify protections, but that is the least of its transgressions: + * instead of using one p_type constant to explicitly enable a series of + * protections and another to explicitly disable others, it insists on + * conflating both actions into PT_PAX_FLAGS. 
The resulting doubling of + * constant definitions (two constant definitions for every protection instead + * of merely one) assures that the values can't even fit in the eight + * PF_MASKOS bits putatively defined to provide a modicum of cleanliness for + * such filthy functionality. And were all of this not enough, there is one + * final nomenclature insult to be added to this semantic injury: the + * constants for the p_flags don't even embed "_PAX_" in their name -- despite + * the fact that this is their only purpose! We resist the temptation to + * right this final wrong here; we grit our teeth and provide exactly the + * Linux definitions -- or rather, what would have been the Linux definitions + * had this belching jalopy ever been permitted to crash itself into mainline. + */ +#define PF_PAGEEXEC 0x00000010 /* PaX: enable PAGEEXEC */ +#define PF_NOPAGEEXEC 0x00000020 /* PaX: disable PAGEEXEC */ +#define PF_SEGMEXEC 0x00000040 /* PaX: enable SEGMEXEC */ +#define PF_NOSEGMEXEC 0x00000080 /* PaX: disable SEGMEXEC */ +#define PF_MPROTECT 0x00000100 /* PaX: enable MPROTECT */ +#define PF_NOMPROTECT 0x00000200 /* PaX: disable MPROTECT */ +#define PF_RANDEXEC 0x00000400 /* PaX: enable RANDEXEC */ +#define PF_NORANDEXEC 0x00000800 /* PaX: disable RANDEXEC */ +#define PF_EMUTRAMP 0x00001000 /* PaX: enable EMUTRAMP */ +#define PF_NOEMUTRAMP 0x00002000 /* PaX: disable EMUTRAMP */ +#define PF_RANDMMAP 0x00004000 /* PaX: enable RANDMMAP */ +#define PF_NORANDMMAP 0x00008000 /* PaX: disable RANDMMAP */ + #define PF_MASKOS 0x0ff00000 /* OS specific values */ #define PF_MASKPROC 0xf0000000 /* processor specific values */ diff --git a/usr/src/uts/common/sys/eventfd.h b/usr/src/uts/common/sys/eventfd.h index 1b0d961b0b..b64a101348 100644 --- a/usr/src/uts/common/sys/eventfd.h +++ b/usr/src/uts/common/sys/eventfd.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2015 Joyent, Inc. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. 
*/ /* @@ -47,6 +47,13 @@ typedef uint64_t eventfd_t; #define EVENTFDIOC (('e' << 24) | ('f' << 16) | ('d' << 8)) #define EVENTFDIOC_SEMAPHORE (EVENTFDIOC | 1) /* toggle sem state */ +/* + * Kernel-internal method to write to eventfd while bypassing overflow limits, + * therefore avoiding potential to block as well. This is used to fulfill AIO + * behavior in LX related to eventfd notification. + */ +#define EVENTFDIOC_POST (EVENTFDIOC | 2) + #ifndef _KERNEL extern int eventfd(unsigned int, int); @@ -58,6 +65,7 @@ extern int eventfd_write(int, eventfd_t); #define EVENTFDMNRN_EVENTFD 0 #define EVENTFDMNRN_CLONE 1 #define EVENTFD_VALMAX (ULLONG_MAX - 1ULL) +#define EVENTFD_VALOVERFLOW ULLONG_MAX #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h index 23eb5b6bf7..0d5b4c4611 100644 --- a/usr/src/uts/common/sys/exec.h +++ b/usr/src/uts/common/sys/exec.h @@ -81,7 +81,8 @@ typedef struct uarg { ssize_t na; ssize_t ne; ssize_t nc; - ssize_t arglen; + size_t argstrlen; + size_t envstrlen; char *fname; char *pathname; size_t auxsize; @@ -107,10 +108,13 @@ typedef struct uarg { vnode_t *ex_vp; char *emulator; char *brandname; + const char *brand_nroot; char *auxp_auxflags; /* addr of auxflags auxv on the user stack */ char *auxp_brand; /* address of first brand auxv on user stack */ cred_t *pfcred; boolean_t scrubenv; + uintptr_t maxstack; + boolean_t stk_prot_override; uintptr_t commpage; } uarg_t; @@ -181,7 +185,7 @@ struct execsw { int (*exec_func)(struct vnode *vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, size_t *execsz, int setid, caddr_t exec_file, - struct cred *cred, int brand_action); + struct cred *cred, int *brand_action); int (*exec_core)(struct vnode *vp, struct proc *p, struct cred *cred, rlim64_t rlimit, int sig, core_content_t content); @@ -209,7 +213,7 @@ extern int exec_common(const char *fname, const char **argp, const char **envp, int brand_action); extern int gexec(vnode_t **vp, 
struct execa *uap, struct uarg *args, struct intpdata *idata, int level, size_t *execsz, caddr_t exec_file, - struct cred *cred, int brand_action); + struct cred *cred, int *brand_action); extern struct execsw *allocate_execsw(char *name, char *magic, size_t magic_size); extern struct execsw *findexecsw(char *magic); @@ -234,18 +238,20 @@ extern void exec_set_sp(size_t); * when compiling the 32-bit compatability elf code in the elfexec module. */ extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - size_t *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *, Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); extern int elfreadhdr(vnode_t *, cred_t *, Ehdr *, uint_t *, caddr_t *, size_t *); #endif /* !_ELF32_COMPAT */ #if defined(_LP64) extern int elf32exec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - size_t *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); extern int mapexec32_brand(vnode_t *, uarg_t *, Elf32_Ehdr *, Elf32_Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); extern int elf32readhdr(vnode_t *, cred_t *, Elf32_Ehdr *, uint_t *, caddr_t *, size_t *); #endif /* _LP64 */ diff --git a/usr/src/uts/common/sys/file.h b/usr/src/uts/common/sys/file.h index d300b940e2..66620ab7b9 100644 --- a/usr/src/uts/common/sys/file.h +++ b/usr/src/uts/common/sys/file.h @@ -28,6 +28,7 @@ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ /* Copyright 2020 Joyent, Inc. */ +/* Copyright 2021 OmniOS Community Edition (OmniOSce) Association. 
*/ #ifndef _SYS_FILE_H #define _SYS_FILE_H @@ -120,6 +121,15 @@ typedef struct fpollinfo { #define FCLOEXEC 0x800000 /* O_CLOEXEC = 0x800000 */ #define FDIRECTORY 0x1000000 /* O_DIRECTORY = 0x1000000 */ #define FDIRECT 0x2000000 /* O_DIRECT = 0x2000000 */ +/* + * Private interface for lx O_PATH|O_NOFOLLOW emulation for symlinks. + */ +#define __FLXPATH 0x80000000 +/* + * Private interface for lx fstatat(AT_NO_AUTOMOUNT) emulation. + * Since usage is disjoint, the __FLXPATH bit is re-used. + */ +#define __FLXNOAUTO 0x80000000 #if defined(_KERNEL) || defined(_FAKE_KERNEL) @@ -224,6 +234,7 @@ extern void fcnt_add(struct uf_info *, int); extern void close_exec(struct uf_info *); extern void clear_stale_fd(void); extern void clear_active_fd(int); +extern void set_active_fd(int); extern void free_afd(afd_t *afd); extern int fgetstartvp(int, char *, struct vnode **); extern int fsetattrat(int, char *, int, struct vattr *); diff --git a/usr/src/uts/common/sys/frameio.h b/usr/src/uts/common/sys/frameio.h new file mode 100644 index 0000000000..54e6dbeedf --- /dev/null +++ b/usr/src/uts/common/sys/frameio.h @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FRAMEIO_H +#define _SYS_FRAMEIO_H + +/* + * Frame I/O definitions + */ + +#include <sys/types.h> + +#ifdef _KERNEL +/* Kernel only headers */ +#include <sys/stream.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * An individual frame vector component. Collections of these are used to make + * ioctls. 
+ */ +typedef struct framevec { + void *fv_buf; /* Buffer with data */ + size_t fv_buflen; /* Size of the buffer */ + size_t fv_actlen; /* Amount of buffer consumed, ignore on error */ +} framevec_t; + +/* + * The base unit used with frameio. + */ +typedef struct frameio { + uint_t fio_version; /* Should always be FRAMEIO_CURRENT_VERSION */ + uint_t fio_nvpf; /* How many vectors make up one frame */ + uint_t fio_nvecs; /* The total number of vectors */ + framevec_t fio_vecs[]; /* C99 flexible array member */ +} frameio_t; + + +#define FRAMEIO_VERSION_ONE 1 +#define FRAMEIO_CURRENT_VERSION FRAMEIO_VERSION_ONE + +#define FRAMEIO_NVECS_MAX 32 + +/* + * Definitions for kernel modules to include as helpers. These are consolidation + * private. + */ +#ifdef _KERNEL + +/* + * 32-bit versions for 64-bit kernels + */ +typedef struct framevec32 { + caddr32_t fv_buf; + size32_t fv_buflen; + size32_t fv_actlen; +} framevec32_t; + +typedef struct frameio32 { + uint_t fio_version; + uint_t fio_vecspframe; + uint_t fio_nvecs; + framevec32_t fio_vecs[]; +} frameio32_t; + +/* + * Describe the different ways that vectors should map to frames. 
+ */ +typedef enum frameio_write_mblk_map { + MAP_BLK_FRAME +} frameio_write_mblk_map_t; + +int frameio_init(void); +void frameio_fini(void); +frameio_t *frameio_alloc(int); +void frameio_free(frameio_t *); +int frameio_hdr_copyin(frameio_t *, int, const void *, uint_t); +int frameio_mblk_chain_read(frameio_t *, mblk_t **, int *, int); +int frameio_mblk_chain_write(frameio_t *, frameio_write_mblk_map_t, mblk_t *, + int *, int); +int frameio_hdr_copyout(frameio_t *, int, void *, uint_t); +size_t frameio_frame_length(frameio_t *, framevec_t *); +void frameio_mark_consumed(frameio_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FRAMEIO_H */ diff --git a/usr/src/uts/common/sys/fs/fifonode.h b/usr/src/uts/common/sys/fs/fifonode.h index d8b158ce3c..1ea8563e1c 100644 --- a/usr/src/uts/common/sys/fs/fifonode.h +++ b/usr/src/uts/common/sys/fs/fifonode.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -83,6 +84,7 @@ struct fifonode { struct msgb *fn_tail; /* last message to read */ fifolock_t *fn_lock; /* pointer to per fifo lock */ uint_t fn_count; /* Number of bytes on fn_mp */ + uint_t fn_hiwat; /* pipe (fifofast) high water */ kcondvar_t fn_wait_cv; /* fifo conditional variable */ ushort_t fn_wcnt; /* number of writers */ ushort_t fn_rcnt; /* number of readers */ @@ -135,6 +137,8 @@ typedef struct fifodata { #define FIFOPOLLRBAND 0x20000 #define FIFOSTAYFAST 0x40000 /* don't turn into stream mode */ #define FIFOWAITMODE 0x80000 /* waiting for the possibility to change mode */ +/* Data on loan, block reads. Use in conjunction with FIFOSTAYFAST. 
*/ +#define FIFORDBLOCK 0x100000 #define FIFOHIWAT (16 * 1024) #define FIFOLOWAT (0) @@ -147,16 +151,6 @@ typedef struct fifodata { #if defined(_KERNEL) -/* - * Fifohiwat defined as a variable is to allow tuning of the high - * water mark if needed. It is not meant to be released. - */ -#if FIFODEBUG -extern int Fifohiwat; -#else /* FIFODEBUG */ -#define Fifohiwat FIFOHIWAT -#endif /* FIFODEBUG */ - extern struct vnodeops *fifo_vnodeops; extern const struct fs_operation_def fifo_vnodeops_template[]; extern struct kmem_cache *fnode_cache; @@ -181,6 +175,8 @@ extern void fifo_fastoff(fifonode_t *); extern struct streamtab *fifo_getinfo(); extern void fifo_wakereader(fifonode_t *, fifolock_t *); extern void fifo_wakewriter(fifonode_t *, fifolock_t *); +extern boolean_t fifo_stayfast_enter(fifonode_t *); +extern void fifo_stayfast_exit(fifonode_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h new file mode 100644 index 0000000000..b8c4149df2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_HYPRLOFS_H +#define _SYS_FS_HYPRLOFS_H + +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hyprlofs ioctl numbers. + */ +#define HYPRLOFS_IOC ('H' << 8) + +#define HYPRLOFS_ADD_ENTRIES (HYPRLOFS_IOC | 1) +#define HYPRLOFS_RM_ENTRIES (HYPRLOFS_IOC | 2) +#define HYPRLOFS_RM_ALL (HYPRLOFS_IOC | 3) +#define HYPRLOFS_GET_ENTRIES (HYPRLOFS_IOC | 4) + +typedef struct { + char *hle_path; + uint_t hle_plen; + char *hle_name; + uint_t hle_nlen; +} hyprlofs_entry_t; + +typedef struct { + hyprlofs_entry_t *hle_entries; + uint_t hle_len; +} hyprlofs_entries_t; + +typedef struct { + char hce_path[MAXPATHLEN]; + char hce_name[MAXPATHLEN]; +} hyprlofs_curr_entry_t; + +typedef struct { + hyprlofs_curr_entry_t *hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries_t; + +#ifdef _KERNEL +typedef struct { + caddr32_t hle_path; + uint_t hle_plen; + caddr32_t hle_name; + uint_t hle_nlen; +} hyprlofs_entry32_t; + +typedef struct { + caddr32_t hle_entries; + uint_t hle_len; +} hyprlofs_entries32_t; + +typedef struct { + caddr32_t hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries32_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_H */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h new file mode 100644 index 0000000000..38389f77d9 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_HYPRLOFS_INFO_H +#define _SYS_FS_HYPRLOFS_INFO_H + +#include <sys/t_lock.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <sys/vfs_opreg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hlnode is the file system dependent node for hyprlofs. + * It is modeled on the tmpfs tmpnode. + * + * hln_rwlock protects access of the directory list at hln_dir + * as well as synchronizing read/writes to directory hlnodes. + * hln_tlock protects updates to hln_mode and hln_nlink. + * hln_tlock doesn't require any hlnode locks. + */ +typedef struct hlnode { + struct hlnode *hln_back; /* linked list of hlnodes */ + struct hlnode *hln_forw; /* linked list of hlnodes */ + union { + struct { + struct hldirent *un_dirlist; /* dirent list */ + uint_t un_dirents; /* number of dirents */ + } un_dirstruct; + vnode_t *un_realvp; /* real vnode */ + } un_hlnode; + vnode_t *hln_vnode; /* vnode for this hlnode */ + int hln_gen; /* pseudo gen num for hlfid */ + int hln_looped; /* flag indicating loopback */ + vattr_t hln_attr; /* attributes */ + krwlock_t hln_rwlock; /* rw - serialize mods and */ + /* directory updates */ + kmutex_t hln_tlock; /* time, flag, and nlink lock */ +} hlnode_t; + +/* + * hyprlofs per-mount data structure. + * All fields are protected by hlm_contents. 
+ */ +typedef struct { + vfs_t *hlm_vfsp; /* filesystem's vfs struct */ + hlnode_t *hlm_rootnode; /* root hlnode */ + char *hlm_mntpath; /* name of hyprlofs mount point */ + dev_t hlm_dev; /* unique dev # of mounted `device' */ + uint_t hlm_gen; /* pseudo generation number for files */ + kmutex_t hlm_contents; /* lock for hlfsmount structure */ +} hlfsmount_t; + +/* + * hyprlofs directories are made up of a linked list of hldirent structures + * hanging off directory hlnodes. File names are not fixed length, + * but are null terminated. + */ +typedef struct hldirent { + hlnode_t *hld_hlnode; /* hlnode for this file */ + struct hldirent *hld_next; /* next directory entry */ + struct hldirent *hld_prev; /* prev directory entry */ + uint_t hld_offset; /* "offset" of dir entry */ + uint_t hld_hash; /* a hash of hld_name */ + struct hldirent *hld_link; /* linked via the hash table */ + hlnode_t *hld_parent; /* parent, dir we are in */ + char *hld_name; /* must be null terminated */ + /* max length is MAXNAMELEN */ +} hldirent_t; + +/* + * hlfid overlays the fid structure (for VFS_VGET) + */ +typedef struct { + uint16_t hlfid_len; + ino32_t hlfid_ino; + int32_t hlfid_gen; +} hlfid_t; + +/* + * File system independent to hyprlofs conversion macros + */ +#define VFSTOHLM(vfsp) ((hlfsmount_t *)(vfsp)->vfs_data) +#define VTOHLM(vp) ((hlfsmount_t *)(vp)->v_vfsp->vfs_data) +#define VTOHLN(vp) ((hlnode_t *)(vp)->v_data) +#define HLNTOV(tp) ((tp)->hln_vnode) +#define REALVP(vp) ((vnode_t *)VTOHLN(vp)->hln_realvp) +#define hlnode_hold(tp) VN_HOLD(HLNTOV(tp)) +#define hlnode_rele(tp) VN_RELE(HLNTOV(tp)) + +#define hln_dir un_hlnode.un_dirstruct.un_dirlist +#define hln_dirents un_hlnode.un_dirstruct.un_dirents +#define hln_realvp un_hlnode.un_realvp + +/* + * Attributes + */ +#define hln_mask hln_attr.va_mask +#define hln_type hln_attr.va_type +#define hln_mode hln_attr.va_mode +#define hln_uid hln_attr.va_uid +#define hln_gid hln_attr.va_gid +#define hln_fsid hln_attr.va_fsid 
+#define hln_nodeid hln_attr.va_nodeid +#define hln_nlink hln_attr.va_nlink +#define hln_size hln_attr.va_size +#define hln_atime hln_attr.va_atime +#define hln_mtime hln_attr.va_mtime +#define hln_ctime hln_attr.va_ctime +#define hln_rdev hln_attr.va_rdev +#define hln_blksize hln_attr.va_blksize +#define hln_nblocks hln_attr.va_nblocks +#define hln_seq hln_attr.va_seq + +/* + * enums + */ +enum de_op { DE_CREATE, DE_MKDIR }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR }; /* dirremove ops */ + +/* + * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs + * leaves free for the rest of the system. The default value for + * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a + * different number of pages. Since hyprlofs doesn't actually use much + * memory, it's unlikely this ever needs to be patched. + */ +#define HYPRLOFSMINFREE 8 * 1024 * 1024 /* 8 Megabytes */ + +extern size_t hyprlofs_minfree; /* Anonymous memory in pages */ + +extern void hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *, + cred_t *); +extern int hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *); +extern int hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op, + cred_t *); +extern void hyprlofs_dirinit(hlnode_t *, hlnode_t *); +extern void hyprlofs_dirtrunc(hlnode_t *); +extern int hyprlofs_taccess(void *, int, cred_t *); +extern int hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op, + vnode_t *, vattr_t *, hlnode_t **, cred_t *); + +extern struct vnodeops *hyprlofs_vnodeops; +extern const struct fs_operation_def hyprlofs_vnodeops_template[]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_INFO_H */ diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h index dc6601bb43..676193fcfa 100644 --- a/usr/src/uts/common/sys/fs/sdev_impl.h +++ b/usr/src/uts/common/sys/fs/sdev_impl.h @@ -37,6 +37,7 @@ extern "C" { #include <sys/vfs_opreg.h> #include <sys/list.h> 
#include <sys/nvpair.h> +#include <sys/fs/sdev_plugin.h> #include <sys/sunddi.h> #include <sys/fs/sdev_plugin.h> diff --git a/usr/src/uts/common/sys/fs/tmp.h b/usr/src/uts/common/sys/fs/tmp.h index fb07de6588..f4cee09244 100644 --- a/usr/src/uts/common/sys/fs/tmp.h +++ b/usr/src/uts/common/sys/fs/tmp.h @@ -23,7 +23,7 @@ * All rights reserved. Use is subject to license terms. */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_FS_TMP_H @@ -43,8 +43,10 @@ struct tmount { struct vfs *tm_vfsp; /* filesystem's vfs struct */ struct tmpnode *tm_rootnode; /* root tmpnode */ char *tm_mntpath; /* name of tmpfs mount point */ - ulong_t tm_anonmax; /* file system max anon reservation */ - pgcnt_t tm_anonmem; /* pages of reserved anon memory */ + size_t tm_anonmax; /* file system max anon reservation */ + size_t tm_anonmem; /* bytes of reserved anon memory */ + /* and allocated kmem for the fs */ + size_t tm_allocmem; /* bytes alloced from tmp_kmem_ funcs */ dev_t tm_dev; /* unique dev # of mounted `device' */ uint_t tm_gen; /* pseudo generation number for files */ kmutex_t tm_contents; /* lock for tmount structure */ @@ -58,6 +60,7 @@ struct tmount { #define VTOTM(vp) ((struct tmount *)(vp)->v_vfsp->vfs_data) #define VTOTN(vp) ((struct tmpnode *)(vp)->v_data) #define TNTOV(tp) ((tp)->tn_vnode) +#define TNTOTM(tp) (VTOTM(TNTOV(tp))) #define tmpnode_hold(tp) VN_HOLD(TNTOV(tp)) #define tmpnode_rele(tp) VN_RELE(TNTOV(tp)) @@ -69,41 +72,39 @@ enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ /* * tmpfs_minfree is the amount (in pages) of anonymous memory that tmpfs - * leaves free for the rest of the system. E.g. in a system with 32MB of - * configured swap space, if 16MB were reserved (leaving 16MB free), - * tmpfs could allocate up to 16MB - tmpfs_minfree. The default value - * for tmpfs_minfree is btopr(TMPMINFREE) but it can cautiously patched - * to a different number of pages. 
- * NB: If tmpfs allocates too much swap space, other processes will be - * unable to execute. + * leaves free for the rest of the system. In antiquity, this number could be + * relevant on a system-wide basis, as physical DRAM was routinely exhausted; + * however, in more modern times, the relative growth of DRAM with respect to + * application footprint means that this number is only likely to become + * a factor in a virtualized OS environment (e.g., a zone) -- and even then only + * when DRAM and swap have both been capped low to allow for maximum tenancy. + * TMPMINFREE -- the value from which tmpfs_minfree is derived -- should + * therefore be configured to a value that is roughly the smallest practical + * value for memory + swap minus the largest reasonable size for tmpfs in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow tmpfs to consume + * no more than seven-eighths of this, yielding a TMPMINFREE of 16MB. Care + * should be exercised in changing this: tuning this value too high will + * result in spurious ENOSPC errors in tmpfs in small zones (a problem that + * can induce cascading failure surprisingly often); tuning this value too low + * will result in tmpfs consumption alone inducing application-level + * memory allocation failure. */ -#define TMPMINFREE 2 * 1024 * 1024 /* 2 Megabytes */ +#define TMPMINFREE 16 * 1024 * 1024 /* 16 Megabytes */ extern size_t tmpfs_minfree; /* Anonymous memory in pages */ -/* - * tmpfs can allocate only a certain percentage of kernel memory, - * which is used for tmpnodes, directories, file names, etc. - * This is statically set as TMPMAXFRACKMEM of physical memory. - * The actual number of allocatable bytes can be patched in tmpfs_maxkmem. 
- */ -#define TMPMAXFRACKMEM 25 /* 1/25 of physical memory */ - -extern size_t tmp_kmemspace; -extern size_t tmpfs_maxkmem; /* Allocatable kernel memory in bytes */ - extern void tmpnode_init(struct tmount *, struct tmpnode *, struct vattr *, struct cred *); +extern void tmpnode_cleanup(struct tmpnode *tp); extern int tmpnode_trunc(struct tmount *, struct tmpnode *, ulong_t); extern void tmpnode_growmap(struct tmpnode *, ulong_t); extern int tdirlookup(struct tmpnode *, char *, struct tmpnode **, struct cred *); extern int tdirdelete(struct tmpnode *, struct tmpnode *, char *, enum dr_op, struct cred *); -extern void tdirinit(struct tmpnode *, struct tmpnode *); +extern int tdirinit(struct tmpnode *, struct tmpnode *); extern void tdirtrunc(struct tmpnode *); -extern void *tmp_memalloc(size_t, int); -extern void tmp_memfree(void *, size_t); extern int tmp_resv(struct tmount *, struct tmpnode *, size_t, int); extern int tmp_taccess(void *, int, struct cred *); extern int tmp_sticky_remove_access(struct tmpnode *, struct tmpnode *, @@ -114,6 +115,9 @@ extern int tdirenter(struct tmount *, struct tmpnode *, char *, enum de_op, struct tmpnode *, struct tmpnode *, struct vattr *, struct tmpnode **, struct cred *, caller_context_t *); +extern void *tmp_kmem_zalloc(struct tmount *, size_t, int); +extern void tmp_kmem_free(struct tmount *, void *, size_t); + #define TMP_MUSTHAVE 0x01 #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/fss.h b/usr/src/uts/common/sys/fss.h index 6168e9d9ed..87d798d6c1 100644 --- a/usr/src/uts/common/sys/fss.h +++ b/usr/src/uts/common/sys/fss.h @@ -160,6 +160,7 @@ typedef struct fsszone { /* * fss_flags */ +/* Formerly: FSSKPRI 0x01 - the thread is in kernel mode */ #define FSSBACKQ 0x02 /* thread should be placed at the back of */ /* the dispatch queue if preempted */ #define FSSRESTORE 0x04 /* thread was not preempted, due to schedctl */ diff --git a/usr/src/uts/common/sys/fx.h b/usr/src/uts/common/sys/fx.h index 2d4e1aa7fb..4a48af52a1 
 100644 --- a/usr/src/uts/common/sys/fx.h +++ b/usr/src/uts/common/sys/fx.h @@ -21,13 +21,12 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_FX_H #define _SYS_FX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/thread.h> #include <sys/ddi.h> @@ -145,7 +144,14 @@ typedef struct fxkparms { uint_t fx_cflags; } fxkparms_t; +/* + * control flags (kparms->fx_cflags). + */ +#define FX_DOUPRILIM 0x01 /* change user priority limit */ +#define FX_DOUPRI 0x02 /* change user priority */ +#define FX_DOTQ 0x04 /* change FX time quantum */ +#define FXMAXUPRI 60 /* maximum user priority setting */ /* * Interface for partner private code. This is not a public interface. diff --git a/usr/src/uts/common/sys/gsqueue.h b/usr/src/uts/common/sys/gsqueue.h new file mode 100644 index 0000000000..91ab46fc44 --- /dev/null +++ b/usr/src/uts/common/sys/gsqueue.h @@ -0,0 +1,59 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _SYS_GSQUEUE_H +#define _SYS_GSQUEUE_H + +/* + * Standard interfaces to serialization queues for everyone (except IP). 
+ */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct gsqueue gsqueue_t; +typedef struct gsqueue_set gsqueue_set_t; + +typedef void (*gsqueue_cb_f)(gsqueue_set_t *, gsqueue_t *, void *, boolean_t); +typedef void (*gsqueue_proc_f)(void *, mblk_t *, gsqueue_t *, void *); + +extern gsqueue_set_t *gsqueue_set_create(pri_t); +extern void gsqueue_set_destroy(gsqueue_set_t *); +extern gsqueue_t *gsqueue_set_get(gsqueue_set_t *, uint_t); + +extern uintptr_t gsqueue_set_cb_add(gsqueue_set_t *, gsqueue_cb_f, void *); +extern int gsqueue_set_cb_remove(gsqueue_set_t *, uintptr_t); + +#define GSQUEUE_FILL 0x0001 +#define GSQUEUE_NODRAIN 0x0002 +#define GSQUEUE_PROCESS 0x0004 + +extern void gsqueue_enter_one(gsqueue_t *, mblk_t *, gsqueue_proc_f, void *, + int, uint8_t); + +#define GSQUEUE_DEFAULT_PRIORITY MAXCLSYSPRI + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_GSQUEUE_H */ diff --git a/usr/src/uts/common/sys/ia.h b/usr/src/uts/common/sys/ia.h index 02dc29aaec..567c121bb0 100644 --- a/usr/src/uts/common/sys/ia.h +++ b/usr/src/uts/common/sys/ia.h @@ -86,6 +86,7 @@ typedef struct iaproc { /* flags */ +/* Formerly: IAKPRI 0x01 - thread at kernel model priority */ #define IABACKQ 0x02 /* thread goes to back of disp q when preempted */ #define IASLEPT 0x04 /* thread had long-term suspend - give new slice */ diff --git a/usr/src/uts/common/sys/inotify.h b/usr/src/uts/common/sys/inotify.h new file mode 100644 index 0000000000..8acc1a7280 --- /dev/null +++ b/usr/src/uts/common/sys/inotify.h @@ -0,0 +1,153 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +/* + * Header file to support the inotify facility. Note that this facility + * is designed to be binary compatible with the Linux inotify facility; values + * for constants here should therefore exactly match those found in Linux, and + * this facility shouldn't be extended independently of Linux. + */ + +#ifndef _SYS_INOTIFY_H +#define _SYS_INOTIFY_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Events that can be explicitly requested on any inotify watch. + */ +#define IN_ACCESS 0x00000001 +#define IN_MODIFY 0x00000002 +#define IN_ATTRIB 0x00000004 +#define IN_CLOSE_WRITE 0x00000008 +#define IN_CLOSE_NOWRITE 0x00000010 +#define IN_OPEN 0x00000020 +#define IN_MOVED_FROM 0x00000040 +#define IN_MOVED_TO 0x00000080 +#define IN_CREATE 0x00000100 +#define IN_DELETE 0x00000200 +#define IN_DELETE_SELF 0x00000400 +#define IN_MOVE_SELF 0x00000800 + +/* + * Events that can be sent to an inotify watch -- requested or not. + */ +#define IN_UNMOUNT 0x00002000 +#define IN_Q_OVERFLOW 0x00004000 +#define IN_IGNORED 0x00008000 + +/* + * Flags that can modify an inotify event. + */ +#define IN_ONLYDIR 0x01000000 +#define IN_DONT_FOLLOW 0x02000000 +#define IN_EXCL_UNLINK 0x04000000 +#define IN_MASK_ADD 0x20000000 +#define IN_ISDIR 0x40000000 +#define IN_ONESHOT 0x80000000 + +/* + * Helpful constants. 
+ */ +#define IN_CLOSE (IN_CLOSE_WRITE | IN_CLOSE_NOWRITE) +#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO) +#define IN_ALL_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | IN_MOVED_TO | \ + IN_DELETE | IN_CREATE | IN_DELETE_SELF | IN_MOVE_SELF) + +#define IN_CHILD_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_MODIFY | IN_OPEN) + +/* + * To assure binary compatibility with Linux, these values are fixed at their + * Linux equivalents, not their native ones. + */ +#define IN_CLOEXEC 02000000 /* LX_O_CLOEXEC */ +#define IN_NONBLOCK 04000 /* LX_O_NONBLOCK */ + +struct inotify_event { + int32_t wd; /* watch descriptor */ + uint32_t mask; /* mask of events */ + uint32_t cookie; /* event association cookie, if any */ + uint32_t len; /* size of name field */ + char name[]; /* optional NUL-terminated name */ +}; + +/* + * These ioctl values are specific to the native implementation; applications + * shouldn't be using them directly, and they should therefore be safe to + * change without breaking apps. 
+ */ +#define INOTIFYIOC (('i' << 24) | ('n' << 16) | ('y' << 8)) +#define INOTIFYIOC_ADD_WATCH (INOTIFYIOC | 1) /* add watch */ +#define INOTIFYIOC_RM_WATCH (INOTIFYIOC | 2) /* remove watch */ +#define INOTIFYIOC_ADD_CHILD (INOTIFYIOC | 3) /* add child watch */ +#define INOTIFYIOC_ACTIVATE (INOTIFYIOC | 4) /* activate watch */ + +#ifndef _LP64 +#ifndef _LITTLE_ENDIAN +#define INOTIFY_PTR(type, name) uint32_t name##pad; type *name +#else +#define INOTIFY_PTR(type, name) type *name; uint32_t name##pad +#endif +#else +#define INOTIFY_PTR(type, name) type *name +#endif + +typedef struct inotify_addwatch { + int inaw_fd; /* open fd for object */ + uint32_t inaw_mask; /* desired mask */ +} inotify_addwatch_t; + +typedef struct inotify_addchild { + INOTIFY_PTR(char, inac_name); /* pointer to name */ + int inac_fd; /* open fd for parent */ +} inotify_addchild_t; + +#ifndef _KERNEL + +extern int inotify_init(void); +extern int inotify_init1(int); +extern int inotify_add_watch(int, const char *, uint32_t); +extern int inotify_rm_watch(int, int); + +#else + +#define IN_UNMASKABLE \ + (IN_UNMOUNT | IN_Q_OVERFLOW | IN_IGNORED | IN_ISDIR) + +#define IN_MODIFIERS \ + (IN_EXCL_UNLINK | IN_ONESHOT) + +#define IN_FLAGS \ + (IN_ONLYDIR | IN_DONT_FOLLOW | IN_MASK_ADD) + +#define IN_REMOVAL (1ULL << 32) +#define INOTIFYMNRN_INOTIFY 0 +#define INOTIFYMNRN_CLONE 1 + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_INOTIFY_H */ diff --git a/usr/src/uts/common/sys/ipc_impl.h b/usr/src/uts/common/sys/ipc_impl.h index 0569c3e967..d7dc365c09 100644 --- a/usr/src/uts/common/sys/ipc_impl.h +++ b/usr/src/uts/common/sys/ipc_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ #ifndef _IPC_IMPL_H @@ -226,6 +227,7 @@ int ipc_commit_begin(ipc_service_t *, key_t, int, kipc_perm_t *); kmutex_t *ipc_commit_end(ipc_service_t *, kipc_perm_t *); void ipc_cleanup(ipc_service_t *, kipc_perm_t *); +void ipc_rmsvc(ipc_service_t *, kipc_perm_t *); int ipc_rmid(ipc_service_t *, int, cred_t *); int ipc_ids(ipc_service_t *, int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/ipd.h b/usr/src/uts/common/sys/ipd.h index bad74f8b81..f21c3fb5af 100644 --- a/usr/src/uts/common/sys/ipd.h +++ b/usr/src/uts/common/sys/ipd.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* @@ -35,7 +35,7 @@ extern "C" { #endif #define IPD_DEV_PATH "/dev/ipd" -#define IPD_MAX_DELAY 10000 /* 10 ms in us */ +#define IPD_MAX_DELAY 1000000 /* 1 second in microseconds */ typedef struct ipd_ioc_perturb { zoneid_t ipip_zoneid; diff --git a/usr/src/uts/common/sys/iso/signal_iso.h b/usr/src/uts/common/sys/iso/signal_iso.h index bf89ef0d33..0a76ee19a7 100644 --- a/usr/src/uts/common/sys/iso/signal_iso.h +++ b/usr/src/uts/common/sys/iso/signal_iso.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -95,7 +96,7 @@ extern "C" { /* insert new signals here, and move _SIGRTM* appropriately */ #define _SIGRTMIN 42 /* first (highest-priority) realtime signal */ -#define _SIGRTMAX 73 /* last (lowest-priority) realtime signal */ +#define _SIGRTMAX 74 /* last (lowest-priority) realtime signal */ extern long _sysconf(int); /* System Private interface to sysconf() */ #define SIGRTMIN ((int)_sysconf(_SC_SIGRT_MIN)) /* first realtime signal */ #define SIGRTMAX ((int)_sysconf(_SC_SIGRT_MAX)) /* last realtime signal */ diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h index 41b70f6a6e..0ea1a396b9 100644 --- a/usr/src/uts/common/sys/klwp.h +++ b/usr/src/uts/common/sys/klwp.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_KLWP_H @@ -191,7 +191,14 @@ typedef struct _klwp { struct ct_template *lwp_ct_active[CTT_MAXTYPE]; /* active templates */ struct contract *lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */ - void *lwp_brand; /* per-lwp brand data */ + /* + * Branding: + * lwp_brand - per-lwp brand data + * lwp_brand_syscall - brand syscall interposer + */ + void *lwp_brand; + int (*lwp_brand_syscall)(void); + struct psinfo *lwp_spymaster; /* if an agent LWP, our spymaster */ } klwp_t; diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h index 1d1915a816..8fff314bfe 100644 --- a/usr/src/uts/common/sys/mac_client.h +++ b/usr/src/uts/common/sys/mac_client.h @@ -116,6 +116,7 @@ typedef enum { #define MAC_PROMISC_FLAGS_NO_PHYS 0x0002 #define MAC_PROMISC_FLAGS_VLAN_TAG_STRIP 0x0004 #define MAC_PROMISC_FLAGS_NO_COPY 0x0008 +#define MAC_PROMISC_FLAGS_DO_FIXUPS 0x0010 /* flags passed to mac_tx() */ #define MAC_DROP_ON_NO_DESC 0x01 /* freemsg() if no tx descs */ diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index 
0e3a6306e0..0f8be50fde 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -83,6 +83,7 @@ typedef struct mac_promisc_impl_s { /* Protected by */ boolean_t mpi_no_phys; /* WO */ boolean_t mpi_strip_vlan_tag; /* WO */ boolean_t mpi_no_copy; /* WO */ + boolean_t mpi_do_fixups; /* WO */ } mac_promisc_impl_t; typedef union mac_tx_percpu_s { diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h index 01cb27644c..97b3fd685a 100644 --- a/usr/src/uts/common/sys/mac_client_priv.h +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -58,6 +58,9 @@ extern const mac_info_t *mac_info(mac_handle_t); extern boolean_t mac_info_get(const char *, mac_info_t *); extern boolean_t mac_promisc_get(mac_handle_t); +extern boolean_t mac_protect_check_addr(mac_client_handle_t, boolean_t, + in6_addr_t *); + extern int mac_start(mac_handle_t); extern void mac_stop(mac_handle_t); diff --git a/usr/src/uts/common/sys/mac_flow.h b/usr/src/uts/common/sys/mac_flow.h index 04aa8be3f3..a9a2a5f61e 100644 --- a/usr/src/uts/common/sys/mac_flow.h +++ b/usr/src/uts/common/sys/mac_flow.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. All rights reserved. * Copyright 2020 RackTop Systems, Inc. 
*/ @@ -156,6 +156,14 @@ typedef enum { #define MPT_MAXIPADDR MPT_MAXCNT #define MPT_MAXCID MPT_MAXCNT #define MPT_MAXCIDLEN 256 +#define MPT_FALSE 0x00000000 +#define MPT_TRUE 0x00000001 + +/* Dynamic address detection types */ +#define MPT_DYN_DHCPV4 0x00000001 +#define MPT_DYN_DHCPV6 0x00000002 +#define MPT_DYN_SLAAC 0x00000004 +#define MPT_DYN_ALL 0x00000007 typedef struct mac_ipaddr_s { uint32_t ip_version; @@ -176,11 +184,13 @@ typedef struct mac_dhcpcid_s { } mac_dhcpcid_t; typedef struct mac_protect_s { - uint32_t mp_types; - uint32_t mp_ipaddrcnt; - mac_ipaddr_t mp_ipaddrs[MPT_MAXIPADDR]; - uint32_t mp_cidcnt; - mac_dhcpcid_t mp_cids[MPT_MAXCID]; + uint32_t mp_types; /* Enabled protection types */ + uint32_t mp_ipaddrcnt; /* Count of allowed IPs */ + mac_ipaddr_t mp_ipaddrs[MPT_MAXIPADDR]; /* Allowed IPs */ + uint32_t mp_cidcnt; /* Count of allowed DHCP CIDs */ + mac_dhcpcid_t mp_cids[MPT_MAXCID]; /* Allowed DHCP CIDs */ + uint32_t mp_allcids; /* Whether to allow all CIDs through */ + uint32_t mp_dynamic; /* Enabled dynamic address methods */ } mac_protect_t; /* The default priority for links */ diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h index 11fa46e571..6906cb3dbf 100644 --- a/usr/src/uts/common/sys/mman.h +++ b/usr/src/uts/common/sys/mman.h @@ -337,6 +337,7 @@ struct memcntl_mha32 { #define MS_SYNC 0x4 /* wait for msync */ #define MS_ASYNC 0x1 /* return immediately */ #define MS_INVALIDATE 0x2 /* invalidate caches */ +#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */ #if !defined(_STRICT_POSIX) || (_POSIX_C_SOURCE > 2) || defined(_XPG5) /* flags to mlockall */ diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h index 88c98dc5a4..7196f7b3ac 100644 --- a/usr/src/uts/common/sys/mntent.h +++ b/usr/src/uts/common/sys/mntent.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012, Joyent, Inc. 
All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T @@ -47,6 +48,7 @@ extern "C" { #define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */ #define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */ #define MNTTYPE_LOFS "lofs" /* Loop back file system */ +#define MNTTYPE_HYPRLOFS "hyprlofs" /* Hyperlofs file system */ #define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */ #define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */ #define MNTTYPE_SWAP "swap" /* Swap file system */ diff --git a/usr/src/uts/common/sys/netconfig.h b/usr/src/uts/common/sys/netconfig.h index 6407534a3b..658f9f3f6b 100644 --- a/usr/src/uts/common/sys/netconfig.h +++ b/usr/src/uts/common/sys/netconfig.h @@ -28,6 +28,7 @@ * * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_NETCONFIG_H diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h index e7027f8ece..92bd5b897d 100644 --- a/usr/src/uts/common/sys/neti.h +++ b/usr/src/uts/common/sys/neti.h @@ -48,6 +48,8 @@ struct msgb; /* avoiding sys/stream.h here */ #define NHF_INET "NHF_INET" #define NHF_INET6 "NHF_INET6" #define NHF_ARP "NHF_ARP" +#define NHF_VND_INET "NHF_VND_INET" +#define NHF_VND_INET6 "NHF_VND_INET6" #define NHF_VIONA "NHF_VIONA" /* diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h index 7ee33318cd..b327e69fad 100644 --- a/usr/src/uts/common/sys/netstack.h +++ b/usr/src/uts/common/sys/netstack.h @@ -88,7 +88,8 @@ typedef id_t netstackid_t; #define NS_IPSECESP 16 #define NS_IPNET 17 #define NS_ILB 18 -#define NS_MAX (NS_ILB+1) +#define NS_VND 19 +#define NS_MAX (NS_VND+1) /* * State maintained for each module which tracks the state of diff --git a/usr/src/uts/common/sys/param.h b/usr/src/uts/common/sys/param.h index 282d84b912..66bd91f76f 100644 --- 
a/usr/src/uts/common/sys/param.h +++ b/usr/src/uts/common/sys/param.h @@ -116,7 +116,7 @@ extern "C" { #define DEFAULT_MAXPID 999999 #define DEFAULT_JUMPPID 100000 #else -#define DEFAULT_MAXPID 30000 +#define DEFAULT_MAXPID 99999 #define DEFAULT_JUMPPID 0 #endif diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index d8983a28c4..9f1b80d390 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -108,6 +108,7 @@ int secpolicy_ipc_owner(const cred_t *, const struct kipc_perm *); int secpolicy_kmdb(const cred_t *); int secpolicy_lock_memory(const cred_t *); int secpolicy_meminfo(const cred_t *); +int secpolicy_fs_import(const cred_t *); int secpolicy_modctl(const cred_t *, int); int secpolicy_net(const cred_t *, int, boolean_t); int secpolicy_net_bindmlp(const cred_t *); @@ -176,6 +177,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *, const vattr_t *, cred_t *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); int secpolicy_xvm_control(const cred_t *); +int secpolicy_hyprlofs_control(const cred_t *); int secpolicy_basic_exec(const cred_t *, vnode_t *); int secpolicy_basic_fork(const cred_t *); diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h index ff277f89c8..388849a14f 100644 --- a/usr/src/uts/common/sys/poll_impl.h +++ b/usr/src/uts/common/sys/poll_impl.h @@ -141,6 +141,7 @@ struct pollstate { pollstate_t *ps_contend_nextp; /* next in contender list */ pollstate_t **ps_contend_pnextp; /* pointer-to-previous-next */ int ps_flags; /* state flags */ + short ps_implicit_ev; /* implicit poll event interest */ }; /* pollstate flags */ diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index 06e5a8caf4..d05886d1fc 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -357,6 +357,7 @@ typedef struct proc { struct zone *p_zone; /* zone in which process lives */ struct vnode *p_execdir; /* directory that 
p_exec came from */ struct brand *p_brand; /* process's brand */ + void *p_brand_data; /* per-process brand state */ psecflags_t p_secflags; /* per-process security flags */ @@ -373,7 +374,6 @@ typedef struct proc { */ struct user p_user; /* (see sys/user.h) */ } proc_t; - #define PROC_T /* headers relying on proc_t are OK */ #ifdef _KERNEL @@ -647,6 +647,7 @@ extern int signal_is_blocked(kthread_t *, int); extern int sigcheck(proc_t *, kthread_t *); extern void sigdefault(proc_t *); +extern struct pid *pid_find(pid_t pid); extern void pid_setmin(void); extern pid_t pid_allocate(proc_t *, pid_t, int); extern int pid_rele(struct pid *); @@ -662,6 +663,7 @@ extern int sprtrylock_proc(proc_t *); extern void sprwaitlock_proc(proc_t *); extern void sprlock_proc(proc_t *); extern void sprunlock(proc_t *); +extern void sprunprlock(proc_t *); extern void pid_init(void); extern proc_t *pid_entry(int); extern int pid_slot(proc_t *); @@ -753,6 +755,10 @@ extern kthread_t *thread_unpin(void); extern void thread_init(void); extern void thread_load(kthread_t *, void (*)(), caddr_t, size_t); +extern void thread_splitstack(void (*)(void *), void *, size_t); +extern void thread_splitstack_run(caddr_t, void (*)(void *), void *); +extern void thread_splitstack_cleanup(void); + extern void tsd_create(uint_t *, void (*)(void *)); extern void tsd_destroy(uint_t *); extern void *tsd_getcreate(uint_t *, void (*)(void *), void *(*)(void)); @@ -794,7 +800,7 @@ extern void pokelwps(proc_t *); extern void continuelwps(proc_t *); extern int exitlwps(int); extern void lwp_ctmpl_copy(klwp_t *, klwp_t *); -extern void lwp_ctmpl_clear(klwp_t *); +extern void lwp_ctmpl_clear(klwp_t *, boolean_t); extern klwp_t *forklwp(klwp_t *, proc_t *, id_t); extern void lwp_load(klwp_t *, gregset_t, uintptr_t); extern void lwp_setrval(klwp_t *, int, int); diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h index 00ba23594e..3d6760a7b4 100644 --- a/usr/src/uts/common/sys/procfs.h +++ 
b/usr/src/uts/common/sys/procfs.h @@ -25,7 +25,7 @@ */ /* * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ @@ -237,6 +237,7 @@ typedef struct pstatus { #define PR_FAULTED 6 #define PR_SUSPENDED 7 #define PR_CHECKPOINT 8 +#define PR_BRAND 9 /* * lwp ps(1) information file. /proc/<pid>/lwp/<lwpid>/lwpsinfo @@ -271,10 +272,12 @@ typedef struct lwpsinfo { int pr_filler[4]; /* reserved for future use */ } lwpsinfo_t; +#define PRARGSZ 80 /* number of chars of arguments */ +#define PRMAXARGVLEN 4096 /* max len of /proc/%s/argv */ + /* * process ps(1) information file. /proc/<pid>/psinfo */ -#define PRARGSZ 80 /* number of chars of arguments */ typedef struct psinfo { int pr_flag; /* process flags (DEPRECATED; do not use) */ int pr_nlwp; /* number of active lwps in the process */ diff --git a/usr/src/uts/common/sys/ptms.h b/usr/src/uts/common/sys/ptms.h index 23594fdc13..52d69b3416 100644 --- a/usr/src/uts/common/sys/ptms.h +++ b/usr/src/uts/common/sys/ptms.h @@ -125,6 +125,12 @@ extern void ptms_logp(char *, uintptr_t); #define DDBGP(a, b) #endif +typedef struct __ptmptsopencb_arg *ptmptsopencb_arg_t; +typedef struct ptmptsopencb { + boolean_t (*ppocb_func)(ptmptsopencb_arg_t); + ptmptsopencb_arg_t ppocb_arg; +} ptmptsopencb_t; + #endif /* _KERNEL */ typedef struct pt_own { @@ -160,6 +166,19 @@ typedef struct pt_own { #define ZONEPT (('P'<<8)|4) /* set zone of manager/subsidiary pair */ #define OWNERPT (('P'<<8)|5) /* set owner/group for subsidiary */ +#ifdef _KERNEL +/* + * kernel ioctl commands + * + * PTMPTSOPENCB: Returns a callback function pointer and opaque argument. + * The return value of the callback function when it's invoked + * with the opaque argument passed to it will indicate if the + * pts slave device is currently open. 
+ */ +#define PTMPTSOPENCB (('P'<<8)|6) /* check if the slave is open */ + +#endif /* _KERNEL */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/refhash.h b/usr/src/uts/common/sys/refhash.h index b7427a454d..469cb6d686 100644 --- a/usr/src/uts/common/sys/refhash.h +++ b/usr/src/uts/common/sys/refhash.h @@ -19,6 +19,10 @@ #include <sys/types.h> #include <sys/list.h> +#ifdef __cplusplus +extern "C" { +#endif + #define RHL_F_DEAD 0x01 typedef struct refhash_link { @@ -58,4 +62,8 @@ extern void *refhash_first(refhash_t *); extern void *refhash_next(refhash_t *, void *); extern boolean_t refhash_obj_valid(refhash_t *hp, const void *); +#ifdef __cplusplus +} +#endif + #endif /* _SYS_REFHASH_H */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 13166f378d..d65ca00f69 100644 --- a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -23,6 +23,7 @@ * * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ diff --git a/usr/src/uts/common/sys/rt.h b/usr/src/uts/common/sys/rt.h index d4233aecb5..2ed7320a09 100644 --- a/usr/src/uts/common/sys/rt.h +++ b/usr/src/uts/common/sys/rt.h @@ -22,6 +22,7 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -75,6 +76,16 @@ typedef struct rtkparms { int rt_tqsig; /* real-time time quantum signal */ uint_t rt_cflags; /* real-time control flags */ } rtkparms_t; + +#define RTGPPRIO0 100 /* Global priority for RT priority 0 */ + +/* + * control flags (kparms->rt_cflags). 
+ */ +#define RT_DOPRI 0x01 /* change priority */ +#define RT_DOTQ 0x02 /* change RT time quantum */ +#define RT_DOSIG 0x04 /* change RT time quantum signal */ + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h new file mode 100644 index 0000000000..afb7a94c58 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 2012-2015 LSI Corp. + * Copyright (c) 2013-2016 Avago Technologies + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 2000-2015 LSI Corporation. + * Copyright (c) 2013-2016 Avago Technologies + * All rights reserved. + * + * + * Name: mpi2_pci.h + * Title: MPI PCIe Attached Devices structures and definitions. + * Creation Date: October 9, 2012 + * + * mpi2_pci.h Version: 02.00.02 + * + * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25 + * prefix are for use only on MPI v2.5 products, and must not be used + * with MPI v2.0 products. Unless otherwise noted, names beginning with + * MPI2 or Mpi2 are for use with both MPI v2.0 and MPI v2.5 products. + * + * Version History + * --------------- + * + * Date Version Description + * -------- -------- ------------------------------------------------------ + * 03-16-15 02.00.00 Initial version. + * 02-17-16 02.00.01 Removed AHCI support. + * Removed SOP support. + * 07-01-16 02.00.02 Added MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP to + * NVME Encapsulated Request. + * -------------------------------------------------------------------------- + */ + +#ifndef MPI2_PCI_H +#define MPI2_PCI_H + + +/* + * Values for the PCIe DeviceInfo field used in PCIe Device Status Change Event + * data and PCIe Configuration pages. 
+ */ +#define MPI26_PCIE_DEVINFO_DIRECT_ATTACH (0x00000010) + +#define MPI26_PCIE_DEVINFO_MASK_DEVICE_TYPE (0x0000000F) +#define MPI26_PCIE_DEVINFO_NO_DEVICE (0x00000000) +#define MPI26_PCIE_DEVINFO_PCI_SWITCH (0x00000001) +#define MPI26_PCIE_DEVINFO_NVME (0x00000003) + + +/**************************************************************************** +* NVMe Encapsulated message +****************************************************************************/ + +/* NVME Encapsulated Request Message */ +typedef struct _MPI26_NVME_ENCAPSULATED_REQUEST +{ + U16 DevHandle; /* 0x00 */ + U8 ChainOffset; /* 0x02 */ + U8 Function; /* 0x03 */ + U16 EncapsulatedCommandLength; /* 0x04 */ + U8 Reserved1; /* 0x06 */ + U8 MsgFlags; /* 0x07 */ + U8 VP_ID; /* 0x08 */ + U8 VF_ID; /* 0x09 */ + U16 Reserved2; /* 0x0A */ + U32 Reserved3; /* 0x0C */ + U64 ErrorResponseBaseAddress; /* 0x10 */ + U16 ErrorResponseAllocationLength; /* 0x18 */ + U16 Flags; /* 0x1A */ + U32 DataLength; /* 0x1C */ + U8 NVMe_Command[4]; /* 0x20 */ /* variable length */ + +} MPI26_NVME_ENCAPSULATED_REQUEST, MPI2_POINTER PTR_MPI26_NVME_ENCAPSULATED_REQUEST, + Mpi26NVMeEncapsulatedRequest_t, MPI2_POINTER pMpi26NVMeEncapsulatedRequest_t; + +/* defines for the Flags field */ +#define MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP (0x0020) +/* Submission Queue Type*/ +#define MPI26_NVME_FLAGS_SUBMISSIONQ_MASK (0x0010) +#define MPI26_NVME_FLAGS_SUBMISSIONQ_IO (0x0000) +#define MPI26_NVME_FLAGS_SUBMISSIONQ_ADMIN (0x0010) +/* Error Response Address Space */ +#define MPI26_NVME_FLAGS_MASK_ERROR_RSP_ADDR (0x000C) +#define MPI26_NVME_FLAGS_SYSTEM_RSP_ADDR (0x0000) +#define MPI26_NVME_FLAGS_IOCPLB_RSP_ADDR (0x0008) +#define MPI26_NVME_FLAGS_IOCPLBNTA_RSP_ADDR (0x000C) +/* Data Direction*/ +#define MPI26_NVME_FLAGS_DATADIRECTION_MASK (0x0003) +#define MPI26_NVME_FLAGS_NODATATRANSFER (0x0000) +#define MPI26_NVME_FLAGS_WRITE (0x0001) +#define MPI26_NVME_FLAGS_READ (0x0002) +#define MPI26_NVME_FLAGS_BIDIRECTIONAL (0x0003) + + +/* NVMe 
Encapuslated Reply Message */ +typedef struct _MPI26_NVME_ENCAPSULATED_ERROR_REPLY +{ + U16 DevHandle; /* 0x00 */ + U8 MsgLength; /* 0x02 */ + U8 Function; /* 0x03 */ + U16 EncapsulatedCommandLength; /* 0x04 */ + U8 Reserved1; /* 0x06 */ + U8 MsgFlags; /* 0x07 */ + U8 VP_ID; /* 0x08 */ + U8 VF_ID; /* 0x09 */ + U16 Reserved2; /* 0x0A */ + U16 Reserved3; /* 0x0C */ + U16 IOCStatus; /* 0x0E */ + U32 IOCLogInfo; /* 0x10 */ + U16 ErrorResponseCount; /* 0x14 */ + U16 Reserved4; /* 0x16 */ +} MPI26_NVME_ENCAPSULATED_ERROR_REPLY, + MPI2_POINTER PTR_MPI26_NVME_ENCAPSULATED_ERROR_REPLY, + Mpi26NVMeEncapsulatedErrorReply_t, + MPI2_POINTER pMpi26NVMeEncapsulatedErrorReply_t; + + +#endif + + diff --git a/usr/src/uts/common/sys/scsi/generic/inquiry.h b/usr/src/uts/common/sys/scsi/generic/inquiry.h index ddfd683169..fcbf00d5dc 100644 --- a/usr/src/uts/common/sys/scsi/generic/inquiry.h +++ b/usr/src/uts/common/sys/scsi/generic/inquiry.h @@ -362,7 +362,8 @@ struct scsi_inquiry { #define DTYPE_NOTPRESENT (DPQ_NEVER | DTYPE_UNKNOWN) /* - * Defined Response Data Formats: + * Defined Versions for inquiry data. These represent the base version that a + * device supports. 
*/ #define RDF_LEVEL0 0x00 /* no conformance claim (SCSI-1) */ #define RDF_CCS 0x01 /* Obsolete (pseudo-spec) */ @@ -370,7 +371,8 @@ struct scsi_inquiry { #define RDF_SCSI_SPC 0x03 /* ANSI INCITS 301-1997 (SPC) */ #define RDF_SCSI_SPC2 0x04 /* ANSI INCITS 351-2001 (SPC-2) */ #define RDF_SCSI_SPC3 0x05 /* ANSI INCITS 408-2005 (SPC-3) */ -#define RDF_SCSI_SPC4 0x06 /* t10 (SPC-4) */ +#define RDF_SCSI_SPC4 0x06 /* ANSI INCITS 513-2015 (SPC-4) */ +#define RDF_SCSI_SPC5 0x07 /* t10 (SPC-5) */ /* * Defined Target Port Group Select values: @@ -436,6 +438,7 @@ struct vpd_desc { #define PM_CAPABLE_SPC2 RDF_SCSI_SPC2 #define PM_CAPABLE_SPC3 RDF_SCSI_SPC3 #define PM_CAPABLE_SPC4 RDF_SCSI_SPC4 +#define PM_CAPABLE_SPC5 RDF_SCSI_SPC5 #define PM_CAPABLE_LOG_MASK 0xffff0000 /* use upper 16 bit to */ /* indicate log specifics */ #define PM_CAPABLE_LOG_SUPPORTED 0x10000 /* Log page 0xE might be */ diff --git a/usr/src/uts/common/sys/scsi/targets/sddef.h b/usr/src/uts/common/sys/scsi/targets/sddef.h index d28918d9c5..bb522141af 100644 --- a/usr/src/uts/common/sys/scsi/targets/sddef.h +++ b/usr/src/uts/common/sys/scsi/targets/sddef.h @@ -763,6 +763,12 @@ _NOTE(MUTEX_PROTECTS_DATA(sd_lun::un_fi_mutex, #define SD_FM_LOG(un) (((struct sd_fm_internal *)\ ((un)->un_fm_private))->fm_log_level) +/* + * Version Related Macros + */ +#define SD_SCSI_VERS_IS_GE_SPC_4(un) \ + (SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC4 || \ + SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC5) /* * Values for un_ctype @@ -1821,6 +1827,10 @@ struct sd_fm_internal { #define SD_PM_CAPABLE_IS_SPC_4(pm_cap) \ ((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4) +#define SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap) \ + (((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4) || \ + ((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC5)) + #define SD_PM_CAP_LOG_SUPPORTED(pm_cap) \ ((pm_cap & PM_CAPABLE_LOG_SUPPORTED) ? 
TRUE : FALSE) diff --git a/usr/src/uts/common/sys/shm.h b/usr/src/uts/common/sys/shm.h index 0219fc2cf7..8f530afda2 100644 --- a/usr/src/uts/common/sys/shm.h +++ b/usr/src/uts/common/sys/shm.h @@ -21,6 +21,7 @@ */ /* * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2016 Joyent, Inc. * * Copyright 2003 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -120,6 +121,10 @@ struct shmid_ds { #define SHM_LOCK 3 /* Lock segment in core */ #define SHM_UNLOCK 4 /* Unlock segment */ +#if defined(_KERNEL) +#define SHM_RMID 5 /* Private RMID for lx support */ +#endif + #if !defined(_KERNEL) int shmget(key_t, size_t, int); int shmids(int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/shm_impl.h b/usr/src/uts/common/sys/shm_impl.h index 4d8cdcede5..1eae2ca0a4 100644 --- a/usr/src/uts/common/sys/shm_impl.h +++ b/usr/src/uts/common/sys/shm_impl.h @@ -21,13 +21,12 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_SHM_IMPL_H #define _SYS_SHM_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/ipc_impl.h> #if defined(_KERNEL) || defined(_KMEMUSER) #include <sys/shm.h> @@ -70,7 +69,11 @@ typedef struct kshmid { time_t shm_ctime; /* last change time */ struct sptinfo *shm_sptinfo; /* info about ISM segment */ struct seg *shm_sptseg; /* pointer to ISM segment */ - long shm_sptprot; /* was reserved (still a "long") */ + ulong_t shm_opts; + /* + * Composed of: sptprot (uchar_t) and + * RM_PENDING flag (1 bit). + */ } kshmid_t; /* @@ -78,6 +81,14 @@ typedef struct kshmid { */ #define SHMSA_ISM 1 /* uses shared page table */ +/* + * shm_opts definitions + * Low byte in shm_opts is used for sptprot (see PROT_ALL). The upper bits are + * used for additional options. + */ +#define SHM_PROT_MASK 0xff +#define SHM_RM_PENDING 0x100 + typedef struct sptinfo { struct as *sptas; /* dummy as ptr. 
for spt segment */ } sptinfo_t; diff --git a/usr/src/uts/common/sys/signal.h b/usr/src/uts/common/sys/signal.h index aece147bec..b12dff6034 100644 --- a/usr/src/uts/common/sys/signal.h +++ b/usr/src/uts/common/sys/signal.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -158,8 +159,8 @@ struct sigaction32 { * use of these symbols by applications is injurious * to binary compatibility */ -#define NSIG 74 /* valid signals range from 1 to NSIG-1 */ -#define MAXSIG 73 /* size of u_signal[], NSIG-1 <= MAXSIG */ +#define NSIG 75 /* valid signals range from 1 to NSIG-1 */ +#define MAXSIG 74 /* size of u_signal[], NSIG-1 <= MAXSIG */ #endif /* defined(__EXTENSIONS__) || !defined(_XPG4_2) */ #define MINSIGSTKSZ 2048 diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 9e61bc7bb0..25880522e9 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -40,6 +40,9 @@ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ #ifndef _SYS_SOCKET_H #define _SYS_SOCKET_H @@ -205,6 +208,7 @@ struct so_snd_bufinfo { #define SO_SRCADDR 0x2001 /* Internal: AF_UNIX source address */ #define SO_FILEP 0x2002 /* Internal: AF_UNIX file pointer */ #define SO_UNIX_CLOSE 0x2003 /* Internal: AF_UNIX peer closed */ +#define SO_REUSEPORT 0x2004 /* allow simultaneous port reuse */ #endif /* _KERNEL */ /* @@ -304,8 +308,9 @@ struct linger { #define AF_INET_OFFLOAD 30 /* Sun private; do not use */ #define AF_TRILL 31 /* TRILL interface */ #define AF_PACKET 32 /* PF_PACKET Linux socket interface */ +#define AF_LX_NETLINK 33 /* Linux-compatible netlink */ -#define AF_MAX 32 +#define AF_MAX 33 /* * Protocol families, same as address families for now. 
@@ -345,6 +350,7 @@ struct linger { #define PF_INET_OFFLOAD AF_INET_OFFLOAD /* Sun private; do not use */ #define PF_TRILL AF_TRILL #define PF_PACKET AF_PACKET +#define PF_LX_NETLINK AF_LX_NETLINK #define PF_MAX AF_MAX diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index 479641a11b..1e48b00dd7 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -304,15 +304,16 @@ struct sonode { #define SS_OOBPEND 0x00002000 /* OOB pending or present - poll */ #define SS_HAVEOOBDATA 0x00004000 /* OOB data present */ #define SS_HADOOBDATA 0x00008000 /* OOB data consumed */ -#define SS_CLOSING 0x00010000 /* in process of closing */ +#define SS_CLOSING 0x00010000 /* in process of closing */ #define SS_FIL_DEFER 0x00020000 /* filter deferred notification */ #define SS_FILOP_OK 0x00040000 /* socket can attach filters */ #define SS_FIL_RCV_FLOWCTRL 0x00080000 /* filter asserted rcv flow ctrl */ + #define SS_FIL_SND_FLOWCTRL 0x00100000 /* filter asserted snd flow ctrl */ #define SS_FIL_STOP 0x00200000 /* no more filter actions */ - #define SS_SODIRECT 0x00400000 /* transport supports sodirect */ +#define SS_FILOP_UNSF 0x00800000 /* block attaching unsafe filters */ #define SS_SENTLASTREADSIG 0x01000000 /* last rx signal has been sent */ #define SS_SENTLASTWRITESIG 0x02000000 /* last tx signal has been sent */ @@ -328,7 +329,8 @@ struct sonode { /* * Sockets that can fall back to TPI must ensure that fall back is not - * initiated while a thread is using a socket. + * initiated while a thread is using a socket. Otherwise this disables all + * future filter attachment. */ #define SO_BLOCK_FALLBACK(so, fn) \ ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ @@ -344,6 +346,24 @@ struct sonode { } \ } +/* + * Sockets that can fall back to TPI must ensure that fall back is not + * initiated while a thread is using a socket. Otherwise this disables all + * future unsafe filter attachment. 
Safe filters can still attach after + * we execute the function in which this macro is used. + */ +#define SO_BLOCK_FALLBACK_SAFE(so, fn) \ + ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ + rw_enter(&(so)->so_fallback_rwlock, RW_READER); \ + if ((so)->so_state & SS_FALLBACK_COMP) { \ + rw_exit(&(so)->so_fallback_rwlock); \ + return (fn); \ + } else if (((so)->so_state & SS_FILOP_UNSF) == 0) { \ + mutex_enter(&(so)->so_lock); \ + (so)->so_state |= SS_FILOP_UNSF; \ + mutex_exit(&(so)->so_lock); \ + } + #define SO_UNBLOCK_FALLBACK(so) { \ rw_exit(&(so)->so_fallback_rwlock); \ } @@ -375,6 +395,7 @@ struct sonode { /* The modes below are only for non-streams sockets */ #define SM_ACCEPTSUPP 0x400 /* can handle accept() */ #define SM_SENDFILESUPP 0x800 /* Private: proto supp sendfile */ +#define SM_DEFERERR 0x1000 /* Private: defer so_error delivery */ /* * Socket versions. Used by the socket library when calling _so_socket(). diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h index 9f6d8b499b..c4dd6539de 100644 --- a/usr/src/uts/common/sys/sockfilter.h +++ b/usr/src/uts/common/sys/sockfilter.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_SOCKFILTER_H @@ -129,6 +130,15 @@ typedef struct sof_ops { #define SOF_VERSION 1 +/* + * Flag indicating that the filter module is safe to attach after bind, + * getsockname, getsockopt or setsockopt calls. By default filters are unsafe + * so may not be attached after any socket operation. However, a safe filter + * can still be attached after one of the above calls. This makes attaching + * the filter less dependent on the initial socket setup order. 
+ */ +#define SOF_ATT_SAFE 0x1 + extern int sof_register(int, const char *, const sof_ops_t *, int); extern int sof_unregister(const char *); diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h index 040963eef7..89b355970e 100644 --- a/usr/src/uts/common/sys/squeue.h +++ b/usr/src/uts/common/sys/squeue.h @@ -30,6 +30,17 @@ extern "C" { #endif +/* + * Originally in illumos, we had an IP-centric view of the serialization queue + * abstraction. While that has useful properties, the implementation of squeues + * hardcodes various parts of the implementation of IP into it which makes it + * unsuitable for other consumers. To enable them, we created another interface, + * but opted not to port all of the functionality that IP uses in the form of + * ip_squeue.c As other consumers need the functionality that IP has in squeues, + * then we'll come up with more genericized methods and add that functionality + * to <sys/gsqueue.h>. Please do not continue to use this header. 
+ */ + #include <sys/types.h> #include <sys/processor.h> #include <sys/stream.h> @@ -77,12 +88,13 @@ typedef enum { struct ip_recv_attr_s; extern void squeue_init(void); -extern squeue_t *squeue_create(pri_t); +extern squeue_t *squeue_create(pri_t, boolean_t); extern void squeue_bind(squeue_t *, processorid_t); extern void squeue_unbind(squeue_t *); extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *, uint32_t, struct ip_recv_attr_s *, int, uint8_t); extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t); +extern void squeue_destroy(squeue_t *); struct conn_s; extern int squeue_synch_enter(struct conn_s *, mblk_t *); diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h index 8eb6a30add..2bb717fb52 100644 --- a/usr/src/uts/common/sys/squeue_impl.h +++ b/usr/src/uts/common/sys/squeue_impl.h @@ -114,6 +114,7 @@ struct squeue_s { squeue_set_t *sq_set; /* managed by squeue creator */ pri_t sq_priority; /* squeue thread priority */ + boolean_t sq_isip; /* use IP-centric features */ /* Keep the debug-only fields at the end of the structure */ #ifdef DEBUG @@ -161,6 +162,7 @@ struct squeue_s { #define SQS_POLL_RESTART_DONE 0x01000000 #define SQS_POLL_THR_QUIESCE 0x02000000 #define SQS_PAUSE 0x04000000 /* The squeue has been paused */ +#define SQS_EXIT 0x08000000 /* squeue is being torn down */ #define SQS_WORKER_THR_CONTROL \ (SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP) diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index ea2c3d8e9a..7d118b09e8 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. All rights reserved. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. * Copyright 2015 Joyent, Inc. All rights reserved. 
* Copyright 2022 Garrett D'Amore diff --git a/usr/src/uts/common/sys/sunddi.h b/usr/src/uts/common/sys/sunddi.h index 442595289f..c0dedf555c 100644 --- a/usr/src/uts/common/sys/sunddi.h +++ b/usr/src/uts/common/sys/sunddi.h @@ -1599,8 +1599,14 @@ int ddi_ffs(long mask); int +ddi_ffsll(long long mask); + +int ddi_fls(long mask); +int +ddi_flsll(long long mask); + /* * The ddi_soft_state* routines comprise generic storage management utilities * for driver soft state structures. Two types of soft_state indexes are diff --git a/usr/src/uts/common/sys/systrace.h b/usr/src/uts/common/sys/systrace.h index d43974451e..17e509d4d8 100644 --- a/usr/src/uts/common/sys/systrace.h +++ b/usr/src/uts/common/sys/systrace.h @@ -22,13 +22,12 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ #ifndef _SYS_SYSTRACE_H #define _SYS_SYSTRACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dtrace.h> #ifdef __cplusplus @@ -47,16 +46,18 @@ extern systrace_sysent_t *systrace_sysent; extern systrace_sysent_t *systrace_sysent32; extern void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern void systrace_stub(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #ifdef _SYSCALL32_IMPL extern int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #endif 
#endif diff --git a/usr/src/uts/common/sys/termios.h b/usr/src/uts/common/sys/termios.h index 39106a14fc..4edeb7a41c 100644 --- a/usr/src/uts/common/sys/termios.h +++ b/usr/src/uts/common/sys/termios.h @@ -363,6 +363,24 @@ extern pid_t tcgetsid(int); #define TCSETSF (_TIOC|16) /* + * linux terminal ioctls we need to be aware of + */ +#define TIOCSETLD (_TIOC|123) /* set line discipline parms */ +#define TIOCGETLD (_TIOC|124) /* get line discipline parms */ + +/* + * The VMIN and VTIME and solaris overlap with VEOF and VEOL - This is + * perfectly legal except, linux expects them to be separate. So we keep + * them separately. + */ +struct lx_cc { + unsigned char veof; /* veof value */ + unsigned char veol; /* veol value */ + unsigned char vmin; /* vmin value */ + unsigned char vtime; /* vtime value */ +}; + +/* * NTP PPS ioctls */ #define TIOCGPPS (_TIOC|125) diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index 53a31c848c..76e6835349 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -375,7 +375,7 @@ typedef struct _kthread { #define T_WOULDBLOCK 0x0020 /* for lockfs */ #define T_DONTBLOCK 0x0040 /* for lockfs */ #define T_DONTPEND 0x0080 /* for lockfs */ -#define T_SYS_PROF 0x0100 /* profiling on for duration of system call */ +#define T_SPLITSTK 0x0100 /* kernel stack is currently split */ #define T_WAITCVSEM 0x0200 /* waiting for a lwp_cv or lwp_sema on sleepq */ #define T_WATCHPT 0x0400 /* thread undergoing a watchpoint emulation */ #define T_PANIC 0x0800 /* thread initiated a system panic */ @@ -427,8 +427,9 @@ typedef struct _kthread { #define TS_RESUME 0x1000 /* setrun() by CPR resume process */ #define TS_CREATE 0x2000 /* setrun() by syslwp_create() */ #define TS_RUNQMATCH 0x4000 /* exact run queue balancing by setbackdq() */ +#define TS_BSTART 0x8000 /* setrun() by brand */ #define TS_ALLSTART \ - (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE) + 
(TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE|TS_BSTART) #define TS_ANYWAITQ (TS_PROJWAITQ|TS_ZONEWAITQ) /* @@ -456,6 +457,10 @@ typedef struct _kthread { #define ISTOPPED(t) ((t)->t_state == TS_STOPPED && \ !((t)->t_schedflag & TS_PSTART)) +/* True if thread is stopped for a brand-specific reason */ +#define BSTOPPED(t) ((t)->t_state == TS_STOPPED && \ + !((t)->t_schedflag & TS_BSTART)) + /* True if thread is asleep and wakeable */ #define ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \ ((t)->t_flag & T_WAKEABLE))) diff --git a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h index 634d5fb3a6..d82508e6b3 100644 --- a/usr/src/uts/common/sys/time.h +++ b/usr/src/uts/common/sys/time.h @@ -15,6 +15,7 @@ * Use is subject to license terms. * * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ @@ -266,6 +267,14 @@ typedef longlong_t hrtime_t; #if defined(_KERNEL) || defined(_FAKE_KERNEL) +/* + * Unsigned counterpart to hrtime_t + */ +typedef u_longlong_t uhrtime_t; + +#define HRTIME_MAX LLONG_MAX +#define UHRTIME_MAX ULLONG_MAX + #include <sys/time_impl.h> #include <sys/mutex.h> diff --git a/usr/src/uts/common/sys/timer.h b/usr/src/uts/common/sys/timer.h index 4bbc5b4fb8..db27960413 100644 --- a/usr/src/uts/common/sys/timer.h +++ b/usr/src/uts/common/sys/timer.h @@ -35,6 +35,8 @@ #include <sys/proc.h> #include <sys/thread.h> #include <sys/param.h> +#include <sys/siginfo.h> +#include <sys/port.h> #ifdef __cplusplus extern "C" { @@ -65,6 +67,7 @@ extern int timer_max; */ #define IT_SIGNAL 0x01 #define IT_PORT 0x02 /* use event port notification */ +#define IT_CALLBACK 0x04 /* custom callback function */ struct clock_backend; @@ -92,14 +95,27 @@ struct itimer { struct clock_backend *it_backend; void (*it_fire)(itimer_t *); kmutex_t it_mutex; - void *it_portev; /* port_kevent_t pointer */ - void *it_portsrc; /* port_source_t pointer */ - 
int it_portfd; /* port file descriptor */ + union { + struct { + void *_it_portev; /* port_kevent_t pointer */ + void *_it_portsrc; /* port_source_t pointer */ + int _it_portfd; /* port file descriptor */ + } _it_ev_port; + struct { + void (*_it_cb_func)(itimer_t *); + uintptr_t _it_cb_data[2]; + } _it_ev_cb; + } _it_ev_data; }; #define it_sigq __data.__proc.__it_sigq #define it_lwp __data.__proc.__it_lwp #define it_frontend __data.__it_frontend +#define it_portev _it_ev_data._it_ev_port._it_portev +#define it_portsrc _it_ev_data._it_ev_port._it_portsrc +#define it_portfd _it_ev_data._it_ev_port._it_portfd +#define it_cb_func _it_ev_data._it_ev_cb._it_cb_func +#define it_cb_data _it_ev_data._it_ev_cb._it_cb_data typedef struct clock_backend { struct sigevent clk_default; @@ -116,7 +132,11 @@ typedef struct clock_backend { extern void clock_add_backend(clockid_t clock, clock_backend_t *backend); extern clock_backend_t *clock_get_backend(clockid_t clock); +extern void timer_release(struct proc *, itimer_t *); +extern void timer_delete_grabbed(struct proc *, timer_t tid, itimer_t *it); extern void timer_lwpbind(); +extern int timer_setup(clock_backend_t *, struct sigevent *, port_notify_t *, + itimer_t **, timer_t *); extern void timer_func(sigqueue_t *); extern void timer_exit(void); diff --git a/usr/src/uts/common/sys/ts.h b/usr/src/uts/common/sys/ts.h index 7949058565..2cf5dcade3 100644 --- a/usr/src/uts/common/sys/ts.h +++ b/usr/src/uts/common/sys/ts.h @@ -79,6 +79,8 @@ typedef struct tsproc { } tsproc_t; /* flags */ + +/* Formerly: TSKPRI 0x01 - thread at kernel mode priority */ #define TSBACKQ 0x02 /* thread goes to back of dispq if preempted */ #define TSIA 0x04 /* thread is interactive */ #define TSIASET 0x08 /* interactive thread is "on" */ diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h index 904b52cac4..75d000b831 100644 --- a/usr/src/uts/common/sys/uadmin.h +++ b/usr/src/uts/common/sys/uadmin.h @@ -23,6 +23,7 @@ * * Copyright 
2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -159,7 +160,7 @@ extern kmutex_t ualock; extern void mdboot(int, int, char *, boolean_t); extern void mdpreboot(int, int, char *); extern int kadmin(int, int, void *, cred_t *); -extern void killall(zoneid_t); +extern void killall(zoneid_t, boolean_t); #endif extern int uadmin(int, int, uintptr_t); diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h index bca1ed1fa3..9584be559f 100644 --- a/usr/src/uts/common/sys/uio.h +++ b/usr/src/uts/common/sys/uio.h @@ -145,7 +145,8 @@ typedef struct uioa_s { */ typedef enum xuio_type { UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY + UIOTYPE_ZEROCOPY, + UIOTYPE_PEEKSIZE } xuio_type_t; typedef struct xuio { @@ -175,6 +176,15 @@ typedef struct xuio { int xu_zc_rw; /* read or write buffer */ void *xu_zc_priv; /* fs specific */ } xu_zc; + + /* + * Peek Size Support -- facilitate peeking at the size of a + * waiting message on a socket. + */ + struct { + ssize_t xu_ps_size; /* size of waiting msg */ + boolean_t xu_ps_set; /* was size calculated? */ + } xu_ps; } xu_ext; } xuio_t; diff --git a/usr/src/uts/common/sys/user.h b/usr/src/uts/common/sys/user.h index 7f54dcf3ab..90fde4ef98 100644 --- a/usr/src/uts/common/sys/user.h +++ b/usr/src/uts/common/sys/user.h @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2022 Oxide Computer Company */ @@ -203,9 +203,9 @@ typedef struct { /* kernel syscall set type */ * This value should not be changed in a patch. 
*/ #if defined(__sparc) -#define __KERN_NAUXV_IMPL 20 +#define __KERN_NAUXV_IMPL 24 #elif defined(__i386) || defined(__amd64) -#define __KERN_NAUXV_IMPL 26 +#define __KERN_NAUXV_IMPL 29 #endif struct execsw; @@ -228,7 +228,11 @@ typedef struct user { char u_psargs[PSARGSZ]; /* arguments from exec */ int u_argc; /* value of argc passed to main() */ uintptr_t u_argv; /* value of argv passed to main() */ + uintptr_t u_argvstrs; /* argv string space pointer */ + size_t u_argvstrsize; /* size of argv string space */ uintptr_t u_envp; /* value of envp passed to main() */ + uintptr_t u_envstrs; /* env string space pointer */ + size_t u_envstrsize; /* size of env string space */ uintptr_t u_commpagep; /* address of mapped comm page */ /* diff --git a/usr/src/uts/common/sys/vm.h b/usr/src/uts/common/sys/vm.h index 14b5754b28..b32a789d36 100644 --- a/usr/src/uts/common/sys/vm.h +++ b/usr/src/uts/common/sys/vm.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -57,6 +58,8 @@ int queue_io_request(struct vnode *, u_offset_t); extern kmutex_t memavail_lock; extern kcondvar_t memavail_cv; +#define WAKE_PAGEOUT_SCANNER() cv_broadcast(&proc_pageout->p_cv) + #endif /* defined(_KERNEL) */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h index 1aa4a8ee6d..afbf438eff 100644 --- a/usr/src/uts/common/sys/vm_usage.h +++ b/usr/src/uts/common/sys/vm_usage.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_VM_USAGE_H @@ -79,8 +80,12 @@ extern "C" { /* zoneid */ #define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ /* euser */ +#define VMUSAGE_A_ZONE 0x4000 /* rss/swap for a specified zone */ -#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ +#define VMUSAGE_MASK 0x7fff /* all valid flags for getvmusage() */ + +#define VMUSAGE_ZONE_FLAGS (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | \ + VMUSAGE_A_ZONE) typedef struct vmusage { id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ diff --git a/usr/src/uts/common/sys/vmsystm.h b/usr/src/uts/common/sys/vmsystm.h index e8e30b7608..daf76f9f51 100644 --- a/usr/src/uts/common/sys/vmsystm.h +++ b/usr/src/uts/common/sys/vmsystm.h @@ -19,6 +19,9 @@ * CDDL HEADER END */ /* + * Copyright (c) 2017, Joyent, Inc. All rights reserved. + */ +/* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -58,6 +61,9 @@ extern pgcnt_t desscan; /* desired pages scanned per second */ extern pgcnt_t slowscan; extern pgcnt_t fastscan; extern pgcnt_t pushes; /* number of pages pushed to swap device */ +extern uint64_t low_mem_scan; /* num times page scan due to low memory */ +extern uint64_t zone_cap_scan; /* num times page scan due to zone cap */ +extern uint64_t n_throttle; /* num times page create throttled */ /* writable copies of tunables */ extern pgcnt_t maxpgio; /* max paging i/o per sec before start swaps */ @@ -160,6 +166,8 @@ extern void *boot_virt_alloc(void *addr, size_t size); extern size_t exec_get_spslew(void); +extern caddr_t map_userlimit(proc_t *pp, struct as *as, int flags); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/vnd.h b/usr/src/uts/common/sys/vnd.h new file mode 100644 index 0000000000..bc7c9c3122 --- /dev/null +++ b/usr/src/uts/common/sys/vnd.h @@ -0,0 +1,141 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_H +#define _SYS_VND_H + +#include <sys/types.h> +#include <sys/vnd_errno.h> +#include <sys/frameio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * We distinguish between normal ioctls and private ioctls we issue to our + * streams version. Streams ioctls have the upper bit set in the lowest byte. + * Note that there are no STREAMs ioctls for userland and all definitions + * related to them are not present in this file. + */ +#define VND_IOC (('v' << 24) | ('n' << 16) | ('d' << 8)) + +/* + * Attach the current minor instance to a given dlpi datalink identified by a + * vnd_ioc_name_t argument. This fails if it's already been attached. Note that + * unlike the other ioctls, this is passed directly as opposed to every other + * function which is passed as a pointer to the value. + */ +#define VND_IOC_ATTACH (VND_IOC | 0x1) + +#define VND_NAMELEN 32 + +typedef struct vnd_ioc_attach { + char via_name[VND_NAMELEN]; + zoneid_t via_zoneid; + uint32_t via_errno; +} vnd_ioc_attach_t; + +/* + * Link the current minor instance into the /devices name space. + * + * This ioctl adds entries into /devices with a name of the form z%d:%s vil_zid, + * vil_name. The device will be namespaced to the zone. The global zone will be + * able to see all minor nodes. In the zone, only the /dev entries will exist. + * At this time, a given device can only have one link at a time. Note that a + * user cannot specify the zone to pass in, rather it is the zone that the + * device was attached in.
+ */ +#define VND_IOC_LINK (VND_IOC | 0x2) + +typedef struct vnd_ioc_link { + char vil_name[VND_NAMELEN]; + uint32_t vil_errno; +} vnd_ioc_link_t; + +/* + * Unlink the opened minor instance from the /devices name space. A zone may use + * this to unlink an extent entry in /dev; however, they will not be able to + * link it in again. + */ +#define VND_IOC_UNLINK (VND_IOC | 0x3) +typedef struct vnd_ioc_unlink { + uint32_t viu_errno; +} vnd_ioc_unlink_t; + +/* + * Controls to get and set the current receive buffer size. + */ +typedef struct vnd_ioc_buf { + uint64_t vib_size; + uint32_t vib_filler; + uint32_t vib_errno; +} vnd_ioc_buf_t; + +#define VND_IOC_GETRXBUF (VND_IOC | 0x04) +#define VND_IOC_SETRXBUF (VND_IOC | 0x05) +#define VND_IOC_GETMAXBUF (VND_IOC | 0x06) +#define VND_IOC_GETTXBUF (VND_IOC | 0x07) +#define VND_IOC_SETTXBUF (VND_IOC | 0x08) +#define VND_IOC_GETMINTU (VND_IOC | 0x09) +#define VND_IOC_GETMAXTU (VND_IOC | 0x0a) + +/* + * Information and listing ioctls + * + * This gets information about all of the active vnd instances. vl_actents is + * always updated to the number around and vl_nents is the number of + * vnd_ioc_info_t elements allocated in vl_ents. + */ +typedef struct vnd_ioc_info { + uint32_t vii_version; + zoneid_t vii_zone; + char vii_name[VND_NAMELEN]; + char vii_datalink[VND_NAMELEN]; +} vnd_ioc_info_t; + +typedef struct vnd_ioc_list { + uint_t vl_nents; + uint_t vl_actents; + vnd_ioc_info_t *vl_ents; +} vnd_ioc_list_t; + +#ifdef _KERNEL + +typedef struct vnd_ioc_list32 { + uint_t vl_nents; + uint_t vl_actents; + caddr32_t vl_ents; +} vnd_ioc_list32_t; + +#endif /* _KERNEL */ + +#define VND_IOC_LIST (VND_IOC | 0x20) + +/* + * Framed I/O ioctls + * + * Users should use the standard frameio_t as opposed to a vnd specific type. + * This is a consolidation private ioctl pending further stability in the form of + * specific system work.
+ */ +#define VND_IOC_FRAMEIO_READ (VND_IOC | 0x30) +#define VND_IOC_FRAMEIO_WRITE (VND_IOC | 0x31) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_H */ diff --git a/usr/src/uts/common/sys/vnd_errno.h b/usr/src/uts/common/sys/vnd_errno.h new file mode 100644 index 0000000000..89e5fc2543 --- /dev/null +++ b/usr/src/uts/common/sys/vnd_errno.h @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_ERRNO_H +#define _SYS_VND_ERRNO_H + +/* + * This header contains all of the available vnd errors. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum vnd_errno { + VND_E_SUCCESS = 0, /* no error */ + VND_E_NOMEM, /* no memory */ + VND_E_NODATALINK, /* no such datalink */ + VND_E_NOTETHER, /* not DL_ETHER */ + VND_E_DLPIINVAL, /* Unknown DLPI failures */ + VND_E_ATTACHFAIL, /* DL_ATTACH_REQ failed */ + VND_E_BINDFAIL, /* DL_BIND_REQ failed */ + VND_E_PROMISCFAIL, /* DL_PROMISCON_REQ failed */ + VND_E_DIRECTFAIL, /* DLD_CAPAB_DIRECT enable failed */ + VND_E_CAPACKINVAL, /* bad dl_capability_ack_t */ + VND_E_SUBCAPINVAL, /* bad dl_capability_sub_t */ + VND_E_DLDBADVERS, /* bad dld version */ + VND_E_KSTATCREATE, /* failed to create kstats */ + VND_E_NODEV, /* no such vnd link */ + VND_E_NONETSTACK, /* netstack doesn't exist */ + VND_E_ASSOCIATED, /* device already associated */ + VND_E_ATTACHED, /* device already attached */ + VND_E_LINKED, /* device already linked */ + VND_E_BADNAME, /* invalid name */ + VND_E_PERM, /* can't touch this */ + VND_E_NOZONE, /* no such zone */ + VND_E_STRINIT, /* 
failed to initialize vnd stream module */ + VND_E_NOTATTACHED, /* device not attached */ + VND_E_NOTLINKED, /* device not linked */ + VND_E_LINKEXISTS, /* another device has the same link name */ + VND_E_MINORNODE, /* failed to create minor node */ + VND_E_BUFTOOBIG, /* requested buffer size is too large */ + VND_E_BUFTOOSMALL, /* requested buffer size is too small */ + VND_E_DLEXCL, /* unable to get dlpi excl access */ + VND_E_DIRECTNOTSUP, + /* DLD direct capability not supported over data link */ + VND_E_BADPROPSIZE, /* invalid property size */ + VND_E_BADPROP, /* invalid property */ + VND_E_PROPRDONLY, /* property is read only */ + VND_E_SYS, /* unexpected system error */ + VND_E_CAPABPASS, + /* capabilities invalid, pass-through module detected */ + VND_E_UNKNOWN /* unknown error */ +} vnd_errno_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_ERRNO_H */ diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index b8702bc8f5..df5da6c2e7 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2020 Joyent, Inc. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2017 RackTop Systems.
*/ @@ -197,6 +197,7 @@ struct vsd_node { * v_count * v_shrlocks * v_path + * v_phantom_count * v_vsd * v_xattrdir * @@ -214,6 +215,7 @@ struct vsd_node { * v_lock * v_flag * v_count + * v_phantom_count * v_data * v_vfsp * v_stream @@ -285,6 +287,8 @@ typedef struct vnode { kmutex_t v_lock; /* protects vnode fields */ uint_t v_flag; /* vnode flags (see below) */ uint_t v_count; /* reference count */ + /* non vn_count() ref count (see below) */ + uint_t v_phantom_count; void *v_data; /* private data for fs */ struct vfs *v_vfsp; /* ptr to containing VFS */ struct stdata *v_stream; /* associated stream */ @@ -811,13 +815,15 @@ typedef enum vnevent { VE_REMOVE = 3, /* Remove of vnode's name */ VE_RMDIR = 4, /* Remove of directory vnode's name */ VE_CREATE = 5, /* Create with vnode's name which exists */ - VE_LINK = 6, /* Link with vnode's name as source */ - VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ - VE_MOUNTEDOVER = 8, /* File or Filesystem got mounted over vnode */ + VE_LINK = 6, /* Link with vnode's name as source */ + VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ + VE_MOUNTEDOVER = 8, /* File or Filesystem got mounted over vnode */ VE_TRUNCATE = 9, /* Truncate */ VE_PRE_RENAME_SRC = 10, /* Pre-rename, with vnode as source */ VE_PRE_RENAME_DEST = 11, /* Pre-rename, with vnode as target/dest. 
*/ - VE_PRE_RENAME_DEST_DIR = 12 /* Pre-rename with vnode as target dir */ + VE_PRE_RENAME_DEST_DIR = 12, /* Pre-rename with vnode as target dir */ + VE_RENAME_SRC_DIR = 13, /* Rename with vnode as source dir */ + VE_RESIZE = 14 /* Resize/truncate to non-zero offset */ } vnevent_t; /* @@ -1292,9 +1298,9 @@ void vn_recycle(vnode_t *); void vn_free(vnode_t *); int vn_is_readonly(vnode_t *); -int vn_is_opened(vnode_t *, v_mode_t); -int vn_is_mapped(vnode_t *, v_mode_t); -int vn_has_other_opens(vnode_t *, v_mode_t); +int vn_is_opened(vnode_t *, v_mode_t); +int vn_is_mapped(vnode_t *, v_mode_t); +int vn_has_other_opens(vnode_t *, v_mode_t); void vn_open_upgrade(vnode_t *, int); void vn_open_downgrade(vnode_t *, int); @@ -1333,10 +1339,12 @@ int vn_createat(char *pnamep, enum uio_seg seg, struct vattr *vap, int vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, ssize_t len, offset_t offset, enum uio_seg seg, int ioflag, rlim64_t ulimit, cred_t *cr, ssize_t *residp); +uint_t vn_count(struct vnode *vp); void vn_rele(struct vnode *vp); void vn_rele_async(struct vnode *vp, struct taskq *taskq); void vn_rele_dnlc(struct vnode *vp); void vn_rele_stream(struct vnode *vp); +void vn_phantom_rele(struct vnode *vp); int vn_link(char *from, char *to, enum uio_seg seg); int vn_linkat(vnode_t *fstartvp, char *from, enum symfollow follow, vnode_t *tstartvp, char *to, enum uio_seg seg); @@ -1377,7 +1385,8 @@ void vnevent_remove(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_rmdir(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_create(vnode_t *, caller_context_t *); void vnevent_link(vnode_t *, caller_context_t *); -void vnevent_rename_dest_dir(vnode_t *, caller_context_t *ct); +void vnevent_rename_dest_dir(vnode_t *, vnode_t *, char *, + caller_context_t *ct); void vnevent_mountedover(vnode_t *, caller_context_t *); void vnevent_truncate(vnode_t *, caller_context_t *); int vnevent_support(vnode_t *, caller_context_t *); @@ -1387,6 +1396,7 @@ void 
vnevent_pre_rename_dest(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_pre_rename_dest_dir(vnode_t *, vnode_t *, char *, caller_context_t *); +void vnevent_resize(vnode_t *, caller_context_t *); /* Vnode specific data */ void vsd_create(uint_t *, void (*)(void *)); @@ -1439,6 +1449,16 @@ extern uint_t pvn_vmodsort_supported; * this->vp->v_path == NULL ? "NULL" : stringof(this->vp->v_path), * this->vp->v_count) * }' + * + * There are some situations where we don't want a hold to make the vnode + * 'busy'. For example, watching a directory via port events or inotify + * should not prevent a filesystem from mounting on a watched directory. + * For those instances, a phantom hold is used via VN_PHANTOM_HOLD(). + * + * A phantom hold works identically to regular hold, except that those holds + * are excluded from the return value of vn_count(). + * + * A phantom hold must be released by VN_PHANTOM_RELE(). */ #define VN_HOLD_LOCKED(vp) { \ ASSERT(mutex_owned(&(vp)->v_lock)); \ @@ -1467,6 +1487,22 @@ extern uint_t pvn_vmodsort_supported; DTRACE_PROBE1(vn__rele, vnode_t *, vp); \ } +#define VN_PHANTOM_HOLD_LOCKED(vp) { \ + VN_HOLD_LOCKED(vp); \ + (vp)->v_phantom_count++; \ + DTRACE_PROBE1(vn__phantom_hold, vnode_t *, vp); \ +} + +#define VN_PHANTOM_HOLD(vp) { \ + mutex_enter(&(vp)->v_lock); \ + VN_PHANTOM_HOLD_LOCKED(vp); \ + mutex_exit(&(vp)->v_lock); \ +} + +#define VN_PHANTOM_RELE(vp) { \ + vn_phantom_rele(vp); \ +} + #define VN_SET_VFS_TYPE_DEV(vp, vfsp, type, dev) { \ (vp)->v_vfsp = (vfsp); \ (vp)->v_type = (type); \ @@ -1477,7 +1513,7 @@ extern uint_t pvn_vmodsort_supported; * Compare two vnodes for equality. In general this macro should be used * in preference to calling VOP_CMP directly. */ -#define VN_CMP(VP1, VP2) ((VP1) == (VP2) ? 1 : \ +#define VN_CMP(VP1, VP2) ((VP1) == (VP2) ? 1 : \ ((VP1) && (VP2) && (vn_getops(VP1) == vn_getops(VP2)) ? 
\ VOP_CMP(VP1, VP2, NULL) : 0)) diff --git a/usr/src/uts/common/sys/zfd.h b/usr/src/uts/common/sys/zfd.h new file mode 100644 index 0000000000..e08d75ecba --- /dev/null +++ b/usr/src/uts/common/sys/zfd.h @@ -0,0 +1,78 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_ZFD_H +#define _SYS_ZFD_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Minor node name of the global zone side (often called the "master" side) + * of the zfd dev. + */ +#define ZFD_MASTER_NAME "master" + +/* + * Minor node name of the non-global zone side (often called the "slave" + * side) of the zfd dev. + */ +#define ZFD_SLAVE_NAME "slave" + +#define ZFD_NAME_LEN 16 + +/* + * ZFD_IOC forms the base for all zfd ioctls. + */ +#define ZFD_IOC (('Z' << 24) | ('f' << 16) | ('d' << 8)) + +/* + * This ioctl tells the slave side it should push the TTY stream modules + * so that the fd looks like a tty. + */ +#define ZFD_MAKETTY (ZFD_IOC | 0) + +/* + * This ioctl puts a hangup into the stream so that the slave side sees EOF. + */ +#define ZFD_EOF (ZFD_IOC | 1) + +/* + * This ioctl succeeds if the slave side is open. + */ +#define ZFD_HAS_SLAVE (ZFD_IOC | 2) + +/* + * This ioctl links two streams into a multiplexer configuration for in-zone + * logging. + */ +#define ZFD_MUX (ZFD_IOC | 3) + +/* + * This ioctl controls the flow control setting for the log multiplexer stream + * (1 = true, 0 = false). The default is false which implies teeing into the + * log stream is "best-effort" but data will be discarded if the stream + * becomes full. 
If set and the log stream begins to fill up, the primary + * stream will stop flowing. + */ +#define ZFD_MUX_FLOWCON (ZFD_IOC | 4) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFD_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 26b74ca34a..afef75013f 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -50,8 +50,10 @@ #include <sys/socket_impl.h> #include <sys/secflags.h> #include <sys/cpu_uarray.h> +#include <sys/nvpair.h> #include <sys/list.h> #include <sys/loadavg.h> +#include <sys/vnode.h> #endif /* _KERNEL */ #ifdef __cplusplus @@ -62,15 +64,27 @@ extern "C" { * NOTE * * The contents of this file are private to the implementation of - * Solaris and are subject to change at any time without notice. + * illumos and are subject to change at any time without notice. * Applications and drivers using these interfaces may fail to * run on future releases. */ /* Available both in kernel and for user space */ -/* zone id restrictions and special ids */ -#define MAX_ZONEID 9999 +/* + * zone id restrictions and special ids. + * See 'maxzones' for run-time zone limit. + * + * The current 8k value for MAX_ZONES was originally derived from the virtual + * interface limit in IP when "shared-stack" was the only supported networking + * for zones. The virtual interface limit is the number of addresses allowed + * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k + * zone limit is still a reasonable choice at this time, given other limits + * within the kernel. Since we only support 8192 zones (which includes GZ), + * there is no point in allowing MAX_ZONEID > 8k. 
+ */ +#define MAX_ZONES 8192 +#define MAX_ZONEID (MAX_ZONES - 1) #define MIN_USERZONEID 1 /* lowest user-creatable zone ID */ #define MIN_ZONEID 0 /* minimum zone ID on system */ #define GLOBAL_ZONEID 0 @@ -97,7 +111,13 @@ extern "C" { #define ZONE_CHECK_DATALINK 12 #define ZONE_LIST_DATALINK 13 -/* zone attributes */ +/* + * zone attributes + * + * Note that values up to ZONE_ATTR_HOSTID are baked into things like Solaris + * 10 which can be run under the s10 brand; don't renumber or change them. Ones + * which are no longer used are commented out. + */ #define ZONE_ATTR_ROOT 1 #define ZONE_ATTR_NAME 2 #define ZONE_ATTR_STATUS 3 @@ -109,17 +129,24 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 -#define ZONE_ATTR_PHYS_MCAP 12 +/* #define ZONE_ATTR_PHYS_MCAP 12 */ #define ZONE_ATTR_SCHED_CLASS 13 #define ZONE_ATTR_FLAGS 14 #define ZONE_ATTR_HOSTID 15 #define ZONE_ATTR_FS_ALLOWED 16 #define ZONE_ATTR_NETWORK 17 + +/* illumos extensions */ #define ZONE_ATTR_INITNORESTART 20 #define ZONE_ATTR_SECFLAGS 21 #define ZONE_ATTR_INITRESTART0 22 #define ZONE_ATTR_INITREBOOT 23 +/* OmniOS/SmartOS extensions */ +#define ZONE_ATTR_DID 30 +#define ZONE_ATTR_APP_SVC_CT 31 +#define ZONE_ATTR_SCHED_FIXEDHI 32 + /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -134,13 +161,18 @@ extern "C" { #define ZONE_EVENT_READY "ready" #define ZONE_EVENT_RUNNING "running" #define ZONE_EVENT_SHUTTING_DOWN "shutting_down" +#define ZONE_EVENT_FREE "free" #define ZONE_CB_NAME "zonename" #define ZONE_CB_NEWSTATE "newstate" #define ZONE_CB_OLDSTATE "oldstate" +#define ZONE_CB_RESTARTS "restarts" #define ZONE_CB_TIMESTAMP "when" #define ZONE_CB_ZONEID "zoneid" +#define ZONE_EVENT_INIT_CLASS "init" +#define ZONE_EVENT_INIT_RESTART_SC "restart" + /* * Exit values that may be returned by scripts or programs invoked by various * zone commands. 
@@ -199,6 +231,7 @@ typedef struct { uint32_t doi; /* DOI for label */ caddr32_t label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def32; #endif typedef struct { @@ -215,6 +248,7 @@ typedef struct { uint32_t doi; /* DOI for label */ const bslabel_t *label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def; /* extended error information */ @@ -239,7 +273,8 @@ typedef enum { ZONE_IS_EMPTY, ZONE_IS_DOWN, ZONE_IS_DYING, - ZONE_IS_DEAD + ZONE_IS_DEAD, + ZONE_IS_FREE /* transient state for zone sysevent */ } zone_status_t; #define ZONE_MIN_STATE ZONE_IS_UNINITIALIZED #define ZONE_MAX_STATE ZONE_IS_DEAD @@ -259,9 +294,12 @@ typedef enum zone_cmd { typedef struct zone_cmd_arg { uint64_t uniqid; /* unique "generation number" */ zone_cmd_t cmd; /* requested action */ - uint32_t _pad; /* need consistent 32/64 bit alignmt */ + int status; /* init status on shutdown */ + uint32_t debug; /* enable brand hook debug */ char locale[MAXPATHLEN]; /* locale in which to render messages */ char bootbuf[BOOTARGS_MAX]; /* arguments passed to zone_boot() */ + /* Needed for 32/64 zoneadm -> zoneadmd door arg size check. 
*/ + int pad; } zone_cmd_arg_t; /* @@ -389,7 +427,7 @@ typedef struct zone_dataset { } zone_dataset_t; /* - * structure for zone kstats + * structure for rctl zone kstats */ typedef struct zone_kstat { kstat_named_t zk_zonename; @@ -400,12 +438,57 @@ typedef struct zone_kstat { struct cpucap; typedef struct { + hrtime_t cycle_start; + uint_t cycle_cnt; + hrtime_t zone_avg_cnt; +} sys_zio_cntr_t; + +typedef struct { + kstat_named_t zv_zonename; + kstat_named_t zv_nread; + kstat_named_t zv_reads; + kstat_named_t zv_rtime; + kstat_named_t zv_rlentime; + kstat_named_t zv_rcnt; + kstat_named_t zv_nwritten; + kstat_named_t zv_writes; + kstat_named_t zv_wtime; + kstat_named_t zv_wlentime; + kstat_named_t zv_wcnt; + kstat_named_t zv_10ms_ops; + kstat_named_t zv_100ms_ops; + kstat_named_t zv_1s_ops; + kstat_named_t zv_10s_ops; + kstat_named_t zv_delay_cnt; + kstat_named_t zv_delay_time; +} zone_vfs_kstat_t; + +typedef struct { + kstat_named_t zz_zonename; + kstat_named_t zz_nread; + kstat_named_t zz_reads; + kstat_named_t zz_rtime; + kstat_named_t zz_rlentime; + kstat_named_t zz_nwritten; + kstat_named_t zz_writes; + kstat_named_t zz_waittime; +} zone_zfs_kstat_t; + +typedef struct { kstat_named_t zm_zonename; + kstat_named_t zm_rss; + kstat_named_t zm_phys_cap; + kstat_named_t zm_swap; + kstat_named_t zm_swap_cap; + kstat_named_t zm_nover; + kstat_named_t zm_pagedout; kstat_named_t zm_pgpgin; kstat_named_t zm_anonpgin; kstat_named_t zm_execpgin; kstat_named_t zm_fspgin; kstat_named_t zm_anon_alloc_fail; + kstat_named_t zm_pf_throttle; + kstat_named_t zm_pf_throttle_usec; } zone_mcap_kstat_t; typedef struct { @@ -420,8 +503,10 @@ typedef struct { kstat_named_t zm_ffnoproc; kstat_named_t zm_ffnomem; kstat_named_t zm_ffmisc; + kstat_named_t zm_mfseglim; kstat_named_t zm_nested_intp; kstat_named_t zm_init_pid; + kstat_named_t zm_init_restarts; kstat_named_t zm_boot_time; } zone_misc_kstat_t; @@ -464,6 +549,7 @@ typedef struct zone { */ list_node_t zone_linkage; zoneid_t 
zone_id; /* ID of zone */ + zoneid_t zone_did; /* persistent debug ID of zone */ uint_t zone_ref; /* count of zone_hold()s on zone */ uint_t zone_cred_ref; /* count of zone_hold_cred()s on zone */ /* @@ -516,10 +602,11 @@ typedef struct zone { kcondvar_t zone_cv; /* used to signal state changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ pid_t zone_proc_initpid; /* pid of "init" for this zone */ - char *zone_initname; /* fs path to 'init' */ + uint_t zone_proc_init_restarts; /* times init restarted */ + char *zone_initname; /* fs path to 'init' */ + int zone_init_status; /* init's exit status */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ - uint64_t zone_phys_mcap; /* physical memory cap */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -559,9 +646,11 @@ typedef struct zone { boolean_t zone_restart_init; /* Restart init if it dies? */ boolean_t zone_reboot_on_init_exit; /* Reboot if init dies? */ boolean_t zone_restart_init_0; /* Restart only if it exits 0 */ + boolean_t zone_setup_app_contract; /* setup contract? */ struct brand *zone_brand; /* zone's brand */ void *zone_brand_data; /* store brand specific data */ id_t zone_defaultcid; /* dflt scheduling class id */ + boolean_t zone_fixed_hipri; /* fixed sched. hi prio */ kstat_t *zone_swapresv_kstat; kstat_t *zone_lockedmem_kstat; /* @@ -570,8 +659,24 @@ typedef struct zone { list_t zone_dl_list; netstack_t *zone_netstack; struct cpucap *zone_cpucap; /* CPU caps data */ + /* - * Solaris Auditing per-zone audit context + * kstats and counters for VFS ops and bytes. + */ + kmutex_t zone_vfs_lock; /* protects VFS statistics */ + kstat_t *zone_vfs_ksp; + kstat_io_t zone_vfs_rwstats; + zone_vfs_kstat_t *zone_vfs_stats; + + /* + * kstats for ZFS I/O ops and bytes. 
+ */ + kmutex_t zone_zfs_lock; /* protects ZFS statistics */ + kstat_t *zone_zfs_ksp; + zone_zfs_kstat_t *zone_zfs_stats; + + /* + * illumos Auditing per-zone audit context */ struct au_kcontext *zone_audit_kctxt; /* @@ -588,7 +693,11 @@ typedef struct zone { /* zone_rctls->rcs_lock */ kstat_t *zone_nprocs_kstat; - kmutex_t zone_mcap_lock; /* protects mcap statistics */ + /* + * kstats and counters for physical memory capping. + */ + kstat_t *zone_physmem_kstat; + kmutex_t zone_mcap_lock; /* protects mcap statistics */ kstat_t *zone_mcap_ksp; zone_mcap_kstat_t *zone_mcap_stats; uint64_t zone_pgpgin; /* pages paged in */ @@ -613,6 +722,8 @@ typedef struct zone { uint32_t zone_ffnomem; /* as_dup/memory error */ uint32_t zone_ffmisc; /* misc. other error */ + uint32_t zone_mfseglim; /* map failure (# segs limit) */ + uint32_t zone_nested_intp; /* nested interp. kstat */ struct loadavg_s zone_loadavg; /* loadavg for this zone */ @@ -640,6 +751,53 @@ typedef struct zone { } zone_t; /* + * Data and counters used for ZFS fair-share disk IO. + */ +typedef struct zone_zfs_io { + uint16_t zpers_zfs_io_pri; /* ZFS IO priority - 16k max */ + uint_t zpers_zfs_queued[2]; /* sync I/O enqueued count */ + sys_zio_cntr_t zpers_rd_ops; /* Counters for ZFS reads, */ + sys_zio_cntr_t zpers_wr_ops; /* writes, and */ + sys_zio_cntr_t zpers_lwr_ops; /* logical writes. */ + kstat_io_t zpers_zfs_rwstats; + uint64_t zpers_io_util; /* IO utilization metric */ + uint64_t zpers_zfs_rd_waittime; + uint8_t zpers_io_delay; /* IO delay on logical r/w */ + uint8_t zpers_zfs_weight; /* used to prevent starvation */ + uint8_t zpers_io_util_above_avg; /* IO util percent > avg. */ +} zone_zfs_io_t; + +/* + * "Persistent" zone data which can be accessed independently of the zone_t.
+ */ +typedef struct zone_persist { + kmutex_t zpers_zfs_lock; /* Protects zpers_zfsp references */ + zone_zfs_io_t *zpers_zfsp; /* ZFS fair-share IO data */ + uint8_t zpers_over; /* currently over cap */ + uint32_t zpers_pg_cnt; /* current RSS in pages */ + uint32_t zpers_pg_limit; /* current RSS limit in pages */ + uint32_t zpers_nover; /* # of times over phys. cap */ +#ifndef DEBUG + uint64_t zpers_pg_out; /* # pages flushed */ +#else + /* + * To conserve memory, some detailed kstats are only kept for DEBUG + * builds. + */ + uint64_t zpers_zfs_rd_waittime; + + uint64_t zpers_pg_anon; /* # clean anon pages flushed */ + uint64_t zpers_pg_anondirty; /* # dirty anon pages flushed */ + uint64_t zpers_pg_fs; /* # clean fs pages flushed */ + uint64_t zpers_pg_fsdirty; /* # dirty fs pages flushed */ +#endif +} zone_persist_t; + +typedef enum zone_pageout_op { + ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY +} zone_pageout_op_t; + +/* * Special value of zone_psetid to indicate that pools are disabled. */ #define ZONE_PS_INVAL PS_MYID @@ -668,6 +826,7 @@ extern zone_t *zone_find_by_name(char *); extern zone_t *zone_find_by_any_path(const char *, boolean_t); extern zone_t *zone_find_by_path(const char *); extern zoneid_t getzoneid(void); +extern zoneid_t getzonedid(void); extern zone_t *zone_find_by_id_nolock(zoneid_t); extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *); extern int zone_check_datalink(zoneid_t *, datalink_id_t); @@ -802,7 +961,7 @@ struct zsd_entry { * NOTE: Using the VN_ prefix, even though it's defined here in zone.h. * NOTE2: See above warning about ZONE_ROOTVP(). */ -#define VN_IS_CURZONEROOT(vp) (VN_CMP(vp, ZONE_ROOTVP())) +#define VN_IS_CURZONEROOT(vp) (VN_CMP(vp, ZONE_ROOTVP())) /* * Zone-safe version of thread_create() to be used when the caller wants to @@ -868,6 +1027,7 @@ extern int zone_ncpus_online_get(zone_t *); * Returns true if the named pool/dataset is visible in the current zone.
*/ extern int zone_dataset_visible(const char *, int *); +extern int zone_dataset_visible_inzone(zone_t *, const char *, int *); /* * zone version of kadmin() @@ -880,10 +1040,25 @@ extern void mount_completed(zone_t *); extern int zone_walk(int (*)(zone_t *, void *), void *); +struct page; +extern void zone_add_page(struct page *); +extern void zone_rm_page(struct page *); +extern void zone_pageout_stat(int, zone_pageout_op_t); +extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *); + +/* Interfaces for page scanning */ +extern uint_t zone_num_over_cap; +extern zone_persist_t zone_pdata[MAX_ZONES]; + extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; +extern rctl_hndl_t rc_zone_phys_mem; extern rctl_hndl_t rc_zone_max_lofi; +/* For publishing sysevents related to a particular zone */ +extern void zone_sysevent_publish(zone_t *, const char *, const char *, + nvlist_t *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/syscall/brandsys.c b/usr/src/uts/common/syscall/brandsys.c index 9b4bd38baa..245ef9f14f 100644 --- a/usr/src/uts/common/syscall/brandsys.c +++ b/usr/src/uts/common/syscall/brandsys.c @@ -23,7 +23,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright 2016 Joyent, Inc. 
+ */ #include <sys/brand.h> #include <sys/systm.h> @@ -35,7 +37,7 @@ */ int64_t brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, - uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg4) { struct proc *p = curthread->t_procp; int64_t rval = 0; @@ -49,7 +51,7 @@ brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (set_errno(ENOSYS)); if ((err = ZBROP(p->p_zone)->b_brandsys(cmd, &rval, arg1, arg2, arg3, - arg4, arg5, arg6)) != 0) + arg4)) != 0) return (set_errno(err)); return (rval); diff --git a/usr/src/uts/common/syscall/chdir.c b/usr/src/uts/common/syscall/chdir.c index 84c924f570..deb5532b50 100644 --- a/usr/src/uts/common/syscall/chdir.c +++ b/usr/src/uts/common/syscall/chdir.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -62,7 +63,7 @@ /* * Change current working directory ("."). 
*/ -static int chdirec(vnode_t *, int ischroot, int do_traverse); +static int chdirec(vnode_t *, boolean_t ischroot, boolean_t do_traverse); int chdir(char *fname) @@ -78,7 +79,7 @@ lookup: return (set_errno(error)); } - error = chdirec(vp, 0, 1); + error = chdirec(vp, B_FALSE, B_TRUE); if (error) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto lookup; @@ -102,7 +103,7 @@ fchdir(int fd) vp = fp->f_vnode; VN_HOLD(vp); releasef(fd); - error = chdirec(vp, 0, 0); + error = chdirec(vp, B_FALSE, B_FALSE); if (error) return (set_errno(error)); return (0); @@ -125,7 +126,7 @@ lookup: return (set_errno(error)); } - error = chdirec(vp, 1, 1); + error = chdirec(vp, B_TRUE, B_TRUE); if (error) { if ((error == ESTALE) && fs_need_estale_retry(estale_retry++)) goto lookup; @@ -152,18 +153,18 @@ fchroot(int fd) vp = fp->f_vnode; VN_HOLD(vp); releasef(fd); - error = chdirec(vp, 1, 0); + error = chdirec(vp, B_TRUE, B_FALSE); if (error) return (set_errno(error)); return (0); } static int -chdirec(vnode_t *vp, int ischroot, int do_traverse) +chdirec_common(proc_t *pp, vnode_t *vp, boolean_t ischroot, + boolean_t do_traverse) { int error; vnode_t *oldvp; - proc_t *pp = curproc; vnode_t **vpp; refstr_t *cwd; int newcwd = 1; @@ -194,7 +195,7 @@ chdirec(vnode_t *vp, int ischroot, int do_traverse) if (ischroot) { struct vattr tattr; struct vattr rattr; - vnode_t *zonevp = curproc->p_zone->zone_rootvp; + vnode_t *zonevp = pp->p_zone->zone_rootvp; tattr.va_mask = AT_FSID|AT_NODEID; if (error = VOP_GETATTR(vp, &tattr, 0, CRED(), NULL)) @@ -243,3 +244,15 @@ bad: VN_RELE(vp); return (error); } + +int +chdir_proc(proc_t *pp, vnode_t *vp, boolean_t ischroot, boolean_t do_traverse) +{ + return (chdirec_common(pp, vp, ischroot, do_traverse)); +} + +static int +chdirec(vnode_t *vp, boolean_t ischroot, boolean_t do_traverse) +{ + return (chdirec_common(curproc, vp, ischroot, do_traverse)); +} diff --git a/usr/src/uts/common/syscall/fcntl.c b/usr/src/uts/common/syscall/fcntl.c 
index 7b787a4acb..b029d92f1b 100644 --- a/usr/src/uts/common/syscall/fcntl.c +++ b/usr/src/uts/common/syscall/fcntl.c @@ -54,7 +54,8 @@ #include <sys/cmn_err.h> -static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); +/* This is global so that it can be used by brand emulation. */ +int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *); static void fd_too_big(proc_t *); diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c index 9b932af275..027ba7a2bc 100644 --- a/usr/src/uts/common/syscall/memcntl.c +++ b/usr/src/uts/common/syscall/memcntl.c @@ -116,13 +116,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask) * MS_SYNC used to be defined to be zero but is now non-zero. * For binary compatibility we still accept zero * (the absence of MS_ASYNC) to mean the same thing. + * Binary compatibility is not an issue for MS_INVALCURPROC. */ iarg = (uintptr_t)arg; if ((iarg & ~MS_INVALIDATE) == 0) iarg |= MS_SYNC; - if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) || - ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) { + if (((iarg & + ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) || + ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) || + ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) == + (MS_INVALIDATE|MS_INVALCURPROC))) { error = set_errno(EINVAL); } else { error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0); diff --git a/usr/src/uts/common/syscall/open.c b/usr/src/uts/common/syscall/open.c index 40d9717a5b..94dc02b284 100644 --- a/usr/src/uts/common/syscall/open.c +++ b/usr/src/uts/common/syscall/open.c @@ -75,12 +75,12 @@ copen(int startfd, char *fname, int filemode, int createmode) if (filemode & (FSEARCH|FEXEC)) { /* - * Must be one or the other and neither FREAD nor FWRITE + * Must be one or the other. 
* Must not be any of FAPPEND FCREAT FTRUNC FXATTR FXATTRDIROPEN - * XXX: Should these just be silently ignored? + * XXX: Should these just be silently ignored like we + * silently ignore FREAD|FWRITE? */ - if ((filemode & (FREAD|FWRITE)) || - (filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || + if ((filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || (filemode & (FAPPEND|FCREAT|FTRUNC|FXATTR|FXATTRDIROPEN))) return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index 7af1c7edfe..a10b2623db 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -29,7 +29,7 @@ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. - * Copyright 2015, Joyent, Inc. + * Copyright (c) 2017, Joyent, Inc. * Copyright 2022 Oxide Computer Company */ @@ -326,20 +326,57 @@ polllock(pollhead_t *php, kmutex_t *lp) return (0); } -static int -poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) +int +poll_copyin(pollstate_t *ps, pollfd_t *fds, nfds_t nfds) +{ + pollfd_t *pollfdp; + nfds_t old_nfds; + + /* + * NOTE: for performance, buffers are saved across poll() calls. + * The theory is that if a process polls heavily, it tends to poll + * on the same set of descriptors. Therefore, we only reallocate + * buffers when nfds changes. There is no hysteresis control, + * because there is no data to suggest that this is necessary; + * the penalty of reallocating is not *that* great in any event. + */ + old_nfds = ps->ps_nfds; + if (nfds != old_nfds) { + kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); + pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); + ps->ps_pollfd = pollfdp; + ps->ps_nfds = nfds; + } + + pollfdp = ps->ps_pollfd; + if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { + return (EFAULT); + } + + if (fds == NULL) { + /* + * If the process has page 0 mapped, then the copyin() above + * will succeed even if fds is NULL. 
However, our cached + * poll lists are keyed by the address of the passed-in fds + * structure, and we use the value NULL to indicate an unused + * poll cache list entry. As such, we elect not to support + * NULL as a valid (user) memory address and fail the poll() + * call. + */ + return (EFAULT); + } + return (0); +} + +int +poll_common(pollstate_t *ps, pollfd_t *fds, nfds_t nfds, timespec_t *tsp, + int *fdcnt) { kthread_t *t = curthread; - klwp_t *lwp = ttolwp(t); - proc_t *p = ttoproc(t); - int fdcnt = 0; - int i; hrtime_t deadline; /* hrtime value when we want to return */ pollfd_t *pollfdp; - pollstate_t *ps; pollcache_t *pcp; int error = 0; - nfds_t old_nfds; int cacheindex = 0; /* which cache set is used */ /* @@ -349,33 +386,34 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) deadline = -1; } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { deadline = 0; + } else if (tsp->tv_sec >= HRTIME_MAX/NANOSEC) { + /* Use an indefinite timeout if tv_sec would cause overflow */ + deadline = -1; } else { + /* + * The above check, when combined with the protections offered + * by itimerspecfix (ensuring that neither field is negative + * and that tv_nsec represents less than a whole second), will + * prevent overflow during the conversion from timespec_t to + * uhrtime_t. + */ + uhrtime_t utime = tsp->tv_sec * NANOSEC; + utime += tsp->tv_nsec; + /* They must wait at least a tick. */ - deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec; - deadline = MAX(deadline, nsec_per_tick); - deadline += gethrtime(); - } + utime = MAX(utime, nsec_per_tick); - /* - * Reset our signal mask, if requested. - */ - if (ksetp != NULL) { - mutex_enter(&p->p_lock); - schedctl_finish_sigblock(t); - lwp->lwp_sigoldmask = t->t_hold; - t->t_hold = *ksetp; - t->t_flag |= T_TOMASK; /* - * Call cv_reltimedwait_sig() just to check for signals. - * We will return immediately with either 0 or -1. 
+ * Since utime has an upper bound of HRTIME_MAX, adding the + * gethrtime() result cannot incur an overflow as the unsigned + * type has an adequate bound. */ - if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, - TR_CLOCK_TICK)) { - mutex_exit(&p->p_lock); - error = EINTR; - goto pollout; + utime += (uhrtime_t)gethrtime(); + if (utime > HRTIME_MAX) { + deadline = -1; + } else { + deadline = (hrtime_t)utime; } - mutex_exit(&p->p_lock); } /* @@ -383,6 +421,7 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * If yes then bypass all the other stuff and make it sleep. */ if (nfds == 0) { + *fdcnt = 0; /* * Sleep until we have passed the requested future * time or until interrupted by a signal. @@ -394,66 +433,14 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) &t->t_delay_lock, deadline)) > 0) continue; mutex_exit(&t->t_delay_lock); - error = (error == 0) ? EINTR : 0; + return ((error == 0) ? EINTR : 0); } - goto pollout; - } - - if (nfds > p->p_fno_ctl) { - mutex_enter(&p->p_lock); - (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], - p->p_rctls, p, RCA_SAFE); - mutex_exit(&p->p_lock); - error = EINVAL; - goto pollout; - } - - /* - * Need to allocate memory for pollstate before anything because - * the mutex and cv are created in this space - */ - ps = pollstate_create(); - - if (ps->ps_pcache == NULL) - ps->ps_pcache = pcache_alloc(); - pcp = ps->ps_pcache; - - /* - * NOTE: for performance, buffers are saved across poll() calls. - * The theory is that if a process polls heavily, it tends to poll - * on the same set of descriptors. Therefore, we only reallocate - * buffers when nfds changes. There is no hysteresis control, - * because there is no data to suggest that this is necessary; - * the penalty of reallocating is not *that* great in any event. 
- */ - old_nfds = ps->ps_nfds; - if (nfds != old_nfds) { - - kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); - pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); - ps->ps_pollfd = pollfdp; - ps->ps_nfds = nfds; + return (0); } + VERIFY(ps != NULL); pollfdp = ps->ps_pollfd; - if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { - error = EFAULT; - goto pollout; - } - - if (fds == NULL) { - /* - * If the process has page 0 mapped, then the copyin() above - * will succeed even if fds is NULL. However, our cached - * poll lists are keyed by the address of the passed-in fds - * structure, and we use the value NULL to indicate an unused - * poll cache list entry. As such, we elect not to support - * NULL as a valid (user) memory address and fail the poll() - * call. - */ - error = EINVAL; - goto pollout; - } + VERIFY(pollfdp != NULL); /* * If this thread polls for the first time, allocate ALL poll @@ -469,10 +456,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) /* * poll and cache this poll fd list in ps_pcacheset[0]. */ - error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); - if (fdcnt || error) { + error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex); + if (error || *fdcnt) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } } else { pollcacheset_t *pcset = ps->ps_pcacheset; @@ -497,11 +484,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * the callee will guarantee the consistency * of cached poll list and cache content. */ - error = pcacheset_resolve(ps, nfds, &fdcnt, + error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex); if (error) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } break; } @@ -518,11 +505,11 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) * found an unused entry. Use it to cache * this poll list. 
*/ - error = pcacheset_cache_list(ps, fds, &fdcnt, + error = pcacheset_cache_list(ps, fds, fdcnt, cacheindex); - if (fdcnt || error) { + if (error || *fdcnt) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } break; } @@ -536,10 +523,10 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) cacheindex = pcacheset_replace(ps); ASSERT(cacheindex < ps->ps_nsets); pcset[cacheindex].pcs_usradr = (uintptr_t)fds; - error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); + error = pcacheset_resolve(ps, nfds, fdcnt, cacheindex); if (error) { mutex_exit(&ps->ps_lock); - goto pollout; + return (error); } } } @@ -557,8 +544,8 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) mutex_enter(&pcp->pc_lock); for (;;) { pcp->pc_flag = 0; - error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); - if (fdcnt || error) { + error = pcache_poll(pollfdp, ps, nfds, fdcnt, cacheindex); + if (error || *fdcnt) { mutex_exit(&pcp->pc_lock); mutex_exit(&ps->ps_lock); break; @@ -604,13 +591,116 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) mutex_enter(&pcp->pc_lock); } + return (error); +} + +/* + * This is the system call trap that poll(), + * select() and pselect() are built upon. + * It is a private interface between libc and the kernel. 
+ */ +int +pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + timespec_t ts; + timespec_t *tsp; + k_sigset_t kset; + pollstate_t *ps = NULL; + pollfd_t *pollfdp = NULL; + int error = 0, fdcnt = 0; + + /* + * Copy in timeout + */ + if (timeoutp == NULL) { + tsp = NULL; + } else { + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin(timeoutp, &ts, sizeof (ts))) + return (set_errno(EFAULT)); + } else { + timespec32_t ts32; + + if (copyin(timeoutp, &ts32, sizeof (ts32))) + return (set_errno(EFAULT)); + TIMESPEC32_TO_TIMESPEC(&ts, &ts32) + } + + if (itimerspecfix(&ts)) + return (set_errno(EINVAL)); + tsp = &ts; + } + + /* + * Copy in and reset signal mask, if requested. + */ + if (setp != NULL) { + sigset_t set; + + if (copyin(setp, &set, sizeof (set))) + return (set_errno(EFAULT)); + sigutok(&set, &kset); + + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = kset; + t->t_flag |= T_TOMASK; + /* + * Call cv_reltimedwait_sig() just to check for signals. + * We will return immediately with either 0 or -1. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, + TR_CLOCK_TICK)) { + mutex_exit(&p->p_lock); + error = EINTR; + goto pollout; + } + mutex_exit(&p->p_lock); + } + + /* + * Initialize pollstate and copy in pollfd data if present. + * If nfds == 0, we will skip all of the copying and check steps and + * proceed directly into poll_common to process the supplied timeout. 
+ */ + if (nfds != 0) { + if (nfds > p->p_fno_ctl) { + mutex_enter(&p->p_lock); + (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], + p->p_rctls, p, RCA_SAFE); + mutex_exit(&p->p_lock); + error = EINVAL; + goto pollout; + } + + /* + * Need to allocate memory for pollstate before anything + * because the mutex and cv are created in this space + */ + ps = pollstate_create(); + if (ps->ps_pcache == NULL) + ps->ps_pcache = pcache_alloc(); + + if ((error = poll_copyin(ps, fds, nfds)) != 0) + goto pollout; + pollfdp = ps->ps_pollfd; + } + + /* + * Perform the actual poll. + */ + error = poll_common(ps, fds, nfds, tsp, &fdcnt); + pollout: /* - * If we changed the signal mask but we received - * no signal then restore the signal mask. - * Otherwise psig() will deal with the signal mask. + * If we changed the signal mask but we received no signal then restore + * the signal mask. Otherwise psig() will deal with the signal mask. */ - if (ksetp != NULL) { + if (setp != NULL) { mutex_enter(&p->p_lock); if (lwp->lwp_cursig == 0) { t->t_hold = lwp->lwp_sigoldmask; @@ -621,12 +711,10 @@ pollout: if (error) return (set_errno(error)); - /* * Copy out the events and return the fdcnt to the user. */ - if (nfds != 0 && - copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) + if (nfds != 0 && copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) return (set_errno(EFAULT)); #ifdef DEBUG @@ -634,7 +722,7 @@ pollout: * Another sanity check: */ if (fdcnt) { - int reventcnt = 0; + int i, reventcnt = 0; for (i = 0; i < nfds; i++) { if (pollfdp[i].fd < 0) { @@ -647,6 +735,8 @@ pollout: } ASSERT(fdcnt == reventcnt); } else { + int i; + for (i = 0; i < nfds; i++) { ASSERT(pollfdp[i].revents == 0); } @@ -657,52 +747,6 @@ pollout: } /* - * This is the system call trap that poll(), - * select() and pselect() are built upon. - * It is a private interface between libc and the kernel. 
- */ -int -pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) -{ - timespec_t ts; - timespec_t *tsp; - sigset_t set; - k_sigset_t kset; - k_sigset_t *ksetp; - model_t datamodel = get_udatamodel(); - - if (timeoutp == NULL) - tsp = NULL; - else { - if (datamodel == DATAMODEL_NATIVE) { - if (copyin(timeoutp, &ts, sizeof (ts))) - return (set_errno(EFAULT)); - } else { - timespec32_t ts32; - - if (copyin(timeoutp, &ts32, sizeof (ts32))) - return (set_errno(EFAULT)); - TIMESPEC32_TO_TIMESPEC(&ts, &ts32) - } - - if (itimerspecfix(&ts)) - return (set_errno(EINVAL)); - tsp = &ts; - } - - if (setp == NULL) - ksetp = NULL; - else { - if (copyin(setp, &set, sizeof (set))) - return (set_errno(EFAULT)); - sigutok(&set, &kset); - ksetp = &kset; - } - - return (poll_common(fds, nfds, tsp, ksetp)); -} - -/* * Clean up any state left around by poll(2). Called when a thread exits. */ void @@ -1317,8 +1361,8 @@ pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, * be OK too. */ ASSERT(curthread->t_pollcache == NULL); - error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, - &memphp, NULL); + error = VOP_POLL(fp->f_vnode, pollfdp->events | ps->ps_implicit_ev, 0, + &pollfdp->revents, &memphp, NULL); if (error) { return (error); } @@ -2028,7 +2072,8 @@ retry: * flag. 
*/ ASSERT(curthread->t_pollcache == NULL); - error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, + error = VOP_POLL(fp->f_vnode, + pollfdp[entry].events | ps->ps_implicit_ev, 0, &pollfdp[entry].revents, &php, NULL); /* * releasef after completely done with this cached @@ -2330,6 +2375,7 @@ pollstate_create() } else { ASSERT(ps->ps_depth == 0); ASSERT(ps->ps_flags == 0); + ASSERT(ps->ps_implicit_ev == 0); ASSERT(ps->ps_pc_stack[0] == 0); } return (ps); @@ -3067,7 +3113,7 @@ plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, php = NULL; ASSERT(curthread->t_pollcache == NULL); error = VOP_POLL(fp->f_vnode, - pollfdp[i].events, 0, + pollfdp[i].events | psp->ps_implicit_ev, 0, &pollfdp[i].revents, &php, NULL); if (error) { return (error); diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e0e63f4c0..09f3266ab4 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/syscall/stat.c b/usr/src/uts/common/syscall/stat.c index a37a86f05a..54dc532cd4 100644 --- a/usr/src/uts/common/syscall/stat.c +++ b/usr/src/uts/common/syscall/stat.c @@ -61,7 +61,7 @@ * to VOP_GETATTR */ -static int +int cstatat_getvp(int fd, char *name, int follow, vnode_t **vp, cred_t **cred) { vnode_t *startvp; diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index 9c50922b7e..96535fdd08 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2016 Joyent, Inc. + * Copyright 2017 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -172,44 +172,29 @@ sysconfig(int which) /* * If the non-global zone has a phys. memory cap, use that. * We always report the system-wide value for the global zone, - * even though rcapd can be used on the global zone too. + * even though memory capping can be used on the global zone + * too. */ - if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) - return (MIN(btop(curproc->p_zone->zone_phys_mcap), - physinstalled)); + if (!INGLOBALZONE(curproc)) { + pgcnt_t cap, free; + + zone_get_physmem_data(curzone->zone_id, &cap, &free); + return (MIN(cap, physinstalled)); + } return (physinstalled); case _CONFIG_AVPHYS_PAGES: /* - * If the non-global zone has a phys. memory cap, use - * the phys. memory cap - zone's current rss. We always - * report the system-wide value for the global zone, even - * though rcapd can be used on the global zone too. + * If the non-global zone has a phys. memory cap, use its + * free value. We always report the system-wide value for the + * global zone, even though memory capping can be used on the + * global zone too. */ - if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) { - pgcnt_t cap, rss, free; - vmusage_t in_use; - size_t cnt = 1; - - cap = btop(curproc->p_zone->zone_phys_mcap); - if (cap > physinstalled) - return (freemem); - - if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt, - FKIOCTL) != 0) - in_use.vmu_rss_all = 0; - rss = btop(in_use.vmu_rss_all); - /* - * Because rcapd implements a soft cap, it is possible - * for rss to be temporarily over the cap. 
- */ - if (cap > rss) - free = cap - rss; - else - free = 0; + if (!INGLOBALZONE(curproc)) { + pgcnt_t cap, free; + + zone_get_physmem_data(curzone->zone_id, &cap, &free); return (MIN(free, freemem)); } diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c index 858305504d..dfe7f22d44 100644 --- a/usr/src/uts/common/syscall/uadmin.c +++ b/usr/src/uts/common/syscall/uadmin.c @@ -22,7 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/param.h> @@ -78,7 +78,7 @@ volatile int fastreboot_dryrun = 0; * system with many zones. */ void -killall(zoneid_t zoneid) +killall(zoneid_t zoneid, boolean_t force) { proc_t *p; @@ -108,7 +108,7 @@ killall(zoneid_t zoneid) p->p_stat != SIDL && p->p_stat != SZOMB) { mutex_enter(&p->p_lock); - if (sigismember(&p->p_sig, SIGKILL)) { + if (!force && sigismember(&p->p_sig, SIGKILL)) { mutex_exit(&p->p_lock); p = p->p_next; } else { @@ -245,12 +245,13 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) */ zone_shutdown_global(); - killall(ALL_ZONES); + killall(ALL_ZONES, B_FALSE); /* * If we are calling kadmin() from a kernel context then we * do not release these resources. */ if (ttoproc(curthread) != &p0) { + mutex_enter(&curproc->p_lock); VN_RELE(PTOU(curproc)->u_cdir); if (PTOU(curproc)->u_rdir) VN_RELE(PTOU(curproc)->u_rdir); @@ -260,6 +261,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) PTOU(curproc)->u_cdir = rootdir; PTOU(curproc)->u_rdir = NULL; PTOU(curproc)->u_cwd = NULL; + mutex_exit(&curproc->p_lock); } /* diff --git a/usr/src/uts/common/syscall/umount.c b/usr/src/uts/common/syscall/umount.c index a2deedb163..b25f89b6d5 100644 --- a/usr/src/uts/common/syscall/umount.c +++ b/usr/src/uts/common/syscall/umount.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
+ * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -125,6 +126,7 @@ umount2(char *pathp, int flag) struct pathname pn; struct vfs *vfsp; int error; + boolean_t altroot; /* * Some flags are disallowed through the system call interface. @@ -154,9 +156,12 @@ umount2(char *pathp, int flag) * isn't in an environment with an alternate root (to the zone's root) * directory, i.e. chroot(2). */ - if (secpolicy_fs_unmount(CRED(), NULL) != 0 || - (PTOU(curproc)->u_rdir != NULL && - PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp) || + mutex_enter(&curproc->p_lock); + altroot = (PTOU(curproc)->u_rdir != NULL && + PTOU(curproc)->u_rdir != curproc->p_zone->zone_rootvp); + mutex_exit(&curproc->p_lock); + + if (secpolicy_fs_unmount(CRED(), NULL) != 0 || altroot || (vfsp = vfs_mntpoint2vfsp(pn.pn_path)) == NULL) { vnode_t *fsrootvp; diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h index a2509e7bb6..3735139068 100644 --- a/usr/src/uts/common/vm/hat.h +++ b/usr/src/uts/common/vm/hat.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *); * call. * * int hat_pageunload(pp, forceflag) - * unload all translations attached to pp. + * Unload all translations attached to pp. On x86 the bulk of the work is + * done by hat_page_inval. + * + * void hat_page_inval(pp, pgsz, curhat) + * Unload translations attached to pp. If curhat is provided, only the + * translation for that process is unloaded, otherwise all are unloaded. 
* * uint_t hat_pagesync(pp, flags) * get hw stats from hardware into page struct and reset hw stats @@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t); void hat_page_clrattr(struct page *, uint_t); uint_t hat_page_getattr(struct page *, uint_t); int hat_pageunload(struct page *, uint_t); +void hat_page_inval(struct page *, uint_t, struct hat *); uint_t hat_pagesync(struct page *, uint_t); ulong_t hat_page_getshare(struct page *); int hat_page_checkshare(struct page *, ulong_t); @@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t); */ #define HAT_ADV_PGUNLOAD 0x00 #define HAT_FORCE_PGUNLOAD 0x01 +#define HAT_CURPROC_PGUNLOAD 0x02 /* * Attributes for hat_page_*attr, hat_setstats and diff --git a/usr/src/uts/common/vm/page.h b/usr/src/uts/common/vm/page.h index b2f61429e9..bd17487be3 100644 --- a/usr/src/uts/common/vm/page.h +++ b/usr/src/uts/common/vm/page.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017, Joyent, Inc. * Copyright 2021 Oxide Computer Company */ @@ -230,6 +231,7 @@ struct as; * p_nrm * p_mapping * p_share + * p_zoneid * * The following field is file system dependent. 
How it is used and * the locking strategies applied are up to the individual file system @@ -528,9 +530,8 @@ typedef struct page { pfn_t p_pagenum; /* physical page number */ uint_t p_share; /* number of translations */ -#if defined(_LP64) - uint_t p_sharepad; /* pad for growing p_share */ -#endif + short p_zoneid; /* zone page use tracking */ + short p_pad1; /* TBD */ uint_t p_slckcnt; /* number of softlocks */ #if defined(__sparc) uint_t p_kpmref; /* number of kpm mapping sharers */ diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index e1b5b79b8d..75e3bf5acb 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -7319,7 +7319,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = svd->vpage; offset = svd->offset + (uintptr_t)(addr - seg->s_base); bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | - ((flags & MS_INVALIDATE) ? B_INVAL : 0); + ((flags & MS_INVALIDATE) ? B_INVAL : 0) | + ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0); if (attr) { pageprot = attr & ~(SHARED|PRIVATE); @@ -7344,11 +7345,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = &svd->vpage[seg_page(seg, addr)]; } else if (svd->vp && svd->amp == NULL && - (flags & MS_INVALIDATE) == 0) { + (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) { /* - * No attributes, no anonymous pages and MS_INVALIDATE flag - * is not on, just use one big request. + * No attributes, no anonymous pages and MS_INVAL* flags + * are not on, just use one big request. */ err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, bflags, svd->cred, NULL); @@ -7400,7 +7401,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) * might race in and lock the page after we unlock and before * we do the PUTPAGE, then PUTPAGE simply does nothing. 
*/ - if (flags & MS_INVALIDATE) { + if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) { if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c index c217270943..24e03795e6 100644 --- a/usr/src/uts/common/vm/vm_as.c +++ b/usr/src/uts/common/vm/vm_as.c @@ -57,6 +57,7 @@ #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/vtrace.h> +#include <sys/ddi.h> #include <vm/hat.h> #include <vm/as.h> @@ -71,6 +72,8 @@ clock_t deadlk_wait = 1; /* number of ticks to wait before retrying */ +ulong_t as_user_seg_limit = 0xffff; /* max segments in an (non-kas) AS */ + static struct kmem_cache *as_cache; static void as_setwatchprot(struct as *, caddr_t, size_t, uint_t); @@ -838,8 +841,6 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, int as_lock_held; klwp_t *lwp = ttolwp(curthread); - - retry: /* * Indicate that the lwp is not to be stopped while waiting for a @@ -1703,6 +1704,20 @@ as_map_locked(struct as *as, caddr_t addr, size_t size, segcreate_func_t crfp, p->p_rctls, p, RCA_UNSAFE_ALL); return (ENOMEM); } + + /* + * Keep the number of segments in a userspace AS constrained to + * a reasonable limit. Linux enforces a value slightly less + * than 64k in order to avoid ELF limits if/when a process + * dumps core. While SunOS avoids that specific problem with + * other tricks, the limit is still valuable to keep kernel + * memory consumption in check. 
+ */ + if (avl_numnodes(&as->a_segtree) >= as_user_seg_limit) { + AS_LOCK_EXIT(as); + atomic_inc_32(&p->p_zone->zone_mfseglim); + return (ENOMEM); + } } if (AS_MAP_CHECK_VNODE_LPOOB(crfp, argsp)) { diff --git a/usr/src/uts/common/vm/vm_page.c b/usr/src/uts/common/vm/vm_page.c index 134b3bcb33..cb99a6b92b 100644 --- a/usr/src/uts/common/vm/vm_page.c +++ b/usr/src/uts/common/vm/vm_page.c @@ -84,6 +84,7 @@ static pgcnt_t max_page_get; /* max page_get request size in pages */ pgcnt_t total_pages = 0; /* total number of pages (used by /proc) */ +uint64_t n_throttle = 0; /* num times page create throttled */ /* * freemem_lock protects all freemem variables: @@ -1477,6 +1478,8 @@ page_create_throttle(pgcnt_t npages, int flags) uint_t i; pgcnt_t tf; /* effective value of throttlefree */ + atomic_inc_64(&n_throttle); + /* * Normal priority allocations. */ @@ -1509,7 +1512,7 @@ page_create_throttle(pgcnt_t npages, int flags) tf = throttlefree - ((flags & PG_PUSHPAGE) ? pageout_reserve : 0); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); for (;;) { fm = 0; @@ -1596,7 +1599,7 @@ checkagain: } ASSERT(proc_pageout != NULL); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START, "page_create_sleep_start: freemem %ld needfree %ld", @@ -2243,7 +2246,7 @@ page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, if (nscan < desscan && freemem < minfree) { TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); } pp = rootpp; @@ -2372,7 +2375,7 @@ page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags, if (nscan < desscan && freemem < minfree) { TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); } /* diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c index aaad06792b..db8b86d6e6 
100644 --- a/usr/src/uts/common/vm/vm_pvn.c +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -431,7 +432,14 @@ pvn_write_done(page_t *plist, int flags) page_io_unlock(pp); page_unlock(pp); } - } else if (flags & B_INVAL) { + } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { + /* + * If B_INVALCURONLY is set, then we handle that case + * in the next conditional if hat_page_is_mapped() + * indicates that there are no additional mappings + * to the page. + */ + /* * XXX - Failed writes with B_INVAL set are * not handled appropriately. @@ -565,8 +573,9 @@ pvn_write_done(page_t *plist, int flags) } /* - * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, - * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE, + * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}. + * B_DELWRI indicates that this page is part of a kluster * operation and is only to be considered if it doesn't involve any * waiting here. B_TRUNC indicates that the file is being truncated * and so no i/o needs to be done. B_FORCE indicates that the page @@ -620,13 +629,17 @@ pvn_getdirty(page_t *pp, int flags) * If we want to free or invalidate the page then * we need to unload it so that anyone who wants * it will have to take a minor fault to get it. + * If we are only invalidating the page for the + * current process, then pass in a different flag. * Otherwise, we're just writing the page back so we * need to sync up the hardwre and software mod bit to * detect any future modifications. We clear the * software mod bit when we put the page on the dirty * list. 
*/ - if (flags & (B_INVAL | B_FREE)) { + if (flags & B_INVALCURONLY) { + (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD); + } else if (flags & (B_INVAL | B_FREE)) { (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); } else { (void) hat_pagesync(pp, HAT_SYNC_ZERORM); @@ -638,7 +651,7 @@ pvn_getdirty(page_t *pp, int flags) * list after all. */ page_io_unlock(pp); - if (flags & B_INVAL) { + if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_INVAL, 0, kcred); } else if (flags & B_FREE) { @@ -650,6 +663,9 @@ pvn_getdirty(page_t *pp, int flags) * of VOP_PUTPAGE() who prefer freeing the * page _only_ if no one else is accessing it. * E.g. segmap_release() + * We also take this path for B_INVALCURONLY and + * let page_release call VN_DISPOSE if no one else is + * using the page. * * The above hat_ismod() check is useless because: * (1) we may not be holding SE_EXCL lock; @@ -674,7 +690,7 @@ pvn_getdirty(page_t *pp, int flags) * We'll detect the fact that they used it when the * i/o is done and avoid freeing the page. */ - if (flags & B_FREE) + if (flags & (B_FREE | B_INVALCURONLY)) page_downgrade(pp); diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index 98ed21d059..88c694336d 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,6 +25,10 @@ */ /* + * Copyright 2018, Joyent, Inc. + */ + +/* * vm_usage * * This file implements the getvmusage() private system call. @@ -114,7 +118,7 @@ * For accurate counting of map-shared and COW-shared pages. * * - visited private anons (refcnt > 1) for each collective. - * (entity->vme_anon_hash) + * (entity->vme_anon) * For accurate counting of COW-shared pages. 
* * The common accounting structure is the vmu_entity_t, which represents @@ -152,6 +156,7 @@ #include <sys/vm_usage.h> #include <sys/zone.h> #include <sys/sunddi.h> +#include <sys/sysmacros.h> #include <sys/avl.h> #include <vm/anon.h> #include <vm/as.h> @@ -199,6 +204,14 @@ typedef struct vmu_object { } vmu_object_t; /* + * Node for tree of visited COW anons. + */ +typedef struct vmu_anon { + avl_node_t vma_node; + uintptr_t vma_addr; +} vmu_anon_t; + +/* * Entity by which to count results. * * The entity structure keeps the current rss/swap counts for each entity @@ -221,7 +234,7 @@ typedef struct vmu_entity { struct vmu_entity *vme_next_calc; mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ - mod_hash_t *vme_anon_hash; /* COW anons visited for entity */ + avl_tree_t vme_anon; /* COW anons visited for entity */ vmusage_t vme_result; /* identifies entity and results */ } vmu_entity_t; @@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2) } /* + * Comparison routine for our AVL tree of anon structures. + */ +static int +vmu_anon_cmp(const void *lhs, const void *rhs) +{ + const vmu_anon_t *l = lhs, *r = rhs; + + if (l->vma_addr == r->vma_addr) + return (0); + + if (l->vma_addr < r->vma_addr) + return (-1); + + return (1); +} + +/* * Save a bound on the free list. 
*/ static void @@ -363,13 +393,18 @@ static void vmu_free_entity(mod_hash_val_t val) { vmu_entity_t *entity = (vmu_entity_t *)val; + vmu_anon_t *anon; + void *cookie = NULL; if (entity->vme_vnode_hash != NULL) i_mod_hash_clear_nosync(entity->vme_vnode_hash); if (entity->vme_amp_hash != NULL) i_mod_hash_clear_nosync(entity->vme_amp_hash); - if (entity->vme_anon_hash != NULL) - i_mod_hash_clear_nosync(entity->vme_anon_hash); + + while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL) + kmem_free(anon, sizeof (vmu_anon_t)); + + avl_destroy(&entity->vme_anon); entity->vme_next = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = entity; @@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid) "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); - if (entity->vme_anon_hash == NULL) - entity->vme_anon_hash = mod_hash_create_ptrhash( - "vmusage anon hash", VMUSAGE_HASH_SIZE, - mod_hash_null_valdtor, sizeof (struct anon)); + VERIFY(avl_first(&entity->vme_anon) == NULL); + + avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon), + offsetof(struct vmu_anon, vma_node)); entity->vme_next = vmu_data.vmu_entities; vmu_data.vmu_entities = entity; @@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id) zone->vmz_id = id; - if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + if ((vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0) zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | @@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) } static int -vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +vmu_find_insert_anon(vmu_entity_t *entity, void *key) { - int ret; - caddr_t val; + vmu_anon_t anon, *ap; - ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t *)&val); + anon.vma_addr = (uintptr_t)key; - if (ret == 0) + if 
(avl_find(&entity->vme_anon, &anon, NULL) != NULL) return (0); - ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t)key, (mod_hash_hndl_t)0); + ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP); + ap->vma_addr = (uintptr_t)key; - ASSERT(ret == 0); + avl_add(&entity->vme_anon, ap); return (1); } @@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, next = AVL_NEXT(tree, next); continue; } + + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, if (ap != NULL && vn != NULL && vn->v_pages != NULL && (page = page_exists(vn, off)) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, continue; } + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, if (vnode->v_pages != NULL && (page = page_exists(vnode, ptob(index))) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ 
-1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1306,6 +1353,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) } /* + * Pages on the free list aren't counted for the rss. + */ + if (PP_ISFREE(page)) + continue; + + /* * Assume anon structs with a refcnt * of 1 are not COW shared, so there * is no reason to track them per entity. @@ -1322,8 +1375,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) * Track COW anons per entity so * they are not double counted. */ - if (vmu_find_insert_anon(entity->vme_anon_hash, - (caddr_t)ap) == 0) + if (vmu_find_insert_anon(entity, ap) == 0) continue; result->vmu_rss_all += (pgcnt << PAGESHIFT); @@ -1463,8 +1515,9 @@ vmu_calculate_proc(proc_t *p) entities = tmp; } if (vmu_data.vmu_calc_flags & - (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | - VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE | + VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, @@ -1597,8 +1650,7 @@ vmu_free_extra() mod_hash_destroy_hash(te->vme_vnode_hash); if (te->vme_amp_hash != NULL) mod_hash_destroy_hash(te->vme_amp_hash); - if (te->vme_anon_hash != NULL) - mod_hash_destroy_hash(te->vme_anon_hash); + VERIFY(avl_first(&te->vme_anon) == NULL); kmem_free(te, sizeof (vmu_entity_t)); } while (vmu_data.vmu_free_zones != NULL) { @@ -1619,13 +1671,42 @@ vmu_free_extra() extern kcondvar_t *pr_pid_cv; +static void +vmu_get_zone_rss(zoneid_t zid) +{ + vmu_zone_t *zone; + zone_t *zp; + int ret; + uint_t pgcnt; + + if ((zp = 
zone_find_by_id(zid)) == NULL) + return; + + ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)zid, (mod_hash_val_t *)&zone); + if (ret != 0) { + zone = vmu_alloc_zone(zid); + ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash, + (mod_hash_key_t)(uintptr_t)zid, + (mod_hash_val_t)zone, (mod_hash_hndl_t)0); + ASSERT(ret == 0); + } + + ASSERT(zid >= 0 && zid <= MAX_ZONEID); + pgcnt = zone_pdata[zid].zpers_pg_cnt; + zone->vmz_zone->vme_result.vmu_rss_all = (size_t)ptob(pgcnt); + zone->vmz_zone->vme_result.vmu_swap_all = zp->zone_max_swap; + + zone_rele(zp); +} + /* * Determine which entity types are relevant and allocate the hashes to - * track them. Then walk the process table and count rss and swap - * for each process'es address space. Address space object such as - * vnodes, amps and anons are tracked per entity, so that they are - * not double counted in the results. - * + * track them. First get the zone rss using the data we already have. Then, + * if necessary, walk the process table and count rss and swap for each + * process's address space. Address space object such as vnodes, amps and + * anons are tracked per entity, so that they are not double counted in the + * results. */ static void vmu_calculate() @@ -1633,6 +1714,7 @@ vmu_calculate() int i = 0; int ret; proc_t *p; + uint_t zone_flags = 0; vmu_clear_calc(); @@ -1640,9 +1722,34 @@ vmu_calculate() vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, ALL_ZONES); + zone_flags = vmu_data.vmu_calc_flags & VMUSAGE_ZONE_FLAGS; + if (zone_flags != 0) { + /* + * Use the accurate zone RSS data we already keep track of. + */ + int i; + + for (i = 0; i <= MAX_ZONEID; i++) { + if (zone_pdata[i].zpers_pg_cnt > 0) { + vmu_get_zone_rss(i); + } + } + } + + /* If only needed zone data, we're done.
*/ + if ((vmu_data.vmu_calc_flags & ~VMUSAGE_ZONE_FLAGS) == 0) { + return; + } + + DTRACE_PROBE(vmu__calculate__all); + vmu_data.vmu_calc_flags &= ~VMUSAGE_ZONE_FLAGS; + /* * Walk process table and calculate rss of each proc. * + * Since we already obtained all zone rss above, the following loop + * executes with the VMUSAGE_ZONE_FLAGS cleared. + * * Pidlock and p_lock cannot be held while doing the rss calculation. * This is because: * 1. The calculation allocates using KM_SLEEP. @@ -1697,6 +1804,12 @@ again: mutex_exit(&pidlock); vmu_free_extra(); + + /* + * Restore any caller-supplied zone flags we blocked during + * the process-table walk. + */ + vmu_data.vmu_calc_flags |= zone_flags; } /* @@ -1747,7 +1860,7 @@ vmu_cache_rele(vmu_cache_t *cache) */ static int vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, - uint_t flags, int cpflg) + uint_t flags, id_t req_zone_id, int cpflg) { vmusage_t *result, *out_result; vmusage_t dummy; @@ -1766,7 +1879,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, /* figure out what results the caller is interested in. 
*/ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) types |= VMUSAGE_SYSTEM; - if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) types |= VMUSAGE_ZONE; if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) @@ -1829,26 +1942,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, continue; } - /* Skip "other zone" results if not requested */ - if (result->vmu_zoneid != curproc->p_zone->zone_id) { - if (result->vmu_type == VMUSAGE_ZONE && - (flags & VMUSAGE_ALL_ZONES) == 0) - continue; - if (result->vmu_type == VMUSAGE_PROJECTS && - (flags & (VMUSAGE_ALL_PROJECTS | - VMUSAGE_COL_PROJECTS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_TASKS && - (flags & VMUSAGE_ALL_TASKS) == 0) - continue; - if (result->vmu_type == VMUSAGE_RUSERS && - (flags & (VMUSAGE_ALL_RUSERS | - VMUSAGE_COL_RUSERS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_EUSERS && - (flags & (VMUSAGE_ALL_EUSERS | - VMUSAGE_COL_EUSERS)) == 0) + if (result->vmu_type == VMUSAGE_ZONE && + flags & VMUSAGE_A_ZONE) { + /* Skip non-requested zone results */ + if (result->vmu_zoneid != req_zone_id) continue; + } else { + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid != curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } } count++; if (out_result != NULL) { @@ -1904,10 +2024,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t 
*nres, int cpflg) int cacherecent = 0; hrtime_t now; uint_t flags_orig; + id_t req_zone_id; /* * Non-global zones cannot request system wide and/or collated - * results, or the system result, so munge the flags accordingly. + * results, or the system result, or usage of another zone, so munge + * the flags accordingly. */ flags_orig = flags; if (curproc->p_zone != global_zone) { @@ -1927,6 +2049,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) flags &= ~VMUSAGE_SYSTEM; flags |= VMUSAGE_ZONE; } + if (flags & VMUSAGE_A_ZONE) { + flags &= ~VMUSAGE_A_ZONE; + flags |= VMUSAGE_ZONE; + } } /* Check for unknown flags */ @@ -1937,6 +2063,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) if ((flags & VMUSAGE_MASK) == 0) return (set_errno(EINVAL)); + /* If requesting results for a specific zone, get the zone ID */ + if (flags & VMUSAGE_A_ZONE) { + size_t bufsize; + vmusage_t zreq; + + if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) + return (set_errno(EFAULT)); + /* Requested zone ID is passed in buf, so 0 len not allowed */ + if (bufsize == 0) + return (set_errno(EINVAL)); + if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg)) + return (set_errno(EFAULT)); + req_zone_id = zreq.vmu_id; + } + mutex_enter(&vmu_data.vmu_lock); now = gethrtime(); @@ -1956,7 +2097,7 @@ start: mutex_exit(&vmu_data.vmu_lock); ret = vmu_copyout_results(cache, buf, nres, flags_orig, - cpflg); + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); if (vmu_data.vmu_pending_waiters > 0) @@ -2013,7 +2154,8 @@ start: mutex_exit(&vmu_data.vmu_lock); /* copy cache */ - ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); + ret = vmu_copyout_results(cache, buf, nres, flags_orig, + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); mutex_exit(&vmu_data.vmu_lock); diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index 
bed1885700..a398d741dc 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -24,6 +24,7 @@ # # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2013 Andrew Stormont. All rights reserved. +# Copyright 2019 Joyent, Inc. # Copyright 2019 OmniOS Community Edition (OmniOSce) Association. # Copyright 2019 Joyent, Inc. # diff --git a/usr/src/uts/i86pc/dboot/dboot_printf.c b/usr/src/uts/i86pc/dboot/dboot_printf.c index 9d02c1943a..59d4e247f0 100644 --- a/usr/src/uts/i86pc/dboot/dboot_printf.c +++ b/usr/src/uts/i86pc/dboot/dboot_printf.c @@ -203,6 +203,10 @@ unsigned_num: dboot_putnum(x, B_FALSE, base); break; + case 'z': + size = sizeof (size_t); + goto again; + default: dboot_puts("dboot_printf(): unknown % escape\n"); } diff --git a/usr/src/uts/i86pc/dboot/dboot_printf.h b/usr/src/uts/i86pc/dboot/dboot_printf.h index 22cf561e51..94b3db92e7 100644 --- a/usr/src/uts/i86pc/dboot/dboot_printf.h +++ b/usr/src/uts/i86pc/dboot/dboot_printf.h @@ -22,32 +22,29 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2020 Joyent, Inc. */ #ifndef _DBOOT_PRINTF_H #define _DBOOT_PRINTF_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif /* - * Very primitive printf. This only understands the following simple formats: - * %%, %c, %s, %d, %ld, %lld, %x, %lx, %llx, %p + * Very primitive printf. We mark this as PRINTFLIKE so we can use %z */ -/*PRINTFLIKE1*/ extern void dboot_printf(char *fmt, ...) - __KPRINTFLIKE(1); + __PRINTFLIKE(1); /* * Primitive version of panic, prints a message, waits for a keystroke, * then resets the system */ -/*PRINTFLIKE1*/ extern void dboot_panic(char *fmt, ...) 
- __KPRINTFLIKE(1); + __NORETURN __PRINTFLIKE(1); #ifdef __cplusplus diff --git a/usr/src/uts/i86pc/dboot/dboot_startkern.c b/usr/src/uts/i86pc/dboot/dboot_startkern.c index 6621356133..6654244be2 100644 --- a/usr/src/uts/i86pc/dboot/dboot_startkern.c +++ b/usr/src/uts/i86pc/dboot/dboot_startkern.c @@ -75,6 +75,10 @@ extern int have_cpuid(void); #define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2) +#define ULL(v) ((u_longlong_t)(v)) + +static void *page_alloc(void); + /* * This file contains code that runs to transition us from either a multiboot * compliant loader (32 bit non-paging) or a XPV domain loader to @@ -105,7 +109,10 @@ x86pte_t pte_bits = PT_VALID | PT_REF | PT_WRITABLE | PT_MOD | PT_NOCONSIST; * virtual address. */ paddr_t ktext_phys; -uint32_t ksize = 2 * FOUR_MEG; /* kernel nucleus is 8Meg */ +/* + * Nucleus size is 8Mb, including text, data, and BSS. + */ +uint32_t ksize = 2 * FOUR_MEG; static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */ @@ -115,9 +122,16 @@ static uint64_t target_kernel_text; /* value to use for KERNEL_TEXT */ char stack_space[STACK_SIZE]; /* - * Used to track physical memory allocation + * The highest address we build page tables for. */ -static paddr_t next_avail_addr = 0; +static paddr_t boot_map_end; + +/* + * The dboot allocator. This is a small area we use for allocating the + * kernel nucleus and pages for the identity page tables we build here. + */ +static paddr_t alloc_addr; +static paddr_t alloc_end; #if defined(__xpv) /* @@ -127,7 +141,6 @@ static paddr_t next_avail_addr = 0; * to derive a pfn from a pointer, you subtract mfn_base. 
*/ -static paddr_t scratch_end = 0; /* we can't write all of mem here */ static paddr_t mfn_base; /* addr corresponding to mfn_list[0] */ start_info_t *xen_info; @@ -233,6 +246,12 @@ uint_t map_debug = 0; static char noname[2] = "-"; +static boolean_t +ranges_intersect(uint64_t s1, uint64_t e1, uint64_t s2, uint64_t e2) +{ + return (s1 < e2 && e1 >= s2); +} + /* * Either hypervisor-specific or grub-specific code builds the initial * memlists. This code does the sort/merge/link for final use. @@ -288,8 +307,16 @@ sort_physinstall(void) if (prom_debug) { dboot_printf("\nFinal memlists:\n"); for (i = 0; i < memlists_used; ++i) { - dboot_printf("\t%d: addr=%" PRIx64 " size=%" - PRIx64 "\n", i, memlists[i].addr, memlists[i].size); + dboot_printf("\t%d: 0x%llx-0x%llx size=0x%llx\n", + i, ULL(memlists[i].addr), ULL(memlists[i].addr + + memlists[i].size), ULL(memlists[i].size)); + } + + dboot_printf("\nBoot modules:\n"); + for (i = 0; i < bi->bi_module_cnt; i++) { + dboot_printf("\t%d: 0x%llx-0x%llx size=0x%llx\n", + i, ULL(modules[i].bm_addr), ULL(modules[i].bm_addr + + modules[i].bm_size), ULL(modules[i].bm_size)); } } @@ -341,6 +368,8 @@ dboot_halt(void) while (--i) (void) HYPERVISOR_yield(); (void) HYPERVISOR_shutdown(SHUTDOWN_poweroff); + for (;;) + ; } /* @@ -427,7 +456,7 @@ set_pteval(paddr_t table, uint_t index, uint_t level, x86pte_t pteval) paddr_t make_ptable(x86pte_t *pteval, uint_t level) { - paddr_t new_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); + paddr_t new_table = (paddr_t)(uintptr_t)page_alloc(); if (level == top_level && level == 2) *pteval = pa_to_ma((uintptr_t)new_table) | PT_VALID; @@ -659,18 +688,6 @@ exclude_from_pci(uint64_t start, uint64_t end) } } -/* - * During memory allocation, find the highest address not used yet. 
- */ -static void -check_higher(paddr_t a) -{ - if (a < next_avail_addr) - return; - next_avail_addr = RNDUP(a + 1, MMU_PAGESIZE); - DBG(next_avail_addr); -} - static int dboot_loader_mmap_entries(void) { @@ -687,7 +704,6 @@ dboot_loader_mmap_entries(void) DBG(mb_info->mmap_addr); DBG(mb_info->mmap_length); - check_higher(mb_info->mmap_addr + mb_info->mmap_length); for (mmap_addr = mb_info->mmap_addr; mmap_addr < mb_info->mmap_addr + @@ -894,17 +910,13 @@ build_pcimemlists(void) } #if defined(__xpv) -/* - * Initialize memory allocator stuff from hypervisor-supplied start info. - */ static void -init_mem_alloc(void) +init_dboot_alloc(void) { int local; /* variables needed to find start region */ - paddr_t scratch_start; xen_memory_map_t map; - DBG_MSG("Entered init_mem_alloc()\n"); + DBG_MSG("Entered init_dboot_alloc()\n"); /* * Free memory follows the stack. There's at least 512KB of scratch @@ -913,17 +925,17 @@ init_mem_alloc(void) * allocated last and will be outside the addressible range. We'll * switch to new page tables before we unpack the kernel */ - scratch_start = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE); - DBG(scratch_start); - scratch_end = RNDUP((paddr_t)scratch_start + 512 * 1024, TWO_MEG); - DBG(scratch_end); + alloc_addr = RNDUP((paddr_t)(uintptr_t)&local, MMU_PAGESIZE); + DBG(alloc_addr); + alloc_end = RNDUP((paddr_t)alloc_addr + 512 * 1024, TWO_MEG); + DBG(alloc_end); /* * For paranoia, leave some space between hypervisor data and ours. * Use 500 instead of 512. 
*/ - next_avail_addr = scratch_end - 500 * 1024; - DBG(next_avail_addr); + alloc_addr = alloc_end - 500 * 1024; + DBG(alloc_addr); /* * The domain builder gives us at most 1 module @@ -1271,7 +1283,6 @@ process_module(int midx) char *cmdline = dboot_multiboot_modcmdline(midx); char *p, *q; - check_higher(mod_end); if (prom_debug) { dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n", midx, cmdline, (ulong_t)mod_start, (ulong_t)mod_end); @@ -1435,7 +1446,6 @@ static void dboot_process_modules(void) { int i, modcount; - extern char _end[]; DBG_MSG("\nFinding Modules\n"); modcount = dboot_multiboot_modcount(); @@ -1443,11 +1453,11 @@ dboot_process_modules(void) dboot_panic("Too many modules (%d) -- the maximum is %d.", modcount, MAX_BOOT_MODULES); } + /* * search the modules to find the last used address * we'll build the module list while we're walking through here */ - check_higher((paddr_t)(uintptr_t)&_end); for (i = 0; i < modcount; ++i) { process_module(i); modules_used++; @@ -1462,6 +1472,80 @@ dboot_process_modules(void) check_images(); } +#define CORRUPT_REGION_START 0xc700000 +#define CORRUPT_REGION_SIZE 0x100000 +#define CORRUPT_REGION_END (CORRUPT_REGION_START + CORRUPT_REGION_SIZE) + +static void +dboot_add_memlist(uint64_t start, uint64_t end) +{ + if (end > max_mem) + max_mem = end; + + /* + * Well, this is sad. On some systems, there is a region of memory that + * can be corrupted until some number of seconds after we have booted. + * And the BIOS doesn't tell us that this memory is unsafe to use. And + * we don't know how long it's dangerous. So we'll chop out this range + * from any memory list that would otherwise be usable. Note that any + * system of this type will give us the new-style (0x40) memlist, so we + * need not fix up the other path below. 
+ * + * However, if we're boot-loaded from something that doesn't have a + * RICHMOND-16 workaround (which on many systems is just fine), it could + * actually use this region for the boot modules; if we remove it from + * the memlist, we'll keel over when trying to access the region. + * + * So, if we see that a module intersects the region, we presume it's + * OK. + */ + + if (find_boot_prop("disable-RICHMOND-16") != NULL) + goto out; + + for (uint32_t i = 0; i < bi->bi_module_cnt; i++) { + native_ptr_t mod_start = modules[i].bm_addr; + native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size; + + if (ranges_intersect(mod_start, mod_end, CORRUPT_REGION_START, + CORRUPT_REGION_END)) { + if (prom_debug) { + dboot_printf("disabling RICHMOND-16 workaround " + "due to module #%d: " + "name %s addr %lx size %lx\n", + i, (char *)(uintptr_t)modules[i].bm_name, + (ulong_t)modules[i].bm_addr, + (ulong_t)modules[i].bm_size); + } + goto out; + } + } + + if (start < CORRUPT_REGION_START && end > CORRUPT_REGION_START) { + memlists[memlists_used].addr = start; + memlists[memlists_used].size = + CORRUPT_REGION_START - start; + ++memlists_used; + if (end > CORRUPT_REGION_END) + start = CORRUPT_REGION_END; + else + return; + } + + if (start >= CORRUPT_REGION_START && start < CORRUPT_REGION_END) { + if (end <= CORRUPT_REGION_END) + return; + start = CORRUPT_REGION_END; + } + +out: + memlists[memlists_used].addr = start; + memlists[memlists_used].size = end - start; + ++memlists_used; + if (memlists_used > MAX_MEMLIST) + dboot_panic("too many memlists"); +} + /* * We then build the phys_install memlist from the multiboot information. 
*/ @@ -1505,13 +1589,7 @@ dboot_process_mmap(void) */ switch (type) { case 1: - if (end > max_mem) - max_mem = end; - memlists[memlists_used].addr = start; - memlists[memlists_used].size = end - start; - ++memlists_used; - if (memlists_used > MAX_MEMLIST) - dboot_panic("too many memlists"); + dboot_add_memlist(start, end); break; case 2: rsvdmemlists[rsvdmemlists_used].addr = start; @@ -1593,21 +1671,15 @@ dboot_multiboot1_highest_addr(void) return (addr); } -static void +static uint64_t dboot_multiboot_highest_addr(void) { - paddr_t addr; - switch (multiboot_version) { case 1: - addr = dboot_multiboot1_highest_addr(); - if (addr != (paddr_t)(uintptr_t)NULL) - check_higher(addr); + return (dboot_multiboot1_highest_addr()); break; case 2: - addr = dboot_multiboot2_highest_addr(mb2_info); - if (addr != (paddr_t)(uintptr_t)NULL) - check_higher(addr); + return (dboot_multiboot2_highest_addr(mb2_info)); break; default: dboot_panic("Unknown multiboot version: %d\n", @@ -1617,15 +1689,97 @@ dboot_multiboot_highest_addr(void) } /* - * Walk the boot loader provided information and find the highest free address. + * Set up our simple physical memory allocator. This is used to allocate both + * the kernel nucleus (ksize) and our page table pages. + * + * We need to find a contiguous region in the memlists that is below 4Gb (as + * we're 32-bit and need to use the addresses), and isn't otherwise in use by + * dboot, multiboot allocations, or boot modules. The memlist is sorted and + * merged by this point. + * + * Historically, this code always did the allocations past the end of the + * highest used address, even if there was space below. For reasons unclear, if + * we don't do this, then we get massive corruption during early kernel boot. + * + * Note that find_kalloc_start() starts its search at the end of this + * allocation. 
+ * + * This all falls apart horribly on some EFI systems booting under iPXE, where + * we end up with boot module allocation such that there is no room between the + * highest used address and our 4Gb limit. To that end, we have an iPXE hack + * that limits the maximum address used by its allocations in an attempt to give + * us room. */ static void -init_mem_alloc(void) +init_dboot_alloc(void) { - DBG_MSG("Entered init_mem_alloc()\n"); + extern char _end[]; + + DBG_MSG("Entered init_dboot_alloc()\n"); + dboot_process_modules(); dboot_process_mmap(); - dboot_multiboot_highest_addr(); + + size_t align = FOUR_MEG; + + /* + * We need enough alloc space for the nucleus memory... + */ + size_t size = RNDUP(ksize, align); + + /* + * And enough page table pages to cover potentially 4Gb. Each leaf PT + * covers 2Mb, so we need a maximum of 2048 pages for those. Next level + * up each covers 1Gb, and so on, so we'll just add a little slop (which + * gets aligned up anyway). + */ + size += RNDUP(MMU_PAGESIZE * (2048 + 256), align); + + uint64_t start = MAX(dboot_multiboot_highest_addr(), + (paddr_t)(uintptr_t)&_end); + start = RNDUP(start, align); + + /* + * As mentioned above, only start our search after all the boot modules. + */ + for (uint_t i = 0; i < bi->bi_module_cnt; i++) { + native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size; + + start = MAX(start, RNDUP(mod_end, MMU_PAGESIZE)); + } + + uint64_t end = start + size; + + DBG(start); + DBG(end); + + for (uint_t i = 0; i < memlists_used; i++) { + uint64_t ml_start = memlists[i].addr; + uint64_t ml_end = memlists[i].addr + memlists[i].size; + + /* + * If we're past our starting point for search, begin at this + * memlist. 
+ */ + if (start < ml_start) { + start = RNDUP(ml_start, align); + end = start + size; + } + + if (end >= (uint64_t)UINT32_MAX) { + dboot_panic("couldn't find alloc space below 4Gb"); + } + + if (end < ml_end) { + alloc_addr = start; + alloc_end = end; + DBG(alloc_addr); + DBG(alloc_end); + return; + } + } + + dboot_panic("couldn't find alloc space in memlists"); } static int @@ -1869,77 +2023,89 @@ print_efi64(EFI_SYSTEM_TABLE64 *efi) #endif /* !__xpv */ /* - * Simple memory allocator, allocates aligned physical memory. - * Note that startup_kernel() only allocates memory, never frees. - * Memory usage just grows in an upward direction. + * Simple memory allocator for aligned physical memory from the area provided by + * init_dboot_alloc(). This is a simple bump allocator, and it's never directly + * freed by dboot. */ static void * -do_mem_alloc(uint32_t size, uint32_t align) +dboot_alloc(uint32_t size, uint32_t align) { - uint_t i; - uint64_t best; - uint64_t start; - uint64_t end; + uint32_t start = RNDUP(alloc_addr, align); - /* - * make sure size is a multiple of pagesize - */ size = RNDUP(size, MMU_PAGESIZE); - next_avail_addr = RNDUP(next_avail_addr, align); - /* - * XXPV fixme joe - * - * a really large bootarchive that causes you to run out of memory - * may cause this to blow up - */ - /* LINTED E_UNEXPECTED_UINT_PROMOTION */ - best = (uint64_t)-size; - for (i = 0; i < memlists_used; ++i) { - start = memlists[i].addr; -#if defined(__xpv) - start += mfn_base; -#endif - end = start + memlists[i].size; + if (start + size > alloc_end) { + dboot_panic("%s: couldn't allocate 0x%x bytes aligned 0x%x " + "alloc_addr = 0x%llx, alloc_end = 0x%llx", __func__, + size, align, (u_longlong_t)alloc_addr, + (u_longlong_t)alloc_end); + } - /* - * did we find the desired address? - */ - if (start <= next_avail_addr && next_avail_addr + size <= end) { - best = next_avail_addr; - goto done; - } + alloc_addr = start + size; - /* - * if not is this address the best so far? 
- */ - if (start > next_avail_addr && start < best && - RNDUP(start, align) + size <= end) - best = RNDUP(start, align); + if (map_debug) { + dboot_printf("%s(0x%x, 0x%x) = 0x%x\n", __func__, size, + align, start); } - /* - * We didn't find exactly the address we wanted, due to going off the - * end of a memory region. Return the best found memory address. - */ -done: - next_avail_addr = best + size; -#if defined(__xpv) - if (next_avail_addr > scratch_end) - dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: " - "0x%lx", (ulong_t)next_avail_addr, - (ulong_t)scratch_end); -#endif - (void) memset((void *)(uintptr_t)best, 0, size); - return ((void *)(uintptr_t)best); + (void) memset((void *)(uintptr_t)start, 0, size); + return ((void *)(uintptr_t)start); } -void * -mem_alloc(uint32_t size) +static void * +page_alloc(void) { - return (do_mem_alloc(size, MMU_PAGESIZE)); + return (dboot_alloc(MMU_PAGESIZE, MMU_PAGESIZE)); } +/* + * This is where we tell the kernel to start physical allocations from, beyond + * the end of our allocation area and all boot modules. It might be beyond 4Gb, + * so we can't touch that area ourselves. + * + * We might set kalloc_start to the end of a memlist; if so make sure we skip it + * along to the next one. + * + * This is making the massive assumption that there is a suitably large area for + * kernel allocations past the end of the last boot module and the dboot + * allocated region. Worse, we don't have a simple way to assert that is so. 
+ */ +static paddr_t +find_kalloc_start(void) +{ + paddr_t kalloc_start = alloc_end; + uint_t i; + + for (i = 0; i < bi->bi_module_cnt; i++) { + native_ptr_t mod_end = modules[i].bm_addr + modules[i].bm_size; + + kalloc_start = MAX(kalloc_start, RNDUP(mod_end, MMU_PAGESIZE)); + } + + boot_map_end = kalloc_start; + DBG(boot_map_end); + + for (i = 0; i < memlists_used; i++) { + uint64_t ml_start = memlists[i].addr; + uint64_t ml_end = memlists[i].addr + memlists[i].size; + + if (kalloc_start >= ml_end) + continue; + + if (kalloc_start < ml_start) + kalloc_start = ml_start; + break; + } + + if (i == memlists_used) { + dboot_panic("fell off the end of memlists finding a " + "kalloc_start value > 0x%llx", (u_longlong_t)kalloc_start); + } + + DBG(kalloc_start); + + return (kalloc_start); +} /* * Build page tables to map all of memory used so far as well as the kernel. @@ -1962,7 +2128,7 @@ build_page_tables(void) #if defined(__xpv) top_page_table = (paddr_t)(uintptr_t)xen_info->pt_base; #else /* __xpv */ - top_page_table = (paddr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); + top_page_table = (paddr_t)(uintptr_t)page_alloc(); #endif /* __xpv */ DBG((uintptr_t)top_page_table); @@ -1988,7 +2154,7 @@ build_page_tables(void) /* * The kernel will need a 1 page window to work with page tables */ - bi->bi_pt_window = (native_ptr_t)(uintptr_t)mem_alloc(MMU_PAGESIZE); + bi->bi_pt_window = (native_ptr_t)(uintptr_t)page_alloc(); DBG(bi->bi_pt_window); bi->bi_pte_to_pt_window = (native_ptr_t)(uintptr_t)find_pte(bi->bi_pt_window, NULL, 0, 0); @@ -2029,6 +2195,10 @@ build_page_tables(void) #if !defined(__xpv) + /* + * Map every valid memlist address up until boot_map_end: this will + * cover at least our alloc region and all boot modules. 
+ */ for (i = 0; i < memlists_used; ++i) { start = memlists[i].addr; end = start + memlists[i].size; @@ -2036,11 +2206,11 @@ build_page_tables(void) if (map_debug) dboot_printf("1:1 map pa=%" PRIx64 "..%" PRIx64 "\n", start, end); - while (start < end && start < next_avail_addr) { + while (start < end && start < boot_map_end) { map_pa_at_va(start, start, 0); start += MMU_PAGESIZE; } - if (start >= next_avail_addr) + if (start >= boot_map_end) break; } @@ -2302,7 +2472,9 @@ startup_kernel(void) /* * Need correct target_kernel_text value */ +#if defined(_BOOT_TARGET_amd64) target_kernel_text = KERNEL_TEXT; +#endif DBG(target_kernel_text); #if defined(__xpv) @@ -2462,7 +2634,7 @@ startup_kernel(void) /* * initialize the simple memory allocator */ - init_mem_alloc(); + init_dboot_alloc(); #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64) /* @@ -2516,7 +2688,7 @@ startup_kernel(void) * For grub, copy kernel bits from the ELF64 file to final place. */ DBG_MSG("\nAllocating nucleus pages.\n"); - ktext_phys = (uintptr_t)do_mem_alloc(ksize, FOUR_MEG); + ktext_phys = (uintptr_t)dboot_alloc(ksize, FOUR_MEG); if (ktext_phys == 0) dboot_panic("failed to allocate aligned kernel memory"); @@ -2527,6 +2699,8 @@ startup_kernel(void) DBG(ktext_phys); + paddr_t kalloc_start = find_kalloc_start(); + /* * Allocate page tables. 
*/ @@ -2544,18 +2718,18 @@ startup_kernel(void) #if defined(__xpv) - bi->bi_next_paddr = next_avail_addr - mfn_base; + bi->bi_next_paddr = kalloc_start - mfn_base; DBG(bi->bi_next_paddr); - bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr; + bi->bi_next_vaddr = (native_ptr_t)kalloc_start; DBG(bi->bi_next_vaddr); /* * unmap unused pages in start area to make them available for DMA */ - while (next_avail_addr < scratch_end) { - (void) HYPERVISOR_update_va_mapping(next_avail_addr, + while (alloc_addr < alloc_end) { + (void) HYPERVISOR_update_va_mapping(alloc_addr, 0, UVMF_INVLPG | UVMF_LOCAL); - next_avail_addr += MMU_PAGESIZE; + alloc_addr += MMU_PAGESIZE; } bi->bi_xen_start_info = (native_ptr_t)(uintptr_t)xen_info; @@ -2565,9 +2739,9 @@ startup_kernel(void) #else /* __xpv */ - bi->bi_next_paddr = next_avail_addr; + bi->bi_next_paddr = kalloc_start; DBG(bi->bi_next_paddr); - bi->bi_next_vaddr = (native_ptr_t)(uintptr_t)next_avail_addr; + bi->bi_next_vaddr = (native_ptr_t)kalloc_start; DBG(bi->bi_next_vaddr); bi->bi_mb_version = multiboot_version; diff --git a/usr/src/uts/i86pc/dboot/dboot_xboot.h b/usr/src/uts/i86pc/dboot/dboot_xboot.h index 7d0876c79c..f261f3f2b1 100644 --- a/usr/src/uts/i86pc/dboot/dboot_xboot.h +++ b/usr/src/uts/i86pc/dboot/dboot_xboot.h @@ -22,6 +22,8 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2020 Joyent, Inc. 
*/ #ifndef _DBOOT_XBOOT_H @@ -52,16 +54,14 @@ extern uint_t prom_debug; #define DBG_MSG(s) do { if (prom_debug) \ dboot_printf(s); \ - _NOTE(CONSTANTCONDITION) \ } while (0) -#define DBG(x) do { if (prom_debug) { \ - dboot_printf("%s is 0x%" PRIx64 "\n", #x, (uint64_t)(x)); \ - _NOTE(CONSTANTCONDITION) \ +#define DBG(x) do { if (prom_debug) { \ + dboot_printf("%s: %s is 0x%" PRIx64 "\n", \ + __func__, #x, (uint64_t)(x)); \ } } while (0) -extern void dboot_halt(void); -extern void *mem_alloc(uint32_t size); +extern void dboot_halt(void) __NORETURN; #define RNDUP(x, y) (((x) + ((y) - 1ul)) & ~((y) - 1ul)) diff --git a/usr/src/uts/i86pc/io/mp_platform_common.c b/usr/src/uts/i86pc/io/mp_platform_common.c index bff745b483..54a0ac3506 100644 --- a/usr/src/uts/i86pc/io/mp_platform_common.c +++ b/usr/src/uts/i86pc/io/mp_platform_common.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. + * Copyright 2017 Joyent, Inc. * Copyright (c) 2017 by Delphix. All rights reserved. * Copyright 2020 Joyent, Inc. * Copyright 2020 RackTop Systems, Inc. diff --git a/usr/src/uts/i86pc/io/psm/psm_common.c b/usr/src/uts/i86pc/io/psm/psm_common.c index b59d87bdcc..623c6e5617 100644 --- a/usr/src/uts/i86pc/io/psm/psm_common.c +++ b/usr/src/uts/i86pc/io/psm/psm_common.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ #include <sys/types.h> diff --git a/usr/src/uts/i86pc/ml/kpti_trampolines.s b/usr/src/uts/i86pc/ml/kpti_trampolines.s index 4b5102d547..17249eb747 100644 --- a/usr/src/uts/i86pc/ml/kpti_trampolines.s +++ b/usr/src/uts/i86pc/ml/kpti_trampolines.s @@ -667,6 +667,8 @@ tr_intr_ret_end: MK_INTR_TRAMPOLINE_NOERR(invaltrap) MK_INTR_TRAMPOLINE_NOERR(fasttrap) MK_INTR_TRAMPOLINE_NOERR(dtrace_ret) + MK_INTR_TRAMPOLINE_NOERR(brand_sys_int80) + MK_INTR_TRAMPOLINE_NOERR(sys_int80) /* * These are special because they can interrupt other traps, and diff --git a/usr/src/uts/i86pc/ml/offsets.in b/usr/src/uts/i86pc/ml/offsets.in index 622f7cd2a3..6c1de5c145 100644 --- a/usr/src/uts/i86pc/ml/offsets.in +++ b/usr/src/uts/i86pc/ml/offsets.in @@ -144,6 +144,7 @@ _klwp lwp_thread lwp_procp lwp_brand + lwp_brand_syscall lwp_eosys lwp_regs lwp_arg diff --git a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s index 8a68b4bced..8040e35297 100644 --- a/usr/src/uts/i86pc/ml/syscall_asm_amd64.s +++ b/usr/src/uts/i86pc/ml/syscall_asm_amd64.s @@ -515,6 +515,7 @@ noprod_sys_syscall: movq T_LWP(%r15), %r14 ASSERT_NO_RUPDATE_PENDING(%r14) + ENABLE_INTR_FLAGS MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM) @@ -528,6 +529,37 @@ noprod_sys_syscall: incq %gs:CPU_STATS_SYS_SYSCALL + /* + * If our LWP has an alternate system call handler, run that instead of + * the regular system call path. + */ + movq LWP_BRAND_SYSCALL(%r14), %rdi + testq %rdi, %rdi + jz _syscall_no_brand + + pushq %rax + subq $8, %rsp /* align stack for call to C */ + INDIRECT_CALL_REG(rdi) + addq $8, %rsp + + /* + * If the alternate handler returns non-zero, the normal system call + * processing is resumed. + */ + testl %eax, %eax + popq %rax + jnz _syscall_no_brand + + /* + * For branded syscalls which were handled in-kernel, shuffle the + * register state as would be done by the native handler before jumping + * to the post-syscall logic. 
+ */ + movq REGOFF_RAX(%rsp), %r12 + movq REGOFF_RDX(%rsp), %r13 + jmp _syscall_after_brand + +_syscall_no_brand: movw %ax, T_SYSNUM(%r15) movzbl T_PRE_SYS(%r15), %ebx ORL_SYSCALLTRACE(%ebx) @@ -563,6 +595,8 @@ _syscall_invoke: shrq $32, %r13 /* upper 32-bits into %edx */ movl %r12d, %r12d /* lower 32-bits into %eax */ 5: + +_syscall_after_brand: /* * Optimistically assume that there's no post-syscall * work to do. (This is to avoid having to call syscall_mstate() @@ -825,11 +859,46 @@ _syscall32_save: incq %gs:CPU_STATS_SYS_SYSCALL /* + * If our lwp has an alternate system call handler, run that instead + * of the regular system call path. + */ + movq LWP_BRAND_SYSCALL(%r14), %rax + testq %rax, %rax + jz _syscall32_no_brand + + movb $LWP_SYS, LWP_STATE(%r14) + INDIRECT_CALL_REG(rax) + + /* + * If the alternate handler returns non-zero, the normal system call + * processing is resumed. + */ + testl %eax, %eax + jnz _syscall32_no_brand + + /* + * For branded syscalls which were handled in-kernel, shuffle the + * register state as would be done by the native handler before jumping + * to the post-syscall logic. + */ + movl REGOFF_RAX(%rsp), %r12d + movl REGOFF_RDX(%rsp), %r13d + jmp _syscall32_after_brand + +_syscall32_no_brand: + /* * Make some space for MAXSYSARGS (currently 8) 32-bit args placed * into 64-bit (long) arg slots, maintaining 16 byte alignment. Or * more succinctly: * * SA(MAXSYSARGS * sizeof (long)) == 64 + * + * Note, this space is used both to copy in the arguments from user + * land, but also as part of the old UNIX style syscall_ap() method. + * syscall_entry expects that we do not change the values of this space + * that we give it. However, this means that when we end up in the more + * recent model of passing the arguments based on the calling + * conventions, we'll need to save an additional 16 bytes of stack.
*/ #define SYS_DROP 64 /* drop for args */ subq $SYS_DROP, %rsp @@ -857,12 +926,16 @@ _syscall32_save: */ movq %rax, %rbx - movl 0(%rsp), %edi - movl 8(%rsp), %esi - movl 0x10(%rsp), %edx - movl 0x18(%rsp), %ecx - movl 0x20(%rsp), %r8d - movl 0x28(%rsp), %r9d + movl 0x0(%rsp), %edi /* arg0 */ + movl 0x8(%rsp), %esi /* arg1 */ + movl 0x10(%rsp), %edx /* arg2 */ + movl 0x38(%rsp), %eax /* arg7 load */ + movl 0x18(%rsp), %ecx /* arg3 */ + pushq %rax /* arg7 saved to stack */ + movl 0x28(%rsp), %r8d /* arg4 */ + movl 0x38(%rsp), %eax /* arg6 load */ + movl 0x30(%rsp), %r9d /* arg5 */ + pushq %rax /* arg6 saved to stack */ movq SY_CALLC(%rbx), %rax INDIRECT_CALL_REG(rax) @@ -881,6 +954,8 @@ _syscall32_save: shrq $32, %r13 /* upper 32-bits into %edx */ movl %eax, %r12d /* lower 32-bits into %eax */ +_syscall32_after_brand: + /* * Optimistically assume that there's no post-syscall * work to do. (This is to avoid having to call syscall_mstate() @@ -1133,15 +1208,20 @@ _full_syscall_postsys32: /* * Fetch the arguments copied onto the kernel stack and put * them in the right registers to invoke a C-style syscall handler. - * %rax contains the handler address. + * %rax contains the handler address. For the last two arguments, we + * push them onto the stack -- we can't clobber the old arguments. */ movq %rax, %rbx - movl 0(%rsp), %edi - movl 8(%rsp), %esi - movl 0x10(%rsp), %edx - movl 0x18(%rsp), %ecx - movl 0x20(%rsp), %r8d - movl 0x28(%rsp), %r9d + movl 0x0(%rsp), %edi /* arg0 */ + movl 0x8(%rsp), %esi /* arg1 */ + movl 0x10(%rsp), %edx /* arg2 */ + movl 0x38(%rsp), %eax /* arg7 load */ + movl 0x18(%rsp), %ecx /* arg3 */ + pushq %rax /* arg7 saved to stack */ + movl 0x28(%rsp), %r8d /* arg4 */ + movl 0x38(%rsp), %eax /* arg6 load */ + movl 0x30(%rsp), %r9d /* arg5 */ + pushq %rax /* arg6 saved to stack */ movq SY_CALLC(%rbx), %rax INDIRECT_CALL_REG(rax) @@ -1220,6 +1300,66 @@ _full_syscall_postsys32: SET_SIZE(brand_sys_sysenter) /* + * System call via an int80. 
This entry point is only used by the Linux + * application environment. Unlike the other entry points, there is no + * default action to take if no callback is registered for this process. + */ + + ENTRY_NP(brand_sys_int80) + SWAPGS /* kernel gsbase */ + XPV_TRAP_POP + call smap_enable + + /* + * We first attempt to call the "b_int80" handler from the "struct + * brand_mach_ops" for this brand. If no handler function is installed + * for this brand, the BRAND_CALLBACK() macro returns here and we + * check the lwp for a "lwp_brand_syscall" handler. + */ + BRAND_CALLBACK(BRAND_CB_INT80, BRAND_URET_FROM_INTR_STACK()) + + /* + * Check to see if this lwp provides "lwp_brand_syscall". If so, we + * will route this int80 through the regular system call handling path. + */ + movq %r15, %gs:CPU_RTMP_R15 + movq %gs:CPU_THREAD, %r15 + movq T_LWP(%r15), %r15 + movq LWP_BRAND_SYSCALL(%r15), %r15 + testq %r15, %r15 + movq %gs:CPU_RTMP_R15, %r15 + jnz nopop_syscall_int + + /* + * The brand provided neither a "b_int80", nor a "lwp_brand_syscall" + * function, and has thus opted out of handling this trap. + */ + SWAPGS /* user gsbase */ + jmp nopop_int80 + + ENTRY_NP(sys_int80) + /* + * We hit an int80, but this process isn't of a brand with an int80 + * handler. Bad process! Make it look as if the INT failed. + * Modify %rip to point before the INT, push the expected error + * code and fake a GP fault. Note on 64-bit hypervisor we need + * to undo the XPV_TRAP_POP and push rcx and r11 back on the stack + * because gptrap will pop them again with its own XPV_TRAP_POP. + */ + XPV_TRAP_POP + call smap_enable +nopop_int80: + subq $2, (%rsp) /* int insn 2-bytes */ + pushq $_CONST(_MUL(T_INT80, GATE_DESC_SIZE) + 2) +#if defined(__xpv) + push %r11 + push %rcx +#endif + jmp gptrap / GP fault + SET_SIZE(sys_int80) + SET_SIZE(brand_sys_int80) + +/* * This is the destination of the "int $T_SYSCALLINT" interrupt gate, used by * the generic i386 libc to do system calls. 
We do a small amount of setup * before jumping into the existing sys_syscall32 path. diff --git a/usr/src/uts/i86pc/os/cpr_impl.c b/usr/src/uts/i86pc/os/cpr_impl.c index 4a5c71b35d..e878f765ef 100644 --- a/usr/src/uts/i86pc/os/cpr_impl.c +++ b/usr/src/uts/i86pc/os/cpr_impl.c @@ -23,6 +23,10 @@ */ /* + * Copyright 2019 Joyent, Inc. + */ + +/* * Platform specific implementation code * Currently only suspend to RAM is supported (ACPI S3) */ @@ -737,6 +741,20 @@ i_cpr_is_supported(int sleeptype) if (sleeptype != CPR_TORAM) return (0); + /* + * Unfortunately, the x86 resume code was never implemented for GAS. + * The only obvious problem is that a trick necessary to appease Sun + * Studio does the wrong thing for GAS. Doubly unfortunate is that + * the condition used to detect GAS is incorrect, so we do in fact + * compile the Studio path, it just immediately fails in resume. + * + * Given that, if we were built using GCC, never allow CPR to be + * attempted. + */ +#ifdef __GNUC__ + return (0); +#else + /* * The next statement tests if a specific platform has turned off * cpr support.
@@ -751,6 +769,7 @@ i_cpr_is_supported(int sleeptype) return (1); return (pm_S3_enabled); +#endif } void diff --git a/usr/src/uts/i86pc/os/ibft.c b/usr/src/uts/i86pc/os/ibft.c index d9ed882705..fab1324787 100644 --- a/usr/src/uts/i86pc/os/ibft.c +++ b/usr/src/uts/i86pc/os/ibft.c @@ -39,6 +39,7 @@ #include <sys/kmem.h> #include <sys/psm.h> #include <sys/bootconf.h> +#include <sys/reboot.h> typedef enum ibft_structure_type { Reserved = 0, @@ -206,6 +207,7 @@ static ibft_status_t iscsi_parse_ibft_NIC(iscsi_ibft_nic_t *nicp); static ibft_status_t iscsi_parse_ibft_target(char *begin_of_ibft, iscsi_ibft_tgt_t *tgtp); +extern int boothowto; /* * Return value: @@ -759,7 +761,9 @@ ld_ib_prop() * 1) pass "-B ibft-noprobe=1" on kernel command line * 2) add line "set ibft_noprobe=1" in /etc/system */ - cmn_err(CE_NOTE, IBFT_NOPROBE_MSG); + if (boothowto & RB_VERBOSE) { + cmn_err(CE_NOTE, IBFT_NOPROBE_MSG); + } return; } diff --git a/usr/src/uts/i86pc/os/lgrpplat.c b/usr/src/uts/i86pc/os/lgrpplat.c index ed463fba8f..6320c0a949 100644 --- a/usr/src/uts/i86pc/os/lgrpplat.c +++ b/usr/src/uts/i86pc/os/lgrpplat.c @@ -2800,7 +2800,11 @@ lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info, /* * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs * and memory are local to each other in the same NUMA node and return number - * of nodes + * of nodes. + * + * The SRAT table pointer is populated during bootup by + * build_firmware_properties() in fakebop.c. Several motherboard and BIOS + * manufacturers are guilty of not having a SRAT table. */ static int lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp, @@ -2817,9 +2821,15 @@ lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp, /* * Nothing to do when no SRAT or disabled */ - if (tp == NULL || !lgrp_plat_srat_enable) + if (!lgrp_plat_srat_enable) return (-1); + if (tp == NULL) { + cmn_err(CE_WARN, "Couldn't read ACPI SRAT table from BIOS. 
" + "lgrp support will be limited to one group.\n"); + return (-1); + } + /* * Try to get domain information from MSCT table. * ACPI4.0: OSPM will use information provided by the MSCT only diff --git a/usr/src/uts/i86pc/os/startup.c b/usr/src/uts/i86pc/os/startup.c index a0bb296e70..e1e92ffe4f 100644 --- a/usr/src/uts/i86pc/os/startup.c +++ b/usr/src/uts/i86pc/os/startup.c @@ -2450,6 +2450,7 @@ add_physmem_cb(page_t *pp, pfn_t pnum) pp->p_mapping = NULL; pp->p_embed = 0; pp->p_share = 0; + pp->p_zoneid = ALL_ZONES; pp->p_mlentry = 0; } diff --git a/usr/src/uts/i86pc/os/trap.c b/usr/src/uts/i86pc/os/trap.c index b7c18bb8c9..063fac49f7 100644 --- a/usr/src/uts/i86pc/os/trap.c +++ b/usr/src/uts/i86pc/os/trap.c @@ -99,6 +99,7 @@ #include <sys/hypervisor.h> #endif #include <sys/contract/process_impl.h> +#include <sys/brand.h> #define USER 0x10000 /* user-mode flag added to trap type */ @@ -810,6 +811,17 @@ trap(struct regs *rp, caddr_t addr, processorid_t cpuid) fault_type = F_INVAL; } + /* + * Allow the brand to interpose on invalid memory accesses + * prior to running the native pagefault handler. If this + * brand hook returns zero, it was able to handle the fault + * completely. Otherwise, drive on and call pagefault(). 
+ */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_pagefault != NULL && + BROP(p)->b_pagefault(p, lwp, addr, fault_type, rw) == 0) { + goto out; + } + res = pagefault(addr, fault_type, rw, 0); /* diff --git a/usr/src/uts/i86pc/sys/apic.h b/usr/src/uts/i86pc/sys/apic.h index 26626ec5a4..f2528a632f 100644 --- a/usr/src/uts/i86pc/sys/apic.h +++ b/usr/src/uts/i86pc/sys/apic.h @@ -386,7 +386,7 @@ struct apic_io_intr { /* special or reserve vectors */ #define APIC_CHECK_RESERVE_VECTORS(v) \ (((v) == T_FASTTRAP) || ((v) == APIC_SPUR_INTR) || \ - ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET)) + ((v) == T_SYSCALLINT) || ((v) == T_DTRACE_RET) || ((v) == 0x80)) /* cmos shutdown code for BIOS */ #define BIOS_SHUTDOWN 0x0a diff --git a/usr/src/uts/i86pc/sys/comm_page.h b/usr/src/uts/i86pc/sys/comm_page.h index 520ad9001d..ea19c856a8 100644 --- a/usr/src/uts/i86pc/sys/comm_page.h +++ b/usr/src/uts/i86pc/sys/comm_page.h @@ -27,6 +27,7 @@ extern "C" { #endif #define COMM_PAGE_SIZE PAGESIZE +#define COMM_PAGE_ALIGN 0x4000 #ifndef _ASM diff --git a/usr/src/uts/i86pc/sys/vm_machparam.h b/usr/src/uts/i86pc/sys/vm_machparam.h index 90a5245217..0d0c95535c 100644 --- a/usr/src/uts/i86pc/sys/vm_machparam.h +++ b/usr/src/uts/i86pc/sys/vm_machparam.h @@ -23,6 +23,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_VM_MACHPARAM_H @@ -129,11 +130,12 @@ extern "C" { * * XXX - The system doesn't account for multiple swap devices. */ -#define DISKRPM 60 +#define DISKRPM 600 /* * The maximum value for handspreadpages which is the the distance - * between the two clock hands in pages. + * between the two clock hands in pages. This is only used when the page + * scanner is first started. 
*/ #define MAXHANDSPREADPAGES ((64 * 1024 * 1024) / PAGESIZE) diff --git a/usr/src/uts/i86pc/vm/hat_i86.c b/usr/src/uts/i86pc/vm/hat_i86.c index 7650d28f41..ea9436e881 100644 --- a/usr/src/uts/i86pc/vm/hat_i86.c +++ b/usr/src/uts/i86pc/vm/hat_i86.c @@ -3808,7 +3808,7 @@ hat_page_getattr(struct page *pp, uint_t flag) /* - * common code used by hat_pageunload() and hment_steal() + * common code used by hat_page_inval() and hment_steal() */ hment_t * hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry) @@ -3864,15 +3864,13 @@ hati_page_unmap(page_t *pp, htable_t *ht, uint_t entry) extern int vpm_enable; /* - * Unload all translations to a page. If the page is a subpage of a large + * Unload translations to a page. If the page is a subpage of a large * page, the large page mappings are also removed. - * - * The forceflags are unused. + * If curhat is not NULL, then we only unload the translation + * for the given process, otherwise all translations are unloaded. */ - -/*ARGSUSED*/ -static int -hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) +void +hat_page_inval(struct page *pp, uint_t pg_szcd, struct hat *curhat) { page_t *cur_pp = pp; hment_t *hm; @@ -3880,16 +3878,11 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) htable_t *ht; uint_t entry; level_t level; + ulong_t cnt = 0; XPV_DISALLOW_MIGRATE(); /* - * prevent recursion due to kmem_free() - */ - ++curthread->t_hatdepth; - ASSERT(curthread->t_hatdepth < 16); - - /* * clear the vpm ref. */ if (vpm_enable) { @@ -3899,6 +3892,8 @@ hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t forceflag) * The loop with next_size handles pages with multiple pagesize mappings */ next_size: + if (curhat != NULL) + cnt = hat_page_getshare(cur_pp); for (;;) { /* @@ -3910,14 +3905,13 @@ next_size: if (hm == NULL) { x86_hm_exit(cur_pp); +curproc_done: /* * If not part of a larger page, we're done. 
*/ if (cur_pp->p_szc <= pg_szcd) { - ASSERT(curthread->t_hatdepth > 0); - --curthread->t_hatdepth; XPV_ALLOW_MIGRATE(); - return (0); + return; } /* @@ -3936,8 +3930,20 @@ next_size: * If this mapping size matches, remove it. */ level = ht->ht_level; - if (level == pg_szcd) - break; + if (level == pg_szcd) { + if (curhat == NULL || ht->ht_hat == curhat) + break; + /* + * Unloading only the given process but it's + * not the hat for the current process. Leave + * entry in place. Also do a safety check to + * ensure we don't get in an infinite loop + */ + if (cnt-- == 0) { + x86_hm_exit(cur_pp); + goto curproc_done; + } + } } /* @@ -3947,14 +3953,44 @@ next_size: hm = hati_page_unmap(cur_pp, ht, entry); if (hm != NULL) hment_free(hm); + + /* Perform check above for being part of a larger page. */ + if (curhat != NULL) + goto curproc_done; } } +/* + * Unload translations to a page. If unloadflag is HAT_CURPROC_PGUNLOAD, then + * we only unload the translation for the current process, otherwise all + * translations are unloaded. + */ +static int +hati_pageunload(struct page *pp, uint_t pg_szcd, uint_t unloadflag) +{ + struct hat *curhat = NULL; + + /* + * prevent recursion due to kmem_free() + */ + ++curthread->t_hatdepth; + ASSERT(curthread->t_hatdepth < 16); + + if (unloadflag == HAT_CURPROC_PGUNLOAD) + curhat = curthread->t_procp->p_as->a_hat; + + hat_page_inval(pp, pg_szcd, curhat); + + ASSERT(curthread->t_hatdepth > 0); + --curthread->t_hatdepth; + return (0); +} + int -hat_pageunload(struct page *pp, uint_t forceflag) +hat_pageunload(struct page *pp, uint_t unloadflag) { ASSERT(PAGE_EXCL(pp)); - return (hati_pageunload(pp, 0, forceflag)); + return (hati_pageunload(pp, 0, unloadflag)); } /* diff --git a/usr/src/uts/i86pc/vm/hment.c b/usr/src/uts/i86pc/vm/hment.c index bb18b5c462..769bbd15d2 100644 --- a/usr/src/uts/i86pc/vm/hment.c +++ b/usr/src/uts/i86pc/vm/hment.c @@ -21,6 +21,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #include <sys/types.h> @@ -35,6 +36,7 @@ #include <vm/hat_i86.h> #include <sys/cmn_err.h> #include <sys/avl.h> +#include <sys/zone.h> /* @@ -319,6 +321,8 @@ hment_insert(hment_t *hm, page_t *pp) ((hment_t *)pp->p_mapping)->hm_prev = hm; pp->p_mapping = hm; + zone_add_page(pp); + /* * Add the hment to the system-wide hash table. */ @@ -460,6 +464,7 @@ hment_assign(htable_t *htable, uint_t entry, page_t *pp, hment_t *hm) pp->p_embed = 1; pp->p_mapping = htable; pp->p_mlentry = entry; + zone_add_page(pp); return; } @@ -541,6 +546,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) pp->p_mapping = NULL; pp->p_mlentry = 0; pp->p_embed = 0; + zone_rm_page(pp); return (NULL); } @@ -576,6 +582,7 @@ hment_remove(page_t *pp, htable_t *ht, uint_t entry) hm->hm_hashlink = null_avl_link; hm->hm_next = NULL; hm->hm_prev = NULL; + zone_rm_page(pp); return (hm); } diff --git a/usr/src/uts/i86pc/vm/vm_machdep.c b/usr/src/uts/i86pc/vm/vm_machdep.c index 225628b1c8..bc9d03e7f5 100644 --- a/usr/src/uts/i86pc/vm/vm_machdep.c +++ b/usr/src/uts/i86pc/vm/vm_machdep.c @@ -711,10 +711,8 @@ void map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags) { struct proc *p = curproc; - caddr_t userlimit = (flags & _MAP_LOW32) ? 
- (caddr_t)_userlimit32 : p->p_as->a_userlimit; - - map_addr_proc(addrp, len, off, vacalign, userlimit, curproc, flags); + map_addr_proc(addrp, len, off, vacalign, + map_userlimit(p, p->p_as, flags), curproc, flags); } /*ARGSUSED*/ @@ -3546,7 +3544,7 @@ page_create_io( if (nscan < desscan && freemem < minfree) { TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL, "pageout_cv_signal:freemem %ld", freemem); - cv_signal(&proc_pageout->p_cv); + WAKE_PAGEOUT_SCANNER(); } if (flags & PG_PHYSCONTIG) { diff --git a/usr/src/uts/intel/Makefile b/usr/src/uts/intel/Makefile index ff9ed42c94..13b7642799 100644 --- a/usr/src/uts/intel/Makefile +++ b/usr/src/uts/intel/Makefile @@ -59,7 +59,7 @@ install_h.prereq := TARGET= install_h .PARALLEL: $(PARALLEL_KMODS) $(XMODS) config -def all install clean clobber modlist: $(KMODS) $(XMODS) config +def all install clean clobber modlist: genassym $(KMODS) $(XMODS) config clobber: clobber.targ @@ -101,7 +101,7 @@ CLOBBERFILES += $(PRIVS_C) # intel/dtrace depends on i86pc/genassym, so we need to build both # i86pc/genassym and intel/genassym. 
# -all.prereq install.prereq def.prereq: genunix FRC +all.prereq install.prereq def.prereq: genassym genunix FRC @cd ../i86pc/genassym; pwd; $(MAKE) $(@:%.prereq=%) # @@ -111,7 +111,7 @@ all.prereq install.prereq def.prereq: genunix FRC genunix: $(PRIVS_C) -$(KMODS) $(SUBDIRS) config: FRC +genassym $(KMODS) $(SUBDIRS) config: FRC @cd $@; pwd; $(MAKE) $(NO_STATE) $(TARGET) $(XMODS): FRC diff --git a/usr/src/uts/intel/Makefile.files b/usr/src/uts/intel/Makefile.files index 20d5e89ec9..0ead68a021 100644 --- a/usr/src/uts/intel/Makefile.files +++ b/usr/src/uts/intel/Makefile.files @@ -98,6 +98,30 @@ GENUNIX_OBJS += \ CORE_OBJS += \ prmachdep.o +LX_CGROUP_OBJS += \ + cgrps_node.o \ + cgrps_vfsops.o \ + cgrps_vnops.o + +LX_DEVFS_OBJS += \ + lxd_attrdb.o \ + lxd_node.o \ + lxd_vfsops.o \ + lxd_vnops.o + +LX_PROC_OBJS += \ + lx_prsubr.o \ + lx_prvfsops.o \ + lx_prvnops.o + +LX_SYS_OBJS += \ + lx_syssubr.o \ + lx_sysvfsops.o \ + lx_sysvnops.o + +LX_AUTOFS_OBJS += \ + lx_autofs.o + # # shared hypervisor functionality # @@ -271,6 +295,74 @@ IOMMULIB_OBJS = iommulib.o # SN1_BRAND_OBJS = sn1_brand.o sn1_brand_asm.o S10_BRAND_OBJS = s10_brand.o s10_brand_asm.o +LX_BRAND_OBJS = \ + lx_access.o \ + lx_acct.o \ + lx_acl.o \ + lx_aio.o \ + lx_archdep.o \ + lx_audit.o \ + lx_auxv.o \ + lx_brand.o \ + lx_brk.o \ + lx_chmod.o \ + lx_chown.o \ + lx_clone.o \ + lx_close.o \ + lx_cpu.o \ + lx_dup.o \ + lx_errno.o \ + lx_epoll.o \ + lx_eventfd.o \ + lx_fadvise.o \ + lx_fallocate.o \ + lx_fcntl.o \ + lx_futex.o \ + lx_getcwd.o \ + lx_getdents.o \ + lx_getpid.o \ + lx_getrandom.o \ + lx_id.o \ + lx_ioctl.o \ + lx_ioprio.o \ + lx_kill.o \ + lx_link.o \ + lx_lseek.o \ + lx_mem.o \ + lx_misc.o \ + lx_miscsys.o \ + lx_mkdir.o \ + lx_modify_ldt.o \ + lx_mount.o \ + lx_lockd.o \ + lx_open.o \ + lx_personality.o \ + lx_pgrp.o \ + lx_pid.o \ + lx_pipe.o \ + lx_poll.o \ + lx_prctl.o \ + lx_priority.o \ + lx_ptrace.o \ + lx_rename.o \ + lx_rlimit.o \ + lx_rw.o \ + lx_sched.o \ + lx_signal.o \ + 
lx_signum.o \ + lx_socket.o \ + lx_splice.o \ + lx_stat.o \ + lx_sync.o \ + lx_syscall.o \ + lx_sysinfo.o \ + lx_thread_area.o \ + lx_time.o \ + lx_timer.o \ + lx_umask.o \ + lx_uname.o \ + lx_wait.o \ + lx_xattr.o # # special files diff --git a/usr/src/uts/intel/Makefile.intel b/usr/src/uts/intel/Makefile.intel index 77713eb279..6b6093d555 100644 --- a/usr/src/uts/intel/Makefile.intel +++ b/usr/src/uts/intel/Makefile.intel @@ -22,7 +22,7 @@ # Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright (c) 2013 Andrew Stormont. All rights reserved. # Copyright (c) 2014 by Delphix. All rights reserved. -# Copyright 2019 Joyent, Inc. +# Copyright 2021 Joyent, Inc. # Copyright 2022 Garrett D'Amore <garrett@damore.org> # Copyright 2018 Nexenta Systems, Inc. # Copyright 2019 RackTop Systems @@ -50,6 +50,7 @@ PLATFORM = i86pc # UNIX_DIR = $(UTSBASE)/i86pc/unix GENLIB_DIR = $(UTSBASE)/intel/genunix +GENASSYM_DIR = $(UTSBASE)/intel/genassym IPDRV_DIR = $(UTSBASE)/intel/ip MODSTUBS_DIR = $(UNIX_DIR) DSF_DIR = $(UTSBASE)/$(PLATFORM)/genassym @@ -132,6 +133,7 @@ ASFLAGS_XARCH_64 = $(amd64_ASFLAGS) ASFLAGS_XARCH = $(ASFLAGS_XARCH_$(CLASS)) ASFLAGS += $(ASFLAGS_XARCH) +AS_INC_PATH += -I$(GENASSYM_DIR)/$(OBJS_DIR) # # Define the base directory for installation. 
@@ -245,6 +247,7 @@ DRV_KMODS += hxge DRV_KMODS += i8042 DRV_KMODS += icmp DRV_KMODS += icmp6 +DRV_KMODS += inotify DRV_KMODS += intel_nb5000 DRV_KMODS += intel_nhm DRV_KMODS += ip @@ -279,6 +282,7 @@ DRV_KMODS += mouse8042 DRV_KMODS += mpt_sas DRV_KMODS += mr_sas DRV_KMODS += mwl +DRV_KMODS += nfp DRV_KMODS += nsmb DRV_KMODS += nulldriver DRV_KMODS += nv_sata @@ -347,6 +351,8 @@ DRV_KMODS += ural DRV_KMODS += uath DRV_KMODS += urtw DRV_KMODS += vgatext +DRV_KMODS += vmxnet +DRV_KMODS += vnd DRV_KMODS += vnic DRV_KMODS += vscan DRV_KMODS += wc @@ -355,6 +361,7 @@ DRV_KMODS += wpi DRV_KMODS += xge DRV_KMODS += yge DRV_KMODS += zcons +DRV_KMODS += zfd DRV_KMODS += zyd DRV_KMODS += simnet DRV_KMODS += smrt @@ -511,7 +518,8 @@ DRV_KMODS += sol_umad # # Brand modules # -BRAND_KMODS += sn1_brand s10_brand +BRAND_KMODS += sn1_brand s10_brand lx_brand +DRV_KMODS += lx_systrace lx_ptm lx_netlink # # Exec Class Modules (/kernel/exec): @@ -526,10 +534,10 @@ SCHED_KMODS += IA RT TS RT_DPTBL TS_DPTBL FSS FX FX_DPTBL SDC # # File System Modules (/kernel/fs): # -FS_KMODS += autofs ctfs dcfs dev devfs fdfs fifofs hsfs lofs -FS_KMODS += mntfs namefs nfs objfs zfs zut -FS_KMODS += pcfs procfs sockfs specfs tmpfs udfs ufs sharefs -FS_KMODS += smbfs bootfs +FS_KMODS += autofs ctfs dcfs dev devfs fdfs fifofs hsfs hyprlofs +FS_KMODS += lofs lxautofs lx_proc lxprocfs mntfs namefs nfs objfs zfs zut +FS_KMODS += pcfs procfs sockfs specfs tmpfs udfs ufs sharefs lx_sysfs +FS_KMODS += smbfs bootfs lx_cgroup lx_devfs # # Streams Modules (/kernel/strmod): @@ -587,6 +595,7 @@ MISC_KMODS += dls MISC_KMODS += fssnap_if MISC_KMODS += gda MISC_KMODS += gld +MISC_KMODS += gsqueue MISC_KMODS += hidparser MISC_KMODS += hook MISC_KMODS += hpcsvc @@ -713,6 +722,7 @@ SOCKET_KMODS += sockpfp SOCKET_KMODS += socksctp SOCKET_KMODS += socksdp SOCKET_KMODS += sockrds +SOCKET_KMODS += datafilt # # kiconv modules (/kernel/kiconv): diff --git a/usr/src/uts/intel/Makefile.rules b/usr/src/uts/intel/Makefile.rules 
index 31b87f55b0..778f82b13b 100644 --- a/usr/src/uts/intel/Makefile.rules +++ b/usr/src/uts/intel/Makefile.rules @@ -178,6 +178,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/intel_nb5000/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/vmxnet/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/intel/io/intel_nhm/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/amd64/krtld/kobj_reloc.c b/usr/src/uts/intel/amd64/krtld/kobj_reloc.c index 78b1fb7777..69e0342768 100644 --- a/usr/src/uts/intel/amd64/krtld/kobj_reloc.c +++ b/usr/src/uts/intel/amd64/krtld/kobj_reloc.c @@ -25,6 +25,9 @@ * * Copyright 2020 Joyent, Inc. */ +/* + * Copyright (c) 2017 Joyent, Inc. + */ /* * x86 relocation code. diff --git a/usr/src/uts/intel/bpf/Makefile b/usr/src/uts/intel/bpf/Makefile index 3729bd2523..e07edeb12c 100644 --- a/usr/src/uts/intel/bpf/Makefile +++ b/usr/src/uts/intel/bpf/Makefile @@ -60,7 +60,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # # CFLAGS += $(CCVERBOSE) -LDFLAGS += -Nmisc/mac -Nmisc/dls -Ndrv/ipnet -Nmisc/neti +LDFLAGS += -Nmisc/mac -Nmisc/dls -Ndrv/ipnet -Nmisc/neti -Ndrv/ip INC_PATH += -I$(UTSBASE)/common/io/bpf # diff --git a/usr/src/uts/intel/brand/lx/lx_archdep.c b/usr/src/uts/intel/brand/lx/lx_archdep.c new file mode 100644 index 0000000000..24f3d2c446 --- /dev/null +++ b/usr/src/uts/intel/brand/lx/lx_archdep.c @@ -0,0 +1,1720 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * LX brand Intel-specific routines. 
+ */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/ddi.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/privregs.h> +#include <sys/pcb.h> +#include <sys/archsystm.h> +#include <sys/stack.h> +#include <sys/sdt.h> +#include <sys/sysmacros.h> +#include <sys/psw.h> +#include <lx_errno.h> + +/* + * Argument constants for fix_segreg. + * See usr/src/uts/intel/ia32/os/archdep.c for the originals. + */ +#define IS_CS 1 +#define IS_NOT_CS 0 + +extern greg_t fix_segreg(greg_t, int, model_t); + + +#define LX_REG(ucp, r) ((ucp)->uc_mcontext.gregs[(r)]) + +#define PSLMERGE(oldval, newval) \ + (((oldval) & ~PSL_USERMASK) | ((newval) & PSL_USERMASK)) + +#ifdef __amd64 +/* 64-bit native user_regs_struct */ +typedef struct lx_user_regs64 { + int64_t lxur_r15; + int64_t lxur_r14; + int64_t lxur_r13; + int64_t lxur_r12; + int64_t lxur_rbp; + int64_t lxur_rbx; + int64_t lxur_r11; + int64_t lxur_r10; + int64_t lxur_r9; + int64_t lxur_r8; + int64_t lxur_rax; + int64_t lxur_rcx; + int64_t lxur_rdx; + int64_t lxur_rsi; + int64_t lxur_rdi; + int64_t lxur_orig_rax; + int64_t lxur_rip; + int64_t lxur_xcs; + int64_t lxur_rflags; + int64_t lxur_rsp; + int64_t lxur_xss; + int64_t lxur_xfs_base; + int64_t lxur_xgs_base; + int64_t lxur_xds; + int64_t lxur_xes; + int64_t lxur_xfs; + int64_t lxur_xgs; +} lx_user_regs64_t; + +/* 64-bit native user_fpregs_struct */ +typedef struct lx_user_fpregs64 { + uint16_t lxufp_cwd; + uint16_t lxufp_swd; + uint16_t lxufp_ftw; + uint16_t lxufp_fop; + uint64_t lxufp_rip; + uint64_t lxufp_rdp; + uint32_t lxufp_mxcsr; + uint32_t lxufp_mxcr_mask; + /* 8*16 bytes for each FP-reg = 128 bytes */ + uint32_t lxufp_st_space[32]; + /* 16*16 bytes for each XMM-reg = 256 bytes */ + uint32_t lxufp_xmm_space[64]; + uint32_t lxufp_padding[24]; +} lx_user_fpregs64_t; + +/* 64-bit native user_struct */ +typedef struct lx_user64 { + lx_user_regs64_t lxu_regs; + int32_t lxu_fpvalid; + int32_t lxu_pad0; + 
lx_user_fpregs64_t lxu_i387; + uint64_t lxu_tsize; + uint64_t lxu_dsize; + uint64_t lxu_ssize; + uint64_t lxu_start_code; + uint64_t lxu_start_stack; + int64_t lxu_signal; + int32_t lxu_reserved; + int32_t lxu_pad1; + /* help gdb to locate user_regs structure */ + caddr_t lxu_ar0; + /* help gdb to locate user_fpregs structure */ + caddr_t lxu_fpstate; + uint64_t lxu_magic; + char lxu_comm[32]; + uint64_t lxu_debugreg[8]; + uint64_t lxu_error_code; + uint64_t lxu_fault_address; +} lx_user64_t; + +#endif /* __amd64 */ + +/* 32-bit native user_regs_struct */ +typedef struct lx_user_regs32 { + int32_t lxur_ebx; + int32_t lxur_ecx; + int32_t lxur_edx; + int32_t lxur_esi; + int32_t lxur_edi; + int32_t lxur_ebp; + int32_t lxur_eax; + int32_t lxur_xds; + int32_t lxur_xes; + int32_t lxur_xfs; + int32_t lxur_xgs; + int32_t lxur_orig_eax; + int32_t lxur_eip; + int32_t lxur_xcs; + int32_t lxur_eflags; + int32_t lxur_esp; + int32_t lxur_xss; +} lx_user_regs32_t; + +/* 32-bit native user_fpregs_struct */ +typedef struct lx_user_fpregs32 { + int32_t lxufp_cwd; + int32_t lxufp_swd; + int32_t lxufp_twd; + int32_t lxufp_fip; + int32_t lxufp_fcs; + int32_t lxufp_foo; + int32_t lxufp_fos; + int32_t lxufp_st_space[20]; +} lx_user_fpregs32_t; + +/* 32-bit native user_fpxregs_struct */ +typedef struct lx_user_fpxregs32 { + uint16_t lxufpx_cwd; + uint16_t lxufpx_swd; + uint16_t lxufpx_twd; + uint16_t lxufpx_fop; + int32_t lxufpx_fip; + int32_t lxufpx_fcs; + int32_t lxufpx_foo; + int32_t lxufpx_fos; + int32_t lxufpx_mxcsr; + int32_t lxufpx_reserved; + /* 8*16 bytes for each FP-reg = 128 bytes */ + int32_t lxufpx_st_space[32]; + /* 8*16 bytes for each XMM-reg = 128 bytes */ + int32_t lxufpx_xmm_space[32]; + int32_t lxufpx_padding[56]; +} lx_user_fpxregs32_t; + +/* 32-bit native user_struct */ +typedef struct lx_user32 { + lx_user_regs32_t lxu_regs; + int32_t lxu_fpvalid; + lx_user_fpregs32_t lxu_i387; + uint32_t lxu_tsize; + uint32_t lxu_dsize; + uint32_t lxu_ssize; + uint32_t 
lxu_start_code; + uint32_t lxu_start_stack; + int32_t lxu_signal; + int32_t lxu_reserved; + caddr32_t lxu_ar0; + caddr32_t lxu_fpstate; + uint32_t lxu_magic; + char lxu_comm[32]; + int32_t lxu_debugreg[8]; +} lx_user32_t; + +/* + * Certain versions of strace (on centos6 for example) use the %cs value to + * determine what kind of process is being traced. Here is a sample comment: + * Check CS register value. On x86-64 linux it is: + * 0x33 for long mode (64 bit and x32)) + * 0x23 for compatibility mode (32 bit) + * %ds = 0x2b for x32 mode (x86-64 in 32 bit) + * We can't change the %cs value in the ucp (see setgregs and _sys_rtt) so we + * emulate the expected value for ptrace use. + */ +#define LX_CS_64BIT 0x33 +#define LX_CS_32BIT 0x23 + +extern int getsetcontext(int, void *); +#if defined(_SYSCALL32_IMPL) +extern int getsetcontext32(int, void *); +#endif + +static int +lx_rw_uc(proc_t *p, void *ucp, void *kucp, size_t ucsz, boolean_t writing) +{ + int error = 0; + size_t rem = ucsz; + off_t pos = 0; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Grab P_PR_LOCK so that we can drop p_lock while doing I/O. + */ + sprlock_proc(p); + + /* + * Drop p_lock while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + while (rem != 0) { + uintptr_t addr = (uintptr_t)ucp + pos; + size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET)); + + if (writing) { + error = uwrite(p, (caddr_t)kucp + pos, len, addr); + } else { + error = uread(p, (caddr_t)kucp + pos, len, addr); + } + + if (error != 0) { + break; + } + + rem -= len; + pos += len; + } + mutex_enter(&p->p_lock); + + sprunlock(p); + mutex_enter(&p->p_lock); + + return (error); +} + +/* + * Read a ucontext_t from the target process, which may or may not be + * the current process.
+ */ +static int +lx_read_uc(proc_t *p, void *ucp, void *kucp, size_t ucsz) +{ + return (lx_rw_uc(p, ucp, kucp, ucsz, B_FALSE)); +} + +/* + * Write a ucontext_t to the target process, which may or may not be + * the current process. + */ +static int +lx_write_uc(proc_t *p, void *ucp, void *kucp, size_t ucsz) +{ + return (lx_rw_uc(p, ucp, kucp, ucsz, B_TRUE)); +} + +static void +lx_getfpregs32(lx_lwp_data_t *lwpd, lx_user_fpregs32_t *lfp) +{ +#ifdef __amd64 + fpregset32_t fp; + getfpregs32(lwpd->br_lwp, &fp); +#else /* __i386 */ + fpregset_t fp; + getfpregs(lwpd->br_lwp, &fp); +#endif /* __amd64 */ + + /* + * The fpchip_state.state field should correspond to all 27 fields in + * the 32-bit structure. + */ + bcopy(&fp.fp_reg_set.fpchip_state.state, lfp, sizeof (*lfp)); +} + +static void +lx_setfpregs32(lx_lwp_data_t *lwpd, lx_user_fpregs32_t *lfp) +{ +#ifdef __amd64 + fpregset32_t fp; +#else /* __i386 */ + fpregset_t fp; +#endif /* __amd64 */ + + /* + * The fpchip_state field should correspond to all 27 fields in the + * native 32-bit structure. 
+ */ + bcopy(lfp, &fp.fp_reg_set.fpchip_state.state, sizeof (*lfp)); + +#ifdef __amd64 + setfpregs32(lwpd->br_lwp, &fp); +#else /* __i386 */ + setfpregs(lwpd->br_lwp, &fp); +#endif /* __amd64 */ +} + +static int +lx_get_user_regs32_uc(klwp_t *lwp, void *ucp, lx_user_regs32_t *lxrp) +{ + proc_t *p = lwptoproc(lwp); + ucontext32_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (-1); + } + + lxrp->lxur_ebx = LX_REG(&uc, EBX); + lxrp->lxur_ecx = LX_REG(&uc, ECX); + lxrp->lxur_edx = LX_REG(&uc, EDX); + lxrp->lxur_esi = LX_REG(&uc, ESI); + lxrp->lxur_edi = LX_REG(&uc, EDI); + lxrp->lxur_ebp = LX_REG(&uc, EBP); + lxrp->lxur_eax = LX_REG(&uc, EAX); + lxrp->lxur_orig_eax = 0; + + lxrp->lxur_eip = LX_REG(&uc, EIP); + lxrp->lxur_eflags = LX_REG(&uc, EFL); + lxrp->lxur_esp = LX_REG(&uc, UESP); + lxrp->lxur_xss = LX_REG(&uc, SS); + + /* emulated %cs, see defines */ + lxrp->lxur_xcs = LX_CS_32BIT; + lxrp->lxur_xds = LX_REG(&uc, DS); + lxrp->lxur_xes = LX_REG(&uc, ES); + lxrp->lxur_xfs = LX_REG(&uc, FS); + lxrp->lxur_xgs = LX_REG(&uc, GS); + return (0); +} + +static int +lx_get_user_regs32(lx_lwp_data_t *lwpd, lx_user_regs32_t *lxrp) +{ + klwp_t *lwp = lwpd->br_lwp; + struct regs *rp = lwptoregs(lwp); + void *ucp; +#ifdef __amd64 + struct pcb *pcb = &lwp->lwp_pcb; +#endif + + VERIFY(lwp_getdatamodel(lwp) == DATAMODEL_ILP32); + + switch (lx_regs_location(lwpd, &ucp, B_FALSE)) { + case LX_REG_LOC_UNAVAIL: + return (-1); + + case LX_REG_LOC_UCP: + return (lx_get_user_regs32_uc(lwp, ucp, lxrp)); + + case LX_REG_LOC_LWP: + /* transformation below */ + break; + + default: + VERIFY(0); + break; + } + +#ifdef __amd64 + lxrp->lxur_ebx = (int32_t)rp->r_rbx; + lxrp->lxur_ecx = (int32_t)rp->r_rcx; + lxrp->lxur_edx = (int32_t)rp->r_rdx; + lxrp->lxur_esi = (int32_t)rp->r_rsi; + lxrp->lxur_edi = (int32_t)rp->r_rdi; + lxrp->lxur_ebp = (int32_t)rp->r_rbp; + lxrp->lxur_eax = (int32_t)rp->r_rax; + lxrp->lxur_orig_eax = 0; + lxrp->lxur_eip = (int32_t)rp->r_rip; + lxrp->lxur_eflags 
= (int32_t)rp->r_rfl; + lxrp->lxur_esp = (int32_t)rp->r_rsp; + lxrp->lxur_xss = (int32_t)rp->r_ss; + + kpreempt_disable(); + if (PCB_NEED_UPDATE_SEGS(pcb)) { + lxrp->lxur_xds = pcb->pcb_ds; + lxrp->lxur_xes = pcb->pcb_es; + lxrp->lxur_xfs = pcb->pcb_fs; + lxrp->lxur_xgs = pcb->pcb_gs; + } else { + lxrp->lxur_xds = rp->r_ds; + lxrp->lxur_xes = rp->r_es; + lxrp->lxur_xfs = rp->r_fs; + lxrp->lxur_xgs = rp->r_gs; + } + kpreempt_enable(); +#else /* __i386 */ + lxrp->lxur_ebx = rp->r_ebx; + lxrp->lxur_ecx = rp->r_ecx; + lxrp->lxur_edx = rp->r_edx; + lxrp->lxur_esi = rp->r_esi; + lxrp->lxur_edi = rp->r_edi; + lxrp->lxur_ebp = rp->r_ebp; + lxrp->lxur_eax = rp->r_eax; + lxrp->lxur_orig_eax = 0; + lxrp->lxur_eip = rp->r_eip; + lxrp->lxur_eflags = rp->r_efl; + lxrp->lxur_esp = rp->r_esp; + lxrp->lxur_xss = rp->r_ss; + + lxrp->lxur_xds = rp->r_ds; + lxrp->lxur_xes = rp->r_es; + lxrp->lxur_xfs = rp->r_fs; + lxrp->lxur_xgs = rp->r_gs; +#endif /* __amd64 */ + + /* emulated %cs, see defines */ + lxrp->lxur_xcs = LX_CS_32BIT; + + if (lwpd->br_ptrace_whatstop == LX_PR_SYSENTRY) { + lxrp->lxur_eax = (int32_t)-lx_errno(ENOTSUP, EINVAL); + lxrp->lxur_orig_eax = (int32_t)lwpd->br_syscall_num; + } else if (lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) { + lxrp->lxur_orig_eax = (int32_t)lwpd->br_syscall_num; + } + + return (0); +} + +static int +lx_set_user_regs32_uc(klwp_t *lwp, void *ucp, lx_user_regs32_t *lxrp) +{ + proc_t *p = lwptoproc(lwp); + ucontext32_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (-1); + } + + /* + * Note: we currently ignore "lxur_orig_eax" here since this + * path should not be used for system call stops.
+ */ + LX_REG(&uc, EBP) = lxrp->lxur_ebp; + LX_REG(&uc, EBX) = lxrp->lxur_ebx; + LX_REG(&uc, EAX) = lxrp->lxur_eax; + LX_REG(&uc, ECX) = lxrp->lxur_ecx; + LX_REG(&uc, EDX) = lxrp->lxur_edx; + LX_REG(&uc, ESI) = lxrp->lxur_esi; + LX_REG(&uc, EDI) = lxrp->lxur_edi; + LX_REG(&uc, EIP) = lxrp->lxur_eip; + LX_REG(&uc, EFL) = PSLMERGE(LX_REG(&uc, EFL), lxrp->lxur_eflags); + LX_REG(&uc, UESP) = lxrp->lxur_esp; + LX_REG(&uc, SS) = fix_segreg(lxrp->lxur_xss, IS_NOT_CS, + DATAMODEL_ILP32); + + /* %cs is ignored because of our lies */ + LX_REG(&uc, DS) = fix_segreg(lxrp->lxur_xds, IS_NOT_CS, + DATAMODEL_ILP32); + LX_REG(&uc, ES) = fix_segreg(lxrp->lxur_xes, IS_NOT_CS, + DATAMODEL_ILP32); + LX_REG(&uc, FS) = fix_segreg(lxrp->lxur_xfs, IS_NOT_CS, + DATAMODEL_ILP32); + LX_REG(&uc, GS) = fix_segreg(lxrp->lxur_xgs, IS_NOT_CS, + DATAMODEL_ILP32); + + if (lx_write_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (-1); + } + return (0); +} + +static int +lx_set_user_regs32(lx_lwp_data_t *lwpd, lx_user_regs32_t *lxrp) +{ + klwp_t *lwp = lwpd->br_lwp; + struct regs *rp = lwptoregs(lwp); + void *ucp; +#ifdef __amd64 + struct pcb *pcb = &lwp->lwp_pcb; +#endif + + VERIFY(lwp_getdatamodel(lwp) == DATAMODEL_ILP32); + + switch (lx_regs_location(lwpd, &ucp, B_TRUE)) { + case LX_REG_LOC_UNAVAIL: + return (-1); + + case LX_REG_LOC_UCP: + return (lx_set_user_regs32_uc(lwp, ucp, lxrp)); + + case LX_REG_LOC_LWP: + /* transformation below */ + break; + + default: + VERIFY(0); + break; + } + +#ifdef __amd64 + rp->r_rbx = (int32_t)lxrp->lxur_ebx; + rp->r_rcx = (int32_t)lxrp->lxur_ecx; + rp->r_rdx = (int32_t)lxrp->lxur_edx; + rp->r_rsi = (int32_t)lxrp->lxur_esi; + rp->r_rdi = (int32_t)lxrp->lxur_edi; + rp->r_rbp = (int32_t)lxrp->lxur_ebp; + rp->r_rax = (int32_t)lxrp->lxur_eax; + lwpd->br_syscall_num = (int)lxrp->lxur_orig_eax; + rp->r_rip = (int32_t)lxrp->lxur_eip; + rp->r_rfl = (int32_t)PSLMERGE(rp->r_rfl, lxrp->lxur_eflags); + rp->r_rsp = (int32_t)lxrp->lxur_esp; + rp->r_ss = 
(int32_t)fix_segreg(lxrp->lxur_xss, IS_NOT_CS, + DATAMODEL_ILP32); + + kpreempt_disable(); + PCB_SET_UPDATE_SEGS(pcb); + pcb->pcb_ds = fix_segreg(lxrp->lxur_xds, IS_NOT_CS, DATAMODEL_ILP32); + pcb->pcb_es = fix_segreg(lxrp->lxur_xes, IS_NOT_CS, DATAMODEL_ILP32); + pcb->pcb_fs = fix_segreg(lxrp->lxur_xfs, IS_NOT_CS, DATAMODEL_ILP32); + pcb->pcb_gs = fix_segreg(lxrp->lxur_xgs, IS_NOT_CS, DATAMODEL_ILP32); + kpreempt_enable(); +#else /* __i386 */ + rp->r_ebx = lxrp->lxur_ebx; + rp->r_ecx = lxrp->lxur_ecx; + rp->r_edx = lxrp->lxur_edx; + rp->r_esi = lxrp->lxur_esi; + rp->r_edi = lxrp->lxur_edi; + rp->r_ebp = lxrp->lxur_ebp; + rp->r_eax = lxrp->lxur_eax; + lwpd->br_syscall_num = (int)lxrp->lxur_orig_eax; + rp->r_eip = lxrp->lxur_eip; + rp->r_efl = PSLMERGE(rp->r_efl, lxrp->lxur_eflags); + rp->r_esp = lxrp->lxur_esp; + rp->r_ss = fix_segreg(lxrp->lxur_xss, IS_NOT_CS, DATAMODEL_ILP32); + + rp->r_ds = fix_segreg(lxrp->lxur_xds, IS_NOT_CS, DATAMODEL_ILP32); + rp->r_es = fix_segreg(lxrp->lxur_xes, IS_NOT_CS, DATAMODEL_ILP32); + rp->r_fs = fix_segreg(lxrp->lxur_xfs, IS_NOT_CS, DATAMODEL_ILP32); + rp->r_gs = fix_segreg(lxrp->lxur_xgs, IS_NOT_CS, DATAMODEL_ILP32); +#endif /* __amd64 */ + + return (0); +} + +#ifdef __amd64 + +static void +lx_getfpregs64(lx_lwp_data_t *lwpd, lx_user_fpregs64_t *lfp) +{ + fpregset_t fp; + + getfpregs(lwpd->br_lwp, &fp); + /* Drop the extra illumos status/xstatus fields when copying state */ + bcopy(&fp.fp_reg_set.fpchip_state, lfp, sizeof (*lfp)); +} + +static void +lx_setfpregs64(lx_lwp_data_t *lwpd, lx_user_fpregs64_t *lfp) +{ + fpregset_t fp; + + /* + * Since the Linux fpregs structure does not contain the same + * additional status register which illumos contains, we simply + * preserve the existing values when setting fp state. 
+ */ + getfpregs(lwpd->br_lwp, &fp); + + /* Copy the identically formatted state */ + bcopy(lfp, &fp.fp_reg_set.fpchip_state, sizeof (*lfp)); + + setfpregs(lwpd->br_lwp, &fp); +} + +static int +lx_get_user_regs64_uc(klwp_t *lwp, void *ucp, lx_user_regs64_t *lxrp) +{ + proc_t *p = lwptoproc(lwp); + + switch (lwp_getdatamodel(lwp)) { + case DATAMODEL_LP64: { + ucontext_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (-1); + } + + lxrp->lxur_r15 = LX_REG(&uc, REG_R15); + lxrp->lxur_r14 = LX_REG(&uc, REG_R14); + lxrp->lxur_r13 = LX_REG(&uc, REG_R13); + lxrp->lxur_r12 = LX_REG(&uc, REG_R12); + lxrp->lxur_rbp = LX_REG(&uc, REG_RBP); + lxrp->lxur_rbx = LX_REG(&uc, REG_RBX); + lxrp->lxur_r11 = LX_REG(&uc, REG_R11); + lxrp->lxur_r10 = LX_REG(&uc, REG_R10); + lxrp->lxur_r9 = LX_REG(&uc, REG_R9); + lxrp->lxur_r8 = LX_REG(&uc, REG_R8); + lxrp->lxur_rax = LX_REG(&uc, REG_RAX); + lxrp->lxur_rcx = LX_REG(&uc, REG_RCX); + lxrp->lxur_rdx = LX_REG(&uc, REG_RDX); + lxrp->lxur_rsi = LX_REG(&uc, REG_RSI); + lxrp->lxur_rdi = LX_REG(&uc, REG_RDI); + lxrp->lxur_orig_rax = 0; + lxrp->lxur_rip = LX_REG(&uc, REG_RIP); + lxrp->lxur_rflags = LX_REG(&uc, REG_RFL); + lxrp->lxur_rsp = LX_REG(&uc, REG_RSP); + lxrp->lxur_xss = LX_REG(&uc, REG_SS); + lxrp->lxur_xfs_base = LX_REG(&uc, REG_FSBASE); + lxrp->lxur_xgs_base = LX_REG(&uc, REG_GSBASE); + + lxrp->lxur_xds = LX_REG(&uc, REG_DS); + lxrp->lxur_xes = LX_REG(&uc, REG_ES); + lxrp->lxur_xfs = LX_REG(&uc, REG_FS); + lxrp->lxur_xgs = LX_REG(&uc, REG_GS); + + /* emulated %cs, see defines */ + lxrp->lxur_xcs = LX_CS_64BIT; + return (0); + } + + case DATAMODEL_ILP32: { + ucontext32_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (-1); + } + + lxrp->lxur_r15 = 0; + lxrp->lxur_r14 = 0; + lxrp->lxur_r13 = 0; + lxrp->lxur_r12 = 0; + lxrp->lxur_r11 = 0; + lxrp->lxur_r10 = 0; + lxrp->lxur_r9 = 0; + lxrp->lxur_r8 = 0; + lxrp->lxur_rbp = LX_REG(&uc, EBP); + lxrp->lxur_rbx = LX_REG(&uc, EBX); + lxrp->lxur_rax = LX_REG(&uc, 
EAX); + lxrp->lxur_orig_rax = 0; + lxrp->lxur_rcx = LX_REG(&uc, ECX); + lxrp->lxur_rdx = LX_REG(&uc, EDX); + lxrp->lxur_rsi = LX_REG(&uc, ESI); + lxrp->lxur_rdi = LX_REG(&uc, EDI); + lxrp->lxur_rip = LX_REG(&uc, EIP); + + lxrp->lxur_rflags = LX_REG(&uc, EFL); + lxrp->lxur_rsp = LX_REG(&uc, UESP); + lxrp->lxur_xss = LX_REG(&uc, SS); + lxrp->lxur_xfs_base = 0; + lxrp->lxur_xgs_base = 0; + + lxrp->lxur_xds = LX_REG(&uc, DS); + lxrp->lxur_xes = LX_REG(&uc, ES); + lxrp->lxur_xfs = LX_REG(&uc, FS); + lxrp->lxur_xgs = LX_REG(&uc, GS); + + /* See comment above re: %cs register */ + lxrp->lxur_xcs = LX_CS_32BIT; + return (0); + } + + default: + break; + } + + return (-1); +} + +static int +lx_get_user_regs64(lx_lwp_data_t *lwpd, lx_user_regs64_t *lxrp) +{ + klwp_t *lwp = lwpd->br_lwp; + struct regs *rp = lwptoregs(lwp); + struct pcb *pcb = &lwp->lwp_pcb; + void *ucp; + + switch (lx_regs_location(lwpd, &ucp, B_FALSE)) { + case LX_REG_LOC_UNAVAIL: + return (-1); + + case LX_REG_LOC_UCP: + return (lx_get_user_regs64_uc(lwp, ucp, lxrp)); + + case LX_REG_LOC_LWP: + /* transformation below */ + break; + + default: + VERIFY(0); + break; + } + + lxrp->lxur_r15 = rp->r_r15; + lxrp->lxur_r14 = rp->r_r14; + lxrp->lxur_r13 = rp->r_r13; + lxrp->lxur_r12 = rp->r_r12; + lxrp->lxur_rbp = rp->r_rbp; + lxrp->lxur_rbx = rp->r_rbx; + lxrp->lxur_r11 = rp->r_r11; + lxrp->lxur_r10 = rp->r_r10; + lxrp->lxur_r9 = rp->r_r9; + lxrp->lxur_r8 = rp->r_r8; + lxrp->lxur_rax = rp->r_rax; + lxrp->lxur_rcx = rp->r_rcx; + lxrp->lxur_rdx = rp->r_rdx; + lxrp->lxur_rsi = rp->r_rsi; + lxrp->lxur_rdi = rp->r_rdi; + lxrp->lxur_orig_rax = 0; + lxrp->lxur_rip = rp->r_rip; + + lxrp->lxur_rflags = rp->r_rfl; + lxrp->lxur_rsp = rp->r_rsp; + lxrp->lxur_xss = rp->r_ss; + lxrp->lxur_xfs_base = pcb->pcb_fsbase; + lxrp->lxur_xgs_base = pcb->pcb_gsbase; + + /* emulated %cs, see defines */ + switch (lwp_getdatamodel(lwp)) { + case DATAMODEL_LP64: + lxrp->lxur_xcs = LX_CS_64BIT; + break; + case DATAMODEL_ILP32: + lxrp->lxur_xcs 
= LX_CS_32BIT; + break; + default: + VERIFY(0); + break; + } + + kpreempt_disable(); + if (PCB_NEED_UPDATE_SEGS(pcb)) { + lxrp->lxur_xds = pcb->pcb_ds; + lxrp->lxur_xes = pcb->pcb_es; + lxrp->lxur_xfs = pcb->pcb_fs; + lxrp->lxur_xgs = pcb->pcb_gs; + } else { + lxrp->lxur_xds = rp->r_ds; + lxrp->lxur_xes = rp->r_es; + lxrp->lxur_xfs = rp->r_fs; + lxrp->lxur_xgs = rp->r_gs; + } + kpreempt_enable(); + + if (lwpd->br_ptrace_whatstop == LX_PR_SYSENTRY) { + lxrp->lxur_rax = -lx_errno(ENOTSUP, EINVAL); + lxrp->lxur_orig_rax = lwpd->br_syscall_num; + } else if (lwpd->br_ptrace_whatstop == LX_PR_SYSEXIT) { + lxrp->lxur_orig_rax = lwpd->br_syscall_num; + } + + return (0); +} + +static int +lx_set_user_regs64_uc(klwp_t *lwp, void *ucp, lx_user_regs64_t *lxrp) +{ + proc_t *p = lwptoproc(lwp); + + switch (lwp_getdatamodel(lwp)) { + case DATAMODEL_LP64: { + ucontext_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (-1); + } + + /* + * Note: we currently ignore "lxur_orig_rax" here since this + * path should not be used for system call stops. 
+ */ + LX_REG(&uc, REG_R15) = lxrp->lxur_r15; + LX_REG(&uc, REG_R14) = lxrp->lxur_r14; + LX_REG(&uc, REG_R13) = lxrp->lxur_r13; + LX_REG(&uc, REG_R12) = lxrp->lxur_r12; + LX_REG(&uc, REG_RBP) = lxrp->lxur_rbp; + LX_REG(&uc, REG_RBX) = lxrp->lxur_rbx; + LX_REG(&uc, REG_R11) = lxrp->lxur_r11; + LX_REG(&uc, REG_R10) = lxrp->lxur_r10; + LX_REG(&uc, REG_R9) = lxrp->lxur_r9; + LX_REG(&uc, REG_R8) = lxrp->lxur_r8; + LX_REG(&uc, REG_RAX) = lxrp->lxur_rax; + LX_REG(&uc, REG_RCX) = lxrp->lxur_rcx; + LX_REG(&uc, REG_RDX) = lxrp->lxur_rdx; + LX_REG(&uc, REG_RSI) = lxrp->lxur_rsi; + LX_REG(&uc, REG_RDI) = lxrp->lxur_rdi; + LX_REG(&uc, REG_RIP) = lxrp->lxur_rip; + LX_REG(&uc, REG_RFL) = PSLMERGE(LX_REG(&uc, REG_RFL), + lxrp->lxur_rflags); + LX_REG(&uc, REG_RSP) = lxrp->lxur_rsp; + LX_REG(&uc, REG_SS) = fix_segreg(lxrp->lxur_xss, IS_NOT_CS, + DATAMODEL_LP64); + LX_REG(&uc, REG_FSBASE) = lxrp->lxur_xfs_base; + LX_REG(&uc, REG_GSBASE) = lxrp->lxur_xgs_base; + + /* %cs is ignored because of our lies */ + LX_REG(&uc, REG_DS) = fix_segreg(lxrp->lxur_xds, IS_NOT_CS, + DATAMODEL_LP64); + LX_REG(&uc, REG_ES) = fix_segreg(lxrp->lxur_xes, IS_NOT_CS, + DATAMODEL_LP64); + LX_REG(&uc, REG_FS) = fix_segreg(lxrp->lxur_xfs, IS_NOT_CS, + DATAMODEL_LP64); + LX_REG(&uc, REG_GS) = fix_segreg(lxrp->lxur_xgs, IS_NOT_CS, + DATAMODEL_LP64); + + if (lx_write_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (-1); + } + + return (0); + } + + case DATAMODEL_ILP32: { + ucontext32_t uc; + + if (lx_read_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (-1); + } + + /* + * Note: we currently ignore "lxur_orig_rax" here since this + * path should not be used for system call stops. 
+ */ + LX_REG(&uc, EBP) = (int32_t)lxrp->lxur_rbp; + LX_REG(&uc, EBX) = (int32_t)lxrp->lxur_rbx; + LX_REG(&uc, EAX) = (int32_t)lxrp->lxur_rax; + LX_REG(&uc, ECX) = (int32_t)lxrp->lxur_rcx; + LX_REG(&uc, EDX) = (int32_t)lxrp->lxur_rdx; + LX_REG(&uc, ESI) = (int32_t)lxrp->lxur_rsi; + LX_REG(&uc, EDI) = (int32_t)lxrp->lxur_rdi; + LX_REG(&uc, EIP) = (int32_t)lxrp->lxur_rip; + LX_REG(&uc, EFL) = (int32_t)PSLMERGE(LX_REG(&uc, EFL), + lxrp->lxur_rflags); + LX_REG(&uc, UESP) = (int32_t)lxrp->lxur_rsp; + LX_REG(&uc, SS) = (int32_t)fix_segreg(lxrp->lxur_xss, + IS_NOT_CS, DATAMODEL_ILP32); + + /* %cs is ignored because of our lies */ + LX_REG(&uc, DS) = (int32_t)fix_segreg(lxrp->lxur_xds, + IS_NOT_CS, DATAMODEL_ILP32); + LX_REG(&uc, ES) = (int32_t)fix_segreg(lxrp->lxur_xes, + IS_NOT_CS, DATAMODEL_ILP32); + LX_REG(&uc, FS) = (int32_t)fix_segreg(lxrp->lxur_xfs, + IS_NOT_CS, DATAMODEL_ILP32); + LX_REG(&uc, GS) = (int32_t)fix_segreg(lxrp->lxur_xgs, + IS_NOT_CS, DATAMODEL_ILP32); + + if (lx_write_uc(p, ucp, &uc, sizeof (uc)) != 0) { + return (-1); + } + return (0); + } + + default: + break; + } + + return (-1); +} + +static int +lx_set_user_regs64(lx_lwp_data_t *lwpd, lx_user_regs64_t *lxrp) +{ + klwp_t *lwp = lwpd->br_lwp; + struct regs *rp = lwptoregs(lwp); + struct pcb *pcb = &lwp->lwp_pcb; + void *ucp; + + switch (lx_regs_location(lwpd, &ucp, B_TRUE)) { + case LX_REG_LOC_UNAVAIL: + return (-1); + + case LX_REG_LOC_UCP: + return (lx_set_user_regs64_uc(lwp, ucp, lxrp)); + + case LX_REG_LOC_LWP: + /* transformation below */ + break; + + default: + VERIFY(0); + break; + } + + rp->r_r15 = lxrp->lxur_r15; + rp->r_r14 = lxrp->lxur_r14; + rp->r_r13 = lxrp->lxur_r13; + rp->r_r12 = lxrp->lxur_r12; + rp->r_rbp = lxrp->lxur_rbp; + rp->r_rbx = lxrp->lxur_rbx; + rp->r_r11 = lxrp->lxur_r11; + rp->r_r10 = lxrp->lxur_r10; + rp->r_r9 = lxrp->lxur_r9; + rp->r_r8 = lxrp->lxur_r8; + rp->r_rax = lxrp->lxur_rax; + rp->r_rcx = lxrp->lxur_rcx; + rp->r_rdx = lxrp->lxur_rdx; + rp->r_rsi = 
lxrp->lxur_rsi; + rp->r_rdi = lxrp->lxur_rdi; + lwpd->br_syscall_num = (int)lxrp->lxur_orig_rax; + rp->r_rip = lxrp->lxur_rip; + rp->r_rfl = PSLMERGE(rp->r_rfl, lxrp->lxur_rflags); + rp->r_rsp = lxrp->lxur_rsp; + rp->r_ss = fix_segreg(lxrp->lxur_xss, IS_NOT_CS, DATAMODEL_LP64); + pcb->pcb_fsbase = lxrp->lxur_xfs_base; + pcb->pcb_gsbase = lxrp->lxur_xgs_base; + + kpreempt_disable(); + PCB_SET_UPDATE_SEGS(pcb); + pcb->pcb_ds = fix_segreg(lxrp->lxur_xds, IS_NOT_CS, DATAMODEL_LP64); + pcb->pcb_es = fix_segreg(lxrp->lxur_xes, IS_NOT_CS, DATAMODEL_LP64); + pcb->pcb_fs = fix_segreg(lxrp->lxur_xfs, IS_NOT_CS, DATAMODEL_LP64); + pcb->pcb_gs = fix_segreg(lxrp->lxur_xgs, IS_NOT_CS, DATAMODEL_LP64); + kpreempt_enable(); + + return (0); +} + +#endif /* __amd64 */ + +static int +lx_peekuser32(lx_lwp_data_t *lwpd, uintptr_t offset, uint32_t *res) +{ + lx_user32_t lxu; + boolean_t valid = B_FALSE; + + bzero(&lxu, sizeof (lxu)); + if (offset < sizeof (lx_user_regs32_t)) { + if (lx_get_user_regs32(lwpd, &lxu.lxu_regs) == 0) { + valid = B_TRUE; + } + } + if (valid) { + uint32_t *data = (uint32_t *)&lxu; + *res = data[offset / sizeof (uint32_t)]; + return (0); + } + return (-1); +} + +#ifdef __amd64 +static int +lx_peekuser64(lx_lwp_data_t *lwpd, uintptr_t offset, uintptr_t *res) +{ + lx_user64_t lxu; + boolean_t valid = B_FALSE; + + bzero(&lxu, sizeof (lxu)); + if (offset < sizeof (lx_user_regs64_t)) { + if (lx_get_user_regs64(lwpd, &lxu.lxu_regs) == 0) { + valid = B_TRUE; + } + } + if (valid) { + uintptr_t *data = (uintptr_t *)&lxu; + *res = data[offset / sizeof (uintptr_t)]; + return (0); + } + return (-1); +} +#endif /* __amd64 */ + +int +lx_user_regs_copyin(lx_lwp_data_t *lwpd, void *uregsp) +{ + model_t target_model = lwp_getdatamodel(lwpd->br_lwp); + + switch (get_udatamodel()) { + case DATAMODEL_ILP32: + if (target_model == DATAMODEL_ILP32) { + lx_user_regs32_t regs; + + if (copyin(uregsp, ®s, sizeof (regs)) != 0) { + return (EFAULT); + } + if (lx_set_user_regs32(lwpd, ®s) != 
0) { + return (EIO); + } + return (0); + } + break; + +#ifdef __amd64 + case DATAMODEL_LP64: + if (target_model == DATAMODEL_ILP32 || + target_model == DATAMODEL_LP64) { + lx_user_regs64_t regs; + + if (copyin(uregsp, ®s, sizeof (regs)) != 0) { + return (EFAULT); + } + if (lx_set_user_regs64(lwpd, ®s) != 0) { + return (EIO); + } + return (0); + } + break; +#endif /* __amd64 */ + + default: + break; + } + return (EIO); +} + +int +lx_user_regs_copyout(lx_lwp_data_t *lwpd, void *uregsp) +{ + model_t target_model = lwp_getdatamodel(lwpd->br_lwp); + + switch (get_udatamodel()) { + case DATAMODEL_ILP32: + if (target_model == DATAMODEL_ILP32) { + lx_user_regs32_t regs; + + if (lx_get_user_regs32(lwpd, ®s) != 0) { + return (EIO); + } + if (copyout(®s, uregsp, sizeof (regs)) != 0) { + return (EFAULT); + } + return (0); + } + break; + +#ifdef __amd64 + case DATAMODEL_LP64: + if (target_model == DATAMODEL_ILP32 || + target_model == DATAMODEL_LP64) { + lx_user_regs64_t regs; + + if (lx_get_user_regs64(lwpd, ®s) != 0) { + return (EIO); + } + if (copyout(®s, uregsp, sizeof (regs)) != 0) { + return (EFAULT); + } + return (0); + } + break; +#endif /* __amd64 */ + + default: + break; + } + return (EIO); +} + +int +lx_user_fpregs_copyin(lx_lwp_data_t *lwpd, void *uregsp) +{ + model_t target_model = lwp_getdatamodel(lwpd->br_lwp); + + switch (get_udatamodel()) { + case DATAMODEL_ILP32: + if (target_model == DATAMODEL_ILP32) { + lx_user_fpregs32_t regs; + + if (copyin(uregsp, ®s, sizeof (regs)) != 0) { + return (EFAULT); + } + lx_setfpregs32(lwpd, ®s); + return (0); + } + break; + +#ifdef __amd64 + case DATAMODEL_LP64: + if (target_model == DATAMODEL_ILP32 || + target_model == DATAMODEL_LP64) { + lx_user_fpregs64_t regs; + + if (copyin(uregsp, ®s, sizeof (regs)) != 0) { + return (EFAULT); + } + lx_setfpregs64(lwpd, ®s); + return (0); + } + break; +#endif /* __amd64 */ + + default: + break; + } + return (EIO); +} + +int +lx_user_fpregs_copyout(lx_lwp_data_t *lwpd, void *uregsp) +{ + 
model_t target_model = lwp_getdatamodel(lwpd->br_lwp); + + switch (get_udatamodel()) { + case DATAMODEL_ILP32: + if (target_model == DATAMODEL_ILP32) { + lx_user_fpregs32_t regs; + + lx_getfpregs32(lwpd, ®s); + if (copyout(®s, uregsp, sizeof (regs)) != 0) { + return (EFAULT); + } + return (0); + } + break; + +#ifdef __amd64 + case DATAMODEL_LP64: + if (target_model == DATAMODEL_ILP32 || + target_model == DATAMODEL_LP64) { + lx_user_fpregs64_t regs; + + lx_getfpregs64(lwpd, ®s); + if (copyout(®s, uregsp, sizeof (regs)) != 0) { + return (EFAULT); + } + return (0); + } + break; +#endif /* __amd64 */ + + default: + break; + } + return (EIO); +} + +/* ARGSUSED */ +int +lx_user_fpxregs_copyin(lx_lwp_data_t *lwpd, void *uregsp) +{ + /* Punt on fpxregs for now */ + return (EIO); +} + +/* ARGSUSED */ +int +lx_user_fpxregs_copyout(lx_lwp_data_t *lwpd, void *uregsp) +{ + /* Punt on fpxregs for now */ + return (EIO); +} + +int +lx_ptrace_peekuser(lx_lwp_data_t *lwpd, uintptr_t offset, void *uptr) +{ + model_t target_model = lwp_getdatamodel(lwpd->br_lwp); + + switch (get_udatamodel()) { + case DATAMODEL_ILP32: + if ((offset & (sizeof (uint32_t) - 1)) != 0) { + /* Must be aligned to 32bit boundary */ + break; + } + if (target_model == DATAMODEL_ILP32) { + uint32_t res; + + if (lx_peekuser32(lwpd, offset, &res) != 0) { + return (EIO); + } + if (copyout(&res, uptr, sizeof (res)) != 0) { + return (EFAULT); + } + return (0); + } + break; + +#ifdef __amd64 + case DATAMODEL_LP64: + if ((offset & (sizeof (uintptr_t) - 1)) != 0) { + /* Must be aligned to 64bit boundary */ + break; + } + if (target_model == DATAMODEL_ILP32 || + target_model == DATAMODEL_LP64) { + uintptr_t res; + + if (lx_peekuser64(lwpd, offset, &res) != 0) { + return (EIO); + } + if (copyout(&res, uptr, sizeof (res)) != 0) { + return (EFAULT); + } + return (0); + } + break; +#endif /* __amd64 */ + + default: + break; + } + return (EIO); +} + +/* ARGSUSED */ +int +lx_ptrace_pokeuser(lx_lwp_data_t *lwpd, uintptr_t 
offset, void *uptr) +{ + return (EIO); +} + + +/* + * Load registers and repoint the stack and program counter. This function is + * used by the B_JUMP_TO_LINUX brand system call to revector to a Linux + * entrypoint. + */ +int +lx_runexe(klwp_t *lwp, void *ucp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * We should only make it here when transitioning to Linux from + * the NATIVE or INIT mode. + */ + VERIFY(lwpd->br_stack_mode == LX_STACK_MODE_NATIVE || + lwpd->br_stack_mode == LX_STACK_MODE_INIT); + +#if defined(__amd64) + if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) { + struct pcb *pcb = &lwp->lwp_pcb; + + /* + * Preserve the %fs/%gsbase value for this LWP, as set and used + * by native illumos code. + */ + lwpd->br_ntv_fsbase = pcb->pcb_fsbase; + lwpd->br_ntv_gsbase = pcb->pcb_gsbase; + + return (getsetcontext(SETCONTEXT, ucp)); + } else { + return (getsetcontext32(SETCONTEXT, ucp)); + } +#else + return (getsetcontext(SETCONTEXT, ucp)); +#endif +} + +/* + * The usermode emulation code is illumos library code. This routine ensures + * the segment registers are set up correctly for native illumos code. It + * should be called _after_ we have stored the outgoing Linux machine state + * but _before_ we return from the kernel to any illumos native code; e.g. the + * usermode emulation library, or any interposed signal handlers. + * + * See the comment on lwp_segregs_save() for how we handle the usermode + * registers when we come into the kernel and see update_sregs() for how we + * restore. + */ +void +lx_switch_to_native(klwp_t *lwp) +{ +#if defined(__amd64) + model_t datamodel = lwp_getdatamodel(lwp); + + switch (datamodel) { + case DATAMODEL_ILP32: { + struct pcb *pcb = &lwp->lwp_pcb; + + /* + * For 32-bit processes, we ensure that the correct %gs value + * is loaded: + */ + kpreempt_disable(); + if (PCB_NEED_UPDATE_SEGS(pcb)) { + /* + * If we are already flushing the segment registers, + * then ensure we are flushing the native %gs. 
+ */ + pcb->pcb_gs = LWPGS_SEL; + } else { + struct regs *rp = lwptoregs(lwp); + + /* + * If we are not flushing the segment registers yet, + * only do so if %gs is not correct already: + */ + if (rp->r_gs != LWPGS_SEL) { + pcb->pcb_gs = LWPGS_SEL; + + /* + * Ensure we go out via update_sregs. + */ + PCB_SET_UPDATE_SEGS(pcb); + } + } + kpreempt_enable(); + break; + } + + case DATAMODEL_LP64: { + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * For 64-bit processes we ensure that the correct %fsbase + * value is loaded: + */ + if (lwpd->br_ntv_fsbase != 0) { + struct pcb *pcb = &lwp->lwp_pcb; + + kpreempt_disable(); + if (pcb->pcb_fsbase != lwpd->br_ntv_fsbase) { + pcb->pcb_fsbase = lwpd->br_ntv_fsbase; + + /* + * Ensure we go out via update_sregs. + */ + PCB_SET_UPDATE_SEGS(pcb); + } + kpreempt_enable(); + } + /* + * ... and the correct %gsbase + */ + if (lwpd->br_ntv_gsbase != 0) { + struct pcb *pcb = &lwp->lwp_pcb; + + kpreempt_disable(); + if (pcb->pcb_gsbase != lwpd->br_ntv_gsbase) { + pcb->pcb_gsbase = lwpd->br_ntv_gsbase; + + /* + * Ensure we go out via update_sregs. + */ + PCB_SET_UPDATE_SEGS(pcb); + } + kpreempt_enable(); + } + break; + } + + default: + cmn_err(CE_PANIC, "unknown data model: %d", datamodel); + } +#elif defined(__i386) + struct regs *rp = lwptoregs(lwp); + + rp->r_gs = LWPGS_SEL; +#else +#error "unknown x86" +#endif +} + +#if defined(__amd64) +/* + * Call frame for the 64-bit usermode emulation handler: + * lx_emulate(ucontext_t *ucp, int syscall_num, uintptr_t *args) + * + * old sp: -------------------------------------------------------------- + * | - ucontext_t (register state for emulation) + * | - uintptr_t[6] (system call arguments array) + * V -------------------------------------------------------------- + * new sp: - bogus return address + * + * Arguments are passed in registers, per the AMD64 ABI: %rdi, %rsi and %rdx. 
+ */ +void +lx_emulate_user(klwp_t *lwp, int syscall_num, uintptr_t *args) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + label_t lab; + uintptr_t uc_addr; + uintptr_t args_addr; + uintptr_t top; + /* + * Variables used after on_fault() returns for a fault + * must be volatile. + */ + volatile size_t frsz; + volatile uintptr_t sp; + volatile proc_t *p = lwptoproc(lwp); + volatile int watched; + + /* + * We should not be able to get here unless we are running Linux + * code for a system call we cannot emulate in the kernel. + */ + VERIFY(lwpd->br_stack_mode == LX_STACK_MODE_BRAND); + + /* + * The AMD64 ABI requires us to align the return address on the stack + * so that when the called function pushes %rbp, the stack is 16-byte + * aligned. + * + * This routine, like the amd64 version of sendsig(), depends on + * STACK_ALIGN being 16 and STACK_ENTRY_ALIGN being 8. + */ +#if STACK_ALIGN != 16 || STACK_ENTRY_ALIGN != 8 +#error "lx_emulate_user() amd64 did not find the expected stack alignments" +#endif + + /* + * We begin at the current native stack pointer, and reserve space for + * the ucontext_t we are copying onto the stack, as well as the call + * arguments for the usermode emulation handler. + * + * We 16-byte align the entire frame, and then unalign it again by + * adding space for the return address. + */ + frsz = SA(sizeof (ucontext_t)) + SA(6 * sizeof (uintptr_t)) + + sizeof (uintptr_t); + VERIFY((frsz & (STACK_ALIGN - 1UL)) == 8); + VERIFY((frsz & (STACK_ENTRY_ALIGN - 1UL)) == 0); + + if (lwpd->br_ntv_stack == lwpd->br_ntv_stack_current) { + /* + * Nobody else is using the stack right now, so start at the + * top. + */ + top = lwpd->br_ntv_stack_current; + } else { + /* + * Drop below the 128-byte reserved region of the stack frame + * we are interrupting. 
+ */ + top = lwpd->br_ntv_stack_current - STACK_RESERVE; + } + top = top & ~(STACK_ALIGN - 1); + sp = top - frsz; + + uc_addr = top - SA(sizeof (ucontext_t)); + args_addr = uc_addr - SA(6 * sizeof (uintptr_t)); + + watched = watch_disable_addr((caddr_t)sp, frsz, S_WRITE); + + /* + * Save the register state we preserved on the way into this brand + * system call and drop it on the native stack. + */ + { + /* + * Note: the amd64 ucontext_t is 864 bytes. + */ + ucontext_t uc; + + /* + * We do not want to save the signal mask for an emulation + * context. Some emulated system calls alter the signal mask; + * restoring it when the emulation is complete would clobber + * those intentional side effects. + */ + savecontext(&uc, NULL); + + if (on_fault(&lab)) { + goto badstack; + } + + /* + * Mark this as a system call emulation context: + */ + uc.uc_brand_data[0] = (void *)((uintptr_t) + uc.uc_brand_data[0] | LX_UC_FRAME_IS_SYSCALL); + + copyout_noerr(&uc, (void *)(uintptr_t)uc_addr, sizeof (uc)); + } + + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, uc_addr); + lwp->lwp_oldcontext = (uintptr_t)uc_addr; + + /* + * Copy the system call arguments out to userland: + */ + copyout_noerr(args, (void *)(uintptr_t)args_addr, + 6 * sizeof (uintptr_t)); + + /* + * Drop the bogus return address on the stack. + */ + suword64_noerr((void *)sp, 0); + + no_fault(); + if (watched) { + watch_enable_addr((caddr_t)sp, frsz, S_WRITE); + } + + /* + * Pass the arguments to lx_emulate() in the appropriate registers. + */ + rp->r_rdi = uc_addr; + rp->r_rsi = syscall_num; + rp->r_rdx = args_addr; + + /* + * In order to be able to restore %edx, we need to JUSTRETURN. 
+ */ + lwp->lwp_eosys = JUSTRETURN; + curthread->t_post_sys = 1; + aston(curthread); + + /* + * Set stack pointer and return address to the usermode emulation + * handler: + */ + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + lx_lwp_set_native_stack_current(lwpd, sp); + + /* + * Divert execution, on our return, to the usermode emulation stack + * and handler: + */ + rp->r_fp = 0; + rp->r_sp = sp; + rp->r_pc = ptolxproc(p)->l_handler; + + /* + * Fix up segment registers, etc. + */ + lx_switch_to_native(lwp); + + return; + +badstack: + no_fault(); + if (watched) { + watch_enable_addr((caddr_t)sp, frsz, S_WRITE); + } + +#ifdef DEBUG + printf("lx_emulate_user: bad native stack cmd=%s, pid=%d, sp=0x%lx\n", + PTOU(p)->u_comm, p->p_pid, sp); +#endif + + exit(CLD_KILLED, SIGSEGV); +} + +#if defined(_SYSCALL32_IMPL) +/* + * Call frame for the 32-bit usermode emulation handler: + * lx_emulate(ucontext_t *ucp, int syscall_num, uintptr_t *args) + * + * old sp: -------------------------------------------------------------- + * | - ucontext_t (register state for emulation) + * | - uintptr_t[6] (system call arguments array) + * | -------------------------------------------------------------- + * | - arg2: uintptr_t * (pointer to arguments array above) + * | - arg1: int (system call number) + * V - arg0: ucontext_t * (pointer to context above) + * new sp: - bogus return address + */ +struct lx_emu_frame32 { + caddr32_t retaddr; /* 0 */ + caddr32_t ucontextp; /* 4 */ + int32_t syscall_num; /* 8 */ + caddr32_t argsp; /* c */ +}; + +/* + * This function arranges for the lwp to execute the usermode emulation handler + * for this system call. The mechanism is similar to signal handling, and this + * function is modelled on sendsig32(). 
+ */ +void +lx_emulate_user32(klwp_t *lwp, int syscall_num, uintptr_t *args) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + label_t lab; + caddr32_t uc_addr; + caddr32_t args_addr; + caddr32_t top; + /* + * Variables used after on_fault() returns for a fault + * must be volatile. + */ + volatile size_t frsz; + volatile caddr32_t sp; + volatile proc_t *p = lwptoproc(lwp); + volatile int watched; + + /* + * We should not be able to get here unless we are running Linux + * code for a system call we cannot emulate in the kernel. + */ + VERIFY(lwpd->br_stack_mode == LX_STACK_MODE_BRAND); + + /* + * We begin at the current native stack pointer, and reserve space for + * the ucontext_t we are copying onto the stack, as well as the call + * arguments for the usermode emulation handler. + */ + frsz = SA32(sizeof (ucontext32_t)) + SA32(6 * sizeof (uint32_t)) + + SA32(sizeof (struct lx_emu_frame32)); + VERIFY((frsz & (STACK_ALIGN32 - 1)) == 0); + + top = (caddr32_t)(lwpd->br_ntv_stack_current & ~(STACK_ALIGN32 - 1)); + sp = top - frsz; + + uc_addr = top - SA32(sizeof (ucontext32_t)); + args_addr = uc_addr - SA32(6 * sizeof (uint32_t)); + + watched = watch_disable_addr((caddr_t)(uintptr_t)sp, frsz, S_WRITE); + + /* + * Save the register state we preserved on the way into this brand + * system call and drop it on the native stack. + */ + { + /* + * Note: ucontext32_t is 512 bytes. + */ + ucontext32_t uc; + + /* + * We do not want to save the signal mask for an emulation + * context. Some emulated system calls alter the signal mask; + * restoring it when the emulation is complete would clobber + * those intentional side effects. 
+ */ + savecontext32(&uc, NULL); + + if (on_fault(&lab)) { + goto badstack; + } + + /* + * Mark this as a system call emulation context: + */ + uc.uc_brand_data[0] |= LX_UC_FRAME_IS_SYSCALL; + copyout_noerr(&uc, (void *)(uintptr_t)uc_addr, sizeof (uc)); + } + + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, uc_addr); + lwp->lwp_oldcontext = (uintptr_t)uc_addr; + + /* + * Copy the system call arguments out to userland: + */ + { + uint32_t args32[6]; + + args32[0] = args[0]; + args32[1] = args[1]; + args32[2] = args[2]; + args32[3] = args[3]; + args32[4] = args[4]; + args32[5] = args[5]; + + copyout_noerr(&args32, (void *)(uintptr_t)args_addr, + sizeof (args32)); + } + + /* + * Assemble the call frame on the stack. + */ + { + struct lx_emu_frame32 frm; + + frm.retaddr = 0; + frm.ucontextp = uc_addr; + frm.argsp = args_addr; + frm.syscall_num = syscall_num; + + copyout_noerr(&frm, (void *)(uintptr_t)sp, sizeof (frm)); + } + + no_fault(); + if (watched) { + watch_enable_addr((caddr_t)(uintptr_t)sp, frsz, S_WRITE); + } + + /* + * Set stack pointer and return address to the usermode emulation + * handler: + */ + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + lx_lwp_set_native_stack_current(lwpd, sp); + + /* + * Divert execution, on our return, to the usermode emulation stack + * and handler: + */ + rp->r_fp = 0; + rp->r_sp = sp; + rp->r_pc = ptolxproc(p)->l_handler; + + /* + * Fix up segment registers, etc. 
+ */ + lx_switch_to_native(lwp); + + return; + +badstack: + no_fault(); + if (watched) { + watch_enable_addr((caddr_t)(uintptr_t)sp, frsz, S_WRITE); + } + +#ifdef DEBUG + printf("lx_emulate_user32: bad native stack cmd=%s, pid=%d, sp=0x%x\n", + PTOU(p)->u_comm, p->p_pid, sp); +#endif + + exit(CLD_KILLED, SIGSEGV); +} +#endif /* _SYSCALL32_IMPL */ + +#else /* !__amd64 (__i386) */ + +/* ARGSUSED */ +void +lx_emulate_user(klwp_t *lwp, int syscall_num, uintptr_t *args) +{ + cmn_err(CE_WARN, "%s: no 32-bit kernel support", __FUNCTION__); + exit(CLD_KILLED, SIGSYS); +} + +#endif /* __amd64 */ diff --git a/usr/src/uts/intel/core_pcbe/Makefile b/usr/src/uts/intel/core_pcbe/Makefile index abb2713efc..f48ff5e69a 100644 --- a/usr/src/uts/intel/core_pcbe/Makefile +++ b/usr/src/uts/intel/core_pcbe/Makefile @@ -70,7 +70,7 @@ CPCGEN_SRCS = $(CPCGEN_OBJS:%.o=%.c) core_pcbe_cpcgen.h MODULE = pcbe.GenuineIntel.6.15 OBJECTS = $(CORE_PCBE_OBJS:%=$(OBJS_DIR)/%) OBJECTS += $(CPCGEN_OBJS:%=$(OBJS_DIR)/%) -ROOTMODULE = $(USR_PCBE_DIR)/$(MODULE) +ROOTMODULE = $(ROOT_PSM_PCBE_DIR)/$(MODULE) # # This order matches the families declared in uts/intel/sys/x86_archext.h. @@ -120,7 +120,7 @@ SOFTLINKS = \ pcbe.GenuineIntel.6.140 \ pcbe.GenuineIntel.6.141 -ROOTSOFTLINKS = $(SOFTLINKS:%=$(USR_PCBE_DIR)/%) +ROOTSOFTLINKS = $(SOFTLINKS:%=$(ROOT_PSM_PCBE_DIR)/%) # # Include common rules. diff --git a/usr/src/uts/intel/datafilt/Makefile b/usr/src/uts/intel/datafilt/Makefile new file mode 100644 index 0000000000..89d8354e09 --- /dev/null +++ b/usr/src/uts/intel/datafilt/Makefile @@ -0,0 +1,63 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright (c) 2011, OmniTI Computer Consulting, Inc. All rights reserved. +# Copyright 2012, Nexenta Systems, Inc. All rights reserved. +# + + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = datafilt +OBJECTS = $(DATAFILT_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -Nfs/sockfs -Ndrv/ip + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/dld/Makefile b/usr/src/uts/intel/dld/Makefile index acc064ca35..6bed0a217b 100644 --- a/usr/src/uts/intel/dld/Makefile +++ b/usr/src/uts/intel/dld/Makefile @@ -54,7 +54,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # CFLAGS += $(CCVERBOSE) LDFLAGS += -N misc/dls -N misc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # For now, disable these warnings; maintainers should endeavor diff --git a/usr/src/uts/intel/dls/Makefile b/usr/src/uts/intel/dls/Makefile index e10be370a9..f600dd5391 100644 --- a/usr/src/uts/intel/dls/Makefile +++ b/usr/src/uts/intel/dls/Makefile @@ -52,7 +52,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # CFLAGS += $(CCVERBOSE) LDFLAGS += -N misc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # For now, disable these warnings; maintainers should endeavor diff --git a/usr/src/uts/intel/dtrace/fasttrap_isa.c b/usr/src/uts/intel/dtrace/fasttrap_isa.c index 9318fd5e9b..d71d70f9eb 100644 --- a/usr/src/uts/intel/dtrace/fasttrap_isa.c +++ b/usr/src/uts/intel/dtrace/fasttrap_isa.c @@ -24,6 +24,10 @@ * Use is subject to license terms. 
*/ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/fasttrap_isa.h> #include <sys/fasttrap_impl.h> #include <sys/dtrace.h> @@ -36,6 +40,9 @@ #include <sys/sysmacros.h> #include <sys/trap.h> #include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/machbrand.h> /* * Lossless User-Land Tracing on x86 @@ -1335,6 +1342,14 @@ fasttrap_pid_probe(struct regs *rp) */ if (p->p_model == DATAMODEL_LP64) { addr = lwp->lwp_pcb.pcb_fsbase; + + /* + * If we're branded, convert the fsbase from the + * brand's fsbase to the native fsbase. + */ + if (PROC_IS_BRANDED(p) && BRMOP(p)->b_fsbase != NULL) + addr = BRMOP(p)->b_fsbase(lwp, addr); + addr += sizeof (void *); } else { addr = lwp->lwp_pcb.pcb_gsbase; diff --git a/usr/src/uts/intel/genassym/Makefile b/usr/src/uts/intel/genassym/Makefile new file mode 100644 index 0000000000..a42925cfab --- /dev/null +++ b/usr/src/uts/intel/genassym/Makefile @@ -0,0 +1,83 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# ident "%Z%%M% %I% %E% SMI" +# +# This makefile drives the production of genassym.h through +# compile time initialized data. +# +# intel architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +GENASSYM_H = $(GENASSYM_DIR)/$(OBJS_DIR)/genassym.h +OFFSETS_SRC = $(GENASSYM_DIR)/offsets.in + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(GENASSYM_H) + +INC_PATH += -I$(UTSBASE)/common/brand/lx + +# +# Overrides +# +CLEANFILES = Nothing_to_remove +CLOBBERFILES = $(GENASSYM_H) Nothing_to_remove + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: def + +# +# Create genassym.h +# +$(GENASSYM_H): $(OFFSETS_SRC) + $(OFFSETS_CREATE) <$(OFFSETS_SRC) >$@ + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/genassym/offsets.in b/usr/src/uts/intel/genassym/offsets.in new file mode 100644 index 0000000000..70221c02f9 --- /dev/null +++ b/usr/src/uts/intel/genassym/offsets.in @@ -0,0 +1,43 @@ +\ +\ CDDL HEADER START +\ +\ The contents of this file are subject to the terms of the +\ Common Development and Distribution License (the "License"). +\ You may not use this file except in compliance with the License. +\ +\ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +\ or http://www.opensolaris.org/os/licensing. +\ See the License for the specific language governing permissions +\ and limitations under the License. +\ +\ When distributing Covered Code, include this CDDL HEADER in each +\ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+\ If applicable, add the following below this CDDL HEADER, with the +\ fields enclosed by brackets "[]" replaced with your own identifying +\ information: Portions Copyright [yyyy] [name of copyright owner] +\ +\ CDDL HEADER END +\ +\ +\ Copyright 2010 Sun Microsystems, Inc. All rights reserved. +\ Use is subject to license terms. +\ Copyright 2015 Joyent, Inc. +\ + +\ +\ offsets.in: input file to produce the architecture-dependent genassym.h +\ using the ctfstabs program +\ + +#ifndef _GENASSYM +#define _GENASSYM +#endif + +#include <sys/lx_brand.h> + +lx_proc_data + l_handler + +lx_lwp_data + br_lx_fsbase + br_ntv_fsbase diff --git a/usr/src/uts/intel/gsqueue/Makefile b/usr/src/uts/intel/gsqueue/Makefile new file mode 100644 index 0000000000..330205cbb6 --- /dev/null +++ b/usr/src/uts/intel/gsqueue/Makefile @@ -0,0 +1,41 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014 Joyent, Inc. All rights reserved. +# + +UTSBASE = ../.. 
+ +MODULE = gsqueue +OBJECTS = $(GSQUEUE_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +LDFLAGS += -Ndrv/ip + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/hyprlofs/Makefile b/usr/src/uts/intel/hyprlofs/Makefile new file mode 100644 index 0000000000..bf7a225fc4 --- /dev/null +++ b/usr/src/uts/intel/hyprlofs/Makefile @@ -0,0 +1,74 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/intel/hyprlofs/Makefile +# +# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Copyright 2019 Joyent, Inc. + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = hyprlofs +OBJECTS = $(HYPRLOFS_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_FS_DIR)/$(MODULE) + +# +# Include common rules. 
+# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# needs work +$(OBJS_DIR)/hyprlofs_vnops.o := SMOFF += signed + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/icmp/Makefile b/usr/src/uts/intel/icmp/Makefile index 87785610c8..c5bd9a810f 100644 --- a/usr/src/uts/intel/icmp/Makefile +++ b/usr/src/uts/intel/icmp/Makefile @@ -62,6 +62,8 @@ include $(UTSBASE)/intel/Makefile.intel ALL_TARGET = $(BINARY) $(SRC_CONFFILE) INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) +INC_PATH += -I$(UTSBASE)/common/io/bpf + # # depends on ip and sockfs # diff --git a/usr/src/uts/intel/inotify/Makefile b/usr/src/uts/intel/inotify/Makefile new file mode 100644 index 0000000000..3198797024 --- /dev/null +++ b/usr/src/uts/intel/inotify/Makefile @@ -0,0 +1,61 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014 Joyent, Inc. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = inotify +OBJECTS = $(INOTIFY_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/io + +# +# Include common rules. 
+# +include $(UTSBASE)/intel/Makefile.intel + +CERRWARN += -_gcc=-Wno-parentheses +LDFLAGS += -Nfs/specfs + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFFILE) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/io/dktp/dcdev/dadk.c b/usr/src/uts/intel/io/dktp/dcdev/dadk.c index 35f97482b8..f74a0d4137 100644 --- a/usr/src/uts/intel/io/dktp/dcdev/dadk.c +++ b/usr/src/uts/intel/io/dktp/dcdev/dadk.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. */ /* @@ -170,6 +171,8 @@ static int dadk_debug = DGEOM; #endif /* DADK_DEBUG */ +#define ONE_MIN ((longlong_t)60 * NANOSEC) + static int dadk_check_media_time = 3000000; /* 3 Second State Check */ static int dadk_dk_maxphys = 0x80000; @@ -1376,6 +1379,47 @@ static struct dadkio_derr dadk_errtab[] = { {COMMAND_DONE_ERROR, GDA_FATAL}, /* 23 DERR_RESV */ }; +/* + * A bad disk can result in a large number of errors spewed to the log. + * This can in turn lead to /var/adm/messages filling up the file system on + * a machine with a small root or /var file system. + * + * Instead of logging every error, if we're seeing repeated errors on a disk + * only log them periodically.
+ */ +static void +dadk_logerr(struct dadk *dadkp, struct cmpkt *pktp, char *label, + int severity, daddr_t blkno, daddr_t err_blkno, + char **cmdvec, char **senvec) +{ + hrtime_t now; + + now = gethrtime(); + if ((now - dadkp->dad_last_log) < ONE_MIN) { + atomic_add_32(&dadkp->dad_err_cnt, 1); + return; + } + + if (dadkp->dad_err_cnt > 0) { + dev_info_t *dev = dadkp->dad_sd->sd_dev; + char name[256], buf[MAXPATHLEN]; + + if (dev) + (void) snprintf(name, sizeof (name), "%s (%s%d)", + ddi_pathname(dev, buf), label, + ddi_get_instance(dev)); + else + (void) strlcpy(name, label, sizeof (name)); + cmn_err(CE_WARN, "%s: %u additional unlogged errors", + name, dadkp->dad_err_cnt); + } + + gda_errmsg(dadkp->dad_sd, pktp, label, severity, blkno, err_blkno, + cmdvec, senvec); + dadkp->dad_err_cnt = 0; + dadkp->dad_last_log = now; +} + static int dadk_chkerr(struct cmpkt *pktp) { @@ -1462,7 +1506,7 @@ dadk_chkerr(struct cmpkt *pktp) return (COMMAND_DONE); } if (pktp->cp_passthru == NULL) { - gda_errmsg(dadkp->dad_sd, pktp, dadk_name, + dadk_logerr(dadkp, pktp, dadk_name, dadk_errtab[scb].d_severity, pktp->cp_srtsec, err_blkno, dadk_cmds, dadk_sense); } @@ -1519,7 +1563,7 @@ dadk_recorderr(struct cmpkt *pktp, struct dadkio_rwcmd *rwcmdp) if (rwcmdp->flags & DADKIO_FLAG_SILENT) return; - gda_errmsg(dadkp->dad_sd, pktp, dadk_name, dadk_errtab[scb].d_severity, + dadk_logerr(dadkp, pktp, dadk_name, dadk_errtab[scb].d_severity, rwcmdp->blkaddr, rwcmdp->status.failed_blk, dadk_cmds, dadk_sense); } diff --git a/usr/src/uts/intel/io/ipmi/ipmivars.h b/usr/src/uts/intel/io/ipmi/ipmivars.h index fec94bb24f..cd73753438 100644 --- a/usr/src/uts/intel/io/ipmi/ipmivars.h +++ b/usr/src/uts/intel/io/ipmi/ipmivars.h @@ -78,6 +78,7 @@ struct ipmi_request { #define SMIC_CTL_STS 1 #define SMIC_FLAGS 2 +struct ipmi_softc; #define IPMI_BUSY 0x1 #define IPMI_CLOSING 0x2 diff --git a/usr/src/uts/intel/io/pci/pci_boot.c b/usr/src/uts/intel/io/pci/pci_boot.c index ab3b5a5a8f..d5de14a9bc 100644 ---
a/usr/src/uts/intel/io/pci/pci_boot.c +++ b/usr/src/uts/intel/io/pci/pci_boot.c @@ -3093,7 +3093,7 @@ add_ppb_props(dev_info_t *dip, uchar_t bus, uchar_t dev, uchar_t func, * If it is unset, we disable i/o and mark it for reconfiguration in * later passes by setting the base > limit */ - val = (uint_t)pci_getw(bus, dev, func, PCI_CONF_COMM); + val = (uint64_t)pci_getw(bus, dev, func, PCI_CONF_COMM); if (val & PCI_COMM_IO) { val = (uint_t)pci_getb(bus, dev, func, PCI_BCNF_IO_LIMIT_LOW); io_range[1] = ((val & PCI_BCNF_IO_MASK) << PCI_BCNF_IO_SHIFT) | diff --git a/usr/src/uts/intel/io/scsi/targets/sd.conf b/usr/src/uts/intel/io/scsi/targets/sd.conf index 1863937888..b0aebdb5b1 100644 --- a/usr/src/uts/intel/io/scsi/targets/sd.conf +++ b/usr/src/uts/intel/io/scsi/targets/sd.conf @@ -42,7 +42,7 @@ name="sd" class="scsi" target=15 lun=0; # # The following stub node is needed for pathological bottom-up -# devid resolution on a self-identifying transport. +# devid resolution on a self-identifying transport. # name="sd" class="scsi-self-identifying"; @@ -50,3 +50,34 @@ name="sd" class="scsi-self-identifying"; # Associate the driver with devid resolution. # ddi-devid-registrant=1; + +# +# Certain hardware RAID controllers have nonvolatile caches but do not +# support the SYNC_NV bit to restrict flushes to the volatile portion of +# the cache, if any. In order to get acceptable performance out of these +# devices, we need to suppress cache flushing on them. In most (hopefully +# all) cases, if the battery fails or the cache otherwise becomes volatile, +# the controller will switch to write-through mode, and ensure that any +# underlying drive cache is off. In this case, it should still be safe to +# dispense with cache flush commands. Controllers for which this is not the +# case should not have cache-nonvolatile set unless data loss and corruption are +# acceptable. +# +# In addition, *all* devices have their retries capped at 1.
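There are an +# additional 2 retries for "victim" IOs if a reset is needed. Retrying is +# very rarely successful, and it is preferable to let ZFS do it where needed. +# +# For the Samsung client drives, users have seen data corruption when they use +# the advertised 512 byte sectors. The actual sector size of the flash +# translation layer is 4K, so it's relatively safe to make this change which +# anecdotally solves that problem. +# +sd-config-list= + "", "retries-timeout:1,retries-busy:1,retries-reset:1,retries-victim:2", + "DELL PERC H710", "cache-nonvolatile:true", + "DELL PERC H700", "cache-nonvolatile:true", + "DELL PERC/6i", "cache-nonvolatile:true", + "ATA Samsung SSD 830", "physical-block-size:4096", + "ATA Samsung SSD 840", "physical-block-size:4096", + "ATA Samsung SSD 850", "physical-block-size:4096", + "ATA Samsung SSD 860", "physical-block-size:4096"; diff --git a/usr/src/uts/intel/io/vmxnet/buildNumber.h b/usr/src/uts/intel/io/vmxnet/buildNumber.h new file mode 100644 index 0000000000..97f18a3cbc --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/buildNumber.h @@ -0,0 +1,12 @@ +#define BUILD_NUMBER \ + "build-425873" +#define BUILD_NUMBER_NUMERIC \ + 425873 +#define BUILD_NUMBER_NUMERIC_STRING \ + "425873" +#define PRODUCT_BUILD_NUMBER \ + "product-build-6261" +#define PRODUCT_BUILD_NUMBER_NUMERIC \ + 6261 +#define PRODUCT_BUILD_NUMBER_NUMERIC_STRING \ + "6261" diff --git a/usr/src/uts/intel/io/vmxnet/includeCheck.h b/usr/src/uts/intel/io/vmxnet/includeCheck.h new file mode 100644 index 0000000000..c414d6daf5 --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/includeCheck.h @@ -0,0 +1,159 @@ +/********************************************************* + * Copyright (C) 1998 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation version 2.1 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the Lesser GNU General Public + * License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + *********************************************************/ + +/********************************************************* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of VMware Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission of VMware Inc. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.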
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the Lesser GNU General Public + * License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + *********************************************************/ + +/********************************************************* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of VMware Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission of VMware Inc. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + *********************************************************/ + +/********************************************************* + * The contents of this file are subject to the terms of the Common + * Development and Distribution License (the "License") version 1.0 + * and no later version. You may not use this file except in + * compliance with the License. + * + * You can obtain a copy of the License at + * http://www.opensource.org/licenses/cddl1.php + * + * See the License for the specific language governing permissions + * and limitations under the License. + * + *********************************************************/ + +/* + * includeCheck.h -- + * + * Restrict include file use. + * + * In every .h file, define one or more of these + * + * INCLUDE_ALLOW_VMX + * INCLUDE_ALLOW_USERLEVEL + * INCLUDE_ALLOW_VMCORE + * INCLUDE_ALLOW_MODULE + * INCLUDE_ALLOW_VMKERNEL + * INCLUDE_ALLOW_DISTRIBUTE + * INCLUDE_ALLOW_VMK_MODULE + * INCLUDE_ALLOW_VMKDRIVERS + * INCLUDE_ALLOW_VMIROM + * + * Then include this file. + * + * Any file that has INCLUDE_ALLOW_DISTRIBUTE defined will potentially + * be distributed in source form along with GPLed code. Ensure + * that this is acceptable. + */ + + +/* + * Declare a VMCORE-only variable to help classify object + * files. The variable goes in the common block and does + * not create multiple definition link-time conflicts. 
+ */ + +#if defined VMCORE && defined VMX86_DEVEL && defined VMX86_DEBUG && \ + defined linux && !defined MODULE && \ + !defined COMPILED_WITH_VMCORE +#define COMPILED_WITH_VMCORE compiled_with_vmcore +#ifdef ASM + .comm compiled_with_vmcore, 0 +#else + asm(".comm compiled_with_vmcore, 0"); +#endif /* ASM */ +#endif + + +#if defined VMCORE && \ + !(defined VMX86_VMX || defined VMM || \ + defined MONITOR_APP || defined VMMON) +#error "Makefile problem: VMCORE without VMX86_VMX or \ + VMM or MONITOR_APP or MODULE." +#endif + +#if defined VMCORE && !defined INCLUDE_ALLOW_VMCORE +#error "The surrounding include file is not allowed in vmcore." +#endif +#undef INCLUDE_ALLOW_VMCORE + +#if defined VMX86_VMX && !defined VMCORE && \ + !(defined INCLUDE_ALLOW_VMX || defined INCLUDE_ALLOW_USERLEVEL) +#error "The surrounding include file is not allowed in the VMX." +#endif +#undef INCLUDE_ALLOW_VMX + +#if defined USERLEVEL && !defined VMX86_VMX && !defined VMCORE && \ + !defined INCLUDE_ALLOW_USERLEVEL +#error "The surrounding include file is not allowed at userlevel." +#endif +#undef INCLUDE_ALLOW_USERLEVEL + +#if defined MODULE && !defined VMKERNEL_MODULE && \ + !defined VMMON && !defined INCLUDE_ALLOW_MODULE +#error "The surrounding include file is not allowed in driver modules." +#endif +#undef INCLUDE_ALLOW_MODULE + +#if defined VMMON && !defined INCLUDE_ALLOW_VMMON +#error "The surrounding include file is not allowed in vmmon." +#endif +#undef INCLUDE_ALLOW_VMMON + +#if defined VMKERNEL && !defined INCLUDE_ALLOW_VMKERNEL +#error "The surrounding include file is not allowed in the vmkernel." +#endif +#undef INCLUDE_ALLOW_VMKERNEL + +#if defined GPLED_CODE && !defined INCLUDE_ALLOW_DISTRIBUTE +#error "The surrounding include file is not allowed in GPL code." 
+#endif +#undef INCLUDE_ALLOW_DISTRIBUTE + +#if defined VMKERNEL_MODULE && !defined VMKERNEL && \ + !defined INCLUDE_ALLOW_VMK_MODULE && !defined INCLUDE_ALLOW_VMKDRIVERS +#error "The surrounding include file is not allowed in vmkernel modules." +#endif +#undef INCLUDE_ALLOW_VMK_MODULE +#undef INCLUDE_ALLOW_VMKDRIVERS + +#if defined VMIROM && ! defined INCLUDE_ALLOW_VMIROM +#error "The surrounding include file is not allowed in vmirom." +#endif +#undef INCLUDE_ALLOW_VMIROM diff --git a/usr/src/uts/intel/io/vmxnet/net.h b/usr/src/uts/intel/io/vmxnet/net.h new file mode 100644 index 0000000000..41b6eb1d14 --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/net.h @@ -0,0 +1,220 @@ +/********************************************************* + * Copyright (C) 1998 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *********************************************************/ + +/********************************************************* + * The contents of this file are subject to the terms of the Common + * Development and Distribution License (the "License") version 1.0 + * and no later version. You may not use this file except in + * compliance with the License. 
+ * + * You can obtain a copy of the License at + * http://www.opensource.org/licenses/cddl1.php + * + * See the License for the specific language governing permissions + * and limitations under the License. + * + *********************************************************/ + +/************************************************************ + * + * net.h + * + * This file should contain all network global defines. + * No vlance/vmxnet/vnet/vmknet specific stuff should be + * put here only defines used/usable by all network code. + * --gustav + * + ************************************************************/ + +#ifndef VMWARE_DEVICES_NET_H +#define VMWARE_DEVICES_NET_H + +#define INCLUDE_ALLOW_USERLEVEL +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_VMCORE + +#include "includeCheck.h" +#include "vm_device_version.h" + +#ifdef VMCORE +#include "config.h" +#include "str.h" +#include "strutil.h" +#endif + +#define ETHERNET_MTU 1518 +#define ETH_MIN_FRAME_LEN 60 + +#ifndef ETHER_ADDR_LEN +#define ETHER_ADDR_LEN 6 /* length of MAC address */ +#endif +#define ETH_HEADER_LEN 14 /* length of Ethernet header */ +#define IP_ADDR_LEN 4 /* length of IPv4 address */ +#define IP_HEADER_LEN 20 /* minimum length of IPv4 header */ + +#define ETHER_MAX_QUEUED_PACKET 1600 + + +/* + * State's that a NIC can be in currently we only use this + * in VLance but if we implement/emulate new adapters that + * we also want to be able to morph a new corresponding + * state should be added. + */ + +#define LANCE_CHIP 0x2934 +#define VMXNET_CHIP 0x4392 + +/* + * Size of reserved IO space needed by the LANCE adapter and + * the VMXNET adapter. If you add more ports to Vmxnet than + * there is reserved space you must bump VMXNET_CHIP_IO_RESV_SIZE. + * The sizes must be powers of 2. 
+ */ + +#define LANCE_CHIP_IO_RESV_SIZE 0x20 +#define VMXNET_CHIP_IO_RESV_SIZE 0x40 + +#define MORPH_PORT_SIZE 4 + +#ifdef VMCORE +typedef struct Net_AdapterCount { + uint8 vlance; + uint8 vmxnet2; + uint8 vmxnet3; + uint8 e1000; + uint8 e1000e; +} Net_AdapterCount; +#endif + +#ifdef USERLEVEL + +/* + *---------------------------------------------------------------------------- + * + * Net_AddAddrToLADRF -- + * + * Given a MAC address, sets the corresponding bit in the LANCE style + * Logical Address Filter 'ladrf'. + * The caller should have initialized the ladrf to all 0's, as this + * function only ORs on a bit in the array. + * 'addr' is presumed to be ETHER_ADDR_LEN in size; + * 'ladrf' is presumed to point to a 64-bit vector. + * + * Derived from a long history of derivations, originally inspired by + * sample code from the AMD "Network Products: Ethernet Controllers 1998 + * Data Book, Book 2", pages 1-53..1-55. + * + * Returns: + * None. + * + * Side effects: + * Updates 'ladrf'. + * + *---------------------------------------------------------------------------- + */ + +static INLINE void +Net_AddAddrToLadrf(const uint8 *addr, // IN: pointer to MAC address + uint8 *ladrf) // IN/OUT: pointer to ladrf +{ +#define CRC_POLYNOMIAL_BE 0x04c11db7UL /* Ethernet CRC, big endian */ + + uint16 hashcode; + int32 crc = 0xffffffff; /* init CRC for each address */ + int32 j; + int32 bit; + int32 byte; + + ASSERT(addr); + ASSERT(ladrf); + + for (byte = 0; byte < ETHER_ADDR_LEN; byte++) { /* for each address byte */ + /* process each address bit */ + for (bit = *addr++, j = 0; + j < 8; + j++, bit >>= 1) { + crc = (crc << 1) ^ ((((crc < 0 ? 1 : 0) ^ bit) & 0x01) ? + CRC_POLYNOMIAL_BE : 0); + } + } + hashcode = (crc & 1); /* hashcode is 6 LSb of CRC ... */ + for (j = 0; j < 5; j++) { /* ... in reverse order. 
*/ + hashcode = (hashcode << 1) | ((crc>>=1) & 1); + } + + ladrf[hashcode >> 3] |= 1 << (hashcode & 0x07); +} +#endif // USERLEVEL + +#ifdef VMCORE +/* + *---------------------------------------------------------------------- + * + * Net_GetNumAdapters -- + * + * Returns the number of each type of network adapter configured in this + * VM. + * + * Results: + * None. + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +static INLINE void +Net_GetNumAdapters(Net_AdapterCount *counts) +{ + uint32 i; + + counts->vlance = 0; + counts->vmxnet2 = 0; + counts->vmxnet3 = 0; + counts->e1000 = 0; + counts->e1000e = 0; + + for (i = 0; i < MAX_ETHERNET_CARDS; i++) { + char* adapterStr; + + if (!Config_GetBool(FALSE, "ethernet%d.present", i)) { + continue; + } + adapterStr = Config_GetString("vlance", "ethernet%d.virtualDev", i); + if (Str_Strcasecmp(adapterStr, "vmxnet3") == 0) { + counts->vmxnet3++; + } else if (Str_Strcasecmp(adapterStr, "vlance") == 0) { + counts->vlance++; + } else if (Str_Strcasecmp(adapterStr, "vmxnet") == 0) { + counts->vmxnet2++; + } else if (Str_Strcasecmp(adapterStr, "e1000") == 0) { + counts->e1000++; + } else if (Str_Strcasecmp(adapterStr, "e1000e") == 0) { + counts->e1000e++; + } else { + LOG_ONCE(("%s: unknown adapter: %s\n", __FUNCTION__, adapterStr)); + } + free(adapterStr); + } +} + +#endif // VMCORE + +#endif // VMWARE_DEVICES_NET_H diff --git a/usr/src/uts/intel/io/vmxnet/net_sg.h b/usr/src/uts/intel/io/vmxnet/net_sg.h new file mode 100644 index 0000000000..f6c30fb2b5 --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/net_sg.h @@ -0,0 +1,84 @@ +/********************************************************* + * Copyright (C) 2000 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 and no later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *********************************************************/ + +/********************************************************* + * The contents of this file are subject to the terms of the Common + * Development and Distribution License (the "License") version 1.0 + * and no later version. You may not use this file except in + * compliance with the License. + * + * You can obtain a copy of the License at + * http://www.opensource.org/licenses/cddl1.php + * + * See the License for the specific language governing permissions + * and limitations under the License. + * + *********************************************************/ + +/* + * net_sg.h -- + * + * Network packet scatter gather structure. + */ + + +#ifndef _NET_SG_H +#define _NET_SG_H + +#define INCLUDE_ALLOW_USERLEVEL + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_VMK_MODULE +#define INCLUDE_ALLOW_VMKERNEL +#define INCLUDE_ALLOW_DISTRIBUTE +#include "includeCheck.h" + +#define NET_SG_DEFAULT_LENGTH 16 + +/* + * A single scatter-gather element for a network packet. + * The address is split into low and high to save space. + * If we make it 64 bits then Windows pads things out such that + * we lose a lot of space for each scatter gather array. + * This adds up when you have embedded scatter-gather + * arrays for transmit and receive ring buffers. 
+ */ +typedef struct NetSG_Elem { + uint32 addrLow; + uint16 addrHi; + uint16 length; +} NetSG_Elem; + +typedef enum NetSG_AddrType { + NET_SG_MACH_ADDR, + NET_SG_PHYS_ADDR, + NET_SG_VIRT_ADDR, +} NetSG_AddrType; + +typedef struct NetSG_Array { + uint16 addrType; + uint16 length; + NetSG_Elem sg[NET_SG_DEFAULT_LENGTH]; +} NetSG_Array; + +#define NET_SG_SIZE(len) (sizeof(NetSG_Array) + (len - NET_SG_DEFAULT_LENGTH) * sizeof(NetSG_Elem)) + +#define NET_SG_MAKE_PA(elem) (PA)QWORD(elem.addrHi, elem.addrLow) +#define NET_SG_MAKE_PTR(elem) (char *)(uintptr_t)QWORD(elem.addrHi, elem.addrLow) + +#endif diff --git a/usr/src/uts/intel/io/vmxnet/vm_basic_types.h b/usr/src/uts/intel/io/vmxnet/vm_basic_types.h new file mode 100644 index 0000000000..adeac1b708 --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/vm_basic_types.h @@ -0,0 +1,1037 @@ +/********************************************************* + * Copyright (C) 1998-2009 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation version 2.1 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the Lesser GNU General Public + * License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + *********************************************************/ + +/********************************************************* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of VMware Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission of VMware Inc. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + *********************************************************/ + +/********************************************************* + * The contents of this file are subject to the terms of the Common + * Development and Distribution License (the "License") version 1.0 + * and no later version. You may not use this file except in + * compliance with the License. + * + * You can obtain a copy of the License at + * http://www.opensource.org/licenses/cddl1.php + * + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + *********************************************************/ + +/* + * + * vm_basic_types.h -- + * + * basic data types. + */ + + +#ifndef _VM_BASIC_TYPES_H_ +#define _VM_BASIC_TYPES_H_ + +#define INCLUDE_ALLOW_USERLEVEL + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_VMMON +#define INCLUDE_ALLOW_VMKERNEL +#define INCLUDE_ALLOW_VMKDRIVERS +#define INCLUDE_ALLOW_VMK_MODULE +#define INCLUDE_ALLOW_DISTRIBUTE +#define INCLUDE_ALLOW_VMCORE +#define INCLUDE_ALLOW_VMIROM +#include "includeCheck.h" + +/* STRICT ANSI means the Xserver build and X defines Bool differently. */ +#if !defined(_XTYPEDEF_BOOL) && \ + (!defined(__STRICT_ANSI__) || defined(__FreeBSD__) || defined(__MINGW32__)) +#define _XTYPEDEF_BOOL +typedef char Bool; +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +#define IsBool(x) (((x) & ~1) == 0) +#define IsBool2(x, y) ((((x) | (y)) & ~1) == 0) + +/* + * Macros __i386__ and __ia64 are intrinsically defined by GCC + */ +#if defined _MSC_VER && defined _M_X64 +# define __x86_64__ +#elif defined _MSC_VER && defined _M_IX86 +# define __i386__ +#endif + +#ifdef __i386__ +#define VM_I386 +#endif + +#ifdef __x86_64__ +#define VM_X86_64 +#define VM_I386 +#define vm_x86_64 (1) +#else +#define vm_x86_64 (0) +#endif + + +#ifdef _MSC_VER + +#pragma warning (3 :4505) // unreferenced local function +#pragma warning (disable :4018) // signed/unsigned mismatch +#pragma warning (disable :4761) // integral size mismatch in argument; conversion supplied +#pragma warning (disable :4305) // truncation from 'const int' to 'short' +#pragma warning (disable :4244) // conversion from 'unsigned short' to 'unsigned char' +#pragma warning (disable :4267) // truncation of 'size_t' +#pragma warning (disable :4146) // unary minus operator applied to unsigned type, result still unsigned +#pragma warning (disable :4142) // benign redefinition of type + +#endif + +#if defined(__APPLE__) || defined(HAVE_STDINT_H) + +/* + * TODO: This is a 
C99 standard header. We should be able to test for + * #if __STDC_VERSION__ >= 199901L, but that breaks the Netware build + * (which doesn't have stdint.h). + */ + +#include <stdint.h> + +typedef uint64_t uint64; +typedef int64_t int64; +typedef uint32_t uint32; +typedef int32_t int32; +typedef uint16_t uint16; +typedef int16_t int16; +typedef uint8_t uint8; +typedef int8_t int8; + +/* + * Note: C does not specify whether char is signed or unsigned, and + * both gcc and msvc implement processor-specific signedness. With + * three types: + * typeof(char) != typeof(signed char) != typeof(unsigned char) + * + * Be careful here, because gcc (4.0.1 and others) likes to warn about + * conversions between signed char * and char *. + */ + +#else /* !HAVE_STDINT_H */ + +#ifdef _MSC_VER + +typedef unsigned __int64 uint64; +typedef signed __int64 int64; + +#elif defined(__GNUC__) || defined(__SUNPRO_C) +/* The Xserver source compiles with -ansi -pendantic */ +# if !defined(__STRICT_ANSI__) || defined(__FreeBSD__) +# if defined(VM_X86_64) +typedef unsigned long uint64; +typedef long int64; +# else +typedef unsigned long long uint64; +typedef long long int64; +# endif +# endif +#else +# error - Need compiler define for int64/uint64 +#endif /* _MSC_VER */ + +typedef unsigned int uint32; +typedef unsigned short uint16; +typedef unsigned char uint8; + +typedef int int32; +typedef short int16; +typedef signed char int8; + +#endif /* HAVE_STDINT_H */ + +/* + * FreeBSD (for the tools build) unconditionally defines these in + * sys/inttypes.h so don't redefine them if this file has already + * been included. [greg] + * + * This applies to Solaris as well. 
+ */ + +/* + * Before trying to do the includes based on OS defines, see if we can use + * feature-based defines to get as much functionality as possible + */ + +#ifdef HAVE_INTTYPES_H +#include <inttypes.h> +#endif +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> +#endif +#ifdef HAVE_SYS_INTTYPES_H +#include <sys/inttypes.h> +#endif +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + +#ifdef __FreeBSD__ +#include <sys/param.h> /* For __FreeBSD_version */ +#endif + +#if !defined(USING_AUTOCONF) +# if defined(__FreeBSD__) || defined(sun) +# ifdef KLD_MODULE +# include <sys/types.h> +# else +# if __FreeBSD_version >= 500043 +# if !defined(VMKERNEL) +# include <inttypes.h> +# endif +# include <sys/types.h> +# else +# include <sys/inttypes.h> +# endif +# endif +# elif defined __APPLE__ +# if KERNEL +# include <sys/unistd.h> +# include <sys/types.h> /* mostly for size_t */ +# include <stdint.h> +# else +# include <unistd.h> +# include <inttypes.h> +# include <stdlib.h> +# include <stdint.h> +# endif +# else +# if !defined(__intptr_t_defined) && !defined(intptr_t) +# ifdef VM_I386 +# define __intptr_t_defined +# ifdef VM_X86_64 +typedef int64 intptr_t; +# else +typedef int32 intptr_t; +# endif +# elif defined(__arm__) +typedef int32 intptr_t; +# endif +# endif + +# ifndef _STDINT_H +# ifdef VM_I386 +# ifdef VM_X86_64 +typedef uint64 uintptr_t; +# else +typedef uint32 uintptr_t; +# endif +# elif defined(__arm__) +typedef uint32 uintptr_t; +# endif +# endif +# endif +#endif + + +/* + * Time + * XXX These should be cleaned up. -- edward + */ + +typedef int64 VmTimeType; /* Time in microseconds */ +typedef int64 VmTimeRealClock; /* Real clock kept in microseconds */ +typedef int64 VmTimeVirtualClock; /* Virtual Clock kept in CPU cycles */ + +/* + * Printf format specifiers for size_t and 64-bit number. + * Use them like this: + * printf("%"FMT64"d\n", big); + * + * FMTH is for handles/fds. 
+ */ + +#ifdef _MSC_VER + #define FMT64 "I64" + #ifdef VM_X86_64 + #define FMTSZ "I64" + #define FMTPD "I64" + #define FMTH "I64" + #else + #define FMTSZ "I" + #define FMTPD "I" + #define FMTH "I" + #endif +#elif defined __APPLE__ + /* Mac OS hosts use the same formatters for 32- and 64-bit. */ + #define FMT64 "ll" + #if KERNEL + #define FMTSZ "l" + #else + #define FMTSZ "z" + #endif + #define FMTPD "l" + #define FMTH "" +#elif defined(__GNUC__) || defined(__SUNPRO_C) + #define FMTH "" + #if defined(N_PLAT_NLM) || defined(sun) || \ + (defined(__FreeBSD__) && (__FreeBSD__ + 0) && ((__FreeBSD__ + 0) < 5)) + /* + * Why (__FreeBSD__ + 0)? See bug 141008. + * Yes, we really need to test both (__FreeBSD__ + 0) and + * ((__FreeBSD__ + 0) < 5). No, we can't remove "+ 0" from + * ((__FreeBSD__ + 0) < 5). + */ + #ifdef VM_X86_64 + #define FMTSZ "l" + #define FMTPD "l" + #else + #define FMTSZ "" + #define FMTPD "" + #endif + #elif defined(__linux__) \ + || (defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L) \ + || (defined(_POSIX_VERSION) && _POSIX_VERSION >= 200112L) \ + || (defined(_POSIX2_VERSION) && _POSIX2_VERSION >= 200112L) + /* BSD, Linux */ + #define FMTSZ "z" + + #if defined(VM_X86_64) + #define FMTPD "l" + #else + #define FMTPD "" + #endif + #else + /* Systems with a pre-C99 libc */ + #define FMTSZ "Z" + #ifdef VM_X86_64 + #define FMTPD "l" + #else + #define FMTPD "" + #endif + #endif + #ifdef VM_X86_64 + #define FMT64 "l" + #elif defined(sun) || defined(__FreeBSD__) + #define FMT64 "ll" + #else + #define FMT64 "L" + #endif +#else + #error - Need compiler define for FMT64 and FMTSZ +#endif + +/* + * Suffix for 64-bit constants. Use it like this: + * CONST64(0x7fffffffffffffff) for signed or + * CONST64U(0x7fffffffffffffff) for unsigned. + * + * 2004.08.30(thutt): + * The vmcore/asm64/gen* programs are compiled as 32-bit + * applications, but must handle 64 bit constants. 
If the + * 64-bit-constant defining macros are already defined, the + * definition will not be overwritten. + */ + +#if !defined(CONST64) || !defined(CONST64U) +#ifdef _MSC_VER +#define CONST64(c) c##I64 +#define CONST64U(c) c##uI64 +#elif defined __APPLE__ +#define CONST64(c) c##LL +#define CONST64U(c) c##uLL +#elif defined(__GNUC__) || defined(__SUNPRO_C) +#ifdef VM_X86_64 +#define CONST64(c) c##L +#define CONST64U(c) c##uL +#else +#define CONST64(c) c##LL +#define CONST64U(c) c##uLL +#endif +#else +#error - Need compiler define for CONST64 +#endif +#endif + +/* + * Use CONST3264/CONST3264U if you want a constant to be + * treated as a 32-bit number on 32-bit compiles and + * a 64-bit number on 64-bit compiles. Useful in the case + * of shifts, like (CONST3264U(1) << x), where x could be + * more than 31 on a 64-bit compile. + */ + +#ifdef VM_X86_64 + #define CONST3264(a) CONST64(a) + #define CONST3264U(a) CONST64U(a) +#else + #define CONST3264(a) (a) + #define CONST3264U(a) (a) +#endif + +#define MIN_INT8 ((int8)0x80) +#define MAX_INT8 ((int8)0x7f) + +#define MIN_UINT8 ((uint8)0) +#define MAX_UINT8 ((uint8)0xff) + +#define MIN_INT16 ((int16)0x8000) +#define MAX_INT16 ((int16)0x7fff) + +#define MIN_UINT16 ((uint16)0) +#define MAX_UINT16 ((uint16)0xffff) + +#define MIN_INT32 ((int32)0x80000000) +#define MAX_INT32 ((int32)0x7fffffff) + +#define MIN_UINT32 ((uint32)0) +#define MAX_UINT32 ((uint32)0xffffffff) + +#define MIN_INT64 (CONST64(0x8000000000000000)) +#define MAX_INT64 (CONST64(0x7fffffffffffffff)) + +#define MIN_UINT64 (CONST64U(0)) +#define MAX_UINT64 (CONST64U(0xffffffffffffffff)) + +typedef uint8 *TCA; /* Pointer into TC (usually). 
*/ + +/* + * Type big enough to hold an integer between 0..100 + */ +typedef uint8 Percent; +#define AsPercent(v) ((Percent)(v)) +#define CHOOSE_PERCENT AsPercent(101) + + +typedef uintptr_t VA; +typedef uintptr_t VPN; + +typedef uint64 PA; +typedef uint32 PPN; + +typedef uint64 PhysMemOff; +typedef uint64 PhysMemSize; + +/* The Xserver source compiles with -ansi -pedantic */ +#ifndef __STRICT_ANSI__ +typedef uint64 BA; +#endif +typedef uint32 BPN; +typedef uint32 PageNum; +typedef unsigned MemHandle; +typedef int32 World_ID; + +/* !! do not alter the definition of INVALID_WORLD_ID without ensuring + * that the values defined in both bora/public/vm_basic_types.h and + * lib/vprobe/vm_basic_types.h are the same. Additionally, the definition + * of VMK_INVALID_WORLD_ID in vmkapi_world.h also must be defined with + * the same value + */ + +#define INVALID_WORLD_ID ((World_ID)0) + +typedef World_ID User_CartelID; +#define INVALID_CARTEL_ID INVALID_WORLD_ID + +typedef User_CartelID User_SessionID; +#define INVALID_SESSION_ID INVALID_CARTEL_ID + +typedef User_CartelID User_CartelGroupID; +#define INVALID_CARTELGROUP_ID INVALID_CARTEL_ID + +typedef uint32 Worldlet_ID; +#define INVALID_WORLDLET_ID ((Worldlet_ID)-1) + +/* The Xserver source compiles with -ansi -pedantic */ +#ifndef __STRICT_ANSI__ +typedef uint64 MA; +typedef uint32 MPN; +#endif + +/* + * This type should be used for variables that contain sector + * position/quantity. + */ +typedef uint64 SectorType; + +/* + * Linear address + */ + +typedef uintptr_t LA; +typedef uintptr_t LPN; +#define LA_2_LPN(_la) ((_la) >> PAGE_SHIFT) +#define LPN_2_LA(_lpn) ((_lpn) << PAGE_SHIFT) + +#define LAST_LPN ((((LA) 1) << (8 * sizeof(LA) - PAGE_SHIFT)) - 1) +#define LAST_LPN32 ((((LA32)1) << (8 * sizeof(LA32) - PAGE_SHIFT)) - 1) +#define LAST_LPN64 ((((LA64)1) << (8 * sizeof(LA64) - PAGE_SHIFT)) - 1) + +/* Valid bits in a LPN.
*/ +#define LPN_MASK LAST_LPN +#define LPN_MASK32 LAST_LPN32 +#define LPN_MASK64 LAST_LPN64 + +/* + * On 64 bit platform, address and page number types default + * to 64 bit. When we need to represent a 32 bit address, we use + * types defined below. + * + * On 32 bit platform, the following types are the same as the + * default types. + */ +typedef uint32 VA32; +typedef uint32 VPN32; +typedef uint32 LA32; +typedef uint32 LPN32; +typedef uint32 PA32; +typedef uint32 PPN32; +typedef uint32 MA32; +typedef uint32 MPN32; + +/* + * On 64 bit platform, the following types are the same as the + * default types. + */ +typedef uint64 VA64; +typedef uint64 VPN64; +typedef uint64 LA64; +typedef uint64 LPN64; +typedef uint64 PA64; +typedef uint64 PPN64; +typedef uint64 MA64; +typedef uint64 MPN64; + +/* + * VA typedefs for user world apps. + */ +typedef VA32 UserVA32; +typedef VA64 UserVA64; +typedef UserVA64 UserVAConst; /* Userspace ptr to data that we may only read. */ +typedef UserVA32 UserVA32Const; /* Userspace ptr to data that we may only read. */ +typedef UserVA64 UserVA64Const; /* Used by 64-bit syscalls until conversion is finished. */ +#ifdef VMKERNEL +typedef UserVA64 UserVA; +#else +typedef void * UserVA; +#endif + + +/* + * Maximal possible PPN value (errors too) that PhysMem can handle. + * Must be at least as large as MAX_PPN which is the maximum PPN + * for any region other than buserror. + */ +#define PHYSMEM_MAX_PPN ((PPN)0xffffffff) +#define MAX_PPN ((PPN)0x1fffffff) /* Maximal observable PPN value. */ +#define INVALID_PPN ((PPN)0xffffffff) + +#define INVALID_BPN ((BPN)0x1fffffff) + +#define RESERVED_MPN ((MPN) 0) +#define INVALID_MPN ((MPN)-1) +#define MEMREF_MPN ((MPN)-2) +#define RELEASED_MPN ((MPN)-3) +#define MAX_MPN ((MPN)0x7fffffff) /* 43 bits of address space. 
*/ + +#define INVALID_LPN ((LPN)-1) +#define INVALID_VPN ((VPN)-1) +#define INVALID_LPN64 ((LPN64)-1) +#define INVALID_PAGENUM ((PageNum)-1) + + +/* + * Format modifier for printing VA, LA, and VPN. + * Use them like this: Log("%#"FMTLA"x\n", laddr) + */ + +#if defined(VMM) || defined(FROBOS64) || vm_x86_64 || defined __APPLE__ +# define FMTLA "l" +# define FMTVA "l" +# define FMTVPN "l" +#else +# define FMTLA "" +# define FMTVA "" +# define FMTVPN "" +#endif + +#ifndef EXTERN +#define EXTERN extern +#endif +#define CONST const + + +#ifndef INLINE +# ifdef _MSC_VER +# define INLINE __inline +# else +# define INLINE inline +# endif +#endif + + +/* + * Annotation for data that may be exported into a DLL and used by other + * apps that load that DLL and import the data. + */ +#if defined(_WIN32) && defined(VMX86_IMPORT_DLLDATA) +# define VMX86_EXTERN_DATA extern __declspec(dllimport) +#else // !_WIN32 || !VMX86_IMPORT_DLLDATA +# define VMX86_EXTERN_DATA extern +#endif + +#if defined(_WIN32) && !defined(VMX86_NO_THREADS) +#define THREADSPECIFIC __declspec(thread) +#else +#define THREADSPECIFIC +#endif + +/* + * Due to the wonderful "registry redirection" feature introduced in + * 64-bit Windows, if you access any key under HKLM\Software in 64-bit + * code, you need to open/create/delete that key with + * VMW_KEY_WOW64_32KEY if you want a consistent view with 32-bit code. + */ + +#ifdef _WIN32 +#ifdef _WIN64 +#define VMW_KEY_WOW64_32KEY KEY_WOW64_32KEY +#else +#define VMW_KEY_WOW64_32KEY 0x0 +#endif +#endif + + +/* + * Consider the following reasons functions are inlined: + * + * 1) inlined for performance reasons + * 2) inlined because it's a single-use function + * + * Functions which meet only condition 2 should be marked with this + * inline macro; it is not critical to be inlined (but there is a + * code-space & runtime savings by doing so), so when other callers + * are added the inline-ness should be removed.
+ */ + +#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3) +/* + * Starting at version 3.3, gcc does not always inline functions marked + * 'inline' (it depends on their size). To force gcc to do so, one must use the + * extra __always_inline__ attribute. + */ +# define INLINE_SINGLE_CALLER INLINE __attribute__((__always_inline__)) +#else +# define INLINE_SINGLE_CALLER INLINE +#endif + +/* + * Used when a hard guarantee of no inlining is needed. Very few + * instances need this since the absence of INLINE is a good hint + * that gcc will not do inlining. + */ + +#if defined(__GNUC__) && defined(VMM) +#define ABSOLUTELY_NOINLINE __attribute__((__noinline__)) +#endif + +/* + * Attributes placed on function declarations to tell the compiler + * that the function never returns. + */ + +#ifdef _MSC_VER +#define NORETURN __declspec(noreturn) +#elif __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 9) +#define NORETURN __attribute__((__noreturn__)) +#else +#define NORETURN +#endif + +/* + * GCC 3.2 inline asm needs the + constraint for input/output memory operands. + * Older GCCs don't know about it --hpreg + */ + +#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 2) +# define VM_ASM_PLUS 1 +#else +# define VM_ASM_PLUS 0 +#endif + +/* + * Branch prediction hints: + * LIKELY(exp) - Expression exp is likely TRUE. + * UNLIKELY(exp) - Expression exp is likely FALSE. + * Usage example: + * if (LIKELY(excCode == EXC_NONE)) { + * or + * if (UNLIKELY(REAL_MODE(vc))) { + * + * We know how to predict branches on gcc3 and later (hopefully), + * all others we don't so we do nothing. + */ + +#if (__GNUC__ >= 3) +/* + * gcc3 uses __builtin_expect() to inform the compiler of an expected value. + * We use this to inform the static branch predictor. The '!!' in LIKELY + * will convert any !=0 to a 1.
+ */ +#define LIKELY(_exp) __builtin_expect(!!(_exp), 1) +#define UNLIKELY(_exp) __builtin_expect(!!(_exp), 0) +#else +#define LIKELY(_exp) (_exp) +#define UNLIKELY(_exp) (_exp) +#endif + +/* + * GCC's argument checking for printf-like functions + * This is conditional until we have replaced all `"%x", void *' + * with `"0x%08x", (uint32) void *'. Note that %p prints different things + * on different platforms. Argument checking is enabled for the + * vmkernel, which has already been cleansed. + * + * fmtPos is the position of the format string argument, beginning at 1 + * varPos is the position of the variable argument, beginning at 1 + */ + +#if defined(__GNUC__) +# define PRINTF_DECL(fmtPos, varPos) __attribute__((__format__(__printf__, fmtPos, varPos))) +#else +# define PRINTF_DECL(fmtPos, varPos) +#endif + +#if defined(__GNUC__) +# define SCANF_DECL(fmtPos, varPos) __attribute__((__format__(__scanf__, fmtPos, varPos))) +#else +# define SCANF_DECL(fmtPos, varPos) +#endif + +/* + * UNUSED_PARAM should surround the parameter name and type declaration, + * e.g. "int MyFunction(int var1, UNUSED_PARAM(int var2))" + * + */ + +#ifndef UNUSED_PARAM +# if defined(__GNUC__) +# define UNUSED_PARAM(_parm) _parm __attribute__((__unused__)) +# else +# define UNUSED_PARAM(_parm) _parm +# endif +#endif + +/* + * REGPARM defaults to REGPARM3; i.e., a request that gcc + * put the first three arguments in registers. (It is fine + * if the function has fewer than three arguments.) Gcc only. + * Syntactically, put REGPARM where you'd put INLINE or NORETURN. + * + * Note that 64-bit code already puts the first six arguments in + * registers, so these attributes are only useful for 32-bit code.
+ */ + +#if defined(__GNUC__) +# define REGPARM0 __attribute__((regparm(0))) +# define REGPARM1 __attribute__((regparm(1))) +# define REGPARM2 __attribute__((regparm(2))) +# define REGPARM3 __attribute__((regparm(3))) +# define REGPARM REGPARM3 +#else +# define REGPARM0 +# define REGPARM1 +# define REGPARM2 +# define REGPARM3 +# define REGPARM +#endif + +/* + * ALIGNED specifies minimum alignment in "n" bytes. + */ + +#ifdef __GNUC__ +#define ALIGNED(n) __attribute__((__aligned__(n))) +#else +#define ALIGNED(n) +#endif + +/* + * __func__ is a stringified function name that is part of the C99 standard. The block + * below defines __func__ on older systems where the compiler does not support that + * macro. + */ +#if defined(__GNUC__) \ + && ((__GNUC__ == 2 && __GNUC_MINOR__ < 96) \ + || (__GNUC__ < 2)) +# define __func__ __FUNCTION__ +#endif + +/* + * Once upon a time, this was used to silence compiler warnings that + * get generated when the compiler thinks that a function returns + * when it is marked noreturn. Don't do it. Use NOT_REACHED(). + */ + +#define INFINITE_LOOP() do { } while (1) + +/* + * On FreeBSD (for the tools build), size_t is typedef'd if _BSD_SIZE_T_ + * is defined. Use the same logic here so we don't define it twice.
[greg] + */ +#ifdef __FreeBSD__ +# ifdef _BSD_SIZE_T_ +# undef _BSD_SIZE_T_ +# ifdef VM_I386 +# ifdef VM_X86_64 + typedef uint64 size_t; +# else + typedef uint32 size_t; +# endif +# endif /* VM_I386 */ +# endif + +# ifdef _BSD_SSIZE_T_ +# undef _BSD_SSIZE_T_ +# ifdef VM_I386 +# ifdef VM_X86_64 + typedef int64 ssize_t; +# else + typedef int32 ssize_t; +# endif +# endif /* VM_I386 */ +# endif + +#else +# ifndef _SIZE_T +# ifdef VM_I386 +# define _SIZE_T +# ifdef VM_X86_64 + typedef uint64 size_t; +# else + typedef uint32 size_t; +# endif +# elif defined(__arm__) +# define _SIZE_T + typedef uint32 size_t; +# endif +# endif + +# if !defined(FROBOS) && !defined(_SSIZE_T) && !defined(_SSIZE_T_) && \ + !defined(ssize_t) && !defined(__ssize_t_defined) && \ + !defined(_SSIZE_T_DECLARED) +# ifdef VM_I386 +# define _SSIZE_T +# define __ssize_t_defined +# define _SSIZE_T_DECLARED +# ifdef VM_X86_64 + typedef int64 ssize_t; +# else + typedef int32 ssize_t; +# endif +# elif defined(__arm__) +# define _SSIZE_T +# define __ssize_t_defined +# define _SSIZE_T_DECLARED + typedef int32 ssize_t; +# endif +# endif + +#endif + +/* + * Format modifier for printing pid_t. On sun the pid_t is a ulong, but on + * Linux it's an int. + * Use this like this: printf("The pid is %"FMTPID".\n", pid); + */ +#ifdef sun +# ifdef VM_X86_64 +# define FMTPID "d" +# else +# define FMTPID "lu" +# endif +#else +# define FMTPID "d" +#endif + +/* + * Format modifier for printing uid_t. On Solaris 10 and earlier, uid_t + * is a ulong, but on other platforms it's an unsigned int. + * Use this like this: printf("The uid is %"FMTUID".\n", uid); + */ +#if defined(sun) && !defined(SOL11) +# ifdef VM_X86_64 +# define FMTUID "u" +# else +# define FMTUID "lu" +# endif +#else +# define FMTUID "u" +#endif + +/* + * Format modifier for printing mode_t. On sun the mode_t is a ulong, but on + * Linux it's an int. 
+ * Use this like this: printf("The mode is %"FMTMODE".\n", mode); + */ +#ifdef sun +# ifdef VM_X86_64 +# define FMTMODE "o" +# else +# define FMTMODE "lo" +# endif +#else +# define FMTMODE "o" +#endif + +/* + * Format modifier for printing time_t. Most platforms define a time_t to be + * a long int, but on FreeBSD (as of 5.0, it seems), the time_t is a signed + * size quantity. Refer to the definition of FMTSZ to see why we need silly + * preprocessor arithmetic. + * Use this like this: printf("The time is %"FMTTIME".\n", time); + */ +#if defined(__FreeBSD__) && (__FreeBSD__ + 0) && ((__FreeBSD__ + 0) >= 5) +# define FMTTIME FMTSZ"d" +#else +# if defined(_MSC_VER) +# ifndef _SAFETIME_H_ +# if (_MSC_VER < 1400) || defined(_USE_32BIT_TIME_T) +# define FMTTIME "ld" +# else +# define FMTTIME FMT64"d" +# endif +# else +# ifndef FMTTIME +# error "safetime.h did not define FMTTIME" +# endif +# endif +# else +# define FMTTIME "ld" +# endif +#endif + +#ifdef __APPLE__ +/* + * Format specifier for all these annoying types such as {S,U}Int32 + * which are 'long' in 32-bit builds + * and 'int' in 64-bit builds. + */ +# ifdef __LP64__ +# define FMTLI "" +# else +# define FMTLI "l" +# endif + +/* + * Format specifier for all these annoying types such as NS[U]Integer + * which are 'int' in 32-bit builds + * and 'long' in 64-bit builds. + */ +# ifdef __LP64__ +# define FMTIL "l" +# else +# define FMTIL "" +# endif +#endif + + +/* + * Define MXSemaHandle here so both vmmon and vmx see this definition. + */ + +#ifdef _WIN32 +typedef uintptr_t MXSemaHandle; +#else +typedef int MXSemaHandle; +#endif + +/* + * Define type for poll device handles. + */ + +typedef int64 PollDevHandle; + +/* + * Define the utf16_t type. + */ + +#if defined(_WIN32) && defined(_NATIVE_WCHAR_T_DEFINED) +typedef wchar_t utf16_t; +#else +typedef uint16 utf16_t; +#endif + +/* + * Define for point and rectangle types. Defined here so they + * can be used by other externally facing headers in bora/public.
+ */ + +typedef struct VMPoint { + int x, y; +} VMPoint; + +#if defined _WIN32 && defined USERLEVEL +struct tagRECT; +typedef struct tagRECT VMRect; +#else +typedef struct VMRect { + int left; + int top; + int right; + int bottom; +} VMRect; +#endif + +/* + * ranked locks "everywhere" + */ + +typedef uint32 MX_Rank; + +#endif /* _VM_BASIC_TYPES_H_ */ diff --git a/usr/src/uts/intel/io/vmxnet/vm_device_version.h b/usr/src/uts/intel/io/vmxnet/vm_device_version.h new file mode 100644 index 0000000000..7046594a6c --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/vm_device_version.h @@ -0,0 +1,246 @@ +/********************************************************* + * Copyright (C) 1998 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation version 2.1 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the Lesser GNU General Public + * License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + *********************************************************/ + +#ifndef VM_DEVICE_VERSION_H +#define VM_DEVICE_VERSION_H + +#define INCLUDE_ALLOW_USERLEVEL + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_VMKERNEL +#define INCLUDE_ALLOW_VMCORE +#include "includeCheck.h" + +#ifdef _WIN32 +#ifdef __MINGW32__ +#include "initguid.h" +#else +#include "guiddef.h" +#endif +#endif + +/* LSILogic 53C1030 Parallel SCSI controller + * LSILogic SAS1068 SAS controller + */ +#define PCI_VENDOR_ID_LSILOGIC 0x1000 +#define PCI_DEVICE_ID_LSI53C1030 0x0030 +#define PCI_DEVICE_ID_LSISAS1068 0x0054 + +/* Our own PCI IDs + * VMware SVGA II (Unified VGA) + * VMware SVGA (PCI Accelerator) + * VMware vmxnet (Idealized NIC) + * VMware vmxscsi (Abortive idealized SCSI controller) + * VMware chipset (Subsystem ID for our motherboards) + * VMware e1000 (Subsystem ID) + * VMware vmxnet3 (Uniform Pass Through NIC) + * VMware HD Audio codec + * VMware HD Audio controller + */ +#define PCI_VENDOR_ID_VMWARE 0x15AD +#define PCI_DEVICE_ID_VMWARE_SVGA2 0x0405 +#define PCI_DEVICE_ID_VMWARE_SVGA 0x0710 +#define PCI_DEVICE_ID_VMWARE_NET 0x0720 +#define PCI_DEVICE_ID_VMWARE_SCSI 0x0730 +#define PCI_DEVICE_ID_VMWARE_VMCI 0x0740 +#define PCI_DEVICE_ID_VMWARE_CHIPSET 0x1976 +#define PCI_DEVICE_ID_VMWARE_82545EM 0x0750 /* single port */ +#define PCI_DEVICE_ID_VMWARE_82546EB 0x0760 /* dual port */ +#define PCI_DEVICE_ID_VMWARE_EHCI 0x0770 +#define PCI_DEVICE_ID_VMWARE_UHCI 0x0774 +#define PCI_DEVICE_ID_VMWARE_XHCI 0x0778 +#define PCI_DEVICE_ID_VMWARE_1394 0x0780 +#define PCI_DEVICE_ID_VMWARE_BRIDGE 0x0790 +#define PCI_DEVICE_ID_VMWARE_ROOTPORT 0x07A0 +#define PCI_DEVICE_ID_VMWARE_VMXNET3 0x07B0 +#define PCI_DEVICE_ID_VMWARE_VMXWIFI 0x07B8 +#define PCI_DEVICE_ID_VMWARE_PVSCSI 0x07C0 +#define PCI_DEVICE_ID_VMWARE_82574 0x07D0 +#define PCI_DEVICE_ID_VMWARE_HDAUDIO_CODEC 0x1975 +#define PCI_DEVICE_ID_VMWARE_HDAUDIO_CONTROLLER 0x1977 + +/* The hypervisor device might grow. 
Please leave room + * for 7 more subfunctions. + */ +#define PCI_DEVICE_ID_VMWARE_HYPER 0x0800 +#define PCI_DEVICE_ID_VMWARE_VMI 0x0801 + +#define PCI_DEVICE_VMI_CLASS 0x05 +#define PCI_DEVICE_VMI_SUBCLASS 0x80 +#define PCI_DEVICE_VMI_INTERFACE 0x00 +#define PCI_DEVICE_VMI_REVISION 0x01 + +/* From linux/pci_ids.h: + * AMD Lance Ethernet controller + * BusLogic SCSI controller + * Ensoniq ES1371 sound controller + */ +#define PCI_VENDOR_ID_AMD 0x1022 +#define PCI_DEVICE_ID_AMD_VLANCE 0x2000 +#define PCI_VENDOR_ID_BUSLOGIC 0x104B +#define PCI_DEVICE_ID_BUSLOGIC_MULTIMASTER_NC 0x0140 +#define PCI_DEVICE_ID_BUSLOGIC_MULTIMASTER 0x1040 +#define PCI_VENDOR_ID_ENSONIQ 0x1274 +#define PCI_DEVICE_ID_ENSONIQ_ES1371 0x1371 + +/* From linux/pci_ids.h: + * Intel 82439TX (430 HX North Bridge) + * Intel 82371AB (PIIX4 South Bridge) + * Intel 82443BX (440 BX North Bridge and AGP Bridge) + * Intel 82545EM (e1000, server adapter, single port) + * Intel 82546EB (e1000, server adapter, dual port) + * Intel HECI (as embedded in ich9m) + */ +#define PCI_VENDOR_ID_INTEL 0x8086 +#define PCI_DEVICE_ID_INTEL_82439TX 0x7100 +#define PCI_DEVICE_ID_INTEL_82371AB_0 0x7110 +#define PCI_DEVICE_ID_INTEL_82371AB_2 0x7112 +#define PCI_DEVICE_ID_INTEL_82371AB_3 0x7113 +#define PCI_DEVICE_ID_INTEL_82371AB 0x7111 +#define PCI_DEVICE_ID_INTEL_82443BX 0x7190 +#define PCI_DEVICE_ID_INTEL_82443BX_1 0x7191 +#define PCI_DEVICE_ID_INTEL_82443BX_2 0x7192 /* Used when no AGP support */ +#define PCI_DEVICE_ID_INTEL_82545EM 0x100f +#define PCI_DEVICE_ID_INTEL_82546EB 0x1010 +#define PCI_DEVICE_ID_INTEL_82574 0x10d3 +#define PCI_DEVICE_ID_INTEL_82574_APPLE 0x10f6 +#define PCI_DEVICE_ID_INTEL_HECI 0x2a74 + +#define E1000E_PCI_DEVICE_ID_CONFIG_STR "e1000e.pci.deviceID" +#define E1000E_PCI_SUB_VENDOR_ID_CONFIG_STR "e1000e.pci.subVendorID" +#define E1000E_PCI_SUB_DEVICE_ID_CONFIG_STR "e1000e.pci.subDeviceID" + +/* + * Intel HD Audio controller and Realtek ALC885 codec. 
+ */ +#define PCI_DEVICE_ID_INTEL_631XESB_632XESB 0x269a +#define PCI_VENDOR_ID_REALTEK 0x10ec +#define PCI_DEVICE_ID_REALTEK_ALC885 0x0885 + + +/* + * Fresco Logic xHCI (USB 3.0) Controller + */ +#define PCI_VENDOR_ID_FRESCO 0x1B73 +#define PCI_DEVICE_ID_FRESCO_FL1000 0x1000 // Original 1-port chip +#define PCI_DEVICE_ID_FRESCO_FL1009 0x1009 // New 2-port chip (Driver 3.0.98+) +#define PCI_DEVICE_ID_FRESCO_FL1400 0x1400 // Unknown (4-port? Dev hardware?) + +/* + * NEC/Renesas xHCI (USB 3.0) Controller + */ +#define PCI_VENDOR_ID_NEC 0x1033 +#define PCI_DEVICE_ID_NEC_UPD720200 0x0194 +#define PCI_REVISION_NEC_UPD720200 0x03 +#define PCI_FIRMWARE_NEC_UPD720200 0x3015 + + +/************* Strings for IDE Identity Fields **************************/ +#define VIDE_ID_SERIAL_STR "00000000000000000001" /* Must be 20 Bytes */ +#define VIDE_ID_FIRMWARE_STR "00000001" /* Must be 8 Bytes */ + +/* No longer than 40 Bytes */ +#define VIDE_ATA_MODEL_STR PRODUCT_GENERIC_NAME " Virtual IDE Hard Drive" +#define VIDE_ATAPI_MODEL_STR PRODUCT_GENERIC_NAME " Virtual IDE CDROM Drive" + +#define ATAPI_VENDOR_ID "NECVMWar" /* Must be 8 Bytes */ +#define ATAPI_PRODUCT_ID PRODUCT_GENERIC_NAME " IDE CDROM" /* Must be 16 Bytes */ +#define ATAPI_REV_LEVEL "1.00" /* Must be 4 Bytes */ + +#define IDE_NUM_INTERFACES 2 /* support for two interfaces */ +#define IDE_DRIVES_PER_IF 2 + +/************* Strings for SCSI Identity Fields **************************/ +#define SCSI_DISK_MODEL_STR PRODUCT_GENERIC_NAME " Virtual SCSI Hard Drive" +#define SCSI_DISK_VENDOR_NAME COMPANY_NAME +#define SCSI_DISK_REV_LEVEL "1.0" +#define SCSI_CDROM_MODEL_STR PRODUCT_GENERIC_NAME " Virtual SCSI CDROM Drive" +#define SCSI_CDROM_VENDOR_NAME COMPANY_NAME +#define SCSI_CDROM_REV_LEVEL "1.0" + +/************* SCSI implementation limits ********************************/ +#define SCSI_MAX_CONTROLLERS 4 // Need more than 1 for MSCS clustering +#define SCSI_MAX_DEVICES 16 // BT-958 emulates only 16 +#define PVSCSI_MAX_DEVICES 
255 // 255 (including the controller) +/* + * VSCSI_BV_INTS is the number of uint32's needed for a bit vector + * to cover all scsi devices per target. + */ +#define VSCSI_BV_INTS CEILING(PVSCSI_MAX_DEVICES, 8 * sizeof (uint32)) +#define SCSI_IDE_CHANNEL SCSI_MAX_CONTROLLERS +#define SCSI_IDE_HOSTED_CHANNEL (SCSI_MAX_CONTROLLERS + 1) +#define SCSI_MAX_CHANNELS (SCSI_MAX_CONTROLLERS + 2) + +/************* Strings for the VESA BIOS Identity Fields *****************/ +#define VBE_OEM_STRING COMPANY_NAME " SVGA" +#define VBE_VENDOR_NAME COMPANY_NAME +#define VBE_PRODUCT_NAME PRODUCT_GENERIC_NAME + +/************* PCI implementation limits ********************************/ +#define PCI_MAX_BRIDGES 15 + +/************* Ethernet implementation limits ***************************/ +#define MAX_ETHERNET_CARDS 10 + +/********************** Floppy limits ***********************************/ +#define MAX_FLOPPY_DRIVES 2 + +/************* PCI Passthrough implementation limits ********************/ +#define MAX_PCI_PASSTHRU_DEVICES 6 + +/************* USB implementation limits ********************************/ +#define MAX_USB_DEVICES_PER_HOST_CONTROLLER 127 + +/************* Strings for Host USB Driver *******************************/ + +#ifdef _WIN32 + +/* + * Globally unique ID for the VMware device interface. Define INITGUID before including + * this header file to instantiate the variable. + */ +DEFINE_GUID(GUID_DEVICE_INTERFACE_VMWARE_USB_DEVICES, +0x2da1fe75, 0xaab3, 0x4d2c, 0xac, 0xdf, 0x39, 0x8, 0x8c, 0xad, 0xa6, 0x65); + +/* + * Globally unique ID for the VMware device setup class. + */ +DEFINE_GUID(GUID_CLASS_VMWARE_USB_DEVICES, +0x3b3e62a5, 0x3556, 0x4d7e, 0xad, 0xad, 0xf5, 0xfa, 0x3a, 0x71, 0x2b, 0x56); + +/* + * This string defines the device ID string of a VMware USB device. + * The format is USB\Vid_XXXX&Pid_YYYY, where XXXX and YYYY are the + * hexadecimal representations of the vendor and product ids, respectively. + * + * The official vendor ID for VMware, Inc. 
is 0x0E0F. + * The product id for USB generic devices is 0x0001. + */ +#define USB_VMWARE_DEVICE_ID_WIDE L"USB\\Vid_0E0F&Pid_0001" +#define USB_DEVICE_ID_LENGTH (sizeof(USB_VMWARE_DEVICE_ID_WIDE) / sizeof(WCHAR)) + +#ifdef UNICODE +#define USB_PNP_SETUP_CLASS_NAME L"VMwareUSBDevices" +#define USB_PNP_DRIVER_NAME L"vmusb" +#else +#define USB_PNP_SETUP_CLASS_NAME "VMwareUSBDevices" +#define USB_PNP_DRIVER_NAME "vmusb" +#endif +#endif + +#endif /* VM_DEVICE_VERSION_H */ diff --git a/usr/src/uts/intel/io/vmxnet/vmnet_def.h b/usr/src/uts/intel/io/vmxnet/vmnet_def.h new file mode 100644 index 0000000000..6e44aea2bb --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/vmnet_def.h @@ -0,0 +1,91 @@ +/********************************************************* + * Copyright (C) 2004 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *********************************************************/ + +/********************************************************* + * The contents of this file are subject to the terms of the Common + * Development and Distribution License (the "License") version 1.0 + * and no later version. You may not use this file except in + * compliance with the License. 
+ * + * You can obtain a copy of the License at + * http://www.opensource.org/licenses/cddl1.php + * + * See the License for the specific language governing permissions + * and limitations under the License. + * + *********************************************************/ + +/* + * vmnet_def.h + * + * - definitions which are (mostly) not vmxnet or vlance specific + */ + +#ifndef _VMNET_DEF_H_ +#define _VMNET_DEF_H_ + +#define INCLUDE_ALLOW_USERLEVEL +#define INCLUDE_ALLOW_VMCORE + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_VMK_MODULE +#define INCLUDE_ALLOW_VMKERNEL +#define INCLUDE_ALLOW_DISTRIBUTE +#include "includeCheck.h" + +#define VMNET_NAME_BUFFER_LEN 128 /* Increased for i18n. */ +#define VMNET_COAL_SCHEME_NAME_LEN 16 + + +/* + * capabilities - not all of these are implemented in the virtual HW + * (eg VLAN support is in the virtual switch) so even vlance + * can use them + */ +#define VMNET_CAP_SG 0x0001 /* Can do scatter-gather transmits. */ +#define VMNET_CAP_IP4_CSUM 0x0002 /* Can checksum only TCP/UDP over IPv4. */ +#define VMNET_CAP_HW_CSUM 0x0004 /* Can checksum all packets. */ +#define VMNET_CAP_HIGH_DMA 0x0008 /* Can DMA to high memory. */ +#define VMNET_CAP_TOE 0x0010 /* Supports TCP/IP offload. */ +#define VMNET_CAP_TSO 0x0020 /* Supports TCP Segmentation offload */ +#define VMNET_CAP_SW_TSO 0x0040 /* Supports SW TCP Segmentation */ +#define VMNET_CAP_VMXNET_APROM 0x0080 /* Vmxnet APROM support */ +#define VMNET_CAP_HW_TX_VLAN 0x0100 /* Can we do VLAN tagging in HW */ +#define VMNET_CAP_HW_RX_VLAN 0x0200 /* Can we do VLAN untagging in HW */ +#define VMNET_CAP_SW_VLAN 0x0400 /* Can we do VLAN tagging/untagging in SW */ +#define VMNET_CAP_WAKE_PCKT_RCV 0x0800 /* Can wake on network packet recv? 
*/ +#define VMNET_CAP_ENABLE_INT_INLINE 0x1000 /* Enable Interrupt Inline */ +#define VMNET_CAP_ENABLE_HEADER_COPY 0x2000 /* copy header for vmkernel */ +#define VMNET_CAP_TX_CHAIN 0x4000 /* Guest can use multiple tx entries for a pkt */ +#define VMNET_CAP_RX_CHAIN 0x8000 /* a pkt can span multiple rx entries */ +#define VMNET_CAP_LPD 0x10000 /* large pkt delivery */ +#define VMNET_CAP_BPF 0x20000 /* BPF Support in VMXNET Virtual Hardware */ +#define VMNET_CAP_SG_SPAN_PAGES 0x40000 /* Can do scatter-gather span multiple pages transmits. */ +#define VMNET_CAP_IP6_CSUM 0x80000 /* Can do IPv6 csum offload. */ +#define VMNET_CAP_TSO6 0x100000 /* Can do TSO segmentation offload for IPv6 pkts. */ +#define VMNET_CAP_TSO256k 0x200000 /* Can do TSO segmentation offload for pkts up to 256kB. */ +#define VMNET_CAP_UPT 0x400000 /* Support UPT */ +#define VMNET_CAP_RDONLY_INETHDRS 0x800000 /* Modifies inet headers for TSO/CSUm */ +#define VMNET_CAP_NPA 0x1000000 /* Support NPA */ +#define VMNET_CAP_DCB 0x2000000 /* Support DCB */ +#define VMNET_CAP_OFFLOAD_8OFFSET 0x4000000 /* supports 8bit parameterized offsets */ +#define VMNET_CAP_OFFLOAD_16OFFSET 0x8000000 /* supports 16bit parameterized offsets */ +#define VMNET_CAP_IP6_CSUM_EXT_HDRS 0x10000000 /* support csum of ip6 ext hdrs */ +#define VMNET_CAP_TSO6_EXT_HDRS 0x20000000 /* support TSO for ip6 ext hdrs */ +#define VMNET_CAP_SCHED 0x40000000 /* compliant with network scheduling */ +#endif // _VMNET_DEF_H_ diff --git a/usr/src/uts/intel/io/vmxnet/vmxnet.c b/usr/src/uts/intel/io/vmxnet/vmxnet.c new file mode 100644 index 0000000000..14a87fa22e --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/vmxnet.c @@ -0,0 +1,2442 @@ +/********************************************************* + * Copyright (C) 2004 VMware, Inc. All rights reserved. + * + * The contents of this file are subject to the terms of the Common + * Development and Distribution License (the "License") version 1.0 + * and no later version. 
You may not use this file except in + * compliance with the License. + * + * You can obtain a copy of the License at + * http://www.opensource.org/licenses/cddl1.php + * + * See the License for the specific language governing permissions + * and limitations under the License. + * + *********************************************************/ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/kmem.h> +#include <sys/stat.h> +#include <sys/kstat.h> +#include <sys/vtrace.h> +#include <sys/dlpi.h> +#include <sys/strsun.h> +#include <sys/ethernet.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ddi_impldefs.h> +#include <sys/gld.h> +#include <sys/pci.h> +#include <sys/strsubr.h> + +/* + * This used to be defined in sys/gld.h, but was flagged as private, + * and we used it anyway. Now it no longer exists, and we're stuck + * with it for the time being. 
+ */ +#ifndef GLD_MAX_MULTICAST +#define GLD_MAX_MULTICAST 64 +#endif + +#define __intptr_t_defined +#define _STDINT_H +#include "vm_basic_types.h" +#include "vmxnet2_def.h" +#include "vm_device_version.h" +#include "net.h" +#include "buildNumber.h" + +#define SOLVMXNET_SUCCESS 1 +#define SOLVMXNET_FAILURE 0 + +#ifdef SOLVMXNET_DEBUG_LEVEL +static int vxn_debug = SOLVMXNET_DEBUG_LEVEL; +#define DPRINTF(n, args) if (vxn_debug>(n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +static char ident[] = "VMware Ethernet Adapter b" BUILD_NUMBER_NUMERIC_STRING; +char _depends_on[] = {"misc/gld"}; + +#define MAX_NUM_RECV_BUFFERS 128 +#define DEFAULT_NUM_RECV_BUFFERS 100 +#define MAX_NUM_XMIT_BUFFERS 128 +#define DEFAULT_NUM_XMIT_BUFFERS 100 +#define CRC_POLYNOMIAL_LE 0xedb88320UL +#define SOLVMXNET_MAXNAME 20 +#define MAX_TX_WAIT_ON_STOP 2000 + +#define ETHERALIGN 2 +#define SLACKBYTES 4 +#define MAXPKTBUF (14 + ETHERALIGN + ETHERMTU + SLACKBYTES) + + +#define QHIWATER (MAX_NUM_RECV_BUFFERS*ETHERMTU) + +#define OUTB(dp, p, v) \ + ddi_put8((dp)->vxnIOHdl, \ + (uint8_t *)((caddr_t)((dp)->vxnIOp) + (p)), v) +#define OUTW(dp, p, v) \ + ddi_put16((dp)->vxnIOHdl, \ + (uint16_t *)((caddr_t)((dp)->vxnIOp) + (p)), v) +#define OUTL(dp, p, v) \ + ddi_put32((dp)->vxnIOHdl, \ + (uint32_t *)((caddr_t)((dp)->vxnIOp) + (p)), v) +#define INB(dp, p) \ + ddi_get8((dp)->vxnIOHdl, \ + (uint8_t *)(((caddr_t)(dp)->vxnIOp) + (p))) +#define INW(dp, p) \ + ddi_get16((dp)->vxnIOHdl, \ + (uint16_t *)(((caddr_t)(dp)->vxnIOp) + (p))) +#define INL(dp, p) \ + ddi_get32((dp)->vxnIOHdl, \ + (uint32_t *)(((caddr_t)(dp)->vxnIOp) + (p))) + +#define VMXNET_INC(val, max) \ + val++; \ + if (UNLIKELY(val == max)) { \ + val = 0; \ + } + +#define TX_RINGBUF_MBLK(dp, idx) (dp->txRingBuf[idx].mblk) +#define TX_RINGBUF_DMAMEM(dp, idx) (dp->txRingBuf[idx].dmaMem) + +typedef struct { + caddr_t buf; /* Virtual address */ + uint32_t phyBuf; /* Physical address */ + size_t bufLen; /* Buffer length */ + 
ddi_dma_cookie_t cookie; /* Dma cookie */ + uint_t cookieCount; /* Cookie count */ + ddi_dma_handle_t dmaHdl; /* Dma handle */ + ddi_acc_handle_t dataAccHdl; /* Dada access handle */ +} dma_buf_t; + +typedef struct rx_dma_buf { + dma_buf_t dmaDesc; /* Dma descriptor */ + mblk_t *mblk; /* Streams message block */ + frtn_t freeCB; /* Free callback */ + struct vxn_softc *softc; /* Back pointer to softc */ + struct rx_dma_buf *next; /* Next one in list */ +} rx_dma_buf_t; + +typedef struct vxn_stats { + uint32_t errxmt; /* Transmit errors */ + uint32_t errrcv; /* Receive errors */ + uint32_t runt; /* Runt packets */ + uint32_t norcvbuf; /* Buffer alloc errors */ + uint32_t interrupts; /* Interrupts */ + uint32_t defer; /* Deferred transmits */ +} vxn_stats_t; + +typedef struct tx_ring_buf { + mblk_t *mblk; + dma_buf_t dmaMem; +} tx_ring_buf_t; + +typedef struct vxn_softc { + char drvName[SOLVMXNET_MAXNAME]; /* Driver name string */ + int unit; /* Driver instance */ + vxn_stats_t stats; /* Stats */ + + dev_info_t *dip; /* Info pointer */ + ddi_iblock_cookie_t iblockCookie; /* Interrupt block cookie */ + gld_mac_info_t *macInfo; /* GLD mac info */ + ddi_acc_handle_t confHdl; /* Configuration space handle */ + ddi_acc_handle_t vxnIOHdl; /* I/O space handle */ + caddr_t vxnIOp; /* I/O space pointer */ + boolean_t morphed; /* Adapter morphed ? 
*/ + + kmutex_t intrlock; /* Interrupt lock */ + kmutex_t xmitlock; /* Transmit lock */ + kmutex_t rxlistlock; /* Rx free pool lock */ + + boolean_t nicActive; /* NIC active flag */ + boolean_t inIntr; /* Interrupt processing flag */ + + struct ether_addr devAddr; /* MAC address */ + + uint32_t vxnNumRxBufs; /* Number of reveice buffers */ + uint32_t vxnNumTxBufs; /* Number of transmit buffers */ + + dma_buf_t driverDataDmaMem; /* Driver Data (dma handle) */ + Vmxnet2_DriverData *driverData; /* Driver Data */ + void *driverDataPhy; /* Driver Data busaddr pointer */ + Vmxnet2_RxRingEntry *rxRing; /* Receive ring */ + Vmxnet2_TxRingEntry *txRing; /* Transmit ring */ + ddi_dma_handle_t txDmaHdl; /* Tx buffers dma handle */ + rx_dma_buf_t *rxRingBuffPtr[MAX_NUM_RECV_BUFFERS]; + /* DMA buffers associated with rxRing */ + tx_ring_buf_t txRingBuf[MAX_NUM_XMIT_BUFFERS]; /* tx Ring buffers */ + + rx_dma_buf_t *rxFreeBufList; + uint32_t rxNumFreeBufs; /* current # of buffers in pool */ + uint32_t rxMaxFreeBufs; /* max # of buffers in pool */ + + uint32_t txPending; /* Pending transmits */ + uint32_t maxTxFrags; /* Max Tx fragments */ + + int multiCount; /* Multicast address count */ + struct ether_addr multicastList[GLD_MAX_MULTICAST]; /* Multicast list */ + + struct vxn_softc *next; /* Circular list of instances */ + struct vxn_softc *prev; +} vxn_softc_t; + +/* used for rx buffers or buffers allocated by ddi_dma_mem_alloc() */ +static ddi_dma_attr_t vxn_dma_attrs = { + DMA_ATTR_V0, /* dma_attr version */ + 0, /* dma_attr_addr_lo */ + (uint64_t)0xFFFFFFFF, /* dma_attr_addr_hi */ + 0x7FFFFFFF, /* dma_attr_count_max */ + 4, /* dma_attr_align */ + 0x3F, /* dma_attr_burstsizes */ + 1, /* dma_attr_minxfer */ + (uint64_t)0xFFFFFFFF, /* dma_attr_maxxfer */ + (uint64_t)0xFFFFFFFF, /* dma_attr_seg */ + 1, /* dma_attr_sgllen */ + 1, /* dma_attr_granular */ + 0, /* dma_attr_flags */ +}; + +/* used for tx buffers */ +static ddi_dma_attr_t vxn_dma_attrs_tx = { + DMA_ATTR_V0, /* dma_attr 
version */ + 0, /* dma_attr_addr_lo */ + (uint64_t)0xFFFFFFFF, /* dma_attr_addr_hi */ + 0x7FFFFFFF, /* dma_attr_count_max */ + 1, /* dma_attr_align */ + 0x3F, /* dma_attr_burstsizes */ + 1, /* dma_attr_minxfer */ + (uint64_t)0xFFFFFFFF, /* dma_attr_maxxfer */ + (uint64_t)0xFFFFFFFF, /* dma_attr_seg */ + 1, /* dma_attr_sgllen */ + 1, /* dma_attr_granular */ + 0, /* dma_attr_flags */ +}; + + +static struct ether_addr etherbroadcastaddr = { + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff} +}; + +static struct ddi_device_acc_attr vxn_buf_attrs = { + DDI_DEVICE_ATTR_V0, + DDI_STRUCTURE_LE_ACC, + DDI_STRICTORDER_ACC +}; + +static struct ddi_device_acc_attr dev_attr = { + DDI_DEVICE_ATTR_V0, + DDI_STRUCTURE_LE_ACC, + DDI_STRICTORDER_ACC +}; + +static vxn_softc_t vxnList; /* for debugging */ +static kmutex_t vxnListLock; + +static void *Vxn_Memset(void *s, int c, size_t n); +static int Vxn_Reset(gld_mac_info_t *macInfo); +static int Vxn_SetPromiscuous(gld_mac_info_t *macInfo, int flag); +static int Vxn_GetStats(gld_mac_info_t *macInfo, struct gld_stats *gs); +static void Vxn_ApplyAddressFilter(vxn_softc_t *dp); +static int Vxn_SetMulticast(gld_mac_info_t *macinfo, uint8_t *ep, int flag); +static int Vxn_SetMacAddress(gld_mac_info_t *macInfo, uint8_t *mac); +static int Vxn_Start(gld_mac_info_t *macInfo); +static int Vxn_Stop(gld_mac_info_t *macInfo); +static void Vxn_FreeTxBuf(vxn_softc_t *dp, int idx); +static int Vxn_EncapTxBuf(vxn_softc_t *dp, mblk_t *mp, Vmxnet2_TxRingEntry *xre, + tx_ring_buf_t *txBuf); +static int Vxn_Send(gld_mac_info_t *macinfo, mblk_t *mp); +static boolean_t Vxn_TxComplete(vxn_softc_t *dp, boolean_t *reschedp); +static boolean_t Vxn_Receive(vxn_softc_t *dp); +static u_int Vxn_Interrupt(gld_mac_info_t *macInfo); +static void Vxn_ReclaimRxBuf(rx_dma_buf_t *rxDesc); +static void Vxn_FreeRxBuf(rx_dma_buf_t *rxDesc); +static rx_dma_buf_t *Vxn_AllocRxBuf(vxn_softc_t *dp, int cansleep); +static void Vxn_FreeInitBuffers(vxn_softc_t *dp); +static int 
Vxn_AllocInitBuffers(vxn_softc_t *dp); +static void Vxn_FreeDmaMem(dma_buf_t *dma); +static int Vxn_AllocDmaMem(vxn_softc_t *dp, int size, int cansleep, dma_buf_t *dma); +static void Vxn_FreeDriverData(vxn_softc_t *dp); +static int Vxn_AllocDriverData(vxn_softc_t *dp); +static int Vxn_Attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int Vxn_Detach(dev_info_t *dip, ddi_detach_cmd_t cmd); +static int Vxn_AllocRxBufPool(vxn_softc_t *dp); +static void Vxn_FreeRxBufPool(vxn_softc_t *dp); +static rx_dma_buf_t * Vxn_AllocRxBufFromPool(vxn_softc_t *dp); +static void Vxn_FreeRxBufToPool(rx_dma_buf_t *rxDesc); + +/* + *----------------------------------------------------------------------------- + * Vxn_Memset -- + * memset() (Because bzero does not get resolved by module loader) + * + * Results: + * pointer to the memory area s + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static void * +Vxn_Memset(void *s, int c, size_t n) +{ + while (n--) { + ((uint8_t *)s)[n] = c; + } + + return s; +} + +/* + *----------------------------------------------------------------------------- + * Vxn_Reset -- + * Stub routine to reset hardware. Presently does nothing. Start/Stop should + * take care of resets. 
+ *
+ * Results:
+ *    None
+ *
+ * Side effects:
+ *    None
+ *-----------------------------------------------------------------------------
+ */
+static int
+Vxn_Reset(gld_mac_info_t *macInfo)
+{
+   return GLD_SUCCESS;   /* nothing to reset: always reports success */
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_SetPromiscuous --
+ *    Set/Reset NIC to/from promiscuous mode.
+ *
+ *    flag is one of GLD_MAC_PROMISC_PHYS, GLD_MAC_PROMISC_MULTI or
+ *    GLD_MAC_PROMISC_NONE. Both PHYS and MULTI turn on VMXNET_IFF_PROMISC;
+ *    NONE turns it off.
+ *
+ * Results:
+ *    GLD_SUCCESS
+ *    GLD_FAILURE if flag is none of the three values above
+ *
+ * Side effects:
+ *    Updates dd->ifflags under intrlock and issues VMXNET_CMD_UPDATE_IFF.
+ *-----------------------------------------------------------------------------
+ */
+static int
+Vxn_SetPromiscuous(gld_mac_info_t *macInfo, int flag)
+{
+   vxn_softc_t *dp = (vxn_softc_t *)macInfo->gldm_private;
+   Vmxnet2_DriverData *dd = dp->driverData;
+
+   mutex_enter(&dp->intrlock);
+   if (flag == GLD_MAC_PROMISC_PHYS) {
+      dd->ifflags |= VMXNET_IFF_PROMISC;
+   } else if (flag == GLD_MAC_PROMISC_MULTI) {
+      /*
+       * This should really set VMXNET_IFF_ALLMULTI,
+       * but unfortunately it doesn't exist. The next
+       * best thing would be to set the LADRFs to all
+       * 0xFFs and set VMXNET_IFF_MULTICAST, but that
+       * opens up a whole new set of potential pitfalls,
+       * so this is a reasonable temporary solution.
+       */
+      dd->ifflags |= VMXNET_IFF_PROMISC;
+   } else if (flag == GLD_MAC_PROMISC_NONE) {
+      dd->ifflags &= ~VMXNET_IFF_PROMISC;
+   } else {
+      /*
+       * This could be GLD_MAC_PROMISC_NOOP?
+       * The lock is dropped before logging; no device command is issued.
+       */
+      mutex_exit(&dp->intrlock);
+      cmn_err(CE_WARN, "%s%d: Vxn_SetPromiscuous: Unexpected mode flag: 0x%x",
+              dp->drvName, dp->unit, flag);
+
+      return GLD_FAILURE;
+   }
+
+   OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_UPDATE_IFF);
+   mutex_exit(&dp->intrlock);
+
+   return GLD_SUCCESS;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_GetStats --
+ *    Get driver specific stats: copies the counters kept in dp->stats into
+ *    the corresponding fields of the GLD statistics structure gs.
+ *
+ * Results:
+ *    GLD_SUCCESS
+ *
+ * Side effects:
+ *    None
+ *-----------------------------------------------------------------------------
+ */
+static int
+Vxn_GetStats(gld_mac_info_t *macInfo, struct gld_stats *gs)
+{
+   vxn_softc_t *dp = (vxn_softc_t *)macInfo->gldm_private;
+
+   gs->glds_errxmt = dp->stats.errxmt;
+   gs->glds_errrcv = dp->stats.errrcv;
+   gs->glds_short = dp->stats.runt;
+   gs->glds_norcvbuf = dp->stats.norcvbuf;
+   gs->glds_intr = dp->stats.interrupts;
+   gs->glds_defer = dp->stats.defer;
+
+   return GLD_SUCCESS;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_ApplyAddressFilter --
+ *    Go over multicast list and compute/apply address filter.
+ *
+ *    For every address in dp->multicastList a 32-bit CRC is computed one
+ *    bit at a time (least significant bit of each octet first) using
+ *    CRC_POLYNOMIAL_LE. The top 6 bits of the result (crc >> 26) select
+ *    the filter bit to set; the filter is addressed as 16-bit words
+ *    overlaying dd->LADRF.
+ *
+ *    The caller must hold dp->intrlock (asserted).
+ *
+ * Results:
+ *    None
+ *
+ * Side effects:
+ *    Rewrites dd->LADRF[0] and dd->LADRF[1]. The device is not told about
+ *    the new filter here; Vxn_SetMulticast() issues the update command.
+ *-----------------------------------------------------------------------------
+ */
+static void
+Vxn_ApplyAddressFilter(vxn_softc_t *dp)
+{
+   uint8_t *ep;
+   int i, j, bit, byte;
+   uint32_t crc, poly = CRC_POLYNOMIAL_LE;
+   Vmxnet2_DriverData *dd = dp->driverData;
+   volatile uint16_t *mcastTable = (uint16_t *)dd->LADRF;
+
+   ASSERT(MUTEX_HELD(&dp->intrlock));
+
+   /* clear the multicast filter */
+   dd->LADRF[0] = 0;
+   dd->LADRF[1] = 0;
+
+   for (i = 0; i < dp->multiCount; i++) {
+      crc = 0xffffffff;
+      ep = (uint8_t *)&dp->multicastList[i].ether_addr_octet;
+
+      for (byte = 0; byte < 6; byte++) {
+         for (bit = *ep++, j = 0; j < 8; j++, bit >>= 1) {
+            int test;
+
+            test = ((bit ^ crc) & 0x01);
+            crc >>= 1;
+
+            if (test) {
+               crc = crc ^ poly;
+            }
+         }
+      }
+
+      crc = crc >> 26;   /* keep the 6 most significant bits: 0..63 */
+      mcastTable[crc >> 4] |= 1 << (crc & 0xf);   /* word index, bit index */
+   }
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_SetMulticast --
+ *    Add or delete an entry in the multicast list, then recompute the
+ *    address filter and push it to the device.
+ *
+ *    flag == GLD_MULTI_ENABLE appends the 6-byte address at ep; any other
+ *    flag value is treated as a request to delete that address.
+ *
+ * Results:
+ *    GLD_FAILURE on failure (list already holds GLD_MAX_MULTICAST entries,
+ *                or the address to delete is not in the list)
+ *    GLD_SUCCESS on success
+ *
+ * Side effects:
+ *    Sets or clears VMXNET_IFF_MULTICAST in dd->ifflags and issues
+ *    VMXNET_CMD_UPDATE_IFF and VMXNET_CMD_UPDATE_LADRF.
+ *    NOTE(review): multicastList/multiCount are modified before intrlock
+ *    is taken -- presumably GLD serializes these calls; confirm.
+ *-----------------------------------------------------------------------------
+ */
+static int
+Vxn_SetMulticast(gld_mac_info_t *macinfo, uint8_t *ep, int flag)
+{
+   int i;
+   int copyLen;
+   vxn_softc_t *dp = (vxn_softc_t *)macinfo->gldm_private;
+   Vmxnet2_DriverData *dd = dp->driverData;
+
+   if (flag == GLD_MULTI_ENABLE) {
+      /*
+       * Exceeded multicast address limit
+       */
+      if (dp->multiCount >= GLD_MAX_MULTICAST) {
+         return GLD_FAILURE;
+      }
+
+      /*
+       * Add mac address to multicast list
+       */
+      bcopy(ep, dp->multicastList[dp->multiCount].ether_addr_octet,
+            ETHERADDRL);
+      dp->multiCount++;
+   }
+   else {
+      for (i=0; i<dp->multiCount; i++) {
+         if (bcmp(ep, dp->multicastList[i].ether_addr_octet, ETHERADDRL) == 0) {
+            goto found;
+         }
+      }
+      return GLD_FAILURE;
+
+   found:
+      /*
+       * Delete mac address from multicast list by sliding the entries
+       * that follow it down by one slot
+       */
+      copyLen = (dp->multiCount - (i+1)) * sizeof(struct ether_addr);
+      if (copyLen > 0) {
+         bcopy(&dp->multicastList[i+1], &dp->multicastList[i], copyLen);
+      }
+      dp->multiCount--;
+   }
+
+   /*
+    * Compute address filter from list of addresses and apply it
+    */
+   mutex_enter(&dp->intrlock);
+   Vxn_ApplyAddressFilter(dp);
+
+   if (dp->multiCount) {
+      ASSERT(dd->LADRF[0] || dd->LADRF[1]);
+      dd->ifflags |= VMXNET_IFF_MULTICAST;
+   } else {
+      ASSERT(!(dd->LADRF[0] || dd->LADRF[1]));
+      dd->ifflags &= ~VMXNET_IFF_MULTICAST;
+   }
+
+   OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_UPDATE_IFF);
+   OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_UPDATE_LADRF);
+   mutex_exit(&dp->intrlock);
+
+   return GLD_SUCCESS;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_SetMacAddress --
+ *    Change device MAC address
+ *
+ * Results:
+ *    GLD_SUCCESS
+ *    GLD_FAILURE
+ *
+ * Side 
effects:
+ *    None
+ *-----------------------------------------------------------------------------
+ */
+static int
+Vxn_SetMacAddress(gld_mac_info_t *macInfo, uint8_t *mac)
+{
+   int i;
+   int err = GLD_SUCCESS;
+   vxn_softc_t * dp = (vxn_softc_t *)macInfo->gldm_private;
+
+   mutex_enter(&dp->intrlock);
+   mutex_enter(&dp->xmitlock);
+
+   /*
+    * Don't change MAC address on a running NIC
+    */
+   if (dp->nicActive) {
+      err = GLD_FAILURE;
+      goto out;
+   }
+
+   /*
+    * Save new MAC address
+    */
+   for (i = 0; i < ETHERADDRL; i++) {
+      dp->devAddr.ether_addr_octet[i] = mac[i];
+   }
+
+   /*
+    * Push new MAC address down into hardware, one octet per register write
+    */
+   for (i = 0; i < ETHERADDRL; i++) {
+      OUTB(dp, VMXNET_MAC_ADDR + i, mac[i]);
+   }
+
+out:
+   mutex_exit(&dp->xmitlock);
+   mutex_exit(&dp->intrlock);
+   return err;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_Start --
+ *    Device start routine. Called on "ifconfig plumb".
+ *
+ *    If the NIC is not already active: registers the shared driver-data
+ *    area with the device, sizes the Tx scatter/gather limit from the
+ *    reported capabilities/features, allocates the Tx DMA handle, then
+ *    enables interrupts and pushes the interface flags and multicast
+ *    filter. Calling it on an active NIC is a no-op.
+ *
+ * Results:
+ *    GLD_SUCCESS
+ *    GLD_FAILURE if the ring registration is rejected or the Tx DMA
+ *                handle cannot be allocated
+ *
+ * Side effects:
+ *    Sets dp->maxTxFrags, dp->txDmaHdl and dp->nicActive; updates the
+ *    file-wide vxn_dma_attrs_tx.dma_attr_sgllen.
+ *-----------------------------------------------------------------------------
+ */
+static int
+Vxn_Start(gld_mac_info_t *macInfo)
+{
+   int err = GLD_SUCCESS;
+   uint32_t r, capabilities, features;
+   vxn_softc_t * dp = (vxn_softc_t *)macInfo->gldm_private;
+
+   mutex_enter(&dp->intrlock);
+   mutex_enter(&dp->xmitlock);
+
+   if (!dp->nicActive) {
+      /*
+       * Register ring structure with hardware
+       *
+       * This downcast is OK because we requested a 32-bit physical address
+       */
+      OUTL(dp, VMXNET_INIT_ADDR, (uint32_t)(uintptr_t)dp->driverDataPhy);
+      OUTL(dp, VMXNET_INIT_LENGTH, dp->driverData->length);
+
+      /*
+       * Make sure registration succeeded
+       */
+      r = INL(dp, VMXNET_INIT_LENGTH);
+      if (!r) {
+         cmn_err(CE_WARN, "%s%d: Vxn_Start: failed to register ring",
+                 dp->drvName, dp->unit);
+         err = GLD_FAILURE;
+         goto out;
+      }
+
+      /*
+       * Get maximum tx fragments supported
+       */
+      OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_GET_CAPABILITIES);
+      capabilities = INL(dp, VMXNET_COMMAND_ADDR);
+
+      OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_GET_FEATURES);
+      features = INL(dp, VMXNET_COMMAND_ADDR);
+
+      DPRINTF(3, (CE_CONT, "%s%d: chip capabilities=0x%x features=0x%x\n",
+                  dp->drvName, dp->unit, capabilities, features));
+
+      if ((capabilities & VMNET_CAP_SG) &&
+          (features & VMXNET_FEATURE_ZERO_COPY_TX)) {
+         dp->maxTxFrags = VMXNET2_SG_DEFAULT_LENGTH;
+      } else {
+         dp->maxTxFrags = 1;
+      }
+      ASSERT(dp->maxTxFrags >= 1);
+
+      /*
+       * Alloc Tx DMA handle
+       *
+       * NOTE(review): DDI_DMA_SLEEP may block while intrlock and xmitlock
+       * are both held -- confirm this is acceptable on this path.
+       */
+      vxn_dma_attrs_tx.dma_attr_sgllen = dp->maxTxFrags;
+      if (ddi_dma_alloc_handle(dp->dip, &vxn_dma_attrs_tx, DDI_DMA_SLEEP,
+                               NULL, &dp->txDmaHdl) != DDI_SUCCESS) {
+         cmn_err(CE_WARN, "%s%d: Vxn_Start: failed to alloc tx dma handle",
+                 dp->drvName, dp->unit);
+
+         /*
+          * The ring was registered above but the NIC is not going active,
+          * so Vxn_Stop() will never unregister it. Undo the registration
+          * here, the same way Vxn_Stop() does, so the device is not left
+          * pointing at the driver data.
+          */
+         OUTL(dp, VMXNET_INIT_ADDR, 0);
+         dp->txDmaHdl = NULL;
+
+         err = GLD_FAILURE;
+         goto out;
+      }
+
+      /*
+       * Enable interrupts on the card
+       */
+      dp->driverData->ifflags |= VMXNET_IFF_BROADCAST | VMXNET_IFF_DIRECTED;
+
+      OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_INTR_ENABLE);
+      OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_UPDATE_IFF);
+      OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_UPDATE_LADRF);
+
+      dp->nicActive = TRUE;
+   }
+
+out:
+   mutex_exit(&dp->xmitlock);
+   mutex_exit(&dp->intrlock);
+   return err;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_Stop --
+ *    Device stop routine. 
Called on "ifconfig unplumb"
+ *
+ * Results:
+ *    GLD_SUCCESS
+ *    GLD_FAILURE
+ *
+ * Side effects:
+ *    None
+ *-----------------------------------------------------------------------------
+ */
+static int
+Vxn_Stop(gld_mac_info_t *macInfo)
+{
+   int i;
+   int err = GLD_SUCCESS;   /* never changed below: always GLD_SUCCESS */
+   vxn_softc_t * dp = (vxn_softc_t *)macInfo->gldm_private;
+   boolean_t resched;
+
+   mutex_enter(&dp->intrlock);
+   mutex_enter(&dp->xmitlock);
+
+   if (!dp->nicActive) {
+      goto out;
+   }
+
+   /*
+    * Disable interrupts
+    */
+   OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_INTR_DISABLE);
+
+   /*
+    * Wait for pending transmits: poll at most MAX_TX_WAIT_ON_STOP times,
+    * sleeping drv_usectohz(1000) ticks per pass, asking the device to
+    * check for finished transmits and reclaiming them.
+    * NOTE(review): delay() is called with intrlock and xmitlock held.
+    */
+   if (dp->txPending) {
+      for (i=0; i < MAX_TX_WAIT_ON_STOP && dp->txPending; i++) {
+         delay(drv_usectohz(1000));
+         OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_CHECK_TX_DONE);
+         (void) Vxn_TxComplete(dp, &resched);
+         /*
+          * Don't worry about rescheduling transmits - GLD handles
+          * this automatically.
+          */
+      }
+   }
+   if (dp->txPending) {
+      cmn_err(CE_WARN, "%s%d: Vxn_Stop: giving up on %d pending transmits",
+              dp->drvName, dp->unit, dp->txPending);
+   }
+
+   OUTL(dp, VMXNET_INIT_ADDR, 0);   /* unregister the shared ring */
+   dp->nicActive = FALSE;
+
+   /*
+    * Free Tx DMA handle
+    *
+    * The ddi_dma_free_handle() man page says that ddi_dma_unbind_handle() must be called
+    * prior to calling ddi_dma_free_handle().
+    * However, call to ddi_dma_unbind_handle() is not required here, because
+    * ddi_dma_addr_bind_handle() and matching ddi_dma_unbind_handle() are called from
+    * Vxn_EncapTxBuf().
+    * xmitlock is held in Vxn_EncapTxBuf() as well as acquired above in Vxn_Stop().
+    */
+   ddi_dma_free_handle(&dp->txDmaHdl);
+   dp->txDmaHdl = NULL;
+
+out:
+   mutex_exit(&dp->xmitlock);
+   mutex_exit(&dp->intrlock);
+   return err;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_FreeTxBuf --
+ *    Free transmit buffer: releases the mblk attached to Tx ring slot idx
+ *    (if any) and the slot's private DMA copy buffer (if one was allocated).
+ *
+ * Results:
+ *    None
+ *
+ * Side effects:
+ *    Clears the slot's mblk pointer. The slot's dmaMem.buf is expected to
+ *    be NULL after Vxn_FreeDmaMem() (asserted).
+ *-----------------------------------------------------------------------------
+ */
+static void
+Vxn_FreeTxBuf(vxn_softc_t *dp, int idx)
+{
+   mblk_t **txMblkp = &TX_RINGBUF_MBLK(dp, idx);
+   dma_buf_t *dmaMem = &TX_RINGBUF_DMAMEM(dp, idx);
+
+   if (*txMblkp) {
+      freemsg(*txMblkp);
+      *txMblkp = NULL;
+   }
+
+   if (dmaMem->buf) {
+      Vxn_FreeDmaMem(dmaMem);
+      ASSERT(dmaMem->buf == NULL);
+   }
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_EncapTxBuf --
+ *    Go over dma mappings of Tx buffers and drop buffer physical address
+ *    into ring entry.
+ *
+ *    Each non-empty mblk of the message is bound to dp->txDmaHdl, its
+ *    cookies are copied into xre's scatter/gather array, and the handle
+ *    is unbound again before the next mblk is processed. Two fallbacks:
+ *      - more fragments than dp->maxTxFrags: the message is flattened
+ *        with msgpullup() and the walk restarts at "pullup";
+ *      - bind fails with DDI_DMA_TOOBIG: pull up first (when not yet done
+ *        and maxTxFrags > 1); otherwise copy the data into a private DMA
+ *        buffer (txBuf->dmaMem), at most once per call.
+ *
+ *    The caller must hold dp->xmitlock and txBuf->mblk must be NULL
+ *    (both asserted).
+ *
+ * Results:
+ *    SOLVMXNET_SUCCESS on success (xre handed to the NIC; txBuf->mblk owns
+ *                      the message; mp itself is freed if it was pulled up)
+ *    SOLVMXNET_FAILURE on failure (mp is not freed; any pulled-up copy and
+ *                      private DMA buffer are released). A message with no
+ *                      non-empty mblk also ends up here.
+ *
+ * Side effects:
+ *    None
+ *-----------------------------------------------------------------------------
+ */
+static int
+Vxn_EncapTxBuf(vxn_softc_t *dp,
+               mblk_t *mp,
+               Vmxnet2_TxRingEntry *xre,
+               tx_ring_buf_t *txBuf)
+{
+   int frag;
+   int fragcount;
+   int rval;
+   mblk_t *tp;
+   mblk_t *mblk;
+   boolean_t needPullup = FALSE;
+   boolean_t dmaMemAlloced = FALSE;
+
+   ASSERT(txBuf);
+   ASSERT(txBuf->mblk == NULL);
+   ASSERT(MUTEX_HELD(&dp->xmitlock));
+
+   xre->sg.length = 0;
+   xre->flags = 0;
+
+   fragcount = 0;
+   for (tp = mp; tp != NULL; tp = tp->b_cont) {
+      fragcount++;
+   }
+   if (fragcount > dp->maxTxFrags) {
+      needPullup = TRUE;
+   }
+
+pullup:
+   frag = 0;   /* restart filling the scatter/gather array from slot 0 */
+   if (needPullup) {
+      if (!(mblk = msgpullup(mp, -1))) {
+         cmn_err(CE_WARN, "%s%d: Vxn_EncapTxBuf: msgpullup failed",
+                 dp->drvName, dp->unit);
+         goto err;
+      }
+   } else {
+      mblk = mp;
+   }
+
+   /*
+    * Go through message chain and drop packet pointers into ring
+    * scatter/gather array
+    */
+   for (tp = mblk; tp != NULL; tp = tp->b_cont) {
+
+      uint_t nCookies;
+      ddi_dma_cookie_t dmaCookie;
+      int len = tp->b_wptr - tp->b_rptr;
+
+      if (len) {
+         /*
+          * Associate tx buffer with dma handle
+          */
+         ASSERT(dp->txDmaHdl);
+         if ((rval = ddi_dma_addr_bind_handle(dp->txDmaHdl, NULL, (caddr_t)tp->b_rptr,
+                                              len, DDI_DMA_RDWR | DDI_DMA_STREAMING,
+                                              DDI_DMA_DONTWAIT, NULL,
+                                              &dmaCookie, &nCookies))
+             != DDI_DMA_MAPPED) {
+
+            /*
+             * Try to handle bind failure caused by a page boundary spill
+             * by allocating a private dma buffer and copying data into it
+             */
+            if ((rval == DDI_DMA_TOOBIG) && !dmaMemAlloced ) {
+               /*
+                * Force pullup
+                */
+               if (!needPullup && (dp->maxTxFrags > 1)) {
+                  needPullup = TRUE;
+                  goto pullup;
+               }
+
+               if (Vxn_AllocDmaMem(dp, len, FALSE, &txBuf->dmaMem)
+                   != SOLVMXNET_SUCCESS) {
+                  goto err;
+               }
+
+               dmaMemAlloced = TRUE;
+
+               /*
+                * Copy data into DMA capable buffer
+                */
+               bcopy(tp->b_rptr, txBuf->dmaMem.buf, len);
+
+               /*
+                * Stick buffer physical addr in the ring
+                */
+               xre->sg.sg[frag].addrLow = txBuf->dmaMem.phyBuf;
+               xre->sg.sg[frag].length = len;
+               frag++;
+
+               continue;   /* the handle was never bound: nothing to unbind */
+
+            } else {
+               cmn_err(CE_WARN, "%s%d: Vxn_EncapTxBuf: failed (%d) to bind dma "
+                       "handle for len %d. [dmaMemAlloced=%d]",
+                       dp->drvName, dp->unit, rval, len, dmaMemAlloced);
+               goto err;
+            }
+         }
+
+         /*
+          * Extract tx buffer physical addresses from cookie
+          */
+         while (nCookies) {
+            if (UNLIKELY(frag == dp->maxTxFrags)) {
+               (void)ddi_dma_unbind_handle(dp->txDmaHdl);
+
+               if (!needPullup) {
+                  ASSERT(!dmaMemAlloced);
+                  needPullup = TRUE;
+                  goto pullup;
+               } else {
+                  cmn_err(CE_WARN, "%s%d: Vxn_EncapTxBuf: "
+                          "exceeded max (%d) fragments in message",
+                          dp->drvName, dp->unit, dp->maxTxFrags);
+                  goto err;
+               }
+            }
+
+            /*
+             * Stick it in the ring
+             */
+            xre->sg.sg[frag].addrLow = dmaCookie.dmac_address;
+            xre->sg.sg[frag].length = dmaCookie.dmac_size;
+            frag++;
+
+            if (--nCookies) {
+               ddi_dma_nextcookie(dp->txDmaHdl, &dmaCookie);
+            }
+         }
+
+         (void)ddi_dma_unbind_handle(dp->txDmaHdl);
+      }
+   }
+
+   if (frag > 0) {
+      xre->sg.length = frag;
+
+      /* Give ownership to NIC */
+      xre->sg.addrType = NET_SG_PHYS_ADDR;
+      xre->ownership = VMXNET2_OWNERSHIP_NIC;
+      xre->flags |= VMXNET2_TX_CAN_KEEP;
+      txBuf->mblk = mblk;
+
+      /*
+       * If we called msgpullup to concatenate fragments, free
+       * original mblk now since we're going to return success.
+       */
+      if (mblk != mp) {
+         freemsg(mp);
+      }
+
+      return SOLVMXNET_SUCCESS;
+   }
+
+err:
+   if (mblk != NULL && mblk != mp) {
+      /*
+       * Free mblk allocated by msgpullup.
+       */
+      freemsg(mblk);
+   }
+
+   if (dmaMemAlloced) {
+      ASSERT(txBuf->dmaMem.buf);
+      Vxn_FreeDmaMem(&txBuf->dmaMem);
+   }
+
+   return SOLVMXNET_FAILURE;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_Send --
+ *    GLD Transmit routine. Starts packet hard tx. 
+ *
+ * Results:
+ *    GLD_SUCCESS on success
+ *    GLD_FAILURE on failure
+ *
+ * Side effects:
+ *    None
+ *-----------------------------------------------------------------------------
+ */
+static int
+Vxn_Send(gld_mac_info_t *macinfo, mblk_t *mp)
+{
+   Vmxnet2_TxRingEntry *xre;
+   int err = GLD_SUCCESS;   /* may also become GLD_NORESOURCES, see below */
+   vxn_softc_t *dp = (vxn_softc_t *)macinfo->gldm_private;
+   Vmxnet2_DriverData *dd = dp->driverData;
+   boolean_t resched = FALSE;
+
+   mutex_enter(&dp->xmitlock);
+
+   /*
+    * Check if ring entry at drop pointer is available. A slot that still
+    * has an mblk attached has not been reclaimed yet, so the ring is full:
+    * return GLD_NORESOURCES, count a deferral, and set txStopped so that
+    * Vxn_TxComplete() requests a gld_sched() once a slot frees up.
+    */
+   if (TX_RINGBUF_MBLK(dp, dd->txDriverNext) != NULL) {
+      DPRINTF(3, (CE_NOTE, "%s%d: Vxn_Send: tx ring full",
+                  dp->drvName, dp->unit));
+      err = GLD_NORESOURCES;
+      dd->txStopped = TRUE;
+      dp->stats.defer++;
+      goto out;
+   }
+
+   xre = &dp->txRing[dd->txDriverNext];
+
+   /*
+    * Drop packet into ring entry
+    */
+   if (Vxn_EncapTxBuf(dp, mp, xre, &dp->txRingBuf[dd->txDriverNext])
+       != SOLVMXNET_SUCCESS) {
+      err = GLD_FAILURE;
+      dp->stats.errxmt++;
+      goto out;
+   }
+
+   /*
+    * Increment drop pointer
+    */
+   VMXNET_INC(dd->txDriverNext, dd->txRingLength);
+   dd->txNumDeferred++;
+   dp->txPending++;
+
+   /*
+    * Transmit, if number of deferred packets >= tx cluster length
+    */
+   if (dd->txNumDeferred >= dd->txClusterLength) {
+      dd->txNumDeferred = 0;
+
+      /*
+       * Call hardware transmit (the value read back is discarded)
+       */
+      INL(dp, VMXNET_TX_ADDR);
+   }
+
+   /*
+    * Clean up transmit ring. TX completion interrupts are not guaranteed
+    */
+   (void) Vxn_TxComplete(dp, &resched);
+
+out:
+   mutex_exit(&dp->xmitlock);
+   if (resched) {
+      /* Tell GLD to retry any deferred packets (xmitlock already dropped) */
+      gld_sched(dp->macInfo);
+   }
+   return err;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_TxComplete --
+ *    Scan Tx ring for completed transmits. Reclaim Tx buffers.
+ *
+ * Results:
+ *    Returns TRUE if it found a completed transmit, FALSE otherwise.
+ *    Also sets *reschedp to TRUE if the caller should call gld_sched
+ *    to reschedule transmits (once all locks are dropped). 
+ *
+ * Side effects:
+ *    None
+ *-----------------------------------------------------------------------------
+ */
+static boolean_t
+Vxn_TxComplete(vxn_softc_t *dp, boolean_t *reschedp)
+{
+   Vmxnet2_DriverData *dd = dp->driverData;
+   boolean_t found = FALSE;
+   boolean_t needresched = FALSE;
+
+   ASSERT(MUTEX_HELD(&dp->xmitlock));
+
+   while (1) {
+      Vmxnet2_TxRingEntry *xre = &dp->txRing[dd->txDriverCur];
+
+      /* stop at the first slot the NIC still owns, or that holds no mblk */
+      if (xre->ownership != VMXNET2_OWNERSHIP_DRIVER ||
+          (TX_RINGBUF_MBLK(dp, dd->txDriverCur) == NULL)) {
+         break;
+      }
+
+      found = TRUE;
+      Vxn_FreeTxBuf(dp, dd->txDriverCur);
+
+      dp->txPending--;
+      VMXNET_INC(dd->txDriverCur, dd->txRingLength);
+      if (dd->txStopped) {   /* Vxn_Send() hit a full ring earlier */
+         needresched = TRUE;
+         dd->txStopped = FALSE;
+      }
+   }
+
+   *reschedp = needresched;   /* always written, even when nothing was found */
+   return found;
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_Receive --
+ *    Rx handler. First assembles the packets into a chain of mblks,
+ *    then drops locks and passes them up the stack to GLD.
+ *
+ *    For every ring entry handed back to the driver: frames shorter than
+ *    60 - 4 bytes are counted as errors and dropped; otherwise a
+ *    replacement buffer is taken from the pool, swapped into the ring, and
+ *    the filled buffer's mblk is queued for delivery. If no replacement
+ *    buffer is available the frame is dropped and the old buffer reused.
+ *    Each entry is then given back to the NIC.
+ *
+ *    The caller must hold dp->intrlock (asserted).
+ *
+ * Results:
+ *    Returns TRUE if it finds a packet ready for processing, FALSE
+ *    otherwise.
+ *
+ * Side effects:
+ *    dp->intrlock is released around the gld_recv() calls and re-acquired
+ *    before returning.
+ *-----------------------------------------------------------------------------
+ */
+static boolean_t
+Vxn_Receive(vxn_softc_t *dp)
+{
+   int ringnext;
+   short pktlen;
+   Vmxnet2_DriverData *dd = dp->driverData;
+   rx_dma_buf_t *rxDesc;
+   rx_dma_buf_t *newRxDesc;
+   mblk_t *mblk;
+   mblk_t *head = NULL;
+   mblk_t **tail = &head;
+   mblk_t *next;
+   boolean_t found = FALSE; /* Did we find at least one packet? */
+
+   ASSERT(MUTEX_HELD(&dp->intrlock));
+
+   /*
+    * Walk receive ring looking for entries with ownership
+    * reverted back to driver
+    */
+   while (1) {
+      Vmxnet2_RxRingEntry *rre;
+      rx_dma_buf_t **rbuf;
+
+      ringnext = dd->rxDriverNext;
+      rre = &dp->rxRing[ringnext];
+      rbuf = &dp->rxRingBuffPtr[ringnext];
+
+      if (rre->ownership != VMXNET2_OWNERSHIP_DRIVER) {
+         break;
+      }
+
+      found = TRUE;
+
+      pktlen = rre->actualLength;
+
+      if (pktlen < (60 - 4)) {
+         /*
+          * Ethernet header vlan tags are 4 bytes. Some vendors generate
+          * 60byte frames including vlan tags. When vlan tag
+          * is stripped, such frames become 60 - 4. (PR106153)
+          */
+         dp->stats.errrcv++;
+         if (pktlen != 0) {
+            DPRINTF(3, (CE_CONT, "%s%d: runt packet\n", dp->drvName, dp->unit));
+            dp->stats.runt++;
+         }
+      } else {
+         /*
+          * Alloc new Rx buffer to replace current one
+          */
+         newRxDesc = Vxn_AllocRxBufFromPool(dp);
+
+         if (newRxDesc) {
+            rxDesc = *rbuf;
+            mblk = rxDesc->mblk;
+
+            *rbuf = newRxDesc;
+            rre->paddr = newRxDesc->dmaDesc.phyBuf + ETHERALIGN;
+            rre->bufferLength = MAXPKTBUF - ETHERALIGN;
+            rre->actualLength = 0;
+
+            /*
+             * Advance write pointer past packet length
+             */
+            mblk->b_wptr = mblk->b_rptr + pktlen;
+
+            /*
+             * Add to end of chain.
+             */
+            mblk->b_next = NULL;
+            *tail = mblk;
+            tail = &mblk->b_next;
+         } else {
+            dp->stats.errrcv++;
+            dp->stats.norcvbuf++;
+         }
+      }
+
+      /* Give the descriptor back to NIC */
+      rre->ownership = VMXNET2_OWNERSHIP_NIC;
+      VMXNET_INC(dd->rxDriverNext, dd->rxRingLength);
+   }
+
+   /*
+    * Walk chain and pass mblks up to gld_recv one by one.
+    */
+   mutex_exit(&dp->intrlock);
+   for (mblk = head; mblk != NULL; mblk = next) {
+      next = mblk->b_next;   /* saved first: the mblk is unlinked below */
+      mblk->b_next = NULL;
+      gld_recv(dp->macInfo, mblk);
+   }
+   mutex_enter(&dp->intrlock);
+
+   return (found);
+}
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_Interrupt --
+ *    GLD interrupt handler. 
Scan: Rx ring for received packets, Tx ring for
+ *    completed transmits
+ *
+ *    The whole scan runs under intrlock, with inIntr set for its duration;
+ *    xmitlock is taken only around the Tx reclaim. Nothing is touched when
+ *    the NIC is not active.
+ *
+ * Results:
+ *    - DDI_INTR_CLAIMED (if we found something to do)
+ *    - DDI_INTR_UNCLAIMED (if not)
+ *
+ * Side effects:
+ *    Acknowledges the interrupt on the device, bumps the interrupt counter
+ *    when work was found, and calls gld_sched() after dropping the locks if
+ *    the Tx reclaim asked for a reschedule.
+ *-----------------------------------------------------------------------------
+ */
+static u_int
+Vxn_Interrupt(gld_mac_info_t *macInfo)
+{
+   vxn_softc_t *dp = (vxn_softc_t *)macInfo->gldm_private;
+   u_int claimed = DDI_INTR_UNCLAIMED;
+   boolean_t wantSched = FALSE;
+
+   mutex_enter(&dp->intrlock);
+   dp->inIntr = TRUE;
+
+   if (dp->nicActive) {
+      boolean_t rxWork;
+      boolean_t txWork;
+
+      /* Ack interrupt before looking at the rings */
+      OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_INTR_ACK);
+
+      rxWork = Vxn_Receive(dp);
+
+      mutex_enter(&dp->xmitlock);
+      txWork = Vxn_TxComplete(dp, &wantSched);
+      mutex_exit(&dp->xmitlock);
+
+      if (rxWork || txWork) {
+         claimed = DDI_INTR_CLAIMED;
+         dp->stats.interrupts++;
+      }
+   }
+
+   dp->inIntr = FALSE;
+   mutex_exit(&dp->intrlock);
+
+   /* Reschedule deferred transmits only once no driver lock is held */
+   if (wantSched) {
+      gld_sched(dp->macInfo);
+   }
+
+   return claimed;
+}
+
+
+/*
+ *-----------------------------------------------------------------------------
+ * Vxn_ReclaimRxBuf --
+ *    Callback handler invoked by freemsg(). 
Frees Rx buffer memory and mappings + * + * Results: + * None + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static void +Vxn_ReclaimRxBuf(rx_dma_buf_t *rxDesc) +{ + Vxn_FreeRxBufToPool(rxDesc); +} + +/* + *----------------------------------------------------------------------------- + * Vxn_FreeRxBuf -- + * Free allocated Rx buffer + * + * Results: + * None + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static void +Vxn_FreeRxBuf(rx_dma_buf_t *rxDesc) +{ + ASSERT(rxDesc); + + if (rxDesc->mblk) { + freemsg(rxDesc->mblk); + } else { + Vxn_FreeDmaMem(&rxDesc->dmaDesc); + kmem_free(rxDesc, sizeof(rx_dma_buf_t)); + } +} + + +/* + *----------------------------------------------------------------------------- + * Vxn_AllocRxBuf -- + * Allocate Rx buffer + * + * Results: + * Pointer to Rx buffer descriptor - on success + * NULL - on failure + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static rx_dma_buf_t * +Vxn_AllocRxBuf(vxn_softc_t *dp, int cansleep) +{ + rx_dma_buf_t *rxDesc; + + rxDesc = (rx_dma_buf_t *)kmem_zalloc(sizeof(rx_dma_buf_t), + cansleep ? 
KM_SLEEP : KM_NOSLEEP); + if (!rxDesc) { + cmn_err(CE_WARN, "%s%d: Vxn_AllocRxBuf: kmem_zalloc failed", + dp->drvName, dp->unit); + return NULL; + } + + rxDesc->softc = dp; + + /* + * Alloc dma-able packet memory + */ + if (Vxn_AllocDmaMem(dp, MAXPKTBUF, cansleep, &rxDesc->dmaDesc) + != SOLVMXNET_SUCCESS) { + kmem_free(rxDesc, sizeof(rx_dma_buf_t)); + return NULL; + } + + /* + * Fill in free callback; fired by freemsg() + */ + rxDesc->freeCB.free_func = &Vxn_ReclaimRxBuf; + rxDesc->freeCB.free_arg = (caddr_t) rxDesc; + + rxDesc->mblk = NULL; + return rxDesc; +} + +/* + *----------------------------------------------------------------------------- + * Vxn_FreeInitBuffers -- + * Free allocated Tx and Rx buffers + * + * Results: + * None + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static void +Vxn_FreeInitBuffers(vxn_softc_t *dp) +{ + int i; + + for (i=0; i<dp->vxnNumRxBufs; i++) { + if (dp->rxRingBuffPtr[i]) { + Vxn_FreeRxBuf(dp->rxRingBuffPtr[i]); + dp->rxRingBuffPtr[i] = NULL; + } + } + + for (i=0; i<dp->vxnNumTxBufs; i++) { + if (TX_RINGBUF_MBLK(dp, i)) { + Vxn_FreeTxBuf(dp, i); + } + } + + /* + * Rx pool must get freed last. Rx buffers above will + * show up on the pool when freemsg callback fires. + */ + Vxn_FreeRxBufPool(dp); +} + + +/* + *----------------------------------------------------------------------------- + * Vxn_AllocRxBufPool -- + * Allocate pool of rx buffers - 3 * configured Rx buffers + * + * Results: + * SOLVMXNET_SUCCESS/SOLVMXNET_FAILURE + * + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static int +Vxn_AllocRxBufPool(vxn_softc_t *dp) +{ + int i; + + dp->rxFreeBufList = NULL; + + // Allow list to double in size if needed. Any additional buffers + // that are allocated on the fly will be freed back to main memory. 
+ dp->rxMaxFreeBufs = dp->vxnNumRxBufs * 6; + + for (i = 0; i < dp->vxnNumRxBufs * 3; i++) { + rx_dma_buf_t *rxDesc; + + /* + * Alloc rx buffer + */ + if (!(rxDesc = Vxn_AllocRxBuf(dp, TRUE))) { + cmn_err(CE_WARN, "%s%d: Vxn_AllocRxBufPool: failed to allocate memory", + dp->drvName, dp->unit); + dp->rxNumFreeBufs = i; + return SOLVMXNET_FAILURE; + } + /* + * Add to free list + */ + rxDesc->next = dp->rxFreeBufList; + dp->rxFreeBufList = rxDesc; + } + + dp->rxNumFreeBufs = i; + return SOLVMXNET_SUCCESS; +} + +/* + *----------------------------------------------------------------------------- + * Vxn_FreeRxBufPool -- + * Free rx buffers pool + * + * Results: + * None + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static void +Vxn_FreeRxBufPool(vxn_softc_t *dp) +{ + while (dp->rxFreeBufList) { + rx_dma_buf_t *rxDesc = dp->rxFreeBufList; + + /* unlink */ + dp->rxFreeBufList = rxDesc->next; + + ASSERT(rxDesc->mblk == NULL); + Vxn_FreeDmaMem(&rxDesc->dmaDesc); + kmem_free(rxDesc, sizeof(rx_dma_buf_t)); + } + dp->rxNumFreeBufs = 0; +} + +/* + *----------------------------------------------------------------------------- + * Vxn_AllocRxBufFromPool -- + * Allocate Rx buffer from free pool + * + * Results: + * Pointer to Rx buffer descriptor - on success + * NULL - on failure + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static rx_dma_buf_t * +Vxn_AllocRxBufFromPool(vxn_softc_t *dp) +{ + rx_dma_buf_t *rxDesc = NULL; + + mutex_enter(&dp->rxlistlock); + if (dp->rxFreeBufList) { + rxDesc = dp->rxFreeBufList; + dp->rxFreeBufList = rxDesc->next; + ASSERT(dp->rxNumFreeBufs >= 1); + dp->rxNumFreeBufs--; + } + mutex_exit(&dp->rxlistlock); + + if (!rxDesc) { + /* + * Try to allocate new descriptor from memory. Can't block here + * since we could be being called from interrupt context. 
+ */ + DPRINTF(5, (CE_NOTE, "%s%d: allocating rx buf from memory", + dp->drvName, dp->unit)); + if (!(rxDesc = Vxn_AllocRxBuf(dp, FALSE))) { + cmn_err(CE_WARN, + "%s%d: Vxn_AllocRxBufFromPool : pool rx alloc failed", + dp->drvName, dp->unit); + return NULL; + } + } + + /* + * Allocate new message block for this buffer + */ + rxDesc->mblk = desballoc((uchar_t *)rxDesc->dmaDesc.buf + ETHERALIGN, + rxDesc->dmaDesc.bufLen - ETHERALIGN, + BPRI_MED, &rxDesc->freeCB); + if (!rxDesc->mblk) { + cmn_err(CE_WARN, "%s%d: Vxn_AllocRxBufFromPool : desballoc failed", + dp->drvName, dp->unit); + + /* put back on free list */ + Vxn_FreeRxBufToPool(rxDesc); + return NULL; + } + + return rxDesc; +} + +/* + *----------------------------------------------------------------------------- + * Vxn_FreeRxBufToPool -- + * Return rx buffer to free pool + * + * Results: + * None + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static void +Vxn_FreeRxBufToPool(rx_dma_buf_t *rxDesc) +{ + vxn_softc_t *dp = rxDesc->softc; + + rxDesc->mblk = NULL; + + /* + * Insert on free list, or free if the list is full + */ + mutex_enter(&dp->rxlistlock); + if (dp->rxNumFreeBufs >= dp->rxMaxFreeBufs) { + DPRINTF(5, (CE_NOTE, "%s%d: freeing rx buf to memory", + dp->drvName, dp->unit)); + Vxn_FreeRxBuf(rxDesc); + } else { + rxDesc->next = dp->rxFreeBufList; + dp->rxFreeBufList = rxDesc; + dp->rxNumFreeBufs++; + } + mutex_exit(&dp->rxlistlock); +} + +/* + *----------------------------------------------------------------------------- + * Vxn_AllocInitBuffers -- + * Allocated Rx buffers and init ring entries + * + * Results: + * SOLVMXNET_SUCCESS - on success + * SOLVMXNET_FAILURE - on failure + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static int +Vxn_AllocInitBuffers(vxn_softc_t *dp) +{ + Vmxnet2_DriverData *dd; + uint32_t i, offset; + + dd = dp->driverData; + offset = 
sizeof(*dd); + + /* + * Init shared structures + */ + dd->rxRingLength = dp->vxnNumRxBufs; + dd->rxRingOffset = offset; + dp->rxRing = (Vmxnet2_RxRingEntry *)((uintptr_t)dd + offset); + offset += dp->vxnNumRxBufs * sizeof(Vmxnet2_RxRingEntry); + + dd->rxRingLength2 = 1; + dd->rxRingOffset2 = offset; + offset += sizeof(Vmxnet2_RxRingEntry); + + dd->txRingLength = dp->vxnNumTxBufs; + dd->txRingOffset = offset; + dp->txRing = (Vmxnet2_TxRingEntry *)((uintptr_t)dd + offset); + offset += dp->vxnNumTxBufs * sizeof(Vmxnet2_TxRingEntry); + + /* + * Alloc Rx buffers pool + */ + if ( Vxn_AllocRxBufPool(dp) != SOLVMXNET_SUCCESS) { + cmn_err(CE_WARN, "%s%d: Vxn_AllocInitBuffers: failed to alloc buf pool", + dp->drvName, dp->unit); + return SOLVMXNET_FAILURE; + } + + /* + * Allocate receive buffers + */ + for (i = 0; i < dp->vxnNumRxBufs; i++) { + rx_dma_buf_t *rxDesc; + Vmxnet2_RxRingEntry *rre = &dp->rxRing[i]; + + if (!(rxDesc = Vxn_AllocRxBufFromPool(dp))) { + cmn_err(CE_WARN, "%s%d: Vxn_AllocInitBuffers: " + "failed to alloc buf from pool", dp->drvName, dp->unit); + goto err; + } + + /* + * Init ring entries + */ + rre->paddr = rxDesc->dmaDesc.phyBuf + ETHERALIGN; + rre->bufferLength = MAXPKTBUF - ETHERALIGN; + rre->actualLength = 0; + dp->rxRingBuffPtr[i] = rxDesc; + rre->ownership = VMXNET2_OWNERSHIP_NIC; + } + + dp->txDmaHdl = NULL; + + /* + * Dummy recvRing2 tacked on to the end, with a single unusable entry + */ + dp->rxRing[i].paddr = 0; + dp->rxRing[i].bufferLength = 0; + dp->rxRing[i].actualLength = 0; + dp->rxRingBuffPtr[i] = NULL; + dp->rxRing[i].ownership = VMXNET2_OWNERSHIP_DRIVER; + + dd->rxDriverNext = 0; + + /* + * Give xmit ring ownership to DRIVER + */ + for (i = 0; i < dp->vxnNumTxBufs; i++) { + dp->txRing[i].ownership = VMXNET2_OWNERSHIP_DRIVER; + dp->txRingBuf[i].mblk = NULL; + dp->txRingBuf[i].dmaMem.buf = NULL; + dp->txRing[i].sg.sg[0].addrHi = 0; + } + + dd->txDriverCur = dd->txDriverNext = 0; + dd->txStopped = FALSE; + + return SOLVMXNET_SUCCESS; + 
+err: + for (i=0; i<dp->vxnNumRxBufs; i++) { + if (dp->rxRingBuffPtr[i]) { + Vxn_FreeRxBuf(dp->rxRingBuffPtr[i]); + dp->rxRingBuffPtr[i] = NULL; + } + } + + Vxn_FreeRxBufPool(dp); + return SOLVMXNET_FAILURE; +} + +/* + *----------------------------------------------------------------------------- + * Vxn_FreeDmaMem -- + * Free allocated dma memory + * + * Results: + * None + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static void +Vxn_FreeDmaMem(dma_buf_t *dma) +{ + ddi_dma_unbind_handle(dma->dmaHdl); + ddi_dma_mem_free(&dma->dataAccHdl); + ddi_dma_free_handle(&dma->dmaHdl); + + dma->buf = NULL; + dma->phyBuf = (uint32_t)(uintptr_t)NULL; + dma->bufLen = 0; +} + +/* + *----------------------------------------------------------------------------- + * Vxn_AllocDmaMem -- + * Allocate dma-able memory and fill passed in dma descriptor pointer + * if successful + * + * Results: + * SOLVMXNET_SUCCESS on success + * SOLVMXNET_FAILURE on failure + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static int +Vxn_AllocDmaMem(vxn_softc_t *dp, int size, int cansleep, dma_buf_t *dma) +{ + /* + * Allocate handle + */ + if (ddi_dma_alloc_handle(dp->dip, &vxn_dma_attrs, + cansleep ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT, + NULL, &dma->dmaHdl) != DDI_SUCCESS) { + cmn_err(CE_WARN, "%s%d: Vxn_AllocDmaMem: failed to allocate handle", + dp->drvName, dp->unit); + return SOLVMXNET_FAILURE; + } + + /* + * Allocate memory + */ + if (ddi_dma_mem_alloc(dma->dmaHdl, size, &vxn_buf_attrs, DDI_DMA_CONSISTENT, + cansleep ? 
DDI_DMA_SLEEP : DDI_DMA_DONTWAIT, NULL, + &dma->buf, &dma->bufLen, &dma->dataAccHdl) + != DDI_SUCCESS) { + cmn_err(CE_WARN, "%s%d: Vxn_AllocDmaMem: " + "ddi_dma_mem_alloc %d bytes failed", + dp->drvName, dp->unit, size); + ddi_dma_free_handle(&dma->dmaHdl); + return SOLVMXNET_FAILURE; + } + + /* + * Mapin memory + */ + if (ddi_dma_addr_bind_handle(dma->dmaHdl, NULL, dma->buf, dma->bufLen, + DDI_DMA_RDWR | DDI_DMA_STREAMING, + cansleep ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT, + NULL, &dma->cookie, &dma->cookieCount) + != DDI_DMA_MAPPED) { + cmn_err(CE_WARN, "%s%d: Vxn_AllocDmaMem: failed to bind handle", + dp->drvName, dp->unit); + ddi_dma_mem_free(&dma->dataAccHdl); + ddi_dma_free_handle(&dma->dmaHdl); + return SOLVMXNET_FAILURE; + } + + if (dma->cookieCount != 1) { + cmn_err(CE_WARN, "%s%d: Vxn_AllocDmaMem: too many DMA cookies", + dp->drvName, dp->unit); + Vxn_FreeDmaMem(dma); + return SOLVMXNET_FAILURE; + } + + /* + * Save physical address (for easy use) + */ + dma->phyBuf = dma->cookie.dmac_address; + + return SOLVMXNET_SUCCESS; +} + +/* + *----------------------------------------------------------------------------- + * Vxn_FreeDriverData -- + * Free driver data structures and Tx Rx buffers + * + * Results: + * None + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static void +Vxn_FreeDriverData(vxn_softc_t *dp) +{ + Vxn_FreeInitBuffers(dp); + Vxn_FreeDmaMem(&dp->driverDataDmaMem); +} + +/* + *----------------------------------------------------------------------------- + * Vxn_AllocDriverData -- + * Allocate driver data structures and Tx Rx buffers on init + * + * Results: + * SOLVMXNET_SUCCESS on success + * SOLVMXNET_FAILURE on failure + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static int +Vxn_AllocDriverData(vxn_softc_t *dp) +{ + uint32_t r, driverDataSize; + + /* + * Get configured receive buffers + */ + OUTL(dp, 
VMXNET_COMMAND_ADDR, VMXNET_CMD_GET_NUM_RX_BUFFERS); + r = INL(dp, VMXNET_COMMAND_ADDR); + if (r == 0 || r > MAX_NUM_RECV_BUFFERS) { + r = DEFAULT_NUM_RECV_BUFFERS; + } + dp->vxnNumRxBufs = r; + + /* + * Get configured transmit buffers + */ + OUTL(dp, VMXNET_COMMAND_ADDR, VMXNET_CMD_GET_NUM_TX_BUFFERS); + r = INL(dp, VMXNET_COMMAND_ADDR); + if (r == 0 || r > MAX_NUM_XMIT_BUFFERS) { + r = DEFAULT_NUM_XMIT_BUFFERS; + } + dp->vxnNumTxBufs = r; + + /* + * Calculate shared data size and allocate memory for it + */ + driverDataSize = + sizeof(Vmxnet2_DriverData) + + /* numRecvBuffers + 1 for the dummy recvRing2 (used only by Windows) */ + (dp->vxnNumRxBufs + 1) * sizeof(Vmxnet2_RxRingEntry) + + dp->vxnNumTxBufs * sizeof(Vmxnet2_TxRingEntry); + + if (Vxn_AllocDmaMem(dp, driverDataSize, TRUE, &dp->driverDataDmaMem) + != SOLVMXNET_SUCCESS) { + return SOLVMXNET_FAILURE; + } + + /* + * Clear memory (bzero isn't resolved by module loader for some reason) + */ + ASSERT(dp->driverDataDmaMem.buf && dp->driverDataDmaMem.bufLen); + Vxn_Memset(dp->driverDataDmaMem.buf, 0, dp->driverDataDmaMem.bufLen); + + dp->driverData = (Vmxnet2_DriverData *)dp->driverDataDmaMem.buf; + dp->driverDataPhy = (void *)(uintptr_t)dp->driverDataDmaMem.phyBuf; + + /* So that the vmkernel can check it is compatible */ + dp->driverData->magic = VMXNET2_MAGIC; + dp->driverData->length = driverDataSize; + + /* + * Alloc rx/tx buffers, init ring, register with hardware etc. 
+ */ + if (Vxn_AllocInitBuffers(dp) != SOLVMXNET_SUCCESS) { + Vxn_FreeDmaMem(&dp->driverDataDmaMem); + return SOLVMXNET_FAILURE; + } + + DPRINTF(3, (CE_CONT, "%s%d: numRxBufs=(%d*%"FMT64"d) numTxBufs=(%d*%"FMT64"d)" + " driverDataSize=%d driverDataPhy=0x%p\n", + dp->drvName, dp->unit, + dp->vxnNumRxBufs, (uint64_t)sizeof(Vmxnet2_RxRingEntry), + dp->vxnNumTxBufs, (uint64_t)sizeof(Vmxnet2_TxRingEntry), + driverDataSize, dp->driverDataPhy)); + + return SOLVMXNET_SUCCESS; +} + + +/* + *----------------------------------------------------------------------------- + * Vxn_Attach -- + * Probe and attach driver to stack + * + * Results: + * DDI_SUCCESS + * DDI_FAILURE + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static int +Vxn_Attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i, ret, len, unit; + const char *drvName; + ddi_acc_handle_t confHdl; + uint16_t vid, did; + uint8_t revid __unused; + struct pci_phys_spec *regs; + caddr_t vxnIOp; + ddi_acc_handle_t vxnIOHdl; + uint32_t vLow, vHigh; + gld_mac_info_t *macInfo; + vxn_softc_t *dp; + boolean_t morphed = FALSE; + uint_t regSpaceSize; + uint_t chip; + uint_t vxnIOSize; + + if (cmd != DDI_ATTACH) { + return DDI_FAILURE; + } + + unit = ddi_get_instance(dip); + drvName = ddi_driver_name(dip); + + /* + * Check if chip is supported. 
+ */ + if (pci_config_setup(dip, &confHdl) != DDI_SUCCESS) { + cmn_err(CE_WARN, "%s%d: pci_config_setup() failed", drvName, unit); + return DDI_FAILURE; + } + + vid = pci_config_get16(confHdl, PCI_CONF_VENID); + did = pci_config_get16(confHdl, PCI_CONF_DEVID); + revid = pci_config_get8(confHdl, PCI_CONF_REVID); + + if (vid == PCI_VENDOR_ID_VMWARE && did == PCI_DEVICE_ID_VMWARE_NET) { + /* Found vmxnet */ + chip = VMXNET_CHIP; + } + else if (vid == PCI_VENDOR_ID_AMD && did == PCI_DEVICE_ID_AMD_VLANCE) { + /* Found vlance (maybe a vmxnet disguise) */ + chip = LANCE_CHIP; + } + else { + /* Not Found */ + DPRINTF(3, (CE_WARN, "%s: Vxn_Attach: wrong PCI venid/devid (0x%x, 0x%x)", + drvName, vid, did)); + goto err; + } + + DPRINTF(3, (CE_CONT, "%s%d: (vid: 0x%04x, did: 0x%04x, revid: 0x%02x)\n", + drvName, unit, vid, did, revid)); + + /* + * Get device properties. On success regs/len describe a buffer + * that is released with kmem_free(regs, len) below. + */ + regs = NULL; + len = 0; + if (ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "reg", (caddr_t)&regs, &len) != DDI_PROP_SUCCESS) { + cmn_err(CE_WARN, "%s%d: Vxn_Attach: failed to get reg property", + drvName, unit); + goto err; + } + + ASSERT(regs != NULL && len > 0); + + /* + * Search device properties for IO-space + */ + for (i = 0; i <len / sizeof(struct pci_phys_spec); i++) { + if ((regs[i].pci_phys_hi & PCI_REG_ADDR_M) == PCI_ADDR_IO) { + regSpaceSize = regs[i].pci_size_low; + DPRINTF(5, (CE_CONT, "%s%d: Vxn_Attach: regSpaceSize=%d\n", + drvName, unit, regSpaceSize)); + kmem_free(regs, len); + goto map_space_found; + } + } + + cmn_err(CE_WARN, "%s%d: Vxn_Attach: failed to find IO space", drvName, unit); + kmem_free(regs, len); + goto err; + +map_space_found: + + /* + * Ensure we can access registers through IO space. 
+ */ + ret = pci_config_get16(confHdl, PCI_CONF_COMM); + ret |= PCI_COMM_IO | PCI_COMM_ME; + pci_config_put16(confHdl, PCI_CONF_COMM, ret); + + if (ddi_regs_map_setup(dip, i, (caddr_t *)&vxnIOp, 0, 0, &dev_attr, + &vxnIOHdl) != DDI_SUCCESS) { + cmn_err(CE_WARN, "%s%d: Vxn_Attach: ddi_regs_map_setup failed", + drvName, unit); + goto err; + } + + if (chip == VMXNET_CHIP) { + vxnIOSize = VMXNET_CHIP_IO_RESV_SIZE; + } + else { + /* + * Since this is a vlance adapter we can only use it if + * its I/0 space is big enough for the adapter to be + * capable of morphing. This is the first requirement + * for this adapter to potentially be morphable. The + * layout of a morphable LANCE adapter is + * + * I/O space: + * + * |------------------| + * | LANCE IO PORTS | + * |------------------| + * | MORPH PORT | + * |------------------| + * | VMXNET IO PORTS | + * |------------------| + * + * VLance has 8 ports of size 4 bytes, the morph port is 4 bytes, and + * Vmxnet has 10 ports of size 4 bytes. + * + * We shift up the ioaddr with the size of the LANCE I/O space since + * we want to access the vmxnet ports. We also shift the ioaddr up by + * the MORPH_PORT_SIZE so other port access can be independent of + * whether we are Vmxnet or a morphed VLance. This means that when + * we want to access the MORPH port we need to subtract the size + * from ioaddr to get to it. 
+ */ + vxnIOp += LANCE_CHIP_IO_RESV_SIZE + MORPH_PORT_SIZE; + vxnIOSize = LANCE_CHIP_IO_RESV_SIZE + MORPH_PORT_SIZE + + VMXNET_CHIP_IO_RESV_SIZE; + } + + /* + * Do not attempt to morph non-morphable AMD PCnet + */ + if (vxnIOSize > regSpaceSize) { + cmn_err(CE_WARN, "%s%d: Vxn_Attach: " + "vlance device is not supported by this driver", drvName, unit); + goto err_free_regs_map; + } + + /* + * Morph, if we found a vlance adapter + */ + if (chip == LANCE_CHIP) { + uint16_t magic; + + /* Read morph port to verify that we can morph the adapter */ + magic = ddi_get16(vxnIOHdl, (uint16_t *)(vxnIOp - MORPH_PORT_SIZE)); + if (magic != LANCE_CHIP && magic != VMXNET_CHIP) { + cmn_err(CE_WARN, "%s%d: Vxn_Attach: Invalid magic, read: 0x%08X", + drvName, unit, magic); + goto err_free_regs_map; + } + + /* Morph */ + ddi_put16(vxnIOHdl, (uint16_t *)(vxnIOp - MORPH_PORT_SIZE), VMXNET_CHIP); + morphed = TRUE; + + /* Verify that we morphed correctly */ + magic = ddi_get16(vxnIOHdl, (uint16_t *)(vxnIOp - MORPH_PORT_SIZE)); + if (magic != VMXNET_CHIP) { + cmn_err(CE_WARN, "%s%d: Vxn_Attach: Couldn't morph adapter." 
+ " Invalid magic, read:: 0x%08X", drvName, unit, magic); + goto err_morph_back; + } + } + + /* + * Check the version number of the device implementation + */ + vLow = (uint32_t)ddi_get32(vxnIOHdl, + (uint32_t *)(vxnIOp+VMXNET_LOW_VERSION)); + vHigh = (uint32_t)ddi_get32(vxnIOHdl, + (uint32_t *)(vxnIOp+VMXNET_HIGH_VERSION)); + + if ((vLow & 0xffff0000) != (VMXNET2_MAGIC & 0xffff0000) || + ((VMXNET2_MAGIC < vLow) || (VMXNET2_MAGIC > vHigh))) { + cmn_err(CE_WARN, "%s%d: Vxn_Attach: driver version 0x%08X doesn't " + "match device 0x%08X:0x%08X", + drvName, unit, VMXNET2_MAGIC, vLow, vHigh); + goto err_version_mismatch; + } + + /* + * Alloc soft state + */ + macInfo = gld_mac_alloc(dip); + if (!macInfo) { + cmn_err(CE_WARN, "%s%d: Vxn_Attach: gld_mac_alloc failed", + drvName, unit); + goto err_gld_mac_alloc; + } + + dp = (vxn_softc_t *) kmem_zalloc(sizeof(vxn_softc_t), KM_SLEEP); + ASSERT(dp); + + /* + * Get interrupt cookie + */ + if (ddi_get_iblock_cookie(dip, 0, &dp->iblockCookie) != DDI_SUCCESS) { + cmn_err(CE_WARN, "%s%d: Vxn_Attach: ddi_get_iblock_cookie failed", + drvName, unit); + goto err_get_iblock_cookie; + } + + strncpy(dp->drvName, drvName, SOLVMXNET_MAXNAME); + dp->unit = unit; + dp->dip = dip; + dp->macInfo = macInfo; + dp->confHdl = confHdl; + dp->vxnIOHdl = vxnIOHdl; + dp->vxnIOp = vxnIOp; + dp->morphed = morphed; + dp->nicActive = FALSE; + dp->txPending = 0; + dp->maxTxFrags = 1; + + /* + * Initialize mutexes + */ + mutex_init(&dp->intrlock, NULL, MUTEX_DRIVER, (void *)dp->iblockCookie); + mutex_init(&dp->xmitlock, NULL, MUTEX_DRIVER, (void *)dp->iblockCookie); + mutex_init(&dp->rxlistlock, NULL, MUTEX_DRIVER, (void *)dp->iblockCookie); + + /* + * Allocate and initialize our private and shared data structures + */ + if (Vxn_AllocDriverData(dp) != SOLVMXNET_SUCCESS) { + goto err_alloc_driverdata; + } + + /* + * Read the MAC address from the device + */ + for (i = 0; i < 6; i++) { + dp->devAddr.ether_addr_octet[i] = + (uint8_t)ddi_get8(vxnIOHdl, 
(uint8_t *)(vxnIOp + VMXNET_MAC_ADDR + i)); + } + macInfo->gldm_vendor_addr = dp->devAddr.ether_addr_octet; + macInfo->gldm_broadcast_addr = etherbroadcastaddr.ether_addr_octet; + + DPRINTF(3, (CE_CONT, + "MAC address: %02x:%02x:%02x:%02x:%02x:%02x\n", + dp->devAddr.ether_addr_octet[0], + dp->devAddr.ether_addr_octet[1], + dp->devAddr.ether_addr_octet[2], + dp->devAddr.ether_addr_octet[3], + dp->devAddr.ether_addr_octet[4], + dp->devAddr.ether_addr_octet[5])); + + /* + * Configure GLD entry points + */ + macInfo->gldm_devinfo = dip; + macInfo->gldm_private = (caddr_t)dp; + macInfo->gldm_cookie = dp->iblockCookie; + macInfo->gldm_reset = Vxn_Reset; + macInfo->gldm_start = Vxn_Start; + macInfo->gldm_stop = Vxn_Stop; + macInfo->gldm_set_mac_addr = Vxn_SetMacAddress; + macInfo->gldm_send = Vxn_Send; + macInfo->gldm_set_promiscuous = Vxn_SetPromiscuous; + macInfo->gldm_get_stats = Vxn_GetStats; + macInfo->gldm_ioctl = NULL; + macInfo->gldm_set_multicast= Vxn_SetMulticast; + macInfo->gldm_intr = Vxn_Interrupt; + macInfo->gldm_mctl = NULL; + + macInfo->gldm_ident = (char *)ddi_driver_name(dip); + macInfo->gldm_type = DL_ETHER; + macInfo->gldm_minpkt = 0; + macInfo->gldm_maxpkt = ETHERMTU; + macInfo->gldm_addrlen = ETHERADDRL; + macInfo->gldm_saplen = -2; + macInfo->gldm_ppa = unit; + + /* + * Register with GLD (Generic Lan Driver) framework + */ + if (gld_register(dip, + (char *)ddi_driver_name(dip), macInfo) != DDI_SUCCESS) { + goto err_gld_register; + } + + /* + * Add interrupt to system. + */ + if (ddi_add_intr(dip, 0, NULL, NULL, gld_intr, + (caddr_t)macInfo) != DDI_SUCCESS) { + cmn_err(CE_WARN, "%s%d: ddi_add_intr failed", drvName, unit); + goto err_ddi_add_intr; + } + + /* + * Add to list of interfaces. 
+ */ + mutex_enter(&vxnListLock); + dp->next = &vxnList; + dp->prev = vxnList.prev; + vxnList.prev->next = dp; + vxnList.prev = dp; + mutex_exit(&vxnListLock); + + /* + * Success + */ + return DDI_SUCCESS; + +err_ddi_add_intr: + gld_unregister(macInfo); + +err_gld_register: + Vxn_FreeDriverData(dp); + +err_alloc_driverdata: + /* All three locks were initialized before Vxn_AllocDriverData() */ + mutex_destroy(&dp->intrlock); + mutex_destroy(&dp->xmitlock); + mutex_destroy(&dp->rxlistlock); + +err_get_iblock_cookie: + kmem_free(dp, sizeof(*dp)); + gld_mac_free(macInfo); + +err_gld_mac_alloc: +err_version_mismatch: +err_morph_back: + if (morphed) { + ddi_put16(vxnIOHdl, (uint16_t *)(vxnIOp - MORPH_PORT_SIZE), LANCE_CHIP); + } + +err_free_regs_map: + ddi_regs_map_free(&vxnIOHdl); + +err: + pci_config_teardown(&confHdl); + return DDI_FAILURE; +} + +/* + *----------------------------------------------------------------------------- + * Vxn_Detach -- + * Called on module unload + * + * Results: + * DDI_SUCCESS + * DDI_FAILURE + * + * Side effects: + * None + *----------------------------------------------------------------------------- + */ +static int +Vxn_Detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + gld_mac_info_t *macInfo; + vxn_softc_t *dp; + + macInfo = (gld_mac_info_t *)ddi_get_driver_private(dip); + dp = (vxn_softc_t *)macInfo->gldm_private; + + if (cmd == DDI_DETACH) { + /* + * Tear down interrupt + */ + ddi_remove_intr(dip, 0, macInfo->gldm_cookie); + gld_unregister(macInfo); + + /* + * Quiesce hardware + */ + Vxn_Stop(macInfo); + + /* + * Free driver-data, tx/rx buffers etc + */ + Vxn_FreeDriverData(dp); + + /* + * Destroy locks (every mutex initialized in Vxn_Attach) + */ + mutex_destroy(&dp->intrlock); + mutex_destroy(&dp->xmitlock); + mutex_destroy(&dp->rxlistlock); + + /* + * Unmorph + */ + if (dp->morphed) { + uint16_t magic; + + /* Verify that we had morphed earlier */ + magic = ddi_get16(dp->vxnIOHdl, + (uint16_t *)(dp->vxnIOp - MORPH_PORT_SIZE)); + if (magic != VMXNET_CHIP) { + cmn_err(CE_WARN, "%s%d: Vxn_Detach: Adapter not morphed" + " magic=0x%08X", dp->drvName, dp->unit, magic); + } + else { + /* Unmorph */ + 
ddi_put16(dp->vxnIOHdl, + (uint16_t *)(dp->vxnIOp - MORPH_PORT_SIZE), LANCE_CHIP); + + /* Verify */ + magic = ddi_get16(dp->vxnIOHdl, + (uint16_t *)(dp->vxnIOp - MORPH_PORT_SIZE)); + if (magic != LANCE_CHIP) { + cmn_err(CE_WARN, "%s%d: Vxn_Detach: Unable to unmorph adapter" + " magic=0x%08X", dp->drvName, dp->unit, magic); + } + } + } + + /* + * Release resister mappings + */ + ddi_regs_map_free(&dp->vxnIOHdl); + pci_config_teardown(&dp->confHdl); + + /* + * Remove from list of interfaces. + */ + mutex_enter(&vxnListLock); + ASSERT(dp != &vxnList); + dp->prev->next = dp->next; + dp->next->prev = dp->prev; + mutex_exit(&vxnListLock); + + /* + * Release memory + */ + kmem_free(dp, sizeof(*dp)); + gld_mac_free(macInfo); + + return DDI_SUCCESS; + } + else { + return DDI_FAILURE; + } +} + +static struct module_info vxnminfo = { + 0, /* mi_idnum */ + "vmxnet", /* mi_idname */ + 0, /* mi_minpsz */ + ETHERMTU, /* mi_maxpsz */ + QHIWATER, /* mi_hiwat */ + 1, /* mi_lowat */ +}; + +static struct qinit vxnrinit = { + NULL, /* qi_putp */ + gld_rsrv, /* qi_srvp */ + gld_open, /* qi_qopen */ + gld_close, /* qi_qclose */ + NULL, /* qi_qadmin */ + &vxnminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct qinit vxnwinit = { + gld_wput, /* qi_putp */ + gld_wsrv, /* qi_srvp */ + NULL, /* qi_qopen */ + NULL, /* qi_qclose */ + NULL, /* qi_qadmin */ + &vxnminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct streamtab vxn_info = { + &vxnrinit, /* st_rdinit */ + &vxnwinit, /* st_wrinit */ + NULL, /* st_muxrinit */ + NULL /* st_muxwrinit */ +}; + +static struct cb_ops cb_vxn_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + &vxn_info, /* cb_stream */ + D_NEW|D_MP /* cb_flag */ +}; + 
+static struct dev_ops vxn_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + gld_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + Vxn_Attach, /* devo_attach */ + Vxn_Detach, /* devo_detach */ + nodev, /* devo_reset */ + &cb_vxn_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + ddi_power /* devo_power */ +}; + +static struct modldrv modldrv = { + &mod_driverops, + ident, + &vxn_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, {&modldrv, NULL,} +}; + + +/* + * Module load entry point + */ +int +_init(void) +{ + int err; + + DPRINTF(5, (CE_CONT, "vxn: _init:\n")); + /* Initialize interface list */ + vxnList.next = vxnList.prev = &vxnList; + mutex_init(&vxnListLock, NULL, MUTEX_DRIVER, NULL); + if ((err = mod_install(&modlinkage)) != 0) { + mutex_destroy(&vxnListLock); + } + return err; +} + +/* + * Module unload entry point + */ +int +_fini(void) +{ + int err; + + DPRINTF(5, (CE_CONT, "vxn: _fini:\n")); + if ((err = mod_remove(&modlinkage)) == 0) { + mutex_destroy(&vxnListLock); + } + return err; +} + +/* + * Module info entry point + */ +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + diff --git a/usr/src/uts/intel/io/vmxnet/vmxnet.conf b/usr/src/uts/intel/io/vmxnet/vmxnet.conf new file mode 100644 index 0000000000..eb3b160412 --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/vmxnet.conf @@ -0,0 +1,24 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright (c) 2012, Joyent, Inc. All rights reserved. +# Use is subject to license terms. +# diff --git a/usr/src/uts/intel/io/vmxnet/vmxnet2_def.h b/usr/src/uts/intel/io/vmxnet/vmxnet2_def.h new file mode 100644 index 0000000000..5ea437df72 --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/vmxnet2_def.h @@ -0,0 +1,436 @@ +/********************************************************* + * Copyright (C) 2004 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *********************************************************/ + +/********************************************************* + * The contents of this file are subject to the terms of the Common + * Development and Distribution License (the "License") version 1.0 + * and no later version. You may not use this file except in + * compliance with the License. 
+ * + * You can obtain a copy of the License at + * http://www.opensource.org/licenses/cddl1.php + * + * See the License for the specific language governing permissions + * and limitations under the License. + * + *********************************************************/ + +#ifndef _VMXNET2_DEF_H_ +#define _VMXNET2_DEF_H_ + +#define INCLUDE_ALLOW_USERLEVEL + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_VMK_MODULE +#define INCLUDE_ALLOW_VMKERNEL +#define INCLUDE_ALLOW_DISTRIBUTE +#include "includeCheck.h" + +#include "net_sg.h" +#include "vmxnet_def.h" + + +/* + * Magic number that identifies this version of the vmxnet protocol. + */ +#define VMXNET2_MAGIC 0xbabe864f + +/* size of the rx ring */ +#define VMXNET2_MAX_NUM_RX_BUFFERS 128 +#define VMXNET2_DEFAULT_NUM_RX_BUFFERS 100 + + +/* size of the rx ring when enhanced vmxnet is used */ +#define ENHANCED_VMXNET2_MAX_NUM_RX_BUFFERS 512 +#define ENHANCED_VMXNET2_DEFAULT_NUM_RX_BUFFERS 150 + +/* size of the 2nd rx ring */ +#define VMXNET2_MAX_NUM_RX_BUFFERS2 2048 +#define VMXNET2_DEFAULT_NUM_RX_BUFFERS2 512 + +/* size of the tx ring */ +#define VMXNET2_MAX_NUM_TX_BUFFERS 128 +#define VMXNET2_DEFAULT_NUM_TX_BUFFERS 100 + +/* size of the tx ring when tso/jf is used */ +#define VMXNET2_MAX_NUM_TX_BUFFERS_TSO 512 +#define VMXNET2_DEFAULT_NUM_TX_BUFFERS_TSO 256 + +enum { + VMXNET2_OWNERSHIP_DRIVER, + VMXNET2_OWNERSHIP_DRIVER_PENDING, + VMXNET2_OWNERSHIP_NIC, + VMXNET2_OWNERSHIP_NIC_PENDING, + VMXNET2_OWNERSHIP_NIC_FRAG, + VMXNET2_OWNERSHIP_DRIVER_FRAG, +}; + +#define VMXNET2_SG_DEFAULT_LENGTH 6 + +typedef struct Vmxnet2_SG_Array { + uint16 addrType; + uint16 length; + NetSG_Elem sg[VMXNET2_SG_DEFAULT_LENGTH]; +} Vmxnet2_SG_Array; + +typedef struct Vmxnet2_RxRingEntry { + uint64 paddr; /* Physical address of the packet data. */ + uint32 bufferLength; /* The length of the data at paddr. */ + uint32 actualLength; /* The actual length of the received data. */ + uint16 ownership; /* Who owns the packet. 
*/ + uint16 flags; /* Flags as defined below. */ + uint32 index; /* + * Currently: + * + * This is being used as an packet index to + * rx buffers. + * + * Originally: + * + * was void* driverData ("Driver specific data.") + * which was used for sk_buf**s in Linux and + * VmxnetRxBuff*s in Windows. It could not be + * here because the structure needs to be the + * same size between architectures, and it was + * not used on the device side, anyway. Look + * for its replacement in + * Vmxnet_Private.rxRingBuffPtr on Linux and + * VmxnetAdapter.rxRingBuffPtr on Windows. + */ +} Vmxnet2_RxRingEntry; + +/* + * Vmxnet2_RxRingEntry flags: + * + * VMXNET2_RX_HW_XSUM_OK The hardware verified the TCP/UDP checksum. + * VMXNET2_RX_WITH_FRAG More data is in the 2nd ring + * VMXNET2_RX_FRAG_EOP This is the last frag, the only valid flag for + * 2nd ring entry + * + */ +#define VMXNET2_RX_HW_XSUM_OK 0x01 +#define VMXNET2_RX_WITH_FRAG 0x02 +#define VMXNET2_RX_FRAG_EOP 0x04 + +typedef struct Vmxnet2_TxRingEntry { + uint16 flags; /* Flags as defined below. */ + uint16 ownership; /* Who owns this packet. */ + uint32 extra; /* + * was void* driverData ("Driver specific data.") + * which was used for sk_buf*s in Linux and + * VmxnetTxInfo*s in Windows. It could not be + * here because the structure needs to be the + * same size between architectures, and it was + * not used on the device side, anyway. Look + * for its replacement in + * Vmxnet_Private.txRingBuffPtr on Linux and + * VmxnetAdapter.txRingBuffPtr on Windows. + */ + uint32 tsoMss; /* TSO pkt MSS */ + Vmxnet2_SG_Array sg; /* Packet data. */ +} Vmxnet2_TxRingEntry; + +/* + * Vmxnet2_TxRingEntry flags: + * + * VMXNET2_TX_CAN_KEEP The implementation can return the tx ring entry + * to the driver when it is ready as opposed to + * before the transmit call from the driver completes. + * VMXNET2_TX_RING_LOW The driver's transmit ring buffer is low on free + * slots. 
+ * VMXNET2_TX_HW_XSUM The hardware should perform the TCP/UDP checksum + * VMXNET2_TX_TSO The hardware should do TCP segmentation. + * VMXNET2_TX_PINNED_BUFFER The driver used one of the preallocated vmkernel + * buffers *and* it has been pinned with Net_PinTxBuffers. + * VMXNET2_TX_MORE This is *not* the last tx entry for the pkt. + * All flags except VMXNET2_TX_MORE are ignored + * for the subsequent tx entries. + */ +#define VMXNET2_TX_CAN_KEEP 0x0001 +#define VMXNET2_TX_RING_LOW 0x0002 +#define VMXNET2_TX_HW_XSUM 0x0004 +#define VMXNET2_TX_TSO 0x0008 +#define VMXNET2_TX_PINNED_BUFFER 0x0010 +#define VMXNET2_TX_MORE 0x0020 + +/* + * Structure used by implementations. This structure allows the inline + * functions below to be used. + */ +typedef struct Vmxnet2_RxRingInfo { + Vmxnet2_RxRingEntry *base; /* starting addr of the ring */ + uint32 nicNext; /* next entry to use in the ring */ + uint32 ringLength; /* # of entries in the ring */ + PA startPA; /* PA of the starting addr of the ring */ +#ifdef VMX86_DEBUG + const char *name; +#endif +} Vmxnet2_RxRingInfo; + +typedef struct Vmxnet2_TxRingInfo { + Vmxnet2_TxRingEntry *base; /* starting addr of the ring */ + uint32 nicNext; /* next entry to use in the ring */ + uint32 ringLength; /* # of entries in the ring */ + PA startPA; /* PA of the starting addr of the ring */ +#ifdef VMX86_DEBUG + const char *name; +#endif +} Vmxnet2_TxRingInfo; + +typedef struct Vmxnet2_ImplData { + Vmxnet2_RxRingInfo rxRing; + Vmxnet2_RxRingInfo rxRing2; + Vmxnet2_TxRingInfo txRing; + + struct PhysMem_Token *ddPhysMemToken; +} Vmxnet2_ImplData; + +/* + * Used internally for performance studies. By default this will be off so there + * should be no compatibilty or other interferences. 
+ */ + +/* #define ENABLE_VMXNET2_PROFILING */ + + +#ifdef ENABLE_VMXNET2_PROFILING +typedef struct Vmxnet2_VmmStats { + uint64 vIntTSC; /* the time that virtual int was posted */ + uint64 actionsCount; /* Number of actions received */ + uint64 numWasteActions; /* Number of non-productive actions */ +} Vmxnet2_VmmStats; +#endif + +typedef struct Vmxnet2_DriverStats { + uint32 transmits; /* # of times that the drivers transmit function */ + /* is called. The driver could transmit more */ + /* than one packet per call. */ + uint32 pktsTransmitted; /* # of packets transmitted. */ + uint32 noCopyTransmits; /* # of packets that are transmitted without */ + /* copying any data. */ + uint32 copyTransmits; /* # of packets that are transmittted by copying */ + /* the data into a buffer. */ + uint32 maxTxsPending; /* Max # of transmits outstanding. */ + uint32 txStopped; /* # of times that transmits got stopped because */ + /* the tx ring was full. */ + uint32 txRingOverflow; /* # of times that transmits got deferred bc */ + /* the tx ring was full. This must be >= */ + /* txStopped since there will be one */ + /* txStopped when the ring fills up and then */ + /* one txsRingOverflow for each packet that */ + /* that gets deferred until there is space. */ + uint32 interrupts; /* # of times interrupted. */ + uint32 pktsReceived; /* # of packets received. */ + uint32 rxBuffersLow; /* # of times that the driver was low on */ + /* receive buffers. */ +#ifdef ENABLE_VMXNET2_PROFILING + Vmxnet2_VmmStats vmmStats; /* vmm related stats for perf study */ +#endif +} Vmxnet2_DriverStats; + +/* + * Shared data structure between the vm, the vmm, and the vmkernel. + * This structure was originally arranged to try to group common data + * on 32-byte cache lines, but bit rot and the fact that we no longer + * run on many CPUs with that cacheline size killed that optimization. + * vmxnet3 should target 128 byte sizes and alignments to optimize for + * the 64 byte cacheline pairs on P4. 
+ */ +typedef struct Vmxnet2_DriverData { + /* + * Magic must be first. + */ + Vmxnet_DDMagic magic; + + /* + * Receive fields. + */ + uint32 rxRingLength; /* Length of the receive ring. */ + uint32 rxDriverNext; /* Index of the next packet that will */ + /* be filled in by the impl */ + + uint32 rxRingLength2; /* Length of the 2nd receive ring. */ + uint32 rxDriverNext2; /* Index of the next packet that will */ + /* be filled in by the impl */ + + uint32 notUsed1; /* was "irq" */ + + /* + * Interface flags and multicast filter. + */ + uint32 ifflags; + uint32 LADRF[VMXNET_MAX_LADRF]; + + /* + * Transmit fields + */ + uint32 txDontClusterSize; /* All packets <= this will be transmitted */ + /* immediately, regardless of clustering */ + /* settings [was fill[1]] */ + uint32 txRingLength; /* Length of the transmit ring. */ + uint32 txDriverCur; /* Index of the next packet to be */ + /* returned by the implementation.*/ + uint32 txDriverNext; /* Index of the entry in the ring */ + /* buffer to use for the next packet.*/ + uint32 txStopped; /* The driver has stopped transmitting */ + /* because its ring buffer is full.*/ + uint32 txClusterLength; /* Maximum number of packets to */ + /* put in the ring buffer before */ + /* asking the implementation to */ + /* transmit the packets in the buffer.*/ + uint32 txNumDeferred; /* Number of packets that have been */ + /* queued in the ring buffer since */ + /* the last time the implementation */ + /* was asked to transmit. */ + uint32 notUsed3; /* This field is deprecated but still used */ + /* as minXmitPhysLength on the escher branch. */ + /* It cannot be used for other purposes */ + /* until escher vms no longer are allowed */ + /* to install this driver. */ + + uint32 totalRxBuffers; /* used by esx for max rx buffers */ + uint64 rxBufferPhysStart; /* used by esx for pinning rx buffers */ + /* + * Extra fields for future expansion.
+ */ + uint32 extra[2]; + + uint16 maxFrags; /* # of frags the driver can handle */ + uint16 featureCtl; /* for driver to enable some feature */ + + /* + * The following fields are used to save the nicNext indexes part + * of implData in the vmkernel when disconnecting the adapter, we + * need them when we reconnect. This mechanism is used for + * checkpointing as well. + */ + uint32 savedRxNICNext; + uint32 savedRxNICNext2; + uint32 savedTxNICNext; + + /* + * Fields used during initialization or debugging. + */ + uint32 length; + uint32 rxRingOffset; + uint32 rxRingOffset2; + uint32 txRingOffset; + uint32 debugLevel; + uint32 txBufferPhysStart; + uint32 txBufferPhysLength; + uint32 txPktMaxSize; + + /* + * Driver statistics. + */ + Vmxnet2_DriverStats stats; +} Vmxnet2_DriverData; + +/* + * Shared between VMM and Vmkernel part of vmxnet2 to optimize action posting + * VMM writes 1 (don't post) or 0 (okay to post) and vmk reads this. + */ +typedef struct VmxnetVMKShared { + uint32 dontPostActions; +} VmxnetVMKShared; + +#if defined VMX86_VMX || defined VMKERNEL + +/* + * Inline functions used to assist the implementation of the vmxnet interface. + */ + +/* + * Get the next empty packet out of the receive ring and move to + * the next packet. + */ +static INLINE Vmxnet2_RxRingEntry * +Vmxnet2_GetNextRx(Vmxnet2_RxRingInfo *ri, uint16 ownership) +{ + Vmxnet2_RxRingEntry *rre = ri->base + ri->nicNext; + if (rre->ownership == ownership) { + VMXNET_INC(ri->nicNext, ri->ringLength); + } else { + rre = NULL; + } + + return rre; +} + +/* + * Return ownership of a packet in the receive ring to the driver. + */ +static INLINE void +Vmxnet2_PutRx(Vmxnet2_RxRingEntry *rre, uint32 pktLength, uint16 ownership) +{ + rre->actualLength = pktLength; + COMPILER_MEM_BARRIER(); + rre->ownership = ownership; +} + +/* + * Get the next pending packet out of the transmit ring. 
+ */ +static INLINE Vmxnet2_TxRingEntry * +Vmxnet2_GetNextTx(Vmxnet2_TxRingInfo *ri) +{ + Vmxnet2_TxRingEntry *txre = ri->base + ri->nicNext; + if (txre->ownership == VMXNET2_OWNERSHIP_NIC) { + return txre; + } else { + return NULL; + } +} + +/* + * Move to the next entry in the transmit ring. + */ +static INLINE unsigned int +Vmxnet2_IncNextTx(Vmxnet2_TxRingInfo *ri) +{ + unsigned int prev = ri->nicNext; + Vmxnet2_TxRingEntry *txre = ri->base + ri->nicNext; + + txre->ownership = VMXNET2_OWNERSHIP_NIC_PENDING; + + VMXNET_INC(ri->nicNext, ri->ringLength); + return prev; +} + +/* + * Get the indicated entry from transmit ring. + */ +static INLINE Vmxnet2_TxRingEntry * +Vmxnet2_GetTxEntry(Vmxnet2_TxRingInfo *ri, unsigned int idx) +{ + return ri->base + idx; +} + +/* + * Get the indicated entry from the given rx ring + */ +static INLINE Vmxnet2_RxRingEntry * +Vmxnet2_GetRxEntry(Vmxnet2_RxRingInfo *ri, unsigned int idx) +{ + return ri->base + idx; +} + +#endif /* defined VMX86_VMX || defined VMKERNEL */ + +#endif + diff --git a/usr/src/uts/intel/io/vmxnet/vmxnet_def.h b/usr/src/uts/intel/io/vmxnet/vmxnet_def.h new file mode 100644 index 0000000000..703466c995 --- /dev/null +++ b/usr/src/uts/intel/io/vmxnet/vmxnet_def.h @@ -0,0 +1,184 @@ +/********************************************************* + * Copyright (C) 1999 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + *********************************************************/ + +/********************************************************* + * The contents of this file are subject to the terms of the Common + * Development and Distribution License (the "License") version 1.0 + * and no later version. You may not use this file except in + * compliance with the License. + * + * You can obtain a copy of the License at + * http://www.opensource.org/licenses/cddl1.php + * + * See the License for the specific language governing permissions + * and limitations under the License. + * + *********************************************************/ + +#ifndef _VMXNET_DEF_H_ +#define _VMXNET_DEF_H_ + +#define INCLUDE_ALLOW_USERLEVEL + +#define INCLUDE_ALLOW_MODULE +#define INCLUDE_ALLOW_VMK_MODULE +#define INCLUDE_ALLOW_VMKERNEL +#define INCLUDE_ALLOW_DISTRIBUTE +#include "includeCheck.h" + +#include "net_sg.h" +#include "vmnet_def.h" + + +/* + * Vmxnet I/O ports, used by both the vmxnet driver and + * the device emulation code. + */ + +#define VMXNET_INIT_ADDR 0x00 +#define VMXNET_INIT_LENGTH 0x04 +#define VMXNET_TX_ADDR 0x08 +#define VMXNET_COMMAND_ADDR 0x0c +#define VMXNET_MAC_ADDR 0x10 +#define VMXNET_LOW_VERSION 0x18 +#define VMXNET_HIGH_VERSION 0x1c +#define VMXNET_STATUS_ADDR 0x20 +#define VMXNET_TOE_INIT_ADDR 0x24 +#define VMXNET_APROM_ADDR 0x28 +#define VMXNET_INT_ENABLE_ADDR 0x30 +#define VMXNET_WAKE_PKT_PATTERNS 0x34 + +/* + * Vmxnet command register values. 
+ */ +#define VMXNET_CMD_INTR_ACK 0x0001 +#define VMXNET_CMD_UPDATE_LADRF 0x0002 +#define VMXNET_CMD_UPDATE_IFF 0x0004 +#define VMXNET_CMD_UNUSED_1 0x0008 +#define VMXNET_CMD_UNUSED_2 0x0010 +#define VMXNET_CMD_INTR_DISABLE 0x0020 +#define VMXNET_CMD_INTR_ENABLE 0x0040 +#define VMXNET_CMD_UNUSED_3 0x0080 +#define VMXNET_CMD_CHECK_TX_DONE 0x0100 +#define VMXNET_CMD_GET_NUM_RX_BUFFERS 0x0200 +#define VMXNET_CMD_GET_NUM_TX_BUFFERS 0x0400 +#define VMXNET_CMD_PIN_TX_BUFFERS 0x0800 +#define VMXNET_CMD_GET_CAPABILITIES 0x1000 +#define VMXNET_CMD_GET_FEATURES 0x2000 +#define VMXNET_CMD_SET_POWER_FULL 0x4000 +#define VMXNET_CMD_SET_POWER_LOW 0x8000 + +/* + * Vmxnet status register values. + */ +#define VMXNET_STATUS_CONNECTED 0x0001 +#define VMXNET_STATUS_ENABLED 0x0002 +#define VMXNET_STATUS_TX_PINNED 0x0004 + +/* + * Values for the interface flags. + */ +#define VMXNET_IFF_PROMISC 0x01 +#define VMXNET_IFF_BROADCAST 0x02 +#define VMXNET_IFF_MULTICAST 0x04 +#define VMXNET_IFF_DIRECTED 0x08 + +/* + * Length of the multicast address filter. + */ +#define VMXNET_MAX_LADRF 2 + +/* + * Size of Vmxnet APROM. + */ +#define VMXNET_APROM_SIZE 6 + +/* + * An invalid ring index. + */ +#define VMXNET_INVALID_RING_INDEX (-1) + +/* + * Features that are implemented by the driver. These are driver + * specific so not all features will be listed here. In addition not all + * drivers have to pay attention to these feature flags. + * + * VMXNET_FEATURE_ZERO_COPY_TX The driver won't do any copies as long as + * the packet length is > + * Vmxnet_DriverData.minTxPhysLength. + * + * VMXNET_FEATURE_TSO The driver will use the TSO capabilities + * of the underlying hardware if available + * and enabled.
+ * + * VMXNET_FEATURE_JUMBO_FRAME The driver can send/rcv jumbo frame + * + * VMXNET_FEATURE_LPD The backend can deliver large pkts + */ +#define VMXNET_FEATURE_ZERO_COPY_TX 0x01 +#define VMXNET_FEATURE_TSO 0x02 +#define VMXNET_FEATURE_JUMBO_FRAME 0x04 +#define VMXNET_FEATURE_LPD 0x08 + +/* + * Define the set of capabilities required by each feature above + */ +#define VMXNET_FEATURE_ZERO_COPY_TX_CAPS VMXNET_CAP_SG +#define VMXNET_FEATURE_TSO_CAPS VMXNET_CAP_TSO +#define VMXNET_HIGHEST_FEATURE_BIT VMXNET_FEATURE_TSO + +#define VMXNET_INC(val, max) \ + val++; \ + if (UNLIKELY(val == max)) { \ + val = 0; \ + } + +/* + * code that just wants to switch on the different versions of the + * guest<->implementation protocol can cast driver data to this. + */ +typedef uint32 Vmxnet_DDMagic; + +/* + * Wake packet pattern commands sent through VMXNET_WAKE_PKT_PATTERNS port + */ + +#define VMXNET_PM_OPCODE_START 3 /* args: cnt of wake packet patterns */ +#define VMXNET_PM_OPCODE_LEN 2 /* args: index of wake packet pattern */ + /* number of pattern byte values */ +#define VMXNET_PM_OPCODE_DATA 1 /* args: index of wake packet pattern */ + /* offset in pattern byte values list */ + /* packet byte offset */ + /* packet byte value */ +#define VMXNET_PM_OPCODE_END 0 /* args: <none> */ + +typedef union Vmxnet_WakePktCmd { + uint32 pktData : 32; + struct { + unsigned cmd : 2; /* wake packet pattern cmd [from list above] */ + unsigned cnt : 3; /* cnt wk pkt pttrns 1..MAX_NUM_FILTER_PTTRNS */ + unsigned ind : 3; /* ind wk pkt pttrn 0..MAX_NUM_FILTER_PTTRNS-1 */ + unsigned lenOff : 8; /* num pttrn byte vals 1..MAX_PKT_FILTER_SIZE */ + /* OR offset in pattern byte values list */ + /* 0..MAX_PKT_FILTER_SIZE-1 */ + unsigned byteOff : 8; /* pkt byte offset 0..MAX_PKT_FILTER_SIZE-1 */ + unsigned byteVal : 8; /* packet byte value 0..255 */ + } pktPttrn; +} Vmxnet_WakePktCmd; + +#endif /* _VMXNET_DEF_H_ */ diff --git a/usr/src/uts/intel/ipf/Makefile b/usr/src/uts/intel/ipf/Makefile index 
4dde1e0034..d5dd00155b 100644 --- a/usr/src/uts/intel/ipf/Makefile +++ b/usr/src/uts/intel/ipf/Makefile @@ -21,6 +21,7 @@ # # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2018 Joyent, Inc. # # Copyright (c) 2018, Joyent, Inc. @@ -50,7 +51,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) CPPFLAGS += -DIPFILTER_LKM -DIPFILTER_LOG -DIPFILTER_LOOKUP -DUSE_INET6 CPPFLAGS += -DSUNDDI -DSOLARIS2=$(RELEASE_MINOR) -DIRE_ILL_CN -LDFLAGS += -Ndrv/ip -Nmisc/md5 -Nmisc/neti -Nmisc/hook -Nmisc/kcf +LDFLAGS += -Ndrv/ip -Nmisc/md5 -Nmisc/neti -Nmisc/hook -Nmisc/kcf -Ndrv/vnd LDFLAGS += -Nmisc/mac INC_PATH += -I$(UTSBASE)/common/inet/ipf diff --git a/usr/src/uts/intel/ipf/ipf.global-objs.debug64 b/usr/src/uts/intel/ipf/ipf.global-objs.debug64 index 846011b4c5..ea5510a78d 100644 --- a/usr/src/uts/intel/ipf/ipf.global-objs.debug64 +++ b/usr/src/uts/intel/ipf/ipf.global-objs.debug64 @@ -22,9 +22,21 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright 2018 Joyent, Inc. All rights reserved +# Copyright 2019 Joyent, Inc. 
# +cfw_evdrops +cfw_evreports +cfw_ring +cfw_ringcv +cfw_ringend +cfw_ringfull +cfw_ringlock +cfw_ringmask +cfw_ringsize +cfw_ringstart +cfw_timeout_tries +cfw_timeout_wait fr_availfuncs fr_features fr_objbytes @@ -43,6 +55,10 @@ hook4_nicevents hook4_nicevents_gz hook4_out hook4_out_gz +hook4_vnd_in +hook4_vnd_in_gz +hook4_vnd_out +hook4_vnd_out_gz hook6_in hook6_in_gz hook6_loop_in @@ -53,6 +69,10 @@ hook6_nicevents hook6_nicevents_gz hook6_out hook6_out_gz +hook6_vnd_in +hook6_vnd_in_gz +hook6_vnd_out +hook6_vnd_out_gz icmpreplytype4 icmpreplytype6 icmptoicmp6types @@ -60,6 +80,7 @@ icmptoicmp6unreach idletime_tab ip6exthdr ipf_cb_ops +ipf_cfwlog_enabled ipf_dev_info ipf_devfiles ipf_eth_bcast_addr diff --git a/usr/src/uts/intel/iptun/Makefile b/usr/src/uts/intel/iptun/Makefile index 5ef3d91df1..3fde3a343e 100644 --- a/usr/src/uts/intel/iptun/Makefile +++ b/usr/src/uts/intel/iptun/Makefile @@ -52,7 +52,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # CFLAGS += $(CCVERBOSE) LDFLAGS += -Ndrv/dld -Nmisc/dls -Nmisc/mac -Ndrv/ip -INC_PATH += -I$(UTSBASE)/common/io/bpf CERRWARN += -_gcc=-Wno-unused-label diff --git a/usr/src/uts/intel/lx_brand/Makefile b/usr/src/uts/intel/lx_brand/Makefile new file mode 100644 index 0000000000..a1c4027afa --- /dev/null +++ b/usr/src/uts/intel/lx_brand/Makefile @@ -0,0 +1,92 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# Copyright 2019 Joyent, Inc. + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Path to where brand common sources live +# +LX_CMN = $(SRC)/common/brand/lx + +# +# Define the module and object file sets. +# +MODULE = lx_brand +OBJECTS = $(LX_BRAND_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_BRAND_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +INC_PATH += -I$(UTSBASE)/common/brand/lx -I$(LX_CMN) -I$(SRC)/common +INC_PATH += -I$(UTSBASE)/common/inet/sockmods -I$(UTSBASE)/common/io/bpf +INC_PATH += -I$(UTSBASE)/common/fs/sockfs +INC_PATH += -I$(UTSBASE)/common/fs/zfs +AS_INC_PATH += -I$(UTSBASE)/i86pc/genassym/$(OBJS_DIR) + +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -Nexec/elfexec -Nfs/fifofs -Nfs/sockfs -Ndrv/ip \ + -Nfs/zfs -Nmisc/klmmod -Nsys/sysacct + +# needs work +SMATCH=off + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/intel/Makefile.targ + +# +# Include brand-specific rules +# + +include $(UTSBASE)/intel/lx_brand/Makefile.rules diff --git a/usr/src/uts/intel/lx_brand/Makefile.rules b/usr/src/uts/intel/lx_brand/Makefile.rules new file mode 100644 index 0000000000..f1244569b0 --- /dev/null +++ b/usr/src/uts/intel/lx_brand/Makefile.rules @@ -0,0 +1,82 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# Copyright 2016 Joyent, Inc. 
+# +# + +# +# Section 1a: C object build rules +# +$(OBJS_DIR_OBJ64)/%.o: $(UTSBASE)/common/brand/lx/os/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/common/brand/lx/os/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR_OBJ64)/%.o: $(UTSBASE)/common/brand/lx/syscall/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/common/brand/lx/syscall/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR_OBJ64)/%.o: $(UTSBASE)/intel/brand/lx/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/intel/brand/lx/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR_OBJ64)/%.o: $(UTSBASE)/intel/brand/lx/%.s + $(COMPILE.s) -I$(UTSBASE)/i86pc -o $@ $< + +$(OBJS_DIR_OBJ64)/%.o: $(LX_CMN)/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR_DBG64)/%.o: $(UTSBASE)/intel/brand/lx/%.s + $(COMPILE.s) -I$(UTSBASE)/i86pc -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/os/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/syscall/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/brand/lx/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(LX_CMN)/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/intel/brand/lx/%.s + $(COMPILE.s) -I$(UTSBASE)/i86pc -o $@ $< diff --git a/usr/src/uts/intel/lx_cgroup/Makefile b/usr/src/uts/intel/lx_cgroup/Makefile new file mode 100644 index 0000000000..e21a83cace --- /dev/null +++ b/usr/src/uts/intel/lx_cgroup/Makefile @@ -0,0 +1,52 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +UTSBASE = ../.. + +LX_CMN = $(SRC)/common/brand/lx + +MODULE = lx_cgroup +OBJECTS = $(LX_CGROUP_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_FS_DIR)/$(MODULE) + +INC_PATH += -I$(UTSBASE)/common/brand/lx -I$(LX_CMN) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -Nbrand/lx_brand + +# needs work +$(OBJS_DIR)/cgrps_vnops.o := SMOFF += signed + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ + +include $(UTSBASE)/intel/lx_cgroup/Makefile.rules diff --git a/usr/src/uts/intel/lx_cgroup/Makefile.rules b/usr/src/uts/intel/lx_cgroup/Makefile.rules new file mode 100644 index 0000000000..f08cb0d6f2 --- /dev/null +++ b/usr/src/uts/intel/lx_cgroup/Makefile.rules @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. All rights reserved. +# + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/cgroups/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/lx_devfs/Makefile b/usr/src/uts/intel/lx_devfs/Makefile new file mode 100644 index 0000000000..1f5f13b747 --- /dev/null +++ b/usr/src/uts/intel/lx_devfs/Makefile @@ -0,0 +1,52 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +UTSBASE = ../.. + +LX_CMN = $(SRC)/common/brand/lx + +MODULE = lx_devfs +OBJECTS = $(LX_DEVFS_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_FS_DIR)/$(MODULE) + +INC_PATH += -I$(UTSBASE)/common/brand/lx -I$(LX_CMN) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -Nbrand/lx_brand + +# needs work +$(OBJS_DIR)/lxd_vnops.o := SMOFF += signed + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ + +include $(UTSBASE)/intel/lx_devfs/Makefile.rules diff --git a/usr/src/uts/intel/lx_devfs/Makefile.rules b/usr/src/uts/intel/lx_devfs/Makefile.rules new file mode 100644 index 0000000000..b2bcb2fc89 --- /dev/null +++ b/usr/src/uts/intel/lx_devfs/Makefile.rules @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. All rights reserved. 
+# + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/devfs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/lx_netlink/Makefile b/usr/src/uts/intel/lx_netlink/Makefile new file mode 100644 index 0000000000..ed94db631d --- /dev/null +++ b/usr/src/uts/intel/lx_netlink/Makefile @@ -0,0 +1,67 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = lx_netlink +OBJECTS = $(LX_NETLINK_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +CPPFLAGS += -I$(UTSBASE)/common/brand/lx +LDFLAGS += -Ndrv/ip -Nfs/sockfs -Nbrand/lx_brand + +# needs work +SMOFF += all_func_returns + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/intel/Makefile.targ + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/io/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/lx_proc/Makefile b/usr/src/uts/intel/lx_proc/Makefile new file mode 100644 index 0000000000..4997a34d7b --- /dev/null +++ b/usr/src/uts/intel/lx_proc/Makefile @@ -0,0 +1,101 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/intel/lx_proc/Makefile +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Copyright 2019 Joyent, Inc. +# +# This makefile drives the production of the lxproc file system +# kernel module. +# +# i86 architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Path to where brand common sources live +# +LX_CMN = $(SRC)/common/brand/lx + +# +# Define the module and object file sets. +# +MODULE = lx_proc +OBJECTS = $(LX_PROC_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_FS_DIR)/$(MODULE) + +INC_PATH += -I$(UTSBASE)/common/brand/lx -I$(LX_CMN) +INC_PATH += -I$(UTSBASE)/common/fs/zfs + +# +# Include common rules. 
+# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) + +# +# Depends on procfs and lx_brand +# +LDFLAGS += -Nfs/procfs -Nbrand/lx_brand -Ndrv/inotify -Ndrv/ip +LDFLAGS += -Nfs/sockfs -Ncrypto/swrand -Nmisc/cc + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ + +# +# Include brand-specific rules +# + +include $(UTSBASE)/intel/lx_proc/Makefile.rules diff --git a/usr/src/uts/intel/lx_proc/Makefile.rules b/usr/src/uts/intel/lx_proc/Makefile.rules new file mode 100644 index 0000000000..9d3c3b668b --- /dev/null +++ b/usr/src/uts/intel/lx_proc/Makefile.rules @@ -0,0 +1,32 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +#ident "%Z%%M% %I% %E% SMI" + +# +# Section 1a: C object build rules +# +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/procfs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/lx_ptm/Makefile b/usr/src/uts/intel/lx_ptm/Makefile new file mode 100644 index 0000000000..a0e63664f2 --- /dev/null +++ b/usr/src/uts/intel/lx_ptm/Makefile @@ -0,0 +1,80 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/intel/lx_ptm/Makefile +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# This makefile drives the production of the lx_ptm driver +# +# intel architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = lx_ptm +OBJECTS = $(LX_PTM_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/brand/lx/io + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +CPPFLAGS += -I$(UTSBASE)/common/brand/lx + +# +# Default build targets. 
+# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/io/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/lx_sysfs/Makefile b/usr/src/uts/intel/lx_sysfs/Makefile new file mode 100644 index 0000000000..93599d6723 --- /dev/null +++ b/usr/src/uts/intel/lx_sysfs/Makefile @@ -0,0 +1,49 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2016 Joyent, Inc. +# + +UTSBASE = ../.. + +LX_CMN = $(SRC)/common/brand/lx + +MODULE = lx_sysfs +OBJECTS = $(LX_SYS_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_FS_DIR)/$(MODULE) + +INC_PATH += -I$(UTSBASE)/common/brand/lx -I$(LX_CMN) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -Nbrand/lx_brand -Ndrv/ip + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ + +include $(UTSBASE)/intel/lx_sysfs/Makefile.rules diff --git a/usr/src/uts/intel/lx_sysfs/Makefile.rules b/usr/src/uts/intel/lx_sysfs/Makefile.rules new file mode 100644 index 0000000000..fab15d52b1 --- /dev/null +++ b/usr/src/uts/intel/lx_sysfs/Makefile.rules @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. 
+# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. All rights reserved. +# + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/sysfs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/lx_systrace/Makefile b/usr/src/uts/intel/lx_systrace/Makefile new file mode 100644 index 0000000000..a2f9e6be35 --- /dev/null +++ b/usr/src/uts/intel/lx_systrace/Makefile @@ -0,0 +1,62 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +UTSBASE = ../.. 
+ +MODULE = lx_systrace +OBJECTS = $(LX_SYSTRACE_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +ROOTLINK = $(USR_DTRACE_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/brand/lx/dtrace + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) $(SRC_CONFILE) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) + +CPPFLAGS += -I$(UTSBASE)/common/brand/lx + +LDFLAGS += -Ndrv/dtrace -Nbrand/lx_brand + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +$(ROOTLINK): $(USR_DTRACE_DIR) $(ROOTMODULE) + -$(RM) $@; ln $(ROOTMODULE) $@ + +include $(UTSBASE)/intel/Makefile.targ + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/dtrace/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/lxautofs/Makefile b/usr/src/uts/intel/lxautofs/Makefile new file mode 100644 index 0000000000..4b87e11966 --- /dev/null +++ b/usr/src/uts/intel/lxautofs/Makefile @@ -0,0 +1,102 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# Copyright 2019 Joyent, Inc. 
+# + +# +# This makefile drives the production of the lxautofs file system +# kernel module. +# +# i86 architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +# Note that the name of the actual filesystem is lxautofs and +# not lx_autofs. This is because filesystem names are stupidly +# limited to 8 characters. +# +MODULE = lxautofs +OBJECTS = $(LX_AUTOFS_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +ROOTLINK = $(USR_FS_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/brand/lx/autofs + +INC_PATH += -I$(UTSBASE)/common/brand/lx + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -Nfs/nfs + +# needs work +SMOFF += all_func_returns + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +$(ROOTLINK): $(USR_FS_DIR) $(ROOTMODULE) + -$(RM) $@; ln $(ROOTMODULE) $@ + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ + +# +# Include brand-specific rules +# + +include $(UTSBASE)/intel/lxautofs/Makefile.rules diff --git a/usr/src/uts/intel/lxautofs/Makefile.rules b/usr/src/uts/intel/lxautofs/Makefile.rules new file mode 100644 index 0000000000..ab09a48bc9 --- /dev/null +++ b/usr/src/uts/intel/lxautofs/Makefile.rules @@ -0,0 +1,32 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Section 1a: C object build rules +# +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/lx/autofs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) diff --git a/usr/src/uts/intel/lxprocfs/Makefile b/usr/src/uts/intel/lxprocfs/Makefile new file mode 100644 index 0000000000..f57d46add5 --- /dev/null +++ b/usr/src/uts/intel/lxprocfs/Makefile @@ -0,0 +1,80 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# uts/intel/lxprocfs/Makefile +# +# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Copyright 2019 Joyent, Inc. 
+ +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = lxprocfs +OBJECTS = $(LXPROC_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_FS_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Depends on procfs +# +LDFLAGS += -Nfs/procfs + +# false positive +# needs work +$(OBJS_DIR)/lxpr_vnops.o := SMOFF += strcpy_overflow + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/mac/Makefile b/usr/src/uts/intel/mac/Makefile index 96fbf52585..bfad21b20a 100644 --- a/usr/src/uts/intel/mac/Makefile +++ b/usr/src/uts/intel/mac/Makefile @@ -51,8 +51,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # Overrides. # CFLAGS += $(CCVERBOSE) -INC_PATH += -I$(UTSBASE)/common/io/bpf - CERRWARN += -_gcc=-Wno-unused-label CERRWARN += $(CNOWARN_UNINIT) @@ -62,6 +60,7 @@ CERRWARN += -_gcc=-Wno-unused-variable # needs work SMOFF += all_func_returns +$(OBJS_DIR)/mac_sched.o := SMOFF += assign_vs_compare $(OBJS_DIR)/mac_util.o := SMOFF += signed # diff --git a/usr/src/uts/intel/mac_ether/Makefile b/usr/src/uts/intel/mac_ether/Makefile index c56f6026bc..fadd3402c3 100644 --- a/usr/src/uts/intel/mac_ether/Makefile +++ b/usr/src/uts/intel/mac_ether/Makefile @@ -54,7 +54,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # CFLAGS += $(CCVERBOSE) LDFLAGS += -N misc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # Default build targets. 
diff --git a/usr/src/uts/intel/mac_ib/Makefile b/usr/src/uts/intel/mac_ib/Makefile index 0527fd1dce..e527f88904 100644 --- a/usr/src/uts/intel/mac_ib/Makefile +++ b/usr/src/uts/intel/mac_ib/Makefile @@ -54,7 +54,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # CFLAGS += $(CCVERBOSE) LDFLAGS += -N misc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # Default build targets. diff --git a/usr/src/uts/intel/mac_wifi/Makefile b/usr/src/uts/intel/mac_wifi/Makefile index 73efe6ffd7..ebb33564a3 100644 --- a/usr/src/uts/intel/mac_wifi/Makefile +++ b/usr/src/uts/intel/mac_wifi/Makefile @@ -56,7 +56,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # CFLAGS += $(CCVERBOSE) LDFLAGS += -Nmisc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # Default build targets. diff --git a/usr/src/uts/intel/ml/modstubs.s b/usr/src/uts/intel/ml/modstubs.s index bac97ef672..0994573bd7 100644 --- a/usr/src/uts/intel/ml/modstubs.s +++ b/usr/src/uts/intel/ml/modstubs.s @@ -46,7 +46,7 @@ * NOTE: Use NO_UNLOAD_STUBs if the module is NOT unloadable once it is * loaded. */ -#define MAXNARG 10 +#define MAXNARG 12 /* * WARNING: there is no check for forgetting to write END_MODULE, @@ -181,7 +181,7 @@ fcnname/**/_info: \ pushq %rcx pushq %r8 pushq %r9 - /* (next 4 args, if any, are already on the stack above %rbp) */ + /* (next 6 args, if any, are already on the stack above %rbp) */ movq %r15, %rdi call mod_hold_stub /* mod_hold_stub(mod_stub_info *) */ cmpl $-1, %eax /* error? */ @@ -192,7 +192,7 @@ fcnname/**/_info: \ jmp .L2 .L1: /* - * copy MAXNARG == 10 incoming arguments + * copy MAXNARG == 12 incoming arguments */ popq %r9 popq %r8 @@ -216,9 +216,11 @@ fcnname/**/_info: \ pushq (%rsp, %r11, 8) pushq (%rsp, %r11, 8) pushq (%rsp, %r11, 8) + pushq (%rsp, %r11, 8) + pushq (%rsp, %r11, 8) movq (%r15), %rax INDIRECT_CALL_REG(rax) /* call the stub fn(arg, ..) 
*/ - addq $0x20, %rsp /* pop off last 4 args */ + addq $0x30, %rsp /* pop off last 6 args */ pushq %rax /* save any return values */ pushq %rdx movq %r15, %rdi diff --git a/usr/src/uts/intel/ml/swtch.s b/usr/src/uts/intel/ml/swtch.s index c6c606b11e..55aaf4e122 100644 --- a/usr/src/uts/intel/ml/swtch.s +++ b/usr/src/uts/intel/ml/swtch.s @@ -507,3 +507,41 @@ resume_from_intr_return: call thread_exit /* destroy thread if it returns. */ /*NOTREACHED*/ SET_SIZE(thread_start) + + ENTRY(thread_splitstack_run) + pushq %rbp /* push base pointer */ + movq %rsp, %rbp /* construct frame */ + movq %rdi, %rsp /* set stack pointer */ + movq %rdx, %rdi /* load arg */ + INDIRECT_CALL_REG(rsi) /* call specified function */ + leave /* pop base pointer */ + ret + SET_SIZE(thread_splitstack_run) + + /* + * Once we're back on our own stack, we need to be sure to set the + * value of rsp0 in the TSS back to our original stack: if we gave + * up the CPU at all while on our split stack, the rsp0 will point + * to that stack from resume (above); if we were to try to return to + * userland in that state, we will die absolutely horribly (namely, + * trying to iretq back to registers in a bunch of freed segkp). We + * are expecting this to be called after T_STACK has been restored, + * but before we return. It's okay if we are preempted in this code: + * when the new CPU picks us up, they will automatically set rsp0 + * correctly, which is all we're trying to do here.
+ */ + ENTRY(thread_splitstack_cleanup) + LOADCPU(%r8) + movq CPU_TSS(%r8), %r9 + cmpq $1, kpti_enable + jne 1f + leaq CPU_KPTI_TR_RSP(%r8), %rax + jmp 2f +1: + movq CPU_THREAD(%r8), %r10 + movq T_STACK(%r10), %rax + addq $REGSIZE+MINFRAME, %rax +2: + movq %rax, TSS_RSP0(%r9) + ret + SET_SIZE(thread_splitstack_cleanup) diff --git a/usr/src/uts/intel/nfp/Makefile b/usr/src/uts/intel/nfp/Makefile new file mode 100644 index 0000000000..d302cc16c3 --- /dev/null +++ b/usr/src/uts/intel/nfp/Makefile @@ -0,0 +1,76 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# uts/intel/nfp/Makefile +# +# This makefile drives the production of the nfp +# driver kernel module. +# +# intel architecture dependent +# + +# +# Paths to the base of the uts directory trees +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = nfp +OBJECTS = $(NFP_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Driver-specific flags +# +CPPFLAGS += -DCH_KERNELVER=270 +CERRWARN += -_gcc=-Wno-unused-variable +CERRWARN += -_gcc=-Wno-unused-function + +# 3rd party code +SMOFF += indenting + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/opteron_pcbe/Makefile b/usr/src/uts/intel/opteron_pcbe/Makefile index 8b04073ac3..4d2180329e 100644 --- a/usr/src/uts/intel/opteron_pcbe/Makefile +++ b/usr/src/uts/intel/opteron_pcbe/Makefile @@ -49,7 +49,7 @@ CPCGEN_SRCS = $(CPCGEN_OBJS:%.o=%.c) opteron_pcbe_cpcgen.h MODULE = pcbe.AuthenticAMD OBJECTS = $(OPTERON_PCBE_OBJS:%=$(OBJS_DIR)/%) OBJECTS += $(CPCGEN_OBJS:%=$(OBJS_DIR)/%) -ROOTMODULE = $(USR_PCBE_DIR)/$(MODULE) +ROOTMODULE = $(ROOT_PSM_PCBE_DIR)/$(MODULE) # # Include common rules. diff --git a/usr/src/uts/intel/os/archdep.c b/usr/src/uts/intel/os/archdep.c index 08a593bffd..1f3f438951 100644 --- a/usr/src/uts/intel/os/archdep.c +++ b/usr/src/uts/intel/os/archdep.c @@ -521,6 +521,13 @@ ucontext_32ton(const ucontext32_t *src, ucontext_t *dst) if (src->uc_flags & UC_FPU) fpregset_32ton(&src->uc_mcontext.fpregs, &dst->uc_mcontext.fpregs); + + /* + * Copy the brand-private data: + */ + dst->uc_brand_data[0] = (void *)(uintptr_t)src->uc_brand_data[0]; + dst->uc_brand_data[1] = (void *)(uintptr_t)src->uc_brand_data[1]; + dst->uc_brand_data[2] = (void *)(uintptr_t)src->uc_brand_data[2]; } #endif /* _SYSCALL32_IMPL */ @@ -575,9 +582,11 @@ getuserpc() #define IS_NOT_CS 0 /*ARGSUSED*/ -static greg_t +greg_t fix_segreg(greg_t sr, int iscs, model_t datamodel) { + kthread_t *t = curthread; + switch (sr &= 0xffff) { case 0: @@ -610,6 +619,19 @@ fix_segreg(greg_t sr, int iscs, model_t datamodel) } /* + * Allow this process's brand to do any necessary segment register + * manipulation. 
+ */ + if (PROC_IS_BRANDED(t->t_procp) && BRMOP(t->t_procp)->b_fixsegreg) { + greg_t bsr = BRMOP(t->t_procp)->b_fixsegreg(sr, datamodel); + + if (bsr == 0 && iscs == IS_CS) + return (0 | SEL_UPL); + else + return (bsr); + } + + /* * Force it into the LDT in ring 3 for 32-bit processes, which by * default do not have an LDT, so that any attempt to use an invalid * selector will reference the (non-existant) LDT, and cause a #gp diff --git a/usr/src/uts/intel/os/comm_page_util.c b/usr/src/uts/intel/os/comm_page_util.c index f286bee7f6..0674acbc2e 100644 --- a/usr/src/uts/intel/os/comm_page_util.c +++ b/usr/src/uts/intel/os/comm_page_util.c @@ -39,12 +39,12 @@ comm_page_mapin() { #if !defined(__xpv) proc_t *p = curproc; - caddr_t addr = NULL; + caddr_t addr = (caddr_t)COMM_PAGE_ALIGN; size_t len = COMM_PAGE_SIZE; uint_t prot = PROT_USER | PROT_READ; segumap_crargs_t suarg; - map_addr(&addr, len, (offset_t)0, 1, 0); + map_addr(&addr, len, (offset_t)0, 1, MAP_ALIGN); if (addr == NULL || valid_usr_range(addr, len, prot, p->p_as, p->p_as->a_userlimit) != RANGE_OKAY) { return (NULL); diff --git a/usr/src/uts/intel/os/cpuid.c b/usr/src/uts/intel/os/cpuid.c index 5e564b7acf..1459c034b9 100644 --- a/usr/src/uts/intel/os/cpuid.c +++ b/usr/src/uts/intel/os/cpuid.c @@ -5585,6 +5585,12 @@ cpuid_pass_resolve(cpu_t *cpu, void *arg) hwcap_flags |= AV_386_TSC; } + /* Detect systems with a potential CPUID limit */ + if (cpi->cpi_vendor == X86_VENDOR_Intel && cpi->cpi_maxeax < 4) { + cmn_err(CE_NOTE, "CPUID limit detected, " + "see the CPUID(7D) man page for details\n"); + } + /* * Check a few miscellaneous features. */ diff --git a/usr/src/uts/intel/os/desctbls.c b/usr/src/uts/intel/os/desctbls.c index c54efdb75f..0307aa56c6 100644 --- a/usr/src/uts/intel/os/desctbls.c +++ b/usr/src/uts/intel/os/desctbls.c @@ -158,7 +158,7 @@ struct interposing_handler { * The brand infrastructure interposes on two handlers, and we use one as a * NULL signpost. 
*/ -static struct interposing_handler brand_tbl[2]; +static struct interposing_handler brand_tbl[3]; /* * software prototypes for default local descriptor table @@ -774,6 +774,13 @@ init_idt_common(gate_desc_t *idt) KCS_SEL, SDT_SYSIGT, TRP_KPL, idt_vector_to_ist(T_SIMDFPE)); /* + * install "int80" handler at, well, 0x80. + */ + set_gatesegd(&idt0[T_INT80], + (kpti_enable == 1) ? &tr_sys_int80 : &sys_int80, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_INT80)); + + /* * install fast trap handler at 210. */ set_gatesegd(&idt[T_FASTTRAP], @@ -795,18 +802,25 @@ init_idt_common(gate_desc_t *idt) KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_DTRACE_RET)); /* - * Prepare interposing descriptor for the syscall handler - * and cache copy of the default descriptor. + * Prepare interposing descriptors for the branded "int80" + * and syscall handlers and cache copies of the default + * descriptors. */ - brand_tbl[0].ih_inum = T_SYSCALLINT; - brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT]; - + brand_tbl[0].ih_inum = T_INT80; + brand_tbl[0].ih_default_desc = idt0[T_INT80]; set_gatesegd(&(brand_tbl[0].ih_interp_desc), + (kpti_enable == 1) ? &tr_brand_sys_int80 : &brand_sys_int80, + KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_INT80)); + + brand_tbl[1].ih_inum = T_SYSCALLINT; + brand_tbl[1].ih_default_desc = idt0[T_SYSCALLINT]; + + set_gatesegd(&(brand_tbl[1].ih_interp_desc), (kpti_enable == 1) ? 
&tr_brand_sys_syscall_int : &brand_sys_syscall_int, KCS_SEL, SDT_SYSIGT, TRP_UPL, idt_vector_to_ist(T_SYSCALLINT)); - brand_tbl[1].ih_inum = 0; + brand_tbl[2].ih_inum = 0; } #if defined(__xpv) diff --git a/usr/src/uts/intel/os/device_policy b/usr/src/uts/intel/os/device_policy index 41adb787ce..88d6afa054 100644 --- a/usr/src/uts/intel/os/device_policy +++ b/usr/src/uts/intel/os/device_policy @@ -8,4 +8,21 @@ # * read_priv_set=none write_priv_set=none -md:admin write_priv_set=sys_config +bridge:* read_priv_set=net_rawaccess write_priv_set=net_rawaccess +fssnap:ctl read_priv_set=sys_config write_priv_set=sys_config +icmp:* read_priv_set=net_icmpaccess write_priv_set=net_icmpaccess +icmp6:* read_priv_set=net_icmpaccess write_priv_set=net_icmpaccess +ipf:* read_priv_set=sys_ip_config write_priv_set=sys_ip_config +ip:* read_priv_set=net_rawaccess write_priv_set=net_rawaccess +ip6:* read_priv_set=net_rawaccess write_priv_set=net_rawaccess +ipnet:* read_priv_set=net_observability write_priv_set=net_observability +ipsecah:* read_priv_set=sys_ip_config write_priv_set=sys_ip_config +ipsecesp:* read_priv_set=sys_ip_config write_priv_set=sys_ip_config +keysock:* read_priv_set=sys_ip_config write_priv_set=sys_ip_config +mm:allkmem read_priv_set=all write_priv_set=all +mm:kmem read_priv_set=none write_priv_set=all +mm:mem read_priv_set=none write_priv_set=all +openeepr:* write_priv_set=all +random:* write_priv_set=sys_devices +scsi_vhci:devctl write_priv_set=sys_devices +spdsock:* read_priv_set=sys_ip_config write_priv_set=sys_ip_config diff --git a/usr/src/uts/intel/os/driver_aliases b/usr/src/uts/intel/os/driver_aliases index 3116819932..1a9c082f57 100644 --- a/usr/src/uts/intel/os/driver_aliases +++ b/usr/src/uts/intel/os/driver_aliases @@ -1 +1,1851 @@ +aac "pci1028,3" +aac "pci1028,a" +aac "pci9005,285" +aac "pci9005,286" +aac "pciex9005,285" +aac "pciex9005,286" +acpinex "acpivirtnex" +adpu320 "pci9005,8000" +adpu320 "pci9005,800f.9005.5f" +adpu320 "pci9005,8010" +adpu320 
"pci9005,8011" +adpu320 "pci9005,8012" +adpu320 "pci9005,8014" +adpu320 "pci9005,8015" +adpu320 "pci9005,8016" +adpu320 "pci9005,8017" +adpu320 "pci9005,801d" +adpu320 "pci9005,801e" +adpu320 "pci9005,801f" +adpu320 "pci9005,808f" +afe "pci10b7,9300" +afe "pci1113,1216" +afe "pci1317,1985" +afe "pci1317,9511" +afe "pci1317,9513" +afe "pci1317,981" +afe "pci1317,985" +afe "pci13d1,ab02" +afe "pci13d1,ab03" +afe "pci13d1,ab08" +afe "pci1737,ab08" +agptarget "pci1022,7454" +agptarget "pci8086,1130" +agptarget "pci8086,2560" +agptarget "pci8086,2570" +agptarget "pci8086,2580" +agptarget "pci8086,2590" +agptarget "pci8086,2770" +agptarget "pci8086,27a0" +agptarget "pci8086,27ac" +agptarget "pci8086,2970" +agptarget "pci8086,2980" +agptarget "pci8086,2990" +agptarget "pci8086,29a0" +agptarget "pci8086,29b0" +agptarget "pci8086,29c0" +agptarget "pci8086,29d0" +agptarget "pci8086,2a00" +agptarget "pci8086,2a10" +agptarget "pci8086,2a40" +agptarget "pci8086,2e00" +agptarget "pci8086,2e10" +agptarget "pci8086,2e20" +agptarget "pci8086,2e30" +agptarget "pci8086,2e40" +agptarget "pci8086,3575" +agptarget "pci8086,3580" +agptarget "pci8086,40" +agptarget "pci8086,44" +agptarget "pci8086,62" +agptarget "pci8086,6a" +agptarget "pci8086,7120" +agptarget "pci8086,7122" +agptarget "pci8086,7124" +ahci "pciclass,010601" +ahci "pci8086,2822,p" +ahci "pci8086,282a,p" +amd64_gart "pci1022,1103" +amd8111s "pci1022,7462" +amd_iommu "pci1002,5a23" +amd_iommu "pci1022,11ff" +amdzen_stub "pci1022,1440,p" +amdzen_stub "pci1022,1441,p" +amdzen_stub "pci1022,1442,p" +amdzen_stub "pci1022,1443,p" +amdzen_stub "pci1022,1444,p" +amdzen_stub "pci1022,1445,p" +amdzen_stub "pci1022,1446,p" +amdzen_stub "pci1022,1447,p" +amdzen_stub "pci1022,1448,p" +amdzen_stub "pci1022,1449,p" +amdzen_stub "pci1022,144a,p" +amdzen_stub "pci1022,144b,p" +amdzen_stub "pci1022,144c,p" +amdzen_stub "pci1022,144d,p" +amdzen_stub "pci1022,144e,p" +amdzen_stub "pci1022,144f,p" +amdzen_stub "pci1022,1450,p" +amdzen_stub 
"pci1022,1460,p" +amdzen_stub "pci1022,1461,p" +amdzen_stub "pci1022,1462,p" +amdzen_stub "pci1022,1463,p" +amdzen_stub "pci1022,1464,p" +amdzen_stub "pci1022,1465,p" +amdzen_stub "pci1022,1466,p" +amdzen_stub "pci1022,1467,p" +amdzen_stub "pci1022,1480,p" +amdzen_stub "pci1022,1490,p" +amdzen_stub "pci1022,1491,p" +amdzen_stub "pci1022,1492,p" +amdzen_stub "pci1022,1493,p" +amdzen_stub "pci1022,1494,p" +amdzen_stub "pci1022,1495,p" +amdzen_stub "pci1022,1496,p" +amdzen_stub "pci1022,1497,p" +amdzen_stub "pci1022,14a4,p" +amdzen_stub "pci1022,14ad,p" +amdzen_stub "pci1022,14ae,p" +amdzen_stub "pci1022,14af,p" +amdzen_stub "pci1022,14b0,p" +amdzen_stub "pci1022,14b1,p" +amdzen_stub "pci1022,14b2,p" +amdzen_stub "pci1022,14b3,p" +amdzen_stub "pci1022,14b4,p" +amdzen_stub "pci1022,14b5,p" +amdzen_stub "pci1022,14d8,p" +amdzen_stub "pci1022,14e0,p" +amdzen_stub "pci1022,14e1,p" +amdzen_stub "pci1022,14e2,p" +amdzen_stub "pci1022,14e3,p" +amdzen_stub "pci1022,14e4,p" +amdzen_stub "pci1022,14e5,p" +amdzen_stub "pci1022,14e6,p" +amdzen_stub "pci1022,14e7,p" +amdzen_stub "pci1022,15d0,p" +amdzen_stub "pci1022,15e8,p" +amdzen_stub "pci1022,15e9,p" +amdzen_stub "pci1022,15ea,p" +amdzen_stub "pci1022,15eb,p" +amdzen_stub "pci1022,15ec,p" +amdzen_stub "pci1022,15ed,p" +amdzen_stub "pci1022,15ee,p" +amdzen_stub "pci1022,15ef,p" +amdzen_stub "pci1022,1630,p" +amdzen_stub "pci1022,1650,p" +amdzen_stub "pci1022,1651,p" +amdzen_stub "pci1022,1652,p" +amdzen_stub "pci1022,1653,p" +amdzen_stub "pci1022,1654,p" +amdzen_stub "pci1022,1655,p" +amdzen_stub "pci1022,1656,p" +amdzen_stub "pci1022,1657,p" +amdzen_stub "pci1022,166a,p" +amdzen_stub "pci1022,166b,p" +amdzen_stub "pci1022,166c,p" +amdzen_stub "pci1022,166d,p" +amdzen_stub "pci1022,166e,p" +amdzen_stub "pci1022,166f,p" +amdzen_stub "pci1022,1670,p" +amdzen_stub "pci1022,1671,p" +amdzen_stub "pci1022,1679,p" +amdzen_stub "pci1022,167a,p" +amdzen_stub "pci1022,167b,p" +amdzen_stub "pci1022,167c,p" +amdzen_stub "pci1022,167d,p" 
+amdzen_stub "pci1022,167e,p" +amdzen_stub "pci1022,167f,p" +amdzen_stub "pci1022,1680,p" +amdzen_stub "pci1022,1724,p" +amdzen_stub "pci1022,1725,p" +amdzen_stub "pci1022,1726,p" +amdzen_stub "pci1022,1727,p" +amdzen_stub "pci1022,1728,p" +amdzen_stub "pci1022,1729,p" +amdzen_stub "pci1022,172a,p" +amdzen_stub "pci1022,172b,p" +amdnbtemp "pci1022,1203,p" +amdnbtemp "pci1022,1303,p" +amdnbtemp "pci1022,1403,p" +amdnbtemp "pci1022,141d,p" +amdnbtemp "pci1022,1533,p" +amdnbtemp "pci1022,1583,p" +amdnbtemp "pci1022,1603,p" +amdnbtemp "pci1022,1703,p" +amr "pci1000,1960.1000,532" +amr "pci101e,1960.1028,493" +amr "pci1000,1960.1028,518" +amr "pci1000,1960.1028,520" +arcmsr "pci17d3,1110" +arcmsr "pci17d3,1120" +arcmsr "pci17d3,1130" +arcmsr "pci17d3,1160" +arcmsr "pci17d3,1170" +arcmsr "pci17d3,1201" +arcmsr "pci17d3,1210" +arcmsr "pci17d3,1220" +arcmsr "pci17d3,1230" +arcmsr "pci17d3,1260" +arcmsr "pci17d3,1270" +arcmsr "pci17d3,1280" +arcmsr "pci17d3,1380" +arcmsr "pci17d3,1381" +arcmsr "pci17d3,1680" +arcmsr "pci17d3,1681" +arcmsr "pci17d3,1880" +arcmsr "pci17d3,1882" asy "pci11c1,480" +ata "ide" +atge "pciex1969,1026" +atge "pciex1969,1048" +atge "pciex1969,1062" +atge "pciex1969,1063" +atge "pciex1969,1073" +atge "pciex1969,1083" +atge "pciex1969,2060" +atge "pciex1969,2062" +axf "usb7b8,420a" +axf "usbb95,7720" +axf "usbb95,772a" +axf "usb2001,1a00" +axf "usb77b,2226" +axf "usb846,1040" +axf "usbb95,1720" +axf "usb8dd,90ff" +axf "usb557,2009" +axf "usb411,3d" +axf "usb6189,182d" +axf "usb7aa,17" +axf "usb1189,893" +axf "usb1631,6200" +axf "usb13b1,18" +axf "usb1557,7720" +axf "usb7d1,3c05" +axf "usb2001,3c05" +axf "usb5ac,1402" +bcm_sata "pci1166,24a" +bfe "pci14e4,170c" +bfe "pci14e4,4401" +bfe "pci14e4,4402" +bge "SUNW,bge" +bge "pci108e,1647" +bge "pci108e,1648" +bge "pci14e4,1682" +bge "pci14e4,1686" +bge "pci108e,16a7" +bge "pci108e,16a8" +bge "pci14e4,16b0" +bge "pci14e4,16b1" +bge "pci14e4,16b2" +bge "pci14e4,16b3" +bge "pci14e4,16b4" +bge "pci14e4,16b5" 
+bge "pci14e4,16b6" +bge "pci14e4,16b7" +bge "pci14e4,1600" +bge "pci14e4,1601" +bge "pci14e4,1643" +bge "pci14e4,1644" +bge "pci14e4,1645" +bge "pci14e4,1647" +bge "pci14e4,1648" +bge "pci14e4,1649" +bge "pci14e4,1653" +bge "pci14e4,1654" +bge "pci14e4,1657" +bge "pci14e4,1659" +bge "pci14e4,165d" +bge "pci14e4,165e" +bge "pci14e4,165f" +bge "pci14e4,1665" +bge "pci14e4,1668" +bge "pci14e4,1669" +bge "pci14e4,166a" +bge "pci14e4,166e" +bge "pci14e4,1677" +bge "pci14e4,1678" +bge "pci14e4,1679" +bge "pci14e4,167d" +bge "pci14e4,1693" +bge "pci14e4,1696" +bge "pci14e4,1699" +bge "pci14e4,169b" +bge "pci14e4,169c" +bge "pci14e4,16a6" +bge "pci14e4,16a7" +bge "pci14e4,16a8" +bge "pci14e4,16c7" +bge "pci14e4,16f3" +bge "pciex14e4,1643" +bge "pciex14e4,1655" +bge "pciex14e4,1656" +bge "pciex14e4,1657" +bge "pciex14e4,165a" +bge "pciex14e4,165b" +bge "pciex14e4,165c" +bge "pciex14e4,165f" +bge "pciex14e4,1665" +bge "pciex14e4,1673" +bge "pciex14e4,1674" +bge "pciex14e4,1677" +bge "pciex14e4,167a" +bge "pciex14e4,167b" +bge "pciex14e4,1680" +bge "pciex14e4,1681" +bge "pciex14e4,1682" +bge "pciex14e4,1684" +bge "pciex14e4,1686" +bge "pciex14e4,1688" +bge "pciex14e4,1689" +bge "pciex14e4,1690" +bge "pciex14e4,1691" +bge "pciex14e4,1692" +bge "pciex14e4,1694" +bge "pciex14e4,1698" +bge "pciex14e4,169d" +bge "pciex14e4,16b0" +bge "pciex14e4,16b1" +bge "pciex14e4,16b2" +bge "pciex14e4,16b3" +bge "pciex14e4,16b4" +bge "pciex14e4,16b5" +bge "pciex14e4,16b6" +bge "pciex14e4,16b7" +bge "pciex14e4,16fd" +bge "pciex14e4,16f3" +bge "pciex14e4,1713" +bnx "pci14e4,1639" +bnx "pci14e4,163a" +bnx "pci14e4,163b" +bnx "pci14e4,163c" +bnx "pci14e4,164a" +bnx "pci14e4,164c" +bnx "pci14e4,16aa" +bnx "pci14e4,16ac" +bnxe "pci14e4,164e" +bnxe "pci14e4,164f" +bnxe "pci14e4,1650" +bnxe "pciex14e4,164e" +bnxe "pciex14e4,164f" +bnxe "pciex14e4,1650" +bnxe "pciex14e4,1662" +bnxe "pciex14e4,1663" +bnxe "pciex14e4,168a" +bnxe "pciex14e4,168d" +bnxe "pciex14e4,168e" +bnxe "pciex14e4,16a1" +bnxe 
"pciex14e4,16a4" +bnxe "pciex14e4,16a5" +bnxe "pciex14e4,16ab" +bnxe "pciex14e4,16ae" +bscbus "SVI0101" +ccid "usbif,classb" +ce "pci100b,35" +ce "pci108e,abba" +chxge "pci1425,7" +chxge "pci1425,a" +cpqary3 "pci103c,3211" +cpqary3 "pci103c,3212" +cpqary3 "pci103c,3223" +cpqary3 "pci103c,3225" +cpqary3 "pci103c,3234" +cpqary3 "pci103c,3235" +cpqary3 "pci103c,3237" +cpqary3 "pci103c,323d" +cpqary3 "pcie11,4070" +cpqary3 "pcie11,4080" +cpqary3 "pcie11,4082" +cpqary3 "pcie11,4083" +cpqary3 "pcie11,4091" +cpqary3 "pcie11,409a" +cpqary3 "pcie11,409b" +cpqary3 "pcie11,409c" +cpqary3 "pcie11,409d" +cpqary3 "pcie11,409e" +cpudrv "cpu" +cpunex "cpus" +dcam1394 "firewire000104,000100" +dcam1394 "firewire00a02d,000100" +dmfe "pci108e,9102" +dmfe "pci1282,9102" +dnet "pci1011,14" +dnet "pci1011,19" +dnet "pci1011,2" +dnet "pci1011,9" +dnet "pci10b8,2001" +dnet "pci1109,1400" +dnet "pci1109,2400" +dnet "pci2646,1" +dr "acpidr_sbd" +e1000g "pci8086,1000" +e1000g "pci8086,1001" +e1000g "pci8086,1004.0e11.49" +e1000g "pci8086,1004.0e11.b1a4" +e1000g "pci8086,1004.1014.10f2" +e1000g "pci8086,1004.8086.1004" +e1000g "pci8086,1004.8086.2004" +e1000g "pci8086,1008" +e1000g "pci8086,1009" +e1000g "pci8086,100c" +e1000g "pci8086,100d" +e1000g "pci8086,100e" +e1000g "pci8086,100f" +e1000g "pci8086,1010" +e1000g "pci8086,1011" +e1000g "pci8086,1012" +e1000g "pci8086,1013" +e1000g "pci8086,1014" +e1000g "pci8086,1015" +e1000g "pci8086,1016" +e1000g "pci8086,1017" +e1000g "pci8086,1018" +e1000g "pci8086,1019" +e1000g "pci8086,101a" +e1000g "pci8086,101d" +e1000g "pci8086,101e" +e1000g "pci8086,1026" +e1000g "pci8086,1027" +e1000g "pci8086,1028" +e1000g "pci8086,1049" +e1000g "pci8086,104a" +e1000g "pci8086,104b" +e1000g "pci8086,104c" +e1000g "pci8086,104d" +e1000g "pci8086,105a" +e1000g "pci8086,105b" +e1000g "pci8086,105c" +e1000g "pci8086,105e" +e1000g "pci8086,105f" +e1000g "pci8086,1060" +e1000g "pci8086,1061" +e1000g "pci8086,1062" +e1000g "pci8086,1063" +e1000g "pci8086,1075" +e1000g 
"pci8086,1076" +e1000g "pci8086,1077" +e1000g "pci8086,1078" +e1000g "pci8086,1079" +e1000g "pci8086,107a" +e1000g "pci8086,107b" +e1000g "pci8086,107c" +e1000g "pci8086,107d" +e1000g "pci8086,107e" +e1000g "pci8086,107f" +e1000g "pci8086,108a" +e1000g "pci8086,108b" +e1000g "pci8086,108c" +e1000g "pci8086,1096" +e1000g "pci8086,1098" +e1000g "pci8086,1099" +e1000g "pci8086,109a" +e1000g "pci8086,10a4" +e1000g "pci8086,10a5" +e1000g "pci8086,10b5" +e1000g "pci8086,10b9" +e1000g "pci8086,10ba" +e1000g "pci8086,10bb" +e1000g "pci8086,10bc" +e1000g "pci8086,10bd" +e1000g "pci8086,10bf" +e1000g "pci8086,10c0" +e1000g "pci8086,10c2" +e1000g "pci8086,10c3" +e1000g "pci8086,10c4" +e1000g "pci8086,10c5" +e1000g "pci8086,10cb" +e1000g "pci8086,10cc" +e1000g "pci8086,10cd" +e1000g "pci8086,10ce" +e1000g "pci8086,10d3" +e1000g "pci8086,10d5" +e1000g "pci8086,10d9" +e1000g "pci8086,10da" +e1000g "pci8086,10de" +e1000g "pci8086,10df" +e1000g "pci8086,10e5" +e1000g "pci8086,10ea" +e1000g "pci8086,10eb" +e1000g "pci8086,10ef" +e1000g "pci8086,10f0" +e1000g "pci8086,10f5" +e1000g "pci8086,10f6" +e1000g "pci8086,1502" +e1000g "pci8086,1503" +e1000g "pci8086,150c" +e1000g "pci8086,153a" +e1000g "pci8086,153b" +e1000g "pci8086,1559" +e1000g "pci8086,155a" +e1000g "pci8086,156f" +e1000g "pci8086,1570" +e1000g "pci8086,15a0" +e1000g "pci8086,15a1" +e1000g "pci8086,15a2" +e1000g "pci8086,15a3" +e1000g "pci8086,15b7" +e1000g "pci8086,15b8" +e1000g "pci8086,15b9" +e1000g "pci8086,15bb" +e1000g "pci8086,15bc" +e1000g "pci8086,15bd" +e1000g "pci8086,15be" +e1000g "pci8086,15d6" +e1000g "pci8086,15d7" +e1000g "pci8086,15d8" +e1000g "pci8086,15df" +e1000g "pci8086,15e0" +e1000g "pci8086,15e1" +e1000g "pci8086,15e2" +e1000g "pci8086,15e3" +e1000g "pci8086,15f4,p" +e1000g "pci8086,15f5,p" +e1000g "pci8086,15f9,p" +e1000g "pci8086,15fa,p" +e1000g "pci8086,15fb,p" +e1000g "pci8086,15fc,p" +e1000g "pci8086,1a1c,p" +e1000g "pci8086,1a1d,p" +e1000g "pci8086,1a1e,p" +e1000g "pci8086,1a1f,p" +e1000g 
"pci8086,294c" +e1000g "pci8086,550a,p" +e1000g "pci8086,550b,p" +e1000g "pci8086,550c,p" +e1000g "pci8086,550d,p" +e1000g "pci8086,550e,p" +e1000g "pci8086,550f,p" +e1000g "pci8086,5510,p" +e1000g "pci8086,5511,p" +e1000g "pci8086,d4c,p" +e1000g "pci8086,d4d,p" +e1000g "pci8086,d4e,p" +e1000g "pci8086,d4f,p" +e1000g "pci8086,d53,p" +e1000g "pci8086,d55,p" +e1000g "pci8086,dc5,p" +e1000g "pci8086,dc6,p" +e1000g "pci8086,dc7,p" +e1000g "pci8086,dc8,p" +e1000g "pci8086,f0fe" +e1000g "pciex8086,1049" +e1000g "pciex8086,104a" +e1000g "pciex8086,104b" +e1000g "pciex8086,104c" +e1000g "pciex8086,104d" +e1000g "pciex8086,105e" +e1000g "pciex8086,105f" +e1000g "pciex8086,1060" +e1000g "pciex8086,107d" +e1000g "pciex8086,107e" +e1000g "pciex8086,107f" +e1000g "pciex8086,108b" +e1000g "pciex8086,108c" +e1000g "pciex8086,1096" +e1000g "pciex8086,1098" +e1000g "pciex8086,109a" +e1000g "pciex8086,10a4" +e1000g "pciex8086,10a5" +e1000g "pciex8086,10b9" +e1000g "pciex8086,10ba" +e1000g "pciex8086,10bb" +e1000g "pciex8086,10bc" +e1000g "pciex8086,10bd" +e1000g "pciex8086,10bf" +e1000g "pciex8086,10c0" +e1000g "pciex8086,10c2" +e1000g "pciex8086,10c3" +e1000g "pciex8086,10c4" +e1000g "pciex8086,10c5" +e1000g "pciex8086,10cb" +e1000g "pciex8086,10cc" +e1000g "pciex8086,10cd" +e1000g "pciex8086,10ce" +e1000g "pciex8086,10d3" +e1000g "pciex8086,10d5" +e1000g "pciex8086,10d9" +e1000g "pciex8086,10da" +e1000g "pciex8086,10de" +e1000g "pciex8086,10df" +e1000g "pciex8086,10e5" +e1000g "pciex8086,10ea" +e1000g "pciex8086,10eb" +e1000g "pciex8086,10ef" +e1000g "pciex8086,10f0" +e1000g "pciex8086,10f5" +e1000g "pciex8086,10f6" +e1000g "pciex8086,1502" +e1000g "pciex8086,1503" +e1000g "pciex8086,150c" +e1000g "pciex8086,153a" +e1000g "pciex8086,153b" +e1000g "pciex8086,1559" +e1000g "pciex8086,156f" +e1000g "pciex8086,1570" +e1000g "pciex8086,155a" +e1000g "pciex8086,15a0" +e1000g "pciex8086,15a1" +e1000g "pciex8086,15a2" +e1000g "pciex8086,15a3" +e1000g "pciex8086,15b7" +e1000g 
"pciex8086,15b8" +e1000g "pciex8086,15b9" +e1000g "pciex8086,15bb" +e1000g "pciex8086,15bc" +e1000g "pciex8086,15bd" +e1000g "pciex8086,15be" +e1000g "pciex8086,15d6" +e1000g "pciex8086,15d7" +e1000g "pciex8086,15d8" +e1000g "pciex8086,15df" +e1000g "pciex8086,15e0" +e1000g "pciex8086,15e1" +e1000g "pciex8086,15e2" +e1000g "pciex8086,15e3" +e1000g "pciex8086,15f4" +e1000g "pciex8086,15f5" +e1000g "pciex8086,15f9" +e1000g "pciex8086,15fa" +e1000g "pciex8086,15fb" +e1000g "pciex8086,15fc" +e1000g "pciex8086,1a1c" +e1000g "pciex8086,1a1d" +e1000g "pciex8086,1a1e" +e1000g "pciex8086,1a1f" +e1000g "pciex8086,294c" +e1000g "pciex8086,550a" +e1000g "pciex8086,550b" +e1000g "pciex8086,550c" +e1000g "pciex8086,550d" +e1000g "pciex8086,550e" +e1000g "pciex8086,550f" +e1000g "pciex8086,5510" +e1000g "pciex8086,5511" +e1000g "pciex8086,d4c" +e1000g "pciex8086,d4d" +e1000g "pciex8086,d4e" +e1000g "pciex8086,d4f" +e1000g "pciex8086,d53" +e1000g "pciex8086,d55" +e1000g "pciex8086,dc5" +e1000g "pciex8086,dc6" +e1000g "pciex8086,dc7" +e1000g "pciex8086,dc8" +e1000g "pciex8086,f0fe" +ecpp "lp" +ehci "pciclass,0c0320" +elxl "pci10b7,9000" +elxl "pci10b7,9001" +elxl "pci10b7,9004" +elxl "pci10b7,9005" +elxl "pci10b7,9006" +elxl "pci10b7,9050" +elxl "pci10b7,9051" +elxl "pci10b7,9055" +elxl "pci10b7,9056" +elxl "pci10b7,905a" +elxl "pci10b7,9200" +elxl "pci10b7,9201" +elxl "pci10b7,9202" +elxl "pci10b7,9800" +elxl "pci10b7,9805" +emlxs "lpfs" +emlxs "pci10df,f0a5" +emlxs "pci10df,f800" +emlxs "pci10df,f900" +emlxs "pci10df,f980" +emlxs "pci10df,fa00" +emlxs "pci10df,fc00" +emlxs "pci10df,fc10" +emlxs "pci10df,fc20" +emlxs "pci10df,fd00" +emlxs "pci10df,fe00" +emlxs "pciex10df,e200" +emlxs "pciex10df,e300" +emlxs "pciex10df,f100" +emlxs "pciex10df,f111" +emlxs "pciex10df,f112" +emlxs "pciex10df,fc20" +emlxs "pciex10df,fc40" +emlxs "pciex10df,fe00" +emlxs "pciex10df,fe05" +emlxs "pciex117c,63" +emlxs "pciex117c,64" +emlxs "pciex117c,65" +emlxs "pciex117c,94" +emlxs "pciex19a2,704" +emlxs 
"pciex19a2,714" +ena "pciex1d0f,ec2" +ena "pciex1d0f,1ec2" +ena "pciex1d0f,ec20" +ena "pciex1d0f,ec21" +fipe "pci8086,25f0" +fipe "pci8086,360c" +glm "pci1000,b" +hci1394 "pciclass,0c0010" +heci "pci8086,2974" +heci "pci8086,2984" +heci "pci8086,2994" +heci "pci8086,29a4" +heci "pci8086,29b4" +heci "pci8086,29c4" +heci "pci8086,2e04" +heci "pci8086,2e14" +hid "usbif,class3" +hme "pci108e,1001" +hubd "usbif,class9" +hxge "pci108e,aaaa" +i40e "pciex8086,101f" +i40e "pciex8086,104e" +i40e "pciex8086,104f" +i40e "pciex8086,1572" +i40e "pciex8086,1580" +i40e "pciex8086,1581" +i40e "pciex8086,1583" +i40e "pciex8086,1584" +i40e "pciex8086,1585" +i40e "pciex8086,1586" +i40e "pciex8086,1589" +i40e "pciex8086,158a" +i40e "pciex8086,158b" +i40e "pciex8086,15ff" +i40e "pciex8086,37ce" +i40e "pciex8086,37cf" +i40e "pciex8086,37d0" +i40e "pciex8086,37d1" +i40e "pciex8086,37d2" +i40e "pciex8086,37d3" +i40e "pciex8086,cf8" +i40e "pciex8086,d58" +i915 "pci8086,2562" +i915 "pci8086,2572" +i915 "pci8086,2582" +i915 "pci8086,2592" +i915 "pci8086,2772" +i915 "pci8086,27a2" +i915 "pci8086,27ae" +i915 "pci8086,2972" +i915 "pci8086,2982" +i915 "pci8086,2992" +i915 "pci8086,29a2" +i915 "pci8086,29b2" +i915 "pci8086,29c2" +i915 "pci8086,29d2" +i915 "pci8086,2a02" +i915 "pci8086,2a12" +i915 "pci8086,2a42" +i915 "pci8086,2e02.8086.2e02" +i915 "pci8086,2e12" +i915 "pci8086,2e22" +i915 "pci8086,2e32" +i915 "pci8086,2e42" +i915 "pci8086,42" +i915 "pci8086,46" +ibp "ib.ipib" +igb "pciex8086,10a7" +igb "pciex8086,10a9" +igb "pciex8086,10c9" +igb "pciex8086,10d6" +igb "pciex8086,10e6" +igb "pciex8086,10e7" +igb "pciex8086,10e8" +igb "pciex8086,150a" +igb "pciex8086,150d" +igb "pciex8086,150e" +igb "pciex8086,150f" +igb "pciex8086,1510" +igb "pciex8086,1511" +igb "pciex8086,1516" +igb "pciex8086,1518" +igb "pciex8086,1521" +igb "pciex8086,1522" +igb "pciex8086,1523" +igb "pciex8086,1524" +igb "pciex8086,1526" +igb "pciex8086,1533" +igb "pciex8086,1534" +igb "pciex8086,1535" +igb "pciex8086,1536" 
+igb "pciex8086,1537" +igb "pciex8086,1538" +igb "pciex8086,1539" +igb "pciex8086,1546" +igb "pciex8086,157b" +igb "pciex8086,157c" +igb "pciex8086,1f40" +igb "pciex8086,1f41" +igb "pciex8086,1f45" +igb "pciex8086,438" +imcstub "pci8086,e1e,p" +imcstub "pci8086,e1f,p" +imcstub "pci8086,e60,p" +imcstub "pci8086,e68,p" +imcstub "pci8086,e6a,p" +imcstub "pci8086,e6b,p" +imcstub "pci8086,e6c,p" +imcstub "pci8086,e6d,p" +imcstub "pci8086,e71,p" +imcstub "pci8086,e79,p" +imcstub "pci8086,ea0,p" +imcstub "pci8086,ea8,p" +imcstub "pci8086,eaa,p" +imcstub "pci8086,eab,p" +imcstub "pci8086,eac,p" +imcstub "pci8086,ead,p" +imcstub "pci8086,ec8,p" +imcstub "pci8086,ec9,p" +imcstub "pci8086,eca,p" +imcstub "pci8086,2014,p" +imcstub "pci8086,2016,p" +imcstub "pci8086,2024,p" +imcstub "pci8086,2040,p" +imcstub "pci8086,2044,p" +imcstub "pci8086,2048,p" +imcstub "pci8086,2054,p" +imcstub "pci8086,2055,p" +imcstub "pci8086,2066,p" +imcstub "pci8086,208e,p" +imcstub "pci8086,2f1e,p" +imcstub "pci8086,2f1f,p" +imcstub "pci8086,2f28,p" +imcstub "pci8086,2f60,p" +imcstub "pci8086,2f68,p" +imcstub "pci8086,2f6a,p" +imcstub "pci8086,2f6b,p" +imcstub "pci8086,2f6c,p" +imcstub "pci8086,2f6d,p" +imcstub "pci8086,2f71,p" +imcstub "pci8086,2f79,p" +imcstub "pci8086,2fa0,p" +imcstub "pci8086,2fa8,p" +imcstub "pci8086,2faa,p" +imcstub "pci8086,2fab,p" +imcstub "pci8086,2fac,p" +imcstub "pci8086,2fad,p" +imcstub "pci8086,2ffc,p" +imcstub "pci8086,2ffd,p" +imcstub "pci8086,3c71,p" +imcstub "pci8086,3ca0,p" +imcstub "pci8086,3ca8,p" +imcstub "pci8086,3caa,p" +imcstub "pci8086,3cab,p" +imcstub "pci8086,3cac,p" +imcstub "pci8086,3cad,p" +imcstub "pci8086,3ce0,p" +imcstub "pci8086,3ce3,p" +imcstub "pci8086,3cf4,p" +imcstub "pci8086,3cf5,p" +imcstub "pci8086,3cf6,p" +imcstub "pci8086,6f1e,p" +imcstub "pci8086,6f1f,p" +imcstub "pci8086,6f28,p" +imcstub "pci8086,6f60,p" +imcstub "pci8086,6f68,p" +imcstub "pci8086,6f6a,p" +imcstub "pci8086,6f6b,p" +imcstub "pci8086,6f6c,p" +imcstub "pci8086,6f6d,p" 
+imcstub "pci8086,6f71,p" +imcstub "pci8086,6f79,p" +imcstub "pci8086,6fa0,p" +imcstub "pci8086,6fa8,p" +imcstub "pci8086,6faa,p" +imcstub "pci8086,6fab,p" +imcstub "pci8086,6fac,p" +imcstub "pci8086,6fad,p" +imcstub "pci8086,6ffc,p" +imcstub "pci8086,6ffd,p" +imcstub "pciex8086,e1e" +imcstub "pciex8086,e1f" +imcstub "pciex8086,e60" +imcstub "pciex8086,e68" +imcstub "pciex8086,e6a" +imcstub "pciex8086,e6b" +imcstub "pciex8086,e6c" +imcstub "pciex8086,e6d" +imcstub "pciex8086,e71" +imcstub "pciex8086,e79" +imcstub "pciex8086,ea0" +imcstub "pciex8086,ea8" +imcstub "pciex8086,eaa" +imcstub "pciex8086,eab" +imcstub "pciex8086,eac" +imcstub "pciex8086,ead" +imcstub "pciex8086,ec8" +imcstub "pciex8086,ec9" +imcstub "pciex8086,eca" +imcstub "pciex8086,2014" +imcstub "pciex8086,2016" +imcstub "pciex8086,2024" +imcstub "pciex8086,2040" +imcstub "pciex8086,2044" +imcstub "pciex8086,2048" +imcstub "pciex8086,2054" +imcstub "pciex8086,2055" +imcstub "pciex8086,2066" +imcstub "pciex8086,208e" +imcstub "pciex8086,2f1e" +imcstub "pciex8086,2f1f" +imcstub "pciex8086,2f28" +imcstub "pciex8086,2f60" +imcstub "pciex8086,2f68" +imcstub "pciex8086,2f6a" +imcstub "pciex8086,2f6b" +imcstub "pciex8086,2f6c" +imcstub "pciex8086,2f6d" +imcstub "pciex8086,2f71" +imcstub "pciex8086,2f79" +imcstub "pciex8086,2fa0" +imcstub "pciex8086,2fa8" +imcstub "pciex8086,2faa" +imcstub "pciex8086,2fab" +imcstub "pciex8086,2fac" +imcstub "pciex8086,2fad" +imcstub "pciex8086,2ffc" +imcstub "pciex8086,2ffd" +imcstub "pciex8086,3c71" +imcstub "pciex8086,3ca0" +imcstub "pciex8086,3ca8" +imcstub "pciex8086,3caa" +imcstub "pciex8086,3cab" +imcstub "pciex8086,3cac" +imcstub "pciex8086,3cad" +imcstub "pciex8086,3ce0" +imcstub "pciex8086,3ce3" +imcstub "pciex8086,3cf4" +imcstub "pciex8086,3cf5" +imcstub "pciex8086,3cf6" +imcstub "pciex8086,6f1e" +imcstub "pciex8086,6f1f" +imcstub "pciex8086,6f28" +imcstub "pciex8086,6f60" +imcstub "pciex8086,6f68" +imcstub "pciex8086,6f6a" +imcstub "pciex8086,6f6b" +imcstub 
"pciex8086,6f6c" +imcstub "pciex8086,6f6d" +imcstub "pciex8086,6f71" +imcstub "pciex8086,6f79" +imcstub "pciex8086,6fa0" +imcstub "pciex8086,6fa8" +imcstub "pciex8086,6faa" +imcstub "pciex8086,6fab" +imcstub "pciex8086,6fac" +imcstub "pciex8086,6fad" +imcstub "pciex8086,6ffc" +imcstub "pciex8086,6ffd" +intel_nb5000 "pci8086,25c0" +intel_nb5000 "pci8086,25d0" +intel_nb5000 "pci8086,25d4" +intel_nb5000 "pci8086,25d8" +intel_nb5000 "pci8086,3600" +intel_nb5000 "pci8086,4000" +intel_nb5000 "pci8086,4001" +intel_nb5000 "pci8086,4003" +intel_nb5000 "pci8086,65c0" +intel_nhm "pci8086,3423" +intel_nhm "pci8086,372a" +intel_nhmex "pci8086,3422" +intel_nhmex "pci8086,3438" +ioat "pciex8086,1a38" +ioat "pciex8086,360b" +ioat "pciex8086,402f" +iprb "pci8086,1029" +iprb "pci8086,1030" +iprb "pci8086,1031" +iprb "pci8086,1032" +iprb "pci8086,1038" +iprb "pci8086,1039" +iprb "pci8086,103d" +iprb "pci8086,103d.8086.103d" +iprb "pci8086,1050" +iprb "pci8086,1050.8086.3020" +iprb "pci8086,1050.8086.302f" +iprb "pci8086,1050.8086.3427" +iprb "pci8086,1059" +iprb "pci8086,1064" +iprb "pci8086,1068" +iprb "pci8086,1069" +iprb "pci8086,1092" +iprb "pci8086,1209" +iprb "pci8086,1229" +iprb "pci8086,1229.8086.1" +iprb "pci8086,1229.8086.10" +iprb "pci8086,1229.8086.1009" +iprb "pci8086,1229.8086.100c" +iprb "pci8086,1229.8086.1012" +iprb "pci8086,1229.8086.1013" +iprb "pci8086,1229.8086.1015" +iprb "pci8086,1229.8086.1016" +iprb "pci8086,1229.8086.1017" +iprb "pci8086,1229.8086.1030" +iprb "pci8086,1229.8086.1040" +iprb "pci8086,1229.8086.1041" +iprb "pci8086,1229.8086.1042" +iprb "pci8086,1229.8086.1050" +iprb "pci8086,1229.8086.1051" +iprb "pci8086,1229.8086.1052" +iprb "pci8086,1229.8086.10f0" +iprb "pci8086,1229.8086.11" +iprb "pci8086,1229.8086.12" +iprb "pci8086,1229.8086.1229" +iprb "pci8086,1229.8086.13" +iprb "pci8086,1229.8086.2" +iprb "pci8086,1229.8086.2009" +iprb "pci8086,1229.8086.200d" +iprb "pci8086,1229.8086.200e" +iprb "pci8086,1229.8086.200f" +iprb 
"pci8086,1229.8086.2010" +iprb "pci8086,1229.8086.2013" +iprb "pci8086,1229.8086.2016" +iprb "pci8086,1229.8086.2017" +iprb "pci8086,1229.8086.2018" +iprb "pci8086,1229.8086.2019" +iprb "pci8086,1229.8086.2101" +iprb "pci8086,1229.8086.2102" +iprb "pci8086,1229.8086.2103" +iprb "pci8086,1229.8086.2104" +iprb "pci8086,1229.8086.2105" +iprb "pci8086,1229.8086.2106" +iprb "pci8086,1229.8086.2107" +iprb "pci8086,1229.8086.2108" +iprb "pci8086,1229.8086.2200" +iprb "pci8086,1229.8086.2201" +iprb "pci8086,1229.8086.2202" +iprb "pci8086,1229.8086.2203" +iprb "pci8086,1229.8086.2204" +iprb "pci8086,1229.8086.2205" +iprb "pci8086,1229.8086.2206" +iprb "pci8086,1229.8086.2207" +iprb "pci8086,1229.8086.2208" +iprb "pci8086,1229.8086.2402" +iprb "pci8086,1229.8086.2407" +iprb "pci8086,1229.8086.2408" +iprb "pci8086,1229.8086.2409" +iprb "pci8086,1229.8086.240f" +iprb "pci8086,1229.8086.2410" +iprb "pci8086,1229.8086.2411" +iprb "pci8086,1229.8086.2412" +iprb "pci8086,1229.8086.2413" +iprb "pci8086,1229.8086.3" +iprb "pci8086,1229.8086.30" +iprb "pci8086,1229.8086.3000" +iprb "pci8086,1229.8086.3001" +iprb "pci8086,1229.8086.3002" +iprb "pci8086,1229.8086.3006" +iprb "pci8086,1229.8086.3007" +iprb "pci8086,1229.8086.3008" +iprb "pci8086,1229.8086.3010" +iprb "pci8086,1229.8086.3011" +iprb "pci8086,1229.8086.3012" +iprb "pci8086,1229.8086.301a" +iprb "pci8086,1229.8086.31" +iprb "pci8086,1229.8086.3411" +iprb "pci8086,1229.8086.4" +iprb "pci8086,1229.8086.40" +iprb "pci8086,1229.8086.41" +iprb "pci8086,1229.8086.42" +iprb "pci8086,1229.8086.5" +iprb "pci8086,1229.8086.50" +iprb "pci8086,1229.8086.6" +iprb "pci8086,1229.8086.7" +iprb "pci8086,1229.8086.8" +iprb "pci8086,1229.8086.9" +iprb "pci8086,1229.8086.a" +iprb "pci8086,1229.8086.b" +iprb "pci8086,1229.8086.c" +iprb "pci8086,1229.8086.d" +iprb "pci8086,1229.8086.e" +iprb "pci8086,1229.8086.f" +iprb "pci8086,2449" +iprb "pci8086,2449.8086.3010" +iprb "pci8086,2449.8086.3011" +iprb "pci8086,2449.8086.3012" +iprb 
"pci8086,2449.8086.3013" +iprb "pci8086,2449.8086.3014" +iprb "pci8086,2449.8086.3015" +iprb "pci8086,2449.8086.3016" +iprb "pci8086,2449.8086.3017" +iprb "pci8086,2449.8086.3018" +iprb "pci8086,27dc" +iprb "pci8086,27dc.8086.308d" +isa "pciclass,060100" +iwn "pci8086,82" +iwn "pci8086,83" +iwn "pci8086,84" +iwn "pci8086,85" +iwn "pci8086,87" +iwn "pci8086,89" +iwn "pci8086,8a" +iwn "pci8086,8b" +iwn "pci8086,8d" +iwn "pci8086,8e" +iwn "pci8086,90" +iwn "pci8086,91" +iwn "pci8086,887" +iwn "pci8086,888" +iwn "pci8086,88e" +iwn "pci8086,88f" +iwn "pci8086,890" +iwn "pci8086,891" +iwn "pci8086,892" +iwn "pci8086,893" +iwn "pci8086,894" +iwn "pci8086,895" +iwn "pci8086,896" +iwn "pci8086,897" +iwn "pci8086,8ae" +iwn "pci8086,422b" +iwn "pci8086,422c" +iwn "pci8086,4236" +iwn "pci8086,4238" +iwn "pci8086,4239" +ixgb "pci8086,1048" +ixgb "pci8086,109e" +ixgb "pci8086,1a48" +ixgb "pci8086,a11f" +ixgbe "pciex8086,10b6" +ixgbe "pciex8086,10c6" +ixgbe "pciex8086,10c7" +ixgbe "pciex8086,10c8" +ixgbe "pciex8086,10db" +ixgbe "pciex8086,10dd" +ixgbe "pciex8086,10e1" +ixgbe "pciex8086,10ec" +ixgbe "pciex8086,10f1" +ixgbe "pciex8086,10f4" +ixgbe "pciex8086,10f7" +ixgbe "pciex8086,10f8" +ixgbe "pciex8086,10f9" +ixgbe "pciex8086,10fb" +ixgbe "pciex8086,10fc" +ixgbe "pciex8086,1507" +ixgbe "pciex8086,1508" +ixgbe "pciex8086,150b" +ixgbe "pciex8086,1514" +ixgbe "pciex8086,1517" +ixgbe "pciex8086,151c" +ixgbe "pciex8086,1528" +ixgbe "pciex8086,154d" +ixgbe "pciex8086,154a" +ixgbe "pciex8086,1557" +ixgbe "pciex8086,1558" +ixgbe "pciex8086,1560" +ixgbe "pciex8086,1563" +ixgbe "pciex8086,15aa" +ixgbe "pciex8086,15ab" +ixgbe "pciex8086,15ac" +ixgbe "pciex8086,15ad" +ixgbe "pciex8086,15ae" +ixgbe "pciex8086,15c2" +ixgbe "pciex8086,15c3" +ixgbe "pciex8086,15c4" +ixgbe "pciex8086,15c6" +ixgbe "pciex8086,15c7" +ixgbe "pciex8086,15c8" +ixgbe "pciex8086,15ca" +ixgbe "pciex8086,15cc" +ixgbe "pciex8086,15ce" +ixgbe "pciex8086,15d1" +ixgbe "pciex8086,15e4" +ixgbe "pciex8086,15e5" +kb8042 
"pnpPNP,303" +lsimega "pci1000,1960" +lsimega "pci1000,407" +lsimega "pci1000,407.1000.532" +lsimega "pci1000,408" +lsimega "pci1000,408.1000.2" +lsimega "pci1000,409" +lsimega "pci1028,13" +marvell88sx "pci11ab,5040" +marvell88sx "pci11ab,5041" +marvell88sx "pci11ab,5080" +marvell88sx "pci11ab,5081" +marvell88sx "pci11ab,6041.9" +marvell88sx "pci11ab,6081.9" +mc-amd "pci1022,1100" +mc-amd "pci1022,1101" +mc-amd "pci1022,1102" +mega_sas "pci1000,411.1000.1001" +mega_sas "pci1000,411.1000.1002" +mega_sas "pci1000,411.1000.1003" +mega_sas "pci1000,411.1000.1004" +mega_sas "pci1000,411.1000.1008" +mega_sas "pci1000,411.1000.100c" +mega_sas "pci1000,411.1000.100d" +mega_sas "pci1000,411.1000.2004" +mega_sas "pci1000,411.1000.2005" +mega_sas "pci1000,411.1033.8287" +mega_sas "pci1000,411.1054.3016" +mega_sas "pci1000,411.1734.1081" +mega_sas "pci1000,411.1734.10a3" +mega_sas "pci1000,411.8086.1001" +mega_sas "pci1000,411.8086.1003" +mega_sas "pci1000,411.8086.1008" +mega_sas "pci1000,411.8086.3490" +mega_sas "pci1000,411.8086.3500" +mega_sas "pci1000,411.8086.3501" +mega_sas "pci1000,411.8086.3504" +mega_sas "pci1000,411.8086.3507" +mega_sas "pci1000,413.1000.1005" +mega_sas "pci1000,57.8086.3002" +mega_sas "pci1000,60.1000.1006" +mega_sas "pci1000,60.1000.100a" +mega_sas "pci1000,60.1000.100e" +mega_sas "pci1000,60.1000.100f" +mega_sas "pci1000,60.1000.1010" +mega_sas "pci1000,60.1000.1011" +mega_sas "pci1000,60.1000.1012" +mega_sas "pci1000,60.1000.1013" +mega_sas "pci1000,60.1000.1014" +mega_sas "pci1000,60.1000.1015" +mega_sas "pci1000,60.1000.1016" +mega_sas "pci1000,60.1000.1017" +mega_sas "pci1000,60.1000.1018" +mega_sas "pci1000,60.1000.1019" +mega_sas "pci1000,60.1000.101a" +mega_sas "pci1000,60.1000.101b" +mega_sas "pci1000,60.1000.1021" +mega_sas "pci1000,60.1000.1022" +mega_sas "pci1000,60.1014.363" +mega_sas "pci1000,60.1014.364" +mega_sas "pci1000,60.1014.365" +mega_sas "pci1000,60.1014.379" +mega_sas "pci1000,60.1014.3a2" +mega_sas "pci1000,60.1014.3ac" 
+mega_sas "pci1000,60.1028.1f0a" +mega_sas "pci1000,60.1028.1f0b" +mega_sas "pci1000,60.1028.1f0c" +mega_sas "pci1000,60.1028.1f0d" +mega_sas "pci1000,60.1028.1f11" +mega_sas "pci1000,60.1033.835a" +mega_sas "pci1000,60.1033.836e" +mega_sas "pci1000,60.1043.824d" +mega_sas "pci1000,60.1054.3019" +mega_sas "pci1000,60.1170.2f" +mega_sas "pci1000,60.1170.34" +mega_sas "pci1000,60.1170.36" +mega_sas "pci1000,60.1458.1000" +mega_sas "pci1000,60.15d9.c080" +mega_sas "pci1000,60.1734.10f9" +mega_sas "pci1000,60.1734.1102" +mega_sas "pci1000,60.1734.114b" +mega_sas "pci1000,60.17aa.6b7c" +mega_sas "pci1000,60.18a1.3" +mega_sas "pci1000,60.19e5.2001" +mega_sas "pci1000,60.19e5.2002" +mega_sas "pci1000,60.19e5.2003" +mega_sas "pci1000,60.19e5.2004" +mega_sas "pci1000,60.19e5.2005" +mega_sas "pci1000,60.19e5.2006" +mega_sas "pci1000,60.19e5.2010" +mega_sas "pci1000,60.19e5.2011" +mega_sas "pci1000,60.19e5.2012" +mega_sas "pci1000,60.19e5.2013" +mega_sas "pci1000,60.19e5.2014" +mega_sas "pci1000,60.19e5.2015" +mega_sas "pci1000,60.19e5.2016" +mega_sas "pci1000,60.19e5.2017" +mega_sas "pci1000,60.19e5.2018" +mega_sas "pci1000,60.19e5.2019" +mega_sas "pci1000,60.19e5.201a" +mega_sas "pci1000,60.19e5.d203" +mega_sas "pci1000,60.1b0a.14" +mega_sas "pci1000,60.1fca.2163" +mega_sas "pci1000,60.1fca.2164" +mega_sas "pci1000,60.8086.1006" +mega_sas "pci1000,60.8086.100a" +mega_sas "pci1000,60.8086.1010" +mega_sas "pci1000,60.8086.1013" +mega_sas "pci1000,60.8086.1021" +mega_sas "pci1000,60.8086.34cc" +mega_sas "pci1000,60.8086.34cd" +mega_sas "pci1000,60.8086.34e4" +mega_sas "pci1000,60.8086.3505" +mega_sas "pci1000,60.8086.3508" +mega_sas "pci1000,7c.1000.101c" +mega_sas "pci1000,7c.1000.101d" +mega_sas "pci1000,7c.1014.395" +mega_sas "pci1028,15.1028.1f01" +mega_sas "pci1028,15.1028.1f02" +mega_sas "pci1028,15.1028.1f03" +mouse8042 "pnpPNP,f03" +mlxcx "pciex15b3,1013" +mlxcx "pciex15b3,1014" +mlxcx "pciex15b3,1015" +mlxcx "pciex15b3,1016" +mlxcx "pciex15b3,1017" +mlxcx 
"pciex15b3,1018" +mlxcx "pciex15b3,1019" +mlxcx "pciex15b3,101a" +mlxcx "pciex15b3,101b" +mlxcx "pciex15b3,101c" +mlxcx "pciex15b3,101d" +mlxcx "pciex15b3,101e" +mlxcx "pciex15b3,101f" +mpt "pci1000,30" +mpt "pci1000,50" +mpt "pci1000,54" +mpt "pci1000,56" +mpt "pci1000,58" +mpt "pci1000,62" +mpt "pciex1000,56" +mpt "pciex1000,58" +mpt "pciex1000,62" +mpt_sas "pci1000,64" +mpt_sas "pci1000,70" +mpt_sas "pci1000,72" +mpt_sas "pci1000,76" +mpt_sas "pciex1000,64" +mpt_sas "pciex1000,65" +mpt_sas "pciex1000,6e" +mpt_sas "pciex1000,70" +mpt_sas "pciex1000,72" +mpt_sas "pciex1000,74" +mpt_sas "pciex1000,76" +mpt_sas "pciex1000,77" +mpt_sas "pciex1000,7e" +mpt_sas "pciex1000,80" +mpt_sas "pciex1000,81" +mpt_sas "pciex1000,82" +mpt_sas "pciex1000,83" +mpt_sas "pciex1000,84" +mpt_sas "pciex1000,85" +mpt_sas "pciex1000,86" +mpt_sas "pciex1000,87" +mpt_sas "pciex1000,90" +mpt_sas "pciex1000,91" +mpt_sas "pciex1000,92" +mpt_sas "pciex1000,93" +mpt_sas "pciex1000,94" +mpt_sas "pciex1000,95" +mpt_sas "pciex1000,96" +mpt_sas "pciex1000,97" +mpt_sas "pciex1000,aa" +mpt_sas "pciex1000,ab" +mpt_sas "pciex1000,ac" +mpt_sas "pciex1000,ad" +mpt_sas "pciex1000,ae" +mpt_sas "pciex1000,af" +mpt_sas "pciex1000,c0" +mpt_sas "pciex1000,c1" +mpt_sas "pciex1000,c2" +mpt_sas "pciex1000,c3" +mpt_sas "pciex1000,c4" +mpt_sas "pciex1000,c5" +mpt_sas "pciex1000,c6" +mpt_sas "pciex1000,c7" +mpt_sas "pciex1000,c8" +mpt_sas "pciex1000,c9" +mpt_sas "pciex1000,d0" +mpt_sas "pciex1000,d1" +mpt_sas "pciex1000,d2" +mr_sas "pciex1000,52" +mr_sas "pciex1000,53" +mr_sas "pciex1000,5b" +mr_sas "pciex1000,5d" +mr_sas "pciex1000,5f" +mr_sas "pciex1000,71" +mr_sas "pciex1000,73" +mr_sas "pciex1000,78" +mr_sas "pciex1000,79" +mr_sas "pciex1000,ce" +mr_sas "pciex1000,cf" +mwl "pci11ab,2a0a" +mwl "pci11ab,2a24" +mxfe "pci10d9,512" +mxfe "pci10d9,531" +mxfe "pci11ad,c115" +mxfe "pci11fc,9881" +myri10ge "pci14c1,8" +myri10ge "pci14c1,9" +myri10ge "pciex14c1,8" +myri10ge "pciex14c1,9" +nfp "pci1011,1065.100.100" +nfp 
"pciex1011,1065.100.100" +nfp "pci8086,b555.100.100" +nfp "pciex8086,b555.100.100" +nge "pci10de,268" +nge "pci10de,269" +nge "pci10de,37" +nge "pci10de,372" +nge "pci10de,373" +nge "pci10de,38" +nge "pci10de,3ee" +nge "pci10de,3ef" +nge "pci10de,56" +nge "pci10de,57" +nge "pci10de,760" +nge "pci10de,ab0" +nge "pci10de,df" +nge "pci10de,e6" +npe "pciex_root_complex" +ntxn "pci4040,1" +ntxn "pci4040,100" +ntxn "pci4040,2" +ntxn "pci4040,24" +ntxn "pci4040,25" +ntxn "pci4040,3" +ntxn "pci4040,4" +ntxn "pci4040,5" +nulldriver "scsa,nodev" +nulldriver "scsa,probe" +nv_sata "pci10de,266" +nv_sata "pci10de,267" +nv_sata "pci10de,36" +nv_sata "pci10de,37e" +nv_sata "pci10de,37f" +nv_sata "pci10de,3e" +nv_sata "pci10de,3f6" +nv_sata "pci10de,3f7" +nv_sata "pci10de,54" +nv_sata "pci10de,55" +nvme "pciclass,010802" +nvme "pciexclass,010802" +nxge "SUNW,niusl" +nxge "pciex108e,abcd" +oce "pciex19a2,700" +oce "pciex19a2,710" +ohci "pciclass,0c0310" +pcata "pccard,disk" +pchtemp "pci8086,2f9,p" +pchtemp "pci8086,6f9,p" +pchtemp "pci8086,8ca4,p" +pchtemp "pci8086,8c24,p" +pchtemp "pci8086,8d24,p" +pchtemp "pci8086,9ca4,p" +pchtemp "pci8086,9d31,p" +pchtemp "pci8086,9df9,p" +pchtemp "pci8086,a1b1,p" +pchtemp "pci8086,a231,p" +pchtemp "pci8086,a2b1,p" +pchtemp "pci8086,a131,p" +pchtemp "pci8086,a379,p" +pci_pci "pci1011,1" +pci_pci "pci1011,21" +pci_pci "pci1014,22" +pci_pci "pciclass,060400" +pci_pci "pciclass,060401" +pcic "pciclass,060500" +pcic "pciclass,060700" +pcieb "pciexclass,060400" +pcieb "pciexclass,060401" +pcn "pci1022,2000" +pcn "pci103c,104c" +pcser "pccard,Intel_MODEM_2400+_iNC110US_A-" +pcser "pccard,serial" +pcser "pccard102,2" +pcser "pccard102,5" +pit_beep "SUNW,pit_beep" +pmcs "pciex11f8,8001" +pseudo "zconsnex" +pseudo "zfdnex" +pvscsi "pci15ad,7c0" +qede "pciex1077,1634" +qede "pciex1077,1629" +qede "pciex1077,1630" +qede "pciex1077,1656" +qede "pciex1077,1654" +qede "pciex1077,1666" +qede "pciex1077,8070" +qede "pciex1077,8071" +qede "pciex1077,8072" +qede 
"pciex1077,8073" +qlc "pci1077,132" +qlc "pci1077,2200" +qlc "pci1077,2300" +qlc "pci1077,2312" +qlc "pci1077,2422" +qlc "pciex1077,2432" +qlc "pciex1077,2532" +qlc "pciex1077,5432" +qlc "pciex1077,8001" +qlc "pciex1077,8021" +qlge "pciex1077,8000" +rge "pci10ec,8136" +rge "pci10ec,8167" +rge "pci10ec,8168" +rge "pci10ec,8169" +rge "pci16ec,116" +rge "pciex10ec,8136" +rge "pciex10ec,8168" +rge "pciex10ec,8169" +rtls "pci10ec,8139" +rtls "pci1113,1211" +rtls "pci1186,1300" +rtls "pci1186,1301" +rum "usb1044,800a" +rum "usb13b1,20" +rum "usb148f,2573" +rum "usb15a9,4" +rum "usb7d1,3c03" +rum "usb7d1,3c04" +rum "usbb05,1723" +scsa1394 "firewire00609e,010483" +scsa2usb "usb584,222" +scsa2usb "usbif,class8" +sd "scsiclass,00" +sd "scsiclass,05" +sdhost "pciclass,080500" +sdhost "pciclass,080501" +ses "scsiclass,03S" +ses "scsiclass,0d" +sfe "pci100b,20" +sfe "pci1039,7016" +sfe "pci1039,900" +sfxge "pci1924,803" +sfxge "pci1924,810" +sfxge "pci1924,813" +sfxge "pci1924,901" +sfxge "pci1924,903" +sfxge "pci1924,923" +sgen "scsa,08.bfcp" +sgen "scsa,08.bvhci" +smrt "pci103c,1920" +smrt "pci103c,1921" +smrt "pci103c,1922" +smrt "pci103c,1923" +smrt "pci103c,1924" +smrt "pci103c,1926" +smrt "pci103c,1928" +smrt "pci103c,21bd" +smrt "pci103c,21be" +smrt "pci103c,21bf" +smrt "pci103c,21c0" +smrt "pci103c,21c1" +smrt "pci103c,21c2" +smrt "pci103c,21c3" +smrt "pci103c,21c5" +smrt "pci103c,21c6" +smrt "pci103c,21c7" +smrt "pci103c,21c8" +smrt "pci103c,21ca" +smrt "pci103c,21cb" +smrt "pci103c,21cc" +smrt "pci103c,21cd" +smrt "pci103c,21ce" +smrt "pci103c,3241" +smrt "pci103c,3243" +smrt "pci103c,3245" +smrt "pci103c,3247" +smrt "pci103c,3249" +smrt "pci103c,324a" +smrt "pci103c,324b" +smrt "pci103c,3350" +smrt "pci103c,3351" +smrt "pci103c,3352" +smrt "pci103c,3353" +smrt "pci103c,3354" +smrt "pci103c,3355" +smrt "pci103c,3356" +si3124 "pci1095,3124" +si3124 "pci1095,3132" +skd "pciex1b39,1" +st "scsiclass,01" +t4nex "pciex1425,4400" +t4nex "pciex1425,4401" +t4nex 
"pciex1425,4402" +t4nex "pciex1425,4403" +t4nex "pciex1425,4404" +t4nex "pciex1425,4405" +t4nex "pciex1425,4406" +t4nex "pciex1425,4407" +t4nex "pciex1425,4408" +t4nex "pciex1425,4409" +t4nex "pciex1425,440a" +t4nex "pciex1425,440d" +t4nex "pciex1425,440e" +t4nex "pciex1425,5400" +t4nex "pciex1425,5401" +t4nex "pciex1425,5402" +t4nex "pciex1425,5403" +t4nex "pciex1425,5404" +t4nex "pciex1425,5405" +t4nex "pciex1425,5406" +t4nex "pciex1425,5407" +t4nex "pciex1425,5408" +t4nex "pciex1425,5409" +t4nex "pciex1425,540a" +t4nex "pciex1425,540b" +t4nex "pciex1425,540c" +t4nex "pciex1425,540d" +t4nex "pciex1425,540e" +t4nex "pciex1425,540f" +t4nex "pciex1425,5410" +t4nex "pciex1425,5411" +t4nex "pciex1425,5412" +t4nex "pciex1425,5413" +t4nex "pciex1425,5414" +t4nex "pciex1425,5415" +t4nex "pciex1425,5416" +t4nex "pciex1425,5417" +t4nex "pciex1425,5418" +t4nex "pciex1425,5480" +t4nex "pciex1425,5481" +t4nex "pciex1425,5482" +t4nex "pciex1425,5486" +t4nex "pciex1425,5487" +t4nex "pciex1425,5488" +t4nex "pciex1425,5489" +t4nex "pciex1425,5490" +t4nex "pciex1425,5491" +t4nex "pciex1425,5492" +t4nex "pciex1425,5493" +t4nex "pciex1425,5494" +t4nex "pciex1425,5495" +t4nex "pciex1425,5496" +t4nex "pciex1425,5497" +t4nex "pciex1425,5498" +t4nex "pciex1425,5499" +t4nex "pciex1425,549a" +t4nex "pciex1425,549b" +t4nex "pciex1425,549c" +t4nex "pciex1425,549d" +t4nex "pciex1425,549e" +t4nex "pciex1425,549f" +t4nex "pciex1425,54a0" +t4nex "pciex1425,54a1" +t4nex "pciex1425,54a2" +t4nex "pciex1425,54a3" +t4nex "pciex1425,54a4" +t4nex "pciex1425,54a5" +t4nex "pciex1425,54a6" +t4nex "pciex1425,54a7" +t4nex "pciex1425,54a8" +t4nex "pciex1425,54a9" +t4nex "pciex1425,54aa" +t4nex "pciex1425,54ab" +t4nex "pciex1425,54ac" +t4nex "pciex1425,54ad" +t4nex "pciex1425,54ae" +t4nex "pciex1425,54af" +t4nex "pciex1425,54b0" +t4nex "pciex1425,6400" +t4nex "pciex1425,6401" +t4nex "pciex1425,6402" +t4nex "pciex1425,6403" +t4nex "pciex1425,6404" +t4nex "pciex1425,6405" +t4nex "pciex1425,6406" +t4nex 
"pciex1425,6407" +t4nex "pciex1425,6408" +t4nex "pciex1425,6409" +t4nex "pciex1425,640d" +t4nex "pciex1425,6410" +t4nex "pciex1425,6411" +t4nex "pciex1425,6414" +t4nex "pciex1425,6415" +t4nex "pciex1425,6480" +t4nex "pciex1425,6481" +t4nex "pciex1425,6482" +t4nex "pciex1425,6483" +t4nex "pciex1425,6484" +t4nex "pciex1425,6485" +t4nex "pciex1425,6486" +t4nex "pciex1425,6487" +t4nex "pciex1425,6488" +t4nex "pciex1425,6489" +t4nex "pciex1425,648a" +t4nex "pciex1425,648b" +t4nex "pciex1425,6492" +tavor "pci15b3,5a44" +tavor "pci15b3,5a45" +tavor "pci15b3,6278" +tavor "pci15b3,6279" +tavor "pciex15b3,6278" +tavor "pciex15b3,6279" +udmf "usb7aa,9601" +udmf "usba46,6688" +udmf "usba46,268" +udmf "usba46,9601" +udmf "usbfe6,8101" +ugen "usbif51d,class3" +upf "usb7a6,8511" +upf "usb411,5" +upf "usb4bb,904" +upf "usb4bb,93a" +upf "usb506,4601" +upf "usb557,2007" +upf "usb7b8,110c" +upf "usb7b8,4104" +upf "usb7b8,4004" +upf "usb7b8,4007" +upf "usb7b8,4102" +upf "usb7b8,4002" +upf "usb7b8,400b" +upf "usb7b8,400c" +upf "usb7b8,abc1" +upf "usb7b8,200c" +upf "usb83a,1046" +upf "usb83a,5046" +upf "usb83a,b004" +upf "usb7a6,8513" +upf "usb7a6,8515" +upf "usb7a6,986" +upf "usb7a6,1986" +upf "usb334,1701" +upf "usb7c9,b100" +upf "usb50d,121" +upf "usb8dd,986" +upf "usb8dd,987" +upf "usb8dd,988" +upf "usb8dd,8511" +upf "usb49f,8511" +upf "usb7aa,4" +upf "usb7aa,d" +upf "usb2001,4001" +upf "usb2001,4002" +upf "usb2001,4102" +upf "usb2001,400b" +upf "usb2001,200c" +upf "usb2001,4003" +upf "usb2001,abc1" +upf "usbdb7,2" +upf "usb1342,304" +upf "usb56e,4010" +upf "usb5cc,3000" +upf "usb1044,8002" +upf "usbe66,400c" +upf "usb3f0,811c" +upf "usb4bb,913" +upf "usb951,a" +upf "usb56e,4002" +upf "usb56e,4005" +upf "usb56e,400b" +upf "usb56e,abc1" +upf "usb56e,200c" +upf "usb66b,2202" +upf "usb66b,2203" +upf "usb66b,2204" +upf "usb66b,2206" +upf "usb66b,8b4" +upf "usb66b,400b" +upf "usb66b,200c" +upf "usb411,1" +upf "usb411,9" +upf "usb45e,7a" +upf "usb846,1020" +upf "usbb39,109" +upf 
"usbb39,901" +upf "usb8d1,3" +upf "usb707,200" +upf "usb707,201" +upf "usb15e8,9100" +upf "usb15e8,9110" +upf "usb67c,1001" +urf "usbbda,8150" +urf "usb411,12" +urf "usb3980,3" +urf "usb7b8,401a" +urf "usb1557,8150" +urf "usb586,401a" +uhci "pciclass,0c0300" +usb_ac "usbif,class1.1" +usb_as "usbif,class1.2" +usb_ia "usb,ia" +usb_mid "usb,device" +usbecm "usb,class2.6.0" +usbecm "usb430,a4a2" +usbecm "usbia,class2.6" +usbecm "usbif,class2.6" +usbftdi "usb403,6001" +usbftdi "usb403,6014" +usbftdi "usb403,6015" +usbftdi "usb403,cc48" +usbftdi "usb403,cc49" +usbftdi "usb403,cc4a" +usbftdi "usb403,e888" +usbftdi "usb403,e889" +usbftdi "usb403,e88b" +usbftdi "usb403,e88c" +usbftdi "usb403,fa00" +usbftdi "usb403,fa01" +usbftdi "usb403,fa02" +usbftdi "usb403,fa03" +usbftdi "usb403,fa04" +usbftdi "usb403,fc08" +usbftdi "usb403,fc09" +usbftdi "usb403,fc0b" +usbftdi "usb403,fc0c" +usbftdi "usb403,fc0d" +usbftdi "usb403,fc82" +usbftdi "usb411,00b3" +usbftdi "usb7cc,0421" +usbftdi "usb856,ac01" +usbftdi "usb93c,0601" +usbftdi "usb93c,0701" +usbprn "usbif,class7.1" +usbsacm "usb,class2.2.0" +usbsacm "usb1410,1110" +usbsacm "usbc88,17da" +usbsacm "usbif,class2.2" +usbser_edge "usbif1608,1.100.config1.0" +usbser_edge "usbif1608,1.config1.0" +usbser_edge "usbif1608,10.config1.0" +usbser_edge "usbif1608,11.config1.0" +usbser_edge "usbif1608,12.config1.0" +usbser_edge "usbif1608,13.config1.0" +usbser_edge "usbif1608,14.config1.0" +usbser_edge "usbif1608,201.config1.0" +usbser_edge "usbif1608,205.config1.0" +usbser_edge "usbif1608,206.config1.0" +usbser_edge "usbif1608,207.config1.0" +usbser_edge "usbif1608,20c.config1.0" +usbser_edge "usbif1608,20d.config1.0" +usbser_edge "usbif1608,215.config1.0" +usbser_edge "usbif1608,217.config1.0" +usbser_edge "usbif1608,21a.config1.0" +usbser_edge "usbif1608,240.config1.0" +usbser_edge "usbif1608,241.config1.0" +usbser_edge "usbif1608,242.config1.0" +usbser_edge "usbif1608,243.config1.0" +usbser_edge "usbif1608,244.config1.0" +usbser_edge 
"usbif1608,247.config1.0" +usbser_edge "usbif1608,3.config1.0" +usbser_edge "usbif1608,4.config1.0" +usbser_edge "usbif1608,5.config1.0" +usbser_edge "usbif1608,6.config1.0" +usbser_edge "usbif1608,7.config1.0" +usbser_edge "usbif1608,c.config1.0" +usbser_edge "usbif1608,d.config1.0" +usbser_edge "usbif1608,e.config1.0" +usbser_edge "usbif1608,f.config1.0" +usbsksp "usb6cd,11a" +usbsksp "usb6cd,121" +usbsksp "usb6cd,12a" +usbsksp "usb6cd,131" +usbsprl "usb557,2008" +usbsprl "usb56e,5004" +usbsprl "usb5ad,fba" +usbsprl "usb6189,2068" +usbsprl "usb67b,2303" +usbsprl "usb67b,aaa2" +vgatext "pciclass,000100" +vgatext "pciclass,030000" +vgatext "pciclass,030001" +vio9f "pci1af4,9" +vioblk "pci1af4,1001" +vioif "pci1af4,1" +vioif "pci1af4,1000,p" +vioscsi "pci1af4,1004" +vmxnet "pci15ad,720" +vmxnet3s "pci15ad,7b0" +vr "pci1106,3043" +vr "pci1106,3053" +vr "pci1106,3065" +vr "pci1106,3106" +xge "pci17d5,5731" +xge "pci17d5,5831" +xge "pci17d5,5832" +xhci "pciclass,0c0330" +xnbe "xnb,ioemu" +xnbo "xnb" +xnbo "xnb,SUNW_mac" +xnbu "xnb,netfront" +xpv "pci5853,1.1" +yge "pciex1186,4b00" +yge "pciex11ab,4354" +yge "pciex11ab,4355" +yge "pciex11ab,4362" +yge "pciex11ab,4363" +yge "pciex11ab,4364" +yge "pciex11ab,436a" diff --git a/usr/src/uts/intel/os/driver_classes b/usr/src/uts/intel/os/driver_classes index e69de29bb2..b1bd2d0ae4 100644 --- a/usr/src/uts/intel/os/driver_classes +++ b/usr/src/uts/intel/os/driver_classes @@ -0,0 +1,29 @@ +scsi_vhci scsi-self-identifying +mpt scsi +pci_pci pci +cpqary3 scsi +aac scsi +adpu320 scsi +ahci scsi-self-identifying +amr scsi +arcmsr scsi +bcm_sata scsi-self-identifying +isa sysbus +pci pci +emlxs fibre-channel +iscsi scsi-self-identify +lsimega scsi +mpt_sas scsi-self-identifying +mr_sas scsi-self-identifying +mega_sas scsi +marvell88sx scsi-self-identifying +nv_sata scsi-self-identifying +pcic pcmcia +glm scsi +pmcs scsi-self-identifying +ata dada +ata scsi +qlc fibre-channel +si3124 scsi-self-identifying +skd disk +smrt 
scsi-self-identifying diff --git a/usr/src/uts/intel/os/minor_perm b/usr/src/uts/intel/os/minor_perm index dbc6a35081..3b4971e69b 100644 --- a/usr/src/uts/intel/os/minor_perm +++ b/usr/src/uts/intel/os/minor_perm @@ -1,6 +1,272 @@ +spwr:* 0666 root sys +dump:dump 0660 root sys +id:* 0640 root sys +kstat:* 0666 root sys +ksyms:* 0666 root sys +logi:l 0666 root sys +msm:l 0666 root sys +pts:* 0644 root sys +pts:0 0620 root tty +pts:1 0620 root tty +pts:2 0620 root tty +pts:3 0620 root tty +svvslo:* 0666 root sys +vol:volctl 0666 root sys +tnf:tnfctl 0600 root sys +tnf:tnfmap 0600 root sys +fssnap:* 0640 root sys +fssnap:ctl 0666 root sys +rsm:* 0666 root sys +vni:* 0666 root sys +bmc:bmc 0666 root sys +glm:* 0755 root sys +usbsacm:* 0666 root sys +sdpib:* 0644 root sys +i915:* 0644 root sys +rdc:* 0666 root sys +md:* 0640 root sys +md:admin 0644 root sys +iscsi:* 0600 root sys +dcam1394:* 0666 root sys +nv_sata:* 0644 root sys +smp:* 0644 root sys +sv:* 0666 root sys +dmfe:* 0666 root sys +clone:dmfe 0666 root sys +clone:ptmx 0666 root sys +ehci:* 0644 root sys +hid:* 0600 root sys +hubd:* 0644 root sys +hwahc:* 0644 root sys +hwarc:* 0644 root sys +ohci:* 0644 root sys +uhci:* 0644 root sys +usb_ac:* 0600 root sys +usb_as:* 0600 root sys +usbprn:* 0666 root sys +wusb_ca:* 0666 root sys +wusb_df:* 0666 root sys +ipw:* 0666 root sys +clone:ipw 0666 root sys +fcip:* 0600 root sys +ral:* 0666 root sys +clone:ral 0666 root sys +daplt:* 0644 root sys +atiatom:* 0644 root sys +rtw:* 0666 root sys +clone:rtw 0666 root sys +usbsksp:* 0666 root sys +wpi:* 0666 root sys +clone:wpi 0666 root sys +pcser:* 0666 uucp uucp +dtrace:* 0666 root sys +fasttrap:fasttrap 0666 root sys +fbt:fbt 0644 root sys +lockstat:* 0644 root sys +profile:profile 0644 root sys +sdt:sdt 0644 root sys +systrace:systrace 0644 root sys +rum:* 0666 root sys +clone:rum 0666 root sys +ce:* 0600 root sys +rge:* 0666 root sys +clone:rge 0666 root sys +fcsm:* 0600 root sys +bge:* 0666 root sys +clone:bge 0666 
root sys +winlock:* 0666 root sys +fp:* 0600 root sys +fm:* 0644 root sys +ipf:* 0666 root sys +smbsrv:* 0640 root sys +bnx:* 0644 root sys +pcwl:* 0666 root sys +clone:pcwl 0666 root sys +pcan:* 0666 root sys +clone:pcan 0666 root sys +ural:* 0666 root sys +clone:ural 0666 root sys +mxfe:* 0666 root sys +clone:mxfe 0666 root sys +ugen:* 0644 root sys +zyd:* 0666 root sys +clone:zyd 0666 root sys +qlc:* 0666 root sys +qlge:* 0666 root sys +igb:* 0666 root sys +clone:igb 0666 root sys +afe:* 0666 root sys +clone:afe 0666 root sys +scsa1394:* 0666 root sys +agpgart:* 0644 root sys +agptarget:* 0644 root sys +amd64_gart:* 0644 root sys +nsmb:* 0666 root sys +fcoet:* 0600 root sys +nvidia:* 0666 root root +fcp:* 0600 root sys +ahci:* 0644 root sys +adpu320:adpu320ctl 0666 root root +ses:* 0666 bin bin +iwk:* 0666 root sys +clone:iwk 0666 root sys +iwh:* 0666 root sys +clone:iwh 0666 root sys +amd_iommu:* 0644 root sys +balloon:* 0444 root sys +domcaps:* 0444 root sys +evtchn:* 0666 root sys +privcmd:* 0666 root sys +xenbus:* 0666 root sys +xpvtap:* 0666 root sys +ixgb:* 0666 root root +pm:* 0666 root sys +usbvc:* 0666 root sys +hci1394:* 0600 root sys +rpcib:* 0644 root sys +nxge:* 0600 root sys +audio:* 0666 root sys +fcoe:* 0600 root sys +usbser_edge:* 0666 root sys +srpt:* 0644 root sys +pool:pool 0666 root sys +pool:poolctl 0666 root sys +e1000g:* 0666 root sys +clone:e1000g 0666 root sys +sdp:* 0644 root sys +sdp:sdp 0666 root sys +ixgbe:* 0666 root sys +clone:ixgbe 0666 root sys +xge:* 0666 root sys +clone:xge 0666 root sys +aggr:* 0666 root sys +arp:arp 0666 root sys +bl:* 0666 root sys +conskbd:kbd 0666 root sys +clone:bridge 0666 root sys +cn:* 0620 root tty +consms:mouse 0666 root sys +cpuid:self 0644 root sys +crypto:crypto 0666 root sys +cryptoadm:cryptoadm 0644 root sys +dld:* 0666 root sys devinfo:devinfo 0640 root sys devinfo:devinfo,ro 0444 root sys +dlpistub:* 0666 root sys +icmp:icmp 0666 root sys +icmp6:icmp6 0666 root sys +ip:ip 0666 root sys 
+ip6:ip6 0666 root sys +ipnet:lo0 0666 root sys +ipsecah:ipsecah 0666 root sys +ipsecesp:ipsecesp 0666 root sys +keysock:keysock 0666 root sys +clone:llc1 0666 root sys +lofi:* 0600 root sys +lofi:ctl 0644 root sys +log:conslog 0666 root sys +log:log 0640 root sys +mm:allkmem 0600 root sys +mm:full 0666 root sys +mm:kmem 0640 root sys +mm:mem 0640 root sys +mm:null 0666 root sys +mm:zero 0666 root sys +openeepr:openprom 0640 root sys +ptc:* 0666 root sys +poll:* 0666 root sys +physmem:* 0600 root sys +ptsl:* 0666 root sys +rts:rts 0666 root sys +random:* 0644 root sys +sad:admin 0666 root sys +sad:user 0666 root sys +scsi_vhci:* 0666 root sys +simnet:* 0666 root sys +clone:simnet 0666 root sys +sgen:* 0600 root sys +spdsock:spdsock 0666 root sys +st:* 0666 root sys +sy:tty 0666 root tty +sysevent:* 0600 root sys +sysmsg:msglog 0600 root sys +sysmsg:sysmsg 0600 root sys +tcp:tcp 0666 root sys +tcp6:tcp6 0666 root sys +tl:* 0666 root sys +clone:ticlts 0666 root sys +clone:ticots 0666 root sys +clone:ticotsord 0666 root sys +udp:udp 0666 root sys +vnic:* 0666 root sys +clone:vnic 0666 root sys +udp6:udp6 0666 root sys +wc:* 0600 root sys +acpi_drv:* 0666 root sys +ramdisk:* 0600 root sys +ramdisk:ctl 0644 root sys +sd:* 0640 root sys +smbios:smbios 0444 root sys +ucode:* 0644 root sys +iser:* 0600 root sys +ath:* 0666 root sys +clone:ath 0666 root sys +usbsprl:* 0666 root sys +marvell88sx:* 0644 root sys +rtls:* 0666 root sys +clone:rtls 0666 root sys +iwi:* 0666 root sys +clone:iwi 0666 root sys +arcmsr:* 0600 root sys +hxge:* 0600 root sys +cpc:shared 0666 root sys +emlxs:* 0666 root sys +oce:* 0666 root sys +tavor:* 0666 root sys +zfs:* 0600 root sys +zfs:zfs 0666 root sys +iscsit:* 0600 root sys +dnet:* 0666 root sys +clone:dnet 0666 root sys +elxl:* 0666 root sys +clone:elxl 0666 root sys +iprb:* 0666 root sys +clone:iprb 0666 root sys +pcn:* 0666 root sys +clone:pcn 0666 root sys +sfe:* 0666 root root +chxge:* 0666 root sys +clone:chxge 0666 root sys +av1394:* 
0600 root sys +vscan:* 0640 root sys +si3124:* 0644 root sys +nge:* 0666 root sys +clone:nge 0666 root sys asy:* 0666 root sys asy:*,cu 0600 uucp uucp -md:* 0640 root sys -md:admin 0644 root sys +cmdk:* 0640 root sys +ecpp:* 0666 root sys +fd:* 0666 root sys +fct:* 0666 root sys +pppt:* 0666 root sys +qlt:* 0666 root sys +stmf:* 0666 root sys +stmf_sbd:* 0666 root sys +bpf:bpf 0666 root sys +ib:* 0644 root sys +sdhost:* 0644 root root +vboxdrv:'* 0666 root sys' +vboxusbmon:'* 0666 root sys' +blkdev:* 0640 root root +ibp:* 0666 root sys +clone:ibp 0666 root sys +ipmi:ipmi 0600 root sys +lx_ptm:lx_ptmajor 0666 root sys +lx_systrace:* 0644 root sys +inotify:* 0666 root sys +skd:* 0600 root sys +eventfd:* 0666 root sys +timerfd:* 0666 root sys +signalfd:* 0666 root sys +iwn:* 0666 root sys +clone:iwn 0666 root sys diff --git a/usr/src/uts/intel/os/name_to_major b/usr/src/uts/intel/os/name_to_major index c5ad4c9bf0..65cd9053b7 100644 --- a/usr/src/uts/intel/os/name_to_major +++ b/usr/src/uts/intel/os/name_to_major @@ -2,3 +2,316 @@ md 85 devinfo 88 asy 106 did 239 +dump 0 +fssnap 1 +kstat 2 +ksyms 3 +logindmux 4 +ptm 5 +pts 6 +aggr 7 +arp 8 +bl 9 +clone 10 +conskbd 11 +bridge 12 +cn 13 +consms 14 +cpuid 15 +crypto 16 +cryptoadm 17 +dld 18 +dlpistub 19 +icmp 20 +icmp6 21 +ip 22 +ip6 23 +ipnet 24 +ippctl 25 +ipsecah 26 +ipsecesp 27 +iptun 28 +bnx 29 +iwscn 30 +kb8042 31 +keysock 32 +kmdb 33 +llc1 35 +lofi 36 +log 37 +mm 38 +mouse8042 39 +nulldriver 40 +openeepr 41 +options 42 +ptc 43 +poll 44 +physmem 45 +pseudo 46 +ptsl 47 +rts 48 +random 49 +sad 50 +scsi_vhci 51 +kvm 52 +simnet 54 +sgen 55 +softmac 56 +spdsock 57 +st 58 +sy 59 +sysevent 60 +sysmsg 61 +tcp 62 +tcp6 63 +tl 64 +udp 65 +vnic 66 +udp6 67 +wc 68 +acpi_drv 69 +bscbus 70 +bscv 71 +cpunex 72 +i8042 73 +intel_nb5000 74 +intel_nhm 75 +intel_nhmex 76 +mc-amd 77 +mpt 78 +pci_pci 79 +pcieb 80 +power 81 +ramdisk 82 +sd 83 +smbios 84 +tzmon 86 +ucode 87 +vgatext 89 +zfs 90 +pool 91 +zcons 92 +bnxe 93 +cpqary3 94 
+hci1394 95 +audio 96 +audio1575 97 +audioens 98 +audiopci 99 +audiots 100 +audio810 101 +adpu320 102 +afe 103 +amd64_gart 104 +agpgart 105 +agptarget 107 +ahci 108 +amd8111s 109 +amr 110 +heci 111 +arcmsr 112 +atge 113 +bcm_sata 114 +bfe 115 +bge 116 +acpippm 117 +amd_iommu 118 +balloon 119 +cpudrv 120 +evtchn 121 +domcaps 122 +isa 123 +pit_beep 124 +npe 125 +ppm 126 +rootnex 127 +pci 128 +privcmd 129 +acpinex 130 +ehci 131 +hid 132 +hubd 133 +hwahc 134 +hwarc 135 +ohci 136 +scsa2usb 137 +uhci 138 +usb_ac 139 +usb_as 140 +usb_ia 141 +usb_mid 142 +usbprn 143 +wusb_ca 144 +wusb_df 145 +xpvtap 146 +xdb 147 +xdf 148 +xencons 149 +xnbe 150 +xnbu 151 +xpvd 152 +xnf 153 +xnbo 154 +xenbus 155 +ce 156 +chxge 157 +cpc 158 +srn 159 +ioat 160 +dmfe 161 +i915 162 +dtrace 163 +dcpc 164 +fasttrap 165 +fbt 166 +profile 167 +lockstat 168 +sdt 169 +systrace 170 +emlxs 171 +oce 172 +fcp 173 +fp 174 +fcip 175 +fcsm 176 +fipe 177 +fm 178 +dcam1394 179 +hme 180 +hxge 181 +ib 182 +sdpib 183 +sdp 184 +igb 185 +e1000g 186 +ipf 187 +iscsi 189 +iser 190 +ixgb 191 +ixgbe 192 +bpf 193 +lsimega 194 +vioif 195 +overlay 196 +vio9f 197 +mpt_sas 198 +mr_sas 199 +mega_sas 200 +marvell88sx 201 +mwl 202 +mxfe 203 +myri10ge 204 +nge 205 +ntxn 206 +nv_sata 207 +nxge 208 +pcn 209 +dnet 210 +elxl 211 +iprb 212 +pcs 213 +pcic 214 +pcser 215 +glm 216 +pm 217 +pmcs 218 +sppp 219 +sppptun 220 +fd 221 +fdc 222 +ecpp 223 +cmdk 225 +pci-ide 226 +ata 227 +pcata 228 +qlc 229 +qlge 230 +rge 231 +rpcib 232 +rtls 233 +rum 234 +scsa1394 235 +sdhost 237 +ses 238 +sfe 240 +si3124 241 +nsmb 242 +smp 243 +tavor 244 +tpm 245 +usbsacm 246 +daplt 247 +usbser_edge 248 +usbftdi 249 +ugen 250 +usbsksp 251 +usbsprl 252 +audiovia823x 253 +vr 254 +xge 255 +xpv 256 +yge 257 +zyd 258 +fct 259 +stmf 260 +stmf_sbd 261 +pppt 262 +qlt 263 +ibp 264 +blkdev 265 +smbsrv 266 +aac 267 +vioblk 53 +iscsit 269 +vmxnet 270 +ipd 188 +tap 271 +tun 272 +ipmi 273 +t4nex 275 +cxgbe 276 +virtio 277 +usbgem 278 +axf 279 +udmf 280 +upf 281 +urf 282 
+vnd 283 +lx_ptm 284 +lx_systrace 286 +nfp 287 +inotify 288 +skd 289 +eventfd 290 +zfd 291 +timerfd 292 +signalfd 293 +nvme 294 +lxautofs 295 +i40e 296 +sfxge 297 +pvscsi 298 +vmxnet3s 299 +smrt 300 +iwn 301 +xhci 302 +qede 303 +vioscsi 304 +vmm 305 +viona 306 +ppt 307 +coretemp 308 +amdf17nbdf 309 +pchtemp 310 +ufm 311 +ufmtest 312 +imcstub 313 +imc 314 +ccid 315 +ksensor 316 +mlxcx 317 +amdzen_stub 318 +amdzen 319 +smntemp 320 +ena 321 +zen_umc 322 diff --git a/usr/src/uts/intel/os/path_to_inst b/usr/src/uts/intel/os/path_to_inst index 180715f4cd..9853ae8e23 100644 --- a/usr/src/uts/intel/os/path_to_inst +++ b/usr/src/uts/intel/os/path_to_inst @@ -1 +1,3 @@ -#path_to_inst_bootstrap_1 +# +# Caution! This file contains critical kernel state +# diff --git a/usr/src/uts/intel/os/sendsig.c b/usr/src/uts/intel/os/sendsig.c index e3d60eb62b..becea9eeec 100644 --- a/usr/src/uts/intel/os/sendsig.c +++ b/usr/src/uts/intel/os/sendsig.c @@ -20,6 +20,9 @@ */ /* + * Copyright 2015 Joyent, Inc. + */ +/* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -87,6 +90,8 @@ #include <sys/kdi.h> #include <sys/contract_impl.h> #include <sys/x86_archext.h> +#include <sys/brand.h> +#include <sys/sdt.h> /* * Construct the execution environment for the user's signal @@ -185,7 +190,18 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) && !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE)); - if (newstack) { + /* + * If this is a branded process, the brand may provide an alternate + * stack pointer for signal delivery: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig_stack != NULL) { + /* + * Use the stack pointer value provided by the brand, + * accounting for the 128-byte reserved region. 
+ */ + newstack = 0; + fp = BROP(p)->b_sendsig_stack(sig) - STACK_RESERVE; + } else if (newstack) { fp = (caddr_t)(SA((uintptr_t)lwp->lwp_sigaltstack.ss_sp) + SA(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN); } else { @@ -295,6 +311,8 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) kmem_free(tuc, sizeof (*tuc)); tuc = NULL; + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, (uintptr_t)uc); lwp->lwp_oldcontext = (uintptr_t)uc; if (newstack) { @@ -344,6 +362,14 @@ sendsig(int sig, k_siginfo_t *sip, void (*hdlr)()) } /* + * Allow the brand to perform additional book-keeping once the signal + * handling frame has been fully assembled: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig != NULL) { + BROP(p)->b_sendsig(sig); + } + + /* * Don't set lwp_eosys here. sendsig() is called via psig() after * lwp_eosys is handled, so setting it here would affect the next * system call. @@ -419,7 +445,17 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) newstack = sigismember(&PTOU(curproc)->u_sigonstack, sig) && !(lwp->lwp_sigaltstack.ss_flags & (SS_ONSTACK|SS_DISABLE)); - if (newstack) { + /* + * If this is a branded process, the brand may provide an alternate + * stack pointer for signal delivery: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig_stack != NULL) { + /* + * Use the stack pointer value provided by the brand: + */ + newstack = 0; + fp = BROP(p)->b_sendsig_stack(sig); + } else if (newstack) { fp = (caddr_t)(SA32((uintptr_t)lwp->lwp_sigaltstack.ss_sp) + SA32(lwp->lwp_sigaltstack.ss_size) - STACK_ALIGN32); } else if ((rp->r_ss & 0xffff) != UDS_SEL) { @@ -434,8 +470,9 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) USEGD_GETBASE(&ldt[SELTOIDX(rp->r_ss)]); else fp = (caddr_t)rp->r_sp; - } else + } else { fp = (caddr_t)rp->r_sp; + } /* * Force proper stack pointer alignment, even in the face of a @@ -516,6 +553,8 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) kmem_free(tuc, sizeof (*tuc)); tuc = 
NULL; + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, (uintptr_t)uc); lwp->lwp_oldcontext = (uintptr_t)uc; if (newstack) { @@ -565,6 +604,14 @@ sendsig32(int sig, k_siginfo_t *sip, void (*hdlr)()) } /* + * Allow the brand to perform additional book-keeping once the signal + * handling frame has been fully assembled: + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_sendsig != NULL) { + BROP(p)->b_sendsig(sig); + } + + /* * Don't set lwp_eosys here. sendsig() is called via psig() after * lwp_eosys is handled, so setting it here would affect the next * system call. diff --git a/usr/src/uts/intel/p4_pcbe/Makefile b/usr/src/uts/intel/p4_pcbe/Makefile index d7f594ed3f..730c3c4b07 100644 --- a/usr/src/uts/intel/p4_pcbe/Makefile +++ b/usr/src/uts/intel/p4_pcbe/Makefile @@ -34,7 +34,7 @@ UTSBASE = ../.. # MODULE = pcbe.GenuineIntel.15 OBJECTS = $(P4_PCBE_OBJS:%=$(OBJS_DIR)/%) -ROOTMODULE = $(USR_PCBE_DIR)/$(MODULE) +ROOTMODULE = $(ROOT_PSM_PCBE_DIR)/$(MODULE) # # Include common rules. diff --git a/usr/src/uts/intel/pcbe/core_pcbe.c b/usr/src/uts/intel/pcbe/core_pcbe.c index f2b6d07861..ad92c2f62f 100644 --- a/usr/src/uts/intel/pcbe/core_pcbe.c +++ b/usr/src/uts/intel/pcbe/core_pcbe.c @@ -819,7 +819,7 @@ core_pcbe_init(void) for (i = 0; i < num_gpc; i++) { /* - * Determine length of all supported event names + * determine length of all supported event names * (architectural + non-architectural) */ size = arch_events_string_length; diff --git a/usr/src/uts/intel/sys/machbrand.h b/usr/src/uts/intel/sys/machbrand.h index 3f9ebdb6b7..ad7f631649 100644 --- a/usr/src/uts/intel/sys/machbrand.h +++ b/usr/src/uts/intel/sys/machbrand.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ #ifndef _SYS_MACHBRAND_H @@ -32,20 +33,25 @@ extern "C" { #ifndef _ASM #include <sys/model.h> +#include <sys/thread.h> struct brand_mach_ops { void (*b_sysenter)(void); + void (*b_int80)(void); void (*b_int91)(void); void (*b_syscall)(void); void (*b_syscall32)(void); + greg_t (*b_fixsegreg)(greg_t, model_t); + uintptr_t (*b_fsbase)(klwp_t *, uintptr_t); }; #endif /* _ASM */ #define BRAND_CB_SYSENTER 0 -#define BRAND_CB_INT91 1 -#define BRAND_CB_SYSCALL 2 -#define BRAND_CB_SYSCALL32 3 +#define BRAND_CB_INT80 1 +#define BRAND_CB_INT91 2 +#define BRAND_CB_SYSCALL 3 +#define BRAND_CB_SYSCALL32 4 #ifdef __cplusplus } diff --git a/usr/src/uts/intel/sys/segments.h b/usr/src/uts/intel/sys/segments.h index 774f2fd3b8..4bf3d87378 100644 --- a/usr/src/uts/intel/sys/segments.h +++ b/usr/src/uts/intel/sys/segments.h @@ -709,6 +709,8 @@ extern void _start(), cmnint(); extern void achktrap(), mcetrap(); extern void xmtrap(); extern void fasttrap(); +extern void sys_int80(); +extern void brand_sys_int80(); extern void dtrace_ret(); /* KPTI trampolines */ @@ -724,6 +726,8 @@ extern void tr_overrun(), tr_resvtrap(); extern void tr_achktrap(), tr_mcetrap(); extern void tr_xmtrap(); extern void tr_fasttrap(); +extern void tr_sys_int80(); +extern void tr_brand_sys_int80(); extern void tr_dtrace_ret(); #if !defined(__amd64) diff --git a/usr/src/uts/intel/sys/ucontext.h b/usr/src/uts/intel/sys/ucontext.h index 66300e71a1..2d4e39b3e8 100644 --- a/usr/src/uts/intel/sys/ucontext.h +++ b/usr/src/uts/intel/sys/ucontext.h @@ -20,6 +20,7 @@ */ /* + * Copyright 2015 Joyent, Inc. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 
@@ -84,9 +85,16 @@ struct __ucontext { sigset_t uc_sigmask; stack_t uc_stack; mcontext_t uc_mcontext; - long uc_filler[5]; /* see ABI spec for Intel386 */ + /* + * The Intel386 ABI specification includes a 5-element array of longs + * called "uc_filler", padding the size of the struct to 512 bytes. To + * allow zone brands to communicate extra data right the way through + * the signal handling process, from sigacthandler to setcontext, we + * steal the first three of these longs as a brand-private member. + */ + void *uc_brand_data[3]; + long uc_filler[2]; }; - #if defined(_SYSCALL32) /* Kernel view of user ILP32 ucontext structure */ @@ -97,7 +105,8 @@ typedef struct ucontext32 { sigset_t uc_sigmask; stack32_t uc_stack; mcontext32_t uc_mcontext; - int32_t uc_filler[5]; + caddr32_t uc_brand_data[3]; + int32_t uc_filler[2]; } ucontext32_t; #if defined(_KERNEL) diff --git a/usr/src/uts/intel/syscall/getcontext.c b/usr/src/uts/intel/syscall/getcontext.c index f7c404ba72..a210448dc3 100644 --- a/usr/src/uts/intel/syscall/getcontext.c +++ b/usr/src/uts/intel/syscall/getcontext.c @@ -20,6 +20,9 @@ */ /* + * Copyright 2015 Joyent, Inc. + */ +/* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -46,6 +49,7 @@ #include <sys/schedctl.h> #include <sys/debug.h> #include <sys/sysmacros.h> +#include <sys/sdt.h> /* * Save user context. @@ -125,7 +129,23 @@ savecontext(ucontext_t *ucp, const k_sigset_t *mask) else ucp->uc_flags &= ~UC_FPU; - sigktou(mask, &ucp->uc_sigmask); + if (mask != NULL) { + /* + * Save signal mask. 
+ */ + sigktou(mask, &ucp->uc_sigmask); + } else { + ucp->uc_flags &= ~UC_SIGMASK; + bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask)); + } + + if (PROC_IS_BRANDED(p) && BROP(p)->b_savecontext != NULL) { + /* + * Allow the brand the chance to modify the context we + * saved: + */ + BROP(p)->b_savecontext(ucp); + } } /* @@ -136,7 +156,19 @@ restorecontext(ucontext_t *ucp) { kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); + proc_t *p = lwptoproc(lwp); + if (PROC_IS_BRANDED(p) && BROP(p)->b_restorecontext != NULL) { + /* + * Allow the brand the chance to modify the context before + * we restore it: + */ + BROP(p)->b_restorecontext(ucp); + } + + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, + uintptr_t, (uintptr_t)ucp->uc_link); lwp->lwp_oldcontext = (uintptr_t)ucp->uc_link; if (ucp->uc_flags & UC_STACK) { @@ -184,6 +216,7 @@ getsetcontext(int flag, void *arg) ucontext_t *ucp; klwp_t *lwp = ttolwp(curthread); stack_t dummy_stk; + proc_t *p = lwptoproc(lwp); /* * In future releases, when the ucontext structure grows, @@ -228,6 +261,15 @@ getsetcontext(int flag, void *arg) return (set_errno(EFAULT)); } + /* + * If this is a branded process, copy in the brand-private + * data: + */ + if (PROC_IS_BRANDED(p) && copyin(&ucp->uc_brand_data, + &uc.uc_brand_data, sizeof (uc.uc_brand_data)) != 0) { + return (set_errno(EFAULT)); + } + restorecontext(&uc); if ((uc.uc_flags & UC_STACK) && (lwp->lwp_ustack != 0)) @@ -311,7 +353,23 @@ savecontext32(ucontext32_t *ucp, const k_sigset_t *mask) else ucp->uc_flags &= ~UC_FPU; - sigktou(mask, &ucp->uc_sigmask); + if (mask != NULL) { + /* + * Save signal mask. 
+ */ + sigktou(mask, &ucp->uc_sigmask); + } else { + ucp->uc_flags &= ~UC_SIGMASK; + bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask)); + } + + if (PROC_IS_BRANDED(p) && BROP(p)->b_savecontext32 != NULL) { + /* + * Allow the brand the chance to modify the context we + * saved: + */ + BROP(p)->b_savecontext32(ucp); + } } int @@ -323,6 +381,7 @@ getsetcontext32(int flag, void *arg) klwp_t *lwp = ttolwp(curthread); caddr32_t ustack32; stack32_t dummy_stk32; + proc_t *p = lwptoproc(lwp); switch (flag) { default: @@ -354,6 +413,15 @@ getsetcontext32(int flag, void *arg) return (set_errno(EFAULT)); } + /* + * If this is a branded process, copy in the brand-private + * data: + */ + if (PROC_IS_BRANDED(p) && copyin(&ucp->uc_brand_data, + &uc.uc_brand_data, sizeof (uc.uc_brand_data)) != 0) { + return (set_errno(EFAULT)); + } + ucontext_32ton(&uc, &ucnat); restorecontext(&ucnat); diff --git a/usr/src/uts/intel/vmxnet/Makefile b/usr/src/uts/intel/vmxnet/Makefile new file mode 100644 index 0000000000..581aafbaa1 --- /dev/null +++ b/usr/src/uts/intel/vmxnet/Makefile @@ -0,0 +1,85 @@ +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2019 Joyent, Inc. 
+# + +# +# VMware Ethernet Adapter b module +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. +UTSCLOSED = ../../../../closed/uts + +# +# Define the module and object file sets. +# +MODULE = vmxnet +# +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/intel/io/vmxnet + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# The list of object files is defined here, rather than in Makefile.files, +# because the "$(CLOSED_BUILD)" macro has not been defined at the time +# Makefile.files is processed. +# +VMXNET_OBJS += vmxnet.o + +OBJECTS = $(VMXNET_OBJS:%=$(OBJS_DIR)/%) + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFFILE) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +CPPFLAGS += -I$(UTSBASE)/i86pc +LDFLAGS += -N misc/gld + +# needs work +SMOFF += all_func_returns + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include ../Makefile.targ diff --git a/usr/src/uts/intel/vnd/Makefile b/usr/src/uts/intel/vnd/Makefile new file mode 100644 index 0000000000..5a412ea94d --- /dev/null +++ b/usr/src/uts/intel/vnd/Makefile @@ -0,0 +1,52 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +UTSBASE = ../.. 
+ +MODULE = vnd +OBJECTS = $(VND_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) $(SRC_CONFFILE) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) +CONF_SRCDIR = $(UTSBASE)/common/io/vnd + +CPPFLAGS += -I$(UTSBASE)/i86pc +LDFLAGS += -Nmisc/neti -Nmisc/hook -Nfs/dev -Nmisc/gsqueue + +# +# We use <sys/ctype.h> which causes gcc to think that all of its inline +# functions are defined and unused. +# +CERRWARN += -_gcc=-Wno-unused-function + +# needs work +SMOFF += or_vs_and + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/zfd/Makefile b/usr/src/uts/intel/zfd/Makefile new file mode 100644 index 0000000000..4f9146510a --- /dev/null +++ b/usr/src/uts/intel/zfd/Makefile @@ -0,0 +1,40 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2014 Joyent, Inc. All rights reserved. +# +# uts/intel/zfd/Makefile + +UTSBASE = ../.. 
+ +MODULE = zfd +OBJECTS = $(ZFD_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) + +include $(UTSBASE)/intel/Makefile.intel + +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/sfmmu/vm/hat_sfmmu.c b/usr/src/uts/sfmmu/vm/hat_sfmmu.c index 76617cb130..1f0ef11096 100644 --- a/usr/src/uts/sfmmu/vm/hat_sfmmu.c +++ b/usr/src/uts/sfmmu/vm/hat_sfmmu.c @@ -86,6 +86,7 @@ #include <sys/fpu/fpusystm.h> #include <vm/mach_kpm.h> #include <sys/callb.h> +#include <sys/zone.h> #ifdef DEBUG #define SFMMU_VALIDATE_HMERID(hat, rid, saddr, len) \ @@ -934,6 +935,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = { } \ pp->p_mapping = hme; \ pp->p_share++; \ + zone_add_page(pp); \ } /* @@ -954,6 +956,7 @@ static kphysm_setup_vector_t sfmmu_update_vec = { \ ASSERT(pp->p_share > 0); \ pp->p_share--; \ + zone_rm_page(pp); \ \ if (hme->hme_prev) { \ ASSERT(pp->p_mapping != hme); \ @@ -7346,6 +7349,8 @@ retry: tpp->p_mapping = NULL; dpp->p_share = tpp->p_share; tpp->p_share = 0; + dpp->p_zoneid = tpp->p_zoneid; + tpp->p_zoneid = ALL_ZONES; while (index != 0) { index = index >> 1; diff --git a/usr/src/uts/sparc/Makefile.sparc b/usr/src/uts/sparc/Makefile.sparc index 306f84df75..210d2b512a 100644 --- a/usr/src/uts/sparc/Makefile.sparc +++ b/usr/src/uts/sparc/Makefile.sparc @@ -231,6 +231,7 @@ DRV_KMODS += nulldriver DRV_KMODS += bridge trill DRV_KMODS += bpf DRV_KMODS += dca +DRV_KMODS += inotify DRV_KMODS += eventfd DRV_KMODS += signalfd DRV_KMODS += ufm @@ -496,6 +497,7 @@ SOCKET_KMODS += sockpfp SOCKET_KMODS += socksctp SOCKET_KMODS += socksdp SOCKET_KMODS += sockrds +SOCKET_KMODS += datafilt # # kiconv modules (/kernel/kiconv): diff --git a/usr/src/uts/sparc/bpf/Makefile b/usr/src/uts/sparc/bpf/Makefile index cfc7bdfd1d..213c28a29d 100644 --- 
a/usr/src/uts/sparc/bpf/Makefile +++ b/usr/src/uts/sparc/bpf/Makefile @@ -58,8 +58,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # # CFLAGS += $(CCVERBOSE) -LDFLAGS += -Nmisc/mac -Nmisc/dls -Ndrv/ipnet -Nmisc/neti -INC_PATH += -I$(UTSBASE)/common/io/bpf +LDFLAGS += -Nmisc/mac -Nmisc/dls -Ndrv/ipnet -Nmisc/neti -Ndrv/ip # # For now, disable these warnings; maintainers should endeavor diff --git a/usr/src/uts/sparc/datafilt/Makefile b/usr/src/uts/sparc/datafilt/Makefile new file mode 100644 index 0000000000..4560585d42 --- /dev/null +++ b/usr/src/uts/sparc/datafilt/Makefile @@ -0,0 +1,62 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2011, OmniTI Computer Consulting, Inc. All rights reserved. +# Copyright 2012, Nexenta Systems, Inc. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = datafilt +OBJECTS = $(DATAFILT_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -Nfs/sockfs -Ndrv/ip + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/sparc/Makefile.targ diff --git a/usr/src/uts/sparc/dld/Makefile b/usr/src/uts/sparc/dld/Makefile index ed46cee2f8..7d4c31d3de 100644 --- a/usr/src/uts/sparc/dld/Makefile +++ b/usr/src/uts/sparc/dld/Makefile @@ -55,7 +55,6 @@ CFLAGS += $(CCVERBOSE) $(RELEASE_BUILD)CFLAGS += -xinline=auto -xcrossfile $(RELEASE_BUILD)COPTIMIZE = -xO5 LDFLAGS += -N misc/dls -N misc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # For now, disable these warnings; maintainers should endeavor diff --git a/usr/src/uts/sparc/dls/Makefile b/usr/src/uts/sparc/dls/Makefile index 5fb9c5e27f..055906e0c8 100644 --- a/usr/src/uts/sparc/dls/Makefile +++ b/usr/src/uts/sparc/dls/Makefile @@ -53,7 +53,6 @@ CFLAGS += $(CCVERBOSE) $(RELEASE_BUILD)CFLAGS += -xinline=auto -xcrossfile $(RELEASE_BUILD)COPTIMIZE = -xO5 LDFLAGS += -N misc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # For now, disable these warnings; maintainers should endeavor diff --git a/usr/src/uts/sparc/icmp/Makefile b/usr/src/uts/sparc/icmp/Makefile index 64640cafe7..b36d51cda7 100644 --- a/usr/src/uts/sparc/icmp/Makefile +++ b/usr/src/uts/sparc/icmp/Makefile @@ -22,6 +22,7 @@ # uts/sparc/icmp/Makefile # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2016 Joyent, Inc. # # This makefile drives the production of the icmp IP driver # diff --git a/usr/src/uts/sparc/inotify/Makefile b/usr/src/uts/sparc/inotify/Makefile new file mode 100644 index 0000000000..89617b71dc --- /dev/null +++ b/usr/src/uts/sparc/inotify/Makefile @@ -0,0 +1,61 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +# +# Copyright (c) 2014 Joyent, Inc. All rights reserved. +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = inotify +OBJECTS = $(INOTIFY_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) +CONF_SRCDIR = $(UTSBASE)/common/io + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +CERRWARN += -_gcc=-Wno-parentheses +LDFLAGS += -Nfs/specfs + +# +# Define targets +# +ALL_TARGET = $(BINARY) $(SRC_CONFFILE) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/sparc/Makefile.targ diff --git a/usr/src/uts/sparc/ipf/ipf.global-objs.debug64 b/usr/src/uts/sparc/ipf/ipf.global-objs.debug64 index 663613cee3..b42dca618a 100644 --- a/usr/src/uts/sparc/ipf/ipf.global-objs.debug64 +++ b/usr/src/uts/sparc/ipf/ipf.global-objs.debug64 @@ -22,9 +22,25 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# Copyright 2013 Joyent, Inc. All rights reserved +# Copyright 2019 Joyent, Inc. 
# +cfw_evdrops +cfw_evreports +cfw_ring +cfw_ringcv +cfw_ringend +cfw_ringfull +cfw_ringlock +cfw_ringmask +cfw_ringsize +cfw_ringstart +cfw_timeout_tries +cfw_timeout_wait +hook4_vnd_in +hook4_vnd_out +hook6_vnd_in +hook6_vnd_out fr_availfuncs fr_features fr_objbytes @@ -56,6 +72,7 @@ icmptoicmp6unreach idletime_tab ip6exthdr ipf_cb_ops +ipf_cfwlog_enabled ipf_dev_info ipf_devfiles ipf_kstat_tmp diff --git a/usr/src/uts/sparc/mac_ether/Makefile b/usr/src/uts/sparc/mac_ether/Makefile index 7d2413781e..889da099ea 100644 --- a/usr/src/uts/sparc/mac_ether/Makefile +++ b/usr/src/uts/sparc/mac_ether/Makefile @@ -54,7 +54,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # CFLAGS += $(CCVERBOSE) LDFLAGS += -N misc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # Default build targets. diff --git a/usr/src/uts/sparc/mac_ib/Makefile b/usr/src/uts/sparc/mac_ib/Makefile index c44e192745..84855bf749 100644 --- a/usr/src/uts/sparc/mac_ib/Makefile +++ b/usr/src/uts/sparc/mac_ib/Makefile @@ -54,7 +54,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # CFLAGS += $(CCVERBOSE) LDFLAGS += -N misc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # Default build targets. diff --git a/usr/src/uts/sparc/mac_wifi/Makefile b/usr/src/uts/sparc/mac_wifi/Makefile index 7c3b74a79b..02d03549ac 100644 --- a/usr/src/uts/sparc/mac_wifi/Makefile +++ b/usr/src/uts/sparc/mac_wifi/Makefile @@ -56,7 +56,6 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # CFLAGS += $(CCVERBOSE) LDFLAGS += -Nmisc/mac -INC_PATH += -I$(UTSBASE)/common/io/bpf # # Default build targets. diff --git a/usr/src/uts/sparc/syscall/getcontext.c b/usr/src/uts/sparc/syscall/getcontext.c index aa933e5d23..3cf07718ad 100644 --- a/usr/src/uts/sparc/syscall/getcontext.c +++ b/usr/src/uts/sparc/syscall/getcontext.c @@ -20,6 +20,9 @@ */ /* + * Copyright 2015 Joyent, Inc. + */ +/* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -110,10 +113,15 @@ savecontext(ucontext_t *ucp, const k_sigset_t *mask) ucp->uc_flags &= ~UC_FPU; ucp->uc_mcontext.gwins = (gwindows_t *)NULL; - /* - * Save signal mask. - */ - sigktou(mask, &ucp->uc_sigmask); + if (mask != NULL) { + /* + * Save signal mask. + */ + sigktou(mask, &ucp->uc_sigmask); + } else { + ucp->uc_flags &= ~UC_SIGMASK; + bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask)); + } } @@ -412,11 +420,16 @@ savecontext32(ucontext32_t *ucp, const k_sigset_t *mask, struct fq32 *dfq) ucp->uc_flags &= ~UC_FPU; ucp->uc_mcontext.gwins = (caddr32_t)(uintptr_t)NULL; - /* - * Save signal mask (the 32- and 64-bit sigset_t structures are - * identical). - */ - sigktou(mask, (sigset_t *)&ucp->uc_sigmask); + if (mask != NULL) { + /* + * Save signal mask (the 32- and 64-bit sigset_t structures are + * identical). + */ + sigktou(mask, (sigset_t *)&ucp->uc_sigmask); + } else { + ucp->uc_flags &= ~UC_SIGMASK; + bzero(&ucp->uc_sigmask, sizeof (ucp->uc_sigmask)); + } } int diff --git a/usr/src/uts/sparc/zfd/Makefile b/usr/src/uts/sparc/zfd/Makefile new file mode 100644 index 0000000000..6371399988 --- /dev/null +++ b/usr/src/uts/sparc/zfd/Makefile @@ -0,0 +1,42 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2014 Joyent, Inc. All rights reserved. +# +# uts/intel/zfd/Makefile + +UTSBASE = ../.. 
+ +MODULE = zfd +OBJECTS = $(ZFD_OBJS:%=$(OBJS_DIR)/%) +ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) + +include $(UTSBASE)/sparc/Makefile.sparc + +ALL_TARGET = $(BINARY) +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +CFLAGS += $(CCVERBOSE) + +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +install: $(INSTALL_DEPS) + +include $(UTSBASE)/sparc/Makefile.targ diff --git a/usr/src/uts/sun4u/io/pci/pcisch.c b/usr/src/uts/sun4u/io/pci/pcisch.c index 5d2981b6c8..a8ffef185c 100644 --- a/usr/src/uts/sun4u/io/pci/pcisch.c +++ b/usr/src/uts/sun4u/io/pci/pcisch.c @@ -2973,11 +2973,12 @@ iommu_tlb_scrub(iommu_t *iommu_p, int scrub) "\tContext=%lx %sWritable %sStreamable\n" "\tPCI Page Size=%sk Address in page %lx\n", ddi_driver_name(dip), ddi_get_instance(dip), errstat, i, - (tag & TLBTAG_CONTEXT_BITS) >> TLBTAG_CONTEXT_SHIFT, + (uint64_t)(tag & TLBTAG_CONTEXT_BITS) >> + TLBTAG_CONTEXT_SHIFT, (tag & TLBTAG_WRITABLE_BIT) ? "" : neg, (tag & TLBTAG_STREAM_BIT) ? "" : neg, (tag & TLBTAG_PGSIZE_BIT) ? "64" : "8", - (tag & TLBTAG_PCIVPN_BITS) << 13); + (uint64_t)(tag & TLBTAG_PCIVPN_BITS) << 13); cmn_err(CE_CONT, "Memory: %sValid %sCacheable Page Frame=%lx\n", (data & TLBDATA_VALID_BIT) ? "" : neg, (data & TLBDATA_CACHE_BIT) ? "" : neg, pfn); |