Diffstat (limited to 'usr/src/uts/common')
412 files changed, 92941 insertions, 2158 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 8477310f0a..055a038c24 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -24,6 +24,7 @@ # Copyright (c) 2013 by Delphix. All rights reserved. # Copyright (c) 2013 by Saso Kiselkov. All rights reserved. # Copyright 2014 Nexenta Systems, Inc. All rights reserved. +# Copyright 2015, Joyent, Inc. # # @@ -280,6 +281,7 @@ GENUNIX_OBJS += \ rctl.o \ rctlsys.o \ readlink.o \ + refhash.o \ refstr.o \ rename.o \ resolvepath.o \ @@ -427,6 +429,8 @@ PROFILE_OBJS += profile.o SYSTRACE_OBJS += systrace.o +LX_SYSTRACE_OBJS += lx_systrace.o + LOCKSTAT_OBJS += lockstat.o FASTTRAP_OBJS += fasttrap.o fasttrap_isa.o @@ -491,6 +495,12 @@ PTSL_OBJS += tty_pts.o PTM_OBJS += ptm.o +LX_PTM_OBJS += lx_ptm.o + +LX_AUDIO_OBJS += lx_audio.o + +LX_NETLINK_OBJS += lx_netlink.o + MII_OBJS += mii.o mii_cicada.o mii_natsemi.o mii_intel.o mii_qualsemi.o \ mii_marvell.o mii_realtek.o mii_other.o @@ -544,6 +554,7 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o \ sctp_misc.o IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o +IP_COMM_OBJS = inet_hash.o IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ @@ -559,7 +570,8 @@ IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ $(IP_TCP_OBJS) \ $(IP_UDP_OBJS) \ $(IP_SCTP_OBJS) \ - $(IP_ILB_OBJS) + $(IP_ILB_OBJS) \ + $(IP_COMM_OBJS) IP6_OBJS += ip6ddi.o @@ -577,6 +589,8 @@ IPSECESP_OBJS += ipsecespddi.o ipsecesp.o IPSECAH_OBJS += ipsecahddi.o ipsecah.o sadb.o +DATAFILT_OBJS += datafilt.o + SPPP_OBJS += sppp.o sppp_dlpi.o sppp_mod.o s_common.o SPPPTUN_OBJS += sppptun.o sppptun_mod.o @@ -674,6 +688,15 @@ NET80211_OBJS += net80211.o net80211_proto.o net80211_input.o \ VNIC_OBJS += vnic_ctl.o vnic_dev.o +OVERLAY_OBJS += overlay.o overlay_fm.o overlay_mux.o overlay_plugin.o \ + overlay_prop.o overlay_target.o + +OVERLAY_VXLAN_OBJS += overlay_vxlan.o + +VND_OBJS += vnd.o frameio.o + +GSQUEUE_OBJS += gsqueue.o + SIMNET_OBJS += simnet.o IB_OBJS += ibnex.o ibnex_ioctl.o ibnex_hca.o @@ -986,8 +1009,12 @@ DEVPOLL_OBJS += devpoll.o DEVPOOL_OBJS += devpool.o +EVENTFD_OBJS += eventfd.o + I8042_OBJS += i8042.o +INOTIFY_OBJS += inotify.o + KB8042_OBJS += \ at_keyprocess.o \ kb8042.o \ @@ -1025,6 +1052,8 @@ GEN_DRV_OBJS += gen_drv.o TCLIENT_OBJS += tclient.o +TIMERFD_OBJS += timerfd.o + TPHCI_OBJS += tphci.o TVHCI_OBJS += tvhci.o @@ -1060,6 +1089,8 @@ QLGE_OBJS += qlge.o qlge_dbg.o qlge_flash.o qlge_fm.o qlge_gld.o qlge_mpi.o ZCONS_OBJS += zcons.o +ZFD_OBJS += zfd.o + NV_SATA_OBJS += nv_sata.o SI3124_OBJS += si3124.o @@ -1122,8 +1153,7 @@ DEVFS_OBJS += devfs_subr.o devfs_vfsops.o devfs_vnops.o DEV_OBJS += sdev_subr.o sdev_vfsops.o sdev_vnops.o \ sdev_ptsops.o sdev_zvolops.o sdev_comm.o \ sdev_profile.o sdev_ncache.o sdev_netops.o \ - sdev_ipnetops.o \ - sdev_vtops.o + sdev_ipnetops.o sdev_vtops.o sdev_plugin.o CTFS_OBJS += ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \ ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o @@ -1140,8 +1170,13 @@ PIPE_OBJS += pipe.o HSFS_OBJS += hsfs_node.o hsfs_subr.o hsfs_vfsops.o hsfs_vnops.o \ hsfs_susp.o hsfs_rrip.o hsfs_susp_subr.o +HYPRLOFS_OBJS += hyprlofs_dir.o hyprlofs_subr.o \ + hyprlofs_vnops.o hyprlofs_vfsops.o + LOFS_OBJS += lofs_subr.o lofs_vfsops.o lofs_vnops.o +LXPROC_OBJS += lxpr_subr.o lxpr_vfsops.o lxpr_vnops.o + NAMEFS_OBJS += namevfs.o namevno.o NFS_OBJS += 
nfs_client.o nfs_common.o nfs_dump.o \ @@ -1249,8 +1284,8 @@ SMBSRV_OBJS += $(SMBSRV_SHARED_OBJS) \ PCFS_OBJS += pc_alloc.o pc_dir.o pc_node.o pc_subr.o \ pc_vfsops.o pc_vnops.o -PROC_OBJS += prcontrol.o prioctl.o prsubr.o prusrio.o \ - prvfsops.o prvnops.o +PROC_OBJS += prargv.o prcontrol.o prioctl.o prsubr.o \ + prusrio.o prvfsops.o prvnops.o MNTFS_OBJS += mntvfsops.o mntvnops.o @@ -1294,6 +1329,7 @@ SMBFS_OBJS += smbfs_vfsops.o smbfs_vnops.o smbfs_node.o \ smbfs_rwlock.o smbfs_xattr.o \ $(SMBFS_COMMON_OBJS) +BOOTFS_OBJS += bootfs_construct.o bootfs_vfsops.o bootfs_vnops.o # # LVM modules @@ -1388,6 +1424,7 @@ ZFS_COMMON_OBJS += \ zfs_fuid.o \ zfs_sa.o \ zfs_znode.o \ + zfs_zone.o \ zil.o \ zio.o \ zio_checksum.o \ @@ -1834,7 +1871,7 @@ ZYD_OBJS += zyd.o zyd_usb.o zyd_hw.o zyd_fw.o MXFE_OBJS += mxfe.o -MPTSAS_OBJS += mptsas.o mptsas_hash.o mptsas_impl.o mptsas_init.o \ +MPTSAS_OBJS += mptsas.o mptsas_impl.o mptsas_init.o \ mptsas_raid.o mptsas_smhba.o SFE_OBJS += sfe.o sfe_util.o @@ -2027,6 +2064,11 @@ MEGA_SAS_OBJS = megaraid_sas.o MR_SAS_OBJS = ld_pd_map.o mr_sas.o mr_sas_tbolt.o mr_sas_list.o # +# DR_SAS module +# +DR_SAS_OBJS = dr_sas.o + +# # CPQARY3 module # CPQARY3_OBJS = cpqary3.o cpqary3_noe.o cpqary3_talk2ctlr.o \ @@ -2065,6 +2107,20 @@ NULLDRIVER_OBJS = nulldriver.o TPM_OBJS = tpm.o tpm_hcall.o # +# USB Fast ethernet drivers +# +USBGEM_OBJS = usbgem.o +AXF_OBJS = axf_usbgem.o +UDMF_OBJS = udmf_usbgem.o +URF_OBJS = urf_usbgem.o +UPF_OBJS = upf_usbgem.o + +# +# NFP objects +# +NFP_OBJS = hostif.o osif.o drvlist.o i21555.o i21285.o i21555d.o + +# # BNXE objects # BNXE_OBJS += bnxe_cfg.o \ diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index c5e728fdff..e452aeb830 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -23,6 +23,7 @@ # Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2013 Garrett D'Amore <garrett@damore.org> # Copyright 2014 Nexenta Systems, Inc. All rights reserved. +# Copyright 2015 Joyent, Inc. 
# # @@ -95,6 +96,10 @@ $(OBJS_DIR)/%.o: $(COMMONBASE)/avl/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/inet/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(COMMONBASE)/ucode/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -103,6 +108,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/sn1/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/sngl/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/brand/solaris10/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -207,6 +216,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/autofs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/bootfs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/cachefs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -243,10 +256,18 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hsfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/hyprlofs/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lofs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/lxproc/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/fs/mntfs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -758,6 +779,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/drm/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/dr_sas/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/efe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -986,6 +1011,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/net80211/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nfp/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nge/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1001,6 +1030,14 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/npi/%.c $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/nxge/%.s $(COMPILE.s) -o $@ $< +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/overlay/plugins/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/pci-ide/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1137,6 +1174,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sdcard/targets/sdcard/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/gsqueue/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/sfe/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1149,6 +1190,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/softmac/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vnd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/uath/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1273,6 +1318,30 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/usb/usba10/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/usbgem/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/axf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/udf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/udmf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/upf/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/urf/%.c + $(COMPILE.c) -o $@ $< + 
$(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vuidmice/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1445,6 +1514,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/vioblk/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(COMMONBASE)/idspace/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + # # krtld must refer to its own bzero/bcopy until the kernel is fully linked # @@ -1513,6 +1586,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/pcmcia/pcs/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/refhash/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/rpc/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1628,12 +1705,18 @@ $(LINTS_DIR)/%.ln: $(COMMONBASE)/acl/%.c $(LINTS_DIR)/%.ln: $(COMMONBASE)/avl/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(COMMONBASE)/inet/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(COMMONBASE)/ucode/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) $(LINTS_DIR)/%.ln: $(UTSBASE)/common/brand/sn1/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/brand/sngl/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/brand/solaris10/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -1736,6 +1819,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/autofs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/bootfs/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/cachefs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -1763,9 +1849,15 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/fifofs/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hsfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/hyprlofs/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lofs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/lxproc/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/fs/mntfs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2111,6 +2203,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dmfe/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/drm/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/dr_sas/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/efe/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2282,6 +2377,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/mwl/mwl_fw/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/net80211/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nfp/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nge/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2294,6 +2392,12 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/%.s $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/nxge/npi/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/overlay/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/overlay/plugins/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/pci-ide/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2501,6 +2605,21 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/usb/usba/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/usb/usba10/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/usbgem/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/axf/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/udmf/%.c + 
@($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/upf/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/urf/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/vuidmice/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2648,6 +2767,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/pcmcia/nexus/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/pcmcia/pcs/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/refhash/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/rpc/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -2726,3 +2848,6 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/skd/%.c $(LINTS_DIR)/%.ln: $(COMMONBASE)/fsreparse/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) + +$(LINTS_DIR)/%.ln: $(COMMONBASE)/idspace/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/brand/lx/autofs/lx_autofs.c b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c new file mode 100644 index 0000000000..d2bb03c118 --- /dev/null +++ b/usr/src/uts/common/brand/lx/autofs/lx_autofs.c @@ -0,0 +1,1569 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <fs/fs_subr.h> +#include <sys/atomic.h> +#include <sys/cmn_err.h> +#include <sys/dirent.h> +#include <sys/fs/fifonode.h> +#include <sys/modctl.h> +#include <sys/mount.h> +#include <sys/policy.h> +#include <sys/sunddi.h> + +#include <sys/sysmacros.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> + +#include <sys/lx_autofs_impl.h> + +/* + * External functions + */ +extern uintptr_t space_fetch(char *key); +extern int space_store(char *key, uintptr_t ptr); + +/* + * Globals + */ +static vfsops_t *lx_autofs_vfsops; +static vnodeops_t *lx_autofs_vn_ops = NULL; +static int lx_autofs_fstype; +static major_t lx_autofs_major; +static minor_t lx_autofs_minor = 0; + +/* + * Support functions + */ +static void +i_strfree(char *str) +{ + kmem_free(str, strlen(str) + 1); +} + +static char * +i_strdup(char *str) +{ + int n = strlen(str); + char *ptr = kmem_alloc(n + 1, KM_SLEEP); + bcopy(str, ptr, n + 1); + return (ptr); +} + +static int +i_str_to_int(char *str, int *val) +{ + long res; + + if (str == NULL) + return (-1); + + if ((ddi_strtol(str, NULL, 10, &res) != 0) || + (res < INT_MIN) || (res > INT_MAX)) + return (-1); + + *val = res; + return (0); +} + +static void +i_stack_init(list_t *lp) +{ + list_create(lp, + sizeof (stack_elem_t), offsetof(stack_elem_t, se_list)); +} + +static void +i_stack_fini(list_t *lp) +{ + ASSERT(list_head(lp) == NULL); + list_destroy(lp); +} + +static void +i_stack_push(list_t *lp, caddr_t ptr1, caddr_t ptr2, caddr_t ptr3) +{ + stack_elem_t *se; + + se = kmem_alloc(sizeof (*se), KM_SLEEP); + se->se_ptr1 = ptr1; + se->se_ptr2 = ptr2; + se->se_ptr3 = ptr3; + list_insert_head(lp, se); +} + +static int +i_stack_pop(list_t *lp, caddr_t *ptr1, caddr_t *ptr2, caddr_t *ptr3) +{ + stack_elem_t *se; + + if ((se = list_head(lp)) == NULL) + return (-1); + list_remove(lp, se); + if (ptr1 != NULL) + *ptr1 = se->se_ptr1; + if (ptr2 != NULL) + *ptr2 = se->se_ptr2; + if (ptr3 != NULL) + *ptr3 = se->se_ptr3; + kmem_free(se, sizeof (*se)); + return (0); +} + +static vnode_t * +fifo_peer_vp(vnode_t *vp) +{ + fifonode_t *fnp = VTOF(vp); + fifonode_t *fn_dest = fnp->fn_dest; + return (FTOV(fn_dest)); +} + +static vnode_t * +i_vn_alloc(vfs_t *vfsp, vnode_t *uvp) +{ + lx_autofs_vfs_t *data = vfsp->vfs_data; + vnode_t *vp, *vp_old; + + /* Allocate a new vnode structure in case we need it. */ + vp = vn_alloc(KM_SLEEP); + vn_setops(vp, lx_autofs_vn_ops); + VN_SET_VFS_TYPE_DEV(vp, vfsp, uvp->v_type, uvp->v_rdev); + vp->v_data = uvp; + ASSERT(vp->v_count == 1); + + /* + * Take a hold on the vfs structure. This is how unmount will + * determine if there are any active vnodes in the file system. + */ + VFS_HOLD(vfsp); + + /* + * Check if we already have a vnode allocated for this underlying + * vnode_t. + */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t *)&vp_old) != 0) { + + /* + * Didn't find an existing node. + * Add this node to the hash and return. + */ + VERIFY(mod_hash_insert(data->lav_vn_hash, + (mod_hash_key_t)uvp, + (mod_hash_val_t)vp) == 0); + mutex_exit(&data->lav_lock); + return (vp); + } + + /* Get a hold on the existing vnode and free up the one we allocated. */ + VN_HOLD(vp_old); + mutex_exit(&data->lav_lock); + + /* Free up the new vnode we allocated. 
*/ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); + + return (vp_old); +} + +static void +i_vn_free(vnode_t *vp) +{ + vfs_t *vfsp = vp->v_vfsp; + lx_autofs_vfs_t *data = vfsp->vfs_data; + vnode_t *uvp = vp->v_data; + vnode_t *vp_tmp; + + ASSERT(MUTEX_HELD((&data->lav_lock))); + ASSERT(MUTEX_HELD((&vp->v_lock))); + + ASSERT(vp->v_count == 0); + + /* We're about to free this vnode so take it out of the hash. */ + (void) mod_hash_remove(data->lav_vn_hash, + (mod_hash_key_t)uvp, (mod_hash_val_t)&vp_tmp); + + /* + * No one else can lookup this vnode any more so there's no need + * to hold locks. + */ + mutex_exit(&data->lav_lock); + mutex_exit(&vp->v_lock); + + /* Release the underlying vnode. */ + VN_RELE(uvp); + VFS_RELE(vfsp); + vn_invalid(vp); + vn_free(vp); +} + +static lx_autofs_lookup_req_t * +i_lalr_alloc(lx_autofs_vfs_t *data, int *dup_request, char *nm) +{ + lx_autofs_lookup_req_t *lalr, *lalr_dup; + + /* Pre-allocate a new automounter request before grabbing locks. */ + lalr = kmem_zalloc(sizeof (*lalr), KM_SLEEP); + mutex_init(&lalr->lalr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&lalr->lalr_cv, NULL, CV_DEFAULT, NULL); + lalr->lalr_ref = 1; + lalr->lalr_pkt.lap_protover = LX_AUTOFS_PROTO_VERSION; + + /* Assign a unique id for this request. */ + lalr->lalr_pkt.lap_id = id_alloc(data->lav_ids); + + /* + * The token expected by the linux automount is the name of + * the directory entry to look up. (And not the entire + * path that is being accessed.) + */ + lalr->lalr_pkt.lap_name_len = strlen(nm); + if (lalr->lalr_pkt.lap_name_len > + (sizeof (lalr->lalr_pkt.lap_name) - 1)) { + zcmn_err(getzoneid(), CE_NOTE, + "invalid autofs lookup: \"%s\"", nm); + id_free(data->lav_ids, lalr->lalr_pkt.lap_id); + kmem_free(lalr, sizeof (*lalr)); + return (NULL); + } + (void) strlcpy(lalr->lalr_pkt.lap_name, nm, + sizeof (lalr->lalr_pkt.lap_name)); + + /* Check for an outstanding request for this path. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_path_hash, + (mod_hash_key_t)nm, (mod_hash_val_t *)&lalr_dup) == 0) { + /* + * There's already an outstanding request for this + * path so we don't need a new one. + */ + id_free(data->lav_ids, lalr->lalr_pkt.lap_id); + kmem_free(lalr, sizeof (*lalr)); + lalr = lalr_dup; + + /* Bump the ref count on the old request. */ + atomic_add_int(&lalr->lalr_ref, 1); + + *dup_request = 1; + } else { + /* Add it to the hashes. */ + VERIFY(mod_hash_insert(data->lav_id_hash, + (mod_hash_key_t)(uintptr_t)lalr->lalr_pkt.lap_id, + (mod_hash_val_t)lalr) == 0); + VERIFY(mod_hash_insert(data->lav_path_hash, + (mod_hash_key_t)i_strdup(nm), + (mod_hash_val_t)lalr) == 0); + + *dup_request = 0; + } + mutex_exit(&data->lav_lock); + + return (lalr); +} + +static lx_autofs_lookup_req_t * +i_lalr_find(lx_autofs_vfs_t *data, int id) +{ + lx_autofs_lookup_req_t *lalr; + + /* Check for an outstanding request for this id. */ + mutex_enter(&data->lav_lock); + if (mod_hash_find(data->lav_id_hash, (mod_hash_key_t)(uintptr_t)id, + (mod_hash_val_t *)&lalr) != 0) { + mutex_exit(&data->lav_lock); + return (NULL); + } + atomic_add_int(&lalr->lalr_ref, 1); + mutex_exit(&data->lav_lock); + return (lalr); +} + +static void +i_lalr_complete(lx_autofs_vfs_t *data, lx_autofs_lookup_req_t *lalr) +{ + lx_autofs_lookup_req_t *lalr_tmp; + + /* Remove this request from the hashes so no one can look it up. 
*/
+	mutex_enter(&data->lav_lock);
+	(void) mod_hash_remove(data->lav_id_hash,
+	    (mod_hash_key_t)(uintptr_t)lalr->lalr_pkt.lap_id,
+	    (mod_hash_val_t)&lalr_tmp);
+	(void) mod_hash_remove(data->lav_path_hash,
+	    (mod_hash_key_t)lalr->lalr_pkt.lap_name,
+	    (mod_hash_val_t)&lalr_tmp);
+	mutex_exit(&data->lav_lock);
+
+	/* Mark this request as complete and wake up anyone waiting on it. */
+	mutex_enter(&lalr->lalr_lock);
+	lalr->lalr_complete = 1;
+	cv_broadcast(&lalr->lalr_cv);
+	mutex_exit(&lalr->lalr_lock);
+}
+
+static void
+i_lalr_release(lx_autofs_vfs_t *data, lx_autofs_lookup_req_t *lalr)
+{
+	ASSERT(!MUTEX_HELD(&lalr->lalr_lock));
+	if (atomic_add_int_nv(&lalr->lalr_ref, -1) > 0)
+		return;
+	ASSERT(lalr->lalr_ref == 0);
+	id_free(data->lav_ids, lalr->lalr_pkt.lap_id);
+	kmem_free(lalr, sizeof (*lalr));
+}
+
+static void
+i_lalr_abort(lx_autofs_vfs_t *data, lx_autofs_lookup_req_t *lalr)
+{
+	lx_autofs_lookup_req_t *lalr_tmp;
+
+	/*
+	 * This is a little tricky. We're aborting the wait for this
+	 * request. So if anyone else is waiting for this request we
+	 * can't free it, but if no one else is waiting for the request
+	 * we should free it.
+	 */
+	mutex_enter(&data->lav_lock);
+	if (atomic_add_int_nv(&lalr->lalr_ref, -1) > 0) {
+		mutex_exit(&data->lav_lock);
+		return;
+	}
+	ASSERT(lalr->lalr_ref == 0);
+
+	/* Remove this request from the hashes so no one can look it up. */
+	(void) mod_hash_remove(data->lav_id_hash,
+	    (mod_hash_key_t)(uintptr_t)lalr->lalr_pkt.lap_id,
+	    (mod_hash_val_t)&lalr_tmp);
+	(void) mod_hash_remove(data->lav_path_hash,
+	    (mod_hash_key_t)lalr->lalr_pkt.lap_name,
+	    (mod_hash_val_t)&lalr_tmp);
+	mutex_exit(&data->lav_lock);
+
+	/* It's ok to free this now because the ref count was zero. */
+	id_free(data->lav_ids, lalr->lalr_pkt.lap_id);
+	kmem_free(lalr, sizeof (*lalr));
+}
+
+static int
+i_fifo_lookup(pid_t pgrp, int fd, file_t **fpp_wr, file_t **fpp_rd)
+{
+	proc_t *prp;
+	uf_info_t *fip;
+	uf_entry_t *ufp_wr, *ufp_rd = NULL;
+	file_t *fp_wr, *fp_rd = NULL;
+	vnode_t *vp_wr, *vp_rd;
+	int i;
+
+	/*
+	 * sprlock() is zone aware, so assuming this mount call was
+	 * initiated by a process in a zone, if it tries to specify
+	 * a pgrp outside of its zone this call will fail.
+	 *
+	 * Also, we want to grab hold of the main automounter process
+	 * and it's going to be the group leader for pgrp, so its
+	 * pid will be equal to pgrp.
+	 */
+	prp = sprlock(pgrp);
+	if (prp == NULL)
+		return (-1);
+	mutex_exit(&prp->p_lock);
+
+	/* Now we want to access the process's open file descriptors. */
+	fip = P_FINFO(prp);
+	mutex_enter(&fip->fi_lock);
+
+	/* Sanity check fifo write fd. */
+	if (fd >= fip->fi_nfiles) {
+		mutex_exit(&fip->fi_lock);
+		mutex_enter(&prp->p_lock);
+		sprunlock(prp);
+		return (-1);
+	}
+
+	/* Get a pointer to the write fifo. */
+	UF_ENTER(ufp_wr, fip, fd);
+	if (((fp_wr = ufp_wr->uf_file) == NULL) ||
+	    ((vp_wr = fp_wr->f_vnode) == NULL) || (vp_wr->v_type != VFIFO)) {
+		/* Invalid fifo fd. */
+		UF_EXIT(ufp_wr);
+		mutex_exit(&fip->fi_lock);
+		mutex_enter(&prp->p_lock);
+		sprunlock(prp);
+		return (-1);
+	}
+
+	/*
+	 * Now we need to find the read end of the fifo (for reasons
+	 * explained below.) We assume that the read end of the fifo
+	 * is in the same process as the write end.
+	 */
+	vp_rd = fifo_peer_vp(fp_wr->f_vnode);
+	for (i = 0; i < fip->fi_nfiles; i++) {
+		UF_ENTER(ufp_rd, fip, i);
+		if (((fp_rd = ufp_rd->uf_file) != NULL) &&
+		    (fp_rd->f_vnode == vp_rd))
+			break;
+		UF_EXIT(ufp_rd);
+	}
+	if (i == fip->fi_nfiles) {
+		/* Didn't find it. */
+		UF_EXIT(ufp_wr);
+		mutex_exit(&fip->fi_lock);
+		mutex_enter(&prp->p_lock);
+		sprunlock(prp);
+		return (-1);
+	}
+
+	/*
+	 * We need to drop fi_lock before we can try to acquire f_tlock.
+	 * The good news is that the file pointers are protected because
+	 * we're still holding uf_lock.
+	 */
+	mutex_exit(&fip->fi_lock);
+
+	/*
+	 * Here we bump the open counts on the fifos. The reason
+	 * that we do this is because when we go to write to the
+	 * fifo we want to ensure that they are actually open (and
+	 * not in the process of being closed) without having to
+	 * stop the automounter. (If the write end of the fifo
+	 * were closed and we tried to write to it we would panic.
+	 * If the read end of the fifo was closed and we tried to
+	 * write to the other end, the process that invoked the
+	 * lookup operation would get an unexpected SIGPIPE.)
+	 */
+	mutex_enter(&fp_wr->f_tlock);
+	fp_wr->f_count++;
+	ASSERT(fp_wr->f_count >= 2);
+	mutex_exit(&fp_wr->f_tlock);
+
+	mutex_enter(&fp_rd->f_tlock);
+	fp_rd->f_count++;
+	ASSERT(fp_rd->f_count >= 2);
+	mutex_exit(&fp_rd->f_tlock);
+
+	/* Release all our locks. */
+	UF_EXIT(ufp_wr);
+	UF_EXIT(ufp_rd);
+	mutex_enter(&prp->p_lock);
+	sprunlock(prp);
+
+	/* Return the file pointers. */
+	*fpp_rd = fp_rd;
+	*fpp_wr = fp_wr;
+	return (0);
+}
+
+/*ARGSUSED*/
+static uint_t
+i_fifo_close_cb(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+{
+	int *id = (int *)arg;
+	/* Return the key and terminate the walk. */
+	*id = (uintptr_t)key;
+	return (MH_WALK_TERMINATE);
+}
+
+static void
+i_fifo_close(lx_autofs_vfs_t *data)
+{
+	/*
+	 * Close the fifo to prevent any future requests from
+	 * getting sent to the automounter.
+	 */
+	mutex_enter(&data->lav_lock);
+	if (data->lav_fifo_wr != NULL) {
+		(void) closef(data->lav_fifo_wr);
+		data->lav_fifo_wr = NULL;
+	}
+	if (data->lav_fifo_rd != NULL) {
+		(void) closef(data->lav_fifo_rd);
+		data->lav_fifo_rd = NULL;
+	}
+	mutex_exit(&data->lav_lock);
+
+	/*
+	 * Wake up any threads currently waiting for the automounter.
+	 * Note that it's possible for multiple threads to have entered
+	 * this function and to be doing the work below simultaneously.
+	 */
+	for (;;) {
+		lx_autofs_lookup_req_t *lalr;
+		int id;
+
+		/* Lookup the first entry in the hash. */
+		id = -1;
+		mod_hash_walk(data->lav_id_hash,
+		    i_fifo_close_cb, &id);
+		if (id == -1) {
+			/* No more id's in the hash. */
+			break;
+		}
+		if ((lalr = i_lalr_find(data, id)) == NULL) {
+			/* Someone else beat us to it. */
+			continue;
+		}
+
+		/* Mark the request as complete and release it. */
+		i_lalr_complete(data, lalr);
+		i_lalr_release(data, lalr);
+	}
+}
+
+static int
+i_fifo_verify_rd(lx_autofs_vfs_t *data)
+{
+	proc_t *prp;
+	uf_info_t *fip;
+	uf_entry_t *ufp_rd = NULL;
+	file_t *fp_rd = NULL;
+	vnode_t *vp_rd;
+	int i;
+
+	ASSERT(MUTEX_HELD(&data->lav_lock));
+
+	/* Check if we've already been shut down. */
+	if (data->lav_fifo_wr == NULL) {
+		ASSERT(data->lav_fifo_rd == NULL);
+		return (-1);
+	}
+	vp_rd = fifo_peer_vp(data->lav_fifo_wr->f_vnode);
+
+	/*
+	 * sprlock() is zone aware, so assuming this mount call was
+	 * initiated by a process in a zone, if it tries to specify
+	 * a pgrp outside of its zone this call will fail.
+	 *
+	 * Also, we want to grab hold of the main automounter process
+	 * and it's going to be the group leader for pgrp, so its
+	 * pid will be equal to pgrp.
+	 */
+	prp = sprlock(data->lav_pgrp);
+	if (prp == NULL)
+		return (-1);
+	mutex_exit(&prp->p_lock);
+
+	/* Now we want to access the process's open file descriptors. */
+	fip = P_FINFO(prp);
+	mutex_enter(&fip->fi_lock);
+
+	/*
+	 * Now we need to find the read end of the fifo (for reasons
+	 * explained below.) We assume that the read end of the fifo
+	 * is in the same process as the write end.
+	 */
+	for (i = 0; i < fip->fi_nfiles; i++) {
+		UF_ENTER(ufp_rd, fip, i);
+		if (((fp_rd = ufp_rd->uf_file) != NULL) &&
+		    (fp_rd->f_vnode == vp_rd))
+			break;
+		UF_EXIT(ufp_rd);
+	}
+	if (i == fip->fi_nfiles) {
+		/* Didn't find it. */
+		mutex_exit(&fip->fi_lock);
+		mutex_enter(&prp->p_lock);
+		sprunlock(prp);
+		return (-1);
+	}
+
+	/*
+	 * It seems the automounter still has the read end of the fifo
+	 * open, so we're done here. Release all our locks and exit.
+	 */
+	mutex_exit(&fip->fi_lock);
+	UF_EXIT(ufp_rd);
+	mutex_enter(&prp->p_lock);
+	sprunlock(prp);
+
+	return (0);
+}
+
+static int
+i_fifo_write(lx_autofs_vfs_t *data, lx_autofs_pkt_t *lap)
+{
+	struct uio uio;
+	struct iovec iov;
+	file_t *fp_wr, *fp_rd;
+	int error;
+
+	/*
+	 * The catch here is we need to make sure _we_ don't close
+	 * the fifo while writing to it. (Another thread could come
+	 * along and realize the automounter process is gone and close
+	 * the fifo.) To do this we bump the open count before we
+	 * write to the fifo.
+	 */
+	mutex_enter(&data->lav_lock);
+	if (data->lav_fifo_wr == NULL) {
+		ASSERT(data->lav_fifo_rd == NULL);
+		mutex_exit(&data->lav_lock);
+		return (ENOENT);
+	}
+	fp_wr = data->lav_fifo_wr;
+	fp_rd = data->lav_fifo_rd;
+
+	/* Bump the open count on the write fifo. */
+	mutex_enter(&fp_wr->f_tlock);
+	fp_wr->f_count++;
+	mutex_exit(&fp_wr->f_tlock);
+
+	/* Bump the open count on the read fifo. */
+	mutex_enter(&fp_rd->f_tlock);
+	fp_rd->f_count++;
+	mutex_exit(&fp_rd->f_tlock);
+
+	mutex_exit(&data->lav_lock);
+
+	iov.iov_base = (caddr_t)lap;
+	iov.iov_len = sizeof (*lap);
+	uio.uio_iov = &iov;
+	uio.uio_iovcnt = 1;
+	uio.uio_loffset = 0;
+	uio.uio_segflg = (short)UIO_SYSSPACE;
+	uio.uio_resid = sizeof (*lap);
+	uio.uio_llimit = 0;
+	uio.uio_fmode = FWRITE | FNDELAY | FNONBLOCK;
+
+	error = VOP_WRITE(fp_wr->f_vnode, &uio, 0, kcred, NULL);
+	(void) closef(fp_wr);
+	(void) closef(fp_rd);
+
+	/*
+	 * After every write we verify that the automounter still has
+	 * these files open.
+	 */
+	mutex_enter(&data->lav_lock);
+	if (i_fifo_verify_rd(data) != 0) {
+		/*
+		 * Something happened to the automounter.
+		 * Close down the communication pipe we setup.
+		 */
+		mutex_exit(&data->lav_lock);
+		i_fifo_close(data);
+		if (error != 0)
+			return (error);
+		return (ENOENT);
+	}
+	mutex_exit(&data->lav_lock);
+
+	return (error);
+}
+
+static int
+i_bs_readdir(vnode_t *dvp, list_t *dir_stack, list_t *file_stack)
+{
+	struct iovec iov;
+	struct uio uio;
+	dirent64_t *dp, *dbuf;
+	vnode_t *vp;
+	size_t dlen, dbuflen;
+	int eof, error, ndirents = 64;
+	char *nm;
+
+	dlen = ndirents * (sizeof (*dbuf));
+	dbuf = kmem_alloc(dlen, KM_SLEEP);
+
+	uio.uio_iov = &iov;
+	uio.uio_iovcnt = 1;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_fmode = 0;
+	uio.uio_extflg = UIO_COPY_CACHED;
+	uio.uio_loffset = 0;
+	uio.uio_llimit = MAXOFFSET_T;
+
+	eof = 0;
+	error = 0;
+	while (!error && !eof) {
+		uio.uio_resid = dlen;
+		iov.iov_base = (char *)dbuf;
+		iov.iov_len = dlen;
+
+		(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
+		if (VOP_READDIR(dvp, &uio, kcred, &eof, NULL, 0) != 0) {
+			VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
+			kmem_free(dbuf, dlen);
+			return (-1);
+		}
+		VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
+
+		if ((dbuflen = dlen - uio.uio_resid) == 0) {
+			/* We're done. */
+			break;
+		}
+
+		for (dp = dbuf; ((intptr_t)dp < (intptr_t)dbuf + dbuflen);
+		    dp = (dirent64_t *)((intptr_t)dp + dp->d_reclen)) {
+
+			nm = dp->d_name;
+
+			if (strcmp(nm, ".") == 0 || strcmp(nm, "..") == 0)
+				continue;
+
+			if (VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, kcred,
+			    NULL, NULL, NULL) != 0) {
+				kmem_free(dbuf, dlen);
+				return (-1);
+			}
+			if (vp->v_type == VDIR) {
+				if (dir_stack != NULL) {
+					i_stack_push(dir_stack, (caddr_t)dvp,
+					    (caddr_t)vp, i_strdup(nm));
+				} else {
+					VN_RELE(vp);
+				}
+			} else {
+				if (file_stack != NULL) {
+					i_stack_push(file_stack, (caddr_t)dvp,
+					    (caddr_t)vp, i_strdup(nm));
+				} else {
+					VN_RELE(vp);
+				}
+			}
+		}
+	}
+	kmem_free(dbuf, dlen);
+	return (0);
+}
+
+static void
+i_bs_destroy(vnode_t *dvp, char *path)
+{
+	list_t search_stack;
+	list_t dir_stack;
+	list_t file_stack;
+	vnode_t *pdvp, *vp;
+	char *dpath, *fpath;
+	int ret;
+
+	if (VOP_LOOKUP(dvp, path, &vp, NULL, 0, NULL, kcred,
+	    NULL, NULL, NULL) != 0) {
+		/* A directory entry with this name doesn't actually exist. */
+		return;
+	}
+
+	if (vp->v_type != VDIR) {
+		/* Easy, the directory entry is a file so delete it. */
+		VN_RELE(vp);
+		(void) VOP_REMOVE(dvp, path, kcred, NULL, 0);
+		return;
+	}
+
+	/*
+	 * The directory entry is a subdirectory, now we have a bit more
+	 * work to do. (We'll have to recurse into the subdirectory.)
+	 * It would have been much easier to do this recursively but kernel
+	 * stacks are notoriously small.
+	 */
+	i_stack_init(&search_stack);
+	i_stack_init(&dir_stack);
+	i_stack_init(&file_stack);
+
+	/* Save our newfound subdirectory into a list. */
+	i_stack_push(&search_stack, (caddr_t)dvp, (caddr_t)vp, i_strdup(path));
+
+	/* Do a recursive depth first search into the subdirectories. */
+	while (i_stack_pop(&search_stack,
+	    (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) {
+
+		/* Get a list of the subdirectories in this directory. */
+		if (i_bs_readdir(dvp, &search_stack, NULL) != 0)
+			goto exit;
+
+		/* Save the current directory on a separate stack. */
+		i_stack_push(&dir_stack, (caddr_t)pdvp, (caddr_t)dvp, dpath);
+	}
+
+	/*
+	 * Now dir_stack contains a list of directories, the deepest paths
+	 * are at the top of the list. So let's go through and process them.
+	 */
+	while (i_stack_pop(&dir_stack,
+	    (caddr_t *)&pdvp, (caddr_t *)&dvp, &dpath) == 0) {
+
+		/* Get a list of the files in this directory. */
+		if (i_bs_readdir(dvp, NULL, &file_stack) != 0) {
+			VN_RELE(dvp);
+			i_strfree(dpath);
+			goto exit;
+		}
+
+		/* Delete all the files in this directory. */
+		while (i_stack_pop(&file_stack,
+		    NULL, (caddr_t *)&vp, &fpath) == 0) {
+			VN_RELE(vp);
+			ret = VOP_REMOVE(dvp, fpath, kcred, NULL, 0);
+			i_strfree(fpath);
+			if (ret != 0) {
+				i_strfree(dpath);
+				goto exit;
+			}
+		}
+
+		/* Delete this directory. */
+		VN_RELE(dvp);
+		ret = VOP_RMDIR(pdvp, dpath, pdvp, kcred, NULL, 0);
+		i_strfree(dpath);
+		if (ret != 0)
+			goto exit;
+	}
+
+exit:
+	while (
+	    (i_stack_pop(&search_stack, NULL, (caddr_t *)&vp, &path) == 0) ||
+	    (i_stack_pop(&dir_stack, NULL, (caddr_t *)&vp, &path) == 0) ||
+	    (i_stack_pop(&file_stack, NULL, (caddr_t *)&vp, &path) == 0)) {
+		VN_RELE(vp);
+		i_strfree(path);
+	}
+	i_stack_fini(&search_stack);
+	i_stack_fini(&dir_stack);
+	i_stack_fini(&file_stack);
+}
+
+static vnode_t *
+i_bs_create(vnode_t *dvp, char *bs_name)
+{
+	vnode_t *vp;
+	vattr_t vattr;
+
+	/*
+	 * After looking at the mkdir syscall path it seems we don't need
+	 * to initialize all of the vattr_t structure.
+	 */
+	bzero(&vattr, sizeof (vattr));
+	vattr.va_type = VDIR;
+	vattr.va_mode = 0755; /* u+rwx,og=rx */
+	vattr.va_mask = AT_TYPE|AT_MODE;
+
+	if (VOP_MKDIR(dvp, bs_name, &vattr, &vp, kcred, NULL, 0, NULL) != 0)
+		return (NULL);
+	return (vp);
+}
+
+static int
+i_automounter_call(vnode_t *dvp, char *nm)
+{
+	lx_autofs_lookup_req_t *lalr;
+	lx_autofs_vfs_t *data;
+	int error, dup_request;
+
+	/* Get a pointer to the vfs mount data. */
+	data = dvp->v_vfsp->vfs_data;
+
+	/* The automounter only supports queries in the root directory. */
+	if (dvp != data->lav_root)
+		return (ENOENT);
+
+	/*
+	 * Check if the current process is in the automounter's process
+	 * group. (If it is, the current process is either the automounter
+	 * itself or one of its forked child processes.) If so, don't
+	 * redirect this lookup back into the automounter because we'll
+	 * hang.
+	 */
+	mutex_enter(&pidlock);
+	if (data->lav_pgrp == curproc->p_pgrp) {
+		mutex_exit(&pidlock);
+		return (ENOENT);
+	}
+	mutex_exit(&pidlock);
+
+	/* Verify that the automount process pipe still exists. */
+	mutex_enter(&data->lav_lock);
+	if (data->lav_fifo_wr == NULL) {
+		ASSERT(data->lav_fifo_rd == NULL);
+		mutex_exit(&data->lav_lock);
+		return (ENOENT);
+	}
+	mutex_exit(&data->lav_lock);
+
+	/* Allocate an automounter request structure. */
+	if ((lalr = i_lalr_alloc(data, &dup_request, nm)) == NULL)
+		return (ENOENT);
+
+	/*
+	 * If we were the first one to allocate this request then we
+	 * need to send it to the automounter.
+	 */
+	if ((!dup_request) &&
+	    ((error = i_fifo_write(data, &lalr->lalr_pkt)) != 0)) {
+		/*
+		 * Unable to send the request to the automounter.
+		 * Unblock any other threads waiting on the request
+		 * and release the request.
+		 */
+		i_lalr_complete(data, lalr);
+		i_lalr_release(data, lalr);
+		return (error);
+	}
+
+	/* Wait for someone to signal us that this request has completed. */
+	mutex_enter(&lalr->lalr_lock);
+	while (!lalr->lalr_complete) {
+		if (cv_wait_sig(&lalr->lalr_cv, &lalr->lalr_lock) == 0) {
+			/* We got a signal, abort this lookup. */
+			mutex_exit(&lalr->lalr_lock);
+			i_lalr_abort(data, lalr);
+			return (EINTR);
+		}
+	}
+	mutex_exit(&lalr->lalr_lock);
+	i_lalr_release(data, lalr);
+
+	return (0);
+}
+
+static int
+i_automounter_ioctl(vnode_t *vp, int cmd, intptr_t arg)
+{
+	lx_autofs_vfs_t *data = (lx_autofs_vfs_t *)vp->v_vfsp->vfs_data;
+
+	/*
+	 * Be strict.
+	 * We only accept ioctls from the automounter process group.
+	 */
+	mutex_enter(&pidlock);
+	if (data->lav_pgrp != curproc->p_pgrp) {
+		mutex_exit(&pidlock);
+		return (ENOENT);
+	}
+	mutex_exit(&pidlock);
+
+	if ((cmd == LX_AUTOFS_IOC_READY) || (cmd == LX_AUTOFS_IOC_FAIL)) {
+		lx_autofs_lookup_req_t *lalr;
+		int id = arg;
+
+		/*
+		 * We don't actually care if the request failed or succeeded.
+		 * We do the same thing either way.
+		 */
+		if ((lalr = i_lalr_find(data, id)) == NULL)
+			return (ENXIO);
+
+		/* Mark the request as complete and release it. */
+		i_lalr_complete(data, lalr);
+		i_lalr_release(data, lalr);
+		return (0);
+	}
+	if (cmd == LX_AUTOFS_IOC_CATATONIC) {
+		/* The automounter is shutting down. */
+		i_fifo_close(data);
+		return (0);
+	}
+	return (ENOTSUP);
+}
+
+static int
+i_parse_mntopt(vfs_t *vfsp, lx_autofs_vfs_t *data)
+{
+	char *fd_str, *pgrp_str, *minproto_str, *maxproto_str;
+	int fd, pgrp, minproto, maxproto;
+	file_t *fp_wr, *fp_rd;
+
+	/* Require all options to be present. */
+	if ((vfs_optionisset(vfsp, LX_MNTOPT_FD, &fd_str) != 1) ||
+	    (vfs_optionisset(vfsp, LX_MNTOPT_PGRP, &pgrp_str) != 1) ||
+	    (vfs_optionisset(vfsp, LX_MNTOPT_MINPROTO, &minproto_str) != 1) ||
+	    (vfs_optionisset(vfsp, LX_MNTOPT_MAXPROTO, &maxproto_str) != 1))
+		return (EINVAL);
+
+	/* Get the values for each parameter. */
+	if ((i_str_to_int(fd_str, &fd) != 0) ||
+	    (i_str_to_int(pgrp_str, &pgrp) != 0) ||
+	    (i_str_to_int(minproto_str, &minproto) != 0) ||
+	    (i_str_to_int(maxproto_str, &maxproto) != 0))
+		return (EINVAL);
+
+	/*
+	 * We support v2 of the Linux kernel automounter protocol.
+	 * Make sure the mount request we got indicates support
+	 * for this version of the protocol.
+	 */
+	if ((minproto > 2) || (maxproto < 2))
+		return (EINVAL);
+
+	/*
+	 * Now we need to look up the fifos we'll be using
+	 * to talk to the userland automounter process.
+	 */
+	if (i_fifo_lookup(pgrp, fd, &fp_wr, &fp_rd) != 0)
+		return (EINVAL);
+
+	/* Save the mount options and fifo pointers. */
+	data->lav_fd = fd;
+	data->lav_pgrp = pgrp;
+	data->lav_fifo_rd = fp_rd;
+	data->lav_fifo_wr = fp_wr;
+	return (0);
+}
+
+/*
+ * VFS entry points
+ */
+static int
+lx_autofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+	lx_autofs_vfs_t *data;
+	dev_t dev;
+	char name[40];
+	int error;
+
+	if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
+		return (EPERM);
+
+	if (mvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	if ((uap->flags & MS_OVERLAY) == 0 &&
+	    (mvp->v_count > 1 || (mvp->v_flag & VROOT)))
+		return (EBUSY);
+
+	/* We don't support mounts in the global zone. */
+	if (getzoneid() == GLOBAL_ZONEID)
+		return (EPERM);
+
+	/* We don't support mounting on top of ourselves. */
+	if (vn_matchops(mvp, lx_autofs_vn_ops))
+		return (EPERM);
+
+	/* Allocate a vfs struct. */
+	data = kmem_zalloc(sizeof (lx_autofs_vfs_t), KM_SLEEP);
+
+	/* Parse mount options. */
+	if ((error = i_parse_mntopt(vfsp, data)) != 0) {
+		kmem_free(data, sizeof (lx_autofs_vfs_t));
+		return (error);
+	}
+
+	/* Initialize the backing store. */
+	i_bs_destroy(mvp, LX_AUTOFS_BS_DIR);
+	if ((data->lav_bs_vp = i_bs_create(mvp, LX_AUTOFS_BS_DIR)) == NULL) {
+		kmem_free(data, sizeof (lx_autofs_vfs_t));
+		return (EBUSY);
+	}
+	data->lav_bs_name = LX_AUTOFS_BS_DIR;
+
+	/* We have to hold the underlying vnode we're mounted on. */
+	data->lav_mvp = mvp;
+	VN_HOLD(mvp);
+
+	/* Initialize vfs fields */
+	vfsp->vfs_bsize = DEV_BSIZE;
+	vfsp->vfs_fstype = lx_autofs_fstype;
+	vfsp->vfs_data = data;
+
+	/* Invent a dev_t (sigh) */
+	do {
+		dev = makedevice(lx_autofs_major,
+		    atomic_add_32_nv(&lx_autofs_minor, 1) & L_MAXMIN32);
+	} while (vfs_devismounted(dev));
+	vfsp->vfs_dev = dev;
+	vfs_make_fsid(&vfsp->vfs_fsid, dev, lx_autofs_fstype);
+
+	/* Create an id space arena for automounter requests. */
+	(void) snprintf(name, sizeof (name), "lx_autofs_id_%d",
+	    getminor(vfsp->vfs_dev));
+	data->lav_ids = id_space_create(name, 1, INT_MAX);
+
+	/* Create hashes to keep track of automounter requests. */
+	mutex_init(&data->lav_lock, NULL, MUTEX_DEFAULT, NULL);
+	(void) snprintf(name, sizeof (name), "lx_autofs_path_hash_%d",
+	    getminor(vfsp->vfs_dev));
+	data->lav_path_hash = mod_hash_create_strhash(name,
+	    LX_AUTOFS_VFS_PATH_HASH_SIZE, mod_hash_null_valdtor);
+	(void) snprintf(name, sizeof (name), "lx_autofs_id_hash_%d",
+	    getminor(vfsp->vfs_dev));
+	data->lav_id_hash = mod_hash_create_idhash(name,
+	    LX_AUTOFS_VFS_ID_HASH_SIZE, mod_hash_null_valdtor);
+
+	/* Create a hash to keep track of vnodes. */
+	(void) snprintf(name, sizeof (name), "lx_autofs_vn_hash_%d",
+	    getminor(vfsp->vfs_dev));
+	data->lav_vn_hash = mod_hash_create_ptrhash(name,
+	    LX_AUTOFS_VFS_VN_HASH_SIZE, mod_hash_null_valdtor,
+	    sizeof (vnode_t));
+
+	/* Create root vnode */
+	data->lav_root = i_vn_alloc(vfsp, data->lav_bs_vp);
+	data->lav_root->v_flag |=
+	    VROOT | VNOCACHE | VNOMAP | VNOSWAP | VNOMOUNT;
+
+	return (0);
+}
+
+static int
+lx_autofs_unmount(vfs_t *vfsp, int flag, struct cred *cr)
+{
+	lx_autofs_vfs_t *data;
+
+	if (secpolicy_fs_unmount(cr, vfsp) != 0)
+		return (EPERM);
+
+	/* We do not currently support forced unmounts. */
+	if (flag & MS_FORCE)
+		return (ENOTSUP);
+
+	/*
+	 * We should never have a reference count of less than 2: one for the
+	 * caller, one for the root vnode.
+	 */
+	ASSERT(vfsp->vfs_count >= 2);
+
+	/* If there are any outstanding vnodes, we can't unmount. */
+	if (vfsp->vfs_count > 2)
+		return (EBUSY);
+
+	/* Check for any remaining holds on the root vnode. */
+	data = vfsp->vfs_data;
+	ASSERT(data->lav_root->v_vfsp == vfsp);
+	if (data->lav_root->v_count > 1)
+		return (EBUSY);
+
+	/* Close the fifo to the automount process. */
+	if (data->lav_fifo_wr != NULL)
+		(void) closef(data->lav_fifo_wr);
+	if (data->lav_fifo_rd != NULL)
+		(void) closef(data->lav_fifo_rd);
+
+	/*
+	 * We have to release our hold on our root vnode before we can
+	 * delete the backing store. (Since the root vnode is linked
+	 * to the backing store.)
+	 */
+	VN_RELE(data->lav_root);
+
+	/* Clean up the backing store. */
+	i_bs_destroy(data->lav_mvp, data->lav_bs_name);
+	VN_RELE(data->lav_mvp);
+
+	/* Clean up our remaining data structures. */
+	mod_hash_destroy_strhash(data->lav_path_hash);
+	mod_hash_destroy_idhash(data->lav_id_hash);
+	mod_hash_destroy_ptrhash(data->lav_vn_hash);
+	id_space_destroy(data->lav_ids);
+	kmem_free(data, sizeof (lx_autofs_vfs_t));
+
+	return (0);
+}
+
+static int
+lx_autofs_root(vfs_t *vfsp, vnode_t **vpp)
+{
+	lx_autofs_vfs_t *data = vfsp->vfs_data;
+
+	*vpp = data->lav_root;
+	VN_HOLD(*vpp);
+
+	return (0);
+}
+
+static int
+lx_autofs_statvfs(vfs_t *vfsp, statvfs64_t *sp)
+{
+	lx_autofs_vfs_t *data = vfsp->vfs_data;
+	vnode_t *urvp = data->lav_root->v_data;
+	dev32_t d32;
+	int error;
+
+	if ((error = VFS_STATVFS(urvp->v_vfsp, sp)) != 0)
+		return (error);
+
+	/* Update some of the values before returning. */
+	(void) cmpldev(&d32, vfsp->vfs_dev);
+	sp->f_fsid = d32;
+	(void) strlcpy(sp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name,
+	    sizeof (sp->f_basetype));
+	sp->f_flag = vf_to_stf(vfsp->vfs_flag);
+	bzero(sp->f_fstr, sizeof (sp->f_fstr));
+	return (0);
+}
+
+static const fs_operation_def_t lx_autofs_vfstops[] = {
+	{ VFSNAME_MOUNT, { .vfs_mount = lx_autofs_mount } },
+	{ VFSNAME_UNMOUNT, { .vfs_unmount = lx_autofs_unmount } },
+	{ VFSNAME_ROOT, { .vfs_root = lx_autofs_root } },
+	{ VFSNAME_STATVFS, { .vfs_statvfs = lx_autofs_statvfs } },
+	{ NULL, NULL }
+};
+
+/*
+ * VOP entry points - simple passthrough
+ *
+ * For most VOP entry points we can simply pass the request on to
+ * the underlying filesystem we're mounted on.
+ */
+static int
+lx_autofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+    caller_context_t *ctp)
+{
+	vnode_t *uvp = vp->v_data;
+	return (VOP_CLOSE(uvp, flag, count, offset, cr, ctp));
+}
+
+static int
+lx_autofs_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
+    caller_context_t *ctp, int flags)
+{
+	vnode_t *uvp = vp->v_data;
+	return (VOP_READDIR(uvp, uiop, cr, eofp, ctp, flags));
+}
+
+static int
+lx_autofs_access(vnode_t *vp, int mode, int flags, cred_t *cr,
+    caller_context_t *ctp)
+{
+	vnode_t *uvp = vp->v_data;
+	return (VOP_ACCESS(uvp, mode, flags, cr, ctp));
+}
+
+static int
+lx_autofs_rwlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+	vnode_t *uvp = vp->v_data;
+	return (VOP_RWLOCK(uvp, write_lock, ctp));
+}
+
+static void
+lx_autofs_rwunlock(struct vnode *vp, int write_lock, caller_context_t *ctp)
+{
+	vnode_t *uvp = vp->v_data;
+	VOP_RWUNLOCK(uvp, write_lock, ctp);
+}
+
+/*ARGSUSED*/
+static int
+lx_autofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
+    caller_context_t *ctp, int flags)
+{
+	vnode_t *udvp = dvp->v_data;
+
+	/*
+	 * cdir is the calling process's current directory.
+	 * If cdir is an lx_autofs vnode then get its real underlying
+	 * vnode ptr. (It seems like the only thing cdir is
+	 * ever used for is to make sure the user doesn't delete
+	 * their current directory.)
+	 */
+	if (vn_matchops(cdir, lx_autofs_vn_ops)) {
+		vnode_t *ucdir = cdir->v_data;
+		return (VOP_RMDIR(udvp, nm, ucdir, cr, ctp, flags));
+	}
+
+	return (VOP_RMDIR(udvp, nm, cdir, cr, ctp, flags));
+}
+
+/*
+ * VOP entry points - special passthrough
+ *
+ * For some VOP entry points we will first pass the request on to
+ * the underlying filesystem we're mounted on. If there's an error
+ * then we immediately return the error, but if the request succeeds
+ * we have to do some extra work before returning.
+ */
+static int
+lx_autofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ctp)
+{
+	vnode_t *ovp = *vpp;
+	vnode_t *uvp = ovp->v_data;
+	int error;
+
+	if ((error = VOP_OPEN(&uvp, flag, cr, ctp)) != 0)
+		return (error);
+
+	/* Check for clone opens. */
+	if (uvp == ovp->v_data)
+		return (0);
+
+	/* Deal with clone opens by returning a new vnode. */
+	*vpp = i_vn_alloc(ovp->v_vfsp, uvp);
+	VN_RELE(ovp);
+	return (0);
+}
+
+static int
+lx_autofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
+    caller_context_t *ctp)
+{
+	vnode_t *uvp = vp->v_data;
+	int error;
+
+	if ((error = VOP_GETATTR(uvp, vap, flags, cr, ctp)) != 0)
+		return (error);
+
+	/* Update the attributes with our filesystem id. */
+	vap->va_fsid = vp->v_vfsp->vfs_dev;
+	return (0);
+}
+
+static int
+lx_autofs_mkdir(vnode_t *dvp, char *nm, struct vattr *vap, vnode_t **vpp,
+    cred_t *cr, caller_context_t *ctp, int flags, vsecattr_t *vsecp)
+{
+	vnode_t *udvp = dvp->v_data;
+	vnode_t *uvp = NULL;
+	int error;
+
+	if ((error = VOP_MKDIR(udvp, nm, vap, &uvp, cr,
+	    ctp, flags, vsecp)) != 0)
+		return (error);
+
+	/* Update the attributes with our filesystem id. */
+	vap->va_fsid = dvp->v_vfsp->vfs_dev;
+
+	/* Allocate a new vnode. */
+	*vpp = i_vn_alloc(dvp->v_vfsp, uvp);
+	return (0);
+}
+
+/*
+ * VOP entry points - custom
+ */
+/*ARGSUSED*/
+static void
+lx_autofs_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ctp)
+{
+	lx_autofs_vfs_t *data = vp->v_vfsp->vfs_data;
+
+	/*
+	 * We need to hold the vfs lock because if we're going to free
+	 * this vnode we have to prevent anyone from looking it up
+	 * in the vnode hash.
+	 */
+	mutex_enter(&data->lav_lock);
+	mutex_enter(&vp->v_lock);
+
+	if (vp->v_count < 1) {
+		panic("lx_autofs_inactive: bad v_count");
+		/*NOTREACHED*/
+	}
+
+	/* Drop the temporary hold by vn_rele now. */
+	if (--vp->v_count > 0) {
+		mutex_exit(&vp->v_lock);
+		mutex_exit(&data->lav_lock);
+		return;
+	}
+
+	/*
+	 * No one should have been blocked on this lock because we're
+	 * about to free this vnode.
+	 */
+	i_vn_free(vp);
+}
+
+static int
+lx_autofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
+    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ctp,
+    int *direntflags, pathname_t *realpnp)
+{
+	vnode_t *udvp = dvp->v_data;
+	vnode_t *uvp = NULL;
+	int error;
+
+	/* First try to look up whether this path component already exists. */
+	if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr, ctp,
+	    direntflags, realpnp)) == 0) {
+		*vpp = i_vn_alloc(dvp->v_vfsp, uvp);
+		return (0);
+	}
+
+	/* Only query the automounter if the path does not exist. */
+	if (error != ENOENT)
+		return (error);
+
+	/* Refer the lookup to the automounter. */
+	if ((error = i_automounter_call(dvp, nm)) != 0)
+		return (error);
+
+	/* Retry the lookup operation. */
+	if ((error = VOP_LOOKUP(udvp, nm, &uvp, pnp, flags, rdir, cr, ctp,
+	    direntflags, realpnp)) == 0) {
+		*vpp = i_vn_alloc(dvp->v_vfsp, uvp);
+		return (0);
+	}
+	return (error);
+}
+
+/*ARGSUSED*/
+static int
+lx_autofs_ioctl(vnode_t *vp, int cmd, intptr_t arg, int mode, cred_t *cr,
+    int *rvalp, caller_context_t *ctp)
+{
+	vnode_t *uvp = vp->v_data;
+
+	/* Intercept certain ioctls. */
+	switch ((uint_t)cmd) {
+	case LX_AUTOFS_IOC_READY:
+	case LX_AUTOFS_IOC_FAIL:
+	case LX_AUTOFS_IOC_CATATONIC:
+	case LX_AUTOFS_IOC_EXPIRE:
+	case LX_AUTOFS_IOC_PROTOVER:
+	case LX_AUTOFS_IOC_SETTIMEOUT:
+		return (i_automounter_ioctl(vp, cmd, arg));
+	}
+
+	/* Pass any remaining ioctl on. */
+	return (VOP_IOCTL(uvp, cmd, arg, mode, cr, rvalp, ctp));
+}
+
+/*
+ * VOP entry points definitions
+ */
+static const fs_operation_def_t lx_autofs_tops_root[] = {
+	{ VOPNAME_OPEN, { .vop_open = lx_autofs_open } },
+	{ VOPNAME_CLOSE, { .vop_close = lx_autofs_close } },
+	{ VOPNAME_IOCTL, { .vop_ioctl = lx_autofs_ioctl } },
+	{ VOPNAME_RWLOCK, { .vop_rwlock = lx_autofs_rwlock } },
+	{ VOPNAME_RWUNLOCK, { .vop_rwunlock = lx_autofs_rwunlock } },
+	{ VOPNAME_GETATTR, { .vop_getattr = lx_autofs_getattr } },
+	{ VOPNAME_ACCESS, { .vop_access = lx_autofs_access } },
+	{ VOPNAME_READDIR, { .vop_readdir = lx_autofs_readdir } },
+	{ VOPNAME_LOOKUP, { .vop_lookup = lx_autofs_lookup } },
+	{ VOPNAME_INACTIVE, { .vop_inactive = lx_autofs_inactive } },
+	{ VOPNAME_MKDIR, { .vop_mkdir = lx_autofs_mkdir } },
+	{ VOPNAME_RMDIR, { .vop_rmdir = lx_autofs_rmdir } },
+	{ NULL }
+};
+
+/*
+ * lx_autofs_init() gets invoked via the mod_install() call in
+ * this module's _init() routine. Therefore, the code that cleans
+ * up the structures we allocate below is actually found in
+ * our _fini() routine.
+ */ +/* ARGSUSED */ +static int +lx_autofs_init(int fstype, char *name) +{ + int error; + + if ((lx_autofs_major = + (major_t)space_fetch(LX_AUTOFS_SPACE_KEY_UDEV)) == 0) { + + if ((lx_autofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lx_autofs_init: " + "can't get unique device number"); + return (EAGAIN); + } + + if (space_store(LX_AUTOFS_SPACE_KEY_UDEV, + (uintptr_t)lx_autofs_major) != 0) { + cmn_err(CE_WARN, "lx_autofs_init: " + "can't save unique device number"); + return (EAGAIN); + } + } + + lx_autofs_fstype = fstype; + if ((error = vfs_setfsops( + fstype, lx_autofs_vfstops, &lx_autofs_vfsops)) != 0) { + cmn_err(CE_WARN, "lx_autofs_init: bad vfs ops template"); + return (error); + } + + if ((error = vn_make_ops("lx_autofs vnode ops", + lx_autofs_tops_root, &lx_autofs_vn_ops)) != 0) { + VERIFY(vfs_freevfsops_by_type(fstype) == 0); + lx_autofs_vn_ops = NULL; + return (error); + } + + return (0); +} + + +/* + * Module linkage + */ +static mntopt_t lx_autofs_mntopt[] = { + { LX_MNTOPT_FD, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_PGRP, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MINPROTO, NULL, 0, MO_HASVALUE }, + { LX_MNTOPT_MAXPROTO, NULL, 0, MO_HASVALUE } +}; + +static mntopts_t lx_autofs_mntopts = { + sizeof (lx_autofs_mntopt) / sizeof (mntopt_t), + lx_autofs_mntopt +}; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + LX_AUTOFS_NAME, + lx_autofs_init, + VSW_HASPROTO | VSW_VOLATILEDEV, + &lx_autofs_mntopts +}; + +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "linux autofs filesystem", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int error; + + if ((error = mod_remove(&modlinkage)) != 0) + return (error); + + if (lx_autofs_vn_ops != NULL) { + vn_freevnodeops(lx_autofs_vn_ops); + lx_autofs_vn_ops = NULL; + } + + /* + * In our init routine, if we get an error after calling + * vfs_setfsops() we cleanup by calling vfs_freevfsops_by_type(). + * But we don't need to call vfs_freevfsops_by_type() here + * because the fs framework did this for us as part of the + * mod_remove() call above. + */ + return (0); +} diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c new file mode 100644 index 0000000000..510626d220 --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.c @@ -0,0 +1,497 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
+ */ + + +#include <sys/modctl.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/stat.h> +#include <sys/conf.h> +#include <sys/frame.h> +#include <sys/dtrace.h> +#include <sys/dtrace_impl.h> + +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> + +/* + * We store the syscall number in the low 16 bits (which limits us to 64k + * syscalls). The next bit indicates entry/return probe and the next bit + * indicates 64bit/32bit syscall. + */ +#define SCALL_MASK 0xffff +#define ENTRY_FLAG 0x10000 +#define SYSC_64_BIT 0x100000 + +#define LX_SYSTRACE_IS64BIT(x) ((int)(x) & SYSC_64_BIT) +#define LX_SYSTRACE_ISENTRY(x) ((int)(x) & ENTRY_FLAG) +#define LX_SYSTRACE_SYSNUM(x) ((int)(x) & SCALL_MASK) + +#define LX_SYSTRACE32_ENTRY(id) (ENTRY_FLAG | (id)) +#define LX_SYSTRACE32_RETURN(id) (id) + +#define LX_SYSTRACE64_ENTRY(id) (SYSC_64_BIT | ENTRY_FLAG | (id)) +#define LX_SYSTRACE64_RETURN(id) (SYSC_64_BIT | id) + +#define LX_SYSTRACE_ENTRY_AFRAMES 2 +#define LX_SYSTRACE_RETURN_AFRAMES 4 + +typedef struct lx_systrace_sysent { + const char *lss_name; + dtrace_id_t lss_entry; + dtrace_id_t lss_return; +} lx_systrace_sysent_t; + +static dev_info_t *lx_systrace_devi; +static dtrace_provider_id_t lx_systrace_id; +static kmutex_t lx_systrace_lock; +static uint_t lx_systrace_nenabled; + +static int lx_systrace_nsysent32; +static lx_systrace_sysent_t *lx_systrace_sysent32; + +#if defined(_LP64) +static int lx_systrace_nsysent64; +static lx_systrace_sysent_t *lx_systrace_sysent64; +#endif + +/*ARGSUSED*/ +static void +lx_systrace_entry(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + +#if defined(_LP64) + if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) { + if (sysnum >= lx_systrace_nsysent64) + return; + id = lx_systrace_sysent64[sysnum].lss_entry; + } else +#endif + { + if (sysnum >= lx_systrace_nsysent32) + return; + id = lx_systrace_sysent32[sysnum].lss_entry; + } + + if (id == DTRACE_IDNONE) + return; + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_return(ulong_t sysnum, ulong_t arg0, ulong_t arg1, ulong_t arg2, + ulong_t arg3, ulong_t arg4, ulong_t arg5) +{ + dtrace_id_t id; + +#if defined(_LP64) + if ((ttoproc(curthread))->p_model == DATAMODEL_NATIVE) { + if (sysnum >= lx_systrace_nsysent64) + return; + id = lx_systrace_sysent64[sysnum].lss_return; + } else +#endif + { + if (sysnum >= lx_systrace_nsysent32) + return; + id = lx_systrace_sysent32[sysnum].lss_return; + } + + if (id == DTRACE_IDNONE) + return; + dtrace_probe(id, arg0, arg1, arg2, arg3, arg4); +} + +/*ARGSUSED*/ +static void +lx_systrace_provide(void *arg, const dtrace_probedesc_t *desc) +{ + int i; + + if (desc != NULL) + return; + + for (i = 0; i < lx_systrace_nsysent32; i++) { + if (dtrace_probe_lookup(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "entry") != 0) + continue; + + (void) dtrace_probe_create(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "entry", + LX_SYSTRACE_ENTRY_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE32_ENTRY(i))); + + (void) dtrace_probe_create(lx_systrace_id, "sys32", + lx_systrace_sysent32[i].lss_name, "return", + LX_SYSTRACE_RETURN_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE32_RETURN(i))); + + lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE; + } + +#if defined(_LP64) + for (i = 0; i < lx_systrace_nsysent64; i++) { + if (dtrace_probe_lookup(lx_systrace_id, "sys64", + 
lx_systrace_sysent64[i].lss_name, "entry") != 0) + continue; + + (void) dtrace_probe_create(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "entry", + LX_SYSTRACE_ENTRY_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE64_ENTRY(i))); + + (void) dtrace_probe_create(lx_systrace_id, "sys64", + lx_systrace_sysent64[i].lss_name, "return", + LX_SYSTRACE_RETURN_AFRAMES, + (void *)((uintptr_t)LX_SYSTRACE64_RETURN(i))); + + lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE; + } +#endif +} + +/*ARGSUSED*/ +static int +lx_systrace_enable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + + mutex_enter(&lx_systrace_lock); + if (lx_systrace_nenabled++ == 0) + lx_brand_systrace_enable(); + mutex_exit(&lx_systrace_lock); + + if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) { +#if defined(_LP64) + ASSERT(sysnum < lx_systrace_nsysent64); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent64[sysnum].lss_entry = id; + } else { + lx_systrace_sysent64[sysnum].lss_return = id; + } +#endif + } else { + ASSERT(sysnum < lx_systrace_nsysent32); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent32[sysnum].lss_entry = id; + } else { + lx_systrace_sysent32[sysnum].lss_return = id; + } + } + return (0); +} + +/*ARGSUSED*/ +static void +lx_systrace_disable(void *arg, dtrace_id_t id, void *parg) +{ + int sysnum = LX_SYSTRACE_SYSNUM((uintptr_t)parg); + + if (LX_SYSTRACE_IS64BIT((uintptr_t)parg)) { +#if defined(_LP64) + ASSERT(sysnum < lx_systrace_nsysent64); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent64[sysnum].lss_entry = DTRACE_IDNONE; + } else { + lx_systrace_sysent64[sysnum].lss_return = DTRACE_IDNONE; + } +#endif + } else { + ASSERT(sysnum < lx_systrace_nsysent32); + + if (LX_SYSTRACE_ISENTRY((uintptr_t)parg)) { + lx_systrace_sysent32[sysnum].lss_entry = DTRACE_IDNONE; + } else { + lx_systrace_sysent32[sysnum].lss_return = DTRACE_IDNONE; + } + } + + mutex_enter(&lx_systrace_lock); + if (--lx_systrace_nenabled == 0) + lx_brand_systrace_disable(); + mutex_exit(&lx_systrace_lock); +} + +/*ARGSUSED*/ +static void +lx_systrace_destroy(void *arg, dtrace_id_t id, void *parg) +{ +} + +/*ARGSUSED*/ +static uint64_t +lx_systrace_getarg(void *arg, dtrace_id_t id, void *parg, int argno, + int aframes) +{ + struct frame *fp = (struct frame *)dtrace_getfp(); + uintptr_t *stack; + uint64_t val = 0; + int i; + + if (argno >= 6) + return (0); + + /* + * Walk the four frames down the stack to the entry or return callback. + * Our callback calls dtrace_probe() which calls dtrace_dif_variable() + * which invokes this function to get the extended arguments. We get + * the frame pointer in via call to dtrace_getfp() above which makes for + * four frames. + */ + for (i = 0; i < 4; i++) { + fp = (struct frame *)fp->fr_savfp; + } + + stack = (uintptr_t *)&fp[1]; + + /* + * Skip the first argument to the callback -- the system call number. + */ + argno++; + +#ifdef __amd64 + /* + * On amd64, the first 6 arguments are passed in registers while + * subsequent arguments are on the stack. 
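+	 * Adjust the index so that the seventh argument (the first
+	 * one passed on the stack) maps to stack[0].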
+ */ + argno -= 6; +#endif + + DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT); + val = stack[argno]; + DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT); + + return (val); +} + + +static const dtrace_pattr_t lx_systrace_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +}; + +static dtrace_pops_t lx_systrace_pops = { + lx_systrace_provide, + NULL, + lx_systrace_enable, + lx_systrace_disable, + NULL, + NULL, + NULL, + lx_systrace_getarg, + NULL, + lx_systrace_destroy +}; + +static int +lx_systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + int i; + + switch (cmd) { + case DDI_ATTACH: + break; + case DDI_RESUME: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "lx_systrace", S_IFCHR, + 0, DDI_PSEUDO, NULL) == DDI_FAILURE || + dtrace_register("lx-syscall", &lx_systrace_attr, + DTRACE_PRIV_USER, 0, &lx_systrace_pops, NULL, + &lx_systrace_id) != 0) { + ddi_remove_minor_node(devi, NULL); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + lx_systrace_devi = devi; + + /* + * Initialize the 32-bit table. + */ + VERIFY(lx_nsysent32 > 0); + lx_systrace_nsysent32 = lx_nsysent32; + lx_systrace_sysent32 = kmem_zalloc(lx_systrace_nsysent32 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); + + for (i = 0; i < lx_systrace_nsysent32; i++) { + lx_systrace_sysent32[i].lss_name = lx_sysent32[i].sy_name; + lx_systrace_sysent32[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent32[i].lss_return = DTRACE_IDNONE; + } + +#if defined(_LP64) + /* + * Initialize the 64-bit table. + */ + VERIFY(lx_nsysent64 > 0); + lx_systrace_nsysent64 = lx_nsysent64; + lx_systrace_sysent64 = kmem_zalloc(lx_systrace_nsysent64 * + sizeof (lx_systrace_sysent_t), KM_SLEEP); + + for (i = 0; i < lx_systrace_nsysent64; i++) { + lx_systrace_sysent64[i].lss_name = lx_sysent64[i].sy_name; + lx_systrace_sysent64[i].lss_entry = DTRACE_IDNONE; + lx_systrace_sysent64[i].lss_return = DTRACE_IDNONE; + } +#endif + + /* + * Install probe triggers. + */ + lx_systrace_entry_ptr = lx_systrace_entry; + lx_systrace_return_ptr = lx_systrace_return; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_systrace_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + case DDI_SUSPEND: + return (DDI_SUCCESS); + default: + return (DDI_FAILURE); + } + + if (dtrace_unregister(lx_systrace_id) != 0) + return (DDI_FAILURE); + + /* + * Free tables. + */ + kmem_free(lx_systrace_sysent32, lx_systrace_nsysent32 * + sizeof (lx_systrace_sysent_t)); + lx_systrace_sysent32 = NULL; + lx_systrace_nsysent32 = 0; + +#if defined(_LP64) + kmem_free(lx_systrace_sysent64, lx_systrace_nsysent64 * + sizeof (lx_systrace_sysent_t)); + lx_systrace_sysent64 = NULL; + lx_systrace_nsysent64 = 0; +#endif + + /* + * Reset probe triggers. 
+ */ + lx_systrace_entry_ptr = NULL; + lx_systrace_return_ptr = NULL; + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +lx_systrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + return (0); +} + +static struct cb_ops lx_systrace_cb_ops = { + lx_systrace_open, /* open */ + nodev, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + nodev, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops lx_systrace_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + ddi_getinfo_1to1, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + lx_systrace_attach, /* attach */ + lx_systrace_detach, /* detach */ + nodev, /* reset */ + &lx_systrace_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +/* + * Module linkage information for the kernel. + */ +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "Linux Brand System Call Tracing", /* name of module */ + &lx_systrace_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf new file mode 100644 index 0000000000..e4499c8a5b --- /dev/null +++ b/usr/src/uts/common/brand/lx/dtrace/lx_systrace.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_systrace" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/io/lx_audio.c b/usr/src/uts/common/brand/lx/io/lx_audio.c new file mode 100644 index 0000000000..e8c6234d92 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_audio.c @@ -0,0 +1,1996 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#include <sys/audio.h>
+#include <sys/conf.h>
+#include <sys/debug.h>
+#include <sys/disp.h>
+#include <sys/ddi.h>
+#include <sys/file.h>
+#include <sys/id_space.h>
+#include <sys/kmem.h>
+#include <sys/lx_audio.h>
+#include <sys/mixer.h>
+#include <sys/modhash.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/sysmacros.h>
+#include <sys/stropts.h>
+#include <sys/types.h>
+#include <sys/zone.h>
+
+/* Properties used by the lx_audio driver */
+#define	LXA_PROP_INPUTDEV	"inputdev"
+#define	LXA_PROP_OUTPUTDEV	"outputdev"
+
+/* default device paths used by this driver */
+#define	LXA_DEV_DEFAULT		"/dev/audio"
+#define	LXA_DEV_CUSTOM_DIR	"/dev/sound/"
+
+/* maximum possible number of concurrent opens of this driver */
+#define	LX_AUDIO_MAX_OPENS	1024
+
+/*
+ * these are default fragment size and fragment count values.
+ * these values were chosen to make quake work well on my
+ * laptop: 2GHz Pentium M + NVIDIA GeForce Go 6400.
+ *
+ * for reference:
+ * - 1 sec of stereo output at 44kHz is about 171 KB of data
+ * - 1 sec of mono output at 8kHz is about 8 KB of data
+ */
+#define	LXA_OSS_FRAG_SIZE	(1024)	/* 1/8 sec at 8kHz mono */
+#define	LXA_OSS_FRAG_CNT	(1024 * 2)
+
+/* maximum amount of fragment memory we'll allow a process to mmap */
+#define	LXA_OSS_FRAG_MEM	(1024 * 1024 * 2)	/* 2MB */
+
+/* forward declarations */
+typedef struct lxa_state lxa_state_t;
+typedef struct lxa_zstate lxa_zstate_t;
+
+/*
+ * Structure and enum declarations
+ */
+typedef enum {
+	LXA_TYPE_INVALID = 0,
+	LXA_TYPE_AUDIO = 1,	/* audio device */
+	LXA_TYPE_AUDIOCTL = 2	/* audio control/mixer device */
+} lxa_dev_type_t;
+
+struct lxa_zstate {
+	char		*lxa_zs_zonename;
+
+	/*
+	 * we could store the input/output audio device setting here,
+	 * but instead we're keeping them as device node properties
+	 * so that a user can easily see the audio configuration for
+	 * a zone via prtconf.
+	 */
+
+	/*
+	 * OSS doesn't support multiple opens of the audio device.
+	 * (multiple opens of the mixer device are supported.)
+	 * so here we'll keep a pointer to any open input/output
+	 * streams. (OSS does support two opens if one is for input
+	 * and the other is for output.)
+	 */
+	lxa_state_t	*lxa_zs_istate;
+	lxa_state_t	*lxa_zs_ostate;
+
+	/*
+	 * we need to cache channel gain and balance. channel gain and
+	 * balance map to PCM volume in OSS, which are supposedly a property
+	 * of the underlying hardware. but in solaris, channels are
+	 * implemented in software and only exist when an audio device
+	 * is actually open. (each open returns a unique channel.) OSS
+	 * apps will expect consistent PCM volume set/get operations to
+	 * work even if no audio device is open. hence, if no underlying
+	 * device is open we need to cache the gain and balance setting.
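+	 * (lxa_open() pushes the cached levels back down to the
+	 * underlying device the next time an output channel is opened.)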
+	 */
+	lxa_mixer_levels_t	lxa_zs_pcm_levels;
+};
+
+struct lxa_state {
+	lxa_zstate_t	*lxas_zs;	/* zone state pointer */
+
+	dev_t		lxas_dev_old;	/* dev_t used to open the device */
+	dev_t		lxas_dev_new;	/* new dev_t assigned to an open */
+	int		lxas_flags;	/* original flags passed to open */
+	lxa_dev_type_t	lxas_type;	/* type of device that was opened */
+
+	int		lxas_devs_same;	/* input and output device the same? */
+
+	/* input device variables */
+	ldi_handle_t	lxas_idev_lh;	/* ldi handle for access */
+	int		lxas_idev_flags;	/* flags used for open */
+
+	/* output device variables */
+	ldi_handle_t	lxas_odev_lh;	/* ldi handle for access */
+	int		lxas_odev_flags;	/* flags used for open */
+
+	/*
+	 * since we support multiplexing of devices we need to remember
+	 * certain parameters about the devices
+	 */
+	uint_t		lxas_hw_features;
+	uint_t		lxas_sw_features;
+
+	uint_t		lxas_frag_size;
+	uint_t		lxas_frag_cnt;
+
+	/*
+	 * members needed to support mmap device access. note that to
+	 * simplify things we only support one mmap access per open.
+	 */
+	ddi_umem_cookie_t	lxas_umem_cookie;
+	char			*lxas_umem_ptr;
+	size_t			lxas_umem_len;
+	kthread_t		*lxas_mmap_thread;
+	int			lxas_mmap_thread_running;
+	int			lxas_mmap_thread_exit;
+	int			lxas_mmap_thread_frag;
+};
+
+/*
+ * Global variables
+ */
+dev_info_t	*lxa_dip = NULL;
+kmutex_t	lxa_lock;
+id_space_t	*lxa_minor_id = NULL;
+mod_hash_t	*lxa_state_hash = NULL;
+mod_hash_t	*lxa_zstate_hash = NULL;
+size_t		lxa_state_hash_size = 15;
+size_t		lxa_zstate_hash_size = 15;
+size_t		lxa_registered_zones = 0;
+
+/*
+ * function declarations
+ */
+static void lxa_mmap_output_disable(lxa_state_t *);
+
+/*
+ * functions
+ */
+static void
+lxa_state_close(lxa_state_t *lxa_state)
+{
+	lxa_zstate_t	*lxa_zs = lxa_state->lxas_zs;
+	minor_t		minor = getminor(lxa_state->lxas_dev_new);
+
+	/* disable any mmap output that might still be going on */
+	lxa_mmap_output_disable(lxa_state);
+
+	/*
+	 * if this was the active input/output device, unlink it from
+	 * the global zone state so that other opens of the audio device
+	 * can now succeed.
+	 */
+	mutex_enter(&lxa_lock);
+	if (lxa_zs->lxa_zs_istate == lxa_state)
+		lxa_zs->lxa_zs_istate = NULL;
+	if (lxa_zs->lxa_zs_ostate == lxa_state) {
+		lxa_zs->lxa_zs_ostate = NULL;
+	}
+	mutex_exit(&lxa_lock);
+
+	/* remove this state structure from the hash (if it's there) */
+	(void) mod_hash_remove(lxa_state_hash,
+	    (mod_hash_key_t)(uintptr_t)minor, (mod_hash_val_t *)&lxa_state);
+
+	/* close any audio device that we have open */
+	if (lxa_state->lxas_idev_lh != NULL)
+		(void) ldi_close(lxa_state->lxas_idev_lh,
+		    lxa_state->lxas_idev_flags, kcred);
+	if (lxa_state->lxas_odev_lh != NULL)
+		(void) ldi_close(lxa_state->lxas_odev_lh,
+		    lxa_state->lxas_odev_flags, kcred);
+
+	/* free up any memory allocated by mmaps */
+	if (lxa_state->lxas_umem_cookie != NULL)
+		ddi_umem_free(lxa_state->lxas_umem_cookie);
+
+	/* release the id associated with this state structure */
+	id_free(lxa_minor_id, minor);
+
+	kmem_free(lxa_state, sizeof (*lxa_state));
+}
+
+static char *
+getzonename(void)
+{
+	return (curproc->p_zone->zone_name);
+}
+
+static char *
+lxa_devprop_name(char *zname, char *pname)
+{
+	char	*zpname;
+	int	n;
+
+	ASSERT((pname != NULL) && (zname != NULL));
+
+	/* prepend the zone name to the property name */
+	n = snprintf(NULL, 0, "%s_%s", zname, pname) + 1;
+	zpname = kmem_alloc(n, KM_SLEEP);
+	(void) snprintf(zpname, n, "%s_%s", zname, pname);
+
+	return (zpname);
+}
+
+static int
+lxa_devprop_verify(char *pval)
+{
+	int	n;
+
+	ASSERT(pval != NULL);
+
+	if (strcmp(pval, "default") == 0)
+		return (0);
+
+	/* make sure the value is an integer */
+	for (n = 0; pval[n] != '\0'; n++) {
+		if ((pval[n] < '0') || (pval[n] > '9')) {
+			return (-1);
+		}
+	}
+
+	return (0);
+}
+
+static char *
+lxa_devprop_lookup(char *zname, char *pname, lxa_dev_type_t lxa_type)
+{
+	char		*zprop_name, *pval;
+	char		*dev_path;
+	int		n, rv;
+
+	ASSERT((pname != NULL) && (zname != NULL));
+	ASSERT((lxa_type == LXA_TYPE_AUDIO) || (lxa_type == LXA_TYPE_AUDIOCTL));
+
+	zprop_name = lxa_devprop_name(zname, pname);
+
+	/* attempt to lookup the property */
+	rv = ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip,
+	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, zprop_name, &pval);
+	strfree(zprop_name);
+
+	if (rv != DDI_PROP_SUCCESS)
+		return (NULL);
+
+	if (lxa_devprop_verify(pval) != 0) {
+		ddi_prop_free(pval);
+		return (NULL);
+	}
+
+	if (strcmp(pval, "none") == 0) {
+		/* there is no audio device specified */
+		ddi_prop_free(pval);
+		return (NULL);
+	} else if (strcmp(pval, "default") == 0) {
+		/* use the default audio device on the system */
+		dev_path = strdup(LXA_DEV_DEFAULT);
+	} else {
+		/* a custom audio device was specified, generate a path */
+		n = snprintf(NULL, 0, "%s%s", LXA_DEV_CUSTOM_DIR, pval) + 1;
+		dev_path = kmem_alloc(n, KM_SLEEP);
+		(void) snprintf(dev_path, n, "%s%s", LXA_DEV_CUSTOM_DIR, pval);
+	}
+	ddi_prop_free(pval);
+
+	/*
+	 * if this is an audio control device we need to append
+	 * "ctl" to the path
+	 */
+	if (lxa_type == LXA_TYPE_AUDIOCTL) {
+		char	*tmp;
+		n = snprintf(NULL, 0, "%s%s", dev_path, "ctl") + 1;
+		tmp = kmem_alloc(n, KM_SLEEP);
+		(void) snprintf(tmp, n, "%s%s", dev_path, "ctl");
+		strfree(dev_path);
+		dev_path = tmp;
+	}
+
+	return (dev_path);
+}
+
+static int
+lxa_dev_getfeatures(lxa_state_t *lxa_state)
+{
+	audio_info_t	ai_idev, ai_odev;
+	int		n, rv;
+
+	/* set a default fragment size */
+	lxa_state->lxas_frag_size = LXA_OSS_FRAG_SIZE;
+	lxa_state->lxas_frag_cnt = LXA_OSS_FRAG_CNT;
+
+	/* get info for the currently open audio devices */
+	if ((lxa_state->lxas_idev_lh != NULL) &&
+	    ((rv = ldi_ioctl(lxa_state->lxas_idev_lh,
+	    AUDIO_GETINFO, (intptr_t)&ai_idev, FKIOCTL, kcred, &n)) != 0))
+		return (rv);
+	if ((lxa_state->lxas_odev_lh != NULL) &&
+	    ((rv = ldi_ioctl(lxa_state->lxas_odev_lh,
+	    AUDIO_GETINFO, (intptr_t)&ai_odev, FKIOCTL, kcred, &n)) != 0))
+		return (rv);
+
+	/* if we're only open for reading or writing then it's easy */
+	if (lxa_state->lxas_idev_lh == NULL) {
+		lxa_state->lxas_sw_features = ai_odev.sw_features;
+		lxa_state->lxas_hw_features = ai_odev.hw_features;
+		return (0);
+	} else if (lxa_state->lxas_odev_lh == NULL) {
+		lxa_state->lxas_sw_features = ai_idev.sw_features;
+		lxa_state->lxas_hw_features = ai_idev.hw_features;
+		return (0);
+	}
+
+	/*
+	 * well if we're open for reading and writing but the underlying
+	 * device is the same then it's also pretty easy
+	 */
+	if (lxa_state->lxas_devs_same) {
+		if ((ai_odev.sw_features != ai_idev.sw_features) ||
+		    (ai_odev.hw_features != ai_idev.hw_features)) {
+			zcmn_err(getzoneid(), CE_WARN, "lx_audio error: "
+			    "audio device reported inconsistent features");
+			return (EIO);
+		}
+		lxa_state->lxas_sw_features = ai_odev.sw_features;
+		lxa_state->lxas_hw_features = ai_odev.hw_features;
+		return (0);
+	}
+
+	/*
+	 * figure out which software features we're going to support.
+	 * we will report a feature as supported if both the input
+	 * and output device support it.
+	 */
+	lxa_state->lxas_sw_features = 0;
+	n = ai_idev.sw_features & ai_odev.sw_features;
+	if (n & AUDIO_SWFEATURE_MIXER)
+		lxa_state->lxas_sw_features |= AUDIO_SWFEATURE_MIXER;
+
+	/*
+	 * figure out which hardware features we're going to support.
+	 * for a first pass we will report a feature as supported if
+	 * both the input and output device support it.
+	 */
+	lxa_state->lxas_hw_features = 0;
+	n = ai_idev.hw_features & ai_odev.hw_features;
+	if (n & AUDIO_HWFEATURE_MSCODEC)
+		lxa_state->lxas_hw_features |= AUDIO_HWFEATURE_MSCODEC;
+
+	/*
+	 * if we made it here then we have different audio input and output
+	 * devices. this will allow us to report support for additional
+	 * hardware features that may not be supported by just the input or
+	 * output device alone.
+	 */
+
+	/* always report that we support both playback and recording */
+	lxa_state->lxas_hw_features |=
+	    AUDIO_HWFEATURE_PLAY | AUDIO_HWFEATURE_RECORD;
+
+	/* always report full duplex support */
+	lxa_state->lxas_hw_features |= AUDIO_HWFEATURE_DUPLEX;
+
+	/* never report that we have input to output loopback support */
+	ASSERT((lxa_state->lxas_hw_features & AUDIO_HWFEATURE_IN2OUT) == 0);
+	return (0);
+}
+
+static int
+lxa_dev_open(lxa_state_t *lxa_state)
+{
+	char		*idev, *odev;
+	int		flags, rv;
+	ldi_handle_t	lh;
+	ldi_ident_t	li = NULL;
+
+	ASSERT((lxa_state->lxas_type == LXA_TYPE_AUDIO) ||
+	    (lxa_state->lxas_type == LXA_TYPE_AUDIOCTL));
+
+	/*
+	 * check if we have configuration properties for this zone.
+	 * if we don't then audio isn't supported in this zone.
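+	 * (the per-zone properties are created by the global zone via
+	 * the LXA_IOC_ZONE_REG devctl ioctl; see lxa_zone_reg() below.)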
+ */ + idev = lxa_devprop_lookup(getzonename(), LXA_PROP_INPUTDEV, + lxa_state->lxas_type); + odev = lxa_devprop_lookup(getzonename(), LXA_PROP_OUTPUTDEV, + lxa_state->lxas_type); + + /* make sure there is at least one device to read from or write to */ + if ((idev == NULL) && (odev == NULL)) + return (ENODEV); + + /* see if the input and output devices are actually the same device */ + if (((idev != NULL) && (odev != NULL)) && + (strcmp(idev, odev) == 0)) + lxa_state->lxas_devs_same = 1; + + /* we don't respect FEXCL */ + flags = lxa_state->lxas_flags & ~FEXCL; + if (lxa_state->lxas_type == LXA_TYPE_AUDIO) { + /* + * if we're opening audio devices then we need to muck + * with the FREAD/FWRITE flags. + * + * certain audio device may only support input or output + * (but not both.) so if we're multiplexing input/output + * to different devices we need to make sure we don't try + * and open the output device for reading and the input + * device for writing. + * + * if we're using the same device for input/output we still + * need to do this because some audio devices won't let + * themselves be opened multiple times for read access. + */ + lxa_state->lxas_idev_flags = flags & ~FWRITE; + lxa_state->lxas_odev_flags = flags & ~FREAD; + + /* make sure we have devices to read from and write to */ + if (((flags & FREAD) && (idev == NULL)) || + ((flags & FWRITE) && (odev == NULL))) { + rv = ENODEV; + goto out; + } + } else { + lxa_state->lxas_idev_flags = lxa_state->lxas_odev_flags = flags; + } + + /* get an ident to open the devices */ + if (ldi_ident_from_dev(lxa_state->lxas_dev_new, &li) != 0) { + rv = ENODEV; + goto out; + } + + /* open the input device */ + lxa_state->lxas_idev_lh = NULL; + if (((lxa_state->lxas_type == LXA_TYPE_AUDIOCTL) || + (lxa_state->lxas_idev_flags & FREAD)) && + (idev != NULL)) { + rv = ldi_open_by_name(idev, lxa_state->lxas_idev_flags, + kcred, &lh, li); + if (rv != 0) { + zcmn_err(getzoneid(), CE_WARN, "lxa_open_dev: " + "unable to open audio device: %s", idev); + zcmn_err(getzoneid(), CE_WARN, "lxa_open_dev: " + "possible zone audio configuration error"); + goto out; + } + lxa_state->lxas_idev_lh = lh; + } + + /* open the output device */ + lxa_state->lxas_odev_lh = NULL; + if (((lxa_state->lxas_type == LXA_TYPE_AUDIOCTL) || + (lxa_state->lxas_odev_flags & FWRITE)) && + (odev != NULL)) { + rv = ldi_open_by_name(odev, lxa_state->lxas_odev_flags, + kcred, &lh, li); + if (rv != 0) { + /* + * If this open failed and we previously opened an + * input device, it is the responsibility of the + * caller to close that device after we return + * failure here. 
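+			 * (our caller, lxa_open(), does this by calling
+			 * lxa_state_close() when lxa_dev_open() fails.)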
+ */ + zcmn_err(getzoneid(), CE_WARN, "lxa_open_dev: " + "unable to open audio device: %s", odev); + zcmn_err(getzoneid(), CE_WARN, "lxa_open_dev: " + "possible zone audio configuration error"); + goto out; + } + lxa_state->lxas_odev_lh = lh; + } + + /* free up stuff */ +out: + if (li != NULL) + ldi_ident_release(li); + if (idev != NULL) + strfree(idev); + if (odev != NULL) + strfree(odev); + + return (rv); +} + +void +lxa_mmap_thread_exit(lxa_state_t *lxa_state) +{ + mutex_enter(&lxa_lock); + lxa_state->lxas_mmap_thread = NULL; + lxa_state->lxas_mmap_thread_frag = 0; + lxa_state->lxas_mmap_thread_running = 0; + lxa_state->lxas_mmap_thread_exit = 0; + mutex_exit(&lxa_lock); + thread_exit(); + /*NOTREACHED*/ +} + +void +lxa_mmap_thread(lxa_state_t *lxa_state) +{ + struct uio uio, uio_null; + iovec_t iovec, iovec_null; + uint_t bytes_per_sec, usec_per_frag, ticks_per_frag; + int rv, junk, eof, retry; + audio_info_t ai; + + /* we better be setup for writing to the output device */ + ASSERT((lxa_state->lxas_flags & FWRITE) != 0); + ASSERT(lxa_state->lxas_odev_lh != NULL); + + /* setup a uio to output one fragment */ + uio.uio_iov = &iovec; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = 0; + uio.uio_llimit = MAXOFFSET_T; + + /* setup a uio to output a eof (a fragment with a length of 0) */ + uio_null.uio_iov = &iovec_null; + uio_null.uio_iov->iov_len = 0; + uio_null.uio_iov->iov_base = NULL; + uio_null.uio_iovcnt = 1; + uio_null.uio_offset = 0; + uio_null.uio_segflg = UIO_SYSSPACE; + uio_null.uio_fmode = 0; + uio_null.uio_extflg = 0; + uio_null.uio_llimit = MAXOFFSET_T; + uio_null.uio_resid = 0; + +lxa_mmap_thread_top: + ASSERT(!MUTEX_HELD(&lxa_lock)); + + /* first drain any pending audio output */ + if ((rv = ldi_ioctl(lxa_state->lxas_odev_lh, + AUDIO_DRAIN, NULL, FKIOCTL, kcred, &junk)) != 0) { + cmn_err(CE_WARN, "lxa_mmap_thread: " + "AUDIO_DRAIN failed, aborting audio output"); + lxa_mmap_thread_exit(lxa_state); + /*NOTREACHED*/ + } + + /* + * we depend on the ai.play.eof value to keep track of + * audio output progress so reset it here. + */ + AUDIO_INITINFO(&ai); + ai.play.eof = 0; + if ((rv = ldi_ioctl(lxa_state->lxas_odev_lh, + AUDIO_SETINFO, (intptr_t)&ai, FKIOCTL, kcred, &junk)) != 0) { + cmn_err(CE_WARN, "lxa_mmap_thread: " + "AUDIO_SETINFO failed, aborting audio output"); + lxa_mmap_thread_exit(lxa_state); + /*NOTREACHED*/ + } + + /* + * we're going to need to know the sampling rate and number + * of output channels to estimate how long we can sleep between + * requests. 
+	 */
+	if ((rv = ldi_ioctl(lxa_state->lxas_odev_lh, AUDIO_GETINFO,
+	    (intptr_t)&ai, FKIOCTL, kcred, &junk)) != 0) {
+		cmn_err(CE_WARN, "lxa_mmap_thread: "
+		    "AUDIO_GETINFO failed, aborting audio output");
+		lxa_mmap_thread_exit(lxa_state);
+		/*NOTREACHED*/
+	}
+
+	/* estimate how many ticks it takes to output a fragment of data */
+	bytes_per_sec = (ai.play.sample_rate * ai.play.channels *
+	    ai.play.precision) / 8;
+	usec_per_frag = MICROSEC * lxa_state->lxas_frag_size / bytes_per_sec;
+	ticks_per_frag = drv_usectohz(usec_per_frag);
+
+	/* queue up three fragments of data into the output stream */
+	eof = 3;
+
+	/* sanity check the eof value */
+	ASSERT(ai.play.eof == 0);
+	ai.play.eof = 0;
+
+	/* we always start audio output at fragment 0 */
+	mutex_enter(&lxa_lock);
+	lxa_state->lxas_mmap_thread_frag = 0;
+
+	/*
+	 * we shouldn't have allowed the mapping if it isn't a multiple
+	 * of the fragment size
+	 */
+	ASSERT((lxa_state->lxas_umem_len % lxa_state->lxas_frag_size) == 0);
+
+	while (!lxa_state->lxas_mmap_thread_exit) {
+		size_t	start, end;
+
+		/*
+		 * calculate the start and ending offsets of the next
+		 * fragment to output
+		 */
+		start = lxa_state->lxas_mmap_thread_frag *
+		    lxa_state->lxas_frag_size;
+		end = start + lxa_state->lxas_frag_size;
+
+		ASSERT(start < lxa_state->lxas_umem_len);
+		ASSERT(end <= lxa_state->lxas_umem_len);
+
+		/* setup the uio to output one fragment of audio */
+		uio.uio_resid = end - start;
+		uio.uio_iov->iov_len = end - start;
+		uio.uio_iov->iov_base = &lxa_state->lxas_umem_ptr[start];
+
+		/* increment the current fragment index */
+		lxa_state->lxas_mmap_thread_frag =
+		    (lxa_state->lxas_mmap_thread_frag + 1) %
+		    (lxa_state->lxas_umem_len / lxa_state->lxas_frag_size);
+
+		/* drop the audio lock before actually outputting data */
+		mutex_exit(&lxa_lock);
+
+		/*
+		 * write the fragment of audio data to the device stream
+		 * then write an eof to the stream to tell the device to
+		 * increment ai.play.eof when it's done processing the
+		 * fragment we just wrote
+		 */
+		if ((rv = ldi_write(lxa_state->lxas_odev_lh,
+		    &uio, kcred)) != 0) {
+			cmn_err(CE_WARN, "lxa_mmap_thread: "
+			    "ldi_write() failed (%d), "
+			    "resetting audio output", rv);
+			goto lxa_mmap_thread_top;
+		}
+		if ((rv = ldi_write(lxa_state->lxas_odev_lh,
+		    &uio_null, kcred)) != 0) {
+			cmn_err(CE_WARN, "lxa_mmap_thread: "
+			    "ldi_write(eof) failed (%d), "
+			    "resetting audio output", rv);
+			goto lxa_mmap_thread_top;
+		}
+
+		/*
+		 * we want to avoid buffer underrun so ensure that
+		 * there is always at least one fragment of data in the
+		 * output stream.
+		 */
+		mutex_enter(&lxa_lock);
+		if (--eof > 0) {
+			continue;
+		}
+
+		/*
+		 * now we wait until the audio device has finished outputting
+		 * at least one fragment of data.
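+		 * we poll AUDIO_GETINFO and watch for ai.play.eof to
+		 * advance past the value we last observed.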
+		 */
+		retry = 0;
+		while (!lxa_state->lxas_mmap_thread_exit && (eof == 0)) {
+			uint_t ai_eof_old = ai.play.eof;
+
+			mutex_exit(&lxa_lock);
+
+			/*
+			 * delay for the number of ticks it takes
+			 * to output one fragment of data
+			 */
+			if (ticks_per_frag > 0)
+				delay(ticks_per_frag);
+
+			/* check if we've managed to output any fragments */
+			if ((rv = ldi_ioctl(lxa_state->lxas_odev_lh,
+			    AUDIO_GETINFO, (intptr_t)&ai,
+			    FKIOCTL, kcred, &junk)) != 0) {
+				cmn_err(CE_WARN, "lxa_mmap_thread: "
+				    "AUDIO_GETINFO failed (%d), "
+				    "resetting audio output", rv);
+				/* re-start mmap audio output */
+				goto lxa_mmap_thread_top;
+			}
+
+			if (ai_eof_old == ai.play.eof) {
+				/* institute a random retry limit */
+				if (retry++ < 100) {
+					mutex_enter(&lxa_lock);
+					continue;
+				}
+				cmn_err(CE_WARN, "lxa_mmap_thread: "
+				    "output stalled, "
+				    "resetting audio output");
+				/* re-start mmap audio output */
+				goto lxa_mmap_thread_top;
+			}
+
+			if (ai.play.eof > ai_eof_old) {
+				eof = ai.play.eof - ai_eof_old;
+			} else {
+				/*
+				 * eof counter wrapped around; the unsigned
+				 * subtraction still yields the number of
+				 * new eofs, modulo UINT_MAX + 1.
+				 */
+				ASSERT(ai.play.eof < ai_eof_old);
+				eof = (int)(ai.play.eof - ai_eof_old);
+			}
+			/* we're done with this loop so re-acquire the lock */
+			ASSERT(eof != 0);
+			mutex_enter(&lxa_lock);
+		}
+	}
+	mutex_exit(&lxa_lock);
+	lxa_mmap_thread_exit(lxa_state);
+	/*NOTREACHED*/
+}
+
+static void
+lxa_mmap_output_disable(lxa_state_t *lxa_state)
+{
+	kt_did_t tid;
+
+	mutex_enter(&lxa_lock);
+
+	/* if the output thread isn't running there's nothing to do */
+	if (lxa_state->lxas_mmap_thread_running == 0) {
+		mutex_exit(&lxa_lock);
+		return;
+	}
+
+	/* tell the pcm mmap output thread to exit */
+	lxa_state->lxas_mmap_thread_exit = 1;
+
+	/* wait for the mmap output thread to exit */
+	tid = lxa_state->lxas_mmap_thread->t_did;
+	mutex_exit(&lxa_lock);
+	thread_join(tid);
+}
+
+static void
+lxa_mmap_output_enable(lxa_state_t *lxa_state)
+{
+	mutex_enter(&lxa_lock);
+
+	/* if the output thread is already running there's nothing to do */
+	if (lxa_state->lxas_mmap_thread_running != 0) {
+		mutex_exit(&lxa_lock);
+		return;
+	}
+
+	/* setup output state */
+	lxa_state->lxas_mmap_thread_running = 1;
+	lxa_state->lxas_mmap_thread_exit = 0;
+	lxa_state->lxas_mmap_thread_frag = 0;
+
+	/* kick off a thread to do the mmap pcm output */
+	lxa_state->lxas_mmap_thread = thread_create(NULL, 0,
+	    (void (*)())lxa_mmap_thread, lxa_state,
+	    0, &p0, TS_RUN, minclsyspri);
+	ASSERT(lxa_state->lxas_mmap_thread != NULL);
+
+	mutex_exit(&lxa_lock);
+}
+
+static int
+lxa_ioc_mmap_output(lxa_state_t *lxa_state, intptr_t arg, int mode)
+{
+	uint_t	trigger;
+
+	/* we only support output via mmap */
+	if ((lxa_state->lxas_flags & FWRITE) == 0)
+		return (EINVAL);
+
+	/* if the user hasn't mmaped the device then there's nothing to do */
+	if (lxa_state->lxas_umem_cookie == NULL)
+		return (EINVAL);
+
+	/* copy in the request */
+	if (ddi_copyin((void *)arg, &trigger, sizeof (trigger), mode) != 0)
+		return (EFAULT);
+
+	/* a zero value disables output */
+	if (trigger == 0) {
+		lxa_mmap_output_disable(lxa_state);
+		return (0);
+	}
+
+	/* a non-zero value enables output */
+	lxa_mmap_output_enable(lxa_state);
+	return (0);
+}
+
+static int
+lxa_ioc_mmap_ptr(lxa_state_t *lxa_state, intptr_t arg, int mode)
+{
+	int	ptr;
+
+	/* we only support output via mmap */
+	if ((lxa_state->lxas_flags & FWRITE) == 0)
+		return (EINVAL);
+
+	/* if the user hasn't mmaped the device then there's nothing to do */
+	if (lxa_state->lxas_umem_cookie == NULL)
+		return (EINVAL);
+
+	/* if the output thread isn't running then there's nothing to do */
+	if (lxa_state->lxas_mmap_thread_running == 0)
+		return (EINVAL);
+
+	mutex_enter(&lxa_lock);
+	ptr = lxa_state->lxas_mmap_thread_frag * lxa_state->lxas_frag_size;
+	mutex_exit(&lxa_lock);
+
+	if (ddi_copyout(&ptr, (void *)arg, sizeof (ptr), mode) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
+static int
+lxa_ioc_get_frag_info(lxa_state_t *lxa_state, intptr_t arg, int mode)
+{
+	lxa_frag_info_t	fi;
+
+	fi.lxa_fi_size = lxa_state->lxas_frag_size;
+	fi.lxa_fi_cnt = lxa_state->lxas_frag_cnt;
+
+	if (ddi_copyout(&fi, (void *)arg, sizeof (fi), mode) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
+static int
+lxa_ioc_set_frag_info(lxa_state_t *lxa_state, intptr_t arg, int mode)
+{
+	lxa_frag_info_t	fi;
+
+	/* if the device is mmaped we can't change the fragment settings */
+	if (lxa_state->lxas_umem_cookie != NULL)
+		return (EINVAL);
+
+	/* copy in the request */
+	if (ddi_copyin((void *)arg, &fi, sizeof (fi), mode) != 0)
+		return (EFAULT);
+
+	/* do basic bounds checking; don't accept size values less than 16 */
+	if ((fi.lxa_fi_cnt == 0) || (fi.lxa_fi_size < 16))
+		return (EINVAL);
+
+	lxa_state->lxas_frag_size = fi.lxa_fi_size;
+	lxa_state->lxas_frag_cnt = fi.lxa_fi_cnt;
+
+	return (0);
+}
+
+static int
+lxa_audio_drain(lxa_state_t *lxa_state)
+{
+	int	junk;
+
+	/* only applies to output buffers */
+	if (lxa_state->lxas_odev_lh == NULL)
+		return (EINVAL);
+
+	/* can't fail so ignore the return value */
+	(void) ldi_ioctl(lxa_state->lxas_odev_lh, AUDIO_DRAIN, NULL,
+	    FKIOCTL, kcred, &junk);
+	return (0);
+}
+
+/*
+ * lxa_audio_info_merge() usage notes:
+ *
+ * - it's important to make sure NOT to get the ai_idev and ai_odev
+ *   parameters mixed up when calling lxa_audio_info_merge().
+ *
+ * - it's important for the caller to make sure that AUDIO_GETINFO
+ *   was called for the input device BEFORE the output device. (see
+ *   the comments for merging the monitor_gain setting to see why.)
+ */
+static void
+lxa_audio_info_merge(lxa_state_t *lxa_state,
+    audio_info_t *ai_idev, audio_info_t *ai_odev, audio_info_t *ai_merged)
+{
+	/* if we're not setup for output return the input device info */
+	if (lxa_state->lxas_odev_lh == NULL) {
+		*ai_merged = *ai_idev;
+		return;
+	}
+
+	/* if we're not setup for input return the output device info */
+	if (lxa_state->lxas_idev_lh == NULL) {
+		*ai_merged = *ai_odev;
+		return;
+	}
+
+	/* get record values from the input device */
+	ai_merged->record = ai_idev->record;
+
+	/* get play values from the output device */
+	ai_merged->play = ai_odev->play;
+
+	/* muting status only matters for the output device */
+	ai_merged->output_muted = ai_odev->output_muted;
+
+	/* we don't support device reference counts, always return 1 */
+	ai_merged->ref_cnt = 1;
+
+	/*
+	 * for supported hw/sw features report the combined feature
+	 * set we calculated out earlier.
+	 */
+	ai_merged->hw_features = lxa_state->lxas_hw_features;
+	ai_merged->sw_features = lxa_state->lxas_sw_features;
+
+	if (!lxa_state->lxas_devs_same) {
+		/*
+		 * if the input and output devices are different
+		 * physical devices then we don't support input to
+		 * output loopback so we always report the input
+		 * to output loopback gain to be zero.
+		 */
+		ai_merged->monitor_gain = 0;
+	} else {
+		/*
+		 * the input and output devices are actually the
+		 * same physical device. hence it probably supports
+		 * input to output loopback. regardless we should
+		 * pass back the input to output gain reported by
+		 * the device.
+		 * when we pick a value to pass back we
+		 * use the output device value since that was
+		 * the most recently queried. (we base this
+		 * decision on the assumption that io gain is
+		 * actually a hardware setting in the device and
+		 * hence if it is changed on one open instance of
+		 * the device the change will be visible to all
+		 * other instances of the device.)
+		 */
+		ai_merged->monitor_gain = ai_odev->monitor_gain;
+	}
+
+	/*
+	 * for currently enabled software features always return the
+	 * merger of the two. (of course the enabled software features
+	 * for the input and output devices should always be the same,
+	 * so if they aren't, complain.)
+	 */
+	if (ai_idev->sw_features_enabled != ai_odev->sw_features_enabled)
+		zcmn_err(getzoneid(), CE_WARN, "lx_audio: "
+		    "unexpected software feature state");
+	ai_merged->sw_features_enabled =
+	    ai_idev->sw_features_enabled & ai_odev->sw_features_enabled;
+}
+
+static int
+lxa_audio_setinfo(lxa_state_t *lxa_state, int cmd, intptr_t arg,
+    int mode)
+{
+	audio_info_t	ai, ai_null, ai_idev, ai_odev;
+	int		rv, junk;
+
+	/* copy in the request */
+	if (ddi_copyin((void *)arg, &ai, sizeof (ai), mode) != 0)
+		return (EFAULT);
+
+	/*
+	 * if the caller is attempting to enable a software feature that
+	 * we didn't report as supported then return an error
+	 */
+	if ((ai.sw_features_enabled != -1) &&
+	    (ai.sw_features_enabled & ~lxa_state->lxas_sw_features))
+		return (EINVAL);
+
+	/*
+	 * if a process has mmaped this device then we don't allow
+	 * changes to the play.eof field (since mmap output depends
+	 * on this field.)
+	 */
+	if ((lxa_state->lxas_umem_cookie != NULL) &&
+	    (ai.play.eof != -1))
+		return (EIO);
+
+	/* initialize the new requests */
+	AUDIO_INITINFO(&ai_null);
+	ai_idev = ai_odev = ai;
+
+	/* remove audio input settings from the output device request */
+	ai_odev.record = ai_null.record;
+
+	/* remove audio output settings from the input device request */
+	ai_idev.play = ai_null.play;
+	ai_idev.output_muted = ai_null.output_muted;
+
+	/* apply settings to the input device */
+	if ((lxa_state->lxas_idev_lh != NULL) &&
+	    ((rv = ldi_ioctl(lxa_state->lxas_idev_lh, cmd,
+	    (intptr_t)&ai_idev, FKIOCTL, kcred, &junk)) != 0))
+		return (rv);
+
+	/* apply settings to the output device */
+	if ((lxa_state->lxas_odev_lh != NULL) &&
+	    ((rv = ldi_ioctl(lxa_state->lxas_odev_lh, cmd,
+	    (intptr_t)&ai_odev, FKIOCTL, kcred, &junk)) != 0))
+		return (rv);
+
+	/*
+	 * an AUDIO_SETINFO call performs an implicit AUDIO_GETINFO to
+	 * return values (see the comments in audioio.h.) so we need
+	 * to combine the values returned from the input and output
+	 * device back into the user's buffer.
+	 */
+	lxa_audio_info_merge(lxa_state, &ai_idev, &ai_odev, &ai);
+
+	/* copyout the results */
+	if (ddi_copyout(&ai, (void *)arg, sizeof (ai), mode) != 0) {
+		return (EFAULT);
+	}
+
+	return (0);
+}
+
+static int
+lxa_audio_getinfo(lxa_state_t *lxa_state, intptr_t arg, int mode)
+{
+	audio_info_t	ai, ai_idev, ai_odev;
+	int		rv, junk;
+
+	/* get the settings from the input device */
+	if ((lxa_state->lxas_idev_lh != NULL) &&
+	    ((rv = ldi_ioctl(lxa_state->lxas_idev_lh, AUDIO_GETINFO,
+	    (intptr_t)&ai_idev, FKIOCTL, kcred, &junk)) != 0))
+		return (rv);
+
+	/* get the settings from the output device */
+	if ((lxa_state->lxas_odev_lh != NULL) &&
+	    ((rv = ldi_ioctl(lxa_state->lxas_odev_lh, AUDIO_GETINFO,
+	    (intptr_t)&ai_odev, FKIOCTL, kcred, &junk)) != 0))
+		return (rv);
+
+	/*
+	 * we need to combine the values returned from the input
+	 * and output device back into a single user buffer.
+	 */
+	lxa_audio_info_merge(lxa_state, &ai_idev, &ai_odev, &ai);
+
+	/* copyout the results */
+	if (ddi_copyout(&ai, (void *)arg, sizeof (ai), mode) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
+static int
+lxa_mixer_ai_from_lh(ldi_handle_t lh, audio_info_t *ai)
+{
+	int	rv, junk;
+
+	ASSERT((lh != NULL) && (ai != NULL));
+
+	/* get the device state and channel state */
+	rv = ldi_ioctl(lh, AUDIO_GETINFO, (intptr_t)ai, FKIOCTL, kcred, &junk);
+
+	return (rv);
+}
+
+static int
+lxa_mixer_get_ai(lxa_state_t *lxa_state, audio_info_t *ai)
+{
+	audio_info_t	ai_idev, ai_odev;
+	int		rv;
+
+	/* if there is no input device, query the output device */
+	if (lxa_state->lxas_idev_lh == NULL)
+		return (lxa_mixer_ai_from_lh(lxa_state->lxas_odev_lh, ai));
+
+	/* if there is no output device, query the input device */
+	if (lxa_state->lxas_odev_lh == NULL)
+		return (lxa_mixer_ai_from_lh(lxa_state->lxas_idev_lh, ai));
+
+	/*
+	 * now get the audio_info and channel information for the
+	 * underlying input and output devices.
+	 */
+	if ((rv = lxa_mixer_ai_from_lh(lxa_state->lxas_idev_lh,
+	    &ai_idev)) != 0)
+		return (rv);
+	if ((rv = lxa_mixer_ai_from_lh(lxa_state->lxas_odev_lh,
+	    &ai_odev)) != 0)
+		return (rv);
+
+	/* now merge the audio_info structures */
+	lxa_audio_info_merge(lxa_state, &ai_idev, &ai_odev, ai);
+	return (0);
+}
+
+static int
+lxa_mixer_get_common(lxa_state_t *lxa_state, int cmd, intptr_t arg, int mode)
+{
+	lxa_mixer_levels_t	lxa_ml;
+	audio_info_t		ai;
+	int			rv;
+
+	ASSERT(lxa_state->lxas_type == LXA_TYPE_AUDIOCTL);
+
+	if ((rv = lxa_mixer_get_ai(lxa_state, &ai)) != 0)
+		return (rv);
+
+	switch (cmd) {
+	case LXA_IOC_MIXER_GET_VOL:
+		lxa_ml.lxa_ml_gain = ai.play.gain;
+		lxa_ml.lxa_ml_balance = ai.play.balance;
+		break;
+	case LXA_IOC_MIXER_GET_MIC:
+		lxa_ml.lxa_ml_gain = ai.record.gain;
+		lxa_ml.lxa_ml_balance = ai.record.balance;
+		break;
+	}
+
+	if (ddi_copyout(&lxa_ml, (void *)arg, sizeof (lxa_ml), mode) != 0)
+		return (EFAULT);
+	return (0);
+}
+
+static int
+lxa_mixer_set_common(lxa_state_t *lxa_state, int cmd, intptr_t arg, int mode)
+{
+	lxa_mixer_levels_t	lxa_ml;
+	audio_info_t		ai;
+
+	ASSERT(lxa_state->lxas_type == LXA_TYPE_AUDIOCTL);
+
+	/* get the new mixer settings */
+	if (ddi_copyin((void *)arg, &lxa_ml, sizeof (lxa_ml), mode) != 0)
+		return (EFAULT);
+
+	/* sanity check the mixer settings */
+	if (!LXA_MIXER_LEVELS_OK(&lxa_ml))
+		return (EINVAL);
+
+	/* initialize an audio_info struct with the new settings */
+	AUDIO_INITINFO(&ai);
+	switch (cmd) {
+	case LXA_IOC_MIXER_SET_VOL:
+		ai.play.gain = lxa_ml.lxa_ml_gain;
+		ai.play.balance = lxa_ml.lxa_ml_balance;
+		break;
+	case LXA_IOC_MIXER_SET_MIC:
+		ai.record.gain = lxa_ml.lxa_ml_gain;
+		ai.record.balance = lxa_ml.lxa_ml_balance;
+		break;
+	}
+
+	return (lxa_audio_setinfo(lxa_state, AUDIO_SETINFO, (intptr_t)&ai,
+	    FKIOCTL));
+}
+
+static int
+lxa_mixer_get_pcm(lxa_state_t *lxa_state, intptr_t arg, int mode)
+{
+	ASSERT(lxa_state->lxas_type == LXA_TYPE_AUDIOCTL);
+
+	/* simply return the cached pcm mixer settings */
+	mutex_enter(&lxa_lock);
+	if (ddi_copyout(&lxa_state->lxas_zs->lxa_zs_pcm_levels, (void *)arg,
+	    sizeof (lxa_state->lxas_zs->lxa_zs_pcm_levels), mode) != 0) {
+		mutex_exit(&lxa_lock);
+		return (EFAULT);
+	}
+	mutex_exit(&lxa_lock);
+	return (0);
+}
+
+static int
+lxa_mixer_set_pcm(lxa_state_t *lxa_state, intptr_t arg, int mode)
+{
+	lxa_mixer_levels_t	lxa_ml;
+	int			rv;
+
+	ASSERT(lxa_state->lxas_type == LXA_TYPE_AUDIOCTL);
+
+	/* get the new mixer settings */
+	if (ddi_copyin((void *)arg, &lxa_ml, sizeof (lxa_ml), mode) != 0)
+		return (EFAULT);
+
+	/* sanity check the mixer settings */
+	if (!LXA_MIXER_LEVELS_OK(&lxa_ml))
+		return (EINVAL);
+
+	mutex_enter(&lxa_lock);
+
+	/* if there is an active output channel, update it */
+	if (lxa_state->lxas_zs->lxa_zs_ostate != NULL) {
+		audio_info_t	ai;
+
+		/* initialize an audio_info struct with the new settings */
+		AUDIO_INITINFO(&ai);
+		ai.play.gain = lxa_ml.lxa_ml_gain;
+		ai.play.balance = lxa_ml.lxa_ml_balance;
+
+		if ((rv = lxa_audio_setinfo(lxa_state->lxas_zs->lxa_zs_ostate,
+		    AUDIO_SETINFO, (intptr_t)&ai, FKIOCTL)) != 0) {
+			mutex_exit(&lxa_lock);
+			return (rv);
+		}
+	}
+
+	/* update the cached mixer settings */
+	lxa_state->lxas_zs->lxa_zs_pcm_levels = lxa_ml;
+
+	mutex_exit(&lxa_lock);
+	return (0);
+}
+
+static int
+lxa_zone_reg(intptr_t arg, int mode)
+{
+	lxa_zone_reg_t	lxa_zr;
+	lxa_zstate_t	*lxa_zs = NULL;
+	char		*idev_name = NULL, *odev_name = NULL, *pval = NULL;
+	int		i, junk;
+
+	if (ddi_copyin((void *)arg, &lxa_zr, sizeof (lxa_zr), mode) != 0)
+		return (EFAULT);
+
+	/* make sure that zone_name is a valid string */
+	for (i = 0; i < sizeof (lxa_zr.lxa_zr_zone_name); i++)
+		if (lxa_zr.lxa_zr_zone_name[i] == '\0')
+			break;
+	if (i == sizeof (lxa_zr.lxa_zr_zone_name))
+		return (EINVAL);
+
+	/* make sure that inputdev is a valid string */
+	for (i = 0; i < sizeof (lxa_zr.lxa_zr_inputdev); i++)
+		if (lxa_zr.lxa_zr_inputdev[i] == '\0')
+			break;
+	if (i == sizeof (lxa_zr.lxa_zr_inputdev))
+		return (EINVAL);
+
+	/* make sure it's a valid inputdev property value */
+	if (lxa_devprop_verify(lxa_zr.lxa_zr_inputdev) != 0)
+		return (EINVAL);
+
+	/* make sure that outputdev is a valid string */
+	for (i = 0; i < sizeof (lxa_zr.lxa_zr_outputdev); i++)
+		if (lxa_zr.lxa_zr_outputdev[i] == '\0')
+			break;
+	if (i == sizeof (lxa_zr.lxa_zr_outputdev))
+		return (EINVAL);
+
+	/* make sure it's a valid outputdev property value */
+	if (lxa_devprop_verify(lxa_zr.lxa_zr_outputdev) != 0)
+		return (EINVAL);
+
+	/* get the property names */
+	idev_name = lxa_devprop_name(lxa_zr.lxa_zr_zone_name,
+	    LXA_PROP_INPUTDEV);
+	odev_name = lxa_devprop_name(lxa_zr.lxa_zr_zone_name,
+	    LXA_PROP_OUTPUTDEV);
+
+	/*
+	 * allocate and initialize a zone state structure.
+	 * since the audio device can't possibly be opened yet
+	 * (since we're setting it up now and the zone isn't booted
+	 * yet) assign some reasonable default pcm channel settings.
+	 * also, default to one mixer channel.
+	 */
+	lxa_zs = kmem_zalloc(sizeof (*lxa_zs), KM_SLEEP);
+	lxa_zs->lxa_zs_zonename = strdup(lxa_zr.lxa_zr_zone_name);
+	lxa_zs->lxa_zs_pcm_levels.lxa_ml_gain = AUDIO_MID_GAIN;
+	lxa_zs->lxa_zs_pcm_levels.lxa_ml_balance = AUDIO_MID_BALANCE;
+
+	mutex_enter(&lxa_lock);
+
+	/*
+	 * make sure this zone isn't already registered.
+	 * a zone is considered registered if properties for that zone
+	 * exist or there is a zone state structure for that zone.
+	 */
+	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip,
+	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+	    idev_name, &pval) == DDI_PROP_SUCCESS) {
+		goto err_unlock;
+	}
+	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip,
+	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+	    odev_name, &pval) == DDI_PROP_SUCCESS) {
+		goto err_unlock;
+	}
+	if (mod_hash_find(lxa_zstate_hash,
+	    (mod_hash_key_t)lxa_zs->lxa_zs_zonename,
+	    (mod_hash_val_t *)&junk) == 0)
+		goto err_unlock;
+
+	/*
+	 * create the new properties and insert the zone state structure
+	 * into the global hash
+	 */
+	if (ddi_prop_update_string(DDI_DEV_T_NONE, lxa_dip,
+	    idev_name, lxa_zr.lxa_zr_inputdev) != DDI_PROP_SUCCESS)
+		goto err_prop_remove;
+	if (ddi_prop_update_string(DDI_DEV_T_NONE, lxa_dip,
+	    odev_name, lxa_zr.lxa_zr_outputdev) != DDI_PROP_SUCCESS)
+		goto err_prop_remove;
+	if (mod_hash_insert(lxa_zstate_hash,
+	    (mod_hash_key_t)lxa_zs->lxa_zs_zonename,
+	    (mod_hash_val_t)lxa_zs) != 0)
+		goto err_prop_remove;
+
+	/* success! */
+	lxa_registered_zones++;
+	mutex_exit(&lxa_lock);
+
+	/* cleanup */
+	strfree(idev_name);
+	strfree(odev_name);
+	return (0);
+
+err_prop_remove:
+	(void) ddi_prop_remove(DDI_DEV_T_NONE, lxa_dip, idev_name);
+	(void) ddi_prop_remove(DDI_DEV_T_NONE, lxa_dip, odev_name);
+
+err_unlock:
+	mutex_exit(&lxa_lock);
+
+	if (lxa_zs != NULL) {
+		strfree(lxa_zs->lxa_zs_zonename);
+		kmem_free(lxa_zs, sizeof (*lxa_zs));
+	}
+	if (pval != NULL)
+		ddi_prop_free(pval);
+	if (idev_name != NULL)
+		strfree(idev_name);
+	if (odev_name != NULL)
+		strfree(odev_name);
+	return (EIO);
+}
+
+static int
+lxa_zone_unreg(intptr_t arg, int mode)
+{
+	lxa_zone_reg_t	lxa_zr;
+	lxa_zstate_t	*lxa_zs = NULL;
+	char		*idev_name = NULL, *odev_name = NULL, *pval = NULL;
+	int		rv, i;
+
+	if (ddi_copyin((void *)arg, &lxa_zr, sizeof (lxa_zr), mode) != 0)
+		return (EFAULT);
+
+	/* make sure that zone_name is a valid string */
+	for (i = 0; i < sizeof (lxa_zr.lxa_zr_zone_name); i++)
+		if (lxa_zr.lxa_zr_zone_name[i] == '\0')
+			break;
+	if (i == sizeof (lxa_zr.lxa_zr_zone_name))
+		return (EINVAL);
+
+	/* get the property names */
+	idev_name = lxa_devprop_name(lxa_zr.lxa_zr_zone_name,
+	    LXA_PROP_INPUTDEV);
+	odev_name = lxa_devprop_name(lxa_zr.lxa_zr_zone_name,
+	    LXA_PROP_OUTPUTDEV);
+
+	mutex_enter(&lxa_lock);
+
+	if (lxa_registered_zones <= 0) {
+		rv = ENOENT;
+		goto err_unlock;
+	}
+
+	/* make sure this zone is actually registered */
+	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip,
+	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+	    idev_name, &pval) != DDI_PROP_SUCCESS) {
+		rv = ENOENT;
+		goto err_unlock;
+	}
+	ddi_prop_free(pval);
+	pval = NULL;
+	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, lxa_dip,
+	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
+	    odev_name, &pval) != DDI_PROP_SUCCESS) {
+		rv = ENOENT;
+		goto err_unlock;
+	}
+	ddi_prop_free(pval);
+	pval = NULL;
+	if (mod_hash_find(lxa_zstate_hash,
+	    (mod_hash_key_t)lxa_zr.lxa_zr_zone_name,
+	    (mod_hash_val_t *)&lxa_zs) != 0) {
+		rv = ENOENT;
+		goto err_unlock;
+	}
+	ASSERT(strcmp(lxa_zr.lxa_zr_zone_name, lxa_zs->lxa_zs_zonename) == 0);
+
+	/*
+	 * if the audio device is currently in use then refuse to
+	 * unregister the zone
+	 */
+	if ((lxa_zs->lxa_zs_istate != NULL) ||
+	    (lxa_zs->lxa_zs_ostate != NULL)) {
+		rv = EBUSY;
+		goto err_unlock;
+	}
+
+	/* success! cleanup zone config state */
+	(void) ddi_prop_remove(DDI_DEV_T_NONE, lxa_dip, idev_name);
+	(void) ddi_prop_remove(DDI_DEV_T_NONE, lxa_dip, odev_name);
+
+	/*
+	 * note, the action of removing the zone state structure from the
+	 * hash will automatically free lxa_zs->lxa_zs_zonename.
+	 *
+	 * the reason for this is that we used lxa_zs->lxa_zs_zonename
+	 * as the hash key and by default mod_hash_create_strhash() uses
+	 * mod_hash_strkey_dtor() as the hash key destructor. (which
+	 * frees the key for us.)
+	 */
+	(void) mod_hash_remove(lxa_zstate_hash,
+	    (mod_hash_key_t)lxa_zr.lxa_zr_zone_name,
+	    (mod_hash_val_t *)&lxa_zs);
+	lxa_registered_zones--;
+	mutex_exit(&lxa_lock);
+
+	/* cleanup */
+	kmem_free(lxa_zs, sizeof (*lxa_zs));
+	strfree(idev_name);
+	strfree(odev_name);
+	return (0);
+
+err_unlock:
+	mutex_exit(&lxa_lock);
+
+	if (pval != NULL)
+		ddi_prop_free(pval);
+	if (idev_name != NULL)
+		strfree(idev_name);
+	if (odev_name != NULL)
+		strfree(odev_name);
+	return (rv);
+}
+
+static int
+lxa_ioctl_devctl(int cmd, intptr_t arg, int mode)
+{
+	/* devctl ioctls are only allowed from the global zone */
+	ASSERT(getzoneid() == 0);
+	if (getzoneid() != 0)
+		return (EINVAL);
+
+	switch (cmd) {
+	case LXA_IOC_ZONE_REG:
+		return (lxa_zone_reg(arg, mode));
+	case LXA_IOC_ZONE_UNREG:
+		return (lxa_zone_unreg(arg, mode));
+	}
+
+	return (EINVAL);
+}
+
+static int
+/*ARGSUSED*/
+lxa_open(dev_t *devp, int flags, int otyp, cred_t *credp)
+{
+	lxa_dev_type_t	open_type = LXA_TYPE_INVALID;
+	lxa_zstate_t	*lxa_zs;
+	lxa_state_t	*lxa_state;
+	minor_t		minor;
+	int		rv;
+
+	if (getminor(*devp) == LXA_MINORNUM_DEVCTL) {
+		/*
+		 * this is a devctl node, it exists to administer this
+		 * pseudo driver so it doesn't actually need access to
+		 * any underlying audio devices. hence there is nothing
+		 * really to do here. of course, this driver should
+		 * only be administered from the global zone.
+		 */
+		ASSERT(getzoneid() == 0);
+		if (getzoneid() != 0)
+			return (EINVAL);
+		return (0);
+	}
+
+	/* lookup the zone state structure */
+	if (mod_hash_find(lxa_zstate_hash, (mod_hash_key_t)getzonename(),
+	    (mod_hash_val_t *)&lxa_zs) != 0) {
+		return (EIO);
+	}
+
+	/* determine what type of device was opened */
+	switch (getminor(*devp)) {
+	case LXA_MINORNUM_DSP:
+		open_type = LXA_TYPE_AUDIO;
+		break;
+	case LXA_MINORNUM_MIXER:
+		open_type = LXA_TYPE_AUDIOCTL;
+		break;
+	default:
+		return (EINVAL);
+	}
+	ASSERT(open_type != LXA_TYPE_INVALID);
+
+	/* all other opens are clone opens so get a new minor node */
+	minor = id_alloc(lxa_minor_id);
+
+	/* allocate and initialize the new lxa_state structure */
+	lxa_state = kmem_zalloc(sizeof (*lxa_state), KM_SLEEP);
+	lxa_state->lxas_zs = lxa_zs;
+	lxa_state->lxas_dev_old = *devp;
+	lxa_state->lxas_dev_new = makedevice(getmajor(*devp), minor);
+	lxa_state->lxas_flags = flags;
+	lxa_state->lxas_type = open_type;
+
+	/* initialize the input and output device */
+	if (((rv = lxa_dev_open(lxa_state)) != 0) ||
+	    ((rv = lxa_dev_getfeatures(lxa_state)) != 0)) {
+		lxa_state_close(lxa_state);
+		return (rv);
+	}
+
+	/*
+	 * save this audio state structure into a hash indexed
+	 * by its minor device number. (this will provide a convenient
+	 * way to look up the state structure on future operations.)
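+	 * (lxa_close(), lxa_read(), lxa_write(), lxa_ioctl(), and
+	 * lxa_devmap() all begin with this minor-number lookup.)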
+ */ + if (mod_hash_insert(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t)lxa_state) != 0) { + lxa_state_close(lxa_state); + return (EIO); + } + + mutex_enter(&lxa_lock); + + /* apply the currently cached zone PCM mixer levels */ + if ((lxa_state->lxas_type == LXA_TYPE_AUDIO) && + (lxa_state->lxas_odev_lh != NULL)) { + audio_info_t ai; + + AUDIO_INITINFO(&ai); + ai.play.gain = lxa_zs->lxa_zs_pcm_levels.lxa_ml_gain; + ai.play.balance = lxa_zs->lxa_zs_pcm_levels.lxa_ml_balance; + + if ((rv = lxa_audio_setinfo(lxa_state, + AUDIO_SETINFO, (intptr_t)&ai, FKIOCTL)) != 0) { + mutex_exit(&lxa_lock); + lxa_state_close(lxa_state); + return (rv); + } + } + + /* + * we only allow one active open of the input or output device. + * check here for duplicate opens + */ + if (lxa_state->lxas_type == LXA_TYPE_AUDIO) { + if ((lxa_state->lxas_idev_lh != NULL) && + (lxa_zs->lxa_zs_istate != NULL)) { + mutex_exit(&lxa_lock); + lxa_state_close(lxa_state); + return (EBUSY); + } + if ((lxa_state->lxas_odev_lh != NULL) && + (lxa_zs->lxa_zs_ostate != NULL)) { + mutex_exit(&lxa_lock); + lxa_state_close(lxa_state); + return (EBUSY); + } + + /* not a duplicate open, update the global zone state */ + if (lxa_state->lxas_idev_lh != NULL) + lxa_zs->lxa_zs_istate = lxa_state; + if (lxa_state->lxas_odev_lh != NULL) + lxa_zs->lxa_zs_ostate = lxa_state; + } + mutex_exit(&lxa_lock); + + /* make sure to return our newly allocated dev_t */ + *devp = lxa_state->lxas_dev_new; + return (0); +} + +static int +/*ARGSUSED*/ +lxa_close(dev_t dev, int flags, int otyp, cred_t *credp) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + + /* handle devctl minor nodes (these nodes don't have a handle */ + if (getminor(dev) == LXA_MINORNUM_DEVCTL) + return (0); + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) + return (EINVAL); + + lxa_state_close(lxa_state); + return (0); +} + +static int +/*ARGSUSED*/ +lxa_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + int rv; + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) + return (EINVAL); + + /* + * if a process has mmaped this device then we don't allow + * any more reads or writes to the device + */ + if (lxa_state->lxas_umem_cookie != NULL) + return (EIO); + + /* we can't do a read if there is no input device */ + if (lxa_state->lxas_idev_lh == NULL) + return (EBADF); + + /* pass the request on */ + while (uiop->uio_resid != 0) { + rv = ldi_read(lxa_state->lxas_idev_lh, uiop, kcred); + if ((rv != 0) || (uiop->uio_fmode & (FNONBLOCK|FNDELAY))) { + break; + } + } + return (rv); +} + +static int +/*ARGSUSED*/ +lxa_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + int rv; + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) + return (EINVAL); + + /* + * if a process has mmaped this device then we don't allow + * any more reads or writes to the device + */ + if (lxa_state->lxas_umem_cookie != NULL) + return (EIO); + + /* we can't do a write if there is no output device */ + if (lxa_state->lxas_odev_lh == NULL) + return (EBADF); + + /* pass the request on */ + while (uiop->uio_resid != 0) { + rv = ldi_write(lxa_state->lxas_odev_lh, uiop, kcred); + if 
((rv != 0) || (uiop->uio_fmode & (FNONBLOCK|FNDELAY))) { + break; + } + } + return (rv); +} + +static int +/*ARGSUSED*/ +lxa_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + + /* handle devctl minor nodes (these nodes don't have a handle */ + if (getminor(dev) == LXA_MINORNUM_DEVCTL) + return (lxa_ioctl_devctl(cmd, arg, mode)); + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) + return (EINVAL); + + ASSERT((lxa_state->lxas_type == LXA_TYPE_AUDIO) || + (lxa_state->lxas_type == LXA_TYPE_AUDIOCTL)); + + switch (cmd) { + case LXA_IOC_GETMINORNUM: + { + int minornum = getminor(lxa_state->lxas_dev_old); + if (ddi_copyout(&minornum, (void *)arg, + sizeof (minornum), mode) != 0) + return (EFAULT); + } + return (0); + } + + if (lxa_state->lxas_type == LXA_TYPE_AUDIO) { + /* deal with native ioctl */ + switch (cmd) { + case LXA_IOC_MMAP_OUTPUT: + return (lxa_ioc_mmap_output(lxa_state, arg, mode)); + case LXA_IOC_MMAP_PTR: + return (lxa_ioc_mmap_ptr(lxa_state, arg, mode)); + case LXA_IOC_GET_FRAG_INFO: + return (lxa_ioc_get_frag_info(lxa_state, arg, mode)); + case LXA_IOC_SET_FRAG_INFO: + return (lxa_ioc_set_frag_info(lxa_state, arg, mode)); + } + + /* deal with layered ioctls */ + switch (cmd) { + case AUDIO_DRAIN: + return (lxa_audio_drain(lxa_state)); + case AUDIO_SETINFO: + return (lxa_audio_setinfo(lxa_state, + AUDIO_SETINFO, arg, mode)); + case AUDIO_GETINFO: + return (lxa_audio_getinfo(lxa_state, arg, mode)); + } + } + + if (lxa_state->lxas_type == LXA_TYPE_AUDIOCTL) { + /* deal with native ioctl */ + switch (cmd) { + case LXA_IOC_MIXER_GET_VOL: + return (lxa_mixer_get_common(lxa_state, + cmd, arg, mode)); + case LXA_IOC_MIXER_SET_VOL: + return (lxa_mixer_set_common(lxa_state, + cmd, arg, mode)); + case LXA_IOC_MIXER_GET_MIC: + return (lxa_mixer_get_common(lxa_state, + cmd, arg, mode)); + case LXA_IOC_MIXER_SET_MIC: + return (lxa_mixer_set_common(lxa_state, + cmd, arg, mode)); + case LXA_IOC_MIXER_GET_PCM: + return (lxa_mixer_get_pcm(lxa_state, arg, mode)); + case LXA_IOC_MIXER_SET_PCM: + return (lxa_mixer_set_pcm(lxa_state, arg, mode)); + } + + } + + return (EINVAL); +} + +static int +/*ARGSUSED*/ +lxa_devmap(dev_t dev, devmap_cookie_t dhp, + offset_t off, size_t len, size_t *maplen, uint_t model) +{ + lxa_state_t *lxa_state; + minor_t minor = getminor(dev); + ddi_umem_cookie_t umem_cookie; + void *umem_ptr; + int rv; + + /* get the handle for this device */ + if (mod_hash_find(lxa_state_hash, (mod_hash_key_t)(uintptr_t)minor, + (mod_hash_val_t *)&lxa_state) != 0) + return (EINVAL); + + /* we only support mmaping of audio devices */ + if (lxa_state->lxas_type != LXA_TYPE_AUDIO) + return (EINVAL); + + /* we only support output via mmap */ + if ((lxa_state->lxas_flags & FWRITE) == 0) + return (EINVAL); + + /* sanity check the amount of memory the user is allocating */ + if ((len == 0) || + (len > LXA_OSS_FRAG_MEM) || + ((len % lxa_state->lxas_frag_size) != 0)) + return (EINVAL); + + /* allocate and clear memory to mmap */ + umem_ptr = ddi_umem_alloc(len, DDI_UMEM_NOSLEEP, &umem_cookie); + if (umem_ptr == NULL) + return (ENOMEM); + bzero(umem_ptr, len); + + /* setup the memory mappings */ + rv = devmap_umem_setup(dhp, lxa_dip, NULL, umem_cookie, 0, len, + PROT_USER | PROT_READ | PROT_WRITE, 0, NULL); + if (rv != 0) { + ddi_umem_free(umem_cookie); + return (EIO); + } + + mutex_enter(&lxa_lock); + + /* we 
+	/* we only support one mmap per open */
+	if (lxa_state->lxas_umem_cookie != NULL) {
+		ASSERT(lxa_state->lxas_umem_ptr != NULL);
+		mutex_exit(&lxa_lock);
+		ddi_umem_free(umem_cookie);
+		return (EBUSY);
+	}
+	ASSERT(lxa_state->lxas_umem_ptr == NULL);
+
+	*maplen = len;
+	lxa_state->lxas_umem_len = len;
+	lxa_state->lxas_umem_ptr = umem_ptr;
+	lxa_state->lxas_umem_cookie = umem_cookie;
+	mutex_exit(&lxa_lock);
+	return (0);
+}
+
+static int
+/*ARGSUSED*/
+lxa_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int instance = ddi_get_instance(dip);
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	ASSERT(instance == 0);
+	if (instance != 0)
+		return (DDI_FAILURE);
+
+	lxa_dip = dip;
+	mutex_init(&lxa_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* create our minor nodes */
+	if (ddi_create_minor_node(dip, LXA_MINORNAME_DEVCTL, S_IFCHR,
+	    LXA_MINORNUM_DEVCTL, DDI_PSEUDO, 0) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	if (ddi_create_minor_node(dip, LXA_MINORNAME_DSP, S_IFCHR,
+	    LXA_MINORNUM_DSP, DDI_PSEUDO, 0) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	if (ddi_create_minor_node(dip, LXA_MINORNAME_MIXER, S_IFCHR,
+	    LXA_MINORNUM_MIXER, DDI_PSEUDO, 0) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	/* allocate our data structures */
+	lxa_minor_id = id_space_create("lxa_minor_id",
+	    LXA_MINORNUM_COUNT, LX_AUDIO_MAX_OPENS);
+	lxa_state_hash = mod_hash_create_idhash("lxa_state_hash",
+	    lxa_state_hash_size, mod_hash_null_valdtor);
+	lxa_zstate_hash = mod_hash_create_strhash("lxa_zstate_hash",
+	    lxa_zstate_hash_size, mod_hash_null_valdtor);
+
+	return (DDI_SUCCESS);
+}
+
+static int
+/*ARGSUSED*/
+lxa_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	ASSERT(!MUTEX_HELD(&lxa_lock));
+	if (lxa_registered_zones > 0)
+		return (DDI_FAILURE);
+
+	mod_hash_destroy_idhash(lxa_state_hash);
+	mod_hash_destroy_strhash(lxa_zstate_hash);
+	id_space_destroy(lxa_minor_id);
+	lxa_state_hash = NULL;
+	lxa_zstate_hash = NULL;
+	lxa_dip = NULL;
+
+	return (DDI_SUCCESS);
+}
+
+static int
+/*ARGSUSED*/
+lxa_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **resultp)
+{
+	switch (infocmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		*resultp = lxa_dip;
+		return (DDI_SUCCESS);
+
+	case DDI_INFO_DEVT2INSTANCE:
+		*resultp = (void *)0;
+		return (DDI_SUCCESS);
+	}
+	return (DDI_FAILURE);
+}
+
+/*
+ * Driver flags
+ */
+static struct cb_ops lxa_cb_ops = {
+	lxa_open,		/* open */
+	lxa_close,		/* close */
+	nodev,			/* strategy */
+	nodev,			/* print */
+	nodev,			/* dump */
+	lxa_read,		/* read */
+	lxa_write,		/* write */
+	lxa_ioctl,		/* ioctl */
+	lxa_devmap,		/* devmap */
+	nodev,			/* mmap */
+	ddi_devmap_segmap,	/* segmap */
+	nochpoll,		/* chpoll */
+	ddi_prop_op,		/* prop_op */
+	NULL,			/* cb_str */
+	D_NEW | D_MP | D_DEVMAP,
+	CB_REV,
+	NULL,
+	NULL
+};
+
+static struct dev_ops lxa_ops = {
+	DEVO_REV,
+	0,
+	lxa_getinfo,
+	nulldev,
+	nulldev,
+	lxa_attach,
+	lxa_detach,
+	nodev,
+	&lxa_cb_ops,
+	NULL,
+	NULL,
+	ddi_quiesce_not_needed,	/* quiesce */
+};
+
+/*
+ * Module linkage information for the kernel.
+ */ +static struct modldrv modldrv = { + &mod_driverops, /* type of module */ + "linux audio driver", /* description of module */ + &lxa_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +/* + * standard module entry points + */ +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_audio.conf b/usr/src/uts/common/brand/lx/io/lx_audio.conf new file mode 100644 index 0000000000..2eeb5eb7ee --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_audio.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_audio" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/io/lx_netlink.c b/usr/src/uts/common/brand/lx/io/lx_netlink.c new file mode 100644 index 0000000000..35ee5b97ed --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_netlink.c @@ -0,0 +1,1273 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Compatibility for the Linux netlink(7) kernel/user transport, as well as + * for in-kernel netlink(7) providers like rtnetlink(7). See RFC 3549 for + * details of the protocol, and the Linux man pages for details of the Linux + * implementation that we're mimicking. 
+ */
+
+#include <sys/strsubr.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/strsun.h>
+#include <sys/tihdr.h>
+#include <sys/sockio.h>
+#include <sys/brand.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <sys/lx_misc.h>
+#include <sys/ethernet.h>
+#include <sys/dlpi.h>
+#include <sys/priv_const.h>
+#include <sys/priv_impl.h>
+#include <sys/priv.h>
+#include <sys/policy.h>
+
+/*
+ * Flags in netlink header
+ */
+#define	LX_NETLINK_NLM_F_REQUEST	1
+#define	LX_NETLINK_NLM_F_MULTI		2
+#define	LX_NETLINK_NLM_F_ACK		4
+#define	LX_NETLINK_NLM_F_ECHO		8
+#define	LX_NETLINK_NLM_F_DUMP_INTR	16
+#define	LX_NETLINK_NLM_F_ROOT		0x100
+#define	LX_NETLINK_NLM_F_MATCH		0x200
+#define	LX_NETLINK_NLM_F_ATOMIC	0x400
+
+/*
+ * Generic message type constants
+ */
+#define	LX_NETLINK_NLMSG_NONE		0
+#define	LX_NETLINK_NLMSG_NOOP		1
+#define	LX_NETLINK_NLMSG_ERROR		2
+#define	LX_NETLINK_NLMSG_DONE		3
+#define	LX_NETLINK_NLMSG_OVERRUN	4
+
+/*
+ * Protocol constants.
+ */
+#define	LX_NETLINK_ROUTE		0
+#define	LX_NETLINK_UNUSED		1
+#define	LX_NETLINK_USERSOCK		2
+#define	LX_NETLINK_FIREWALL		3
+#define	LX_NETLINK_SOCK_DIAG		4
+#define	LX_NETLINK_NFLOG		5
+#define	LX_NETLINK_XFRM			6
+#define	LX_NETLINK_SELINUX		7
+#define	LX_NETLINK_ISCSI		8
+#define	LX_NETLINK_AUDIT		9
+#define	LX_NETLINK_FIB_LOOKUP		10
+#define	LX_NETLINK_CONNECTOR		11
+#define	LX_NETLINK_NETFILTER		12
+#define	LX_NETLINK_IP6_FW		13
+#define	LX_NETLINK_DNRTMSG		14
+#define	LX_NETLINK_KOBJECT_UEVENT	15
+#define	LX_NETLINK_GENERIC		16
+#define	LX_NETLINK_SCSITRANSPORT	18
+#define	LX_NETLINK_ECRYPTFS		19
+#define	LX_NETLINK_RDMA			20
+#define	LX_NETLINK_CRYPTO		21
+
+/*
+ * rtnetlink(7) attribute-related constants
+ */
+#define	LX_NETLINK_NLA_ALIGNTO		4
+
+#define	LX_NETLINK_RTM_NEWLINK		16
+#define	LX_NETLINK_RTM_DELLINK		17
+#define	LX_NETLINK_RTM_GETLINK		18
+#define	LX_NETLINK_RTM_SETLINK		19
+#define	LX_NETLINK_RTM_NEWADDR		20
+#define	LX_NETLINK_RTM_DELADDR		21
+#define	LX_NETLINK_RTM_GETADDR		22
+#define	LX_NETLINK_RTM_NEWROUTE		24
+#define	LX_NETLINK_RTM_DELROUTE		25
+#define	LX_NETLINK_RTM_GETROUTE		26
+#define	LX_NETLINK_RTM_NEWNEIGH		28
+#define	LX_NETLINK_RTM_DELNEIGH		29
+#define	LX_NETLINK_RTM_GETNEIGH		30
+#define	LX_NETLINK_RTM_NEWRULE		32
+#define	LX_NETLINK_RTM_DELRULE		33
+#define	LX_NETLINK_RTM_GETRULE		34
+#define	LX_NETLINK_RTM_NEWQDISC		36
+#define	LX_NETLINK_RTM_DELQDISC		37
+#define	LX_NETLINK_RTM_GETQDISC		38
+#define	LX_NETLINK_RTM_NEWTCLASS	40
+#define	LX_NETLINK_RTM_DELTCLASS	41
+#define	LX_NETLINK_RTM_GETTCLASS	42
+#define	LX_NETLINK_RTM_NEWTFILTER	44
+#define	LX_NETLINK_RTM_DELTFILTER	45
+#define	LX_NETLINK_RTM_GETTFILTER	46
+#define	LX_NETLINK_RTM_NEWACTION	48
+#define	LX_NETLINK_RTM_DELACTION	49
+#define	LX_NETLINK_RTM_GETACTION	50
+#define	LX_NETLINK_RTM_NEWPREFIX	52
+#define	LX_NETLINK_RTM_GETMULTICAST	58
+#define	LX_NETLINK_RTM_GETANYCAST	62
+#define	LX_NETLINK_RTM_NEWNEIGHTBL	64
+#define	LX_NETLINK_RTM_GETNEIGHTBL	66
+#define	LX_NETLINK_RTM_SETNEIGHTBL	67
+#define	LX_NETLINK_RTM_NEWNDUSEROPT	68
+#define	LX_NETLINK_RTM_NEWADDRLABEL	72
+#define	LX_NETLINK_RTM_DELADDRLABEL	73
+#define	LX_NETLINK_RTM_GETADDRLABEL	74
+#define	LX_NETLINK_RTM_GETDCB		78
+#define	LX_NETLINK_RTM_SETDCB		79
+#define	LX_NETLINK_RTM_NEWNETCONF	80
+#define	LX_NETLINK_RTM_GETNETCONF	82
+#define	LX_NETLINK_RTM_NEWMDB		84
+#define	LX_NETLINK_RTM_DELMDB		85
+#define	LX_NETLINK_RTM_GETMDB		86
+#define	LX_NETLINK_RTM_MAX		87
+
+/*
+ * rtnetlink(7) attribute constants
+ */
+#define	LX_NETLINK_RTA_UNSPEC		0
+#define	LX_NETLINK_RTA_DST		1
+#define	LX_NETLINK_RTA_SRC		2
+#define	LX_NETLINK_RTA_IIF		3
+#define	LX_NETLINK_RTA_OIF		4
+#define	LX_NETLINK_RTA_GATEWAY		5
+#define	LX_NETLINK_RTA_PRIORITY		6
+#define	LX_NETLINK_RTA_PREFSRC		7
+#define	LX_NETLINK_RTA_METRICS		8
+#define	LX_NETLINK_RTA_MULTIPATH	9
+#define	LX_NETLINK_RTA_PROTOINFO	10
+#define	LX_NETLINK_RTA_FLOW		11
+#define	LX_NETLINK_RTA_CACHEINFO	12
+#define	LX_NETLINK_RTA_SESSION		13
+#define	LX_NETLINK_RTA_MP_ALGO		14
+#define	LX_NETLINK_RTA_TABLE		15
+#define	LX_NETLINK_RTA_MARK		16
+#define	LX_NETLINK_RTA_MFC_STATS	17
+
+/*
+ * rtnetlink(7) NEWLINK/DELLINK/GETLINK constants
+ */
+#define	LX_NETLINK_IFLA_UNSPEC		0
+#define	LX_NETLINK_IFLA_ADDRESS		1
+#define	LX_NETLINK_IFLA_BROADCAST	2
+#define	LX_NETLINK_IFLA_IFNAME		3
+#define	LX_NETLINK_IFLA_MTU		4
+#define	LX_NETLINK_IFLA_LINK		5
+#define	LX_NETLINK_IFLA_QDISC		6
+#define	LX_NETLINK_IFLA_STATS		7
+#define	LX_NETLINK_IFLA_COST		8
+#define	LX_NETLINK_IFLA_PRIORITY	9
+#define	LX_NETLINK_IFLA_MASTER		10
+#define	LX_NETLINK_IFLA_WIRELESS	11
+#define	LX_NETLINK_IFLA_PROTINFO	12
+#define	LX_NETLINK_IFLA_TXQLEN		13
+#define	LX_NETLINK_IFLA_MAP		14
+#define	LX_NETLINK_IFLA_WEIGHT		15
+#define	LX_NETLINK_IFLA_OPERSTATE	16
+#define	LX_NETLINK_IFLA_LINKMODE	17
+#define	LX_NETLINK_IFLA_LINKINFO	18
+#define	LX_NETLINK_IFLA_NET_NS_PID	19
+#define	LX_NETLINK_IFLA_IFALIAS		20
+#define	LX_NETLINK_IFLA_NUM_VF		21
+#define	LX_NETLINK_IFLA_VFINFO_LIST	22
+#define	LX_NETLINK_IFLA_STATS64		23
+#define	LX_NETLINK_IFLA_VF_PORTS	24
+#define	LX_NETLINK_IFLA_PORT_SELF	25
+#define	LX_NETLINK_IFLA_AF_SPEC		26
+#define	LX_NETLINK_IFLA_GROUP		27
+#define	LX_NETLINK_IFLA_NET_NS_FD	28
+#define	LX_NETLINK_IFLA_EXT_MASK	29
+#define	LX_NETLINK_IFLA_PROMISCUITY	30
+#define	LX_NETLINK_IFLA_NUM_TX_QUEUES	31
+#define	LX_NETLINK_IFLA_NUM_RX_QUEUES	32
+#define	LX_NETLINK_IFLA_CARRIER		33
+#define	LX_NETLINK_IFLA_PHYS_PORT_ID	34
+#define	LX_NETLINK_IFLA_CARRIER_CHANGES	35
+#define	LX_NETLINK_IFLA_MAX		36
+
+/*
+ * rtnetlink(7) NEWADDR/DELADDR/GETADDR constants
+ */
+#define	LX_NETLINK_IFA_UNSPEC		0
+#define	LX_NETLINK_IFA_ADDRESS		1
+#define	LX_NETLINK_IFA_LOCAL		2
+#define	LX_NETLINK_IFA_LABEL		3
+#define	LX_NETLINK_IFA_BROADCAST	4
+#define	LX_NETLINK_IFA_ANYCAST		5
+#define	LX_NETLINK_IFA_CACHEINFO	6
+#define	LX_NETLINK_IFA_MULTICAST	7
+#define	LX_NETLINK_IFA_FLAGS		8
+#define	LX_NETLINK_IFA_MAX		9
+
+#define	LX_NETLINK_IFA_F_SECONDARY	0x01
+#define	LX_NETLINK_IFA_F_TEMPORARY	LX_NETLINK_IFA_F_SECONDARY
+#define	LX_NETLINK_IFA_F_NODAD		0x02
+#define	LX_NETLINK_IFA_F_OPTIMISTIC	0x04
+#define	LX_NETLINK_IFA_F_DADFAILED	0x08
+#define	LX_NETLINK_IFA_F_HOMEADDRESS	0x10
+#define	LX_NETLINK_IFA_F_DEPRECATED	0x20
+#define	LX_NETLINK_IFA_F_TENTATIVE	0x40
+#define	LX_NETLINK_IFA_F_PERMANENT	0x80
+#define	LX_NETLINK_IFA_F_MANAGETEMPADDR	0x100
+#define	LX_NETLINK_IFA_F_NOPREFIXROUTE	0x200
+
+/*
+ * Linux address families.
+ */
+#define	LX_AF_INET	2
+#define	LX_AF_INET6	10
+#define	LX_AF_NETLINK	16
+
+/*
+ * Linux interface flags.
+ */
+#define	LX_IFF_UP		(1<<0)
+#define	LX_IFF_BROADCAST	(1<<1)
+#define	LX_IFF_DEBUG		(1<<2)
+#define	LX_IFF_LOOPBACK		(1<<3)
+#define	LX_IFF_POINTOPOINT	(1<<4)
+#define	LX_IFF_NOTRAILERS	(1<<5)
+#define	LX_IFF_RUNNING		(1<<6)
+#define	LX_IFF_NOARP		(1<<7)
+#define	LX_IFF_PROMISC		(1<<8)
+#define	LX_IFF_ALLMULTI		(1<<9)
+#define	LX_IFF_MASTER		(1<<10)
+#define	LX_IFF_SLAVE		(1<<11)
+#define	LX_IFF_MULTICAST	(1<<12)
+#define	LX_IFF_PORTSEL		(1<<13)
+#define	LX_IFF_AUTOMEDIA	(1<<14)
+#define	LX_IFF_DYNAMIC		(1<<15)
+#define	LX_IFF_LOWER_UP		(1<<16)
+#define	LX_IFF_DORMANT		(1<<17)
+#define	LX_IFF_ECHO		(1<<18)
+
+/*
+ * Netlink sockopts
+ */
+#define	SOL_LX_NETLINK	270
+
+#define	LX_NETLINK_SO_ADD_MEMBERSHIP	1
+#define	LX_NETLINK_SO_DROP_MEMBERSHIP	2
+#define	LX_NETLINK_SO_PKTINFO		3
+#define	LX_NETLINK_SO_BROADCAST_ERROR	4
+#define	LX_NETLINK_SO_NO_ENOBUFS	5
+#define	LX_NETLINK_SO_RX_RING		6
+#define	LX_NETLINK_SO_TX_RING		7
+
+typedef struct lx_netlink_hdr {
+	uint32_t	lxnh_len;	/* length of message */
+	uint16_t	lxnh_type;	/* type of message */
+	uint16_t	lxnh_flags;	/* flags */
+	uint32_t	lxnh_seq;	/* sequence number */
+	uint32_t	lxnh_pid;	/* sending pid */
+} lx_netlink_hdr_t;
+
+typedef struct lx_netlink_err {
+	lx_netlink_hdr_t	lxne_hdr;	/* header */
+	int32_t			lxne_errno;	/* errno */
+	lx_netlink_hdr_t	lxne_failed;	/* header of err */
+} lx_netlink_err_t;
+
+typedef struct lx_netlink_attr {
+	uint16_t	lxna_len;	/* length of attribute */
+	uint16_t	lxna_type;	/* type of attribute */
+} lx_netlink_attr_t;
+
+typedef struct lx_netlink_ifinfomsg {
+	uint8_t		lxnl_ifi_family;	/* family: AF_UNSPEC */
+	uint8_t		lxnl_ifi__pad;
+	uint16_t	lxnl_ifi_type;		/* device type */
+	uint32_t	lxnl_ifi_index;		/* interface index */
+	uint32_t	lxnl_ifi_flags;		/* device flags */
+	uint32_t	lxnl_ifi_change;	/* unused; must be -1 */
+} lx_netlink_ifinfomsg_t;
+
+typedef struct lx_netlink_ifaddrmsg {
+	uint8_t		lxnl_ifa_family;	/* address type */
+	uint8_t		lxnl_ifa_prefixlen;	/* prefix length of address */
+	uint8_t		lxnl_ifa_flags;		/* address flags */
+	uint8_t		lxnl_ifa_scope;		/* address scope */
+	uint32_t	lxnl_ifa_index;		/* interface index */
+} lx_netlink_ifaddrmsg_t;
+
+typedef struct lx_netlink_sockaddr {
+	sa_family_t	lxnl_family;	/* AF_LX_NETLINK */
+	uint16_t	lxnl_pad;	/* padding */
+	uint32_t	lxnl_port;	/* port id */
+	uint32_t	lxnl_groups;	/* multicast groups mask */
+} lx_netlink_sockaddr_t;
+
+typedef struct lx_netlink_sock {
+	struct lx_netlink_sock *lxns_next;	/* list of lx_netlink sockets */
+	sock_upcalls_t *lxns_upcalls;	/* pointer to socket upcalls */
+	sock_upper_handle_t lxns_uphandle;	/* socket upcall handle */
+	ldi_handle_t lxns_iphandle;	/* handle to /dev/ip */
+	ldi_handle_t lxns_ip6handle;	/* handle to /dev/ip6 */
+	ldi_handle_t lxns_current;	/* current ip handle */
+	int lxns_proto;			/* protocol */
+	uint32_t lxns_port;		/* port identifier */
+	uint32_t lxns_groups;		/* group subscriptions */
+	uint32_t lxns_bufsize;		/* buffer size */
+} lx_netlink_sock_t;
+
+typedef struct lx_netlink_reply {
+	lx_netlink_hdr_t lxnr_hdr;	/* header that we're replying to */
+	lx_netlink_sock_t *lxnr_sock;	/* socket */
+	uint32_t lxnr_seq;		/* sequence number */
+	uint16_t lxnr_type;		/* type of reply */
+	mblk_t *lxnr_mp;		/* current mblk */
+	mblk_t *lxnr_err;		/* error mblk */
+	mblk_t *lxnr_mp1;		/* T_UNITDATA_IND mblk */
+	int lxnr_errno;			/* errno, if any */
+} lx_netlink_reply_t;
+
+static lx_netlink_sock_t *lx_netlink_head;	/* head of lx_netlink sockets */
+static kmutex_t lx_netlink_lock;	/* lock to protect state */
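+
+/*
+ * For reference, lx_netlink_hdr_t and lx_netlink_attr_t mirror the wire
+ * formats of Linux's struct nlmsghdr and struct nlattr.  A minimal sketch
+ * of the consumer-side exchange that this module emulates (standard Linux
+ * userland API, not code from this module) is:
+ *
+ *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ *	struct {
+ *		struct nlmsghdr hdr;
+ *		struct ifinfomsg ifi;
+ *	} req;
+ *
+ *	memset(&req, 0, sizeof (req));
+ *	req.hdr.nlmsg_len = sizeof (req);
+ *	req.hdr.nlmsg_type = RTM_GETLINK;
+ *	req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT | NLM_F_MATCH;
+ *	send(fd, &req, sizeof (req), 0);
+ *
+ * The reply is then read as a stream of NLM_F_MULTI messages (one
+ * RTM_NEWLINK per interface, each trailed by nlattr-encoded attributes
+ * such as IFLA_IFNAME and IFLA_ADDRESS), terminated by NLMSG_DONE.
+ */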
+static ldi_ident_t lx_netlink_ldi;	/* LDI handle */
+static int lx_netlink_bufsize = 4096;	/* default buffer size */
+static int lx_netlink_flowctrld;	/* # of times flow controlled */
+
+/*ARGSUSED*/
+static void
+lx_netlink_activate(sock_lower_handle_t handle,
+    sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
+    int flags, cred_t *cr)
+{
+	lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+	struct sock_proto_props sopp;
+
+	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
+	    SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ |
+	    SOCKOPT_MAXBLK | SOCKOPT_MINPSZ;
+	sopp.sopp_wroff = 0;
+	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
+	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
+	sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl);
+	sopp.sopp_maxpsz = INFPSZ;
+	sopp.sopp_maxblk = INFPSZ;
+	sopp.sopp_minpsz = 0;
+
+	lxsock->lxns_upcalls = sock_upcalls;
+	lxsock->lxns_uphandle = sock_handle;
+
+	sock_upcalls->su_set_proto_props(sock_handle, &sopp);
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_setsockopt(sock_lower_handle_t handle, int level,
+    int option_name, const void *optval, socklen_t optlen, struct cred *cr)
+{
+	if (level == SOL_SOCKET) {
+		return (0);
+	} else if (level != SOL_LX_NETLINK) {
+		return (EOPNOTSUPP);
+	}
+
+	switch (option_name) {
+	case LX_NETLINK_SO_ADD_MEMBERSHIP:
+	case LX_NETLINK_SO_DROP_MEMBERSHIP:
+	case LX_NETLINK_SO_PKTINFO:
+	case LX_NETLINK_SO_BROADCAST_ERROR:
+	case LX_NETLINK_SO_NO_ENOBUFS:
+	case LX_NETLINK_SO_RX_RING:
+	case LX_NETLINK_SO_TX_RING:
+		/* Blatant lie */
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
+/*ARGSUSED*/
+static int
+lx_netlink_bind(sock_lower_handle_t handle, struct sockaddr *name,
+    socklen_t namelen, struct cred *cr)
+{
+	lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle;
+	lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)name;
+
+	if (namelen != sizeof (lx_netlink_sockaddr_t) ||
+	    lxsa->lxnl_family != AF_LX_NETLINK) {
+		return (EINVAL);
+	}
+
+	if (lxsa->lxnl_groups != 0) {
+		/*
+		 * On Linux, CAP_NET_ADMIN is needed to bind to netlink
+		 * groups.  This roughly maps to PRIV_SYS_IP_CONFIG.
+		 */
+		if (secpolicy_ip_config(cr, B_FALSE) != 0) {
+			return (EACCES);
+		}
+
+		/* Lie about group subscription for now */
+		lxsock->lxns_groups = lxsa->lxnl_groups;
+	}
+
+	/*
+	 * Linux netlink uses nl_port to identify distinct netlink sockets.
+	 * Binding to an address of nl_port=0 triggers the kernel to
+	 * automatically assign a free nl_port identifier.  Originally,
+	 * consumers of lx_netlink were required to bind with that automatic
+	 * address.  We now support non-zero values for nl_port although
+	 * strict checking to identify conflicts is not performed.  Use of
+	 * the id_space facility could be a convenient solution, if a need
+	 * arose.
+	 */
+	if (lxsa->lxnl_port == 0) {
+		/*
+		 * Because we are not doing conflict detection, there is no
+		 * need to expend effort selecting a unique port for
+		 * automatic addressing during bind.
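+		 *
+		 * For illustration, the autobind path is what a Linux
+		 * client triggers when it binds with nl_pid=0 (standard
+		 * Linux netlink API, sketched here rather than taken from
+		 * this module):
+		 *
+		 *	struct sockaddr_nl snl;
+		 *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+		 *
+		 *	memset(&snl, 0, sizeof (snl));
+		 *	snl.nl_family = AF_NETLINK;
+		 *	snl.nl_pid = 0;		(kernel picks the port; in
+		 *				our case, the caller's pid)
+		 *	bind(fd, (struct sockaddr *)&snl, sizeof (snl));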
+ */ + lxsock->lxns_port = curproc->p_pid; + } else { + lxsock->lxns_port = lxsa->lxnl_port; + } + + return (0); +} + +/*ARGSUSED*/ +static int +lx_netlink_getsockname(sock_lower_handle_t handle, struct sockaddr *sa, + socklen_t *len, struct cred *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_sockaddr_t *lxsa = (lx_netlink_sockaddr_t *)sa; + + if (*len < sizeof (lx_netlink_sockaddr_t)) + return (EINVAL); + + lxsa->lxnl_family = AF_LX_NETLINK; + lxsa->lxnl_pad = 0; + lxsa->lxnl_port = lxsock->lxns_port; + lxsa->lxnl_groups = lxsock->lxns_groups; + + *len = sizeof (lx_netlink_sockaddr_t); + + return (0); +} + +static mblk_t * +lx_netlink_alloc_mp1() +{ + return (allocb(sizeof (struct T_unitdata_ind) + + sizeof (lx_netlink_sockaddr_t), 0)); +} + +static lx_netlink_reply_t * +lx_netlink_reply(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, uint16_t type) +{ + lx_netlink_reply_t *reply; + mblk_t *err, *mp1; + + /* + * We always allocate an error block to assure that even if subsequent + * allocations fail, we can return an error. + */ + if ((err = allocb(sizeof (lx_netlink_err_t), 0)) == NULL) + return (NULL); + + if ((mp1 = lx_netlink_alloc_mp1()) == NULL) { + freeb(err); + return (NULL); + } + + reply = kmem_zalloc(sizeof (lx_netlink_reply_t), KM_SLEEP); + reply->lxnr_err = err; + reply->lxnr_sock = lxsock; + reply->lxnr_hdr = *hdr; + reply->lxnr_type = type; + reply->lxnr_mp1 = mp1; + + return (reply); +} + +static void +lx_netlink_reply_add(lx_netlink_reply_t *reply, void *payload, uint32_t size) +{ + lx_netlink_hdr_t *hdr; + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + uint32_t align, alignto = LX_NETLINK_NLA_ALIGNTO; + mblk_t *mp = reply->lxnr_mp; + + if (reply->lxnr_errno) + return; + + align = (alignto - (size & (alignto - 1))) & (alignto - 1); + + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + + if (hdr->lxnh_len + size + align > lxsock->lxns_bufsize) { + reply->lxnr_errno = E2BIG; + return; + } + + bcopy(payload, mp->b_wptr, size); + + hdr->lxnh_len += size + align; + mp->b_wptr += size + align; +} + +static void +lx_netlink_reply_msg(lx_netlink_reply_t *reply, void *payload, uint32_t size) +{ + lx_netlink_hdr_t *hdr; + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + mblk_t *mp; + + if (reply->lxnr_errno) + return; + + VERIFY(reply->lxnr_mp == NULL); + + if ((reply->lxnr_mp = mp = allocb(lxsock->lxns_bufsize, 0)) == NULL) { + reply->lxnr_errno = ENOMEM; + return; + } + + bzero(mp->b_rptr, lxsock->lxns_bufsize); + hdr = (lx_netlink_hdr_t *)mp->b_rptr; + hdr->lxnh_flags = LX_NETLINK_NLM_F_MULTI; + hdr->lxnh_len = sizeof (lx_netlink_hdr_t); + hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq; + hdr->lxnh_pid = lxsock->lxns_port; + + mp->b_wptr += sizeof (lx_netlink_hdr_t); + + if (payload == NULL) { + /* + * A NULL payload denotes a "done" message. 
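+	 * The overall dump conversation a consumer sees is therefore a
+	 * series of NLM_F_MULTI messages built by this function (one per
+	 * reply_msg/reply_send cycle), closed out by a zero-payload
+	 * NLMSG_DONE message; this matches the framing of a native Linux
+	 * rtnetlink dump, e.g. for RTM_GETLINK:
+	 *
+	 *	RTM_NEWLINK, NLM_F_MULTI, attributes...	(first interface)
+	 *	RTM_NEWLINK, NLM_F_MULTI, attributes...	(next interface)
+	 *	NLMSG_DONE				(end of dump)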
+ */ + hdr->lxnh_type = LX_NETLINK_NLMSG_DONE; + } else { + hdr->lxnh_type = reply->lxnr_type; + lx_netlink_reply_add(reply, payload, size); + } +} + +static void +lx_netlink_reply_attr(lx_netlink_reply_t *reply, uint16_t type, + void *payload, uint32_t size) +{ + lx_netlink_attr_t attr; + + attr.lxna_len = size + sizeof (lx_netlink_attr_t); + attr.lxna_type = type; + + lx_netlink_reply_add(reply, &attr, sizeof (attr)); + lx_netlink_reply_add(reply, payload, size); +} + +static void +lx_netlink_reply_attr_string(lx_netlink_reply_t *reply, + uint16_t type, const char *str) +{ + lx_netlink_reply_attr(reply, type, (void *)str, strlen(str) + 1); +} + +static void +lx_netlink_reply_attr_int32(lx_netlink_reply_t *reply, + uint16_t type, int32_t val) +{ + int32_t v = val; + + lx_netlink_reply_attr(reply, type, &v, sizeof (int32_t)); +} + +static int +lx_netlink_reply_ioctl(lx_netlink_reply_t *reply, int cmd, void *arg) +{ + int rval; + + if (reply->lxnr_errno != 0) + return (reply->lxnr_errno); + + if ((rval = ldi_ioctl(reply->lxnr_sock->lxns_current, + cmd, (intptr_t)arg, FKIOCTL, kcred, NULL)) != 0) { + reply->lxnr_errno = rval; + } + + return (rval); +} + +static void +lx_netlink_reply_sendup(lx_netlink_reply_t *reply, mblk_t *mp, mblk_t *mp1) +{ + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + lx_netlink_sockaddr_t *lxsa; + struct T_unitdata_ind *tunit; + int error; + + /* + * To prevent the stream head from coalescing messages and to indicate + * their origin, we send them as T_UNITDATA_IND messages, not as raw + * M_DATA. + */ + mp1->b_cont = mp; + + mp = mp1; + mp->b_datap->db_type = M_PROTO; + + tunit = (struct T_unitdata_ind *)mp->b_rptr; + tunit->PRIM_type = T_UNITDATA_IND; + tunit->SRC_length = sizeof (lx_netlink_sockaddr_t); + tunit->SRC_offset = sizeof (*tunit); + tunit->OPT_length = 0; + + lxsa = (lx_netlink_sockaddr_t *)(mp->b_rptr + sizeof (*tunit)); + lxsa->lxnl_family = AF_LX_NETLINK; + lxsa->lxnl_port = 0; + lxsa->lxnl_groups = 0; + lxsa->lxnl_pad = 0; + + mp->b_wptr += sizeof (*tunit) + sizeof (*lxsa); + + lxsock->lxns_upcalls->su_recv(lxsock->lxns_uphandle, mp, + msgdsize(mp), 0, &error, NULL); + + if (error != 0) + lx_netlink_flowctrld++; +} + +static void +lx_netlink_reply_send(lx_netlink_reply_t *reply) +{ + mblk_t *mp1; + + if (reply->lxnr_errno) + return; + + if ((mp1 = lx_netlink_alloc_mp1()) == NULL) { + reply->lxnr_errno = ENOMEM; + return; + } + + lx_netlink_reply_sendup(reply, reply->lxnr_mp, mp1); + reply->lxnr_mp = NULL; +} + +static void +lx_netlink_reply_done(lx_netlink_reply_t *reply) +{ + lx_netlink_sock_t *lxsock = reply->lxnr_sock; + mblk_t *mp; + + /* + * Denote that we're done via a message with a NULL payload. + */ + lx_netlink_reply_msg(reply, NULL, 0); + + if (reply->lxnr_errno) { + /* + * If anything failed, we'll send up an error message. 
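+		 * The error message layout matches Linux's struct nlmsgerr
+		 * (modeled here by lx_netlink_err_t):
+		 *
+		 *	lx_netlink_hdr_t   lxnh_type = LX_NETLINK_NLMSG_ERROR
+		 *	int32_t            the errno for the failure
+		 *	lx_netlink_hdr_t   copy of the failed request's header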
+		 */
+		lx_netlink_hdr_t *hdr;
+		lx_netlink_err_t *err;
+
+		if (reply->lxnr_mp != NULL) {
+			freeb(reply->lxnr_mp);
+			reply->lxnr_mp = NULL;
+		}
+
+		mp = reply->lxnr_err;
+		err = (lx_netlink_err_t *)mp->b_rptr;
+		hdr = &err->lxne_hdr;
+
+		hdr->lxnh_type = LX_NETLINK_NLMSG_ERROR;
+		hdr->lxnh_seq = reply->lxnr_hdr.lxnh_seq;
+		hdr->lxnh_len = sizeof (lx_netlink_err_t);
+		hdr->lxnh_pid = lxsock->lxns_port;
+		err->lxne_failed = reply->lxnr_hdr;
+		err->lxne_errno = reply->lxnr_errno;
+		mp->b_wptr += sizeof (lx_netlink_err_t);
+		reply->lxnr_err = NULL;
+	} else {
+		mp = reply->lxnr_mp;
+		VERIFY(mp != NULL);
+		reply->lxnr_mp = NULL;
+	}
+
+	lx_netlink_reply_sendup(reply, mp, reply->lxnr_mp1);
+
+	if (reply->lxnr_mp != NULL)
+		freeb(reply->lxnr_mp);
+
+	if (reply->lxnr_err != NULL)
+		freeb(reply->lxnr_err);
+
+	kmem_free(reply, sizeof (lx_netlink_reply_t));
+}
+
+static int
+lx_netlink_reply_error(lx_netlink_sock_t *lxsock,
+    lx_netlink_hdr_t *hdr, int errno)
+{
+	/*
+	 * The type of the message doesn't matter, as we're going to
+	 * explicitly set lxnr_errno and therefore send only an error
+	 * message.
+	 */
+	lx_netlink_reply_t *reply = lx_netlink_reply(lxsock, hdr, 0);
+
+	if (reply == NULL)
+		return (ENOMEM);
+
+	reply->lxnr_errno = errno;
+	lx_netlink_reply_done(reply);
+
+	return (0);
+}
+
+static void
+lx_netlink_getlink_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr)
+{
+	lx_netlink_ifinfomsg_t ifi;
+	int i;
+	char if_name[IFNAMSIZ];
+	struct sockaddr_dl *sdl;
+	struct sockaddr hwaddr;
+	int hwaddr_size;
+	boolean_t is_loopback;
+
+	struct {
+		int native;
+		int lx;
+	} flags[] = {
+		{ IFF_UP, LX_IFF_UP },
+		{ IFF_BROADCAST, LX_IFF_BROADCAST },
+		{ IFF_DEBUG, LX_IFF_DEBUG },
+		{ IFF_LOOPBACK, LX_IFF_LOOPBACK },
+		{ IFF_POINTOPOINT, LX_IFF_POINTOPOINT },
+		{ IFF_NOTRAILERS, LX_IFF_NOTRAILERS },
+		{ IFF_RUNNING, LX_IFF_RUNNING },
+		{ IFF_NOARP, LX_IFF_NOARP },
+		{ IFF_PROMISC, LX_IFF_PROMISC },
+		{ IFF_ALLMULTI, LX_IFF_ALLMULTI },
+		{ IFF_MULTICAST, LX_IFF_MULTICAST },
+		{ 0 }
+	};
+
+	/*
+	 * Most of the lx_netlink module is architected to emit information
+	 * in an illumos-native manner.  Socket syscalls such as getsockname
+	 * will not translate fields to values Linux programs would expect
+	 * since that conversion is performed by the generic socket
+	 * emulation.
+	 *
+	 * This is _not_ true of the actual protocol output from lx_netlink.
+	 * Since translating it at the socket layer would be onerous, all
+	 * output (including constants and names) is pre-translated to
+	 * values valid for Linux.
+	 */
+
+	bzero(&ifi, sizeof (ifi));
+	ifi.lxnl_ifi_family = AF_UNSPEC;
+	ifi.lxnl_ifi_change = (uint32_t)-1;
+
+	/* Convert the name to be Linux-friendly */
+	(void) strlcpy(if_name, lifr->lifr_name, IFNAMSIZ);
+	lx_ifname_convert(if_name, LX_IF_FROMNATIVE);
+	is_loopback = (strncmp(if_name, "lo", 2) == 0);
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0)
+		return;
+
+	ifi.lxnl_ifi_index = lifr->lifr_index;
+
+	if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0)
+		return;
+
+	for (i = 0; flags[i].native; i++) {
+		if (lifr->lifr_flags & flags[i].native)
+			ifi.lxnl_ifi_flags |= flags[i].lx;
+	}
+
+	/*
+	 * Query the datalink address.
+	 * The interface type will be included in the outgoing infomsg
+	 * while the address itself will be output separately.
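+	 * For an Ethernet-style interface, for example, the Linux-visible
+	 * result would be ifi_type = ARPHRD_ETHER (1) along with a 6-byte
+	 * (ETHERADDRL) IFLA_ADDRESS attribute; for loopback we fake an
+	 * empty DL_LOOP address below instead of querying the device.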
+ */ + sdl = (struct sockaddr_dl *)&lifr->lifr_addr; + bzero(sdl, sizeof (*sdl)); + if (!is_loopback) { + lx_netlink_reply_ioctl(reply, SIOCGLIFHWADDR, lifr); + } else { + /* Simulate an empty hwaddr for loopback */ + sdl->sdl_type = DL_LOOP; + sdl->sdl_alen = ETHERADDRL; + } + lx_stol_hwaddr(sdl, &hwaddr, &hwaddr_size); + + ifi.lxnl_ifi_type = hwaddr.sa_family; + lx_netlink_reply_msg(reply, &ifi, sizeof (lx_netlink_ifinfomsg_t)); + + lx_netlink_reply_attr_string(reply, LX_NETLINK_IFLA_IFNAME, if_name); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFMTU, lifr) != 0) + return; + + lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_MTU, lifr->lifr_mtu); + + if (hwaddr_size != 0) { + lx_netlink_reply_attr(reply, LX_NETLINK_IFLA_ADDRESS, + hwaddr.sa_data, hwaddr_size); + } + + /* Emulate a txqlen of 1. (0 for loopbacks) */ + lx_netlink_reply_attr_int32(reply, LX_NETLINK_IFLA_TXQLEN, + (is_loopback) ? 0 : 1); + + lx_netlink_reply_send(reply); +} + +static void +lx_netlink_reply_eachfamily(lx_netlink_reply_t *reply, + void (*func)(lx_netlink_reply_t *, struct lifreq *), boolean_t distinct) +{ + lx_netlink_sock_t *sock = reply->lxnr_sock; + int nlifr, i; + + struct { + int family; + ldi_handle_t handle; + struct lifconf lifc; + struct lifnum lifn; + } families[] = { + { AF_INET, sock->lxns_iphandle }, + { AF_INET6, sock->lxns_ip6handle }, + { AF_UNSPEC } + }, *family, *check; + + for (family = families; family->family != AF_UNSPEC; family++) { + struct lifconf *lifc = &family->lifc; + struct lifnum *lifn = &family->lifn; + + lifn->lifn_family = family->family; + sock->lxns_current = family->handle; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFNUM, lifn) != 0) + break; + + lifc->lifc_family = lifn->lifn_family; + lifc->lifc_flags = 0; + lifc->lifc_len = lifn->lifn_count * sizeof (struct lifreq); + if (lifn->lifn_count == 0) { + lifc->lifc_buf = NULL; + continue; + } + lifc->lifc_buf = kmem_alloc(lifc->lifc_len, KM_SLEEP); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFCONF, lifc) != 0) + break; + + nlifr = lifc->lifc_len / sizeof (lifc->lifc_req[0]); + + for (i = 0; i < nlifr; i++) { + if (!distinct) { + func(reply, &lifc->lifc_req[i]); + continue; + } + + /* + * If we have been asked to provide each interface + * exactly once, we need to (annoyingly) check this + * name against others that we've already processed for + * other families. Yes, this is quadratic time -- but + * the number of interfaces per family is expected to + * be very small. 
+ */ + for (check = families; check != family; check++) { + struct lifconf *clifc = &check->lifc; + int cnlifr = clifc->lifc_len / + sizeof (clifc->lifc_req[0]), j; + char *nm = lifc->lifc_req[i].lifr_name, *cnm; + + for (j = 0; j < cnlifr; j++) { + cnm = clifc->lifc_req[j].lifr_name; + + if (strcmp(nm, cnm) == 0) + break; + } + + if (j != cnlifr) + break; + } + + if (check != family) + continue; + + func(reply, &lifc->lifc_req[i]); + } + } + + for (family = families; family->family != AF_UNSPEC; family++) { + struct lifconf *lifc = &family->lifc; + + if (lifc->lifc_buf != NULL) + kmem_free(lifc->lifc_buf, lifc->lifc_len); + } +} + +/*ARGSUSED*/ +static int +lx_netlink_getlink(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWLINK); + + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_eachfamily(reply, lx_netlink_getlink_lifreq, B_TRUE); + lx_netlink_reply_done(reply); + + return (0); +} + +static void +lx_netlink_getaddr_lifreq(lx_netlink_reply_t *reply, struct lifreq *lifr) +{ + lx_netlink_ifaddrmsg_t ifa; + + bzero(&ifa, sizeof (ifa)); + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFINDEX, lifr) != 0) + return; + + ifa.lxnl_ifa_index = lifr->lifr_index; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFFLAGS, lifr) != 0) + return; + + if (!(lifr->lifr_flags & IFF_NOLOCAL)) { + if (lx_netlink_reply_ioctl(reply, SIOCGLIFSUBNET, lifr) != 0) + return; + + ifa.lxnl_ifa_prefixlen = lifr->lifr_addrlen; + + if (lx_netlink_reply_ioctl(reply, SIOCGLIFADDR, lifr) != 0) + return; + + if (lifr->lifr_addr.ss_family == AF_INET) { + struct sockaddr_in *sin; + + ifa.lxnl_ifa_family = LX_AF_INET; + + lx_netlink_reply_msg(reply, &ifa, + sizeof (lx_netlink_ifaddrmsg_t)); + + sin = (struct sockaddr_in *)&lifr->lifr_addr; + + lx_netlink_reply_attr_int32(reply, + LX_NETLINK_IFA_ADDRESS, sin->sin_addr.s_addr); + } else { + struct sockaddr_in6 *sin; + + ifa.lxnl_ifa_family = LX_AF_INET6; + + lx_netlink_reply_msg(reply, &ifa, + sizeof (lx_netlink_ifaddrmsg_t)); + + sin = (struct sockaddr_in6 *)&lifr->lifr_addr; + + lx_netlink_reply_attr(reply, LX_NETLINK_IFA_ADDRESS, + &sin->sin6_addr, sizeof (sin->sin6_addr)); + } + } + + lx_netlink_reply_send(reply); +} + +/*ARGSUSED*/ +static int +lx_netlink_getaddr(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + lx_netlink_reply_t *reply; + + reply = lx_netlink_reply(lxsock, hdr, LX_NETLINK_RTM_NEWADDR); + + if (reply == NULL) + return (ENOMEM); + + lx_netlink_reply_eachfamily(reply, lx_netlink_getaddr_lifreq, B_FALSE); + lx_netlink_reply_done(reply); + + return (0); +} + +/*ARGSUSED*/ +static int +lx_netlink_audit(lx_netlink_sock_t *lxsock, lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + /* + * For all auditing messages, we return ECONNREFUSED, which seems to + * keep user-level auditing happy. (Or at least, non-suicidal.) + */ + return (ECONNREFUSED); +} + +/*ARGSUSED*/ +static int +lx_netlink_kobject_uevent(lx_netlink_sock_t *lxsock, + lx_netlink_hdr_t *hdr, mblk_t *mp) +{ + /* + * For udev, we just silently accept all writes and never actually + * reply with anything -- which appears to be sufficient for things + * to work. 
+ */ + return (0); +} + +/*ARGSUSED*/ +static int +lx_netlink_send(sock_lower_handle_t handle, mblk_t *mp, + struct nmsghdr *msg, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle; + lx_netlink_hdr_t *hdr = (lx_netlink_hdr_t *)mp->b_rptr; + int i, rval; + + static struct { + int proto; + uint16_t type; + int (*func)(lx_netlink_sock_t *, lx_netlink_hdr_t *, mblk_t *); + } handlers[] = { + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETLINK, lx_netlink_getlink }, + { LX_NETLINK_ROUTE, + LX_NETLINK_RTM_GETADDR, lx_netlink_getaddr }, + { LX_NETLINK_AUDIT, + LX_NETLINK_NLMSG_NONE, lx_netlink_audit }, + { LX_NETLINK_KOBJECT_UEVENT, + LX_NETLINK_NLMSG_NONE, lx_netlink_kobject_uevent }, + { LX_NETLINK_NLMSG_NOOP, LX_NETLINK_NLMSG_NONE, NULL } + }; + + if (DB_TYPE(mp) != M_DATA || MBLKL(mp) < sizeof (lx_netlink_hdr_t)) { + freemsg(mp); + return (EPROTO); + } + + for (i = 0; handlers[i].func != NULL; i++) { + if (lxsock->lxns_proto != handlers[i].proto) + continue; + + if (handlers[i].type != LX_NETLINK_NLMSG_NONE && + hdr->lxnh_type != handlers[i].type) + continue; + + rval = handlers[i].func(lxsock, hdr, mp); + freemsg(mp); + + return (rval); + } + + /* + * An unrecognized message. We will bounce up an EOPNOTSUPP reply. + */ + rval = lx_netlink_reply_error(lxsock, hdr, EOPNOTSUPP); + freemsg(mp); + + return (rval); +} + +/*ARGSUSED*/ +static int +lx_netlink_close(sock_lower_handle_t handle, int flags, cred_t *cr) +{ + lx_netlink_sock_t *lxsock = (lx_netlink_sock_t *)handle, *sock, **prev; + + mutex_enter(&lx_netlink_lock); + + prev = &lx_netlink_head; + + for (sock = *prev; sock != lxsock; sock = sock->lxns_next) + prev = &sock->lxns_next; + + *prev = sock->lxns_next; + + mutex_exit(&lx_netlink_lock); + + (void) ldi_close(lxsock->lxns_iphandle, FREAD, kcred); + (void) ldi_close(lxsock->lxns_ip6handle, FREAD, kcred); + kmem_free(lxsock, sizeof (lx_netlink_sock_t)); + + return (0); +} + +static sock_downcalls_t sock_lx_netlink_downcalls = { + lx_netlink_activate, /* sd_activate */ + sock_accept_notsupp, /* sd_accept */ + lx_netlink_bind, /* sd_bind */ + sock_listen_notsupp, /* sd_listen */ + sock_connect_notsupp, /* sd_connect */ + sock_getpeername_notsupp, /* sd_getpeername */ + lx_netlink_getsockname, /* sd_getsockname */ + sock_getsockopt_notsupp, /* sd_getsockopt */ + lx_netlink_setsockopt, /* sd_setsockopt */ + lx_netlink_send, /* sd_send */ + NULL, /* sd_send_uio */ + NULL, /* sd_recv_uio */ + NULL, /* sd_poll */ + sock_shutdown_notsupp, /* sd_shutdown */ + sock_clr_flowctrl_notsupp, /* sd_setflowctrl */ + sock_ioctl_notsupp, /* sd_ioctl */ + lx_netlink_close /* sd_close */ +}; + +/*ARGSUSED*/ +static sock_lower_handle_t +lx_netlink_create(int family, int type, int proto, + sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, + int flags, cred_t *credp) +{ + lx_netlink_sock_t *lxsock; + ldi_handle_t handle, handle6; + cred_t *kcred = zone_kcred(); + int err; + + if (family != AF_LX_NETLINK || + (type != SOCK_DGRAM && type != SOCK_RAW)) { + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + switch (proto) { + case LX_NETLINK_ROUTE: + case LX_NETLINK_AUDIT: + case LX_NETLINK_KOBJECT_UEVENT: + break; + + default: + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + if ((err = ldi_open_by_name(DEV_IP, FREAD, kcred, + &handle, lx_netlink_ldi)) != 0) { + *errorp = err; + return (NULL); + } + + if ((err = ldi_open_by_name(DEV_IP6, FREAD, kcred, + &handle6, lx_netlink_ldi)) != 0) { + (void) ldi_close(handle, FREAD, kcred); + *errorp = err; + return (NULL); + } + + 
	*sock_downcalls = &sock_lx_netlink_downcalls;
+	*smodep = SM_ATOMIC;
+
+	lxsock = kmem_zalloc(sizeof (lx_netlink_sock_t), KM_SLEEP);
+	lxsock->lxns_iphandle = handle;
+	lxsock->lxns_ip6handle = handle6;
+	lxsock->lxns_bufsize = lx_netlink_bufsize;
+	lxsock->lxns_proto = proto;
+
+	mutex_enter(&lx_netlink_lock);
+
+	lxsock->lxns_next = lx_netlink_head;
+	lx_netlink_head = lxsock;
+
+	mutex_exit(&lx_netlink_lock);
+
+	return ((sock_lower_handle_t)lxsock);
+}
+
+static void
+lx_netlink_init(void)
+{
+	major_t major = mod_name_to_major("ip");
+	int err;
+
+	VERIFY(major != DDI_MAJOR_T_NONE);
+
+	err = ldi_ident_from_major(major, &lx_netlink_ldi);
+	VERIFY(err == 0);
+}
+
+static void
+lx_netlink_fini(void)
+{
+	ldi_ident_release(lx_netlink_ldi);
+}
+
+static smod_reg_t sinfo = {
+	SOCKMOD_VERSION,
+	"lx_netlink",
+	SOCK_UC_VERSION,
+	SOCK_DC_VERSION,
+	lx_netlink_create,
+	NULL
+};
+
+/* modldrv structure */
+static struct modlsockmod sockmod = {
+	&mod_sockmodops, "AF_LX_NETLINK socket module", &sinfo
+};
+
+/* modlinkage structure */
+static struct modlinkage ml = {
+	MODREV_1,
+	&sockmod,
+	NULL
+};
+
+int
+_init(void)
+{
+	int err;
+
+	lx_netlink_init();
+
+	if ((err = mod_install(&ml)) != 0)
+		lx_netlink_fini();
+
+	return (err);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&ml, modinfop));
+}
+
+int
+_fini(void)
+{
+	int err = 0;
+
+	mutex_enter(&lx_netlink_lock);
+
+	if (lx_netlink_head != NULL)
+		err = EBUSY;
+
+	mutex_exit(&lx_netlink_lock);
+
+	if (err == 0 && (err = mod_remove(&ml)) == 0)
+		lx_netlink_fini();
+
+	return (err);
+}
diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.c b/usr/src/uts/common/brand/lx/io/lx_ptm.c
new file mode 100644
index 0000000000..db13b00ea4
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/io/lx_ptm.c
@@ -0,0 +1,1165 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2014 Joyent, Inc.  All rights reserved.
+ */
+
+
+/*
+ * This driver attempts to emulate some of the behaviors of
+ * Linux terminal devices (/dev/ptmx and /dev/pts/[0-9][0-9]*) on Solaris.
+ *
+ * It does this by layering over the /dev/ptmx device and intercepting
+ * opens to it.
+ *
+ * This driver makes the following assumptions about the way the ptm/pts
+ * drivers on Solaris work:
+ *
+ *    - all opens of the /dev/ptmx device node return a unique dev_t.
+ *
+ *    - the dev_t minor node value for each open ptm instance corresponds
+ *      to its associated slave terminal device number.  i.e. the path to
+ *      the slave terminal device associated with an open ptm instance
+ *      whose dev_t minor node value is 5 is /dev/pts/5.
+ *
+ *    - the ptm driver always allocates the lowest numbered slave terminal
+ *      device possible.
+ */
+
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/devops.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/kstr.h>
+#include <sys/lx_ptm.h>
+#include <sys/modctl.h>
+#include <sys/pathname.h>
+#include <sys/ptms.h>
+#include <sys/ptyvar.h>
+#include <sys/stat.h>
+#include <sys/stropts.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+
+#define	LP_PTM_PATH		"/dev/ptmx"
+#define	LP_PTS_PATH		"/dev/pts/"
+#define	LP_PTS_DRV_NAME		"pts"
+#define	LP_PTS_USEC_DELAY	(5 * 1000)	/* 5 ms */
+#define	LP_PTS_USEC_DELAY_MAX	(5 * MILLISEC)	/* 5 ms */
+
+/*
+ * this driver is layered on top of the ptm driver.  we'd like to
+ * make this driver's minor name space a mirror of the ptm driver's
+ * namespace, but we can't actually do this.  the reason is that the
+ * ptm driver is opened via the clone driver, therefore no minor nodes
+ * of the ptm driver are actually accessible via the filesystem.
+ * since we're not a streams device we can't be opened by the clone
+ * driver, therefore we need to have at least one minor node accessible
+ * via the filesystem so that consumers can open it.  we use the device
+ * node with a minor number of 0 for this purpose.  what this means is
+ * that minor node 0 can't be used to map ptm minor node 0.  since this
+ * minor node is now reserved we need to shift our ptm minor node
+ * mappings by one.  i.e. a ptm minor node with a value of 0 will
+ * correspond to our minor node with a value of 1.  these mappings are
+ * managed with the following macros.
+ */
+#define	DEVT_TO_INDEX(x)	LX_PTM_DEV_TO_PTS(x)
+#define	INDEX_TO_MINOR(x)	((x) + 1)
+
+/*
+ * grow our layered handle array by the same size increment that the ptm
+ * driver uses to grow the pty device space - PTY_MAXDELTA
+ */
+#define	LP_PTY_INC	128
+
+/*
+ * lx_ptm_ops contains state information about outstanding operations on the
+ * underlying master terminal device.  Currently we only track information
+ * for read operations.
+ *
+ * Note that this data has not been rolled directly into the lx_ptm_handle
+ * structure because we can't put mutexes or condition variables into the
+ * lx_ptm_handle structure.  The reason is that the array of lx_ptm_handle
+ * structures linked to from the global lx_ptm state can be resized
+ * dynamically, and when it's resized, the new array is at a different
+ * memory location and the old array memory is discarded.  Mutexes and cvs
+ * are accessed based off their address, so if this array was re-sized while
+ * there were outstanding operations on any mutexes or cvs in the array
+ * then the system would tip over.  In the future the lx_ptm_handle structure
+ * array should probably be replaced with either an array of pointers to
+ * lx_ptm_handle structures or some other kind of data structure containing
+ * pointers to lx_ptm_handle structures.  Then the lx_ptm_ops structure
+ * could be folded directly into the lx_ptm_handle structures.  (This will
+ * also require the definition of a new locking mechanism to protect the
+ * contents of lx_ptm_handle structures.)
+ */
+typedef struct lx_ptm_ops {
+	int			lpo_rops;
+	kcondvar_t		lpo_rops_cv;
+	kmutex_t		lpo_rops_lock;
+} lx_ptm_ops_t;
+
+/*
+ * Every open of the master terminal device in a zone results in a new
+ * lx_ptm_handle handle allocation.  These handles are stored in an array
+ * hanging off the lx_ptm_state structure.
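+ *
+ * To make the minor-number bookkeeping concrete: if an open of the
+ * underlying /dev/ptmx yields ptm minor 5, then INDEX_TO_MINOR(5) == 6
+ * is the lx_ptm minor handed back to the caller, DEVT_TO_INDEX() recovers
+ * index 5 on subsequent entry points, lps_lh_array[5] holds the layered
+ * handle, and the associated slave is visible in the zone as /dev/pts/5.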
+ */ +typedef struct lx_ptm_handle { + /* Device handle to the underlying real /dev/ptmx master terminal. */ + ldi_handle_t lph_handle; + + /* Flag to indicate if TIOCPKT mode has been enabled. */ + int lph_pktio; + + /* Number of times the slave device has been opened/closed. */ + int lph_eofed; + + /* Callback handler in the ptm driver to check if slave is open. */ + ptmptsopencb_t lph_ppocb; + + /* Pointer to state for operations on underlying device. */ + lx_ptm_ops_t *lph_lpo; +} lx_ptm_handle_t; + +/* + * Global state for the lx_ptm driver. + */ +typedef struct lx_ptm_state { + /* lx_ptm device devinfo pointer */ + dev_info_t *lps_dip; + + /* LDI ident used to open underlying real /dev/ptmx master terminals. */ + ldi_ident_t lps_li; + + /* pts drivers major number */ + major_t lps_pts_major; + + /* rw lock used to manage access and growth of lps_lh_array */ + krwlock_t lps_lh_rwlock; + + /* number of elements in lps_lh_array */ + uint_t lps_lh_count; + + /* Array of handles to underlying real /dev/ptmx master terminals. */ + lx_ptm_handle_t *lps_lh_array; +} lx_ptm_state_t; + +/* Pointer to the lx_ptm global state structure. */ +static lx_ptm_state_t lps; + +/* + * List of modules to be autopushed onto slave terminal devices when they + * are opened in an lx branded zone. + */ +static char *lx_pts_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +static void +lx_ptm_lh_grow(uint_t index) +{ + uint_t new_lh_count, old_lh_count; + lx_ptm_handle_t *new_lh_array, *old_lh_array; + + /* + * allocate a new array. we drop the rw lock on the array so that + * readers can still access devices in case our memory allocation + * blocks. + */ + new_lh_count = MAX(lps.lps_lh_count + LP_PTY_INC, index + 1); + new_lh_array = + kmem_zalloc(sizeof (lx_ptm_handle_t) * new_lh_count, KM_SLEEP); + + /* + * double check that we still actually need to increase the size + * of the array + */ + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + if (index < lps.lps_lh_count) { + /* someone beat us to it so there's nothing more to do */ + rw_exit(&lps.lps_lh_rwlock); + kmem_free(new_lh_array, + sizeof (lx_ptm_handle_t) * new_lh_count); + return; + } + + /* copy the existing data into the new array */ + ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL)); + ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL)); + if (lps.lps_lh_count != 0) { + bcopy(lps.lps_lh_array, new_lh_array, + sizeof (lx_ptm_handle_t) * lps.lps_lh_count); + } + + /* save info on the old array */ + old_lh_array = lps.lps_lh_array; + old_lh_count = lps.lps_lh_count; + + /* install the new array */ + lps.lps_lh_array = new_lh_array; + lps.lps_lh_count = new_lh_count; + + rw_exit(&lps.lps_lh_rwlock); + + /* free the old array */ + if (old_lh_array != NULL) { + kmem_free(old_lh_array, + sizeof (lx_ptm_handle_t) * old_lh_count); + } +} + +static void +lx_ptm_lh_insert(uint_t index, ldi_handle_t lh) +{ + lx_ptm_ops_t *lpo; + + ASSERT(lh != NULL); + + /* Allocate and initialize the ops structure */ + lpo = kmem_zalloc(sizeof (lx_ptm_ops_t), KM_SLEEP); + mutex_init(&lpo->lpo_rops_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&lpo->lpo_rops_cv, NULL, CV_DEFAULT, NULL); + + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + + /* check if we need to grow the size of the layered handle array */ + if (index >= lps.lps_lh_count) { + rw_exit(&lps.lps_lh_rwlock); + lx_ptm_lh_grow(index); + rw_enter(&lps.lps_lh_rwlock, RW_WRITER); + } + + ASSERT(index < lps.lps_lh_count); + ASSERT(lps.lps_lh_array[index].lph_handle == NULL); + 
	ASSERT(lps.lps_lh_array[index].lph_pktio == 0);
+	ASSERT(lps.lps_lh_array[index].lph_eofed == 0);
+	ASSERT(lps.lps_lh_array[index].lph_lpo == NULL);
+
+	/* insert the new handle and return */
+	lps.lps_lh_array[index].lph_handle = lh;
+	lps.lps_lh_array[index].lph_pktio = 0;
+	lps.lps_lh_array[index].lph_eofed = 0;
+	lps.lps_lh_array[index].lph_lpo = lpo;
+
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+static ldi_handle_t
+lx_ptm_lh_remove(uint_t index)
+{
+	ldi_handle_t lh;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+	ASSERT(lps.lps_lh_array[index].lph_lpo->lpo_rops == 0);
+	ASSERT(!MUTEX_HELD(&lps.lps_lh_array[index].lph_lpo->lpo_rops_lock));
+
+	/* free the ops structure */
+	kmem_free(lps.lps_lh_array[index].lph_lpo, sizeof (lx_ptm_ops_t));
+	lps.lps_lh_array[index].lph_lpo = NULL;
+
+	/* remove the handle and return it */
+	lh = lps.lps_lh_array[index].lph_handle;
+	lps.lps_lh_array[index].lph_handle = NULL;
+	lps.lps_lh_array[index].lph_pktio = 0;
+	lps.lps_lh_array[index].lph_eofed = 0;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (lh);
+}
+
+static void
+lx_ptm_lh_get_ppocb(uint_t index, ptmptsopencb_t *ppocb)
+{
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	*ppocb = lps.lps_lh_array[index].lph_ppocb;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+static void
+lx_ptm_lh_set_ppocb(uint_t index, ptmptsopencb_t *ppocb)
+{
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	lps.lps_lh_array[index].lph_ppocb = *ppocb;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+static ldi_handle_t
+lx_ptm_lh_lookup(uint_t index)
+{
+	ldi_handle_t lh;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	/* return the handle */
+	lh = lps.lps_lh_array[index].lph_handle;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (lh);
+}
+
+static lx_ptm_ops_t *
+lx_ptm_lpo_lookup(uint_t index)
+{
+	lx_ptm_ops_t *lpo;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_lpo != NULL);
+
+	/* return the ops structure */
+	lpo = lps.lps_lh_array[index].lph_lpo;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (lpo);
+}
+
+static int
+lx_ptm_lh_pktio_get(uint_t index)
+{
+	int pktio;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	/* return the pktio state */
+	pktio = lps.lps_lh_array[index].lph_pktio;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (pktio);
+}
+
+static void
+lx_ptm_lh_pktio_set(uint_t index, int pktio)
+{
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	/* set the pktio state */
+	lps.lps_lh_array[index].lph_pktio = pktio;
+	rw_exit(&lps.lps_lh_rwlock);
+}
+
+static int
+lx_ptm_lh_eofed_get(uint_t index)
+{
+	int eofed;
+
+	rw_enter(&lps.lps_lh_rwlock, RW_READER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle != NULL);
+
+	/* return the eofed state */
+	eofed = lps.lps_lh_array[index].lph_eofed;
+	rw_exit(&lps.lps_lh_rwlock);
+	return (eofed);
+}
+
+static void
+lx_ptm_lh_eofed_set(uint_t index)
+{
+	rw_enter(&lps.lps_lh_rwlock, RW_WRITER);
+
+	ASSERT(index < lps.lps_lh_count);
+	ASSERT(lps.lps_lh_array[index].lph_handle !=
NULL); + + /* set the eofed state */ + lps.lps_lh_array[index].lph_eofed++; + rw_exit(&lps.lps_lh_rwlock); +} + +static int +lx_ptm_read_start(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* Wait for other read operations to finish */ + while (lpo->lpo_rops != 0) { + if (cv_wait_sig(&lpo->lpo_rops_cv, &lpo->lpo_rops_lock) == 0) { + mutex_exit(&lpo->lpo_rops_lock); + return (-1); + } + } + + /* Start a read operation */ + VERIFY(++lpo->lpo_rops == 1); + mutex_exit(&lpo->lpo_rops_lock); + return (0); +} + +static void +lx_ptm_read_end(dev_t dev) +{ + lx_ptm_ops_t *lpo = lx_ptm_lpo_lookup(DEVT_TO_INDEX(dev)); + + mutex_enter(&lpo->lpo_rops_lock); + ASSERT(lpo->lpo_rops >= 0); + + /* End a read operation */ + VERIFY(--lpo->lpo_rops == 0); + cv_signal(&lpo->lpo_rops_cv); + + mutex_exit(&lpo->lpo_rops_lock); +} + +static int +lx_ptm_pts_isopen(dev_t dev) +{ + ptmptsopencb_t ppocb; + + lx_ptm_lh_get_ppocb(DEVT_TO_INDEX(dev), &ppocb); + return (ppocb.ppocb_func(ppocb.ppocb_arg)); +} + +static void +lx_ptm_eof_read(ldi_handle_t lh) +{ + struct uio uio; + iovec_t iov; + char junk[1]; + + /* + * We can remove any EOF message from the head of the stream by + * doing a zero byte read from the stream. + */ + iov.iov_len = 0; + iov.iov_base = junk; + uio.uio_iovcnt = 1; + uio.uio_iov = &iov; + uio.uio_resid = iov.iov_len; + uio.uio_offset = 0; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_fmode = 0; + uio.uio_extflg = 0; + uio.uio_llimit = MAXOFFSET_T; + (void) ldi_read(lh, &uio, kcred); +} + +static int +lx_ptm_eof_drop_1(dev_t dev, int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, msg_size, msg_count; + + *rvalp = 0; + + /* + * Check if there is an EOF message (represented by a zero length + * data message) at the head of the stream. Note that the + * I_NREAD ioctl is a streams framework ioctl so it will succeed + * even if there have been previous write errors on this stream. + */ + if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size, + FKIOCTL, kcred, &msg_count)) != 0) + return (err); + + if ((msg_count == 0) || (msg_size != 0)) { + /* No EOF message found */ + return (0); + } + + /* Record the fact that the slave device has been closed. 
 */
+	lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev));
+
+	/* drop the EOF */
+	lx_ptm_eof_read(lh);
+	*rvalp = 1;
+	return (0);
+}
+
+static int
+lx_ptm_eof_drop(dev_t dev, int *rvalp)
+{
+	int rval, err;
+
+	if (rvalp != NULL)
+		*rvalp = 0;
+	for (;;) {
+		if ((err = lx_ptm_eof_drop_1(dev, &rval)) != 0)
+			return (err);
+		if (rval == 0)
+			return (0);
+		if (rvalp != NULL)
+			*rvalp = 1;
+	}
+}
+
+static int
+lx_ptm_data_check(dev_t dev, int ignore_eof, int *rvalp)
+{
+	ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev));
+	int err;
+
+	*rvalp = 0;
+	if (ignore_eof) {
+		int size, rval;
+
+		if ((err = ldi_ioctl(lh, FIONREAD, (intptr_t)&size,
+		    FKIOCTL, kcred, &rval)) != 0)
+			return (err);
+		if (size != 0)
+			*rvalp = 1;
+	} else {
+		int msg_size, msg_count;
+
+		if ((err = ldi_ioctl(lh, I_NREAD, (intptr_t)&msg_size,
+		    FKIOCTL, kcred, &msg_count)) != 0)
+			return (err);
+		if (msg_count != 0)
+			*rvalp = 1;
+	}
+	return (0);
+}
+
+static int
+lx_ptm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int err;
+
+	if (cmd != DDI_ATTACH)
+		return (DDI_FAILURE);
+
+	if (ddi_create_minor_node(dip, LX_PTM_MINOR_NODE, S_IFCHR,
+	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+
+	err = ldi_ident_from_dip(dip, &lps.lps_li);
+	if (err != 0) {
+		ddi_remove_minor_node(dip, ddi_get_name(dip));
+		return (DDI_FAILURE);
+	}
+
+	lps.lps_dip = dip;
+	lps.lps_pts_major = ddi_name_to_major(LP_PTS_DRV_NAME);
+
+	rw_init(&lps.lps_lh_rwlock, NULL, RW_DRIVER, NULL);
+	lps.lps_lh_count = 0;
+	lps.lps_lh_array = NULL;
+
+	return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+lx_ptm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	ldi_ident_release(lps.lps_li);
+	lps.lps_dip = NULL;
+
+	ASSERT((lps.lps_lh_count != 0) || (lps.lps_lh_array == NULL));
+	ASSERT((lps.lps_lh_count == 0) || (lps.lps_lh_array != NULL));
+	if (lps.lps_lh_array != NULL) {
+		kmem_free(lps.lps_lh_array,
+		    sizeof (lx_ptm_handle_t) * lps.lps_lh_count);
+		lps.lps_lh_array = NULL;
+		lps.lps_lh_count = 0;
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+lx_ptm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+	struct strioctl iocb;
+	ptmptsopencb_t ppocb = { NULL, NULL };
+	ldi_handle_t lh;
+	major_t maj, our_major = getmajor(*devp);
+	minor_t min, lastmin;
+	uint_t index, anchor = 1;
+	dev_t ptm_dev;
+	int err, rval = 0;
+
+	/*
+	 * Don't support the FNDELAY and FNONBLOCK flags until we either
+	 * find a Linux app that opens /dev/ptmx with the O_NDELAY
+	 * or O_NONBLOCK flags explicitly, or until we create test cases
+	 * to determine how reads of master terminal devices opened with
+	 * these flags behave in different situations on Linux.  Supporting
+	 * these flags will involve enhancing our read implementation
+	 * and changing the way it deals with EOF notifications.
+	 */
+	if (flag & (FNDELAY | FNONBLOCK))
+		return (ENOTSUP);
+
+	/*
+	 * we're layered on top of the ptm driver so open that driver
+	 * first.  (note that we're opening /dev/ptmx in the global
+	 * zone, not ourselves in the Linux zone.)
+	 */
+	err = ldi_open_by_name(LP_PTM_PATH, flag, credp, &lh, lps.lps_li);
+	if (err != 0)
+		return (err);
+
+	/* get the devt returned by the ptmx open */
+	err = ldi_get_dev(lh, &ptm_dev);
+	if (err != 0) {
+		(void) ldi_close(lh, flag, credp);
+		return (err);
+	}
+
the ptmx is also a cloning driver so we'll just use + its minor number as our minor number (it already manages its + minor name space, so no reason to duplicate the effort.) + */ + index = getminor(ptm_dev); + *devp = makedevice(our_major, INDEX_TO_MINOR(index)); + + /* Get a callback function to query if the pts device is open. */ + iocb.ic_cmd = PTMPTSOPENCB; + iocb.ic_timout = 0; + iocb.ic_len = sizeof (ppocb); + iocb.ic_dp = (char *)&ppocb; + + err = ldi_ioctl(lh, I_STR, (intptr_t)&iocb, FKIOCTL, kcred, &rval); + if ((err != 0) || (rval != 0)) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + ASSERT(ppocb.ppocb_func != NULL); + + /* + * now set up autopush for the terminal slave device. this is + * necessary so that when a Linux program opens the device we + * can push required strmod modules onto the stream. in Solaris + * this is normally done by the application that actually + * allocates the terminal. + */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + err = kstr_autopush(SET_AUTOPUSH, &maj, &min, &lastmin, + &anchor, lx_pts_mods); + if (err != 0) { + (void) ldi_close(lh, flag, credp); + return (EIO); /* XXX return something else here? */ + } + + /* save off this layered handle for future accesses */ + lx_ptm_lh_insert(index, lh); + lx_ptm_lh_set_ppocb(index, &ppocb); + return (0); +} + +/*ARGSUSED*/ +static int +lx_ptm_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + ldi_handle_t lh; + major_t maj; + minor_t min, lastmin; + uint_t index; + int err; + int i; + + index = DEVT_TO_INDEX(dev); + + /* + * we must clean up all the state associated with this major/minor + * terminal pair before actually closing the ptm master device. + * this is required because once the close of the ptm device is + * complete, the major/minor terminal pair is immediately available + * for re-use in any zone. + */ + + /* free up our saved reference for this layered handle */ + lh = lx_ptm_lh_remove(index); + + /* unconfigure autopush for the associated terminal slave device */ + maj = lps.lps_pts_major; + min = index; + lastmin = 0; + for (i = 0; i < 5; i++) { + /* + * we loop here because we don't want to release this ptm + * node if autopush can't be disabled on the associated + * slave device, because then bad things could happen if + * another brand were to get this terminal allocated + * to them. If we keep failing we eventually drive on so that + * things don't hang. + */ + err = kstr_autopush(CLR_AUTOPUSH, &maj, &min, &lastmin, + 0, NULL); + if (err == 0) + break; + + cmn_err(CE_WARN, "lx zoneid %d: error %d on kstr_autopush", + getzoneid(), err); + + /* wait one second and try again */ + delay(drv_usectohz(1000000)); + } + + err = ldi_close(lh, flag, credp); + + /* + * note that we don't have to bother with changing the permissions + * on the associated slave device here. the reason is that no one + * can actually open the device until its associated master + * device is re-opened, which will result in the permissions on + * it being reset. + */ + return (err); +} + +static int +lx_ptm_read_loop(dev_t dev, struct uio *uiop, cred_t *credp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err, rval; + struct uio uio = *uiop; + + *loop = 0; + + /* + * Here's another way that Linux master terminals behave differently + * from Solaris master terminals.
If you do a read on a Linux + master terminal (that was opened without NDELAY and NONBLOCK) + whose corresponding slave terminal is currently closed and + has been opened and closed at least once, Linux returns -1 and + sets errno to EIO, whereas Solaris blocks. + */ + if (lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev))) { + /* Slave has been opened and closed at least once. */ + if (lx_ptm_pts_isopen(dev) == 0) { + /* + * Slave is closed. Make sure that data is available + * before attempting a read. + */ + if ((err = lx_ptm_data_check(dev, 0, &rval)) != 0) + return (err); + + /* If there is no data available then return. */ + if (rval == 0) + return (EIO); + } + } + + /* Actually do the read operation. */ + if ((err = ldi_read(lh, uiop, credp)) != 0) + return (err); + + /* If read returned actual data then return. */ + if (uio.uio_resid != uiop->uio_resid) + return (0); + + /* + * This was a zero byte read (ie, an EOF). This indicates + * that the slave terminal device has been closed. Record + * the fact that the slave device has been closed and retry + * the read operation. + */ + lx_ptm_lh_eofed_set(DEVT_TO_INDEX(dev)); + *loop = 1; + return (0); +} + +static int +lx_ptm_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int pktio = lx_ptm_lh_pktio_get(DEVT_TO_INDEX(dev)); + int err, loop; + struct uio uio; + struct iovec iovp; + + ASSERT(uiop->uio_iovcnt > 0); + + /* + * If packet mode has been enabled (via TIOCPKT) we need to pad + * all read requests with a leading byte that indicates any + * relevant control status information. + */ + if (pktio != 0) { + /* + * We'd like to write the control information into + * the current buffer but we can't yet. We don't + * want to modify userspace memory here only to have + * the read operation fail later. So instead + * what we'll do here is read one character from the + * beginning of the memory pointed to by the uio + * structure. This will advance the output pointer + * by one. Then when the read completes successfully + * we can update the byte that we passed over. Before + * we do the read make a copy of the current uiop and + * iovec structs so we can write to them later. + */ + uio = *uiop; + iovp = *uiop->uio_iov; + uio.uio_iov = &iovp; + + if (uwritec(uiop) == -1) + return (EFAULT); + } + + do { + /* + * Before we actually attempt a read operation we need + * to make sure there's some buffer space to actually + * read in some data. We do this because if we're in + * pktio mode and the caller only requested one byte, + * then we've already used up that one byte and we + * don't want to pass on this read request. Doing a 0 + * byte read (unless there is a problem with the stream + * head) always returns success. Normally when a streams + * read returns 0 bytes we interpret that as an EOF on + * the stream (ie, the slave side has been opened and + * closed) and we ignore it and re-try the read operation. + * So if we pass on a 0 byte read here lx_ptm_read_loop() + * will tell us to loop around and we'll end up in an + * infinite loop. + */ + if (uiop->uio_resid == 0) + break; + + /* + * Serialize all reads. We need to do this so that we can + * properly emulate the behavior of master terminals on Linux. + * In reality this serialization should not pose any kind of + * performance problem since it would be very strange to have + * multiple threads trying to read from the same master + * terminal device concurrently.
+ */ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_read_loop(dev, uiop, credp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + + if (pktio != 0) { + uint8_t pktio_data = TIOCPKT_DATA; + + /* + * Note that the control status information we + * pass back is faked up in the sense that we + * don't actually report any events, we always + * report a status of 0. + */ + if (uiomove(&pktio_data, 1, UIO_READ, &uio) != 0) + return (EFAULT); + } + + return (0); +} + +static int +lx_ptm_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + err = ldi_write(lh, uiop, credp); + + return (err); +} + +static int +lx_ptm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + int err; + + /* + * here we need to make sure that we never allow the + * I_SETSIG and I_ESETSIG ioctls to pass through. we + * do this because we can't support them. + * + * the native Solaris ptm device supports these ioctls because + * they are streams framework ioctls and all streams devices + * support them by default. these ioctls cause the current + * process to be registered with a stream and receive signals + * when certain stream events occur. + * + * a problem arises with cleanup of these registrations + * for layered drivers. + * + * normally the streams framework is notified whenever a + * process closes any reference to a stream and it goes ahead + * and cleans up these registrations. but actual device drivers + * are not notified when a process performs a close operation + * unless the process is closing the last opened reference to + * the device on the entire system. + * + * so while we could pass these ioctls on and allow processes + * to register for signal delivery, we would never receive + * any notification when those processes exit (or close a + * stream) and we wouldn't be able to unregister them. + * + * luckily these operations are streams specific and Linux + * doesn't support streams devices. so it doesn't actually + * seem like we need to support these ioctls. if it turns + * out that we do need to support them for some reason in + * the future, the current driver model will have to be + * enhanced to better support streams device layering. + */ + if ((cmd == I_SETSIG) || (cmd == I_ESETSIG)) + return (EINVAL); + + /* + * here we fake up support for TIOCPKT. Linux applications expect + * /dev/ptmx to support this ioctl, but on Solaris it doesn't. + * (it is supported on older bsd style ptys.) so we'll fake + * up support for it here. + * + * the reason that this ioctl is emulated here instead of in + * userland is that this ioctl affects the results returned + * from read() operations. if this ioctl was emulated in + * userland the brand library would need to intercept all + * read operations and check to see if pktio was enabled + * for the fd being read from. since this ioctl only needs + * to be supported on the ptmx device it makes more sense + * to support it here where we can easily update the results + * returned for read() operations performed on ourselves.
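+ * + * as a purely illustrative sketch (not part of the driver itself), a + * Linux application would typically enable packet mode and consume + * the leading status byte roughly like this, where masterfd is a + * hypothetical descriptor for the master device: + * + * int on = 1; + * char buf[128]; + * ssize_t n; + * + * (void) ioctl(masterfd, TIOCPKT, &on); + * n = read(masterfd, buf, sizeof (buf)); + * + * after which buf[0] holds TIOCPKT_DATA (0) or a status mask, and + * buf[1] through buf[n - 1] hold the actual terminal data.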
+ */ + if (cmd == TIOCPKT) { + int pktio; + + if (ddi_copyin((void *)arg, &pktio, sizeof (pktio), + mode) != DDI_SUCCESS) + return (EFAULT); + + if (pktio == 0) + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 0); + else + lx_ptm_lh_pktio_set(DEVT_TO_INDEX(dev), 1); + + return (0); + } + + err = ldi_ioctl(lh, cmd, arg, mode, credp, rvalp); + + return (err); +} + +static int +lx_ptm_poll_loop(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp, int *loop) +{ + ldi_handle_t lh = lx_ptm_lh_lookup(DEVT_TO_INDEX(dev)); + short reventsp2; + int err, rval; + + *loop = 0; + + /* + * If the slave device has been opened and closed at least + * once and the slave device is currently closed, then poll + * always needs to return immediately. + */ + if ((lx_ptm_lh_eofed_get(DEVT_TO_INDEX(dev)) != 0) && + (lx_ptm_pts_isopen(dev) == 0)) { + /* In this case always return POLLHUP */ + *reventsp = POLLHUP; + + /* + * Check if there really is data on the stream. + * If so set the correct return flags. + */ + if ((err = lx_ptm_data_check(dev, 1, &rval)) != 0) { + /* Something went wrong. */ + return (err); + } + if (rval != 0) + *reventsp |= (events & (POLLIN | POLLRDNORM)); + + /* + * Is the user checking for writability? Note that for ptm + * devices Linux seems to ignore the POLLWRBAND write flag. + */ + if ((events & POLLWRNORM) == 0) + return (0); + + /* + * To check if the stream is writable we have to actually + * call poll, but make sure to set anyyet to 1 to prevent + * the streams framework from setting up callbacks. + */ + if ((err = ldi_poll(lh, POLLWRNORM, 1, &reventsp2, NULL)) != 0) + return (err); + + *reventsp |= (reventsp2 & POLLWRNORM); + } else { + int lockstate; + + /* The slave device is open, do the poll */ + if ((err = ldi_poll(lh, events, anyyet, reventsp, phpp)) != 0) + return (err); + + /* + * Drop any leading EOFs on the stream. + * + * Note that we have to use pollunlock() here to avoid + * recursive mutex enters in the poll framework. The + * reason is that if there is an EOF message on the stream + * then the act of reading from the queue to remove the + * message can cause the ptm driver's event service + * routine to be invoked, and if there is no open + * slave device then the ptm driver may generate + * error messages and put them on the stream. This + * in turn will generate a poll event and the poll + * framework will try to invoke any poll callbacks + * associated with the stream. In the process of + * doing that the poll framework will try to acquire + * locks that we are already holding. So we need to + * drop those locks here before we do our read. + */ + lockstate = pollunlock(); + err = lx_ptm_eof_drop(dev, &rval); + pollrelock(lockstate); + if (err) + return (err); + + /* If no EOF was dropped then return */ + if (rval == 0) + return (0); + + /* + * An EOF was removed from the stream. Retry the entire + * poll operation from the top because polls on the ptm + * device should behave differently now. + */ + *loop = 1; + } + return (0); +} + +static int +lx_ptm_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int loop, err; + + do { + /* Serialize ourselves with respect to read operations.
*/ + if (lx_ptm_read_start(dev) != 0) + return (EINTR); + + err = lx_ptm_poll_loop(dev, + events, anyyet, reventsp, phpp, &loop); + lx_ptm_read_end(dev); + if (err != 0) + return (err); + } while (loop != 0); + return (0); +} + +static struct cb_ops lx_ptm_cb_ops = { + lx_ptm_open, /* open */ + lx_ptm_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + lx_ptm_read, /* read */ + lx_ptm_write, /* write */ + lx_ptm_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + lx_ptm_poll, /* chpoll */ + ddi_prop_op, /* prop_op */ + NULL, /* cb_str */ + D_NEW | D_MP, + CB_REV, + NULL, + NULL +}; + +static struct dev_ops lx_ptm_ops = { + DEVO_REV, + 0, + ddi_getinfo_1to1, + nulldev, + nulldev, + lx_ptm_attach, + lx_ptm_detach, + nodev, + &lx_ptm_cb_ops, + NULL, + NULL, + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* type of module */ + "Linux master terminal driver", /* description of module */ + &lx_ptm_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/brand/lx/io/lx_ptm.conf b/usr/src/uts/common/brand/lx/io/lx_ptm.conf new file mode 100644 index 0000000000..481b4e3c74 --- /dev/null +++ b/usr/src/uts/common/brand/lx/io/lx_ptm.conf @@ -0,0 +1,27 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +#ident "%Z%%M% %I% %E% SMI" + +name="lx_ptm" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/brand/lx/os/lx_brand.c b/usr/src/uts/common/brand/lx/os/lx_brand.c new file mode 100644 index 0000000000..c22d782b68 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_brand.c @@ -0,0 +1,1873 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015, Joyent, Inc. All rights reserved. + */ + +/* + * The LX Brand: emulation of a Linux operating environment within a zone. + * + * OVERVIEW + * + * The LX brand enables a full Linux userland -- including a C library, + * init(1) framework, and some set of applications -- to run unmodified + * within an illumos zone. Unlike illumos, where applications are expected + * to link against and consume functions exported from libraries, the + * supported Linux binary compatibility boundary is the system call + * interface. By accurately emulating the behaviour of Linux system calls, + * Linux software can be executed in this environment as if it were running + * on a native Linux system. + * + * EMULATING LINUX SYSTEM CALLS + * + * Linux system calls are made in 32-bit processes via the "int 0x80" + * instruction; in 64-bit processes the "syscall" instruction is used, as it + * is with native illumos processes. In both cases, arguments to system + * calls are generally passed in registers and the usermode stack is not + * interpreted or modified by the Linux kernel. + * + * When the emulated Linux process makes a system call, it traps into the + * illumos kernel. The in-kernel brand module contains various emulation + * routines, and can fully service some emulated system calls; e.g. read(2) + * and write(2). Other system calls require assistance from the illumos + * libc, bouncing back out to the brand library ("lx_brand.so.1") for + * emulation. + * + * The brand mechanism allows for the provision of an alternative trap + * handler for the various system call mechanisms. Traditionally this was + * used to immediately revector execution to the usermode emulation library, + * which was responsible for handling all system calls. In the interests of + * more accurate emulation and increased performance, much of the regular + * illumos system call path is now invoked. Only the argument processing and + * handler dispatch are replaced by the brand, via the per-LWP + * "lwp_brand_syscall" interposition function pointer. + * + * THE NATIVE AND BRAND STACKS + * + * Some runtime environments (e.g. the Go language) allocate very small + * thread stacks, preferring to grow or split the stack as necessary. The + * Linux kernel generally does not use the usermode stack when servicing + * system calls, so this is not a problem. In order for our emulation to + * have the same zero stack impact, we must execute usermode emulation + * routines on an _alternate_ stack. This is similar, in principle, to the + * use of sigaltstack(3C) to run signal handlers off the main thread stack. + * + * To this end, the brand library allocates and installs an alternate stack + * (called the "native" stack) for each LWP. The in-kernel brand code uses + * this stack for usermode emulation calls and interposed signal delivery, + * while the emulated Linux process sees only the data on the main thread + * stack, known as the "brand" stack. The stack mode is tracked in the + * per-LWP brand-private data, using the LX_STACK_MODE_* enum. + * + * The stack mode doubles as a system call "mode bit". 
When in the + * LX_STACK_MODE_BRAND mode, system calls are processed as emulated Linux + * system calls. In other modes, system calls are assumed to be native + * illumos system calls as made during brand library initialisation and + * usermode emulation. + * + * USERMODE EMULATION + * + * When a Linux system call cannot be emulated within the kernel, we preserve + * the register state of the Linux process and revector the LWP to the brand + * library usermode emulation handler: the "lx_emulate()" function in + * "lx_brand.so.1". This revectoring is modelled on the delivery of signals, + * and is performed in "lx_emulate_user()". + * + * First, the emulated process state is written out to the usermode stack of + * the process as a "ucontext_t" object. Arguments to the emulation routine + * are passed on the stack or in registers, depending on the ABI. When the + * usermode emulation is complete, the result is passed back to the kernel + * (via the "B_EMULATION_DONE" brandsys subcommand) with the saved context + * for restoration. + * + * SIGNAL DELIVERY, SETCONTEXT AND GETCONTEXT + * + * When servicing emulated system calls in the usermode brand library, or + * during signal delivery, various state is preserved by the kernel so that + * the running LWP may be revectored to a handling routine. The context + * allows the kernel to restart the program at the point of interruption, + * either at the return of the signal handler, via setcontext(3C); or after + * the usermode emulation request has been serviced, via B_EMULATION_DONE. + * + * In illumos native processes, the saved context (a "ucontext_t" object) + * includes the state of registers and the current signal mask at the point + * of interruption. The context also includes a link to the most recently + * saved context, forming a chain to be unwound as requests complete. The LX + * brand requires additional book-keeping to describe the machine state: in + * particular, the current stack mode and the occupied extent of the native + * stack. + * + * The brand code is able to interpose on the context save and restore + * operations in the kernel -- see "lx_savecontext()" and + * "lx_restorecontext()" -- to enable getcontext(3C) and setcontext(3C) to + * function correctly in the face of a dual stack LWP. The brand also + * interposes on the signal delivery mechanism -- see "lx_sendsig()" and + * "lx_sendsig_stack()" -- to allow all signals to be delivered to the brand + * library interposer on the native stack, regardless of the interrupted + * execution mode. Linux sigaltstack(2) emulation is performed entirely by + * the usermode brand library during signal handler interposition. 
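+ * + * To make the register conventions above concrete, a hand-rolled Linux + * system call in a 64-bit process looks roughly like this (purely an + * illustrative sketch; msg and len are hypothetical symbols): + * + * movq $1, %rax <- system call number (write) + * movq $1, %rdi <- arg0: file descriptor + * movq $msg, %rsi <- arg1: buffer + * movq $len, %rdx <- arg2: length + * syscall + * + * In a 32-bit process the number is passed in %eax, the arguments in + * %ebx, %ecx, %edx, %esi, %edi and %ebp, and the trap is "int $0x80".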
+ */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/exec.h> +#include <sys/lx_impl.h> +#include <sys/machbrand.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_misc.h> +#include <sys/lx_futex.h> +#include <sys/lx_brand.h> +#include <sys/param.h> +#include <sys/termios.h> +#include <sys/sunddi.h> +#include <sys/ddi.h> +#include <sys/vnode.h> +#include <sys/pathname.h> +#include <sys/auxv.h> +#include <sys/priv.h> +#include <sys/regset.h> +#include <sys/privregs.h> +#include <sys/archsystm.h> +#include <sys/zone.h> +#include <sys/brand.h> +#include <sys/sdt.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> +#include <sys/core.h> +#include <sys/stack.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <lx_signum.h> + +int lx_debug = 0; + +void lx_init_brand_data(zone_t *); +void lx_free_brand_data(zone_t *); +void lx_setbrand(proc_t *); +int lx_getattr(zone_t *, int, void *, size_t *); +int lx_setattr(zone_t *, int, void *, size_t); +int lx_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, + uintptr_t, uintptr_t); +void lx_set_kern_version(zone_t *, char *); +void lx_copy_procdata(proc_t *, proc_t *); + +extern int getsetcontext(int, void *); +extern int waitsys(idtype_t, id_t, siginfo_t *, int); +#if defined(_SYSCALL32_IMPL) +extern int getsetcontext32(int, void *); +extern int waitsys32(idtype_t, id_t, siginfo_t *, int); +#endif + +extern void lx_proc_exit(proc_t *); +extern int lx_sched_affinity(int, uintptr_t, int, uintptr_t, int64_t *); + +extern void lx_ioctl_init(); +extern void lx_ioctl_fini(); + +lx_systrace_f *lx_systrace_entry_ptr; +lx_systrace_f *lx_systrace_return_ptr; + +static int lx_systrace_enabled; + +/* + * While this is effectively mmu.hole_start - PAGESIZE, we don't particularly + * want an MMU dependency here (and should there be a microprocessor without + * a hole, we don't want to start allocating from the top of the VA range). 
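+ * + * For reference, 0x7ffffff00000 sits just below 0x800000000000, the + * start of the VA hole (and thus the end of the lower canonical + * region) on current x86-64 processors.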
+ */ +#define LX_MAXSTACK64 0x7ffffff00000 + +uint64_t lx_maxstack64 = LX_MAXSTACK64; + +static int lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, + struct intpdata *idata, int level, long *execsz, int setid, + caddr_t exec_file, struct cred *cred, int *brand_action); + +static boolean_t lx_native_exec(uint8_t, const char **); +static uint32_t lx_map32limit(proc_t *); + +static void lx_savecontext(ucontext_t *); +static void lx_restorecontext(ucontext_t *); +static caddr_t lx_sendsig_stack(int); +static void lx_sendsig(int); +#if defined(_SYSCALL32_IMPL) +static void lx_savecontext32(ucontext32_t *); +#endif +static int lx_setid_clear(vattr_t *, cred_t *); +#if defined(_LP64) +static int lx_pagefault(proc_t *, klwp_t *, caddr_t, enum fault_type, + enum seg_rw); +#endif + + +/* lx brand */ +struct brand_ops lx_brops = { + lx_init_brand_data, /* b_init_brand_data */ + lx_free_brand_data, /* b_free_brand_data */ + lx_brandsys, /* b_brandsys */ + lx_setbrand, /* b_setbrand */ + lx_getattr, /* b_getattr */ + lx_setattr, /* b_setattr */ + lx_copy_procdata, /* b_copy_procdata */ + lx_proc_exit, /* b_proc_exit */ + lx_exec, /* b_exec */ + lx_setrval, /* b_lwp_setrval */ + lx_lwpdata_alloc, /* b_lwpdata_alloc */ + lx_lwpdata_free, /* b_lwpdata_free */ + lx_initlwp, /* b_initlwp */ + lx_forklwp, /* b_forklwp */ + lx_freelwp, /* b_freelwp */ + lx_exitlwp, /* b_lwpexit */ + lx_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + NULL, /* b_psig_to_proc */ + NSIG, /* b_nsig */ + lx_exit_with_sig, /* b_exit_with_sig */ + lx_wait_filter, /* b_wait_filter */ + lx_native_exec, /* b_native_exec */ + NULL, /* b_ptrace_exectrap */ + lx_map32limit, /* b_map32limit */ + lx_stop_notify, /* b_stop_notify */ + lx_waitid_helper, /* b_waitid_helper */ + lx_sigcld_repost, /* b_sigcld_repost */ + lx_ptrace_issig_stop, /* b_issig_stop */ + lx_ptrace_sig_ignorable, /* b_sig_ignorable */ + lx_savecontext, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + lx_savecontext32, /* b_savecontext32 */ +#endif + lx_restorecontext, /* b_restorecontext */ + lx_sendsig_stack, /* b_sendsig_stack */ + lx_sendsig, /* b_sendsig */ + lx_setid_clear, /* b_setid_clear */ +#if defined(_LP64) + lx_pagefault /* b_pagefault */ +#else + NULL +#endif +}; + +struct brand_mach_ops lx_mops = { + NULL, + NULL, + NULL, + NULL, + NULL, + lx_fixsegreg, + lx_fsbase +}; + +struct brand lx_brand = { + BRAND_VER_1, + "lx", + &lx_brops, + &lx_mops, + sizeof (struct lx_proc_data) +}; + +static struct modlbrand modlbrand = { + &mod_brandops, "lx brand", &lx_brand +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlbrand, NULL +}; + +void +lx_proc_exit(proc_t *p) +{ + VERIFY(p->p_brand == &lx_brand); + VERIFY(p->p_brand_data != NULL); +} + +void +lx_setbrand(proc_t *p) +{ + /* Send SIGCHLD to parent by default when child exits */ + ptolxproc(p)->l_signal = stol_signo[SIGCHLD]; +} + +/* ARGSUSED */ +int +lx_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) +{ + char vers[LX_VERS_MAX]; + + if (attr == LX_KERN_VERSION_NUM) { + if (bufsize > (LX_VERS_MAX - 1)) + return (ERANGE); + bzero(vers, LX_VERS_MAX); + if (copyin(buf, &vers, bufsize) != 0) + return (EFAULT); + lx_set_kern_version(zone, vers); + return (0); + } + return (EINVAL); +} + +/* ARGSUSED */ +int +lx_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) +{ + if (attr == LX_KERN_VERSION_NUM) { + if (*bufsize < LX_VERS_MAX) + return (ERANGE); + if (copyout(lx_get_zone_kern_version(curzone), buf, 
+ LX_VERS_MAX) != 0) + return (EFAULT); + *bufsize = LX_VERS_MAX; + return (0); + } + return (-EINVAL); +} + +uint32_t +lx_map32limit(proc_t *p) +{ + /* + * To be bug-for-bug compatible with Linux, we have MAP_32BIT only + * allow mappings in the first 31 bits. This was a nuance in the + * original Linux implementation circa 2002, and applications have + * come to depend on its behavior. + * + * This is only relevant for 64-bit processes. + */ + if (p->p_model == DATAMODEL_LP64) + return (1 << 31); + + return ((uint32_t)USERLIMIT32); +} + +void +lx_brand_systrace_enable(void) +{ + VERIFY(!lx_systrace_enabled); + + lx_systrace_enabled = 1; +} + +void +lx_brand_systrace_disable(void) +{ + VERIFY(lx_systrace_enabled); + + lx_systrace_enabled = 0; +} + +void +lx_lwp_set_native_stack_current(lx_lwp_data_t *lwpd, uintptr_t new_sp) +{ + VERIFY(lwpd->br_ntv_stack != 0); + + /* + * The "brand-lx-set-ntv-stack-current" probe has arguments: + * arg0: stack pointer before change + * arg1: stack pointer after change + * arg2: current stack base + */ + DTRACE_PROBE3(brand__lx__set__ntv__stack__current, + uintptr_t, lwpd->br_ntv_stack_current, + uintptr_t, new_sp, + uintptr_t, lwpd->br_ntv_stack); + + lwpd->br_ntv_stack_current = new_sp; +} + +#if defined(_LP64) +static int +lx_pagefault(proc_t *p, klwp_t *lwp, caddr_t addr, enum fault_type type, + enum seg_rw rw) +{ + int syscall_num; + + /* + * We only want to handle a very specific set of circumstances. + * Namely: this is a 64-bit LX-branded process attempting to execute an + * address in a page for which it does not have a valid mapping. If + * this is not the case, we bail out as fast as possible. + */ + VERIFY(PROC_IS_BRANDED(p)); + if (type != F_INVAL || rw != S_EXEC || lwp_getdatamodel(lwp) != + DATAMODEL_NATIVE) { + return (-1); + } + + if (!lx_vsyscall_iscall(lwp, (uintptr_t)addr, &syscall_num)) { + return (-1); + } + + /* + * This is a valid vsyscall address. We service the system call and + * return 0 to signal that the pagefault has been handled completely. + */ + lx_vsyscall_enter(p, lwp, syscall_num); + return (0); +} +#endif + +/* + * This hook runs prior to sendsig() processing and allows us to nominate + * an alternative stack pointer for delivery of the signal handling frame. + * Critically, this routine should _not_ modify any LWP state as the + * savecontext() does not run until after this hook. + */ +static caddr_t +lx_sendsig_stack(int sig) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * We want to take signal delivery on the native stack, but only if + * one has been allocated and installed for this LWP. + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + /* + * The program is not running on the native stack. Return + * the native stack pointer from our brand-private data so + * that we may switch to it for signal handling. + */ + return ((caddr_t)lwpd->br_ntv_stack_current); + } else { + struct regs *rp = lwptoregs(lwp); + + /* + * Either the program is already running on the native stack, + * or one has not yet been allocated for this LWP. Use the + * current stack pointer value. + */ + return ((caddr_t)rp->r_sp); + } +} + +/* + * This hook runs after sendsig() processing and allows us to update the + * per-LWP mode flags for system calls and stacks. The pre-signal + * context has already been saved and delivered to the user at this point. 
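+ * + * In terms of the LX_STACK_MODE_* states, the transition performed + * here is effectively: + * + * BRAND or NATIVE -> NATIVE (handler runs on the native stack) + * PREINIT or INIT -> unchanged (no alternate stack exists yet)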
+ */ +static void +lx_sendsig(int sig) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + struct regs *rp = lwptoregs(lwp); + + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_BRAND: + case LX_STACK_MODE_NATIVE: + /* + * In lx_sendsig_stack(), we nominated a stack pointer from the + * native stack. Update the stack mode, and the current in-use + * extent of the native stack, accordingly: + */ + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + lx_lwp_set_native_stack_current(lwpd, rp->r_sp); + + /* + * Fix up segment registers, etc. + */ + lx_switch_to_native(lwp); + break; + + default: + /* + * Otherwise, the brand library has not yet installed the + * alternate stack for this LWP. Signals will be handled on + * the regular stack thread. + */ + return; + } +} + +/* + * This hook runs prior to the context restoration, allowing us to take action + * or modify the context before it is loaded. + */ +static void +lx_restorecontext(ucontext_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + uintptr_t flags = (uintptr_t)ucp->uc_brand_data[0]; + caddr_t sp = ucp->uc_brand_data[1]; + + /* + * We have a saved native stack pointer value that we must restore + * into the per-LWP data. + */ + if (flags & LX_UC_RESTORE_NATIVE_SP) { + lx_lwp_set_native_stack_current(lwpd, (uintptr_t)sp); + } + + /* + * We do not wish to restore the value of uc_link in this context, + * so replace it with the value currently in the LWP. + */ + if (flags & LX_UC_IGNORE_LINK) { + ucp->uc_link = (ucontext_t *)lwp->lwp_oldcontext; + } + + /* + * Restore the stack mode: + */ + if (flags & LX_UC_STACK_NATIVE) { + lwpd->br_stack_mode = LX_STACK_MODE_NATIVE; + } else if (flags & LX_UC_STACK_BRAND) { + lwpd->br_stack_mode = LX_STACK_MODE_BRAND; + } + +#if defined(__amd64) + /* + * Override the fs/gsbase in the context with the value provided + * through the Linux arch_prctl(2) system call. + */ + if (flags & LX_UC_STACK_BRAND) { + if (lwpd->br_lx_fsbase != 0) { + ucp->uc_mcontext.gregs[REG_FSBASE] = lwpd->br_lx_fsbase; + } + if (lwpd->br_lx_gsbase != 0) { + ucp->uc_mcontext.gregs[REG_GSBASE] = lwpd->br_lx_gsbase; + } + } +#endif +} + +static void +lx_savecontext(ucontext_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + uintptr_t flags = 0; + + /* + * The ucontext_t affords us three private pointer-sized members in + * "uc_brand_data". We pack a variety of flags into the first element, + * and an optional stack pointer in the second element. The flags + * determine which stack pointer (native or brand), if any, is stored + * in the second element. The third element may contain the system + * call number; this is analogous to the "orig_[er]ax" member of a + * Linux "user_regs_struct". 
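+ * + * In summary, the layout used here is: + * + * uc_brand_data[0] LX_UC_* flags + * uc_brand_data[1] native or brand stack pointer (per the flags) + * uc_brand_data[2] system call number, or NULL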
+ */ + + if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && + lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + /* + * Record the value of the native stack pointer to restore + * when returning to this branded context: + */ + flags |= LX_UC_RESTORE_NATIVE_SP; + ucp->uc_brand_data[1] = (void *)lwpd->br_ntv_stack_current; + } + + /* + * Save the stack mode: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { + flags |= LX_UC_STACK_NATIVE; + } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + flags |= LX_UC_STACK_BRAND; + } + + /* + * If we might need to restart this system call, save that information + * in the context: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + ucp->uc_brand_data[2] = + (void *)(uintptr_t)lwpd->br_syscall_num; + if (lwpd->br_syscall_restart) { + flags |= LX_UC_RESTART_SYSCALL; + } + } else { + ucp->uc_brand_data[2] = NULL; + } + + ucp->uc_brand_data[0] = (void *)flags; +} + +#if defined(_SYSCALL32_IMPL) +static void +lx_savecontext32(ucontext32_t *ucp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + unsigned int flags = 0; + + /* + * The ucontext_t affords us three private pointer-sized members in + * "uc_brand_data". We pack a variety of flags into the first element, + * and an optional stack pointer in the second element. The flags + * determine which stack pointer (native or brand), if any, is stored + * in the second element. The third element may contain the system + * call number; this is analogous to the "orig_[er]ax" member of a + * Linux "user_regs_struct". + */ + + if (lwpd->br_stack_mode != LX_STACK_MODE_INIT && + lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + /* + * Record the value of the native stack pointer to restore + * when returning to this branded context: + */ + flags |= LX_UC_RESTORE_NATIVE_SP; + ucp->uc_brand_data[1] = (caddr32_t)lwpd->br_ntv_stack_current; + } + + /* + * Save the stack mode: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_NATIVE) { + flags |= LX_UC_STACK_NATIVE; + } else if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + flags |= LX_UC_STACK_BRAND; + } + + /* + * If we might need to restart this system call, save that information + * in the context: + */ + if (lwpd->br_stack_mode == LX_STACK_MODE_BRAND) { + ucp->uc_brand_data[2] = (caddr32_t)lwpd->br_syscall_num; + if (lwpd->br_syscall_restart) { + flags |= LX_UC_RESTART_SYSCALL; + } + } else { + ucp->uc_brand_data[2] = NULL; + } + + ucp->uc_brand_data[0] = flags; +} +#endif + +void +lx_init_brand_data(zone_t *zone) +{ + lx_zone_data_t *data; + ASSERT(zone->zone_brand == &lx_brand); + ASSERT(zone->zone_brand_data == NULL); + data = (lx_zone_data_t *)kmem_zalloc(sizeof (lx_zone_data_t), KM_SLEEP); + /* + * Set the default lxzd_kernel_version to 2.4. + * This can be changed by a call to setattr() during zone boot. + */ + (void) strlcpy(data->lxzd_kernel_version, "2.4.21", LX_VERS_MAX); + + /* + * Linux is not at all picky about address family when it comes to + * supporting interface-related ioctls. To mimic this behavior, we'll + * attempt those ioctls against a ksocket configured for that purpose. + */ + (void) ksocket_socket(&data->lxzd_ioctl_sock, AF_INET, SOCK_DGRAM, 0, + 0, zone->zone_kcred); + + zone->zone_brand_data = data; + + /* + * In Linux, if the init(1) process terminates the system panics. + * The zone must reboot to simulate this behaviour. 
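+ * (For comparison, Linux itself panics with "Attempted to kill init!" + * when init exits.)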
+ */ + zone->zone_reboot_on_init_exit = B_TRUE; +} + +void +lx_free_brand_data(zone_t *zone) +{ + lx_zone_data_t *data = ztolxzd(zone); + ASSERT(data != NULL); + if (data->lxzd_ioctl_sock != NULL) { + /* + * Since zone_kcred has been cleaned up already, close the + * socket using the global kcred. + */ + ksocket_close(data->lxzd_ioctl_sock, kcred); + data->lxzd_ioctl_sock = NULL; + } + zone->zone_brand_data = NULL; + kmem_free(data, sizeof (*data)); +} + +void +lx_unsupported(char *dmsg) +{ + lx_proc_data_t *pd = ttolxproc(curthread); + + DTRACE_PROBE1(brand__lx__unsupported, char *, dmsg); + + if (pd != NULL && (pd->l_flags & LX_PROC_STRICT_MODE) != 0) { + /* + * If this process was run with strict mode enabled + * (via LX_STRICT in the environment), we mark this + * LWP as having triggered an unsupported behaviour. + * This flag will be checked at an appropriate point + * by lx_check_strict_failure(). + */ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + lwpd->br_strict_failure = B_TRUE; + } +} + +void +lx_check_strict_failure(lx_lwp_data_t *lwpd) +{ + proc_t *p; + + if (!lwpd->br_strict_failure) { + return; + } + + lwpd->br_strict_failure = B_FALSE; + + /* + * If this process is operating in strict mode (via LX_STRICT in + * the environment), and has triggered a call to + * lx_unsupported(), we drop SIGSYS on it as we return. + */ + p = curproc; + mutex_enter(&p->p_lock); + sigtoproc(p, curthread, SIGSYS); + mutex_exit(&p->p_lock); +} + +void +lx_trace_sysenter(int syscall_num, uintptr_t *args) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_entry_ptr != NULL); + + (*lx_systrace_entry_ptr)(syscall_num, args[0], args[1], + args[2], args[3], args[4], args[5]); + } +} + +void +lx_trace_sysreturn(int syscall_num, long ret) +{ + if (lx_systrace_enabled) { + VERIFY(lx_systrace_return_ptr != NULL); + + (*lx_systrace_return_ptr)(syscall_num, ret, ret, 0, 0, 0, 0); + } +} + +/* + * Get the address of the user-space system call handler and attach it to + * the proc structure. Returning 0 indicates success; the value returned + * by the system call is the value stored in rval. Returning a non-zero + * value indicates a failure; the value returned is used to set errno, -1 + * is returned from the syscall and the contents of rval are ignored. To + * set errno and have the syscall return a value other than -1 we can + * manually set errno and rval and return 0. + */ +int +lx_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = ttoproc(t); + lx_proc_data_t *pd; + struct termios *termios; + uint_t termios_len; + int error; + int code; + int sig; + lx_brand_registration_t reg; + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + /* + * There is one operation that is supported for a non-branded + * process: B_EXEC_BRAND. This is the equivalent of an + * exec call, but the new process that is created will be + * a branded process. + */ + if (cmd == B_EXEC_BRAND) { + VERIFY(p->p_zone != NULL); + VERIFY(p->p_zone->zone_brand == &lx_brand); + return (exec_common( + (char *)arg1, (const char **)arg2, (const char **)arg3, + EBA_BRAND)); + } + + /* For all other operations this must be a branded process.
*/ + if (p->p_brand == NULL) + return (ENOSYS); + + VERIFY(p->p_brand == &lx_brand); + VERIFY(p->p_brand_data != NULL); + + switch (cmd) { + case B_REGISTER: + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("stack mode was not PREINIT during " + "REGISTER\n"); + return (EINVAL); + } + + if (p->p_model == DATAMODEL_NATIVE) { + if (copyin((void *)arg1, &reg, sizeof (reg)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + } +#ifdef _LP64 + else { + /* 32-bit userland on 64-bit kernel */ + lx_brand_registration32_t reg32; + + if (copyin((void *)arg1, &reg32, sizeof (reg32)) != 0) { + lx_print("Failed to copyin brand registration " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + + reg.lxbr_version = (uint_t)reg32.lxbr_version; + reg.lxbr_handler = + (void *)(uintptr_t)reg32.lxbr_handler; + reg.lxbr_flags = reg32.lxbr_flags; + } +#endif + + if (reg.lxbr_version != LX_VERSION_1) { + lx_print("Invalid brand library version (%u)\n", + reg.lxbr_version); + return (EINVAL); + } + + if ((reg.lxbr_flags & ~LX_PROC_ALL) != 0) { + lx_print("Invalid brand flags (%u)\n", + reg.lxbr_flags); + return (EINVAL); + } + + lx_print("Assigning brand 0x%p and handler 0x%p to proc 0x%p\n", + (void *)&lx_brand, (void *)reg.lxbr_handler, (void *)p); + pd = p->p_brand_data; + pd->l_handler = (uintptr_t)reg.lxbr_handler; + pd->l_flags = reg.lxbr_flags; + + return (0); + + case B_TTYMODES: + /* This is necessary for emulating TCGETS ioctls. */ + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&termios, + &termios_len) != DDI_SUCCESS) + return (EIO); + + ASSERT(termios_len == sizeof (*termios)); + + if (copyout(termios, (void *)arg1, sizeof (*termios)) != 0) { + ddi_prop_free(termios); + return (EFAULT); + } + + ddi_prop_free(termios); + return (0); + + case B_ELFDATA: + pd = curproc->p_brand_data; + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyout(&pd->l_elf_data, (void *)arg1, + sizeof (lx_elf_data_t)) != 0) { + return (EFAULT); + } + } +#if defined(_LP64) + else { + /* 32-bit userland on 64-bit kernel */ + lx_elf_data32_t led32; + + led32.ed_phdr = (int)pd->l_elf_data.ed_phdr; + led32.ed_phent = (int)pd->l_elf_data.ed_phent; + led32.ed_phnum = (int)pd->l_elf_data.ed_phnum; + led32.ed_entry = (int)pd->l_elf_data.ed_entry; + led32.ed_base = (int)pd->l_elf_data.ed_base; + led32.ed_ldentry = (int)pd->l_elf_data.ed_ldentry; + + if (copyout(&led32, (void *)arg1, + sizeof (led32)) != 0) { + return (EFAULT); + } + } +#endif + return (0); + + case B_EXEC_NATIVE: + return (exec_common((char *)arg1, (const char **)arg2, + (const char **)arg3, EBA_NATIVE)); + + /* + * The B_TRUSS_POINT subcommand is used so that we can make a no-op + * syscall for debugging purposes (dtracing) from within the user-level + * emulation. + */ + case B_TRUSS_POINT: + return (0); + + case B_LPID_TO_SPAIR: { + /* + * Given a Linux pid as arg1, return the Solaris pid in arg2 and + * the Solaris LWP in arg3. We also translate pid 1 (which is + * hardcoded in many applications) to the zone's init process.
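+ * + * as an illustrative sketch, the emulation library reaches this + * subcommand through the brandsys system call, roughly: + * + * pid_t s_pid; + * id_t s_tid; + * + * (void) syscall(SYS_brand, B_LPID_TO_SPAIR, lx_pid, + * &s_pid, &s_tid); + * + * (lx_pid here is a hypothetical variable holding the Linux pid.)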
+ */ + pid_t s_pid; + id_t s_tid; + + if ((pid_t)arg1 == 1) { + s_pid = p->p_zone->zone_proc_initpid; + /* handle the dead/missing init(1M) case */ + if (s_pid == -1) + s_pid = 1; + s_tid = 1; + } else if (lx_lpid_to_spair((pid_t)arg1, &s_pid, &s_tid) < 0) { + return (ESRCH); + } + + if (copyout(&s_pid, (void *)arg2, sizeof (s_pid)) != 0 || + copyout(&s_tid, (void *)arg3, sizeof (s_tid)) != 0) { + return (EFAULT); + } + + return (0); + } + + case B_SET_AFFINITY_MASK: + case B_GET_AFFINITY_MASK: + /* + * Retrieve or store the CPU affinity mask for the + * requested linux pid. + * + * arg1 is a linux PID (0 means curthread). + * arg2 is the size of the given mask. + * arg3 is the address of the affinity mask. + */ + return (lx_sched_affinity(cmd, arg1, arg2, arg3, rval)); + + case B_PTRACE_STOP_FOR_OPT: + return (lx_ptrace_stop_for_option((int)arg1, arg2 == 0 ? + B_FALSE : B_TRUE, (ulong_t)arg3, arg4)); + + case B_PTRACE_CLONE_BEGIN: + return (lx_ptrace_set_clone_inherit((int)arg1, arg2 == 0 ? + B_FALSE : B_TRUE)); + + case B_PTRACE_KERNEL: + return (lx_ptrace_kernel((int)arg1, (pid_t)arg2, arg3, arg4)); + + case B_HELPER_WAITID: { + idtype_t idtype = (idtype_t)arg1; + id_t id = (id_t)arg2; + siginfo_t *infop = (siginfo_t *)arg3; + int options = (int)arg4; + + lwpd = ttolxlwp(curthread); + + /* + * Our brand-specific waitid helper only understands a subset of + * the possible idtypes. Ensure we keep to that subset here: + */ + if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { + return (EINVAL); + } + + /* + * Enable the return of emulated ptrace(2) stop conditions + * through lx_waitid_helper, and stash the Linux-specific + * extra waitid() flags. + */ + lwpd->br_waitid_emulate = B_TRUE; + lwpd->br_waitid_flags = (int)arg5; + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + return (waitsys32(idtype, id, infop, options)); + } else +#endif + { + return (waitsys(idtype, id, infop, options)); + } + + lwpd->br_waitid_emulate = B_FALSE; + lwpd->br_waitid_flags = 0; + + return (0); + } + + case B_UNSUPPORTED: { + char dmsg[256]; + + if (copyin((void *)arg1, &dmsg, sizeof (dmsg)) != 0) { + lx_print("Failed to copyin unsupported msg " + "at 0x%p\n", (void *)arg1); + return (EFAULT); + } + dmsg[255] = '\0'; + lx_unsupported(dmsg); + + lx_check_strict_failure(lwpd); + + return (0); + } + + case B_STORE_ARGS: { + /* + * B_STORE_ARGS subcommand + * arg1 = address of struct to be copied in + * arg2 = size of the struct being copied in + * arg3-arg6 ignored + * rval = the amount of data copied. + */ + void *buf; + + /* only have upper limit because arg2 is unsigned */ + if (arg2 > LX_BR_ARGS_SIZE_MAX) { + return (EINVAL); + } + + buf = kmem_alloc(arg2, KM_SLEEP); + if (copyin((void *)arg1, buf, arg2) != 0) { + lx_print("Failed to copyin scall arg at 0x%p\n", + (void *) arg1); + kmem_free(buf, arg2); + /* + * Purposely not setting br_scall_args to NULL + * to preserve data for debugging. 
+ */ + return (EFAULT); + } + + if (lwpd->br_scall_args != NULL) { + ASSERT(lwpd->br_args_size > 0); + kmem_free(lwpd->br_scall_args, + lwpd->br_args_size); + } + + lwpd->br_scall_args = buf; + lwpd->br_args_size = arg2; + *rval = arg2; + return (0); + } + + case B_HELPER_CLONE: + return (lx_helper_clone(rval, arg1, (void *)arg2, (void *)arg3, + (void *)arg4)); + + case B_HELPER_SETGROUPS: + return (lx_helper_setgroups(arg1, (gid_t *)arg2)); + + case B_HELPER_SIGQUEUE: + return (lx_helper_rt_sigqueueinfo(arg1, arg2, + (siginfo_t *)arg3)); + + case B_HELPER_TGSIGQUEUE: + return (lx_helper_rt_tgsigqueueinfo(arg1, arg2, arg3, + (siginfo_t *)arg4)); + + case B_SET_THUNK_PID: + lwpd->br_lx_thunk_pid = arg1; + return (0); + + case B_GETPID: + /* + * The usermode clone(2) code needs to be able to call + * lx_getpid() from native code: + */ + *rval = lx_getpid(); + return (0); + + case B_SET_NATIVE_STACK: + /* + * B_SET_NATIVE_STACK subcommand + * arg1 = the base of the stack to use for emulation + */ + if (lwpd->br_stack_mode != LX_STACK_MODE_PREINIT) { + lx_print("B_SET_NATIVE_STACK when stack was already " + "set to %p\n", (void *)arg1); + return (EEXIST); + } + + /* + * We move from the PREINIT state, where we have no brand + * emulation stack, to the INIT state. Here, we are still + * running on what will become the BRAND stack, but are running + * emulation (i.e. native) code. Once the initialisation + * process for this thread has finished, we will jump to + * brand-specific code, while moving to the BRAND mode. + * + * When a new LWP is created, lx_initlwp() will clear the + * stack data. If that LWP is actually being duplicated + * into a child process by fork(2), lx_forklwp() will copy + * it so that the cloned thread will keep using the same + * alternate stack. + */ + lwpd->br_ntv_stack = arg1; + lwpd->br_stack_mode = LX_STACK_MODE_INIT; + lx_lwp_set_native_stack_current(lwpd, arg1); + + return (0); + + case B_GET_CURRENT_CONTEXT: + /* + * B_GET_CURRENT_CONTEXT subcommand: + * arg1 = address for pointer to current ucontext_t + */ + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + caddr32_t addr = (caddr32_t)lwp->lwp_oldcontext; + + error = copyout(&addr, (void *)arg1, sizeof (addr)); + } else +#endif + { + error = copyout(&lwp->lwp_oldcontext, (void *)arg1, + sizeof (lwp->lwp_oldcontext)); + } + + return (error != 0 ? EFAULT : 0); + + case B_JUMP_TO_LINUX: + /* + * B_JUMP_TO_LINUX subcommand: + * arg1 = ucontext_t pointer for jump state + */ + + if (arg1 == NULL) + return (EINVAL); + + switch (lwpd->br_stack_mode) { + case LX_STACK_MODE_NATIVE: { + struct regs *rp = lwptoregs(lwp); + + /* + * We are on the NATIVE stack, so we must preserve + * the extent of that stack. The pointer will be + * reset by a future setcontext(). + */ + lx_lwp_set_native_stack_current(lwpd, + (uintptr_t)rp->r_sp); + break; + } + + case LX_STACK_MODE_INIT: + /* + * The LWP is transitioning to Linux code for the first + * time. + */ + break; + + case LX_STACK_MODE_PREINIT: + /* + * This LWP has not installed an alternate stack for + * usermode emulation handling. + */ + return (ENOENT); + + case LX_STACK_MODE_BRAND: + /* + * The LWP should not be on the BRAND stack. 
+ */ + exit(CLD_KILLED, SIGSYS); + return (0); + } + + /* + * Transfer control to Linux: + */ + return (lx_runexe(lwp, (void *)arg1)); + + case B_EMULATION_DONE: + /* + * B_EMULATION_DONE subcommand: + * arg1 = ucontext_t * to restore + * arg2 = system call number + * arg3 = return code + * arg4 = if operation failed, the errno value + */ + + /* + * The first part of this operation is a setcontext() to + * restore the register state to the copy we preserved + * before vectoring to the usermode emulation routine. + * If that fails, we return (hopefully) to the emulation + * routine and it will handle the error. + */ +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + error = getsetcontext32(SETCONTEXT, (void *)arg1); + } else +#endif + { + error = getsetcontext(SETCONTEXT, (void *)arg1); + } + + if (error != 0) { + return (error); + } + + /* + * The saved Linux context has been restored. We handle the + * return value or errno with code common to the in-kernel + * system call emulation. + */ + if ((error = (int)arg4) != 0) { + /* + * lx_syscall_return() looks at the errno in the LWP, + * so set it here: + */ + set_errno(error); + } + lx_syscall_return(ttolwp(curthread), (int)arg2, (long)arg3); + + return (0); + + case B_EXIT_AS_SIG: + code = CLD_KILLED; + sig = (int)arg1; + proc_is_exiting(p); + if (exitlwps(1) != 0) { + mutex_enter(&p->p_lock); + lwp_exit(); + } + ttolwp(curthread)->lwp_cursig = sig; + if (sig == SIGSEGV) { + if (core(sig, 0) == 0) + code = CLD_DUMPED; + } + exit(code, sig); + /* NOTREACHED */ + break; + } + + return (EINVAL); +} + +char * +lx_get_zone_kern_version(zone_t *zone) +{ + return (((lx_zone_data_t *)zone->zone_brand_data)->lxzd_kernel_version); +} + +void +lx_set_kern_version(zone_t *zone, char *vers) +{ + lx_zone_data_t *lxzd = (lx_zone_data_t *)zone->zone_brand_data; + + (void) strlcpy(lxzd->lxzd_kernel_version, vers, LX_VERS_MAX); +} + +/* + * Linux unconditionally removes the setuid and setgid bits when changing + * file ownership. This brand hook overrides the illumos native behaviour, + * which is based on the PRIV_FILE_SETID privilege. + */ +static int +lx_setid_clear(vattr_t *vap, cred_t *cr) +{ + if (S_ISDIR(vap->va_mode)) { + return (0); + } + + if (vap->va_mode & S_ISUID) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~S_ISUID; + } + if ((vap->va_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~S_ISGID; + } + + return (0); +} + +/* + * Copy the per-process brand data from a parent proc to a child.
+ */ +void +lx_copy_procdata(proc_t *child, proc_t *parent) +{ + lx_proc_data_t *cpd = child->p_brand_data; + lx_proc_data_t *ppd = parent->p_brand_data; + + VERIFY(parent->p_brand == &lx_brand); + VERIFY(child->p_brand == &lx_brand); + VERIFY(ppd != NULL); + VERIFY(cpd != NULL); + + *cpd = *ppd; + + cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = LX_RLIM64_INFINITY; + cpd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = LX_RLIM64_INFINITY; + + cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = 20; + cpd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = 20; + + cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = LX_RLIM64_INFINITY; + cpd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = LX_RLIM64_INFINITY; + + cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = LX_RLIM64_INFINITY; + cpd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = LX_RLIM64_INFINITY; +} + +#if defined(_LP64) +static void +Ehdr32to64(Elf32_Ehdr *src, Ehdr *dst) +{ + bcopy(src->e_ident, dst->e_ident, sizeof (src->e_ident)); + dst->e_type = src->e_type; + dst->e_machine = src->e_machine; + dst->e_version = src->e_version; + dst->e_entry = src->e_entry; + dst->e_phoff = src->e_phoff; + dst->e_shoff = src->e_shoff; + dst->e_flags = src->e_flags; + dst->e_ehsize = src->e_ehsize; + dst->e_phentsize = src->e_phentsize; + dst->e_phnum = src->e_phnum; + dst->e_shentsize = src->e_shentsize; + dst->e_shnum = src->e_shnum; + dst->e_shstrndx = src->e_shstrndx; +} +#endif /* _LP64 */ + +static void +restoreexecenv(struct execenv *ep, stack_t *sp) +{ + klwp_t *lwp = ttolwp(curthread); + + setexecenv(ep); + lwp->lwp_sigaltstack.ss_sp = sp->ss_sp; + lwp->lwp_sigaltstack.ss_size = sp->ss_size; + lwp->lwp_sigaltstack.ss_flags = sp->ss_flags; +} + +extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, + long *, int, caddr_t, cred_t *, int *); + +extern int elf32exec(struct vnode *, execa_t *, uarg_t *, intpdata_t *, int, + long *, int, caddr_t, cred_t *, int *); + +/* + * Exec routine called by elfexec() to load either 32-bit or 64-bit Linux + * binaries. + */ +static int +lx_elfexec(struct vnode *vp, struct execa *uap, struct uarg *args, + struct intpdata *idata, int level, long *execsz, int setid, + caddr_t exec_file, struct cred *cred, int *brand_action) +{ + int error; + vnode_t *nvp; + Ehdr ehdr; + Addr uphdr_vaddr; + intptr_t voffset; + char *interp = NULL; + uintptr_t ldaddr = NULL; + int i; + proc_t *p = ttoproc(curthread); + klwp_t *lwp = ttolwp(curthread); + struct execenv env; + struct execenv origenv; + stack_t orig_sigaltstack; + struct user *up = PTOU(ttoproc(curthread)); + lx_elf_data_t *edp; + char *lib_path = NULL; + + ASSERT(ttoproc(curthread)->p_brand == &lx_brand); + ASSERT(ttoproc(curthread)->p_brand_data != NULL); + + edp = &ttolxproc(curthread)->l_elf_data; + + if (args->to_model == DATAMODEL_NATIVE) { + lib_path = LX_LIB_PATH; + } +#if defined(_LP64) + else { + lib_path = LX_LIB_PATH32; + } +#endif + + /* + * Set the brandname and library name for the new process so that + * elfexec() puts them onto the stack. + */ + args->brandname = LX_BRANDNAME; + args->emulator = lib_path; + +#if defined(_LP64) + /* + * To conform with the way Linux lays out the address space, we clamp + * the stack to be the top of the lower region of the x86-64 canonical + * form address space -- which has the side-effect of laying out the + * entire address space in that lower region. 
Note that this only + matters for 64-bit processes (this value will always be greater than + the size of a 32-bit address space) and doesn't actually affect + USERLIMIT: if a Linux-branded process wishes to map something + into the top half of the address space, it can do so -- but with + the user stack starting at the top of the bottom region, those high + virtual addresses won't be used unless explicitly directed. + */ + args->maxstack = lx_maxstack64; +#endif + + /* + * We will first exec the brand library, then map in the Linux + * executable and the Linux linker. + */ + if ((error = lookupname(lib_path, UIO_SYSSPACE, FOLLOW, NULLVPP, + &nvp))) { + uprintf("%s: not found.", lib_path); + return (error); + } + + /* + * We will eventually set the p_exec member to be the vnode for the new + * executable when we call setexecenv(). However, if we get an error + * before that call we need to restore the execenv to its original + * values so that when we return to the caller fop_close() works + * properly while cleaning up from the failed exec(). Restoring the + * original value will also properly decrement the 2nd VN_RELE that we + * took on the brand library. + */ + origenv.ex_bssbase = p->p_bssbase; + origenv.ex_brkbase = p->p_brkbase; + origenv.ex_brksize = p->p_brksize; + origenv.ex_vp = p->p_exec; + orig_sigaltstack.ss_sp = lwp->lwp_sigaltstack.ss_sp; + orig_sigaltstack.ss_size = lwp->lwp_sigaltstack.ss_size; + orig_sigaltstack.ss_flags = lwp->lwp_sigaltstack.ss_flags; + + if (args->to_model == DATAMODEL_NATIVE) { + error = elfexec(nvp, uap, args, idata, level + 1, execsz, + setid, exec_file, cred, brand_action); + } +#if defined(_LP64) + else { + error = elf32exec(nvp, uap, args, idata, level + 1, execsz, + setid, exec_file, cred, brand_action); + } +#endif + VN_RELE(nvp); + if (error != 0) { + restoreexecenv(&origenv, &orig_sigaltstack); + return (error); + } + + /* + * We exec'd the brand library above. + * The u_auxv vectors are now set up by elfexec to point to the + * brand emulation library and its linker. + */ + + bzero(&env, sizeof (env)); + + /* + * map in the Linux executable + */ + if (args->to_model == DATAMODEL_NATIVE) { + error = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, + &voffset, exec_file, &interp, &env.ex_bssbase, + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); + } +#if defined(_LP64) + else { + Elf32_Ehdr ehdr32; + Elf32_Addr uphdr_vaddr32; + + error = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, + &voffset, exec_file, &interp, &env.ex_bssbase, + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); + + Ehdr32to64(&ehdr32, &ehdr); + + if (uphdr_vaddr32 == (Elf32_Addr)-1) + uphdr_vaddr = (Addr)-1; + else + uphdr_vaddr = uphdr_vaddr32; + } +#endif + if (error != 0) { + restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + + return (error); + } + + /* + * Save off the important properties of the lx executable. The brand + * library will ask us for this data later, when it is ready to set + * things up for the lx executable. + */ + edp->ed_phdr = (uphdr_vaddr == -1) ? voffset + ehdr.e_phoff : + voffset + uphdr_vaddr; + edp->ed_entry = voffset + ehdr.e_entry; + edp->ed_phent = ehdr.e_phentsize; + edp->ed_phnum = ehdr.e_phnum; + + if (interp != NULL) { + if (ehdr.e_type == ET_DYN) { + /* + * This is a shared object executable, so we need to + * pick a reasonable place to put the heap. Just don't + * use the first page.
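+ * (With the usual 4K PAGESIZE this places the initial heap base at + * address 0x1000, leaving the zero page unmapped so that NULL + * dereferences still fault.)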
+			 */
+			env.ex_brkbase = (caddr_t)PAGESIZE;
+			env.ex_bssbase = (caddr_t)PAGESIZE;
+		}
+
+		/*
+		 * If the program needs an interpreter (most do), map it in and
+		 * store relevant information about it in the aux vector, where
+		 * the brand library can find it.
+		 */
+		if ((error = lookupname(interp, UIO_SYSSPACE, FOLLOW,
+		    NULLVPP, &nvp))) {
+			uprintf("%s: not found.", interp);
+			restoreexecenv(&origenv, &orig_sigaltstack);
+			kmem_free(interp, MAXPATHLEN);
+			return (error);
+		}
+
+		kmem_free(interp, MAXPATHLEN);
+		interp = NULL;
+
+		/*
+		 * map in the Linux linker
+		 */
+		if (args->to_model == DATAMODEL_NATIVE) {
+			error = mapexec_brand(nvp, args, &ehdr,
+			    &uphdr_vaddr, &voffset, exec_file, NULL, NULL,
+			    NULL, NULL, NULL, &ldaddr);
+		}
+#if defined(_LP64)
+		else {
+			Elf32_Ehdr ehdr32;
+			Elf32_Addr uphdr_vaddr32;
+
+			error = mapexec32_brand(nvp, args, &ehdr32,
+			    &uphdr_vaddr32, &voffset, exec_file, NULL, NULL,
+			    NULL, NULL, NULL, &ldaddr);
+
+			Ehdr32to64(&ehdr32, &ehdr);
+
+			if (uphdr_vaddr32 == (Elf32_Addr)-1)
+				uphdr_vaddr = (Addr)-1;
+			else
+				uphdr_vaddr = uphdr_vaddr32;
+		}
+#endif
+
+		VN_RELE(nvp);
+		if (error != 0) {
+			restoreexecenv(&origenv, &orig_sigaltstack);
+			return (error);
+		}
+
+		/*
+		 * Now that we know the base address of the brand's linker,
+		 * we also save this for later use by the brand library.
+		 */
+		edp->ed_base = voffset;
+		edp->ed_ldentry = voffset + ehdr.e_entry;
+	} else {
+		/*
+		 * This program has no interpreter. The lx brand library will
+		 * jump to the address in the AT_SUN_BRAND_LDENTRY aux vector,
+		 * so in this case, put the entry point of the main executable
+		 * there.
+		 */
+		if (ehdr.e_type == ET_EXEC) {
+			/*
+			 * An executable with no interpreter: this must be a
+			 * statically linked executable, which means we loaded
+			 * it at the address specified in the elf header, in
+			 * which case the e_entry field of the elf header is an
+			 * absolute address.
+			 */
+			edp->ed_ldentry = ehdr.e_entry;
+			edp->ed_entry = ehdr.e_entry;
+		} else {
+			/*
+			 * A shared object with no interpreter; we use the
+			 * calculated address from above.
+			 */
+			edp->ed_ldentry = edp->ed_entry;
+
+			/*
+			 * In all situations except an ET_DYN elf object with no
+			 * interpreter, we want to leave the brk and base
+			 * values set by mapexec_brand alone. Normally when
+			 * running ET_DYN objects on Solaris (most likely
+			 * /lib/ld.so.1) the kernel sets brk and base to 0 since
+			 * it doesn't know where to put the heap, and later the
+			 * linker will call brk() to initialize the heap in:
+			 * usr/src/cmd/sgs/rtld/common/setup.c:setup()
+			 * after it has determined where to put it. (This
+			 * decision is made after the linker loads and inspects
+			 * elf properties of the target executable being run.)
+			 *
+			 * So for ET_DYN Linux executables, we also don't know
+			 * where the heap should go, so we'll set the brk and
+			 * base to 0. But in this case the Solaris linker will
+			 * not initialize the heap, so when the Linux linker
+			 * starts running there is no heap allocated. This
+			 * seems to be ok on Linux 2.4 based systems because the
+			 * Linux linker/libc fall back to using mmap() to
+			 * allocate memory. But on 2.6 systems, running
+			 * applications by specifying them as command line
+			 * arguments to the linker results in segfaults for an
+			 * as yet undetermined reason (which seems to indicate
+			 * that a more permanent fix for heap initialization in
+			 * these cases may be necessary).
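+			 *
+			 * To summarize, the heap placement cases handled by
+			 * this function come out as follows (a recap of the
+			 * logic above and below, not new behaviour):
+			 *
+			 *	interp?	e_type	brk/bss base
+			 *	yes	ET_EXEC	left as set by mapexec_brand()
+			 *	yes	ET_DYN	forced to PAGESIZE
+			 *	no	ET_EXEC	left as set by mapexec_brand()
+			 *	no	ET_DYN	forced to 0 (see above)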
+ */ + if (ehdr.e_type == ET_DYN) { + env.ex_bssbase = (caddr_t)0; + env.ex_brkbase = (caddr_t)0; + env.ex_brksize = 0; + } + } + + } + + env.ex_vp = vp; + setexecenv(&env); + + /* + * We try to keep /proc's view of the aux vector consistent with + * what's on the process stack. + */ + if (args->to_model == DATAMODEL_NATIVE) { + auxv_t phdr_auxv[4] = { + { AT_SUN_BRAND_LX_PHDR, 0 }, + { AT_SUN_BRAND_LX_INTERP, 0 }, + { AT_SUN_BRAND_LX_SYSINFO_EHDR, 0 }, + { AT_SUN_BRAND_AUX4, 0 } + }; + phdr_auxv[0].a_un.a_val = edp->ed_phdr; + phdr_auxv[1].a_un.a_val = ldaddr; + phdr_auxv[2].a_un.a_val = 1; /* set in lx_init */ + phdr_auxv[3].a_type = AT_CLKTCK; + phdr_auxv[3].a_un.a_val = hz; + + if (copyout(&phdr_auxv, args->auxp_brand, + sizeof (phdr_auxv)) == -1) + return (EFAULT); + } +#if defined(_LP64) + else { + auxv32_t phdr_auxv32[3] = { + { AT_SUN_BRAND_LX_PHDR, 0 }, + { AT_SUN_BRAND_LX_INTERP, 0 }, + { AT_SUN_BRAND_AUX3, 0 } + }; + phdr_auxv32[0].a_un.a_val = edp->ed_phdr; + phdr_auxv32[1].a_un.a_val = ldaddr; + phdr_auxv32[2].a_type = AT_CLKTCK; + phdr_auxv32[2].a_un.a_val = hz; + + if (copyout(&phdr_auxv32, args->auxp_brand, + sizeof (phdr_auxv32)) == -1) + return (EFAULT); + } +#endif + + /* + * /proc uses the AT_ENTRY aux vector entry to deduce + * the location of the executable in the address space. The user + * structure contains a copy of the aux vector that needs to have those + * entries patched with the values of the real lx executable (they + * currently contain the values from the lx brand library that was + * elfexec'd, above). + * + * For live processes, AT_BASE is used to locate the linker segment, + * which /proc and friends will later use to find Solaris symbols + * (such as rtld_db_preinit). However, for core files, /proc uses + * AT_ENTRY to find the right segment to label as the executable. + * So we set AT_ENTRY to be the entry point of the linux executable, + * but leave AT_BASE to be the address of the Solaris linker. + */ + for (i = 0; i < __KERN_NAUXV_IMPL; i++) { + switch (up->u_auxv[i].a_type) { + case AT_ENTRY: + up->u_auxv[i].a_un.a_val = edp->ed_entry; + break; + + case AT_SUN_BRAND_LX_PHDR: + up->u_auxv[i].a_un.a_val = edp->ed_phdr; + break; + + case AT_SUN_BRAND_LX_INTERP: + up->u_auxv[i].a_un.a_val = ldaddr; + break; + + default: + break; + } + } + + return (0); +} + +boolean_t +lx_native_exec(uint8_t osabi, const char **interp) +{ + if (osabi != ELFOSABI_SOLARIS) + return (B_FALSE); + + /* + * If the process root matches the zone root, prepend /native to the + * interpreter path for native executables. Absolute precision from + * VN_CMP is not necessary since any change of process root is likely + * to make native binaries inaccessible via /native. + * + * Processes which chroot directly into /native will be able to + * function as expected with no need for the prefix. + */ + if (VN_CMP(curproc->p_user.u_rdir, curproc->p_zone->zone_rootvp)) { + *interp = "/native"; + } + + return (B_TRUE); +} + +static void +lx_syscall_init(void) +{ + int i; + + /* + * Count up the 32-bit Linux system calls. Note that lx_sysent32 + * has (LX_NSYSCALLS + 1) entries. + */ + for (i = 0; i <= LX_NSYSCALLS && lx_sysent32[i].sy_name != NULL; i++) + continue; + lx_nsysent32 = i; + +#if defined(_LP64) + /* + * Count up the 64-bit Linux system calls. Note that lx_sysent64 + * has (LX_NSYSCALLS + 1) entries. 
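+	 *
+	 * The count stops at the first entry whose sy_name is NULL. A
+	 * hypothetical table layout (type and entry names illustrative
+	 * only, not taken from this change):
+	 *
+	 *	lx_sysent_t lx_sysent64[] = {
+	 *		{ "read",	lx_read,	... },
+	 *		{ "write",	lx_write,	... },
+	 *		...
+	 *		{ NULL,		NULL,		... }	<- terminator
+	 *	};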
+ */ + for (i = 0; i <= LX_NSYSCALLS && lx_sysent64[i].sy_name != NULL; i++) + continue; + lx_nsysent64 = i; +#endif +} + +int +_init(void) +{ + int err = 0; + + lx_syscall_init(); + + /* pid/tid conversion hash tables */ + lx_pid_init(); + + /* for lx_ioctl() */ + lx_ioctl_init(); + + /* for lx_futex() */ + lx_futex_init(); + + lx_ptrace_init(); + + err = mod_install(&modlinkage); + if (err != 0) { + cmn_err(CE_WARN, "Couldn't install lx brand module"); + + /* + * This looks drastic, but it should never happen. These + * two data structures should be completely free-able until + * they are used by Linux processes. Since the brand + * wasn't loaded there should be no Linux processes, and + * thus no way for these data structures to be modified. + */ + lx_pid_fini(); + lx_ioctl_fini(); + if (lx_futex_fini()) + panic("lx brand module cannot be loaded or unloaded."); + } + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + int futex_done = 0; + + /* + * If there are any zones using this brand, we can't allow it to be + * unloaded. + */ + if (brand_zone_count(&lx_brand)) + return (EBUSY); + + lx_ptrace_fini(); + lx_pid_fini(); + lx_ioctl_fini(); + + if ((err = lx_futex_fini()) != 0) + goto done; + futex_done = 1; + + err = mod_remove(&modlinkage); + +done: + if (err) { + /* + * If we can't unload the module, then we have to get it + * back into a sane state. + */ + lx_pid_init(); + + if (futex_done) + lx_futex_init(); + + } + + return (err); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_misc.c b/usr/src/uts/common/brand/lx/os/lx_misc.c new file mode 100644 index 0000000000..4b7310f1d5 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_misc.c @@ -0,0 +1,747 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/archsystm.h> +#include <sys/privregs.h> +#include <sys/exec.h> +#include <sys/lwp.h> +#include <sys/sem.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_misc.h> +#include <sys/lx_futex.h> +#include <sys/cmn_err.h> +#include <sys/siginfo.h> +#include <sys/contract/process_impl.h> +#include <sys/x86_archext.h> +#include <sys/sdt.h> +#include <lx_signum.h> +#include <lx_syscall.h> +#include <sys/proc.h> +#include <net/if.h> +#include <sys/sunddi.h> +#include <sys/dlpi.h> +#include <sys/sysmacros.h> + +/* Linux specific functions and definitions */ +static void lx_save(klwp_t *); +static void lx_restore(klwp_t *); + +/* + * Set the return code for the forked child, always zero + */ +/*ARGSUSED*/ +void +lx_setrval(klwp_t *lwp, int v1, int v2) +{ + lwptoregs(lwp)->r_r0 = 0; +} + +/* + * Reset process state on exec(2) + */ +void +lx_exec() +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p = ttoproc(curthread); + lx_proc_data_t *pd = ptolxproc(p); + struct regs *rp = lwptoregs(lwp); + + /* + * Any l_handler handlers set as a result of B_REGISTER are now + * invalid; clear them. + */ + pd->l_handler = NULL; + + /* + * If this was a multi-threaded Linux process and this lwp wasn't the + * main lwp, then we need to make its Illumos and Linux PIDs match. + */ + if (curthread->t_tid != 1) { + lx_pid_reassign(curthread); + } + + /* + * Inform ptrace(2) that we are processing an execve(2) call so that if + * we are traced we can post either the PTRACE_EVENT_EXEC event or the + * legacy SIGTRAP. + */ + (void) lx_ptrace_stop_for_option(LX_PTRACE_O_TRACEEXEC, B_FALSE, 0, 0); + + /* clear the fs/gsbase values until the app. can reinitialize them */ + lwpd->br_lx_fsbase = NULL; + lwpd->br_ntv_fsbase = NULL; + lwpd->br_lx_gsbase = NULL; + lwpd->br_ntv_gsbase = NULL; + + /* + * Clear the native stack flags. This will be reinitialised by + * lx_init() in the new process image. + */ + lwpd->br_stack_mode = LX_STACK_MODE_PREINIT; + lwpd->br_ntv_stack = 0; + lwpd->br_ntv_stack_current = 0; + + installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, lx_save, + NULL); + + /* + * clear out the tls array + */ + bzero(lwpd->br_tls, sizeof (lwpd->br_tls)); + + /* + * reset the tls entries in the gdt + */ + kpreempt_disable(); + lx_restore(lwp); + kpreempt_enable(); + + /* + * The exec syscall doesn't return (so we don't call lx_syscall_return) + * but for our ptrace emulation we need to do this so that a tracer + * does not get out of sync. We know that by the time this lx_exec + * function is called that the exec has succeeded. + */ + rp->r_r0 = 0; + lx_ptrace_stop(LX_PR_SYSEXIT); +} + +void +lx_exitlwp(klwp_t *lwp) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + kthread_t *t; + sigqueue_t *sqp = NULL; + pid_t ppid; + id_t ptid; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + if (lwpd == NULL) + return; /* second time thru' */ + + mutex_enter(&p->p_lock); + lx_ptrace_exit(p, lwp); + mutex_exit(&p->p_lock); + + if (lwpd->br_robust_list != NULL) { + lx_futex_robust_exit((uintptr_t)lwpd->br_robust_list, + lwpd->br_pid); + } + + if (lwpd->br_clear_ctidp != NULL) { + (void) suword32(lwpd->br_clear_ctidp, 0); + (void) lx_futex((uintptr_t)lwpd->br_clear_ctidp, FUTEX_WAKE, 1, + NULL, NULL, 0); + } + + if (lwpd->br_signal != 0) { + /* + * The first thread in a process doesn't cause a signal to + * be sent when it exits. 
It was created by a fork(), not + * a clone(), so the parent should get signalled when the + * process exits. + */ + if (lwpd->br_ptid == -1) + goto free; + + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + /* + * If br_ppid is 0, it means this is a CLONE_PARENT thread, + * so the signal goes to the parent process - not to a + * specific thread in this process. + */ + p = lwptoproc(lwp); + if (lwpd->br_ppid == 0) { + mutex_enter(&p->p_lock); + ppid = p->p_ppid; + t = NULL; + } else { + /* + * If we have been reparented to init or if our + * parent thread is gone, then nobody gets + * signaled. + */ + if ((lx_lwp_ppid(lwp, &ppid, &ptid) == 1) || + (ptid == -1)) + goto free; + + mutex_enter(&pidlock); + if ((p = prfind(ppid)) == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + goto free; + } + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if ((t = idtot(p, ptid)) == NULL) { + mutex_exit(&p->p_lock); + goto free; + } + } + + sqp->sq_info.si_signo = lwpd->br_signal; + sqp->sq_info.si_code = lwpd->br_exitwhy; + sqp->sq_info.si_status = lwpd->br_exitwhat; + sqp->sq_info.si_pid = lwpd->br_pid; + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(p, t, sqp); + mutex_exit(&p->p_lock); + sqp = NULL; + } + +free: + if (lwpd->br_scall_args != NULL) { + ASSERT(lwpd->br_args_size > 0); + kmem_free(lwpd->br_scall_args, lwpd->br_args_size); + } + if (sqp) + kmem_free(sqp, sizeof (sigqueue_t)); +} + +void +lx_freelwp(klwp_t *lwp) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(lwp); + VERIFY(lwpd != NULL); + + /* + * Remove our system call interposer. + */ + lwp->lwp_brand_syscall = NULL; + + (void) removectx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL, + lx_save, NULL); + if (lwpd->br_pid != 0) { + lx_pid_rele(lwptoproc(lwp)->p_pid, lwptot(lwp)->t_tid); + } + + /* + * Ensure that lx_ptrace_exit() has been called to detach + * ptrace(2) tracers and tracees. + */ + VERIFY(lwpd->br_ptrace_tracer == NULL); + VERIFY(lwpd->br_ptrace_accord == NULL); + + lwp->lwp_brand = NULL; + kmem_free(lwpd, sizeof (struct lx_lwp_data)); +} + +void * +lx_lwpdata_alloc(proc_t *p) +{ + lx_lwp_data_t *lwpd; + struct lx_pid *lpidp; + pid_t newpid = 0; + struct pid *pidp = NULL; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * LWPs beyond the first will require a pid to be allocated to emulate + * Linux's goofy thread model. While this allocation may be + * unnecessary when a single-lwp process undergoes branding, it cannot + * be performed during b_initlwp due to p_lock being held. + */ + if (p->p_lwpcnt > 0) { + if ((newpid = pid_allocate(p, 0, 0)) < 0) { + return (NULL); + } + pidp = pid_find(newpid); + } + + lwpd = kmem_zalloc(sizeof (struct lx_lwp_data), KM_SLEEP); + lpidp = kmem_zalloc(sizeof (struct lx_pid), KM_SLEEP); + + lpidp->l_pid = newpid; + lpidp->l_pidp = pidp; + lwpd->br_lpid = lpidp; + return (lwpd); +} + +/* + * Free lwp brand data if an error occurred during lwp_create. + * Otherwise, lx_freelwp will be used to free the resources after they're + * associated with the lwp via lx_initlwp. 
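+ *
+ * A sketch of the expected pairing, from a hypothetical caller:
+ *
+ *	void *bd = lx_lwpdata_alloc(p);		(may return NULL)
+ *	if (the subsequent lwp_create() fails)
+ *		lx_lwpdata_free(bd);		(nothing was installed)
+ *	else
+ *		lx_initlwp(lwp, bd);		(freed later by lx_freelwp)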
+ */
+void
+lx_lwpdata_free(void *lwpbd)
+{
+	lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
+	VERIFY(lwpd != NULL);
+	VERIFY(lwpd->br_lpid != NULL);
+
+	if (lwpd->br_lpid->l_pidp != NULL) {
+		(void) pid_rele(lwpd->br_lpid->l_pidp);
+	}
+	kmem_free(lwpd->br_lpid, sizeof (*lwpd->br_lpid));
+	kmem_free(lwpd, sizeof (*lwpd));
+}
+
+void
+lx_initlwp(klwp_t *lwp, void *lwpbd)
+{
+	lx_lwp_data_t *lwpd = (lx_lwp_data_t *)lwpbd;
+	lx_lwp_data_t *plwpd = ttolxlwp(curthread);
+	kthread_t *tp = lwptot(lwp);
+	proc_t *p = lwptoproc(lwp);
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+	VERIFY(lwp->lwp_brand == NULL);
+
+	lwpd->br_exitwhy = CLD_EXITED;
+	lwpd->br_lwp = lwp;
+	lwpd->br_clear_ctidp = NULL;
+	lwpd->br_set_ctidp = NULL;
+	lwpd->br_signal = 0;
+	lwpd->br_stack_mode = LX_STACK_MODE_PREINIT;
+	/*
+	 * lwpd->br_affinitymask was zeroed by kmem_zalloc(),
+	 * as were lwpd->br_scall_args and lwpd->br_args_size.
+	 */
+
+	/*
+	 * The first thread in a process has ppid set to the parent
+	 * process's pid, and ptid set to -1. Subsequent threads in the
+	 * process have their ppid set to the pid of the thread that
+	 * created them, and their ptid to that thread's tid.
+	 */
+	if (tp->t_next == tp) {
+		lwpd->br_ppid = tp->t_procp->p_ppid;
+		lwpd->br_ptid = -1;
+	} else if (plwpd != NULL) {
+		bcopy(plwpd->br_tls, lwpd->br_tls, sizeof (lwpd->br_tls));
+		lwpd->br_ppid = plwpd->br_pid;
+		lwpd->br_ptid = curthread->t_tid;
+		/* The child inherits the fs/gsbase values from the parent */
+		lwpd->br_lx_fsbase = plwpd->br_lx_fsbase;
+		lwpd->br_ntv_fsbase = plwpd->br_ntv_fsbase;
+		lwpd->br_lx_gsbase = plwpd->br_lx_gsbase;
+		lwpd->br_ntv_gsbase = plwpd->br_ntv_gsbase;
+	} else {
+		/*
+		 * Oddball case: the parent thread isn't a Linux process.
+		 */
+		lwpd->br_ppid = 0;
+		lwpd->br_ptid = -1;
+	}
+	lwp->lwp_brand = lwpd;
+
+	/*
+	 * During lx_lwpdata_alloc, we must decide whether or not to
+	 * allocate a new pid to associate with the lwp. Since p_lock is not
+	 * held at that point, the only time we can guarantee a new pid isn't
+	 * needed is when p_lwpcnt == 0. This is because other lwps won't be
+	 * present to race with us over pid allocation.
+	 *
+	 * This means that in all other cases (where p_lwpcnt > 0), we expect
+	 * that lx_lwpdata_alloc will allocate a pid for us to use here, even
+	 * if it is unneeded. If this process is undergoing an exec, for
+	 * example, the single existing lwp will not need a new pid when it is
+	 * rebranded. In that case, lx_pid_assign will free the unneeded pid.
+	 */
+	VERIFY(lwpd->br_lpid->l_pidp != NULL || p->p_lwpcnt == 0);
+
+	lx_pid_assign(tp, lwpd->br_lpid);
+	lwpd->br_tgid = lwpd->br_pid;
+	/*
+	 * Having performed the lx pid assignment, the lpid reference is no
+	 * longer needed. The underlying data will be freed during lx_freelwp.
+	 */
+	lwpd->br_lpid = NULL;
+
+	installctx(lwptot(lwp), lwp, lx_save, lx_restore, NULL, NULL,
+	    lx_save, NULL);
+
+	/*
+	 * Install branded system call hook for this LWP:
+	 */
+	lwp->lwp_brand_syscall = lx_syscall_enter;
+
+	/*
+	 * If the parent LWP has a ptrace(2) tracer, the new LWP may
+	 * need to inherit that same tracer.
+	 */
+	if (plwpd != NULL) {
+		lx_ptrace_inherit_tracer(plwpd, lwpd);
+	}
+}
+
+/*
+ * There is no need for any locking on either the source or
+ * destination lx_lwp_data structure. This is always run in the
+ * thread context of the source thread, and the destination thread is
+ * always newly created and not referred to from anywhere else.
+ */ +void +lx_forklwp(klwp_t *srclwp, klwp_t *dstlwp) +{ + struct lx_lwp_data *src = srclwp->lwp_brand; + struct lx_lwp_data *dst = dstlwp->lwp_brand; + + dst->br_ppid = src->br_pid; + dst->br_ptid = lwptot(srclwp)->t_tid; + bcopy(src->br_tls, dst->br_tls, sizeof (dst->br_tls)); + + switch (src->br_stack_mode) { + case LX_STACK_MODE_BRAND: + case LX_STACK_MODE_NATIVE: + /* + * The parent LWP has an alternate stack installed. + * The child LWP should have the same stack base and extent. + */ + dst->br_stack_mode = src->br_stack_mode; + dst->br_ntv_stack = src->br_ntv_stack; + dst->br_ntv_stack_current = src->br_ntv_stack_current; + break; + + default: + /* + * Otherwise, clear the stack data for this LWP. + */ + dst->br_stack_mode = LX_STACK_MODE_PREINIT; + dst->br_ntv_stack = 0; + dst->br_ntv_stack_current = 0; + } + + /* + * copy only these flags + */ + dst->br_lwp_flags = src->br_lwp_flags & BR_CPU_BOUND; + dst->br_scall_args = NULL; +} + +/* + * When switching a Linux process off the CPU, clear its GDT entries. + */ +/* ARGSUSED */ +static void +lx_save(klwp_t *t) +{ + int i; + +#if defined(__amd64) + reset_sregs(); +#endif + for (i = 0; i < LX_TLSNUM; i++) + gdt_update_usegd(GDT_TLSMIN + i, &null_udesc); +} + +/* + * When switching a Linux process on the CPU, set its GDT entries. + * + * For 64-bit code we don't have to worry about explicitly setting the + * %fsbase via wrmsr(MSR_AMD_FSBASE) here. Instead, that should happen + * automatically in update_sregs if we are executing in user-land. If this + * is the case then pcb_rupdate should be set. + */ +static void +lx_restore(klwp_t *t) +{ + struct lx_lwp_data *lwpd = lwptolxlwp(t); + user_desc_t *tls; + int i; + + ASSERT(lwpd); + + tls = lwpd->br_tls; + for (i = 0; i < LX_TLSNUM; i++) + gdt_update_usegd(GDT_TLSMIN + i, &tls[i]); +} + +void +lx_set_gdt(int entry, user_desc_t *descrp) +{ + + gdt_update_usegd(entry, descrp); +} + +void +lx_clear_gdt(int entry) +{ + gdt_update_usegd(entry, &null_udesc); +} + +longlong_t +lx_nosys() +{ + return (set_errno(ENOSYS)); +} + +/* + * Brand-specific routine to check if given non-Solaris standard segment + * register values should be modified to other values. + */ +/*ARGSUSED*/ +greg_t +lx_fixsegreg(greg_t sr, model_t datamodel) +{ + uint16_t idx = SELTOIDX(sr); + + ASSERT(sr == (sr & 0xffff)); + + /* + * If the segment selector is a valid TLS selector, just return it. + */ + if (!SELISLDT(sr) && idx >= GDT_TLSMIN && idx <= GDT_TLSMAX) + return (sr | SEL_UPL); + + /* + * Force the SR into the LDT in ring 3 for 32-bit processes. + * + * 64-bit processes get the null GDT selector since they are not + * allowed to have a private LDT. + */ +#if defined(__amd64) + return (datamodel == DATAMODEL_ILP32 ? (sr | SEL_TI_LDT | SEL_UPL) : 0); +#elif defined(__i386) + datamodel = datamodel; /* datamodel currently unused for 32-bit */ + return (sr | SEL_TI_LDT | SEL_UPL); +#endif /* __amd64 */ +} + +/* + * Brand-specific function to convert the fsbase as pulled from the register + * into a native fsbase suitable for locating the ulwp_t from the kernel. + */ +uintptr_t +lx_fsbase(klwp_t *lwp, uintptr_t fsbase) +{ + lx_lwp_data_t *lwpd = lwp->lwp_brand; + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND || + lwpd->br_ntv_fsbase == NULL) { + return (fsbase); + } + + return (lwpd->br_ntv_fsbase); +} + +/* + * These two functions simulate winfo and post_sigcld for the lx brand. The + * difference is delivering a designated signal as opposed to always SIGCLD. 
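+ *
+ * For example, a Linux thread created with an exit signal other than
+ * SIGCHLD -- e.g. clone(fn, stack, flags | SIGUSR1, arg) on the Linux
+ * side, shown here for illustration only -- has that choice recorded
+ * in l_signal, and it is that signal which gets delivered below.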
+ */
+static void
+lx_winfo(proc_t *pp, k_siginfo_t *ip, struct lx_proc_data *dat)
+{
+	ASSERT(MUTEX_HELD(&pidlock));
+	bzero(ip, sizeof (k_siginfo_t));
+	ip->si_signo = ltos_signo[dat->l_signal];
+	ip->si_code = pp->p_wcode;
+	ip->si_pid = pp->p_pid;
+	ip->si_ctid = PRCTID(pp);
+	ip->si_zoneid = pp->p_zone->zone_id;
+	ip->si_status = pp->p_wdata;
+	ip->si_stime = pp->p_stime;
+	ip->si_utime = pp->p_utime;
+}
+
+static void
+lx_post_exit_sig(proc_t *cp, sigqueue_t *sqp, struct lx_proc_data *dat)
+{
+	proc_t *pp = cp->p_parent;
+
+	ASSERT(MUTEX_HELD(&pidlock));
+	mutex_enter(&pp->p_lock);
+	/*
+	 * Since Linux doesn't queue SIGCHLD, or any other non-RT
+	 * signals, we just blindly deliver whatever signal we can.
+	 */
+	ASSERT(sqp != NULL);
+	lx_winfo(cp, &sqp->sq_info, dat);
+	sigaddqa(pp, NULL, sqp);
+	sqp = NULL;
+	mutex_exit(&pp->p_lock);
+}
+
+
+/*
+ * Brand-specific code for exiting and sending a signal to the parent, as
+ * opposed to sigcld().
+ */
+void
+lx_exit_with_sig(proc_t *cp, sigqueue_t *sqp)
+{
+	proc_t *pp = cp->p_parent;
+	lx_proc_data_t *lx_brand_data = ptolxproc(cp);
+	ASSERT(MUTEX_HELD(&pidlock));
+
+	switch (cp->p_wcode) {
+	case CLD_EXITED:
+	case CLD_DUMPED:
+	case CLD_KILLED:
+		ASSERT(cp->p_stat == SZOMB);
+		/*
+		 * The broadcast on p_srwchan_cv is a kludge to
+		 * wake up a possible thread in uadmin(A_SHUTDOWN).
+		 */
+		cv_broadcast(&cp->p_srwchan_cv);
+
+		/*
+		 * Add to newstate list of the parent
+		 */
+		add_ns(pp, cp);
+
+		cv_broadcast(&pp->p_cv);
+		if ((pp->p_flag & SNOWAIT) ||
+		    PTOU(pp)->u_signal[SIGCLD - 1] == SIG_IGN) {
+			if (!(cp->p_pidflag & CLDWAITPID))
+				freeproc(cp);
+		} else if (!(cp->p_pidflag & CLDNOSIGCHLD) &&
+		    lx_brand_data->l_signal != 0) {
+			lx_post_exit_sig(cp, sqp, lx_brand_data);
+			sqp = NULL;
+		}
+		break;
+
+	case CLD_STOPPED:
+	case CLD_CONTINUED:
+	case CLD_TRAPPED:
+		panic("Should not be called in this case");
+	}
+
+	if (sqp)
+		siginfofree(sqp);
+}
+
+/*
+ * Filters based on arguments that have been passed in by a separate syscall
+ * using the B_STORE_ARGS mechanism. If the __WALL flag is set, no filter is
+ * applied; otherwise we look at the difference between a clone and non-clone
+ * process.
+ * The definition of a clone process in Linux is a thread that does not deliver
+ * SIGCHLD to its parent. The option __WCLONE indicates to wait only on clone
+ * processes. Without that option, a process should only wait on normal
+ * children. The following table shows the cases.
+ *
+ *			default		__WCLONE
+ *	no SIGCHLD	   -		   X
+ *	SIGCHLD		   X		   -
+ *
+ * This is an XOR of __WCLONE being set, and SIGCHLD being the signal sent on
+ * process exit.
+ *
+ * More information on wait in lx brands can be found at
+ * usr/src/lib/brand/lx/lx_brand/common/wait.c.
+ */
+boolean_t
+lx_wait_filter(proc_t *pp, proc_t *cp)
+{
+	lx_lwp_data_t *lwpd = ttolxlwp(curthread);
+	int flags = lwpd->br_waitid_flags;
+	boolean_t ret;
+
+	if (!lwpd->br_waitid_emulate) {
+		return (B_TRUE);
+	}
+
+	mutex_enter(&cp->p_lock);
+	if (flags & LX_WALL) {
+		ret = B_TRUE;
+	} else {
+		lx_proc_data_t *pd = ptolxproc(cp);
+		boolean_t is_sigchld = B_TRUE;
+		boolean_t match_wclone = B_FALSE;
+
+		/*
+		 * When calling clone, an alternate signal can be chosen to
+		 * deliver to the parent when the child exits.
+		 */
+		if (pd != NULL && pd->l_signal != stol_signo[SIGCHLD]) {
+			is_sigchld = B_FALSE;
+		}
+		if ((flags & LX_WCLONE) != 0) {
+			match_wclone = B_TRUE;
+		}
+
+		ret = (match_wclone ^ is_sigchld) ?
B_TRUE : B_FALSE; + } + mutex_exit(&cp->p_lock); + + return (ret); +} + +void +lx_ifname_convert(char *ifname, lx_if_action_t act) +{ + if (act == LX_IF_TONATIVE) { + if (strncmp(ifname, "lo", IFNAMSIZ) == 0) + (void) strlcpy(ifname, "lo0", IFNAMSIZ); + } else { + if (strncmp(ifname, "lo0", IFNAMSIZ) == 0) + (void) strlcpy(ifname, "lo", IFNAMSIZ); + } +} + +void +lx_ifflags_convert(uint64_t *flags, lx_if_action_t act) +{ + uint64_t buf; + + buf = *flags & (IFF_UP | IFF_BROADCAST | IFF_DEBUG | + IFF_LOOPBACK | IFF_POINTOPOINT | IFF_NOTRAILERS | + IFF_RUNNING | IFF_NOARP | IFF_PROMISC | IFF_ALLMULTI); + + /* Linux has different shift for multicast flag */ + if (act == LX_IF_TONATIVE) { + if (*flags & 0x1000) + buf |= IFF_MULTICAST; + } else { + if (*flags & IFF_MULTICAST) + buf |= 0x1000; + } + *flags = buf; +} + + +void +lx_stol_hwaddr(const struct sockaddr_dl *src, struct sockaddr *dst, int *size) +{ + int copy_size = MIN(src->sdl_alen, sizeof (dst->sa_data)); + + switch (src->sdl_type) { + case DL_ETHER: + dst->sa_family = LX_ARPHRD_ETHER; + break; + case DL_LOOP: + dst->sa_family = LX_ARPHRD_LOOPBACK; + break; + default: + dst->sa_family = LX_ARPHRD_VOID; + } + + bcopy(LLADDR(src), dst->sa_data, copy_size); + *size = copy_size; +} diff --git a/usr/src/uts/common/brand/lx/os/lx_pid.c b/usr/src/uts/common/brand/lx/os/lx_pid.c new file mode 100644 index 0000000000..40179bbdaf --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_pid.c @@ -0,0 +1,395 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/bitmap.h> +#include <sys/var.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/zone.h> +#include <sys/lx_brand.h> + +#define LINUX_PROC_FACTOR 8 /* factor down the hash table by this */ +static int hash_len = 4; /* desired average hash chain length */ +static int hash_size; /* no of buckets in the hash table */ + +static struct lx_pid **stol_pid_hash; +static struct lx_pid **ltos_pid_hash; + +#define LTOS_HASH(pid) ((pid) & (hash_size - 1)) +#define STOL_HASH(pid, tid) (((pid) + (tid)) & (hash_size - 1)) + +static kmutex_t hash_lock; + +static void +lx_pid_insert_hash(struct lx_pid *lpidp) +{ + int shash = STOL_HASH(lpidp->s_pid, lpidp->s_tid); + int lhash = LTOS_HASH(lpidp->l_pid); + + ASSERT(MUTEX_HELD(&hash_lock)); + + lpidp->stol_next = stol_pid_hash[shash]; + stol_pid_hash[shash] = lpidp; + + lpidp->ltos_next = ltos_pid_hash[lhash]; + ltos_pid_hash[lhash] = lpidp; +} + +static struct lx_pid * +lx_pid_remove_hash(pid_t pid, id_t tid) +{ + struct lx_pid **hpp; + struct lx_pid *lpidp = NULL; + + ASSERT(MUTEX_HELD(&hash_lock)); + + hpp = &stol_pid_hash[STOL_HASH(pid, tid)]; + while (*hpp) { + if ((*hpp)->s_pid == pid && (*hpp)->s_tid == tid) { + lpidp = *hpp; + *hpp = (*hpp)->stol_next; + break; + } + hpp = &(*hpp)->stol_next; + } + + /* + * when called during error recovery the pid may already + * be released + */ + if (lpidp == NULL) + return (NULL); + + hpp = <os_pid_hash[LTOS_HASH(lpidp->l_pid)]; + while (*hpp) { + if (*hpp == lpidp) { + *hpp = lpidp->ltos_next; + break; + } + hpp = &(*hpp)->ltos_next; + } + + return (lpidp); +} + +/* + * given a solaris pid/tid pair, create a linux pid + */ +void +lx_pid_assign(kthread_t *t, struct lx_pid *lpidp) +{ + proc_t *p = ttoproc(t); + lx_lwp_data_t *lwpd = ttolxlwp(t); + pid_t s_pid = p->p_pid; + id_t s_tid = t->t_tid; + + /* + * When lx_initlwp is called from lx_setbrand, p_lwpcnt will already be + * equal to 1. Since lx_initlwp is being called against an lwp that + * already exists, an additional pid allocation is not necessary. + * + * We check for this by testing br_ppid == 0. + */ + if (p->p_lwpcnt > 0 && lwpd->br_ppid != 0) { + /* + * Assign allocated pid to any thread other than the first. + * The l_pid and l_pidp fields should be populated. + */ + VERIFY(lpidp->l_pidp != NULL); + VERIFY(lpidp->l_pid != 0); + } else { + /* + * There are cases where a pid is speculatively allocated but + * is not needed. We are obligated to free it here. + */ + if (lpidp->l_pidp != NULL) { + (void) pid_rele(lpidp->l_pidp); + } + lpidp->l_pidp = NULL; + lpidp->l_pid = s_pid; + } + + lpidp->s_pid = s_pid; + lpidp->s_tid = s_tid; + lpidp->l_start = t->t_start; + + /* + * now put the pid into the linux-solaris and solaris-linux + * conversion hash tables + */ + mutex_enter(&hash_lock); + lx_pid_insert_hash(lpidp); + mutex_exit(&hash_lock); + + lwpd->br_pid = lpidp->l_pid; +} + +/* + * If we are exec()ing the process, this thread's tid is about to be reset + * to 1. Make sure the Linux PID bookkeeping reflects that change. + */ +void +lx_pid_reassign(kthread_t *t) +{ + proc_t *p = ttoproc(t); + struct pid *old_pidp; + struct lx_pid *lpidp; + + ASSERT(p->p_lwpcnt == 1); + + mutex_enter(&hash_lock); + + /* + * Clean up all the traces of this thread's 'fake' Linux PID. 
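+	 * As a worked example (values hypothetical): if the exec()ing
+	 * thread was registered as (s_pid 1234, s_tid 7) with fake Linux
+	 * pid 5678, the code below removes that mapping, re-registers the
+	 * thread as (1234, tid 1) with l_pid 1234, and releases pid 5678.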
+ */ + lpidp = lx_pid_remove_hash(p->p_pid, t->t_tid); + ASSERT(lpidp != NULL); + old_pidp = lpidp->l_pidp; + lpidp->l_pidp = NULL; + + /* + * Now register this thread as (pid, 1). + */ + lpidp->l_pid = p->p_pid; + lpidp->s_pid = p->p_pid; + lpidp->s_tid = 1; + lx_pid_insert_hash(lpidp); + + mutex_exit(&hash_lock); + + if (old_pidp) + (void) pid_rele(old_pidp); +} + +/* + * release a solaris pid/tid pair + */ +void +lx_pid_rele(pid_t pid, id_t tid) +{ + struct lx_pid *lpidp; + + mutex_enter(&hash_lock); + lpidp = lx_pid_remove_hash(pid, tid); + mutex_exit(&hash_lock); + + if (lpidp) { + if (lpidp->l_pidp) + (void) pid_rele(lpidp->l_pidp); + + kmem_free(lpidp, sizeof (*lpidp)); + } +} + +/* + * given a linux pid, return the solaris pid/tid pair + */ +int +lx_lpid_to_spair(pid_t l_pid, pid_t *s_pid, id_t *s_tid) +{ + struct lx_pid *hp; + + if (l_pid == 1) { + pid_t initpid; + + /* + * We are trying to look up the Linux init process for the + * current zone, which we pretend has pid 1. + */ + if ((initpid = curzone->zone_proc_initpid) == -1) { + /* + * We could not find the init process for this zone. + */ + return (-1); + } + + if (s_pid != NULL) + *s_pid = initpid; + if (s_tid != NULL) + *s_tid = 1; + + return (0); + } + + mutex_enter(&hash_lock); + for (hp = ltos_pid_hash[LTOS_HASH(l_pid)]; hp; hp = hp->ltos_next) { + if (l_pid == hp->l_pid) { + if (s_pid) + *s_pid = hp->s_pid; + if (s_tid) + *s_tid = hp->s_tid; + break; + } + } + mutex_exit(&hash_lock); + if (hp != NULL) + return (0); + + /* + * We didn't find this pid in our translation table. + * But this still could be the pid of a native process + * running in the current zone so check for that here. + * + * Note that prfind() only searches for processes in the current zone. + */ + mutex_enter(&pidlock); + if (prfind(l_pid) != NULL) { + mutex_exit(&pidlock); + if (s_pid) + *s_pid = l_pid; + if (s_tid) + *s_tid = 0; + return (0); + } + mutex_exit(&pidlock); + + return (-1); +} + +/* + * Given an lwp, return the Linux pid of its parent. If the caller + * wants them, we return the Solaris (pid, tid) as well. + */ +pid_t +lx_lwp_ppid(klwp_t *lwp, pid_t *ppidp, id_t *ptidp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + proc_t *p = lwptoproc(lwp); + struct lx_pid *hp; + pid_t zoneinit = curproc->p_zone->zone_proc_initpid; + pid_t lppid, ppid; + + /* + * Be sure not to return a parent pid that should be invisible + * within this zone. + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * If the parent process's pid is the zone's init process, force it + * to the Linux init pid value of 1. + */ + if (ppid == zoneinit) + ppid = 1; + + /* + * There are two cases in which the Linux definition of a 'parent' + * matches that of Solaris: + * + * - if our tgid is the same as our PID, then we are either the + * first thread in the process or a CLONE_THREAD thread. + * + * - if the brand lwp value for ppid is 0, then we are either the + * child of a differently-branded process or a CLONE_PARENT thread. + */ + if (p->p_pid == lwpd->br_tgid || lwpd->br_ppid == 0) { + if (ppidp != NULL) + *ppidp = ppid; + if (ptidp != NULL) + *ptidp = -1; + return (ppid); + } + + /* + * Set the default Linux parent pid to be the pid of the zone's init + * process; this will get converted back to the Linux default of 1 + * later. + */ + lppid = zoneinit; + + /* + * If the process's parent isn't init, try and look up the Linux "pid" + * corresponding to the process's parent. 
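+	 * For example (pids hypothetical): a thread that was clone()d by
+	 * the thread with Linux pid 100 stashed br_ppid = 100 at creation
+	 * time; the loop below looks 100 up in ltos_pid_hash to recover
+	 * the SunOS (pid, tid) pair of that creating thread.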
+	 */
+	if (ppid != 1) {
+		/*
+		 * In all other cases, we are looking for the parent of this
+		 * specific thread, which in Linux refers to the thread that
+		 * clone()d it. We stashed that thread's PID away when this
+		 * thread was created.
+		 */
+		mutex_enter(&hash_lock);
+		for (hp = ltos_pid_hash[LTOS_HASH(lwpd->br_ppid)]; hp;
+		    hp = hp->ltos_next) {
+			if (lwpd->br_ppid == hp->l_pid) {
+				/*
+				 * We found the PID we were looking for, but
+				 * since we cached its value in this LWP's
+				 * brand structure, the thread it refers to
+				 * may have exited and had its pid reused by
+				 * another process. Comparing start times
+				 * detects that case.
+				 */
+				if (hp->l_start > lwptot(lwp)->t_start)
+					break;
+
+				lppid = lwpd->br_ppid;
+				if (ppidp != NULL)
+					*ppidp = hp->s_pid;
+				if (ptidp != NULL)
+					*ptidp = hp->s_tid;
+
+				break;
+			}
+		}
+		mutex_exit(&hash_lock);
+	}
+
+	if (lppid == zoneinit) {
+		lppid = 1;
+
+		if (ppidp != NULL)
+			*ppidp = lppid;
+		if (ptidp != NULL)
+			*ptidp = -1;
+	}
+
+	return (lppid);
+}
+
+void
+lx_pid_init(void)
+{
+	hash_size = 1 << highbit(v.v_proc / (hash_len * LINUX_PROC_FACTOR));
+
+	stol_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size,
+	    KM_SLEEP);
+	ltos_pid_hash = kmem_zalloc(sizeof (struct lx_pid *) * hash_size,
+	    KM_SLEEP);
+
+	mutex_init(&hash_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+lx_pid_fini(void)
+{
+	kmem_free(stol_pid_hash, sizeof (struct lx_pid *) * hash_size);
+	kmem_free(ltos_pid_hash, sizeof (struct lx_pid *) * hash_size);
+}
diff --git a/usr/src/uts/common/brand/lx/os/lx_ptrace.c b/usr/src/uts/common/brand/lx/os/lx_ptrace.c
new file mode 100644
index 0000000000..1659da56ff
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/os/lx_ptrace.c
@@ -0,0 +1,2377 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Emulation of the Linux ptrace(2) interface.
+ *
+ * OVERVIEW
+ *
+ * The Linux process model is somewhat different from the illumos native
+ * model. One critical difference is that each Linux thread has a unique
+ * identifier in the pid namespace. The lx brand assigns a pid to each LWP
+ * within the emulated process, giving the pid of the process itself to the
+ * first LWP.
+ *
+ * The Linux ptrace(2) interface allows for any LWP in a branded process to
+ * exert control over any other LWP within the same zone. Control is exerted
+ * by the use of the ptrace(2) system call itself, which accepts a number of
+ * request codes. Feedback on traced events is primarily received by the
+ * tracer through SIGCLD and the emulated waitpid(2) and waitid(2) system
+ * calls. Many of the possible ptrace(2) requests will only succeed if the
+ * target LWP is in a "ptrace-stop" condition.
+ *
+ * HISTORY
+ *
+ * The brand support for ptrace(2) was originally built on top of the rich
+ * support for debugging and tracing provided through the illumos /proc
+ * interfaces, mounted at /native/proc within the zone. The native legacy
+ * ptrace(3C) functionality was used as a starting point, but was generally
+ * insufficient for complete and precise emulation.
The extant legacy
+ * interface, and indeed our native SIGCLD and waitid(2) facilities, are
+ * focused on _process_ level concerns -- the Linux interface has been
+ * extended to be aware of LWPs as well.
+ *
+ * In order to allow us to focus on providing more complete and accurate
+ * emulation without extensive and undesirable changes to the native
+ * facilities, this second generation ptrace(2) emulation is mostly separate
+ * from any other tracing or debugging framework in the system.
+ *
+ * ATTACHING TRACERS TO TRACEES
+ *
+ * There are several ways that a child LWP may become traced by a tracer.
+ * To determine which attach method caused a tracee to become attached, one
+ * may inspect the "br_ptrace_attach" member of the LWP-specific brand data
+ * with the debugger.
+ *
+ * The first attach methods to consider are the attaching ptrace(2) requests:
+ *
+ * PTRACE_TRACEME
+ *
+ * If an LWP makes a PTRACE_TRACEME call, it will be attached as a tracee
+ * to its parent LWP (br_ppid). Using PTRACE_TRACEME does _not_ cause the
+ * tracee to be held in a stop condition. It is common practice for
+ * consumers to raise(SIGSTOP) immediately afterward.
+ *
+ * PTRACE_ATTACH
+ *
+ * An LWP may attempt to trace any other LWP in this, or another, process.
+ * We currently allow any attach where the process containing the tracer
+ * LWP has permission to write to /proc for the process containing the
+ * intended tracee. This action also sends a SIGSTOP to the newly attached
+ * tracee.
+ *
+ * The second class of attach methods consists of the clone(2)/fork(2)
+ * inheritance options that may be set on a tracee with PTRACE_SETOPTIONS:
+ *
+ * PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK and PTRACE_O_TRACECLONE
+ *
+ * If these options have been set on a tracee, then a fork(2), vfork(2) or
+ * clone(2) respectively will cause the newly created LWP to be traced by
+ * the same tracer. The same set of ptrace(2) options will also be set on
+ * the new child.
+ *
+ * The third class of attach method is the PTRACE_CLONE flag to clone(2).
+ * This flag induces the same inheritance as PTRACE_O_TRACECLONE, but is
+ * passed by the tracee as an argument to clone(2).
+ *
+ * DETACHING TRACEES
+ *
+ * Tracees can be detached by the tracer with the PTRACE_DETACH request.
+ * This request is only valid when the tracee is in a ptrace(2) stop
+ * condition, and is itself a restarting action.
+ *
+ * If the tracer exits without detaching all of its tracees, then all of the
+ * tracees are automatically detached and restarted. If a tracee was in
+ * "signal-delivery-stop" at the time the tracer exited, the signal will be
+ * released to the child unless it is a SIGSTOP. We drop this instance of
+ * SIGSTOP in order to prevent the child from becoming stopped by job
+ * control.
+ *
+ * ACCORD ALLOCATION AND MANAGEMENT
+ *
+ * The "lx_ptrace_accord_t" object tracks the agreement between a tracer LWP
+ * and zero or more tracee LWPs. It is explicitly illegal for a tracee to
+ * trace its tracer, and we block this in PTRACE_ATTACH/PTRACE_TRACEME.
+ *
+ * An LWP starts out without an accord. If a child of that LWP calls
+ * ptrace(2) with the PTRACE_TRACEME subcommand, or if the LWP itself uses
+ * PTRACE_ATTACH, an accord will be allocated and stored on that LWP. The
+ * accord structure is not released from that LWP until it arrives in
+ * lx_exitlwp(), as called by lwp_exit(). A new accord will not be
+ * allocated, even if one does not exist, once an LWP arrives in lx_exitlwp()
+ * and sets the LX_PTRACE_EXITING flag.
An LWP will have at most one accord
+ * structure throughout its entire lifecycle; once it has one, it has the
+ * same one until death.
+ *
+ * The accord is reference counted (lxpa_refcnt), starting at a count of one
+ * at creation to represent the link from the tracer LWP to its accord. The
+ * accord is not freed until the reference count falls to zero.
+ *
+ * To make mutual exclusion between a detaching tracer and various notifying
+ * tracees simpler, the tracer will hold "pidlock" while it clears the
+ * accord members that point back to the tracer LWP and CV.
+ *
+ * SIGNALS AND JOB CONTROL
+ *
+ * Various actions, either directly ptrace(2) related or commonly associated
+ * with tracing, cause process- or thread-directed SIGSTOP signals to be sent
+ * to tracees. These signals, and indeed any signal other than SIGKILL, can
+ * be suppressed by the tracer when using a restarting request (including
+ * PTRACE_DETACH) on a child. A different signal may also be substituted in
+ * its place.
+ *
+ * If a SIGSTOP (or other stopping signal) is not suppressed by the tracer,
+ * it will induce the regular illumos native job control stop of the entire
+ * traced process. This is at least passingly similar to the Linux "group
+ * stop" ptrace(2) condition.
+ *
+ * SYSTEM CALL TRACING
+ *
+ * The ptrace(2) interface enables the tracer to hold the tracee on entry and
+ * exit from system calls. When a stopped tracee is restarted through the
+ * PTRACE_SYSCALL request, the LX_PTRACE_SYSCALL flag is set until the next
+ * system call boundary. Whether this is a "syscall-entry-stop" or
+ * "syscall-exit-stop", the tracee is held and the tracer is notified via
+ * SIGCLD/waitpid(2) in the usual way. The LX_PTRACE_SYSCALL flag is
+ * cleared after each stop; for ongoing system call tracing the tracee must
+ * be continuously restarted with PTRACE_SYSCALL.
+ *
+ * EVENT STOPS
+ *
+ * Various events (particularly FORK, VFORK, CLONE, EXEC and EXIT) are
+ * enabled by the tracer through PTRACE_SETOPTIONS. Once enabled, the tracee
+ * will be stopped at the nominated points of interest and the tracer
+ * notified. The tracer may request additional information about the event,
+ * such as the pid of new LWPs and processes, via PTRACE_GETEVENTMSG.
+ *
+ * LOCK ORDERING RULES
+ *
+ * It is not safe, in general, to hold p_lock for two different processes at
+ * the same time. This constraint is the primary reason for the existence
+ * (and complexity) of the ptrace(2) accord mechanism.
+ *
+ * In order to facilitate looking up accords by the "pid" of a tracer LWP,
+ * p_lock for the tracer process may be held while entering the accord mutex
+ * (lxpa_lock). This mutex protects the accord flags and reference count.
+ * The reference count is manipulated through lx_ptrace_accord_hold() and
+ * lx_ptrace_accord_rele().
+ *
+ * DO NOT interact with the accord mutex (lxpa_lock) directly. The
+ * lx_ptrace_accord_enter() and lx_ptrace_accord_exit() functions do various
+ * book-keeping and lock ordering enforcement and MUST be used.
+ *
+ * It is NOT legal to take ANY p_lock while holding the accord mutex
+ * (lxpa_lock). If the lxpa_tracees_lock is to be held concurrently with
+ * lxpa_lock, lxpa_lock MUST be taken first and dropped before taking p_lock
+ * of any processes from the tracee list.
+ *
+ * It is NOT legal to take a tracee p_lock and then attempt to enter the
+ * accord mutex (or tracee list mutex) of its tracer.
When running as the + * tracee LWP, the tracee's hold will prevent the accord from being freed. + * Use of the LX_PTRACE_STOPPING or LX_PTRACE_CLONING flag in the + * LWP-specific brand data prevents an exiting tracer from altering the + * tracee until the tracee has come to an orderly stop, without requiring the + * tracee to hold its own p_lock the entire time it is stopping. + * + * It is not safe, in general, to enter "pidlock" while holding the p_lock of + * any process. It is similarly illegal to hold any accord locks (lxpa_lock + * or lxpa_sublock) while attempting to enter "pidlock". As "pidlock" is a + * global mutex, it should be held for the shortest possible time. + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/procfs.h> +#include <sys/cmn_err.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/wait.h> +#include <sys/prsystm.h> +#include <sys/note.h> + +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <sys/lx_misc.h> +#include <lx_syscall.h> +#include <lx_signum.h> + + +typedef enum lx_ptrace_cont_flags_t { + LX_PTC_NONE = 0x00, + LX_PTC_SYSCALL = 0x01, + LX_PTC_SINGLESTEP = 0x02 +} lx_ptrace_cont_flags_t; + +/* + * Macros for checking the state of an LWP via "br_ptrace_flags": + */ +#define LX_PTRACE_BUSY \ + (LX_PTRACE_EXITING | LX_PTRACE_STOPPING | LX_PTRACE_CLONING) + +#define VISIBLE(a) (((a)->br_ptrace_flags & LX_PTRACE_EXITING) == 0) +#define TRACEE_BUSY(a) (((a)->br_ptrace_flags & LX_PTRACE_BUSY) != 0) + +#define ACCORD_HELD(a) MUTEX_HELD(&(a)->lxpa_lock) + +static kcondvar_t lx_ptrace_busy_cv; +static kmem_cache_t *lx_ptrace_accord_cache; + +/* + * Enter the accord mutex. + */ +static void +lx_ptrace_accord_enter(lx_ptrace_accord_t *accord) +{ + VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock)); + + mutex_enter(&accord->lxpa_lock); +} + +/* + * Exit the accord mutex. If the reference count has dropped to zero, + * free the accord. + */ +static void +lx_ptrace_accord_exit(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + if (accord->lxpa_refcnt > 0) { + mutex_exit(&accord->lxpa_lock); + return; + } + + /* + * When the reference count drops to zero we must free the accord. + */ + VERIFY(accord->lxpa_tracer == NULL); + VERIFY(MUTEX_NOT_HELD(&accord->lxpa_tracees_lock)); + VERIFY(list_is_empty(&accord->lxpa_tracees)); + VERIFY(accord->lxpa_flags & LX_ACC_TOMBSTONE); + + mutex_destroy(&accord->lxpa_lock); + mutex_destroy(&accord->lxpa_tracees_lock); + + kmem_cache_free(lx_ptrace_accord_cache, accord); +} + +/* + * Drop our reference to this accord. If this drops the reference count + * to zero, the next lx_ptrace_accord_exit() will free the accord. + */ +static void +lx_ptrace_accord_rele(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + VERIFY(accord->lxpa_refcnt > 0); + accord->lxpa_refcnt--; +} + +/* + * Place an additional hold on an accord. + */ +static void +lx_ptrace_accord_hold(lx_ptrace_accord_t *accord) +{ + VERIFY(ACCORD_HELD(accord)); + + accord->lxpa_refcnt++; +} + +/* + * Fetch the accord for this LWP. If one has not yet been created, and the + * process is not exiting, allocate it now. Must be called with p_lock held + * for the process containing the target LWP. + * + * If successful, we return holding the accord lock (lxpa_lock). 
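+ *
+ * A sketch of the expected calling pattern (caller context assumed,
+ * not code from this file):
+ *
+ *	mutex_enter(&p->p_lock);
+ *	if (lx_ptrace_accord_get_locked(lwp, &accord, B_TRUE) == 0) {
+ *		... inspect or update the accord ...
+ *		lx_ptrace_accord_exit(accord);
+ *	}
+ *	mutex_exit(&p->p_lock);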
+ */
+static int
+lx_ptrace_accord_get_locked(klwp_t *lwp, lx_ptrace_accord_t **accordp,
+    boolean_t allocate_one)
+{
+	lx_ptrace_accord_t *lxpa;
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	proc_t *p = lwptoproc(lwp);
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/*
+	 * If this LWP does not have an accord, we wish to allocate
+	 * and install one.
+	 */
+	if ((lxpa = lwpd->br_ptrace_accord) == NULL) {
+		if (!allocate_one || !VISIBLE(lwpd)) {
+			/*
+			 * Either we do not wish to allocate an accord, or this
+			 * LWP has already begun exiting from a ptrace
+			 * perspective.
+			 */
+			*accordp = NULL;
+			return (ESRCH);
+		}
+
+		lxpa = kmem_cache_alloc(lx_ptrace_accord_cache, KM_SLEEP);
+		bzero(lxpa, sizeof (*lxpa));
+
+		/*
+		 * The initial reference count is 1 because the accord is
+		 * referenced from the soon-to-be tracer LWP.
+		 */
+		lxpa->lxpa_refcnt = 1;
+		mutex_init(&lxpa->lxpa_lock, NULL, MUTEX_DEFAULT, NULL);
+		mutex_init(&lxpa->lxpa_tracees_lock, NULL, MUTEX_DEFAULT, NULL);
+		list_create(&lxpa->lxpa_tracees, sizeof (lx_lwp_data_t),
+		    offsetof(lx_lwp_data_t, br_ptrace_linkage));
+		lxpa->lxpa_cvp = &p->p_cv;
+
+		lxpa->lxpa_tracer = lwpd;
+		lwpd->br_ptrace_accord = lxpa;
+	}
+
+	/*
+	 * Lock the accord before returning it to the caller.
+	 */
+	lx_ptrace_accord_enter(lxpa);
+
+	/*
+	 * There should be at least one active reference to this accord,
+	 * otherwise it should have been freed.
+	 */
+	VERIFY(lxpa->lxpa_refcnt > 0);
+
+	*accordp = lxpa;
+	return (0);
+}
+
+/*
+ * Accords belong to the tracer LWP. Get the accord for this tracer or return
+ * an error if it was not possible. To prevent deadlocks, the caller MUST NOT
+ * hold p_lock on its own or any other process.
+ *
+ * If successful, we return holding the accord lock (lxpa_lock).
+ */
+static int
+lx_ptrace_accord_get_by_pid(pid_t lxpid, lx_ptrace_accord_t **accordp)
+{
+	int ret = ESRCH;
+	pid_t apid;
+	id_t atid;
+	proc_t *aproc;
+	kthread_t *athr;
+	klwp_t *alwp;
+	lx_lwp_data_t *alwpd;
+
+	VERIFY(MUTEX_NOT_HELD(&curproc->p_lock));
+
+	/*
+	 * Locate the process containing the tracer LWP based on its Linux pid
+	 * and lock it.
+	 */
+	if (lx_lpid_to_spair(lxpid, &apid, &atid) != 0 ||
+	    (aproc = sprlock(apid)) == NULL) {
+		return (ESRCH);
+	}
+
+	/*
+	 * Locate the tracer LWP itself and ensure that it is visible to
+	 * ptrace(2).
+	 */
+	if ((athr = idtot(aproc, atid)) == NULL ||
+	    (alwp = ttolwp(athr)) == NULL ||
+	    (alwpd = lwptolxlwp(alwp)) == NULL ||
+	    !VISIBLE(alwpd)) {
+		sprunlock(aproc);
+		return (ESRCH);
+	}
+
+	/*
+	 * We should not fetch our own accord this way.
+	 */
+	if (athr == curthread) {
+		sprunlock(aproc);
+		return (EPERM);
+	}
+
+	/*
+	 * Fetch (or allocate) the accord owned by this tracer LWP:
+	 */
+	ret = lx_ptrace_accord_get_locked(alwp, accordp, B_TRUE);
+
+	/*
+	 * Unlock the process and return.
+	 */
+	sprunlock(aproc);
+	return (ret);
+}
+
+/*
+ * Get (or allocate) the ptrace(2) accord for the current LWP, acting as a
+ * tracer. The caller MUST NOT currently hold p_lock on the process containing
+ * this LWP.
+ *
+ * If successful, we return holding the accord lock (lxpa_lock).
+ */
+static int
+lx_ptrace_accord_get(lx_ptrace_accord_t **accordp, boolean_t allocate_one)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	proc_t *p = lwptoproc(lwp);
+	int ret;
+
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * Lock the tracer (this LWP).
+	 */
+	mutex_enter(&p->p_lock);
+
+	/*
+	 * Fetch (or allocate) the accord for this LWP:
+	 */
+	ret = lx_ptrace_accord_get_locked(lwp, accordp, allocate_one);
+
+	mutex_exit(&p->p_lock);
+
+	return (ret);
+}
+
+/*
+ * Restart an LWP if it is in "ptrace-stop". This function may induce sleep,
+ * so the caller MUST NOT hold any mutexes other than p_lock for the process
+ * containing the LWP.
+ */
+static void
+lx_ptrace_restart_lwp(klwp_t *lwp)
+{
+	kthread_t *rt = lwptot(lwp);
+	proc_t *rproc = lwptoproc(lwp);
+	lx_lwp_data_t *rlwpd = lwptolxlwp(lwp);
+
+	VERIFY(rt != curthread);
+	VERIFY(MUTEX_HELD(&rproc->p_lock));
+
+	/*
+	 * Exclude potential meddling from procfs.
+	 */
+	prbarrier(rproc);
+
+	/*
+	 * Check that the LWP is still in "ptrace-stop" and, if so, restart it.
+	 */
+	thread_lock(rt);
+	if (BSTOPPED(rt) && rt->t_whystop == PR_BRAND) {
+		rt->t_schedflag |= TS_BSTART;
+		setrun_locked(rt);
+
+		/*
+		 * Clear stop reason.
+		 */
+		rlwpd->br_ptrace_whystop = 0;
+		rlwpd->br_ptrace_whatstop = 0;
+		rlwpd->br_ptrace_flags &= ~(LX_PTRACE_CLDPEND |
+		    LX_PTRACE_WAITPEND);
+	}
+	thread_unlock(rt);
+}
+
+static void
+lx_winfo(lx_lwp_data_t *remote, k_siginfo_t *ip, boolean_t waitflag,
+    pid_t *event_ppid, pid_t *event_pid)
+{
+	int signo;
+
+	/*
+	 * Populate our k_siginfo_t with data about this "ptrace-stop"
+	 * condition:
+	 */
+	bzero(ip, sizeof (*ip));
+	ip->si_signo = SIGCLD;
+	ip->si_pid = remote->br_pid;
+	ip->si_code = CLD_TRAPPED;
+
+	switch (remote->br_ptrace_whatstop) {
+	case LX_PR_SYSENTRY:
+	case LX_PR_SYSEXIT:
+		ip->si_status = SIGTRAP;
+		if (remote->br_ptrace_options & LX_PTRACE_O_TRACESYSGOOD) {
+			ip->si_status |= 0x80;
+		}
+		break;
+
+	case LX_PR_SIGNALLED:
+		signo = remote->br_ptrace_stopsig;
+		if (signo < 1 || signo >= LX_NSIG) {
+			/*
+			 * If this signal number is not valid, pretend it
+			 * was a SIGTRAP.
+			 */
+			ip->si_status = SIGTRAP;
+		} else {
+			ip->si_status = ltos_signo[signo];
+		}
+		break;
+
+	case LX_PR_EVENT:
+		ip->si_status = SIGTRAP | remote->br_ptrace_event;
+		/*
+		 * Record the Linux pid of both this LWP and the create
+		 * event we are dispatching. We will use this information
+		 * to unblock any subsequent ptrace(2) events that depend
+		 * on this one.
+		 */
+		if (event_ppid != NULL)
+			*event_ppid = remote->br_pid;
+		if (event_pid != NULL)
+			*event_pid = (pid_t)remote->br_ptrace_eventmsg;
+		break;
+
+	default:
+		cmn_err(CE_PANIC, "unexpected stop subreason: %d",
+		    remote->br_ptrace_whatstop);
+	}
+
+	/*
+	 * If WNOWAIT was specified, do not mark the event as posted
+	 * so that it may be re-fetched on another call to waitid().
+	 */
+	if (waitflag) {
+		remote->br_ptrace_flags &= ~(LX_PTRACE_CLDPEND |
+		    LX_PTRACE_WAITPEND);
+	}
+}
+
+/*
+ * Receive notification from stop() of a PR_BRAND stop.
+ */
+void
+lx_stop_notify(proc_t *p, klwp_t *lwp, ushort_t why, ushort_t what)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	lx_ptrace_accord_t *accord;
+	klwp_t *plwp = NULL;
+	proc_t *pp = NULL;
+	lx_lwp_data_t *parent;
+	boolean_t cldpend = B_TRUE;
+	boolean_t cldpost = B_FALSE;
+	sigqueue_t *sqp = NULL;
+
+	/*
+	 * We currently only care about LX-specific stop reasons.
+	 */
+	if (why != PR_BRAND)
+		return;
+
+	switch (what) {
+	case LX_PR_SYSENTRY:
+	case LX_PR_SYSEXIT:
+	case LX_PR_SIGNALLED:
+	case LX_PR_EVENT:
+		break;
+	default:
+		cmn_err(CE_PANIC, "unexpected subreason for PR_BRAND"
+		    " stop: %d", (int)what);
+	}
+
+	/*
+	 * We should be holding the lock on our containing process. The
+	 * STOPPING flag should have been set by lx_ptrace_stop() for all
+	 * PR_BRAND stops.
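+	 *
+	 * In outline, the locking sequence below is:
+	 *
+	 *	1. drop our p_lock (LX_PTRACE_STOPPING protects us from the
+	 *	   tracer in the interim) and pre-allocate the sigqueue_t
+	 *	2. take pidlock, then retake our p_lock; record the stop
+	 *	3. drop p_lock again to post SIGCLD to the tracer
+	 *	4. retake p_lock; clear STOPPING, set STOPPED, wake waiters
+	 *	5. drop pidlock and return with our p_lock still held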
+ */ + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(lwpd->br_ptrace_flags & LX_PTRACE_STOPPING); + VERIFY((accord = lwpd->br_ptrace_tracer) != NULL); + + /* + * We must drop our process lock to take "pidlock". The + * LX_PTRACE_STOPPING flag protects us from an exiting tracer. + */ + mutex_exit(&p->p_lock); + + /* + * Allocate before we enter any mutexes. + */ + sqp = kmem_zalloc(sizeof (*sqp), KM_SLEEP); + + /* + * We take pidlock now, which excludes all callers of waitid() and + * prevents a detaching tracer from clearing critical accord members. + */ + mutex_enter(&pidlock); + mutex_enter(&p->p_lock); + + /* + * Get the ptrace(2) "parent" process, to which we may send + * a SIGCLD signal later. + */ + if ((parent = accord->lxpa_tracer) != NULL && + (plwp = parent->br_lwp) != NULL) { + pp = lwptoproc(plwp); + } + + /* + * Our tracer should not have been modified in our absence; the + * LX_PTRACE_STOPPING flag prevents it. + */ + VERIFY(lwpd->br_ptrace_tracer == accord); + + /* + * Stash data for this stop condition in the LWP data while we hold + * both pidlock and our p_lock. + */ + lwpd->br_ptrace_whystop = why; + lwpd->br_ptrace_whatstop = what; + lwpd->br_ptrace_flags |= LX_PTRACE_WAITPEND; + + /* + * If this event does not depend on an event from the parent LWP, + * populate the siginfo_t for the event pending on this tracee LWP. + */ + if (!(lwpd->br_ptrace_flags & LX_PTRACE_PARENT_WAIT) && pp != NULL) { + cldpost = B_TRUE; + lx_winfo(lwpd, &sqp->sq_info, B_FALSE, NULL, NULL); + } + + /* + * Drop our p_lock so that we may lock the tracer. + */ + mutex_exit(&p->p_lock); + if (cldpost && pp != NULL) { + /* + * Post the SIGCLD to the tracer. + */ + mutex_enter(&pp->p_lock); + if (!sigismember(&pp->p_sig, SIGCLD)) { + sigaddqa(pp, plwp->lwp_thread, sqp); + cldpend = B_FALSE; + sqp = NULL; + } + mutex_exit(&pp->p_lock); + } + + /* + * We re-take our process lock now. The lock will be held until + * the thread is actually marked stopped, so we will not race with + * lx_ptrace_lock_if_stopped() or lx_waitid_helper(). + */ + mutex_enter(&p->p_lock); + + /* + * We clear the STOPPING flag; stop() continues to hold our p_lock + * until our thread stop state is visible. + */ + lwpd->br_ptrace_flags &= ~LX_PTRACE_STOPPING; + lwpd->br_ptrace_flags |= LX_PTRACE_STOPPED; + if (cldpend) { + /* + * We sent the SIGCLD for this new wait condition already. + */ + lwpd->br_ptrace_flags |= LX_PTRACE_CLDPEND; + } + + /* + * If lx_ptrace_exit_tracer() is trying to detach our tracer, it will + * be sleeping on this CV until LX_PTRACE_STOPPING is clear. Wake it + * now. + */ + cv_broadcast(&lx_ptrace_busy_cv); + + /* + * While still holding pidlock, we attempt to wake our tracer from a + * potential waitid() slumber. + */ + if (accord->lxpa_cvp != NULL) { + cv_broadcast(accord->lxpa_cvp); + } + + /* + * We release pidlock and return as we were called: with our p_lock + * held. + */ + mutex_exit(&pidlock); + + if (sqp != NULL) { + kmem_free(sqp, sizeof (*sqp)); + } +} + +/* + * For any restarting action (e.g. PTRACE_CONT, PTRACE_SYSCALL or + * PTRACE_DETACH) to be allowed, the tracee LWP must be in "ptrace-stop". This + * check must ONLY be run on tracees of the current LWP. If the check is + * successful, we return with the tracee p_lock held. + */ +static int +lx_ptrace_lock_if_stopped(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote) +{ + klwp_t *rlwp = remote->br_lwp; + proc_t *rproc = lwptoproc(rlwp); + kthread_t *rt = lwptot(rlwp); + + /* + * We must never check that we, ourselves, are stopped. 
We must also + * have the accord tracee list locked while we lock our tracees. + */ + VERIFY(curthread != rt); + VERIFY(MUTEX_HELD(&accord->lxpa_tracees_lock)); + VERIFY(accord->lxpa_tracer == ttolxlwp(curthread)); + + /* + * Lock the process containing the tracee LWP. + */ + mutex_enter(&rproc->p_lock); + if (!VISIBLE(remote)) { + /* + * The tracee LWP is currently detaching itself as it exits. + * It is no longer visible to ptrace(2). + */ + mutex_exit(&rproc->p_lock); + return (ESRCH); + } + + /* + * We must only check whether tracees of the current LWP are stopped. + * We check this condition after confirming visibility as an exiting + * tracee may no longer be completely consistent. + */ + VERIFY(remote->br_ptrace_tracer == accord); + + if (!(remote->br_ptrace_flags & LX_PTRACE_STOPPED)) { + /* + * The tracee is not in "ptrace-stop", so we release the + * process. + */ + mutex_exit(&rproc->p_lock); + return (ESRCH); + } + + /* + * The tracee is stopped. We return holding its process lock so that + * the caller may manipulate it. + */ + return (0); +} + +static int +lx_ptrace_setoptions(lx_lwp_data_t *remote, uintptr_t options) +{ + /* + * Check for valid options. + */ + if ((options & ~LX_PTRACE_O_ALL) != 0) { + return (EINVAL); + } + + /* + * Set ptrace options on the target LWP. + */ + remote->br_ptrace_options = (lx_ptrace_options_t)options; + + return (0); +} + +static int +lx_ptrace_geteventmsg(lx_lwp_data_t *remote, void *umsgp) +{ + int error; + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + uint32_t tmp = remote->br_ptrace_eventmsg; + + error = copyout(&tmp, umsgp, sizeof (uint32_t)); + } else +#endif + { + error = copyout(&remote->br_ptrace_eventmsg, umsgp, + sizeof (ulong_t)); + } + + return (error); +} + +static int +lx_ptrace_getregs(lx_lwp_data_t *remote, void *uregsp) +{ + if (remote->br_stack_mode == LX_STACK_MODE_BRAND) { + /* + * The LWP was stopped with the brand stack and register + * state loaded, e.g. during a system call emulated within + * the kernel. Return the LWP register state. + */ + return (lx_regs_to_userregs(remote, uregsp)); + } else if (remote->br_stack_mode == LX_STACK_MODE_PREINIT && + (remote->br_ptrace_whatstop == LX_PR_SIGNALLED || + remote->br_ptrace_whatstop == LX_PR_SYSEXIT)) { + /* + * The LWP was stopped by tracing on exec. + */ + return (lx_regs_to_userregs(remote, uregsp)); + } else if (remote->br_stack_mode == LX_STACK_MODE_NATIVE && + remote->br_ptrace_whystop == PR_BRAND && + remote->br_ptrace_whatstop == LX_PR_EVENT) { + /* + * Called while we're ptrace event stopped by lx_exec. + */ + return (lx_regs_to_userregs(remote, uregsp)); + } else if (remote->br_ptrace_stopucp != NULL) { + /* + * The LWP was stopped in the usermode emulation library + * but a ucontext_t for the preserved brand stack and + * register state was provided. Return the register state + * from that ucontext_t. + */ + return (lx_uc_to_userregs(remote, + (void *)remote->br_ptrace_stopucp, uregsp)); + } else { + /* + * The register state is not currently available. + */ + return (EIO); + } +} + +static int +lx_ptrace_setregs(lx_lwp_data_t *remote, void *uregsp) +{ + if (remote->br_stack_mode == LX_STACK_MODE_BRAND) { + /* + * The LWP was stopped with the brand stack and register + * state loaded, e.g. during a system call emulated within + * the kernel. Write to the LWP register state. 
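+		 *
+		 * Editorial aside -- a hedged sketch, not part of the
+		 * original change: a Linux tracer reaches the GETREGS and
+		 * SETREGS paths with something like
+		 *
+		 *	struct user_regs_struct regs;
+		 *	ptrace(PTRACE_GETREGS, pid, NULL, &regs);
+		 *	regs.rax = ...;
+		 *	ptrace(PTRACE_SETREGS, pid, NULL, &regs);
+		 *
+		 * which lx_ptrace_kernel() below dispatches to
+		 * lx_ptrace_getregs() and lx_ptrace_setregs().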
+		 */
+		return (lx_userregs_to_regs(remote, uregsp));
+	} else if (remote->br_ptrace_stopucp != NULL) {
+		/*
+		 * The LWP was stopped in the usermode emulation library
+		 * but a ucontext_t for the preserved brand stack and
+		 * register state was provided. Write to the register state
+		 * in that ucontext_t.
+		 */
+		return (lx_userregs_to_uc(remote,
+		    (void *)remote->br_ptrace_stopucp, uregsp));
+	} else {
+		/*
+		 * The register state is not currently available.
+		 */
+		return (EIO);
+	}
+}
+
+/*
+ * Implements the PTRACE_CONT subcommand of the Linux ptrace(2) interface.
+ */
+static int
+lx_ptrace_cont(lx_lwp_data_t *remote, lx_ptrace_cont_flags_t flags, int signo)
+{
+	klwp_t *lwp = remote->br_lwp;
+
+	if (flags & LX_PTC_SINGLESTEP) {
+		/*
+		 * We do not currently support single-stepping.
+		 */
+		lx_unsupported("PTRACE_SINGLESTEP not currently implemented");
+		return (EINVAL);
+	}
+
+	/*
+	 * The tracer may choose to suppress the delivery of a signal, or
+	 * select an alternative signal for delivery. If this is an
+	 * appropriate ptrace(2) "signal-delivery-stop", br_ptrace_stopsig
+	 * will be used as the new signal number.
+	 *
+	 * As with so many other aspects of the Linux ptrace(2) interface, this
+	 * may fail silently if the state machine is not aligned correctly.
+	 */
+	remote->br_ptrace_stopsig = signo;
+
+	/*
+	 * Handle the syscall-stop flag if this is a PTRACE_SYSCALL restart:
+	 */
+	if (flags & LX_PTC_SYSCALL) {
+		remote->br_ptrace_flags |= LX_PTRACE_SYSCALL;
+	} else {
+		remote->br_ptrace_flags &= ~LX_PTRACE_SYSCALL;
+	}
+
+	lx_ptrace_restart_lwp(lwp);
+
+	return (0);
+}
+
+/*
+ * Implements the PTRACE_DETACH subcommand of the Linux ptrace(2) interface.
+ *
+ * The LWP identified by the Linux pid "lx_pid" will, if it is a tracee of the
+ * current LWP, be detached and set runnable. If the specified LWP is not
+ * currently in the "ptrace-stop" state, the routine will return ESRCH as if
+ * the LWP did not exist at all.
+ *
+ * The caller must already hold the p_lock of the process containing the
+ * tracee, as taken by lx_ptrace_lock_if_stopped(); it remains held when this
+ * routine returns.
+ */
+static int
+lx_ptrace_detach(lx_ptrace_accord_t *accord, lx_lwp_data_t *remote, int signo,
+    boolean_t *release_hold)
+{
+	klwp_t *rlwp;
+
+	rlwp = remote->br_lwp;
+
+	/*
+	 * The tracee LWP was in "ptrace-stop" and we now hold its p_lock.
+	 * Detach the LWP from the accord and set it running.
+	 */
+	VERIFY(!TRACEE_BUSY(remote));
+	remote->br_ptrace_flags &= ~(LX_PTRACE_SYSCALL | LX_PTRACE_INHERIT);
+	VERIFY(list_link_active(&remote->br_ptrace_linkage));
+	list_remove(&accord->lxpa_tracees, remote);
+
+	remote->br_ptrace_attach = LX_PTA_NONE;
+	remote->br_ptrace_tracer = NULL;
+	remote->br_ptrace_flags = 0;
+	*release_hold = B_TRUE;
+
+	/*
+	 * The tracer may, as described in lx_ptrace_cont(), choose to suppress
+	 * or modify the delivered signal.
+	 */
+	remote->br_ptrace_stopsig = signo;
+
+	lx_ptrace_restart_lwp(rlwp);
+
+	return (0);
+}
+
+/*
+ * This routine implements the PTRACE_ATTACH operation of the Linux ptrace(2)
+ * interface.
+ *
+ * This LWP is requesting to be attached as a tracer to another LWP -- the
+ * tracee. If a ptrace accord to track the list of tracees has not yet been
+ * allocated, one will be allocated and attached to this LWP now.
+ *
+ * The "br_ptrace_tracer" on the tracee LWP is set to this accord, and the
+ * tracee LWP is then added to the "lxpa_tracees" list in the accord. We drop
+ * locks between these two phases; the only consumer of trace events from this
+ * accord is this LWP, which obviously cannot be running waitpid(2) at the same
+ * time as this call to ptrace(2).
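+ *
+ * Editorial aside -- a hedged sketch, not part of the original change: the
+ * Linux-side sequence that drives this routine is typically
+ *
+ *	ptrace(PTRACE_ATTACH, pid, NULL, NULL);
+ *	waitpid(pid, &status, 0);
+ *
+ * where the stop observed by waitpid() is the thread-directed SIGSTOP we
+ * send with sigtoproc() below.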
+ */ +static int +lx_ptrace_attach(pid_t lx_pid) +{ + int error = ESRCH; + /* + * Our (Tracer) LWP: + */ + lx_ptrace_accord_t *accord; + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + /* + * Remote (Tracee) LWP: + */ + pid_t rpid; + id_t rtid; + proc_t *rproc; + kthread_t *rthr; + klwp_t *rlwp; + lx_lwp_data_t *rlwpd; + + if (lwpd->br_pid == lx_pid) { + /* + * We cannot trace ourselves. + */ + return (EPERM); + } + + /* + * Ensure that we have an accord and obtain a lock on it. This + * routine should not fail because the LWP cannot make ptrace(2) system + * calls after it has begun exiting. + */ + VERIFY0(lwpd->br_ptrace_flags & LX_PTRACE_EXITING); + VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0); + + /* + * Place speculative hold in case the attach is successful. + */ + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + + /* + * Locate the process containing the tracee LWP based on its Linux pid + * and lock it. + */ + if (lx_lpid_to_spair(lx_pid, &rpid, &rtid) != 0 || + (rproc = sprlock(rpid)) == NULL) { + /* + * We could not find the target process. + */ + goto errout; + } + + /* + * Locate the tracee LWP. + */ + if ((rthr = idtot(rproc, rtid)) == NULL || + (rlwp = ttolwp(rthr)) == NULL || + (rlwpd = lwptolxlwp(rlwp)) == NULL || + !VISIBLE(rlwpd)) { + /* + * The LWP could not be found, was not branded, or is not + * visible to ptrace(2) at this time. + */ + goto unlock_errout; + } + + /* + * We now hold the lock on the tracee. Attempt to install ourselves + * as the tracer. + */ + if (curproc != rproc && priv_proc_cred_perm(curproc->p_cred, rproc, + NULL, VWRITE) != 0) { + /* + * This process does not have permission to trace the remote + * process. + */ + error = EPERM; + } else if (rlwpd->br_ptrace_tracer != NULL) { + /* + * This LWP is already being traced. + */ + VERIFY(list_link_active(&rlwpd->br_ptrace_linkage)); + VERIFY(rlwpd->br_ptrace_attach != LX_PTA_NONE); + error = EPERM; + } else { + lx_proc_data_t *rprocd; + + /* + * Bond the tracee to the accord. + */ + VERIFY0(rlwpd->br_ptrace_flags & LX_PTRACE_EXITING); + VERIFY(rlwpd->br_ptrace_attach == LX_PTA_NONE); + rlwpd->br_ptrace_attach = LX_PTA_ATTACH; + rlwpd->br_ptrace_tracer = accord; + + /* + * We had no tracer, and are thus not in the tracees list. + * It is safe to take the tracee list lock while we insert + * ourselves. + */ + mutex_enter(&accord->lxpa_tracees_lock); + VERIFY(!list_link_active(&rlwpd->br_ptrace_linkage)); + list_insert_tail(&accord->lxpa_tracees, rlwpd); + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Send a thread-directed SIGSTOP. + */ + sigtoproc(rproc, rthr, SIGSTOP); + + /* + * Set the in-kernel process-wide ptrace(2) enable flag. + */ + rprocd = ttolxproc(rthr); + rprocd->l_ptrace = 1; + + error = 0; + } + +unlock_errout: + /* + * Unlock the process containing the tracee LWP and the accord. + */ + sprunlock(rproc); + +errout: + if (error != 0) { + /* + * The attach was not successful. Remove our speculative + * hold. 
+		 */
+		lx_ptrace_accord_enter(accord);
+		lx_ptrace_accord_rele(accord);
+		lx_ptrace_accord_exit(accord);
+	}
+
+	return (error);
+}
+
+int
+lx_ptrace_set_clone_inherit(int option, boolean_t inherit_flag)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	proc_t *p = lwptoproc(lwp);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+
+	mutex_enter(&p->p_lock);
+
+	switch (option) {
+	case LX_PTRACE_O_TRACEFORK:
+	case LX_PTRACE_O_TRACEVFORK:
+	case LX_PTRACE_O_TRACECLONE:
+		lwpd->br_ptrace_clone_option = option;
+		break;

+	default:
+		mutex_exit(&p->p_lock);
+		return (EINVAL);
+	}
+
+	if (inherit_flag) {
+		lwpd->br_ptrace_flags |= LX_PTRACE_INHERIT;
+	} else {
+		lwpd->br_ptrace_flags &= ~LX_PTRACE_INHERIT;
+	}
+
+	mutex_exit(&p->p_lock);
+	return (0);
+}
+
+/*
+ * If the parent LWP is being traced, we want to attach ourselves to the
+ * same accord.
+ */
+void
+lx_ptrace_inherit_tracer(lx_lwp_data_t *src, lx_lwp_data_t *dst)
+{
+	proc_t *srcp = lwptoproc(src->br_lwp);
+	proc_t *dstp = lwptoproc(dst->br_lwp);
+	lx_ptrace_accord_t *accord;
+	boolean_t unlock = B_FALSE;
+
+	if (srcp == dstp) {
+		/*
+		 * This is syslwp_create(), so the process p_lock is already
+		 * held.
+		 */
+		VERIFY(MUTEX_HELD(&srcp->p_lock));
+	} else {
+		unlock = B_TRUE;
+		mutex_enter(&srcp->p_lock);
+	}
+
+	if ((accord = src->br_ptrace_tracer) == NULL) {
+		/*
+		 * The source LWP does not have a tracer to inherit.
+		 */
+		goto out;
+	}
+
+	/*
+	 * There are two conditions to check when determining if the new
+	 * child should inherit the same tracer (and tracing options) as its
+	 * parent. Either condition is sufficient to trigger inheritance.
+	 */
+	dst->br_ptrace_attach = LX_PTA_NONE;
+	if ((src->br_ptrace_options & src->br_ptrace_clone_option) != 0) {
+		/*
+		 * Condition 1:
+		 * The clone(2), fork(2) and vfork(2) emulated system calls
+		 * populate "br_ptrace_clone_option" with the specific
+		 * ptrace(2) SETOPTIONS option that applies to this
+		 * operation. If the relevant option has been enabled by the
+		 * tracer then we inherit.
+		 */
+		dst->br_ptrace_attach |= LX_PTA_INHERIT_OPTIONS;
+
+	} else if ((src->br_ptrace_flags & LX_PTRACE_INHERIT) != 0) {
+		/*
+		 * Condition 2:
+		 * If the caller opted in to inheritance with the
+		 * CLONE_PTRACE flag to clone(2), the LX_PTRACE_INHERIT flag
+		 * will be set and we inherit.
+		 */
+		dst->br_ptrace_attach |= LX_PTA_INHERIT_CLONE;
+	}
+
+	/*
+	 * These values only apply for the duration of a single clone(2)
+	 * (or similar) system call.
+	 */
+	src->br_ptrace_flags &= ~LX_PTRACE_INHERIT;
+	src->br_ptrace_clone_option = 0;
+
+	if (dst->br_ptrace_attach == LX_PTA_NONE) {
+		/*
+		 * No condition triggered inheritance.
+		 */
+		goto out;
+	}
+
+	/*
+	 * Set the LX_PTRACE_CLONING flag to prevent us from being detached
+	 * while our p_lock is dropped.
+	 */
+	src->br_ptrace_flags |= LX_PTRACE_CLONING;
+	mutex_exit(&srcp->p_lock);
+
+	/*
+	 * Hold the accord for the new LWP.
+	 */
+	lx_ptrace_accord_enter(accord);
+	lx_ptrace_accord_hold(accord);
+	lx_ptrace_accord_exit(accord);
+
+	/*
+	 * Install the tracer and copy the current PTRACE_SETOPTIONS options.
+	 */
+	dst->br_ptrace_tracer = accord;
+	dst->br_ptrace_options = src->br_ptrace_options;
+
+	/*
+	 * This flag prevents waitid() from seeing events for the new child
+	 * until the parent is able to post the relevant ptrace event to
+	 * the tracer.
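+	 *
+	 * Editorial aside -- a hedged sketch, not part of the original
+	 * change: this preserves the ordering a Linux tracer relies on
+	 * with, e.g., PTRACE_O_TRACEFORK. The tracer first sees the
+	 * parent stop such that
+	 *
+	 *	status >> 8 == (SIGTRAP | (PTRACE_EVENT_FORK << 8))
+	 *
+	 * fetches the new child's pid with PTRACE_GETEVENTMSG, and only
+	 * then is the child's own initial stop made visible to waitid().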
+ */ + dst->br_ptrace_flags |= LX_PTRACE_PARENT_WAIT; + + mutex_enter(&accord->lxpa_tracees_lock); + VERIFY(list_link_active(&src->br_ptrace_linkage)); + VERIFY(!list_link_active(&dst->br_ptrace_linkage)); + list_insert_tail(&accord->lxpa_tracees, dst); + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * Relock our process and clear our busy flag. + */ + mutex_enter(&srcp->p_lock); + src->br_ptrace_flags &= ~LX_PTRACE_CLONING; + + /* + * If lx_ptrace_exit_tracer() is trying to detach our tracer, it will + * be sleeping on this CV until LX_PTRACE_CLONING is clear. Wake it + * now. + */ + cv_broadcast(&lx_ptrace_busy_cv); + +out: + if (unlock) { + mutex_exit(&srcp->p_lock); + } +} + +static int +lx_ptrace_traceme(void) +{ + int error; + boolean_t did_attach = B_FALSE; + /* + * Our (Tracee) LWP: + */ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + /* + * Remote (Tracer) LWP: + */ + lx_ptrace_accord_t *accord; + + /* + * We are intending to be the tracee. Fetch (or allocate) the accord + * for our parent LWP. + */ + if ((error = lx_ptrace_accord_get_by_pid(lx_lwp_ppid(lwp, NULL, + NULL), &accord)) != 0) { + /* + * Could not determine the Linux pid of the parent LWP, or + * could not get the accord for that LWP. + */ + return (error); + } + + /* + * We now hold the accord lock. + */ + if (accord->lxpa_flags & LX_ACC_TOMBSTONE) { + /* + * The accord is marked for death; give up now. + */ + lx_ptrace_accord_exit(accord); + return (ESRCH); + } + + /* + * Bump the reference count so that the accord is not freed. We need + * to drop the accord lock before we take our own p_lock. + */ + lx_ptrace_accord_hold(accord); + lx_ptrace_accord_exit(accord); + + /* + * We now lock _our_ process and determine if we can install our parent + * as our tracer. + */ + mutex_enter(&p->p_lock); + if (lwpd->br_ptrace_tracer != NULL) { + /* + * This LWP is already being traced. + */ + VERIFY(lwpd->br_ptrace_attach != LX_PTA_NONE); + error = EPERM; + } else { + /* + * Bond ourselves to the accord. We already bumped the accord + * reference count. + */ + VERIFY(lwpd->br_ptrace_attach == LX_PTA_NONE); + lwpd->br_ptrace_attach = LX_PTA_TRACEME; + lwpd->br_ptrace_tracer = accord; + did_attach = B_TRUE; + error = 0; + } + mutex_exit(&p->p_lock); + + /* + * Lock the accord tracee list and add this LWP. Once we are in the + * tracee list, it is the responsibility of the tracer to detach us. + */ + if (error == 0) { + lx_ptrace_accord_enter(accord); + mutex_enter(&accord->lxpa_tracees_lock); + + if (!(accord->lxpa_flags & LX_ACC_TOMBSTONE)) { + lx_proc_data_t *procd = ttolxproc(curthread); + + /* + * Put ourselves in the tracee list for this accord. + */ + VERIFY(!list_link_active(&lwpd->br_ptrace_linkage)); + list_insert_tail(&accord->lxpa_tracees, lwpd); + mutex_exit(&accord->lxpa_tracees_lock); + lx_ptrace_accord_exit(accord); + + /* + * Set the in-kernel process-wide ptrace(2) enable + * flag. + */ + procd->l_ptrace = 1; + + return (0); + } + mutex_exit(&accord->lxpa_tracees_lock); + + /* + * The accord has been marked for death. We must + * untrace ourselves. + */ + error = ESRCH; + lx_ptrace_accord_exit(accord); + } + + /* + * Our optimism was unjustified: We were unable to attach. We need to + * lock the process containing this LWP again in order to remove the + * tracer. 
+ */ + VERIFY(error != 0); + mutex_enter(&p->p_lock); + if (did_attach) { + /* + * Verify that things were as we left them: + */ + VERIFY(!list_link_active(&lwpd->br_ptrace_linkage)); + VERIFY(lwpd->br_ptrace_tracer == accord); + + lwpd->br_ptrace_attach = LX_PTA_NONE; + lwpd->br_ptrace_tracer = NULL; + } + mutex_exit(&p->p_lock); + + /* + * Remove our speculative hold on the accord, possibly causing it to be + * freed in the process. + */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + + return (error); +} + +static boolean_t +lx_ptrace_stop_common(proc_t *p, lx_lwp_data_t *lwpd, ushort_t what) +{ + boolean_t reset_nostop = B_FALSE; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * Mark this LWP as stopping and call stop() to enter "ptrace-stop". + */ + VERIFY0(lwpd->br_ptrace_flags & LX_PTRACE_STOPPING); + lwpd->br_ptrace_flags |= LX_PTRACE_STOPPING; + + if (lwpd->br_lwp->lwp_nostop == 1 && + lwpd->br_ptrace_event == LX_PTRACE_EVENT_EXEC) { + /* We need to clear this to get the signal delivered. */ + lwpd->br_lwp->lwp_nostop = 0; + reset_nostop = B_TRUE; + } + + stop(PR_BRAND, what); + + if (reset_nostop) { + VERIFY(lwpd->br_lwp->lwp_nostop == 0); + lwpd->br_lwp->lwp_nostop = 1; + } + + /* + * We are back from "ptrace-stop" with our process lock held. + */ + lwpd->br_ptrace_flags &= ~(LX_PTRACE_STOPPING | LX_PTRACE_STOPPED | + LX_PTRACE_CLDPEND); + lwpd->br_ptrace_stopucp = NULL; + cv_broadcast(&lx_ptrace_busy_cv); + mutex_exit(&p->p_lock); + + return (B_TRUE); +} + +int +lx_ptrace_stop_for_option(int option, boolean_t child, ulong_t msg, + uintptr_t ucp) +{ + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + mutex_enter(&p->p_lock); + if (lwpd->br_ptrace_tracer == NULL) { + mutex_exit(&p->p_lock); + return (ESRCH); + } + + if (!child) { + /* + * Only the first event posted by a new process is to be held + * until the matching parent event is dispatched, and only if + * it is a "child" event. This is not a child event, so we + * clear the wait flag. + */ + lwpd->br_ptrace_flags &= ~LX_PTRACE_PARENT_WAIT; + } + + if (!(lwpd->br_ptrace_options & option)) { + if (option == LX_PTRACE_O_TRACEEXEC) { + /* + * Without PTRACE_O_TRACEEXEC, the Linux kernel will + * send SIGTRAP to the process. + */ + sigtoproc(p, t, SIGTRAP); + mutex_exit(&p->p_lock); + return (0); + } + + /* + * The flag for this trace event is not enabled, so we will not + * stop. + */ + mutex_exit(&p->p_lock); + return (ESRCH); + } + + if (child) { + switch (option) { + case LX_PTRACE_O_TRACECLONE: + case LX_PTRACE_O_TRACEFORK: + case LX_PTRACE_O_TRACEVFORK: + /* + * Send the child LWP a directed SIGSTOP. 
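+			 *
+			 * Editorial aside -- a hedged sketch, not part of
+			 * the original change: the tracer, having learned
+			 * the new pid via PTRACE_GETEVENTMSG, observes this
+			 * stop with something like
+			 *
+			 *	waitpid(child, &status, 0);
+			 *	assert(WIFSTOPPED(status) &&
+			 *	    WSTOPSIG(status) == SIGSTOP);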
+ */ + sigtoproc(p, t, SIGSTOP); + mutex_exit(&p->p_lock); + return (0); + default: + goto nostop; + } + } + + lwpd->br_ptrace_eventmsg = msg; + + switch (option) { + case LX_PTRACE_O_TRACECLONE: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_CLONE; + break; + case LX_PTRACE_O_TRACEEXEC: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXEC; + lwpd->br_ptrace_eventmsg = 0; + break; + case LX_PTRACE_O_TRACEEXIT: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_EXIT; + break; + case LX_PTRACE_O_TRACEFORK: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_FORK; + break; + case LX_PTRACE_O_TRACEVFORK: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK; + break; + case LX_PTRACE_O_TRACEVFORKDONE: + lwpd->br_ptrace_event = LX_PTRACE_EVENT_VFORK_DONE; + lwpd->br_ptrace_eventmsg = 0; + break; + default: + goto nostop; + } + + /* + * Userland may have passed in a ucontext_t pointer for + * PTRACE_GETREGS/PTRACE_SETREGS usage while stopped. + */ + lwpd->br_ptrace_stopucp = ucp; + + /* + * p_lock for the process containing the tracee will be dropped by + * lx_ptrace_stop_common(). + */ + return (lx_ptrace_stop_common(p, lwpd, LX_PR_EVENT) ? 0 : ESRCH); + +nostop: + lwpd->br_ptrace_event = 0; + lwpd->br_ptrace_eventmsg = 0; + mutex_exit(&p->p_lock); + return (ESRCH); +} + +boolean_t +lx_ptrace_stop(ushort_t what) +{ + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + + VERIFY(what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT || + what == LX_PR_SIGNALLED); + + /* + * If we do not have an accord, bail out early. + */ + if (lwpd->br_ptrace_tracer == NULL) + return (B_FALSE); + + /* + * Lock this process and re-check the condition. + */ + mutex_enter(&p->p_lock); + if (lwpd->br_ptrace_tracer == NULL) { + VERIFY0(lwpd->br_ptrace_flags & LX_PTRACE_SYSCALL); + mutex_exit(&p->p_lock); + return (B_FALSE); + } + + if (what == LX_PR_SYSENTRY || what == LX_PR_SYSEXIT) { + /* + * This is a syscall-entry-stop or syscall-exit-stop point. + */ + if (!(lwpd->br_ptrace_flags & LX_PTRACE_SYSCALL)) { + /* + * A system call stop has not been requested. + */ + mutex_exit(&p->p_lock); + return (B_FALSE); + } + + /* + * The PTRACE_SYSCALL restart command applies only to the next + * system call entry or exit. The tracer must restart us with + * PTRACE_SYSCALL while we are in ptrace-stop for us to fire + * again at the next system call boundary. + */ + lwpd->br_ptrace_flags &= ~LX_PTRACE_SYSCALL; + } + + /* + * p_lock for the process containing the tracee will be dropped by + * lx_ptrace_stop_common(). + */ + return (lx_ptrace_stop_common(p, lwpd, what)); +} + +int +lx_ptrace_issig_stop(proc_t *p, klwp_t *lwp) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + int lx_sig; + + VERIFY(MUTEX_HELD(&p->p_lock)); + + /* + * If we do not have an accord, bail out now. Additionally, if there + * is no valid signal then we have no reason to stop. + */ + if (lwpd->br_ptrace_tracer == NULL || lwp->lwp_cursig == SIGKILL || + (lwp->lwp_cursig == 0 || lwp->lwp_cursig > NSIG) || + (lx_sig = stol_signo[lwp->lwp_cursig]) < 1) { + return (0); + } + + /* + * We stash the signal on the LWP where our waitid_helper will find it + * and enter the ptrace "signal-delivery-stop" condition. + */ + lwpd->br_ptrace_stopsig = lx_sig; + (void) lx_ptrace_stop_common(p, lwpd, LX_PR_SIGNALLED); + mutex_enter(&p->p_lock); + + /* + * When we return, the signal may have been altered or suppressed. 
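+	 *
+	 * Editorial aside -- a hedged sketch, not part of the original
+	 * change: the tracer effects this through the "data" argument of
+	 * its restarting request, e.g.
+	 *
+	 *	ptrace(PTRACE_CONT, pid, NULL, 0);	   (suppress delivery)
+	 *	ptrace(PTRACE_CONT, pid, NULL, SIGUSR1);   (swap in SIGUSR1)
+	 *
+	 * which lands in lx_ptrace_cont() and is written to
+	 * br_ptrace_stopsig before we are restarted.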
+ */ + if (lwpd->br_ptrace_stopsig != lx_sig) { + int native_sig; + lx_sig = lwpd->br_ptrace_stopsig; + + if (lx_sig >= LX_NSIG) { + lx_sig = 0; + } + + /* + * Translate signal from Linux signal number back to + * an illumos native signal. + */ + if (lx_sig >= LX_NSIG || lx_sig < 0 || (native_sig = + ltos_signo[lx_sig]) < 1) { + /* + * The signal is not deliverable. + */ + lwp->lwp_cursig = 0; + lwp->lwp_extsig = 0; + if (lwp->lwp_curinfo) { + siginfofree(lwp->lwp_curinfo); + lwp->lwp_curinfo = NULL; + } + } else { + /* + * Alter the currently dispatching signal. + */ + if (native_sig == SIGKILL) { + /* + * We mark ourselves the victim and request + * a restart of signal processing. + */ + p->p_flag |= SKILLED; + p->p_flag &= ~SEXTKILLED; + return (-1); + } + lwp->lwp_cursig = native_sig; + lwp->lwp_extsig = 0; + if (lwp->lwp_curinfo != NULL) { + lwp->lwp_curinfo->sq_info.si_signo = native_sig; + } + } + } + + lwpd->br_ptrace_stopsig = 0; + return (0); +} + +boolean_t +lx_ptrace_sig_ignorable(proc_t *p, int sig) +{ + lx_proc_data_t *lxpd = ptolxproc(p); + + if (lxpd->l_ptrace != 0 && lx_stol_signo(sig, 0) != 0) { + /* + * In order to preserve proper ptrace behavior when it comes to + * signal handling, it is unacceptable to ignore any signals. + * Doing so would bypass the logic in lx_ptrace_issig_stop. + */ + return (B_FALSE); + } + return (B_TRUE); +} + +static void +lx_ptrace_exit_tracer(proc_t *p, lx_lwp_data_t *lwpd, + lx_ptrace_accord_t *accord) +{ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + lx_ptrace_accord_enter(accord); + /* + * Mark this accord for death. This means no new tracees can be + * attached to this accord. + */ + VERIFY0(accord->lxpa_flags & LX_ACC_TOMBSTONE); + accord->lxpa_flags |= LX_ACC_TOMBSTONE; + lx_ptrace_accord_exit(accord); + + /* + * Walk the list of tracees, detaching them and setting them runnable + * if they are stopped. + */ + for (;;) { + klwp_t *rlwp; + proc_t *rproc; + lx_lwp_data_t *remote; + kmutex_t *rmp; + + mutex_enter(&accord->lxpa_tracees_lock); + if (list_is_empty(&accord->lxpa_tracees)) { + mutex_exit(&accord->lxpa_tracees_lock); + break; + } + + /* + * Fetch the first tracee LWP in the list and lock the process + * which contains it. + */ + remote = list_head(&accord->lxpa_tracees); + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + /* + * The p_lock mutex persists beyond the life of the process + * itself. We save the address, here, to prevent the need to + * dereference the proc_t after awaking from sleep. + */ + rmp = &rproc->p_lock; + mutex_enter(rmp); + + if (TRACEE_BUSY(remote)) { + /* + * This LWP is currently detaching itself on exit, or + * mid-way through stop(). We must wait for this + * action to be completed. While we wait on the CV, we + * must drop the accord tracee list lock. + */ + mutex_exit(&accord->lxpa_tracees_lock); + cv_wait(&lx_ptrace_busy_cv, rmp); + + /* + * While we were waiting, some state may have changed. + * Restart the walk to be sure we don't miss anything. + */ + mutex_exit(rmp); + continue; + } + + /* + * We now hold p_lock on the process. Remove the tracee from + * the list. + */ + VERIFY(list_link_active(&remote->br_ptrace_linkage)); + list_remove(&accord->lxpa_tracees, remote); + + /* + * Unlink the accord and clear our trace flags. + */ + remote->br_ptrace_attach = LX_PTA_NONE; + remote->br_ptrace_tracer = NULL; + remote->br_ptrace_flags = 0; + + /* + * Let go of the list lock before we restart the LWP. 
We must
+	 * not hold any locks other than the process p_lock when
+	 * we call lx_ptrace_restart_lwp() as it will thread_lock
+	 * the tracee.
+	 */
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	/*
+	 * Ensure that the LWP is not stopped on our account.
+	 */
+	lx_ptrace_restart_lwp(rlwp);
+
+	/*
+	 * Unlock the former tracee.
+	 */
+	mutex_exit(rmp);
+
+	/*
+	 * Drop the hold this tracee had on the accord.
+	 */
+	lx_ptrace_accord_enter(accord);
+	lx_ptrace_accord_rele(accord);
+	lx_ptrace_accord_exit(accord);
+	}
+
+	mutex_enter(&p->p_lock);
+	lwpd->br_ptrace_accord = NULL;
+	mutex_exit(&p->p_lock);
+
+	/*
+	 * Clean up and release our hold on the accord. If we completely
+	 * detached all tracee LWPs, this will free the accord. Otherwise, it
+	 * will be freed when they complete their cleanup.
+	 *
+	 * We hold "pidlock" while clearing these members for easy exclusion of
+	 * waitid(), etc.
+	 */
+	mutex_enter(&pidlock);
+	lx_ptrace_accord_enter(accord);
+	accord->lxpa_cvp = NULL;
+	accord->lxpa_tracer = NULL;
+	mutex_exit(&pidlock);
+	lx_ptrace_accord_rele(accord);
+	lx_ptrace_accord_exit(accord);
+}
+
+static void
+lx_ptrace_exit_tracee(proc_t *p, lx_lwp_data_t *lwpd,
+    lx_ptrace_accord_t *accord)
+{
+	VERIFY(MUTEX_NOT_HELD(&p->p_lock));
+
+	/*
+	 * We are the tracee LWP. Lock the accord tracee list and then our
+	 * containing process.
+	 */
+	mutex_enter(&accord->lxpa_tracees_lock);
+	mutex_enter(&p->p_lock);
+
+	/*
+	 * Remove our reference to the accord. We will release our hold
+	 * later.
+	 */
+	VERIFY(lwpd->br_ptrace_tracer == accord);
+	lwpd->br_ptrace_attach = LX_PTA_NONE;
+	lwpd->br_ptrace_tracer = NULL;
+
+	/*
+	 * Remove this LWP from the accord tracee list:
+	 */
+	VERIFY(list_link_active(&lwpd->br_ptrace_linkage));
+	list_remove(&accord->lxpa_tracees, lwpd);
+
+	/*
+	 * Wake up any tracers waiting for us to detach from the accord.
+	 */
+	cv_broadcast(&lx_ptrace_busy_cv);
+	mutex_exit(&p->p_lock);
+	mutex_exit(&accord->lxpa_tracees_lock);
+
+	/*
+	 * Grab "pidlock" and wake the tracer if it is blocked in waitid().
+	 */
+	mutex_enter(&pidlock);
+	if (accord->lxpa_cvp != NULL) {
+		cv_broadcast(accord->lxpa_cvp);
+	}
+	mutex_exit(&pidlock);
+
+	/*
+	 * Release our hold on the accord.
+	 */
+	lx_ptrace_accord_enter(accord);
+	lx_ptrace_accord_rele(accord);
+	lx_ptrace_accord_exit(accord);
+}
+
+/*
+ * This routine is called from lx_exitlwp() when an LWP is ready to exit. If
+ * this LWP is being traced, it will be detached from the tracer's accord. The
+ * routine will also detach any LWPs being traced by this LWP.
+ */
+void
+lx_ptrace_exit(proc_t *p, klwp_t *lwp)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	lx_ptrace_accord_t *accord;
+
+	VERIFY(MUTEX_HELD(&p->p_lock));
+
+	/*
+	 * Mark our LWP as exiting from a ptrace perspective. This will
+	 * prevent a new accord from being allocated if one does not exist
+	 * already, and will make us invisible to PTRACE_ATTACH/PTRACE_TRACEME.
+	 */
+	VERIFY0(lwpd->br_ptrace_flags & LX_PTRACE_EXITING);
+	lwpd->br_ptrace_flags |= LX_PTRACE_EXITING;
+
+	if ((accord = lwpd->br_ptrace_tracer) != NULL) {
+		/*
+		 * We are traced by another LWP and must detach ourselves.
+		 */
+		mutex_exit(&p->p_lock);
+		lx_ptrace_exit_tracee(p, lwpd, accord);
+		mutex_enter(&p->p_lock);
+	}
+
+	if ((accord = lwpd->br_ptrace_accord) != NULL) {
+		/*
+		 * We have been tracing other LWPs, and must detach from
+		 * them and clean up our accord.
+ */ + mutex_exit(&p->p_lock); + lx_ptrace_exit_tracer(p, lwpd, accord); + mutex_enter(&p->p_lock); + } +} + +/* + * Called when a SIGCLD signal is dispatched so that we may enqueue another. + * Return 0 if we enqueued a signal, or -1 if not. + */ +int +lx_sigcld_repost(proc_t *pp, sigqueue_t *sqp) +{ + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + lx_ptrace_accord_t *accord; + lx_lwp_data_t *remote; + klwp_t *rlwp; + proc_t *rproc; + boolean_t found = B_FALSE; + + VERIFY(MUTEX_HELD(&pidlock)); + VERIFY(MUTEX_NOT_HELD(&pp->p_lock)); + VERIFY(lwptoproc(lwp) == pp); + + mutex_enter(&pp->p_lock); + if ((accord = lwpd->br_ptrace_accord) == NULL) { + /* + * This LWP is not a tracer LWP, so there will be no + * SIGCLD. + */ + mutex_exit(&pp->p_lock); + return (-1); + } + mutex_exit(&pp->p_lock); + + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + /* + * Check if this LWP is in "ptrace-stop". If in the correct + * stop condition, lock the process containing the tracee LWP. + */ + if (lx_ptrace_lock_if_stopped(accord, remote) != 0) { + continue; + } + + if (remote->br_ptrace_flags & LX_PTRACE_PARENT_WAIT) { + /* + * This event depends on waitid() clearing out the + * event of another LWP. Skip it for now. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + if (!(remote->br_ptrace_flags & LX_PTRACE_CLDPEND)) { + /* + * No SIGCLD is required for this LWP. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + if (!(remote->br_ptrace_flags & LX_PTRACE_WAITPEND) || + remote->br_ptrace_whystop == 0 || + remote->br_ptrace_whatstop == 0) { + /* + * No (new) stop reason to post for this LWP. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + /* + * We found a process of interest. Leave the process + * containing the tracee LWP locked and break out of the loop. + */ + found = B_TRUE; + break; + } + mutex_exit(&accord->lxpa_tracees_lock); + + if (!found) { + return (-1); + } + + /* + * Generate siginfo for this tracee LWP. + */ + lx_winfo(remote, &sqp->sq_info, B_FALSE, NULL, NULL); + remote->br_ptrace_flags &= ~LX_PTRACE_CLDPEND; + mutex_exit(&rproc->p_lock); + + mutex_enter(&pp->p_lock); + if (sigismember(&pp->p_sig, SIGCLD)) { + mutex_exit(&pp->p_lock); + + mutex_enter(&rproc->p_lock); + remote->br_ptrace_flags |= LX_PTRACE_CLDPEND; + mutex_exit(&rproc->p_lock); + + return (-1); + } + sigaddqa(pp, curthread, sqp); + mutex_exit(&pp->p_lock); + + return (0); +} + +/* + * Consume the next available ptrace(2) event queued against the accord for + * this LWP. The event will be emitted as if through waitid(), and converted + * by lx_waitpid() and friends before the return to usermode. + */ +int +lx_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip, int options, + boolean_t *brand_wants_wait, int *rval) +{ + lx_ptrace_accord_t *accord; + klwp_t *lwp = ttolwp(curthread); + proc_t *p = lwptoproc(lwp); + lx_lwp_data_t *local = lwptolxlwp(lwp); + lx_lwp_data_t *remote; + boolean_t found = B_FALSE; + klwp_t *rlwp = NULL; + proc_t *rproc = NULL; + pid_t event_pid = 0, event_ppid = 0; + boolean_t waitflag = !(options & WNOWAIT); + + VERIFY(MUTEX_HELD(&pidlock)); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + + /* + * By default, we do not expect waitid() to block on our account. + */ + *brand_wants_wait = B_FALSE; + + if (!local->br_waitid_emulate) { + /* + * This waitid() call is not expecting emulated results. 
+ */ + return (-1); + } + + switch (idtype) { + case P_ALL: + case P_PID: + case P_PGID: + break; + default: + /* + * This idtype has no power here. + */ + return (-1); + } + + if (lx_ptrace_accord_get(&accord, B_FALSE) != 0) { + /* + * This LWP does not have an accord; it cannot be tracing. + */ + return (-1); + } + + /* + * We do not need an additional hold on the accord as it belongs to + * the running, tracer, LWP. + */ + lx_ptrace_accord_exit(accord); + + mutex_enter(&accord->lxpa_tracees_lock); + if (list_is_empty(&accord->lxpa_tracees)) { + /* + * Though it has an accord, there are currently no tracees in + * the list for this LWP. + */ + mutex_exit(&accord->lxpa_tracees_lock); + return (-1); + } + + /* + * Walk the list of tracees and determine if any of them have events to + * report. + */ + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + /* + * If the __WALL option was passed, we unconditionally consider + * every possible child. + */ + if (!(local->br_waitid_flags & LX_WALL)) { + /* + * Otherwise, we check to see if this LWP matches an + * id we are waiting for. + */ + switch (idtype) { + case P_ALL: + break; + case P_PID: + if (remote->br_pid != id) + continue; + break; + case P_PGID: + if (rproc->p_pgrp != id) + continue; + break; + default: + cmn_err(CE_PANIC, "unexpected idtype: %d", + idtype); + } + } + + /* + * Check if this LWP is in "ptrace-stop". If in the correct + * stop condition, lock the process containing the tracee LWP. + */ + if (lx_ptrace_lock_if_stopped(accord, remote) != 0) { + continue; + } + + if (remote->br_ptrace_flags & LX_PTRACE_PARENT_WAIT) { + /* + * This event depends on waitid() clearing out the + * event of another LWP. Skip it for now. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + if (!(remote->br_ptrace_flags & LX_PTRACE_WAITPEND) || + remote->br_ptrace_whystop == 0 || + remote->br_ptrace_whatstop == 0) { + /* + * No (new) stop reason to post for this LWP. + */ + mutex_exit(&rproc->p_lock); + continue; + } + + /* + * We found a process of interest. Leave the process + * containing the tracee LWP locked and break out of the loop. + */ + found = B_TRUE; + break; + } + mutex_exit(&accord->lxpa_tracees_lock); + + if (!found) { + /* + * There were no events of interest, but we have tracees. + * Signal to waitid() that it should block if the provided + * flags allow for it. + */ + *brand_wants_wait = B_TRUE; + return (-1); + } + + /* + * Populate the signal information. + */ + lx_winfo(remote, ip, waitflag, &event_ppid, &event_pid); + + /* + * Unlock the tracee. + */ + mutex_exit(&rproc->p_lock); + + if (event_pid != 0 && event_ppid != 0) { + /* + * We need to do another pass around the tracee list and + * unblock any events that have a "happens after" relationship + * with this event. + */ + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + mutex_enter(&rproc->p_lock); + + if (remote->br_pid != event_pid || + remote->br_ppid != event_ppid) { + mutex_exit(&rproc->p_lock); + continue; + } + + remote->br_ptrace_flags &= ~LX_PTRACE_PARENT_WAIT; + + mutex_exit(&rproc->p_lock); + } + mutex_exit(&accord->lxpa_tracees_lock); + } + + /* + * If we are consuming this wait state, we remove the SIGCLD from + * the queue and post another. 
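+	 *
+	 * Editorial aside -- a hedged sketch, not part of the original
+	 * change: "consuming" means the caller did not pass WNOWAIT. A
+	 * tracer may peek at an event without consuming it, e.g.
+	 *
+	 *	siginfo_t si;
+	 *	waitid(P_ALL, 0, &si, WEXITED | WSTOPPED | WNOWAIT);
+	 *
+	 * in which case lx_winfo() leaves the pending flags set and the
+	 * queued SIGCLD stays in place for a later call to collect.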
+ */ + if (waitflag) { + mutex_exit(&pidlock); + sigcld_delete(ip); + sigcld_repost(); + mutex_enter(&pidlock); + } + + *rval = 0; + return (0); +} + +/* + * Some PTRACE_* requests are handled in-kernel by this function. It is called + * through brandsys() via the B_PTRACE_KERNEL subcommand. + */ +int +lx_ptrace_kernel(int ptrace_op, pid_t lxpid, uintptr_t addr, uintptr_t data) +{ + lx_lwp_data_t *local = ttolxlwp(curthread); + lx_ptrace_accord_t *accord; + lx_lwp_data_t *remote; + klwp_t *rlwp; + proc_t *rproc; + int error; + boolean_t found = B_FALSE; + boolean_t release_hold = B_FALSE; + + _NOTE(ARGUNUSED(addr)); + + /* + * These actions do not require the target LWP to be traced or stopped. + */ + switch (ptrace_op) { + case LX_PTRACE_TRACEME: + return (lx_ptrace_traceme()); + + case LX_PTRACE_ATTACH: + return (lx_ptrace_attach(lxpid)); + } + + /* + * Ensure that we have an accord and obtain a lock on it. This routine + * should not fail because the LWP cannot make ptrace(2) system calls + * after it has begun exiting. + */ + VERIFY0(local->br_ptrace_flags & LX_PTRACE_EXITING); + VERIFY(lx_ptrace_accord_get(&accord, B_TRUE) == 0); + + /* + * The accord belongs to this (the tracer) LWP, and we have a hold on + * it. We drop the lock so that we can take other locks. + */ + lx_ptrace_accord_exit(accord); + + /* + * Does the tracee list contain the pid in question? + */ + mutex_enter(&accord->lxpa_tracees_lock); + for (remote = list_head(&accord->lxpa_tracees); remote != NULL; + remote = list_next(&accord->lxpa_tracees, remote)) { + if (remote->br_pid == lxpid) { + found = B_TRUE; + break; + } + } + if (!found) { + /* + * The requested pid does not appear in the tracee list. + */ + mutex_exit(&accord->lxpa_tracees_lock); + return (ESRCH); + } + + /* + * Attempt to lock the target LWP. + */ + if ((error = lx_ptrace_lock_if_stopped(accord, remote)) != 0) { + /* + * The LWP was not in "ptrace-stop". + */ + mutex_exit(&accord->lxpa_tracees_lock); + return (error); + } + + /* + * The target LWP is in "ptrace-stop". We have the containing process + * locked. + */ + rlwp = remote->br_lwp; + rproc = lwptoproc(rlwp); + + /* + * Process the ptrace(2) request: + */ + switch (ptrace_op) { + case LX_PTRACE_DETACH: + error = lx_ptrace_detach(accord, remote, (int)data, + &release_hold); + break; + + case LX_PTRACE_CONT: + error = lx_ptrace_cont(remote, LX_PTC_NONE, (int)data); + break; + + case LX_PTRACE_SYSCALL: + error = lx_ptrace_cont(remote, LX_PTC_SYSCALL, (int)data); + break; + + case LX_PTRACE_SINGLESTEP: + error = lx_ptrace_cont(remote, LX_PTC_SINGLESTEP, (int)data); + break; + + case LX_PTRACE_SETOPTIONS: + error = lx_ptrace_setoptions(remote, data); + break; + + case LX_PTRACE_GETEVENTMSG: + error = lx_ptrace_geteventmsg(remote, (void *)data); + break; + + case LX_PTRACE_GETREGS: + error = lx_ptrace_getregs(remote, (void *)data); + break; + + case LX_PTRACE_SETREGS: + error = lx_ptrace_setregs(remote, (void *)data); + break; + + default: + error = EINVAL; + } + + /* + * Drop the lock on both the tracee process and the tracee list. + */ + mutex_exit(&rproc->p_lock); + mutex_exit(&accord->lxpa_tracees_lock); + + if (release_hold) { + /* + * Release a hold from the accord. 
+ */ + lx_ptrace_accord_enter(accord); + lx_ptrace_accord_rele(accord); + lx_ptrace_accord_exit(accord); + } + + return (error); +} + +void +lx_ptrace_init(void) +{ + cv_init(&lx_ptrace_busy_cv, NULL, CV_DEFAULT, NULL); + + lx_ptrace_accord_cache = kmem_cache_create("lx_ptrace_accord", + sizeof (lx_ptrace_accord_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +lx_ptrace_fini(void) +{ + cv_destroy(&lx_ptrace_busy_cv); + + kmem_cache_destroy(lx_ptrace_accord_cache); +} diff --git a/usr/src/uts/common/brand/lx/os/lx_syscall.c b/usr/src/uts/common/brand/lx/os/lx_syscall.c new file mode 100644 index 0000000000..cce902a943 --- /dev/null +++ b/usr/src/uts/common/brand/lx/os/lx_syscall.c @@ -0,0 +1,1220 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/thread.h> +#include <sys/systm.h> +#include <sys/syscall.h> +#include <sys/proc.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/model.h> +#include <sys/privregs.h> +#include <sys/brand.h> +#include <sys/machbrand.h> +#include <sys/sdt.h> +#include <sys/lx_syscalls.h> +#include <sys/lx_brand.h> +#include <sys/lx_impl.h> +#include <sys/lx_misc.h> +#include <lx_errno.h> + + +/* + * Flags for sysent entries: + */ +#define LX_SYS_NOSYS_REASON 0x07 +#define LX_SYS_EBPARG6 0x08 + +/* + * Flags that denote the specific reason we do not have a particular system + * call. These reasons are only valid if the function is NULL. + */ +#define NOSYS_USERMODE 0 +#define NOSYS_NULL 1 +#define NOSYS_NONE 2 +#define NOSYS_NO_EQUIV 3 +#define NOSYS_KERNEL 4 +#define NOSYS_UNDOC 5 +#define NOSYS_OBSOLETE 6 +#define NOSYS_MAX NOSYS_OBSOLETE + +#if NOSYS_MAX > LX_SYS_NOSYS_REASON +#error NOSYS reason codes must fit in LX_SYS_NOSYS_REASON +#endif + +/* + * Strings describing the reason we do not emulate a particular system call + * in the kernel. 
+ */ +static char *nosys_reasons[] = { + NULL, /* NOSYS_USERMODE means this call is emulated in usermode */ + "Not done yet", + "No such Linux system call", + "No equivalent illumos functionality", + "Reads/modifies Linux kernel state", + "Undocumented and/or rarely used system call", + "Unsupported, obsolete system call" +}; + + +#if defined(_LP64) +/* + * System call handler table and entry count for Linux x86_64 (amd64): + */ +lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +int lx_nsysent64; +#endif +/* + * System call handler table and entry count for Linux x86 (i386): + */ +lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +int lx_nsysent32; + +#if defined(_LP64) +struct lx_vsyscall +{ + uintptr_t lv_addr; + uintptr_t lv_scnum; +} lx_vsyscalls[] = { + { LX_VSYS_gettimeofday, LX_SYS_gettimeofday }, + { LX_VSYS_time, LX_SYS_time }, + { LX_VSYS_getcpu, LX_SYS_getcpu }, + { NULL, NULL } +}; +#endif + +#if defined(__amd64) +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) +{ + struct regs *rp = lwptoregs(lwp); + + if (get_udatamodel() == DATAMODEL_NATIVE) { + /* + * Note: Syscall argument passing is different from function + * call argument passing on amd64. For function calls, the + * fourth arg is passed via %rcx, but for system calls the 4th + * arg is passed via %r10. This is because in amd64, the + * syscall instruction puts the lower 32 bits of %rflags in + * %r11 and puts the %rip value to %rcx. + * + * Appendix A of the amd64 ABI (Linux conventions) states that + * syscalls are limited to 6 args and no arg is passed on the + * stack. + */ + args[0] = rp->r_rdi; + args[1] = rp->r_rsi; + args[2] = rp->r_rdx; + args[3] = rp->r_r10; + args[4] = rp->r_r8; + args[5] = rp->r_r9; + } else { + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. + */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + uint32_t args32[6]; + + if (copyin((void *)rp->r_rbx, &args32, + sizeof (args32)) != 0) { + /* + * Clear the argument vector so that the + * trace probe does not expose kernel + * memory. + */ + bzero(args, 6 * sizeof (uintptr_t)); + return (set_errno(EFAULT)); + } + + args[0] = args32[0]; + args[1] = args32[1]; + args[2] = args32[2]; + args[3] = args32[3]; + args[4] = args32[4]; + args[5] = args32[5]; + } else { + args[0] = rp->r_rbx; + args[1] = rp->r_rcx; + args[2] = rp->r_rdx; + args[3] = rp->r_rsi; + args[4] = rp->r_rdi; + args[5] = rp->r_rbp; + } + } + + return (0); +} + +#else /* !__amd64 */ + +static int +lx_emulate_args(klwp_t *lwp, const lx_sysent_t *s, uintptr_t *args) +{ + struct regs *rp = lwptoregs(lwp); + + /* + * If the system call takes 6 args, then libc has stashed them + * in memory at the address contained in %ebx. Except for some + * syscalls which store the 6th argument in %ebp. + */ + if (s->sy_narg == 6 && !(s->sy_flags & LX_SYS_EBPARG6)) { + if (copyin((void *)rp->r_ebx, args, 6 * sizeof (uintptr_t)) != + 0) { + /* + * Clear the argument vector so that the trace probe + * does not expose kernel memory. 
+		 */
+			bzero(args, 6 * sizeof (uintptr_t));
+			return (set_errno(EFAULT));
+		}
+	} else {
+		args[0] = rp->r_ebx;
+		args[1] = rp->r_ecx;
+		args[2] = rp->r_edx;
+		args[3] = rp->r_esi;
+		args[4] = rp->r_edi;
+		args[5] = rp->r_ebp;
+	}
+
+	return (0);
+}
+#endif
+
+int
+lx_syscall_return(klwp_t *lwp, int syscall_num, long ret)
+{
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	struct regs *rp = lwptoregs(lwp);
+	int error = lwp->lwp_errno;
+
+	if (error != EINTR) {
+		/*
+		 * If this system call was not interrupted, clear the system
+		 * call restart flag before lx_setcontext() can pass it to
+		 * usermode.
+		 */
+		lwpd->br_syscall_restart = B_FALSE;
+	}
+
+	if (error != 0) {
+		/*
+		 * Convert from illumos to Linux errno:
+		 */
+		ret = -lx_errno(error, EINVAL);
+	}
+
+	/*
+	 * 32-bit Linux system calls return via %eax; 64-bit calls return via
+	 * %rax.
+	 */
+	rp->r_r0 = ret;
+
+	/*
+	 * Hold for the ptrace(2) "syscall-exit-stop" condition if required by
+	 * PTRACE_SYSCALL. Note that the register state may be modified by
+	 * the tracer.
+	 */
+	lx_ptrace_stop(LX_PR_SYSEXIT);
+
+	/*
+	 * Fire the DTrace "lx-syscall:::return" probe:
+	 */
+	lx_trace_sysreturn(syscall_num, ret);
+
+	/*
+	 * Clear errno for next time. We do not clear "br_syscall_restart" or
+	 * "br_syscall_num" as they are potentially used by "lx_savecontext()"
+	 * in the signal delivery path.
+	 */
+	lwp->lwp_errno = 0;
+
+	lx_check_strict_failure(lwpd);
+
+	/*
+	 * We want complete control of the registers on return from this
+	 * emulated Linux system call:
+	 */
+	lwp->lwp_eosys = JUSTRETURN;
+	curthread->t_post_sys = 1;
+	aston(curthread);
+
+	return (0);
+}
+
+static void
+lx_syscall_unsup_msg(lx_sysent_t *s, int syscall_num, int unsup_reason)
+{
+	char buf[100];
+
+	if (s == NULL) {
+		(void) snprintf(buf, sizeof (buf), "NOSYS (%d): out of bounds",
+		    syscall_num);
+	} else {
+		VERIFY(unsup_reason < (sizeof (nosys_reasons) /
+		    sizeof (*nosys_reasons)));
+
+		if (s->sy_name == NULL) {
+			(void) snprintf(buf, sizeof (buf), "NOSYS (%d): %s",
+			    syscall_num, nosys_reasons[unsup_reason]);
+		} else {
+			(void) snprintf(buf, sizeof (buf), "NOSYS (%s): %s",
+			    s->sy_name, nosys_reasons[unsup_reason]);
+		}
+	}
+
+	lx_unsupported(buf);
+}
+
+/*
+ * This function is used to override the processing of arguments and
+ * invocation of a handler for emulated system calls, installed on each
+ * branded LWP as "lwp_brand_syscall". If this system call should use the
+ * native path, we return 1. If we handled this system call (and have made
+ * arrangements with respect to post-return usermode register state) we
+ * return 0.
+ */
+int
+lx_syscall_enter(void)
+{
+	klwp_t *lwp = ttolwp(curthread);
+	lx_lwp_data_t *lwpd = lwptolxlwp(lwp);
+	struct regs *rp = lwptoregs(lwp);
+	int syscall_num;
+	int error;
+	long ret = 0;
+	lx_sysent_t *s;
+	uintptr_t args[6];
+	unsigned int unsup_reason;
+
+	/*
+	 * If we got here, we should have an LWP-specific brand data
+	 * structure.
+	 */
+	VERIFY(lwpd != NULL);
+
+	if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) {
+		/*
+		 * The lwp is not in BRAND execution mode, so we return
+		 * to the regular native system call path.
+		 */
+		DTRACE_PROBE(brand__lx__syscall__hook__skip);
+		return (1);
+	}
+
+	/*
+	 * Clear the restartable system call flag. This flag will be set
+	 * in the system call handler if the call is a candidate for
+	 * a restart. It will be saved by lx_setcontext() in the event
+	 * that we take a signal, and used in the signal handling path
+	 * to restart the system call iff SA_RESTART was set for this
+	 * signal. Save the system call number so that we can store it
+	 * in the saved context if required.
+	 */
+	lwpd->br_syscall_restart = B_FALSE;
+	lwpd->br_syscall_num = (int)rp->r_r0;
+
+	/*
+	 * Hold for the ptrace(2) "syscall-entry-stop" condition if traced by
+	 * PTRACE_SYSCALL. The system call number and arguments may be
+	 * modified by the tracer.
+	 */
+	lx_ptrace_stop(LX_PR_SYSENTRY);
+
+	/*
+	 * Check that the system call number is within the bounds we expect.
+	 */
+	syscall_num = lwpd->br_syscall_num;
+	if (syscall_num < 0 || syscall_num > LX_MAX_SYSCALL(lwp)) {
+		lx_syscall_unsup_msg(NULL, syscall_num, 0);
+
+		set_errno(ENOTSUP);
+		lx_syscall_return(lwp, syscall_num, -1);
+		return (0);
+	}
+
+#if defined(_LP64)
+	if (lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) {
+		s = &lx_sysent64[syscall_num];
+	} else
+#endif
+	{
+		s = &lx_sysent32[syscall_num];
+	}
+
+	/*
+	 * Process the arguments for this system call and fire the DTrace
+	 * "lx-syscall:::entry" probe:
+	 */
+	error = lx_emulate_args(lwp, s, args);
+	lx_trace_sysenter(syscall_num, args);
+	if (error != 0) {
+		/*
+		 * Could not read and process the arguments. Return the error
+		 * to the process.
+		 */
+		set_errno(error);
+		lx_syscall_return(lwp, syscall_num, -1);
+		return (0);
+	}
+
+	if (s->sy_callc != NULL) {
+		/*
+		 * Call the in-kernel handler for this Linux system call:
+		 */
+		ret = s->sy_callc(args[0], args[1], args[2], args[3], args[4],
+		    args[5]);
+		lx_syscall_return(lwp, syscall_num, ret);
+		return (0);
+	}
+
+	/*
+	 * There is no in-kernel handler.
+	 */
+	switch (unsup_reason = (s->sy_flags & LX_SYS_NOSYS_REASON)) {
+	case NOSYS_USERMODE:
+		/*
+		 * Pass to the usermode emulation routine.
+		 */
+#if defined(_LP64)
+		if (get_udatamodel() != DATAMODEL_NATIVE) {
+			lx_emulate_user32(lwp, syscall_num, args);
+		} else
+#endif
+		{
+			lx_emulate_user(lwp, syscall_num, args);
+		}
+		return (0);
+
+	default:
+		/*
+		 * We are not emulating this system call at all.
+		 */
+		lx_syscall_unsup_msg(s, syscall_num, unsup_reason);
+
+		set_errno(ENOTSUP);
+		lx_syscall_return(lwp, syscall_num, -1);
+		return (0);
+	}
+}
+
+#if defined(_LP64)
+/*
+ * Emulate vsyscall support.
+ *
+ * Linux magically maps a single page into the address space of each process,
+ * allowing them to make 'vsyscalls'. Originally designed to counteract the
+ * perceived overhead of regular system calls, vsyscalls were implemented as
+ * code residing in userspace which could be called directly. The userspace
+ * implementations of these vsyscalls have since been replaced by
+ * instructions which vector into the normal syscall path.
+ *
+ * Implementing vsyscalls on illumos is complicated by the fact that the
+ * required static address region resides inside the kernel address space.
+ * Rather than mapping a user-accessible page into the KAS, a different
+ * approach is taken. The vsyscall gate is emulated by interposing on
+ * pagefaults in trap(). An attempt to execute a known vsyscall address will
+ * result in emulating the appropriate system call rather than inducing a
+ * SIGSEGV.
+ */
+void
+lx_vsyscall_enter(proc_t *p, klwp_t *lwp, int scnum)
+{
+	struct regs *rp = lwptoregs(lwp);
+	uintptr_t raddr;
+
+	/*
+	 * Fetch the return address from the process stack.
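+	 *
+	 * Editorial aside -- a hedged sketch, not part of the original
+	 * change, and the concrete address below is an assumption based on
+	 * the fixed Linux vsyscall layout: the tracee arrived here via an
+	 * ordinary call instruction into the vsyscall page, e.g. legacy
+	 * code performing
+	 *
+	 *	call	0xffffffffff600400	(the traditional time(2) slot)
+	 *
+	 * so the resume address was pushed onto its stack by that call.
+	 * The known entry points are matched against lx_vsyscalls[] in
+	 * lx_vsyscall_iscall() below.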
+ */ + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + if (copyin((void *)rp->r_rsp, &raddr, sizeof (raddr)) != 0) { +#if DEBUG + printf("lx_vsyscall_call: bad brand stack at vsyscall " + "cmd=%s, pid=%d, sp=0x%p\n", PTOU(p)->u_comm, + p->p_pid, (void *)rp->r_rsp); +#endif + + /* + * The process jumped to the vsyscall address without a + * correctly configured stack. Terminate the process. + */ + exit(CLD_KILLED, SIGSEGV); + return; + } + + DTRACE_PROBE1(brand__lx__vsyscall, int, scnum); + + /* Simulate vectoring into the syscall */ + rp->r_rax = scnum; + rp->r_rip = raddr; + rp->r_rsp += sizeof (uintptr_t); + + lx_syscall_enter(); +} + +boolean_t +lx_vsyscall_iscall(klwp_t *lwp, uintptr_t addr, int *scnum) +{ + lx_lwp_data_t *lwpd = lwptolxlwp(lwp); + int i; + + if (lwpd->br_stack_mode != LX_STACK_MODE_BRAND) { + /* + * We only handle vsyscalls when running Linux code. + */ + return (B_FALSE); + } + + if (addr < LX_VSYSCALL_ADDR || + addr >= (LX_VSYSCALL_ADDR + LX_VSYSCALL_SIZE)) { + /* + * Ignore faults outside the vsyscall page. + */ + return (B_FALSE); + } + + for (i = 0; lx_vsyscalls[i].lv_addr != NULL; i++) { + if (addr == lx_vsyscalls[i].lv_addr) { + /* + * This is a valid vsyscall address. + */ + *scnum = lx_vsyscalls[i].lv_scnum; + return (B_TRUE); + } + } + + lx_unsupported("bad vsyscall access"); + return (B_FALSE); +} +#endif + +/* + * Linux defines system call numbers for 32-bit x86 in the file: + * arch/x86/syscalls/syscall_32.tbl + */ +lx_sysent_t lx_sysent32[] = { + {"nosys", NULL, NOSYS_NONE, 0}, /* 0 */ + {"exit", NULL, 0, 1}, /* 1 */ + {"fork", NULL, 0, 0}, /* 2 */ + {"read", lx_read, 0, 3}, /* 3 */ + {"write", lx_write, 0, 3}, /* 4 */ + {"open", lx_open, 0, 3}, /* 5 */ + {"close", NULL, 0, 1}, /* 6 */ + {"waitpid", lx_waitpid, 0, 3}, /* 7 */ + {"creat", NULL, 0, 2}, /* 8 */ + {"link", NULL, 0, 2}, /* 9 */ + {"unlink", NULL, 0, 1}, /* 10 */ + {"execve", NULL, 0, 3}, /* 11 */ + {"chdir", NULL, 0, 1}, /* 12 */ + {"time", lx_time, 0, 1}, /* 13 */ + {"mknod", NULL, 0, 3}, /* 14 */ + {"chmod", lx_chmod, 0, 2}, /* 15 */ + {"lchown16", lx_lchown16, 0, 3}, /* 16 */ + {"break", NULL, NOSYS_OBSOLETE, 0}, /* 17 */ + {"stat", NULL, NOSYS_OBSOLETE, 0}, /* 18 */ + {"lseek", NULL, 0, 3}, /* 19 */ + {"getpid", lx_getpid, 0, 0}, /* 20 */ + {"mount", NULL, 0, 5}, /* 21 */ + {"umount", NULL, 0, 1}, /* 22 */ + {"setuid16", NULL, 0, 1}, /* 23 */ + {"getuid16", NULL, 0, 0}, /* 24 */ + {"stime", NULL, 0, 1}, /* 25 */ + {"ptrace", NULL, 0, 4}, /* 26 */ + {"alarm", NULL, 0, 1}, /* 27 */ + {"fstat", NULL, NOSYS_OBSOLETE, 0}, /* 28 */ + {"pause", NULL, 0, 0}, /* 29 */ + {"utime", NULL, 0, 2}, /* 30 */ + {"stty", NULL, NOSYS_OBSOLETE, 0}, /* 31 */ + {"gtty", NULL, NOSYS_OBSOLETE, 0}, /* 32 */ + {"access", NULL, 0, 2}, /* 33 */ + {"nice", NULL, 0, 1}, /* 34 */ + {"ftime", NULL, NOSYS_OBSOLETE, 0}, /* 35 */ + {"sync", NULL, 0, 0}, /* 36 */ + {"kill", lx_kill, 0, 2}, /* 37 */ + {"rename", NULL, 0, 2}, /* 38 */ + {"mkdir", lx_mkdir, 0, 2}, /* 39 */ + {"rmdir", NULL, 0, 1}, /* 40 */ + {"dup", NULL, 0, 1}, /* 41 */ + {"pipe", lx_pipe, 0, 1}, /* 42 */ + {"times", NULL, 0, 1}, /* 43 */ + {"prof", NULL, NOSYS_OBSOLETE, 0}, /* 44 */ + {"brk", lx_brk, 0, 1}, /* 45 */ + {"setgid16", NULL, 0, 1}, /* 46 */ + {"getgid16", NULL, 0, 0}, /* 47 */ + {"signal", NULL, 0, 2}, /* 48 */ + {"geteuid16", NULL, 0, 0}, /* 49 */ + {"getegid16", NULL, 0, 0}, /* 50 */ + {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 51 */ + {"umount2", NULL, 0, 2}, /* 52 */ + {"lock", NULL, NOSYS_OBSOLETE, 0}, /* 53 */ + {"ioctl", lx_ioctl, 0, 3}, /* 54 */ + 
{"fcntl", lx_fcntl, 0, 3}, /* 55 */ + {"mpx", NULL, NOSYS_OBSOLETE, 0}, /* 56 */ + {"setpgid", NULL, 0, 2}, /* 57 */ + {"ulimit", NULL, NOSYS_OBSOLETE, 0}, /* 58 */ + {"olduname", NULL, NOSYS_OBSOLETE, 0}, /* 59 */ + {"umask", NULL, 0, 1}, /* 60 */ + {"chroot", NULL, 0, 1}, /* 61 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 62 */ + {"dup2", NULL, 0, 2}, /* 63 */ + {"getppid", lx_getppid, 0, 0}, /* 64 */ + {"getpgrp", NULL, 0, 0}, /* 65 */ + {"setsid", NULL, 0, 0}, /* 66 */ + {"sigaction", NULL, 0, 3}, /* 67 */ + {"sgetmask", NULL, NOSYS_OBSOLETE, 0}, /* 68 */ + {"ssetmask", NULL, NOSYS_OBSOLETE, 0}, /* 69 */ + {"setreuid16", NULL, 0, 2}, /* 70 */ + {"setregid16", NULL, 0, 2}, /* 71 */ + {"sigsuspend", NULL, 0, 1}, /* 72 */ + {"sigpending", NULL, 0, 1}, /* 73 */ + {"sethostname", NULL, 0, 2}, /* 74 */ + {"setrlimit", lx_setrlimit, 0, 2}, /* 75 */ + {"getrlimit", lx_oldgetrlimit, 0, 2}, /* 76 */ + {"getrusage", NULL, 0, 2}, /* 77 */ + {"gettimeofday", lx_gettimeofday, 0, 2}, /* 78 */ + {"settimeofday", NULL, 0, 2}, /* 79 */ + {"getgroups16", NULL, 0, 2}, /* 80 */ + {"setgroups16", NULL, 0, 2}, /* 81 */ + {"select", NULL, NOSYS_OBSOLETE, 0}, /* 82 */ + {"symlink", NULL, 0, 2}, /* 83 */ + {"oldlstat", NULL, NOSYS_OBSOLETE, 0}, /* 84 */ + {"readlink", NULL, 0, 3}, /* 85 */ + {"uselib", NULL, NOSYS_KERNEL, 0}, /* 86 */ + {"swapon", NULL, NOSYS_KERNEL, 0}, /* 87 */ + {"reboot", NULL, 0, 4}, /* 88 */ + {"readdir", NULL, 0, 3}, /* 89 */ + {"mmap", NULL, 0, 6}, /* 90 */ + {"munmap", NULL, 0, 2}, /* 91 */ + {"truncate", NULL, 0, 2}, /* 92 */ + {"ftruncate", NULL, 0, 2}, /* 93 */ + {"fchmod", lx_fchmod, 0, 2}, /* 94 */ + {"fchown16", lx_fchown16, 0, 3}, /* 95 */ + {"getpriority", NULL, 0, 2}, /* 96 */ + {"setpriority", NULL, 0, 3}, /* 97 */ + {"profil", NULL, NOSYS_NO_EQUIV, 0}, /* 98 */ + {"statfs", NULL, 0, 2}, /* 99 */ + {"fstatfs", NULL, 0, 2}, /* 100 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 101 */ + {"socketcall", NULL, 0, 2}, /* 102 */ + {"syslog", NULL, 0, 3}, /* 103 */ + {"setitimer", NULL, 0, 3}, /* 104 */ + {"getitimer", NULL, 0, 2}, /* 105 */ + {"stat", NULL, 0, 2}, /* 106 */ + {"lstat", NULL, 0, 2}, /* 107 */ + {"fstat", NULL, 0, 2}, /* 108 */ + {"uname", NULL, NOSYS_OBSOLETE, 0}, /* 109 */ + {"oldiopl", NULL, NOSYS_NO_EQUIV, 0}, /* 110 */ + {"vhangup", NULL, 0, 0}, /* 111 */ + {"idle", NULL, NOSYS_NO_EQUIV, 0}, /* 112 */ + {"vm86old", NULL, NOSYS_OBSOLETE, 0}, /* 113 */ + {"wait4", lx_wait4, 0, 4}, /* 114 */ + {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 115 */ + {"sysinfo", lx_sysinfo32, 0, 1}, /* 116 */ + {"ipc", NULL, 0, 5}, /* 117 */ + {"fsync", NULL, 0, 1}, /* 118 */ + {"sigreturn", NULL, 0, 1}, /* 119 */ + {"clone", NULL, 0, 5}, /* 120 */ + {"setdomainname", NULL, 0, 2}, /* 121 */ + {"uname", NULL, 0, 1}, /* 122 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 123 */ + {"adjtimex", NULL, 0, 1}, /* 124 */ + {"mprotect", NULL, 0, 3}, /* 125 */ + {"sigprocmask", NULL, 0, 3}, /* 126 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 127 */ + {"init_module", NULL, NOSYS_KERNEL, 0}, /* 128 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 129 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 130 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 131 */ + {"getpgid", NULL, 0, 1}, /* 132 */ + {"fchdir", NULL, 0, 1}, /* 133 */ + {"bdflush", NULL, NOSYS_KERNEL, 0}, /* 134 */ + {"sysfs", NULL, 0, 3}, /* 135 */ + {"personality", NULL, 0, 1}, /* 136 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 137 */ + {"setfsuid16", NULL, 0, 1}, /* 138 */ + {"setfsgid16", NULL, 0, 1}, /* 139 */ + {"llseek", NULL, 0, 5}, 
/* 140 */ + {"getdents", lx_getdents_32, 0, 3}, /* 141 */ + {"select", NULL, 0, 5}, /* 142 */ + {"flock", NULL, 0, 2}, /* 143 */ + {"msync", NULL, 0, 3}, /* 144 */ + {"readv", NULL, 0, 3}, /* 145 */ + {"writev", NULL, 0, 3}, /* 146 */ + {"getsid", NULL, 0, 1}, /* 147 */ + {"fdatasync", NULL, 0, 1}, /* 148 */ + {"sysctl", NULL, 0, 1}, /* 149 */ + {"mlock", NULL, 0, 2}, /* 150 */ + {"munlock", NULL, 0, 2}, /* 151 */ + {"mlockall", NULL, 0, 1}, /* 152 */ + {"munlockall", NULL, 0, 0}, /* 153 */ + {"sched_setparam", NULL, 0, 2}, /* 154 */ + {"sched_getparam", NULL, 0, 2}, /* 155 */ + {"sched_setscheduler", NULL, 0, 3}, /* 156 */ + {"sched_getscheduler", NULL, 0, 1}, /* 157 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 158 */ + {"sched_get_priority_max", NULL, 0, 1}, /* 159 */ + {"sched_get_priority_min", NULL, 0, 1}, /* 160 */ + {"sched_rr_get_interval", NULL, 0, 2}, /* 161 */ + {"nanosleep", NULL, 0, 2}, /* 162 */ + {"mremap", NULL, 0, 5}, /* 163 */ + {"setresuid16", lx_setresuid16, 0, 3}, /* 164 */ + {"getresuid16", NULL, 0, 3}, /* 165 */ + {"vm86", NULL, NOSYS_NO_EQUIV, 0}, /* 166 */ + {"query_module", NULL, 0, 5}, /* 167 */ + {"poll", NULL, 0, 3}, /* 168 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 169 */ + {"setresgid16", lx_setresgid16, 0, 3}, /* 170 */ + {"getresgid16", NULL, 0, 3}, /* 171 */ + {"prctl", NULL, 0, 5}, /* 172 */ + {"rt_sigreturn", NULL, 0, 0}, /* 173 */ + {"rt_sigaction", NULL, 0, 4}, /* 174 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 175 */ + {"rt_sigpending", NULL, 0, 2}, /* 176 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 177 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 178 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 179 */ + {"pread64", NULL, 0, 5}, /* 180 */ + {"pwrite64", NULL, 0, 5}, /* 181 */ + {"chown16", lx_chown16, 0, 3}, /* 182 */ + {"getcwd", NULL, 0, 2}, /* 183 */ + {"capget", NULL, 0, 2}, /* 184 */ + {"capset", NULL, 0, 2}, /* 185 */ + {"sigaltstack", NULL, 0, 2}, /* 186 */ + {"sendfile", NULL, 0, 4}, /* 187 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 188 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 189 */ + {"vfork", NULL, 0, 0}, /* 190 */ + {"getrlimit", lx_getrlimit, 0, 2}, /* 191 */ + {"mmap2", NULL, LX_SYS_EBPARG6, 6}, /* 192 */ + {"truncate64", NULL, 0, 3}, /* 193 */ + {"ftruncate64", NULL, 0, 3}, /* 194 */ + {"stat64", NULL, 0, 2}, /* 195 */ + {"lstat64", NULL, 0, 2}, /* 196 */ + {"fstat64", NULL, 0, 2}, /* 197 */ + {"lchown", lx_lchown, 0, 3}, /* 198 */ + {"getuid", NULL, 0, 0}, /* 199 */ + {"getgid", NULL, 0, 0}, /* 200 */ + {"geteuid", NULL, 0, 0}, /* 201 */ + {"getegid", NULL, 0, 0}, /* 202 */ + {"setreuid", NULL, 0, 0}, /* 203 */ + {"setregid", NULL, 0, 0}, /* 204 */ + {"getgroups", NULL, 0, 2}, /* 205 */ + {"setgroups", NULL, 0, 2}, /* 206 */ + {"fchown", lx_fchown, 0, 3}, /* 207 */ + {"setresuid", lx_setresuid, 0, 3}, /* 208 */ + {"getresuid", NULL, 0, 3}, /* 209 */ + {"setresgid", lx_setresgid, 0, 3}, /* 210 */ + {"getresgid", NULL, 0, 3}, /* 211 */ + {"chown", lx_chown, 0, 3}, /* 212 */ + {"setuid", NULL, 0, 1}, /* 213 */ + {"setgid", NULL, 0, 1}, /* 214 */ + {"setfsuid", NULL, 0, 1}, /* 215 */ + {"setfsgid", NULL, 0, 1}, /* 216 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 217 */ + {"mincore", NULL, 0, 3}, /* 218 */ + {"madvise", NULL, 0, 3}, /* 219 */ + {"getdents64", lx_getdents64, 0, 3}, /* 220 */ + {"fcntl64", lx_fcntl64, 0, 3}, /* 221 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 222 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 223 */ + {"gettid", lx_gettid, 0, 0}, /* 224 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 225 */ + {"setxattr", 
lx_xattr, 0, 5}, /* 226 */ + {"lsetxattr", lx_xattr, 0, 5}, /* 227 */ + {"fsetxattr", lx_xattr, 0, 5}, /* 228 */ + {"getxattr", lx_xattr, 0, 4}, /* 229 */ + {"lgetxattr", lx_xattr, 0, 4}, /* 230 */ + {"fgetxattr", lx_xattr, 0, 4}, /* 231 */ + {"listxattr", lx_xattr, 0, 3}, /* 232 */ + {"llistxattr", lx_xattr, 0, 3}, /* 233 */ + {"flistxattr", lx_xattr, 0, 3}, /* 234 */ + {"removexattr", lx_xattr, 0, 2}, /* 235 */ + {"lremovexattr", lx_xattr, 0, 2}, /* 236 */ + {"fremovexattr", lx_xattr, 0, 2}, /* 237 */ + {"tkill", lx_tkill, 0, 2}, /* 238 */ + {"sendfile64", NULL, 0, 4}, /* 239 */ + {"futex", lx_futex, LX_SYS_EBPARG6, 6}, /* 240 */ + {"sched_setaffinity", NULL, 0, 3}, /* 241 */ + {"sched_getaffinity", NULL, 0, 3}, /* 242 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 243 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 244 */ + {"io_setup", NULL, 0, 2}, /* 245 */ + {"io_destroy", NULL, 0, 1}, /* 246 */ + {"io_getevents", NULL, 0, 5}, /* 247 */ + {"io_submit", NULL, 0, 3}, /* 248 */ + {"io_cancel", NULL, 0, 3}, /* 249 */ + {"fadvise64", NULL, 0, 4}, /* 250 */ + {"nosys", NULL, 0, 0}, /* 251 */ + {"group_exit", NULL, 0, 1}, /* 252 */ + {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 253 */ + {"epoll_create", NULL, 0, 1}, /* 254 */ + {"epoll_ctl", NULL, 0, 4}, /* 255 */ + {"epoll_wait", NULL, 0, 4}, /* 256 */ + {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 257 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 258 */ + {"timer_create", NULL, 0, 3}, /* 259 */ + {"timer_settime", NULL, 0, 4}, /* 260 */ + {"timer_gettime", NULL, 0, 2}, /* 261 */ + {"timer_getoverrun", NULL, 0, 1}, /* 262 */ + {"timer_delete", NULL, 0, 1}, /* 263 */ + {"clock_settime", lx_clock_settime, 0, 2}, /* 264 */ + {"clock_gettime", lx_clock_gettime, 0, 2}, /* 265 */ + {"clock_getres", lx_clock_getres, 0, 2}, /* 266 */ + {"clock_nanosleep", NULL, 0, 4}, /* 267 */ + {"statfs64", NULL, 0, 2}, /* 268 */ + {"fstatfs64", NULL, 0, 2}, /* 269 */ + {"tgkill", lx_tgkill, 0, 3}, /* 270 */ + +/* + * The following system calls only exist in kernel 2.6 and greater: + */ + {"utimes", NULL, 0, 2}, /* 271 */ + {"fadvise64_64", NULL, 0, 4}, /* 272 */ + {"vserver", NULL, NOSYS_NULL, 0}, /* 273 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 274 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 275 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 276 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 277 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 278 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 279 */ + {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 280 */ + {"mq_notify", NULL, NOSYS_NULL, 0}, /* 281 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 282 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 283 */ + {"waitid", lx_waitid, 0, 4}, /* 284 */ + {"sys_setaltroot", NULL, NOSYS_NULL, 0}, /* 285 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 286 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 287 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 288 */ + {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 289 */ + {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 290 */ + {"inotify_init", NULL, 0, 0}, /* 291 */ + {"inotify_add_watch", NULL, 0, 3}, /* 292 */ + {"inotify_rm_watch", NULL, 0, 2}, /* 293 */ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 294 */ + {"openat", lx_openat, 0, 4}, /* 295 */ + {"mkdirat", lx_mkdirat, 0, 3}, /* 296 */ + {"mknodat", NULL, 0, 4}, /* 297 */ + {"fchownat", lx_fchownat, 0, 5}, /* 298 */ + {"futimesat", NULL, 0, 3}, /* 299 */ + {"fstatat64", NULL, 0, 4}, /* 300 */ + {"unlinkat", NULL, 0, 3}, /* 301 */ + {"renameat", NULL, 0, 4}, /* 302 */ + {"linkat", 
NULL, 0, 5}, /* 303 */ + {"symlinkat", NULL, 0, 3}, /* 304 */ + {"readlinkat", NULL, 0, 4}, /* 305 */ + {"fchmodat", lx_fchmodat, 0, 3}, /* 306 */ + {"faccessat", NULL, 0, 4}, /* 307 */ + {"pselect6", NULL, LX_SYS_EBPARG6, 6}, /* 308 */ + {"ppoll", NULL, 0, 5}, /* 309 */ + {"unshare", NULL, NOSYS_NULL, 0}, /* 310 */ + {"set_robust_list", lx_set_robust_list, 0, 2}, /* 311 */ + {"get_robust_list", lx_get_robust_list, 0, 3}, /* 312 */ + {"splice", NULL, NOSYS_NULL, 0}, /* 313 */ + {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 314 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 315 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, /* 316 */ + {"move_pages", NULL, NOSYS_NULL, 0}, /* 317 */ + {"getcpu", lx_getcpu, 0, 3}, /* 318 */ + {"epoll_pwait", NULL, 0, 5}, /* 319 */ + {"utimensat", NULL, 0, 4}, /* 320 */ + {"signalfd", NULL, NOSYS_NULL, 0}, /* 321 */ + {"timerfd_create", NULL, 0, 2}, /* 322 */ + {"eventfd", NULL, 0, 1}, /* 323 */ + {"fallocate", NULL, NOSYS_NULL, 0}, /* 324 */ + {"timerfd_settime", NULL, 0, 4}, /* 325 */ + {"timerfd_gettime", NULL, 0, 2}, /* 326 */ + {"signalfd4", NULL, NOSYS_NULL, 0}, /* 327 */ + {"eventfd2", NULL, 0, 2}, /* 328 */ + {"epoll_create1", NULL, 0, 1}, /* 329 */ + {"dup3", NULL, 0, 3}, /* 330 */ + {"pipe2", lx_pipe2, 0, 2}, /* 331 */ + {"inotify_init1", NULL, 0, 1}, /* 332 */ + {"preadv", NULL, 0, 4}, /* 333 */ + {"pwritev", NULL, 0, 4}, /* 334 */ + {"rt_tgsigqueueinfo", NULL, 0, 4}, /* 335 */ + {"perf_event_open", NULL, NOSYS_NULL, 0}, /* 336 */ + {"recvmmsg", NULL, NOSYS_NULL, 0}, /* 337 */ + {"fanotify_init", NULL, NOSYS_NULL, 0}, /* 338 */ + {"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 339 */ + {"prlimit64", lx_prlimit64, 0, 4}, /* 340 */ + {"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 341 */ + {"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 342 */ + {"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 343 */ + {"syncfs", NULL, NOSYS_NULL, 0}, /* 344 */ + {"sendmmsg", NULL, NOSYS_NULL, 0}, /* 345 */ + {"setns", NULL, NOSYS_NULL, 0}, /* 346 */ + {"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 347 */ + {"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 348 */ + {"kcmp", NULL, NOSYS_NULL, 0}, /* 349 */ + {"finit_module", NULL, NOSYS_NULL, 0}, /* 350 */ + {"sched_setattr", NULL, NOSYS_NULL, 0}, /* 351 */ + {"sched_getattr", NULL, NOSYS_NULL, 0}, /* 352 */ + {"renameat2", NULL, NOSYS_NULL, 0}, /* 353 */ + {"seccomp", NULL, NOSYS_NULL, 0}, /* 354 */ + {"getrandom", lx_getrandom, 0, 3}, /* 355 */ + {"memfd_create", NULL, NOSYS_NULL, 0}, /* 356 */ + {"bpf", NULL, NOSYS_NULL, 0}, /* 357 */ + {"execveat", NULL, NOSYS_NULL, 0}, /* 358 */ +}; + +#if defined(_LP64) +/* + * Linux defines system call numbers for 64-bit x86 in the file: + * arch/x86/syscalls/syscall_64.tbl + */ +lx_sysent_t lx_sysent64[] = { + {"read", lx_read, 0, 3}, /* 0 */ + {"write", lx_write, 0, 3}, /* 1 */ + {"open", lx_open, 0, 3}, /* 2 */ + {"close", NULL, 0, 1}, /* 3 */ + {"stat", NULL, 0, 2}, /* 4 */ + {"fstat", NULL, 0, 2}, /* 5 */ + {"lstat", NULL, 0, 2}, /* 6 */ + {"poll", NULL, 0, 3}, /* 7 */ + {"lseek", NULL, 0, 3}, /* 8 */ + {"mmap", NULL, 0, 6}, /* 9 */ + {"mprotect", NULL, 0, 3}, /* 10 */ + {"munmap", NULL, 0, 2}, /* 11 */ + {"brk", lx_brk, 0, 1}, /* 12 */ + {"rt_sigaction", NULL, 0, 4}, /* 13 */ + {"rt_sigprocmask", NULL, 0, 4}, /* 14 */ + {"rt_sigreturn", NULL, 0, 0}, /* 15 */ + {"ioctl", lx_ioctl, 0, 3}, /* 16 */ + {"pread64", NULL, 0, 4}, /* 17 */ + {"pwrite64", NULL, 0, 4}, /* 18 */ + {"readv", NULL, 0, 3}, /* 19 */ + {"writev", NULL, 0, 3}, /* 20 */ + {"access", NULL, 0, 2}, /* 21 */ + {"pipe", lx_pipe, 0, 1}, /* 22 */ + 
{"select", NULL, 0, 5}, /* 23 */ + {"sched_yield", lx_sched_yield, 0, 0}, /* 24 */ + {"mremap", NULL, 0, 5}, /* 25 */ + {"msync", NULL, 0, 3}, /* 26 */ + {"mincore", NULL, 0, 3}, /* 27 */ + {"madvise", NULL, 0, 3}, /* 28 */ + {"shmget", NULL, 0, 3}, /* 29 */ + {"shmat", NULL, 0, 4}, /* 30 */ + {"shmctl", NULL, 0, 3}, /* 31 */ + {"dup", NULL, 0, 1}, /* 32 */ + {"dup2", NULL, 0, 2}, /* 33 */ + {"pause", NULL, 0, 0}, /* 34 */ + {"nanosleep", NULL, 0, 2}, /* 35 */ + {"getitimer", NULL, 0, 2}, /* 36 */ + {"alarm", NULL, 0, 1}, /* 37 */ + {"setitimer", NULL, 0, 3}, /* 38 */ + {"getpid", lx_getpid, 0, 0}, /* 39 */ + {"sendfile", NULL, 0, 4}, /* 40 */ + {"socket", NULL, 0, 3}, /* 41 */ + {"connect", NULL, 0, 3}, /* 42 */ + {"accept", NULL, 0, 3}, /* 43 */ + {"sendto", NULL, 0, 6}, /* 44 */ + {"recvfrom", NULL, 0, 6}, /* 45 */ + {"sendmsg", NULL, 0, 3}, /* 46 */ + {"recvmsg", NULL, 0, 3}, /* 47 */ + {"shutdown", NULL, 0, 2}, /* 48 */ + {"bind", NULL, 0, 3}, /* 49 */ + {"listen", NULL, 0, 2}, /* 50 */ + {"getsockname", NULL, 0, 3}, /* 51 */ + {"getpeername", NULL, 0, 3}, /* 52 */ + {"socketpair", NULL, 0, 4}, /* 53 */ + {"setsockopt", NULL, 0, 5}, /* 54 */ + {"getsockopt", NULL, 0, 5}, /* 55 */ + {"clone", NULL, 0, 5}, /* 56 */ + {"fork", NULL, 0, 0}, /* 57 */ + {"vfork", NULL, 0, 0}, /* 58 */ + {"execve", NULL, 0, 3}, /* 59 */ + {"exit", NULL, 0, 1}, /* 60 */ + {"wait4", lx_wait4, 0, 4}, /* 61 */ + {"kill", lx_kill, 0, 2}, /* 62 */ + {"uname", NULL, 0, 1}, /* 63 */ + {"semget", NULL, 0, 3}, /* 64 */ + {"semop", NULL, 0, 3}, /* 65 */ + {"semctl", NULL, 0, 4}, /* 66 */ + {"shmdt", NULL, 0, 1}, /* 67 */ + {"msgget", NULL, 0, 2}, /* 68 */ + {"msgsnd", NULL, 0, 4}, /* 69 */ + {"msgrcv", NULL, 0, 5}, /* 70 */ + {"msgctl", NULL, 0, 3}, /* 71 */ + {"fcntl", lx_fcntl64, 0, 3}, /* 72 */ + {"flock", NULL, 0, 2}, /* 73 */ + {"fsync", NULL, 0, 1}, /* 74 */ + {"fdatasync", NULL, 0, 1}, /* 75 */ + {"truncate", NULL, 0, 2}, /* 76 */ + {"ftruncate", NULL, 0, 2}, /* 77 */ + {"getdents", lx_getdents_64, 0, 3}, /* 78 */ + {"getcwd", NULL, 0, 2}, /* 79 */ + {"chdir", NULL, 0, 1}, /* 80 */ + {"fchdir", NULL, 0, 1}, /* 81 */ + {"rename", NULL, 0, 2}, /* 82 */ + {"mkdir", lx_mkdir, 0, 2}, /* 83 */ + {"rmdir", NULL, 0, 1}, /* 84 */ + {"creat", NULL, 0, 2}, /* 85 */ + {"link", NULL, 0, 2}, /* 86 */ + {"unlink", NULL, 0, 1}, /* 87 */ + {"symlink", NULL, 0, 2}, /* 88 */ + {"readlink", NULL, 0, 3}, /* 89 */ + {"chmod", lx_chmod, 0, 2}, /* 90 */ + {"fchmod", lx_fchmod, 0, 2}, /* 91 */ + {"chown", lx_chown, 0, 3}, /* 92 */ + {"fchown", lx_fchown, 0, 3}, /* 93 */ + {"lchown", lx_lchown, 0, 3}, /* 94 */ + {"umask", NULL, 0, 1}, /* 95 */ + {"gettimeofday", lx_gettimeofday, 0, 2}, /* 96 */ + {"getrlimit", lx_getrlimit, 0, 2}, /* 97 */ + {"getrusage", NULL, 0, 2}, /* 98 */ + {"sysinfo", lx_sysinfo64, 0, 1}, /* 99 */ + {"times", NULL, 0, 1}, /* 100 */ + {"ptrace", NULL, 0, 4}, /* 101 */ + {"getuid", NULL, 0, 0}, /* 102 */ + {"syslog", NULL, 0, 3}, /* 103 */ + {"getgid", NULL, 0, 0}, /* 104 */ + {"setuid", NULL, 0, 1}, /* 105 */ + {"setgid", NULL, 0, 1}, /* 106 */ + {"geteuid", NULL, 0, 0}, /* 107 */ + {"getegid", NULL, 0, 0}, /* 108 */ + {"setpgid", NULL, 0, 2}, /* 109 */ + {"getppid", lx_getppid, 0, 0}, /* 110 */ + {"getpgrp", NULL, 0, 0}, /* 111 */ + {"setsid", NULL, 0, 0}, /* 112 */ + {"setreuid", NULL, 0, 0}, /* 113 */ + {"setregid", NULL, 0, 0}, /* 114 */ + {"getgroups", NULL, 0, 2}, /* 115 */ + {"setgroups", NULL, 0, 2}, /* 116 */ + {"setresuid", lx_setresuid, 0, 3}, /* 117 */ + {"getresuid", NULL, 0, 3}, /* 118 */ + 
{"setresgid", lx_setresgid, 0, 3}, /* 119 */ + {"getresgid", NULL, 0, 3}, /* 120 */ + {"getpgid", NULL, 0, 1}, /* 121 */ + {"setfsuid", NULL, 0, 1}, /* 122 */ + {"setfsgid", NULL, 0, 1}, /* 123 */ + {"getsid", NULL, 0, 1}, /* 124 */ + {"capget", NULL, 0, 2}, /* 125 */ + {"capset", NULL, 0, 2}, /* 126 */ + {"rt_sigpending", NULL, 0, 2}, /* 127 */ + {"rt_sigtimedwait", NULL, 0, 4}, /* 128 */ + {"rt_sigqueueinfo", NULL, 0, 3}, /* 129 */ + {"rt_sigsuspend", NULL, 0, 2}, /* 130 */ + {"sigaltstack", NULL, 0, 2}, /* 131 */ + {"utime", NULL, 0, 2}, /* 132 */ + {"mknod", NULL, 0, 3}, /* 133 */ + {"uselib", NULL, NOSYS_KERNEL, 0}, /* 134 */ + {"personality", NULL, 0, 1}, /* 135 */ + {"ustat", NULL, NOSYS_OBSOLETE, 2}, /* 136 */ + {"statfs", NULL, 0, 2}, /* 137 */ + {"fstatfs", NULL, 0, 2}, /* 138 */ + {"sysfs", NULL, 0, 3}, /* 139 */ + {"getpriority", NULL, 0, 2}, /* 140 */ + {"setpriority", NULL, 0, 3}, /* 141 */ + {"sched_setparam", NULL, 0, 2}, /* 142 */ + {"sched_getparam", NULL, 0, 2}, /* 143 */ + {"sched_setscheduler", NULL, 0, 3}, /* 144 */ + {"sched_getscheduler", NULL, 0, 1}, /* 145 */ + {"sched_get_priority_max", NULL, 0, 1}, /* 146 */ + {"sched_get_priority_min", NULL, 0, 1}, /* 147 */ + {"sched_rr_get_interval", NULL, 0, 2}, /* 148 */ + {"mlock", NULL, 0, 2}, /* 149 */ + {"munlock", NULL, 0, 2}, /* 150 */ + {"mlockall", NULL, 0, 1}, /* 151 */ + {"munlockall", NULL, 0, 0}, /* 152 */ + {"vhangup", NULL, 0, 0}, /* 153 */ + {"modify_ldt", lx_modify_ldt, 0, 3}, /* 154 */ + {"pivot_root", NULL, NOSYS_KERNEL, 0}, /* 155 */ + {"sysctl", NULL, 0, 1}, /* 156 */ + {"prctl", NULL, 0, 5}, /* 157 */ + {"arch_prctl", lx_arch_prctl, 0, 2}, /* 158 */ + {"adjtimex", NULL, 0, 1}, /* 159 */ + {"setrlimit", lx_setrlimit, 0, 2}, /* 160 */ + {"chroot", NULL, 0, 1}, /* 161 */ + {"sync", NULL, 0, 0}, /* 162 */ + {"acct", NULL, NOSYS_NO_EQUIV, 0}, /* 163 */ + {"settimeofday", NULL, 0, 2}, /* 164 */ + {"mount", NULL, 0, 5}, /* 165 */ + {"umount2", NULL, 0, 2}, /* 166 */ + {"swapon", NULL, NOSYS_KERNEL, 0}, /* 167 */ + {"swapoff", NULL, NOSYS_KERNEL, 0}, /* 168 */ + {"reboot", NULL, 0, 4}, /* 169 */ + {"sethostname", NULL, 0, 2}, /* 170 */ + {"setdomainname", NULL, 0, 2}, /* 171 */ + {"iopl", NULL, NOSYS_NO_EQUIV, 0}, /* 172 */ + {"ioperm", NULL, NOSYS_NO_EQUIV, 0}, /* 173 */ + {"create_module", NULL, NOSYS_KERNEL, 0}, /* 174 */ + {"init_module", NULL, NOSYS_KERNEL, 0}, /* 175 */ + {"delete_module", NULL, NOSYS_KERNEL, 0}, /* 176 */ + {"get_kernel_syms", NULL, NOSYS_KERNEL, 0}, /* 177 */ + {"query_module", NULL, 0, 5}, /* 178 */ + {"quotactl", NULL, NOSYS_KERNEL, 0}, /* 179 */ + {"nfsservctl", NULL, NOSYS_KERNEL, 0}, /* 180 */ + {"getpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 181 */ + {"putpmsg", NULL, NOSYS_OBSOLETE, 0}, /* 182 */ + {"afs_syscall", NULL, NOSYS_KERNEL, 0}, /* 183 */ + {"tux", NULL, NOSYS_NO_EQUIV, 0}, /* 184 */ + {"security", NULL, NOSYS_NO_EQUIV, 0}, /* 185 */ + {"gettid", lx_gettid, 0, 0}, /* 186 */ + {"readahead", NULL, NOSYS_NO_EQUIV, 0}, /* 187 */ + {"setxattr", lx_xattr, 0, 5}, /* 188 */ + {"lsetxattr", lx_xattr, 0, 5}, /* 189 */ + {"fsetxattr", lx_xattr, 0, 5}, /* 190 */ + {"getxattr", lx_xattr, 0, 4}, /* 191 */ + {"lgetxattr", lx_xattr, 0, 4}, /* 192 */ + {"fgetxattr", lx_xattr, 0, 4}, /* 193 */ + {"listxattr", lx_xattr, 0, 3}, /* 194 */ + {"llistxattr", lx_xattr, 0, 3}, /* 195 */ + {"flistxattr", lx_xattr, 0, 3}, /* 196 */ + {"removexattr", lx_xattr, 0, 2}, /* 197 */ + {"lremovexattr", lx_xattr, 0, 2}, /* 198 */ + {"fremovexattr", lx_xattr, 0, 2}, /* 199 */ + {"tkill", lx_tkill, 0, 2}, /* 200 */ 
+ {"time", lx_time, 0, 1}, /* 201 */ + {"futex", lx_futex, 0, 6}, /* 202 */ + {"sched_setaffinity", NULL, 0, 3}, /* 203 */ + {"sched_getaffinity", NULL, 0, 3}, /* 204 */ + {"set_thread_area", lx_set_thread_area, 0, 1}, /* 205 */ + {"io_setup", NULL, 0, 2}, /* 206 */ + {"io_destroy", NULL, 0, 1}, /* 207 */ + {"io_getevents", NULL, 0, 5}, /* 208 */ + {"io_submit", NULL, 0, 3}, /* 209 */ + {"io_cancel", NULL, 0, 3}, /* 210 */ + {"get_thread_area", lx_get_thread_area, 0, 1}, /* 211 */ + {"lookup_dcookie", NULL, NOSYS_NO_EQUIV, 0}, /* 212 */ + {"epoll_create", NULL, 0, 1}, /* 213 */ + {"epoll_ctl_old", NULL, NOSYS_NULL, 0}, /* 214 */ + {"epoll_wait_old", NULL, NOSYS_NULL, 0}, /* 215 */ + {"remap_file_pages", NULL, NOSYS_NO_EQUIV, 0}, /* 216 */ + {"getdents64", lx_getdents64, 0, 3}, /* 217 */ + {"set_tid_address", lx_set_tid_address, 0, 1}, /* 218 */ + {"restart_syscall", NULL, NOSYS_NULL, 0}, /* 219 */ + {"semtimedop", NULL, 0, 4}, /* 220 */ + {"fadvise64", NULL, 0, 4}, /* 221 */ + {"timer_create", NULL, 0, 3}, /* 222 */ + {"timer_settime", NULL, 0, 4}, /* 223 */ + {"timer_gettime", NULL, 0, 2}, /* 224 */ + {"timer_getoverrun", NULL, 0, 1}, /* 225 */ + {"timer_delete", NULL, 0, 1}, /* 226 */ + {"clock_settime", lx_clock_settime, 0, 2}, /* 227 */ + {"clock_gettime", lx_clock_gettime, 0, 2}, /* 228 */ + {"clock_getres", lx_clock_getres, 0, 2}, /* 229 */ + {"clock_nanosleep", NULL, 0, 4}, /* 230 */ + {"exit_group", NULL, 0, 1}, /* 231 */ + {"epoll_wait", NULL, 0, 4}, /* 232 */ + {"epoll_ctl", NULL, 0, 4}, /* 233 */ + {"tgkill", lx_tgkill, 0, 3}, /* 234 */ + {"utimes", NULL, 0, 2}, /* 235 */ + {"vserver", NULL, NOSYS_NULL, 0}, /* 236 */ + {"mbind", NULL, NOSYS_NULL, 0}, /* 237 */ + {"set_mempolicy", NULL, NOSYS_NULL, 0}, /* 238 */ + {"get_mempolicy", NULL, NOSYS_NULL, 0}, /* 239 */ + {"mq_open", NULL, NOSYS_NULL, 0}, /* 240 */ + {"mq_unlink", NULL, NOSYS_NULL, 0}, /* 241 */ + {"mq_timedsend", NULL, NOSYS_NULL, 0}, /* 242 */ + {"mq_timedreceive", NULL, NOSYS_NULL, 0}, /* 243 */ + {"mq_notify", NULL, NOSYS_NULL, 0}, /* 244 */ + {"mq_getsetattr", NULL, NOSYS_NULL, 0}, /* 245 */ + {"kexec_load", NULL, NOSYS_NULL, 0}, /* 246 */ + {"waitid", lx_waitid, 0, 4}, /* 247 */ + {"add_key", NULL, NOSYS_NULL, 0}, /* 248 */ + {"request_key", NULL, NOSYS_NULL, 0}, /* 249 */ + {"keyctl", NULL, NOSYS_NULL, 0}, /* 250 */ + {"ioprio_set", NULL, NOSYS_NULL, 0}, /* 251 */ + {"ioprio_get", NULL, NOSYS_NULL, 0}, /* 252 */ + {"inotify_init", NULL, 0, 0}, /* 253 */ + {"inotify_add_watch", NULL, 0, 3}, /* 254 */ + {"inotify_rm_watch", NULL, 0, 2}, /* 255 */ + {"migrate_pages", NULL, NOSYS_NULL, 0}, /* 256 */ + {"openat", lx_openat, 0, 4}, /* 257 */ + {"mkdirat", lx_mkdirat, 0, 3}, /* 258 */ + {"mknodat", NULL, 0, 4}, /* 259 */ + {"fchownat", lx_fchownat, 0, 5}, /* 260 */ + {"futimesat", NULL, 0, 3}, /* 261 */ + {"fstatat64", NULL, 0, 4}, /* 262 */ + {"unlinkat", NULL, 0, 3}, /* 263 */ + {"renameat", NULL, 0, 4}, /* 264 */ + {"linkat", NULL, 0, 5}, /* 265 */ + {"symlinkat", NULL, 0, 3}, /* 266 */ + {"readlinkat", NULL, 0, 4}, /* 267 */ + {"fchmodat", lx_fchmodat, 0, 3}, /* 268 */ + {"faccessat", NULL, 0, 4}, /* 269 */ + {"pselect6", NULL, 0, 6}, /* 270 */ + {"ppoll", NULL, 0, 5}, /* 271 */ + {"unshare", NULL, NOSYS_NULL, 0}, /* 272 */ + {"set_robust_list", lx_set_robust_list, 0, 2}, /* 273 */ + {"get_robust_list", lx_get_robust_list, 0, 3}, /* 274 */ + {"splice", NULL, NOSYS_NULL, 0}, /* 275 */ + {"tee", NULL, NOSYS_NULL, 0}, /* 276 */ + {"sync_file_range", NULL, NOSYS_NULL, 0}, /* 277 */ + {"vmsplice", NULL, NOSYS_NULL, 0}, 
/* 278 */
+	{"move_pages", NULL, NOSYS_NULL, 0}, /* 279 */
+	{"utimensat", NULL, 0, 4}, /* 280 */
+	{"epoll_pwait", NULL, 0, 5}, /* 281 */
+	{"signalfd", NULL, NOSYS_NULL, 0}, /* 282 */
+	{"timerfd_create", NULL, 0, 2}, /* 283 */
+	{"eventfd", NULL, 0, 1}, /* 284 */
+	{"fallocate", NULL, NOSYS_NULL, 0}, /* 285 */
+	{"timerfd_settime", NULL, 0, 4}, /* 286 */
+	{"timerfd_gettime", NULL, 0, 2}, /* 287 */
+	{"accept4", NULL, 0, 4}, /* 288 */
+	{"signalfd4", NULL, NOSYS_NULL, 0}, /* 289 */
+	{"eventfd2", NULL, 0, 2}, /* 290 */
+	{"epoll_create1", NULL, 0, 1}, /* 291 */
+	{"dup3", NULL, 0, 3}, /* 292 */
+	{"pipe2", lx_pipe2, 0, 2}, /* 293 */
+	{"inotify_init1", NULL, 0, 1}, /* 294 */
+	{"preadv", NULL, 0, 4}, /* 295 */
+	{"pwritev", NULL, 0, 4}, /* 296 */
+	{"rt_tgsigqueueinfo", NULL, 0, 4}, /* 297 */
+	{"perf_event_open", NULL, NOSYS_NULL, 0}, /* 298 */
+	{"recvmmsg", NULL, NOSYS_NULL, 0}, /* 299 */
+	{"fanotify_init", NULL, NOSYS_NULL, 0}, /* 300 */
+	{"fanotify_mark", NULL, NOSYS_NULL, 0}, /* 301 */
+	{"prlimit64", lx_prlimit64, 0, 4}, /* 302 */
+	{"name_to_handle_at", NULL, NOSYS_NULL, 0}, /* 303 */
+	{"open_by_handle_at", NULL, NOSYS_NULL, 0}, /* 304 */
+	{"clock_adjtime", NULL, NOSYS_NULL, 0}, /* 305 */
+	{"syncfs", NULL, NOSYS_NULL, 0}, /* 306 */
+	{"sendmmsg", NULL, NOSYS_NULL, 0}, /* 307 */
+	{"setns", NULL, NOSYS_NULL, 0}, /* 308 */
+	{"getcpu", lx_getcpu, 0, 3}, /* 309 */
+	{"process_vm_readv", NULL, NOSYS_NULL, 0}, /* 310 */
+	{"process_vm_writev", NULL, NOSYS_NULL, 0}, /* 311 */
+	{"kcmp", NULL, NOSYS_NULL, 0}, /* 312 */
+	{"finit_module", NULL, NOSYS_NULL, 0}, /* 313 */
+	{"sched_setattr", NULL, NOSYS_NULL, 0}, /* 314 */
+	{"sched_getattr", NULL, NOSYS_NULL, 0}, /* 315 */
+	{"renameat2", NULL, NOSYS_NULL, 0}, /* 316 */
+	{"seccomp", NULL, NOSYS_NULL, 0}, /* 317 */
+	{"getrandom", lx_getrandom, 0, 3}, /* 318 */
+	{"memfd_create", NULL, NOSYS_NULL, 0}, /* 319 */
+	{"kexec_file_load", NULL, NOSYS_NULL, 0}, /* 320 */
+	{"bpf", NULL, NOSYS_NULL, 0}, /* 321 */
+	{"execveat", NULL, NOSYS_NULL, 0}, /* 322 */
+
+	/* XXX TBD gap then x32 syscalls from 512 - 544 */
+};
+#endif
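Each lx_sysent_t entry above pairs a Linux system call name with an in-kernel emulation handler (NULL when the call is emulated in user space or stubbed out via one of the NOSYS_* codes), a flags word, and an argument count. The following is a minimal sketch of how such a table might be consumed at emulation time; it is not part of this patch, and the field names (sy_callc, sy_flags, sy_narg) and the lx_emulate_sketch() wrapper are assumptions for illustration only.

	/*
	 * Hypothetical dispatch sketch -- not from this patch.  The
	 * lx_sysent_t field names used here are assumed.
	 */
	static long
	lx_emulate_sketch(int sysnum, uintptr_t *args)
	{
		const lx_sysent_t *s;
		const int nent = sizeof (lx_sysent64) / sizeof (lx_sysent64[0]);

		if (sysnum < 0 || sysnum >= nent)
			return (set_errno(ENOTSUP));

		s = &lx_sysent64[sysnum];
		if (s->sy_callc == NULL) {
			/* The NOSYS_* code in sy_flags picks errno/logging. */
			return (set_errno(ENOTSUP));
		}

		/* Handlers take at most six arguments; extras are ignored. */
		return (s->sy_callc(args[0], args[1], args[2], args[3],
		    args[4], args[5]));
	}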
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_proc.h b/usr/src/uts/common/brand/lx/procfs/lx_proc.h
new file mode 100644
index 0000000000..b383a4aef7
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_proc.h
@@ -0,0 +1,289 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifdef _LXPROC_NATIVE_H
+#error Attempted to include branded lx_proc.h after native lxproc.h
+#endif
+
+#ifndef _LXPROC_H
+#define _LXPROC_H
+#define _LXPROC_BRANDED_H
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*
+ * lxproc.h: declarations, data structures and macros for lxprocfs
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/policy.h>
+#include <sys/debug.h>
+#include <sys/dirent.h>
+#include <sys/errno.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/user.h>
+#include <sys/t_lock.h>
+#include <sys/sysmacros.h>
+#include <sys/cred.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <sys/cmn_err.h>
+#include <sys/zone.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/dnlc.h>
+#include <sys/atomic.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <vm/as.h>
+#include <vm/anon.h>
+
+/*
+ * Convert a vnode into an lxpr_mnt_t
+ */
+#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data)
+
+/*
+ * convert a vnode into an lxpr_node
+ */
+#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data)
+
+/*
+ * convert a lxprnode into a vnode
+ */
+#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode)
+
+/*
+ * convert a lxpr_node into zone for fs
+ */
+#define LXPTOZ(lxpnp) \
+	(((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone)
+
+#define LXPNSIZ 256 /* max size of lx /proc file name entries */
+
+/*
+ * Pretend that a directory entry takes 16 bytes
+ */
+#define LXPR_SDSIZE 16
+
+/*
+ * Node/file types for lx /proc files
+ * (directories and files contained therein).
+ */
+typedef enum lxpr_nodetype {
+	LXPR_PROCDIR, /* /proc */
+	LXPR_PIDDIR, /* /proc/<pid> */
+	LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */
+	LXPR_PID_CPU, /* /proc/<pid>/cpu */
+	LXPR_PID_CURDIR, /* /proc/<pid>/cwd */
+	LXPR_PID_ENV, /* /proc/<pid>/environ */
+	LXPR_PID_EXE, /* /proc/<pid>/exe */
+	LXPR_PID_LIMITS, /* /proc/<pid>/limits */
+	LXPR_PID_MAPS, /* /proc/<pid>/maps */
+	LXPR_PID_MEM, /* /proc/<pid>/mem */
+	LXPR_PID_MOUNTINFO, /* /proc/<pid>/mountinfo */
+	LXPR_PID_ROOTDIR, /* /proc/<pid>/root */
+	LXPR_PID_STAT, /* /proc/<pid>/stat */
+	LXPR_PID_STATM, /* /proc/<pid>/statm */
+	LXPR_PID_STATUS, /* /proc/<pid>/status */
+	LXPR_PID_TASKDIR, /* /proc/<pid>/task */
+	LXPR_PID_TASK_IDDIR, /* /proc/<pid>/task/<tid> */
+	LXPR_PID_FDDIR, /* /proc/<pid>/fd */
+	LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */
+	LXPR_PID_TID_CMDLINE, /* /proc/<pid>/task/<tid>/cmdline */
+	LXPR_PID_TID_CPU, /* /proc/<pid>/task/<tid>/cpu */
+	LXPR_PID_TID_CURDIR, /* /proc/<pid>/task/<tid>/cwd */
+	LXPR_PID_TID_ENV, /* /proc/<pid>/task/<tid>/environ */
+	LXPR_PID_TID_EXE, /* /proc/<pid>/task/<tid>/exe */
+	LXPR_PID_TID_LIMITS, /* /proc/<pid>/task/<tid>/limits */
+	LXPR_PID_TID_MAPS, /* /proc/<pid>/task/<tid>/maps */
+	LXPR_PID_TID_MEM, /* /proc/<pid>/task/<tid>/mem */
+	LXPR_PID_TID_MOUNTINFO, /* /proc/<pid>/task/<tid>/mountinfo */
+	LXPR_PID_TID_ROOTDIR, /* /proc/<pid>/task/<tid>/root */
+	LXPR_PID_TID_STAT, /* /proc/<pid>/task/<tid>/stat */
+	LXPR_PID_TID_STATM, /* /proc/<pid>/task/<tid>/statm */
+	LXPR_PID_TID_STATUS, /* /proc/<pid>/task/<tid>/status */
+	LXPR_PID_TID_FDDIR, /* /proc/<pid>/task/<tid>/fd */
+	LXPR_PID_TID_FD_FD, /* /proc/<pid>/task/<tid>/fd/nn */
+	LXPR_CMDLINE, /* /proc/cmdline */
+	LXPR_CPUINFO, /* /proc/cpuinfo */
+	LXPR_DEVICES, /* /proc/devices */
+	LXPR_DISKSTATS, /* /proc/diskstats */
+	LXPR_DMA, /* 
/proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MODULES, /* /proc/modules */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IF_INET6, /* /proc/net/if_inet6 */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_IPV6_ROUTE, /* /proc/net/ipv6_route */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_TCP6, /* /proc/net/tcp6 */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UDP6, /* /proc/net/udp6 */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_SWAPS, /* /proc/swaps */ + LXPR_SYSDIR, /* /proc/sys/ */ + LXPR_SYS_FSDIR, /* /proc/sys/fs/ */ + LXPR_SYS_FS_INOTIFYDIR, /* /proc/sys/fs/inotify */ + LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS, /* inotify/max_queued_events */ + LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES, /* inotify/max_user_instances */ + LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES, /* inotify/max_user_watches */ + LXPR_SYS_KERNELDIR, /* /proc/sys/kernel/ */ + LXPR_SYS_KERNEL_HOSTNAME, /* /proc/sys/kernel/hostname */ + LXPR_SYS_KERNEL_MSGMNI, /* /proc/sys/kernel/msgmni */ + LXPR_SYS_KERNEL_NGROUPS_MAX, /* /proc/sys/kernel/ngroups_max */ + LXPR_SYS_KERNEL_PID_MAX, /* /proc/sys/kernel/pid_max */ + LXPR_SYS_KERNEL_SHMMAX, /* /proc/sys/kernel/shmmax */ + LXPR_SYS_KERNEL_THREADS_MAX, /* /proc/sys/kernel/threads-max */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * Linux sector size for /proc/diskstats + */ +#define LXPR_SECTOR_SIZE 512 + +/* + * external dirent characteristics + */ +typedef struct { + lxpr_nodetype_t d_type; + char *d_name; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + uint_t lxpr_desc; /* addl. 
descriptor (fd or tid) */
+	ino_t lxpr_ino; /* node id */
+} lxpr_node_t;
+
+struct zone; /* forward declaration */
+
+/*
+ * This is the lxprocfs private data object
+ * which is attached to vfs_data in the vfs structure
+ */
+typedef struct lxpr_mnt {
+	lxpr_node_t *lxprm_node; /* node at root of proc mount */
+	struct zone *lxprm_zone; /* zone for this mount */
+	ldi_ident_t lxprm_li; /* ident for ldi */
+} lxpr_mnt_t;
+
+extern vnodeops_t *lxpr_vnodeops;
+extern int nproc_highbit; /* highbit(v.v_nproc) */
+
+typedef struct mounta mounta_t;
+
+extern void lxpr_initnodecache();
+extern void lxpr_fininodecache();
+extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *);
+extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int);
+extern ino_t lxpr_parentinode(lxpr_node_t *);
+extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int);
+extern void lxpr_freenode(lxpr_node_t *);
+
+typedef struct lxpr_uiobuf lxpr_uiobuf_t;
+extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *);
+extern void lxpr_uiobuf_free(lxpr_uiobuf_t *);
+extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t);
+extern boolean_t lxpr_uiobuf_nonblock(lxpr_uiobuf_t *);
+extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t);
+extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...);
+extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int);
+
+proc_t *lxpr_lock(pid_t);
+void lxpr_unlock(proc_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifndef islower
+#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z'))
+#endif
+#ifndef toupper
+#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x))
+#endif
+
+#endif /* _LXPROC_H */
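The VTOLXP()/LXPTOV() pair above is the bridge between the generic vnode layer and the lxprocfs private node, and LXPTOZ() recovers the zone a node belongs to. A minimal sketch of the idiom follows, written as a hypothetical attribute handler; lxpr_getattr_sketch() is illustrative only and not part of this patch.

	static int
	lxpr_getattr_sketch(vnode_t *vp, vattr_t *vap)
	{
		lxpr_node_t *lxpnp = VTOLXP(vp);	/* vnode -> private node */

		ASSERT(LXPTOV(lxpnp) == vp);	/* the mapping is one-to-one */

		vap->va_mode = lxpnp->lxpr_mode;
		vap->va_uid = lxpnp->lxpr_uid;
		vap->va_gid = lxpnp->lxpr_gid;
		vap->va_nodeid = lxpnp->lxpr_ino;
		vap->va_atime = vap->va_mtime = vap->va_ctime =
		    lxpnp->lxpr_time;
		return (0);
	}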
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c
new file mode 100644
index 0000000000..6dad5ce7d2
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_prsubr.c
@@ -0,0 +1,570 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * lxprsubr.c: Various functions for the /lxproc vnodeops.
+ */
+
+#include <sys/varargs.h>
+
+#include <sys/cpuvar.h>
+#include <sys/mman.h>
+#include <sys/vmsystm.h>
+#include <sys/prsystm.h>
+
+#include "lx_proc.h"
+
+#define LXPRCACHE_NAME "lxbpr_cache"
+
+static int lxpr_node_constructor(void *, void *, int);
+static void lxpr_node_destructor(void *, void *);
+
+static kmem_cache_t *lxpr_node_cache;
+
+struct lxpr_uiobuf {
+	uio_t *uiop;
+	char *buffer;
+	uint32_t buffsize;
+	char *pos;
+	size_t beg;
+	int error;
+};
+
+int lx_pr_bufsize = 4000;
+
+struct lxpr_uiobuf *
+lxpr_uiobuf_new(uio_t *uiop)
+{
+	/* Allocate memory for both lxpr_uiobuf and output buffer */
+	int bufsize = lx_pr_bufsize;
+	struct lxpr_uiobuf *uiobuf =
+	    kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP);
+
+	uiobuf->uiop = uiop;
+	uiobuf->buffer = (char *)&uiobuf[1];
+	uiobuf->buffsize = bufsize;
+	uiobuf->pos = uiobuf->buffer;
+	uiobuf->beg = 0;
+	uiobuf->error = 0;
+
+	return (uiobuf);
+}
+
+void
+lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf)
+{
+	ASSERT(uiobuf != NULL);
+	ASSERT(uiobuf->pos == uiobuf->buffer);
+
+	kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize);
+}
+
+void
+lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset)
+{
+	uiobuf->uiop->uio_offset = (off_t)offset;
+}
+
+boolean_t
+lxpr_uiobuf_nonblock(struct lxpr_uiobuf *uiobuf)
+{
+	if ((uiobuf->uiop->uio_fmode & FNONBLOCK) != 0)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+void
+lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err)
+{
+	ASSERT(uiobuf->error == 0);
+
+	uiobuf->error = err;
+}
+
+int
+lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf)
+{
+	off_t off = uiobuf->uiop->uio_offset;
+	caddr_t uaddr = uiobuf->buffer;
+	size_t beg = uiobuf->beg;
+	size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr;
+
+	if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		ASSERT(off >= beg);
+
+		if (beg + size > off && off >= 0)
+			uiobuf->error =
+			    uiomove(uaddr + (off - beg), size - (off - beg),
+			    UIO_READ, uiobuf->uiop);
+
+		uiobuf->beg += size;
+	}
+
+	uiobuf->pos = uaddr;
+
+	return (uiobuf->error);
+}
+
+void
+lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size)
+{
+	/* While we can still carry on */
+	while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) {
+		uintptr_t remain = (uintptr_t)uiobuf->buffsize -
+		    ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer);
+
+		/* Enough space in buffer? */
+		if (remain >= size) {
+			bcopy(buf, uiobuf->pos, size);
+			uiobuf->pos += size;
+			return;
+		}
+
+		/* Not enough space, so copy all we can and try again */
+		bcopy(buf, uiobuf->pos, remain);
+		uiobuf->pos += remain;
+		(void) lxpr_uiobuf_flush(uiobuf);
+		buf += remain;
+		size -= remain;
+	}
+}
+
+#define TYPBUFFSIZE 256
+
+void
+lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...)
+{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. + */ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + pid_t find_pid; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process; + * if 0 we want zsched. + */ + if (pid == 1) { + find_pid = curproc->p_zone->zone_proc_initpid; + } else if (pid == 0) { + find_pid = curproc->p_zone->zone_zsched->p_pid; + } else { + find_pid = pid; + } + p = prfind(find_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (p->p_flag & SEXITING) { + /* + * This process is exiting -- let it go. + */ + mutex_exit(mp); + return (NULL); + } + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int desc) +{ + if (pid == 1) { + pid = curproc->p_zone->zone_proc_initpid; + } else if (pid == 0) { + pid = curproc->p_zone->zone_zsched->p_pid; + } + + switch (type) { + case LXPR_PIDDIR: + return (maxpid + pid + 1); + case LXPR_PID_TASK_IDDIR: + return (maxpid + (desc * 10)); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES 
+ desc);
+	default:
+		return (maxpid + 2 +
+		    (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) +
+		    type);
+	}
+}
+
+/*
+ * Return inode number of parent (directory)
+ */
+ino_t
+lxpr_parentinode(lxpr_node_t *lxpnp)
+{
+	/*
+	 * If the input node is the root then the parent inode
+	 * is the mounted on inode so just return our inode number
+	 */
+	if (lxpnp->lxpr_type != LXPR_PROCDIR)
+		return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino);
+	else
+		return (lxpnp->lxpr_ino);
+}
+
+/*
+ * Allocate a new lxproc node
+ *
+ * This also allocates the vnode associated with it
+ */
+lxpr_node_t *
+lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int desc)
+{
+	lxpr_node_t *lxpnp;
+	vnode_t *vp;
+	user_t *up;
+	timestruc_t now;
+
+	/*
+	 * Allocate a new node. It is deallocated in vop_inactive
+	 */
+	lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP);
+
+	/*
+	 * Set defaults (may be overridden below)
+	 */
+	gethrestime(&now);
+	lxpnp->lxpr_type = type;
+	lxpnp->lxpr_realvp = NULL;
+	lxpnp->lxpr_parent = dp;
+	lxpnp->lxpr_desc = desc;
+	VN_HOLD(dp);
+	if (p != NULL) {
+		if (p->p_pid == curproc->p_zone->zone_proc_initpid) {
+			lxpnp->lxpr_pid = 1;
+		} else if (p->p_pid == curproc->p_zone->zone_zsched->p_pid) {
+			lxpnp->lxpr_pid = 0;
+		} else {
+			lxpnp->lxpr_pid = p->p_pid;
+		}
+
+		lxpnp->lxpr_time = PTOU(p)->u_start;
+		lxpnp->lxpr_uid = crgetruid(p->p_cred);
+		lxpnp->lxpr_gid = crgetrgid(p->p_cred);
+		lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, desc);
+	} else {
+		/* Pretend files without a proc belong to sched */
+		lxpnp->lxpr_pid = 0;
+		lxpnp->lxpr_time = now;
+		lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0;
+		lxpnp->lxpr_ino = lxpr_inode(type, 0, 0);
+	}
+
+	/* initialize the vnode data */
+	vp = lxpnp->lxpr_vnode;
+	vn_reinit(vp);
+	vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT;
+	vp->v_vfsp = dp->v_vfsp;
+
+	/*
+	 * Do node specific stuff
+	 */
+	switch (type) {
+	case LXPR_PROCDIR:
+		vp->v_flag |= VROOT;
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555; /* read-search by everyone */
+		break;
+
+	case LXPR_PID_CURDIR:
+		ASSERT(p != NULL);
+
+		/*
+		 * Zombie check. p_stat is officially protected by pidlock,
+		 * but we can't grab pidlock here because we already hold
+		 * p_lock. Luckily if we look at the process exit code
+		 * we see that p_stat only transitions from SRUN to SZOMB
+		 * while p_lock is held. Aside from this, the only other
+		 * p_stat transition that we need to be aware about is
+		 * SIDL to SRUN, but that's not a problem since lxpr_lock()
+		 * ignores nodes in the SIDL state so we'll never get a node
+		 * that isn't already in the SRUN state.
+		 */
+		if (p->p_stat == SZOMB) {
+			lxpnp->lxpr_realvp = NULL;
+		} else {
+			up = PTOU(p);
+			lxpnp->lxpr_realvp = up->u_cdir;
+			ASSERT(lxpnp->lxpr_realvp != NULL);
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+		break;
+
+	case LXPR_PID_ROOTDIR:
+		ASSERT(p != NULL);
+		/* Zombie check. see locking comment above */
+		if (p->p_stat == SZOMB) {
+			lxpnp->lxpr_realvp = NULL;
+		} else {
+			up = PTOU(p);
+			lxpnp->lxpr_realvp =
+			    up->u_rdir != NULL ? up->u_rdir : rootdir;
+			ASSERT(lxpnp->lxpr_realvp != NULL);
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+		break;
+
+	case LXPR_PID_EXE:
+		ASSERT(p != NULL);
+		lxpnp->lxpr_realvp = p->p_exec;
+		if (lxpnp->lxpr_realvp != NULL) {
+			VN_HOLD(lxpnp->lxpr_realvp);
+		}
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777;
+		break;
+
+	case LXPR_SELF:
+		vp->v_type = VLNK;
+		lxpnp->lxpr_mode = 0777; /* anyone does anything ! */
+		break;
+
+	case LXPR_PID_TASKDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555; /* read-search by everyone */
+		break;
+
+	case LXPR_PID_TASK_IDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555; /* read-search by everyone */
+		break;
+
+	case LXPR_PID_FD_FD:
+		ASSERT(p != NULL);
+		/* lxpr_realvp is set after we return */
+		lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */
+		vp->v_type = VLNK;
+		break;
+
+	case LXPR_PID_FDDIR:
+	case LXPR_PID_TID_FDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0500; /* read-search by owner only */
+		break;
+
+	case LXPR_PIDDIR:
+		ASSERT(p != NULL);
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0511;
+		break;
+
+	case LXPR_SYSDIR:
+	case LXPR_SYS_FSDIR:
+	case LXPR_SYS_FS_INOTIFYDIR:
+	case LXPR_SYS_KERNELDIR:
+	case LXPR_NETDIR:
+		vp->v_type = VDIR;
+		lxpnp->lxpr_mode = 0555; /* read-search by all */
+		break;
+
+	case LXPR_PID_ENV:
+	case LXPR_PID_MEM:
+		ASSERT(p != NULL);
+		/*FALLTHRU*/
+	case LXPR_KCORE:
+		vp->v_type = VREG;
+		lxpnp->lxpr_mode = 0400; /* read-only by owner only */
+		break;
+
+	default:
+		vp->v_type = VREG;
+		lxpnp->lxpr_mode = 0444; /* read-only by all */
+		break;
+	}
+
+	return (lxpnp);
+}
+
+
+/*
+ * Free the storage obtained from lxpr_getnode().
+ */
+void
+lxpr_freenode(lxpr_node_t *lxpnp)
+{
+	ASSERT(lxpnp != NULL);
+	ASSERT(LXPTOV(lxpnp) != NULL);
+
+	/*
+	 * delete any association with realvp
+	 */
+	if (lxpnp->lxpr_realvp != NULL)
+		VN_RELE(lxpnp->lxpr_realvp);
+
+	/*
+	 * delete any association with parent vp
+	 */
+	if (lxpnp->lxpr_parent != NULL)
+		VN_RELE(lxpnp->lxpr_parent);
+
+	/*
+	 * Release the lxprnode.
+	 */
+	kmem_cache_free(lxpr_node_cache, lxpnp);
+}
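The lxpr_uiobuf_* routines above let each /proc read handler format a whole file image while the buffer transparently windows the output against the caller's uio offset and residual count. A minimal sketch of the intended calling pattern follows; lxpr_read_sketch() is hypothetical (the real read entry points live in lx_prvnops.c). Note that the buffer must be flushed before it is freed, since lxpr_uiobuf_free() asserts that the buffer is empty.

	static int
	lxpr_read_sketch(lxpr_node_t *lxpnp, uio_t *uiop)
	{
		lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop);
		int error;

		/* Format the entire file; uiomove() happens lazily. */
		lxpr_uiobuf_printf(uiobuf, "pid %d\n", (int)lxpnp->lxpr_pid);

		error = lxpr_uiobuf_flush(uiobuf);	/* drain the remainder */
		lxpr_uiobuf_free(uiobuf);
		return (error);
	}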
diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c
new file mode 100644
index 0000000000..e9dda7864a
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/procfs/lx_prvfsops.c
@@ -0,0 +1,374 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * lxprvfsops.c: vfs operations for /lxprocfs.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/cmn_err.h>
+#include <sys/cred.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/statvfs.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/var.h>
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+#include <sys/vnode.h>
+#include <sys/mode.h>
+#include <sys/signal.h>
+#include <sys/user.h>
+#include <sys/mount.h>
+#include <sys/bitmap.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/lx_impl.h>
+
+#include "lx_proc.h"
+
+/* Module level parameters */
+static int lxprocfstype;
+static dev_t lxprocdev;
+static kmutex_t lxpr_mount_lock;
+
+int nproc_highbit; /* highbit(v.v_nproc) */
+
+static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *);
+static int lxpr_unmount(vfs_t *, int, cred_t *);
+static int lxpr_root(vfs_t *, vnode_t **);
+static int lxpr_statvfs(vfs_t *, statvfs64_t *);
+static int lxpr_init(int, char *);
+
+static vfsdef_t vfw = {
+	VFSDEF_VERSION,
+	"lx_proc",
+	lxpr_init,
+	VSW_ZMOUNT,
+	NULL
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+extern struct mod_ops mod_fsops;
+
+static struct modlfs modlfs = {
+	&mod_fsops, "lx brand procfs", &vfw
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modlfs, NULL
+};
+
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	int retval;
+
+	/*
+	 * attempt to unload the module
+	 */
+	if ((retval = mod_remove(&modlinkage)) != 0)
+		goto done;
+
+	/*
+	 * destroy lxpr_node cache
+	 */
+	lxpr_fininodecache();
+
+	/*
+	 * clean out the vfsops and vnodeops
+	 */
+	(void) vfs_freevfsops_by_type(lxprocfstype);
+	vn_freevnodeops(lxpr_vnodeops);
+
+	mutex_destroy(&lxpr_mount_lock);
+done:
+	return (retval);
+}
+
+static int
+lxpr_init(int fstype, char *name)
+{
+	static const fs_operation_def_t lxpr_vfsops_template[] = {
+		VFSNAME_MOUNT, { .vfs_mount = lxpr_mount },
+		VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount },
+		VFSNAME_ROOT, { .vfs_root = lxpr_root },
+		VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs },
+		NULL, NULL
+	};
+	extern const fs_operation_def_t lxpr_vnodeops_template[];
+	int error;
+	major_t dev;
+
+	nproc_highbit = highbit(v.v_proc);
+	lxprocfstype = fstype;
+	ASSERT(lxprocfstype != 0);
+
+	mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * Associate VFS ops vector with this fstype.
+	 */
+	error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL);
+	if (error != 0) {
+		cmn_err(CE_WARN, "lxpr_init: bad vfs ops template");
+		return (error);
+	}
+
+	/*
+	 * Set up vnode ops vector too.
+	 */
+	error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops);
+	if (error != 0) {
+		(void) vfs_freevfsops_by_type(fstype);
+		cmn_err(CE_WARN, "lxpr_init: bad vnode ops template");
+		return (error);
+	}
+
+	/*
+	 * Assign a unique "device" number (used by stat(2)).
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialise cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. 
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + + /* We know f_fstr is 32 chars */ + (void) strcpy(sp->f_fstr, "/proc"); + (void) strcpy(&sp->f_fstr[6], "/proc"); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c new file mode 100644 index 0000000000..9b814f7ab4 --- /dev/null +++ b/usr/src/uts/common/brand/lx/procfs/lx_prvnops.c @@ -0,0 +1,5296 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +/* + * lx_proc -- a Linux-compatible /proc for the LX brand + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support native (but Linux-borne) programs that wish to view the native + * system through the Linux /proc model; the other -- this one -- is to + * support Linux binaries via the LX brand. These two implementations differ + * greatly in their aspirations (and their willingness to bend the truth + * of the system to accommodate those aspirations); they should not be unified. 
+ */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <lx_signum.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> +#include <sys/param.h> +#include <sys/utsname.h> +#include <sys/rctl.h> +#include <sys/kstat.h> +#include <sys/lx_misc.h> +#include <sys/brand.h> +#include <sys/cred_impl.h> +#include <sys/tihdr.h> +#include <inet/ip.h> +#include <inet/ip_ire.h> +#include <inet/ip6.h> +#include <inet/ip_if.h> +#include <inet/tcp.h> +#include <inet/udp_impl.h> +#include <inet/ipclassifier.h> +#include <sys/socketvar.h> +#include <fs/sockfs/socktpi.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); + +#include "lx_proc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_readlink_pid_fd(lxpr_node_t *lxpnp, char *bp, size_t len); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sysdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_fsdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_fs_inotifydir(vnode_t *, char *); +static vnode_t *lxpr_lookup_sys_kerneldir(vnode_t *, char *); +static vnode_t *lxpr_lookup_taskdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_task_tid_dir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sysdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_fsdir(lxpr_node_t *, uio_t *, int *); +static int 
lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_sys_kerneldir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_taskdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_task_tid_dir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_diskstats(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_swaps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_env(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_limits(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_mountinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_tid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_tid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_if_inet6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ipv6_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp6(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *, + lxpr_uiobuf_t 
*); +static void lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *, + lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_hostname(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_msgmni(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_pid_max(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_shmmax(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_sys_kernel_threads_max(lxpr_node_t *, lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t)) + +extern rctl_hndl_t rc_zone_msgmni; +extern rctl_hndl_t rc_zone_shmmax; +#define FOURGB 4294967295 + +/* + * The maximum length of the concatenation of argument vector strings we + * will return to the user via the branded procfs. Likewise for the env vector. + */ +int lxpr_maxargvlen = 4096; +int lxpr_maxenvvlen = 4096; + +/* + * The lx /proc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + + +/* + * file contents of an lx /proc directory. + */ +static lxpr_dirent_t lx_procdir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DISKSTATS, "diskstats" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MODULES, "modules" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_SWAPS, "swaps" }, + { LXPR_SYSDIR, "sys" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lx_procdir) / sizeof (lx_procdir[0])) + +/* + * Contents of an lx /proc/<pid> directory. + */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_LIMITS, "limits" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_MOUNTINFO, "mountinfo" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_TASKDIR, "task" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * Contents of an lx /proc/<pid>/task/<tid> directory. 
+ */ +static lxpr_dirent_t tiddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_LIMITS, "limits" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_MOUNTINFO, "mountinfo" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_TID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_TID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define TIDDIRFILES (sizeof (tiddir) / sizeof (tiddir[0])) + +#define LX_RLIM_INFINITY 0xFFFFFFFFFFFFFFFF + +#define RCTL_INFINITE(x) \ + ((x->rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \ + (x->rcv_flagaction & RCTL_GLOBAL_INFINITE)) + +typedef struct lxpr_rlimtab { + char *rlim_name; /* limit name */ + char *rlim_unit; /* limit unit */ + char *rlim_rctl; /* rctl source */ +} lxpr_rlimtab_t; + +static lxpr_rlimtab_t lxpr_rlimtab[] = { + { "Max cpu time", "seconds", "process.max-cpu-time" }, + { "Max file size", "bytes", "process.max-file-size" }, + { "Max data size", "bytes", "process.max-data-size" }, + { "Max stack size", "bytes", "process.max-stack-size" }, + { "Max core file size", "bytes", "process.max-core-size" }, + { "Max resident set", "bytes", "zone.max-physical-memory" }, + { "Max processes", "processes", "zone.max-lwps" }, + { "Max open files", "files", "process.max-file-descriptor" }, + { "Max locked memory", "bytes", "zone.max-locked-memory" }, + { "Max address space", "bytes", "process.max-address-space" }, + { "Max file locks", "locks", NULL }, + { "Max pending signals", "signals", + "process.max-sigqueue-size" }, + { "Max msgqueue size", "bytes", "process.max-msg-messages" }, + { NULL, NULL, NULL } +}; + + +/* + * contents of lx /proc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IF_INET6, "if_inet6" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_IPV6_ROUTE, "ipv6_route" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_TCP6, "tcp6" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UDP6, "udp6" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * contents of /proc/sys directory + */ +static lxpr_dirent_t sysdir[] = { + { LXPR_SYS_FSDIR, "fs" }, + { LXPR_SYS_KERNELDIR, "kernel" }, +}; + +#define SYSDIRFILES (sizeof (sysdir) / sizeof (sysdir[0])) + +/* + * contents of /proc/sys/fs directory + */ +static lxpr_dirent_t sys_fsdir[] = { + { LXPR_SYS_FS_INOTIFYDIR, "inotify" }, +}; + +#define SYS_FSDIRFILES (sizeof (sys_fsdir) / sizeof (sys_fsdir[0])) + +/* + * contents of /proc/sys/fs/inotify directory + */ +static lxpr_dirent_t sys_fs_inotifydir[] = { + { LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, + { LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, + { LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, +}; + +#define SYS_FS_INOTIFYDIRFILES \ + (sizeof (sys_fs_inotifydir) / sizeof (sys_fs_inotifydir[0])) + +/* + * contents of /proc/sys/kernel directory + */ +static lxpr_dirent_t sys_kerneldir[] = { + { LXPR_SYS_KERNEL_HOSTNAME, "hostname" }, + { 
LXPR_SYS_KERNEL_MSGMNI,	"msgmni" },
+	{ LXPR_SYS_KERNEL_NGROUPS_MAX,	"ngroups_max" },
+	{ LXPR_SYS_KERNEL_PID_MAX,	"pid_max" },
+	{ LXPR_SYS_KERNEL_SHMMAX,	"shmmax" },
+	{ LXPR_SYS_KERNEL_THREADS_MAX,	"threads-max" },
+};
+
+#define	SYS_KERNELDIRFILES (sizeof (sys_kerneldir) / sizeof (sys_kerneldir[0]))
+
+/*
+ * lxpr_open(): Vnode operation for VOP_OPEN()
+ */
+static int
+lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+	vnode_t *vp = *vpp;
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	vnode_t *rvp;
+	int error = 0;
+
+	/*
+	 * We only allow reading in this file system
+	 */
+	if (flag & FWRITE)
+		return (EROFS);
+
+	/*
+	 * If we are opening an underlying file only allow regular files,
+	 * fifos or sockets; reject the open for anything else.
+	 * Just do it if we are opening the current or root directory.
+	 */
+	if (lxpnp->lxpr_realvp != NULL) {
+		rvp = lxpnp->lxpr_realvp;
+
+		if (type == LXPR_PID_FD_FD && rvp->v_type != VREG &&
+		    rvp->v_type != VFIFO && rvp->v_type != VSOCK) {
+			error = EACCES;
+		} else {
+			if (type == LXPR_PID_FD_FD && rvp->v_type == VFIFO) {
+				/*
+				 * This flag lets the fifo open know that
+				 * we're using proc/fd to open a fd which we
+				 * already have open. Otherwise, the fifo might
+				 * reject an open if the other end has closed.
+				 */
+				flag |= FKLYR;
+			}
+			/*
+			 * Need to hold rvp since VOP_OPEN() may release it.
+			 */
+			VN_HOLD(rvp);
+			error = VOP_OPEN(&rvp, flag, cr, ct);
+			if (error) {
+				VN_RELE(rvp);
+			} else {
+				*vpp = rvp;
+				VN_RELE(vp);
+			}
+		}
+	}
+
+	return (error);
+}
+
+
+/*
+ * lxpr_close(): Vnode operation for VOP_CLOSE()
+ */
+/* ARGSUSED */
+static int
+lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxpr_node_t *lxpr = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpr->lxpr_type;
+
+	/*
+	 * we should never get here because the close is done on the realvp
+	 * for these nodes
+	 */
+	ASSERT(type != LXPR_PID_FD_FD &&
+	    type != LXPR_PID_CURDIR &&
+	    type != LXPR_PID_ROOTDIR &&
+	    type != LXPR_PID_EXE);
+
+	return (0);
+}
+
+static void (*lxpr_read_function[LXPR_NFILES])() = {
+	lxpr_read_isdir,		/* /proc */
+	lxpr_read_isdir,		/* /proc/<pid> */
+	lxpr_read_pid_cmdline,		/* /proc/<pid>/cmdline */
+	lxpr_read_empty,		/* /proc/<pid>/cpu */
+	lxpr_read_invalid,		/* /proc/<pid>/cwd */
+	lxpr_read_pid_env,		/* /proc/<pid>/environ */
+	lxpr_read_invalid,		/* /proc/<pid>/exe */
+	lxpr_read_pid_limits,		/* /proc/<pid>/limits */
+	lxpr_read_pid_maps,		/* /proc/<pid>/maps */
+	lxpr_read_empty,		/* /proc/<pid>/mem */
+	lxpr_read_pid_mountinfo,	/* /proc/<pid>/mountinfo */
+	lxpr_read_invalid,		/* /proc/<pid>/root */
+	lxpr_read_pid_stat,		/* /proc/<pid>/stat */
+	lxpr_read_pid_statm,		/* /proc/<pid>/statm */
+	lxpr_read_pid_status,		/* /proc/<pid>/status */
+	lxpr_read_isdir,		/* /proc/<pid>/task */
+	lxpr_read_isdir,		/* /proc/<pid>/task/nn */
+	lxpr_read_isdir,		/* /proc/<pid>/fd */
+	lxpr_read_fd,			/* /proc/<pid>/fd/nn */
+	lxpr_read_pid_cmdline,		/* /proc/<pid>/task/<tid>/cmdline */
+	lxpr_read_empty,		/* /proc/<pid>/task/<tid>/cpu */
+	lxpr_read_invalid,		/* /proc/<pid>/task/<tid>/cwd */
+	lxpr_read_pid_env,		/* /proc/<pid>/task/<tid>/environ */
+	lxpr_read_invalid,		/* /proc/<pid>/task/<tid>/exe */
+	lxpr_read_pid_limits,		/* /proc/<pid>/task/<tid>/limits */
+	lxpr_read_pid_maps,		/* /proc/<pid>/task/<tid>/maps */
+	lxpr_read_empty,		/* /proc/<pid>/task/<tid>/mem */
+	lxpr_read_pid_mountinfo,	/* /proc/<pid>/task/<tid>/mountinfo */
+	lxpr_read_invalid,		/* /proc/<pid>/task/<tid>/root */
+
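
/*
 * A note on the table above: it is indexed by lxpr_nodetype_t, so the
 * entries must stay in exact enum order; lxpr_read() (below) simply
 * dispatches through it.  A minimal sketch of the same pattern follows,
 * with hypothetical names that are not part of this patch (wrapped in
 * #if 0 so it stays out of any build).
 */
#if 0	/* illustrative sketch only -- not part of the patch */
typedef enum { NODE_FOO, NODE_BAR, NODE_MAX } nodetype_t;
typedef struct buf buf_t;		/* hypothetical output buffer */

static void read_foo(buf_t *b);		/* one handler per file type */
static void read_bar(buf_t *b);

static void (*read_fn[NODE_MAX])(buf_t *) = {
	read_foo,	/* NODE_FOO */
	read_bar,	/* NODE_BAR */
};

static void
dispatch_read(nodetype_t t, buf_t *b)
{
	ASSERT(t < NODE_MAX);	/* table and enum must match in size */
	read_fn[t](b);
}
#endif
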
lxpr_read_pid_tid_stat, /* /proc/<pid>/task/<tid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/task/<tid>/statm */ + lxpr_read_pid_tid_status, /* /proc/<pid>/task/<tid>/status */ + lxpr_read_isdir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_read_fd, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_diskstats, /* /proc/diskstats */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_empty, /* /proc/modules */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_if_inet6, /* /proc/net/if_inet6 */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_ipv6_route, /* /proc/net/ipv6_route */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_tcp6, /* /proc/net/tcp6 */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_udp6, /* /proc/net/udp6 */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_swaps, /* /proc/swaps */ + lxpr_read_invalid, /* /proc/sys */ + lxpr_read_invalid, /* /proc/sys/fs */ + lxpr_read_invalid, /* /proc/sys/fs/inotify */ + lxpr_read_sys_fs_inotify_max_queued_events, /* max_queued_events */ + lxpr_read_sys_fs_inotify_max_user_instances, /* max_user_instances */ + lxpr_read_sys_fs_inotify_max_user_watches, /* max_user_watches */ + lxpr_read_invalid, /* /proc/sys/kernel */ + lxpr_read_sys_kernel_hostname, /* /proc/sys/kernel/hostname */ + lxpr_read_sys_kernel_msgmni, /* /proc/sys/kernel/msgmni */ + lxpr_read_sys_kernel_ngroups_max, /* /proc/sys/kernel/ngroups_max */ + lxpr_read_sys_kernel_pid_max, /* /proc/sys/kernel/pid_max */ + lxpr_read_sys_kernel_shmmax, /* /proc/sys/kernel/shmmax */ + lxpr_read_sys_kernel_threads_max, /* /proc/sys/kernel/threads-max */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by lx /proc file type. 
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/limits */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mountinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_taskdir, /* /proc/<pid>/task */ + lxpr_lookup_task_tid_dir, /* /proc/<pid>/task/nn */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/limits */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/diskstats */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/modules */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/if_inet6 */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/ipv6_route */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp6 */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp6 */ + lxpr_lookup_not_a_dir, 
/* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/swaps */ + lxpr_lookup_sysdir, /* /proc/sys */ + lxpr_lookup_sys_fsdir, /* /proc/sys/fs */ + lxpr_lookup_sys_fs_inotifydir, /* /proc/sys/fs/inotify */ + lxpr_lookup_not_a_dir, /* .../inotify/max_queued_events */ + lxpr_lookup_not_a_dir, /* .../inotify/max_user_instances */ + lxpr_lookup_not_a_dir, /* .../inotify/max_user_watches */ + lxpr_lookup_sys_kerneldir, /* /proc/sys/kernel */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/hostname */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/msgmni */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/ngroups_max */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/pid_max */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/shmmax */ + lxpr_lookup_not_a_dir, /* /proc/sys/kernel/threads-max */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. + */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/limits */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mountinfo */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_taskdir, /* /proc/<pid>/task */ + lxpr_readdir_task_tid_dir, /* /proc/<pid>/task/nn */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/limits */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/mountinfo */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/task/<tid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/task/<tid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/diskstats */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* 
/proc/modules */
+	lxpr_readdir_not_a_dir,		/* /proc/mounts */
+	lxpr_readdir_netdir,		/* /proc/net */
+	lxpr_readdir_not_a_dir,		/* /proc/net/arp */
+	lxpr_readdir_not_a_dir,		/* /proc/net/dev */
+	lxpr_readdir_not_a_dir,		/* /proc/net/dev_mcast */
+	lxpr_readdir_not_a_dir,		/* /proc/net/if_inet6 */
+	lxpr_readdir_not_a_dir,		/* /proc/net/igmp */
+	lxpr_readdir_not_a_dir,		/* /proc/net/ip_mr_cache */
+	lxpr_readdir_not_a_dir,		/* /proc/net/ip_mr_vif */
+	lxpr_readdir_not_a_dir,		/* /proc/net/ipv6_route */
+	lxpr_readdir_not_a_dir,		/* /proc/net/mcfilter */
+	lxpr_readdir_not_a_dir,		/* /proc/net/netstat */
+	lxpr_readdir_not_a_dir,		/* /proc/net/raw */
+	lxpr_readdir_not_a_dir,		/* /proc/net/route */
+	lxpr_readdir_not_a_dir,		/* /proc/net/rpc */
+	lxpr_readdir_not_a_dir,		/* /proc/net/rt_cache */
+	lxpr_readdir_not_a_dir,		/* /proc/net/sockstat */
+	lxpr_readdir_not_a_dir,		/* /proc/net/snmp */
+	lxpr_readdir_not_a_dir,		/* /proc/net/stat */
+	lxpr_readdir_not_a_dir,		/* /proc/net/tcp */
+	lxpr_readdir_not_a_dir,		/* /proc/net/tcp6 */
+	lxpr_readdir_not_a_dir,		/* /proc/net/udp */
+	lxpr_readdir_not_a_dir,		/* /proc/net/udp6 */
+	lxpr_readdir_not_a_dir,		/* /proc/net/unix */
+	lxpr_readdir_not_a_dir,		/* /proc/partitions */
+	lxpr_readdir_not_a_dir,		/* /proc/self */
+	lxpr_readdir_not_a_dir,		/* /proc/stat */
+	lxpr_readdir_not_a_dir,		/* /proc/swaps */
+	lxpr_readdir_sysdir,		/* /proc/sys */
+	lxpr_readdir_sys_fsdir,		/* /proc/sys/fs */
+	lxpr_readdir_sys_fs_inotifydir,	/* /proc/sys/fs/inotify */
+	lxpr_readdir_not_a_dir,		/* .../inotify/max_queued_events */
+	lxpr_readdir_not_a_dir,		/* .../inotify/max_user_instances */
+	lxpr_readdir_not_a_dir,		/* .../inotify/max_user_watches */
+	lxpr_readdir_sys_kerneldir,	/* /proc/sys/kernel */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/hostname */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/msgmni */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/ngroups_max */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/pid_max */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/shmmax */
+	lxpr_readdir_not_a_dir,		/* /proc/sys/kernel/threads-max */
+	lxpr_readdir_not_a_dir,		/* /proc/uptime */
+	lxpr_readdir_not_a_dir,		/* /proc/version */
+};
+
+
+/*
+ * lxpr_read(): Vnode operation for VOP_READ()
+ *
+ * As the format of all the files that can be read in the lx procfs is
+ * human-readable text rather than binary structures, there do not have to
+ * be different read variants depending on whether the reading process
+ * model is 32 or 64 bits (at least in general, and certainly the
+ * difference is unlikely to be enough to justify having different routines
+ * for 32-bit and 64-bit reads).
+ */
+/* ARGSUSED */
+static int
+lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr,
+    caller_context_t *ct)
+{
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop);
+	int error;
+
+	ASSERT(type < LXPR_NFILES);
+
+	if (type == LXPR_KMSG) {
+		ldi_ident_t	li = VTOLXPM(vp)->lxprm_li;
+		ldi_handle_t	ldih;
+		struct strioctl	str;
+		int		rv;
+
+		/*
+		 * Open the zone's console device using the layered driver
+		 * interface.
+		 */
+		if ((error =
+		    ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0)
+			return (error);
+
+		/*
+		 * Send an ioctl to the underlying console device, letting it
+		 * know we're interested in getting console messages.
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): read argument vector from process + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + size_t asz = lxpr_maxargvlen, sz; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE || + lxpnp->lxpr_type == LXPR_PID_TID_CMDLINE); + + buf = kmem_alloc(asz, KM_SLEEP); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + kmem_free(buf, asz); + return; + } + + if (prreadargv(p, buf, asz, &sz) != 0) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + } else { + lxpr_uiobuf_write(uiobuf, buf, sz); + } + + lxpr_unlock(p); + kmem_free(buf, asz); +} + +/* + * lxpr_read_pid_env(): read env vector from process + */ +static void +lxpr_read_pid_env(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + size_t asz = lxpr_maxenvvlen, sz; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_ENV); + + buf = kmem_alloc(asz, KM_SLEEP); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + kmem_free(buf, asz); + return; + } + + if (prreadenvv(p, buf, asz, &sz) != 0) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + } else { + lxpr_uiobuf_write(uiobuf, buf, sz); + } + + lxpr_unlock(p); + kmem_free(buf, asz); +} + +/* + * lxpr_read_pid_limits(): ulimit file + */ +static void +lxpr_read_pid_limits(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + rctl_qty_t cur, max; + rctl_val_t *oval, *nval; + rctl_hndl_t hndl; + char *kname; + int i; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_LIMITS || + lxpnp->lxpr_type == LXPR_PID_TID_LIMITS); + + nval = kmem_alloc(sizeof (rctl_val_t), KM_SLEEP); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + kmem_free(nval, sizeof (rctl_val_t)); + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + lxpr_uiobuf_printf(uiobuf, "%-25s %-20s %-20s %-10s\n", + "Limit", "Soft Limit", "Hard Limit", "Units"); + for (i = 0; lxpr_rlimtab[i].rlim_name != NULL; i++) { + kname = lxpr_rlimtab[i].rlim_rctl; + /* default to unlimited for resources without an analog */ + cur = RLIM_INFINITY; + max = RLIM_INFINITY; + if (kname != NULL) { + hndl = rctl_hndl_lookup(kname); + oval = NULL; + while ((hndl != -1) && + rctl_local_get(hndl, oval, nval, p) == 0) { + oval = nval; + switch (nval->rcv_privilege) { + case RCPRIV_BASIC: + if (!RCTL_INFINITE(nval)) + cur = nval->rcv_value; + break; + case RCPRIV_PRIVILEGED: + if 
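
/*
 * The limits handler above emits the Linux four-column layout produced by
 * its "%-25s %-20s %-20s %-10s" header format.  A hypothetical user-land
 * consumer of that file might look like the following sketch (illustrative
 * only, not part of this patch).
 */
#if 0	/* illustrative sketch only -- not part of the patch */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	FILE *f = fopen("/proc/self/limits", "r");
	char line[256];

	if (f == NULL)
		return (1);
	while (fgets(line, sizeof (line), f) != NULL) {
		/* pick out the open-file limit row by its name column */
		if (strncmp(line, "Max open files", 14) == 0)
			fputs(line, stdout);
	}
	(void) fclose(f);
	return (0);
}
#endif
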
(!RCTL_INFINITE(nval)) + max = nval->rcv_value; + break; + } + } + } + + lxpr_uiobuf_printf(uiobuf, "%-25s", lxpr_rlimtab[i].rlim_name); + if (cur == RLIM_INFINITY || cur == LX_RLIM_INFINITY) { + lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited"); + } else { + lxpr_uiobuf_printf(uiobuf, " %-20lu", cur); + } + if (max == RLIM_INFINITY || max == LX_RLIM_INFINITY) { + lxpr_uiobuf_printf(uiobuf, " %-20s", "unlimited"); + } else { + lxpr_uiobuf_printf(uiobuf, " %-20lu", max); + } + lxpr_uiobuf_printf(uiobuf, " %-10s\n", + lxpr_rlimtab[i].rlim_unit); + } + + lxpr_unlock(p); + kmem_free(nval, sizeof (rctl_val_t)); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + uintptr_t saddr; + uintptr_t eaddr; + int type; + char prot[5]; + uintptr_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS || + lxpnp->lxpr_type == LXPR_PID_TID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = (uintptr_t)seg->s_base; + pbuf->eaddr = pbuf->saddr + seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = SEGOP_GETOFFSET(seg, (caddr_t)pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + ino_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (p->p_model == DATAMODEL_LP64) { + lxpr_uiobuf_printf(uiobuf, + "%08llx-%08llx %s %08llx %02x:%02x %llu%s%s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, *buf != '\0' ? 
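
/*
 * Each line built by lxpr_read_pid_maps() above follows the Linux maps
 * layout of "start-end perms offset maj:min inode [path]".  A hypothetical
 * user-land parser of that format (illustrative only, not part of this
 * patch):
 */
#if 0	/* illustrative sketch only -- not part of the patch */
#include <stdio.h>

int
main(void)
{
	FILE *f = fopen("/proc/self/maps", "r");
	char line[4096], prot[5];
	unsigned long start, end;

	if (f == NULL)
		return (1);
	while (fgets(line, sizeof (line), f) != NULL) {
		/* the first three fields are enough for a size summary */
		if (sscanf(line, "%lx-%lx %4s", &start, &end, prot) == 3)
			printf("%8lu KB %s\n", (end - start) >> 10, prot);
	}
	(void) fclose(f);
	return (0);
}
#endif
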
" " : "", buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02x:%02x %llu%s%s\n", + (uint32_t)pbuf->saddr, (uint32_t)pbuf->eaddr, + pbuf->prot, (uint32_t)pbuf->offset, maj, min, + inode, *buf != '\0' ? " " : "", buf); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_mountinfo(): information about process mount points. e.g.: + * 14 19 0:13 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw + * mntid parid devnums root mntpnt mntopts - fstype mntsrc superopts + * + * We have to make up several of these fields. + */ +static void +lxpr_read_pid_mountinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + dev_t vfs_dev; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *printp; + int root_id = 15; /* use a made-up value */ + int mnt_id; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MOUNTINFO || + lxpnp->lxpr_type == LXPR_PID_TID_MOUNTINFO); + + vfs_list_read_lock(); + + /* root is the top-level, it does not appear in this output */ + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "%d 1 %d:%d / / %s - %s / %s\n", + root_id, + major(tvfsp->vfs_dev), minor(vfsp->vfs_dev), + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->vfs_dev = vfsp->vfs_dev; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? 
+ vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + mnt_id = root_id + 1; + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : mntpt; + } + } else { + resource = "none"; + } + + /* + * XXX parent ID is not tracked correctly here. Currently we + * always assume the parent ID is the root ID. + */ + lxpr_uiobuf_printf(uiobuf, + "%d %d %d:%d / %s %s - %s %s %s\n", + mnt_id, root_id, + major(printp->vfs_dev), minor(printp->vfs_dev), + mntpt, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw", + vfssw[printp->vfs_fstype].vsw_name, + resource, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + mnt_id++; + } +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM || + lxpnp->lxpr_type == LXPR_PID_TID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * Look for either the main thread (lookup_id is 0) or the specified thread. + * If we're looking for the main thread but the proc does not have one, we + * fallback to using prchoose to get any thread available. + */ +static kthread_t * +lxpr_get_thread(proc_t *p, uint_t lookup_id) +{ + kthread_t *t; + uint_t emul_tid; + lx_lwp_data_t *lwpd; + pid_t pid = p->p_pid; + pid_t init_pid = curproc->p_zone->zone_proc_initpid; + + /* get specified thread */ + if ((t = p->p_tlist) == NULL) + return (NULL); + + do { + if (lookup_id == 0 && t->t_tid == 1) { + thread_lock(t); + return (t); + } + + lwpd = ttolxlwp(t); + if (lwpd == NULL) { + emul_tid = t->t_tid; + } else { + if (pid == init_pid && lookup_id == 1) { + emul_tid = t->t_tid; + } else { + emul_tid = lwpd->br_pid; + } + } + if (emul_tid == lookup_id) { + thread_lock(t); + return (t); + } + } while ((t = t->t_forw) != p->p_tlist); + + if (lookup_id == 0) + return (prchoose(p)); + return (NULL); +} + +/* + * Lookup the real pid for procs 0 or 1. 
+ */ +static pid_t +get_real_pid(pid_t p) +{ + pid_t find_pid; + + if (p == 1) { + find_pid = curproc->p_zone->zone_proc_initpid; + } else if (p == 0) { + find_pid = curproc->p_zone->zone_zsched->p_pid; + } else { + find_pid = p; + } + + return (find_pid); +} + +/* + * pid/tid common code to read status file + */ +static void +lxpr_read_status_common(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, + uint_t lookup_id) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + k_sigset_t current, ignore, handle; + int i, lx_sig; + pid_t real_pid; + + real_pid = get_real_pid(lxpnp->lxpr_pid); + p = lxpr_lock(real_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process or if we're the zone's zsched the pid is 0. + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else if (pid == curproc->p_zone->zone_zsched->p_pid) { + pid = 0; /* zsched is pid 0 */ + ppid = 0; /* parent pid for zsched is itself */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = lxpr_get_thread(p, lookup_id); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + if (lookup_id != 0) { + /* we can't find this specific thread */ + lxpr_uiobuf_seterr(uiobuf, EINVAL); + lxpr_unlock(p); + return; + } + + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + (lookup_id == 0) ? 
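
/*
 * For reference, get_real_pid() above maps the Linux-visible pids 1 and 0
 * back to the zone's init and zsched processes; the status and stat
 * readers then apply the inverse when emitting pids.  That inverse can be
 * summarized by a hypothetical helper (illustrative only, not part of
 * this patch):
 */
#if 0	/* illustrative sketch only -- not part of the patch */
static pid_t
lx_visible_pid(pid_t pid, pid_t init_pid, pid_t zsched_pid)
{
	if (pid == init_pid)
		return (1);	/* the zone's init appears as pid 1 */
	if (pid == zsched_pid)
		return (0);	/* the zone's zsched appears as pid 0 */
	return (pid);		/* all other pids pass through */
}
#endif
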
pid : lxpnp->lxpr_desc, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) { + size_t vsize, nlocked, rss; + + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + nlocked = p->p_locked_mem; + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + btok(nlocked), + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + lxpr_uiobuf_printf(uiobuf, "\nThreads:\t%u", p->p_lwpcnt); + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = stol_signo[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + lxpr_read_status_common(lxpnp, uiobuf, 0); +} + +/* + * lxpr_read_pid_tid_status(): status file + */ +static void +lxpr_read_pid_tid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_TID_STATUS); + lxpr_read_status_common(lxpnp, uiobuf, lxpnp->lxpr_desc); +} + +/* + * pid/tid common code to read stat file + */ +static void +lxpr_read_stat_common(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf, + uint_t lookup_id) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + pid_t real_pid; + + real_pid = get_real_pid(lxpnp->lxpr_pid); + p = lxpr_lock(real_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else if (pid == curproc->p_zone->zone_zsched->p_pid) { + pid = 0; /* PID for zsched */ + ppid = 0; /* parent PID for zsched is 0 */ + pgpid = 0; /* process group for zsched is 0 */ + psgid = (gid_t)-1; /* credential GID for zsched is -1 */ + spid = 0; /* session id for 
zsched is 0 */ + psdev = 0; /* session device for zsched is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? + curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = lxpr_get_thread(p, lookup_id); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + if (lookup_id != 0) { + /* we can't find this specific thread */ + lxpr_uiobuf_seterr(uiobuf, EINVAL); + lxpr_unlock(p); + return; + } + + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + (lookup_id == 0) ? 
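
/*
 * The stat line emitted here begins "pid (comm) state ppid pgrp ...";
 * since comm may itself contain spaces or parentheses, careful consumers
 * resume parsing after the last ')'.  A hypothetical user-land reader
 * (illustrative only, not part of this patch):
 */
#if 0	/* illustrative sketch only -- not part of the patch */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	FILE *f = fopen("/proc/self/stat", "r");
	char line[1024], state, *p;
	int pid, ppid;

	if (f == NULL || fgets(line, sizeof (line), f) == NULL)
		return (1);
	p = strrchr(line, ')');		/* skip past "pid (comm)" */
	if (p != NULL && sscanf(line, "%d", &pid) == 1 &&
	    sscanf(p + 1, " %c %d", &state, &ppid) == 2)
		printf("pid %d state %c ppid %d\n", pid, state, ppid);
	(void) fclose(f);
	return (0);
}
#endif
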
pid : lxpnp->lxpr_desc,
+	    PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid,
+	    0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */
+	    p->p_utime, p->p_stime, p->p_cutime, p->p_cstime,
+	    pri, nice, p->p_lwpcnt,
+	    0l, /* itrealvalue (time before next SIGALRM) */
+	    PTOU(p)->u_ticks,
+	    vsize, rss, p->p_vmem_ctl,
+	    0l, 0l, USRSTACK, /* startcode, endcode, startstack */
+	    0l, 0l, /* kstkesp, kstkeip */
+	    0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */
+	    wchan,
+	    0l, 0l, /* nswap, cnswap */
+	    0, /* exit_signal */
+	    cpu);
+
+	lxpr_unlock(p);
+}
+
+/*
+ * lxpr_read_pid_stat(): pid stat file
+ */
+static void
+lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT);
+
+	lxpr_read_stat_common(lxpnp, uiobuf, 0);
+}
+
+/*
+ * lxpr_read_pid_tid_stat(): pid stat file
+ */
+static void
+lxpr_read_pid_tid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+	ASSERT(lxpnp->lxpr_type == LXPR_PID_TID_STAT);
+	lxpr_read_stat_common(lxpnp, uiobuf, lxpnp->lxpr_desc);
+}
+
+/* ARGSUSED */
+static void
+lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf)
+{
+}
+
+struct lxpr_ifstat {
+	uint64_t rx_bytes;
+	uint64_t rx_packets;
+	uint64_t rx_errors;
+	uint64_t rx_drop;
+	uint64_t tx_bytes;
+	uint64_t tx_packets;
+	uint64_t tx_errors;
+	uint64_t tx_drop;
+	uint64_t collisions;
+	uint64_t rx_multicast;
+};
+
+static void *
+lxpr_kstat_read(kstat_t *kn, boolean_t byname, size_t *size, int *num)
+{
+	kstat_t *kp;
+	int i, nrec = 0;
+	size_t bufsize;
+	void *buf = NULL;
+
+	if (byname == B_TRUE) {
+		kp = kstat_hold_byname(kn->ks_module, kn->ks_instance,
+		    kn->ks_name, getzoneid());
+	} else {
+		kp = kstat_hold_bykid(kn->ks_kid, getzoneid());
+	}
+	if (kp == NULL) {
+		return (NULL);
+	}
+	if (kp->ks_flags & KSTAT_FLAG_INVALID) {
+		kstat_rele(kp);
+		return (NULL);
+	}
+
+	bufsize = kp->ks_data_size + 1;
+	kstat_rele(kp);
+
+	/*
+	 * The kstat in question is released so that kmem_alloc(KM_SLEEP) is
+	 * performed without it held. After the alloc, the kstat is reacquired
+	 * and its size is checked again. If the buffer is no longer large
+	 * enough, the alloc and check are repeated up to two times.
+	 */
+	for (i = 0; i < 2; i++) {
+		buf = kmem_alloc(bufsize, KM_SLEEP);
+
+		/* Check if bufsize still appropriate */
+		if (byname == B_TRUE) {
+			kp = kstat_hold_byname(kn->ks_module, kn->ks_instance,
+			    kn->ks_name, getzoneid());
+		} else {
+			kp = kstat_hold_bykid(kn->ks_kid, getzoneid());
+		}
+		if (kp == NULL || kp->ks_flags & KSTAT_FLAG_INVALID) {
+			if (kp != NULL) {
+				kstat_rele(kp);
+			}
+			kmem_free(buf, bufsize);
+			return (NULL);
+		}
+		KSTAT_ENTER(kp);
+		(void) KSTAT_UPDATE(kp, KSTAT_READ);
+		if (bufsize < kp->ks_data_size) {
+			kmem_free(buf, bufsize);
+			buf = NULL;
+			bufsize = kp->ks_data_size + 1;
+			KSTAT_EXIT(kp);
+			kstat_rele(kp);
+			continue;
+		} else {
+			if (KSTAT_SNAPSHOT(kp, buf, KSTAT_READ) != 0) {
+				kmem_free(buf, bufsize);
+				buf = NULL;
+			}
+			nrec = kp->ks_ndata;
+			KSTAT_EXIT(kp);
+			kstat_rele(kp);
+			break;
+		}
+	}
+
+	if (buf != NULL) {
+		*size = bufsize;
+		*num = nrec;
+	}
+	return (buf);
+}
+
+static int
+lxpr_kstat_ifstat(kstat_t *kn, struct lxpr_ifstat *ifs)
+{
+	kstat_named_t *kp;
+	int i, num;
+	size_t size;
+
+	/*
+	 * Search by name instead of by kid since there's a small window to
+	 * race against kstats being added/removed.
+ */ + bzero(ifs, sizeof (*ifs)); + kp = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num); + if (kp == NULL) + return (-1); + for (i = 0; i < num; i++) { + if (strncmp(kp[i].name, "rbytes64", KSTAT_STRLEN) == 0) + ifs->rx_bytes = kp[i].value.ui64; + else if (strncmp(kp[i].name, "ipackets64", KSTAT_STRLEN) == 0) + ifs->rx_packets = kp[i].value.ui64; + else if (strncmp(kp[i].name, "ierrors", KSTAT_STRLEN) == 0) + ifs->rx_errors = kp[i].value.ui32; + else if (strncmp(kp[i].name, "norcvbuf", KSTAT_STRLEN) == 0) + ifs->rx_drop = kp[i].value.ui32; + else if (strncmp(kp[i].name, "multircv", KSTAT_STRLEN) == 0) + ifs->rx_multicast = kp[i].value.ui32; + else if (strncmp(kp[i].name, "obytes64", KSTAT_STRLEN) == 0) + ifs->tx_bytes = kp[i].value.ui64; + else if (strncmp(kp[i].name, "opackets64", KSTAT_STRLEN) == 0) + ifs->tx_packets = kp[i].value.ui64; + else if (strncmp(kp[i].name, "oerrors", KSTAT_STRLEN) == 0) + ifs->tx_errors = kp[i].value.ui32; + else if (strncmp(kp[i].name, "noxmtbuf", KSTAT_STRLEN) == 0) + ifs->tx_drop = kp[i].value.ui32; + else if (strncmp(kp[i].name, "collisions", KSTAT_STRLEN) == 0) + ifs->collisions = kp[i].value.ui32; + } + kmem_free(kp, size); + return (0); +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + kstat_t *ksr; + kstat_t ks0; + int i, nidx; + size_t sidx; + struct lxpr_ifstat ifs; + + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx); + if (ksr == NULL) + return; + + for (i = 1; i < nidx; i++) { + if (strncmp(ksr[i].ks_module, "link", KSTAT_STRLEN) == 0 || + strncmp(ksr[i].ks_module, "lo", KSTAT_STRLEN) == 0) { + if (lxpr_kstat_ifstat(&ksr[i], &ifs) != 0) + continue; + + /* Overwriting the name is ok in the local snapshot */ + lx_ifname_convert(ksr[i].ks_name, LX_IF_FROMNATIVE); + lxpr_uiobuf_printf(uiobuf, "%6s: %7llu %7llu %4lu " + "%4lu %4u %5u %10u %9lu %8llu %7llu %4lu %4lu %4u " + "%5lu %7u %10u\n", + ksr[i].ks_name, + ifs.rx_bytes, ifs.rx_packets, + ifs.rx_errors, ifs.rx_drop, + 0, 0, 0, ifs.rx_multicast, + ifs.tx_bytes, ifs.tx_packets, + ifs.tx_errors, ifs.tx_drop, + 0, ifs.collisions, 0, 0); + } + } + + kmem_free(ksr, sidx); +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_inet6_out(const in6_addr_t *addr, char buf[33]) +{ + const uint8_t *ip = addr->s6_addr; + char digits[] = "0123456789abcdef"; + int i; + for (i = 0; i < 16; i++) { + buf[2 * i] = digits[ip[i] >> 4]; + buf[2 * i + 1] = digits[ip[i] & 0xf]; + } + buf[32] = '\0'; +} + +/* ARGSUSED */ +static void +lxpr_read_net_if_inet6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + ill_t *ill; + ipif_t *ipif; + ill_walk_context_t ctx; + char ifname[LIFNAMSIZ], ip6out[33]; + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ill = ILL_START_WALK_V6(&ctx, ipst); + + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + uint_t index = ill->ill_phyint->phyint_ifindex; + int plen = ip_mask_to_plen_v6(&ipif->ipif_v6net_mask); + in6addr_scope_t scope = ip_addr_scope_v6( + &ipif->ipif_v6lcl_addr); + /* Always report PERMANENT flag 
*/ + int flag = 0x80; + + ipif_get_name(ipif, ifname, sizeof (ifname)); + lx_ifname_convert(ifname, LX_IF_FROMNATIVE); + lxpr_inet6_out(&ipif->ipif_v6lcl_addr, ip6out); + /* Scope output is shifted on Linux */ + scope = scope << 4; + + lxpr_uiobuf_printf(uiobuf, "%32s %02x %02x %02x %02x" + " %8s\n", ip6out, index, plen, scope, flag, ifname); + } + } + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static void +lxpr_format_route_ipv6(ire_t *ire, lxpr_uiobuf_t *uiobuf) +{ + uint32_t flags; + char name[IFNAMSIZ]; + char ipv6addr[33]; + + lxpr_inet6_out(&ire->ire_addr_v6, ipv6addr); + lxpr_uiobuf_printf(uiobuf, "%s %02x ", ipv6addr, + ip_mask_to_plen_v6(&ire->ire_mask_v6)); + + /* punt on this for now */ + lxpr_uiobuf_printf(uiobuf, "%s %02x ", + "00000000000000000000000000000000", 0); + + lxpr_inet6_out(&ire->ire_gateway_addr_v6, ipv6addr); + lxpr_uiobuf_printf(uiobuf, "%s", ipv6addr); + + flags = ire->ire_flags & + (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED); + /* Linux's RTF_LOCAL equivalent */ + if (ire->ire_metrics.iulp_local) + flags |= 0x80000000; + + if (ire->ire_ill != NULL) { + ill_get_name(ire->ire_ill, name, sizeof (name)); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '\0'; + } + + lxpr_uiobuf_printf(uiobuf, " %08x %08x %08x %08x %8s\n", + 0, /* metric */ + ire->ire_refcnt, + 0, + flags, + name); +} + +/* ARGSUSED */ +static void +lxpr_read_net_ipv6_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + /* + * LX branded zones are expected to have exclusive IP stack, hence + * using ALL_ZONES as the zoneid filter. + */ + ire_walk_v6(&lxpr_format_route_ipv6, uiobuf, ALL_ZONES, ipst); + + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +#define LXPR_SKIP_ROUTE(type) \ + (((IRE_IF_CLONE | IRE_BROADCAST | IRE_MULTICAST | \ + IRE_NOROUTE | IRE_LOOPBACK | IRE_LOCAL) & type) != 0) + +static void +lxpr_format_route_ipv4(ire_t *ire, lxpr_uiobuf_t *uiobuf) +{ + uint32_t flags; + char name[IFNAMSIZ]; + ill_t *ill; + ire_t *nire; + ipif_t *ipif; + ipaddr_t gateway; + + if (LXPR_SKIP_ROUTE(ire->ire_type) || ire->ire_testhidden != 0) + return; + + /* These route flags have direct Linux equivalents */ + flags = ire->ire_flags & + (RTF_UP|RTF_GATEWAY|RTF_HOST|RTF_DYNAMIC|RTF_MODIFIED); + + /* + * Search for a suitable IRE for naming purposes. + * On Linux, the default route is typically associated with the + * interface used to access gateway. The default IRE on Illumos + * typically lacks an ill reference but its parent might have one. 
*/ + nire = ire; + do { + ill = nire->ire_ill; + nire = nire->ire_dep_parent; + } while (ill == NULL && nire != NULL); + if (ill != NULL) { + ill_get_name(ill, name, sizeof (name)); + lx_ifname_convert(name, LX_IF_FROMNATIVE); + } else { + name[0] = '*'; + name[1] = '\0'; + } + + /* + * Linux suppresses the gateway address for directly connected + * interface networks. To emulate this behavior, we walk all addresses + * of a given route interface. If one matches the gateway, it is + * displayed as NULL. + */ + gateway = ire->ire_gateway_addr; + if ((ill = ire->ire_ill) != NULL) { + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (ipif->ipif_lcl_addr == gateway) { + gateway = 0; + break; + } + } + } + + lxpr_uiobuf_printf(uiobuf, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" + "%d\t%08X\t%d\t%u\t%u\n", + name, + ire->ire_addr, + gateway, + flags, 0, 0, + 0, /* priority */ + ire->ire_mask, + 0, 0, /* mss, window */ + ire->ire_metrics.iulp_rtt); +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + netstack_t *ns; + ip_stack_t *ipst; + + lxpr_uiobuf_printf(uiobuf, "Iface\tDestination\tGateway \tFlags\t" + "RefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT\n"); + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + /* + * LX branded zones are expected to have an exclusive IP stack, hence + * using ALL_ZONES as the zoneid filter. + */ + ire_walk_v4(&lxpr_format_route_ipv4, uiobuf, ALL_ZONES, ipst); + + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +typedef struct lxpr_snmp_table { + const char *lst_proto; + const char *lst_fields[]; +} lxpr_snmp_table_t; + +static lxpr_snmp_table_t lxpr_snmp_ip = { "ip", + { + "forwarding", "defaultTTL", "inReceives", "inHdrErrors", + "inAddrErrors", "forwDatagrams", "inUnknownProtos", "inDiscards", + "inDelivers", "outRequests", "outDiscards", "outNoRoutes", + "reasmTimeout", "reasmReqds", "reasmOKs", "reasmFails", "fragOKs", + "fragFails", "fragCreates", + NULL + } +}; +static lxpr_snmp_table_t lxpr_snmp_icmp = { "icmp", + { + "inMsgs", "inErrors", "inCsumErrors", "inDestUnreachs", "inTimeExcds", + "inParmProbs", "inSrcQuenchs", "inRedirects", "inEchos", "inEchoReps", + "inTimestamps", "inTimestampReps", "inAddrMasks", "inAddrMaskReps", + "outMsgs", "outErrors", "outDestUnreachs", "outTimeExcds", + "outParmProbs", "outSrcQuenchs", "outRedirects", "outEchos", + "outEchoReps", "outTimestamps", "outTimestampReps", "outAddrMasks", + "outAddrMaskReps", + NULL + } +}; +static lxpr_snmp_table_t lxpr_snmp_tcp = { "tcp", + { + "rtoAlgorithm", "rtoMin", "rtoMax", "maxConn", "activeOpens", + "passiveOpens", "attemptFails", "estabResets", "currEstab", "inSegs", + "outSegs", "retransSegs", "inErrs", "outRsts", "inCsumErrors", + NULL + } +}; +static lxpr_snmp_table_t lxpr_snmp_udp = { "udp", + { + "inDatagrams", "noPorts", "inErrors", "outDatagrams", "rcvbufErrors", + "sndbufErrors", "inCsumErrors", + NULL + } +}; + +static lxpr_snmp_table_t *lxpr_net_snmptab[] = { + &lxpr_snmp_ip, + &lxpr_snmp_icmp, + &lxpr_snmp_tcp, + &lxpr_snmp_udp, + NULL +}; + +static void +lxpr_kstat_print_tab(lxpr_uiobuf_t *uiobuf, lxpr_snmp_table_t *table, + kstat_t *kn) +{ + kstat_named_t *klist; + char upname[KSTAT_STRLEN],
upfield[KSTAT_STRLEN]; + int i, j, num; + size_t size; + + klist = (kstat_named_t *)lxpr_kstat_read(kn, B_TRUE, &size, &num); + if (klist == NULL) + return; + + /* Print the header line, fields capitalized */ + (void) strncpy(upname, table->lst_proto, KSTAT_STRLEN); + upname[0] = toupper(upname[0]); + lxpr_uiobuf_printf(uiobuf, "%s:", upname); + for (i = 0; table->lst_fields[i] != NULL; i++) { + (void) strncpy(upfield, table->lst_fields[i], KSTAT_STRLEN); + upfield[0] = toupper(upfield[0]); + lxpr_uiobuf_printf(uiobuf, " %s", upfield); + } + lxpr_uiobuf_printf(uiobuf, "\n%s:", upname); + + /* Then loop back through to print the value line. */ + for (i = 0; table->lst_fields[i] != NULL; i++) { + kstat_named_t *kpoint = NULL; + for (j = 0; j < num; j++) { + if (strncmp(klist[j].name, table->lst_fields[i], + KSTAT_STRLEN) == 0) { + kpoint = &klist[j]; + break; + } + } + if (kpoint == NULL) { + /* Output 0 for unknown fields */ + lxpr_uiobuf_printf(uiobuf, " 0"); + } else { + switch (kpoint->data_type) { + case KSTAT_DATA_INT32: + lxpr_uiobuf_printf(uiobuf, " %d", + kpoint->value.i32); + break; + case KSTAT_DATA_UINT32: + lxpr_uiobuf_printf(uiobuf, " %u", + kpoint->value.ui32); + break; + case KSTAT_DATA_INT64: + lxpr_uiobuf_printf(uiobuf, " %ld", + kpoint->value.l); + break; + case KSTAT_DATA_UINT64: + lxpr_uiobuf_printf(uiobuf, " %lu", + kpoint->value.ul); + break; + } + } + } + lxpr_uiobuf_printf(uiobuf, "\n"); + kmem_free(klist, size); +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + kstat_t *ksr; + kstat_t ks0; + lxpr_snmp_table_t **table = lxpr_net_snmptab; + int i, t, nidx; + size_t sidx; + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx); + if (ksr == NULL) + return; + + for (t = 0; table[t] != NULL; t++) { + for (i = 0; i < nidx; i++) { + if (strncmp(ksr[i].ks_class, "mib2", KSTAT_STRLEN) != 0) + continue; + if (strncmp(ksr[i].ks_name, table[t]->lst_proto, + KSTAT_STRLEN) == 0) { + lxpr_kstat_print_tab(uiobuf, table[t], &ksr[i]); + break; + } + } + } + kmem_free(ksr, sidx); +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +static int +lxpr_convert_tcp_state(int st) +{ + /* + * Derived from the enum located in the Linux kernel sources: + * include/net/tcp_states.h + */ + switch (st) { + case TCPS_ESTABLISHED: + return (1); + case TCPS_SYN_SENT: + return (2); + case TCPS_SYN_RCVD: + return (3); + case TCPS_FIN_WAIT_1: + return (4); + case TCPS_FIN_WAIT_2: + return (5); + case TCPS_TIME_WAIT: + return (6); + case TCPS_CLOSED: + return (7); + case TCPS_CLOSE_WAIT: + return (8); + case TCPS_LAST_ACK: + return (9); + case TCPS_LISTEN: + return (10); + case TCPS_CLOSING: + return (11); + default: + /* No translation for TCPS_IDLE, TCPS_BOUND or anything else */ + return (0); + } +} + +static void +lxpr_format_tcp(lxpr_uiobuf_t *uiobuf, ushort_t ipver) +{ + int i, sl = 0; + connf_t *connfp; + conn_t *connp; + netstack_t *ns; + ip_stack_t *ipst; + + ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION); + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, " sl local_address rem_address " + "st tx_queue rx_queue tr tm->when retrnsmt uid timeout " + "inode\n"); + } else { + lxpr_uiobuf_printf(uiobuf, " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt " + "uid timeout inode\n"); + } + /* + * Due to differences between the Linux and illumos TCP + * implementations, some data will be omitted from the output 
here. + * + * Valid fields: + * - local_address + * - remote_address + * - st + * - tx_queue + * - rx_queue + * - uid + * - inode + * + * Omitted/invalid fields + * - tr + * - tm->when + * - retrnsmt + * - timeout + */ + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { + tcp_t *tcp; + vattr_t attr; + sonode_t *so = (sonode_t *)connp->conn_upper_handle; + vnode_t *vp = (so != NULL) ? so->so_vnode : NULL; + if (connp->conn_ipversion != ipver) + continue; + tcp = connp->conn_tcp; + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, + "%4d: %08X:%04X %08X:%04X ", + ++sl, + connp->conn_laddr_v4, + ntohs(connp->conn_lport), + connp->conn_faddr_v4, + ntohs(connp->conn_fport)); + } else { + lxpr_uiobuf_printf(uiobuf, "%4d: " + "%08X%08X%08X%08X:%04X " + "%08X%08X%08X%08X:%04X ", + ++sl, + connp->conn_laddr_v6.s6_addr32[0], + connp->conn_laddr_v6.s6_addr32[1], + connp->conn_laddr_v6.s6_addr32[2], + connp->conn_laddr_v6.s6_addr32[3], + ntohs(connp->conn_lport), + connp->conn_faddr_v6.s6_addr32[0], + connp->conn_faddr_v6.s6_addr32[1], + connp->conn_faddr_v6.s6_addr32[2], + connp->conn_faddr_v6.s6_addr32[3], + ntohs(connp->conn_fport)); + } + + /* fetch the simulated inode for the socket */ + if (vp == NULL || + VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0) + attr.va_nodeid = 0; + + lxpr_uiobuf_printf(uiobuf, + "%02X %08X:%08X %02X:%08X %08X " + "%5u %8d %lu %d %p %u %u %u %u %d\n", + lxpr_convert_tcp_state(tcp->tcp_state), + tcp->tcp_rcv_cnt, tcp->tcp_unsent, /* rx/tx queue */ + 0, 0, /* tr, when */ + 0, /* per-connection rexmits aren't tracked today */ + connp->conn_cred->cr_uid, + 0, /* timeout */ + /* inode + more */ + (ino_t)attr.va_nodeid, 0, NULL, 0, 0, 0, 0, 0); + } + } + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_tcp(uiobuf, IPV4_VERSION); +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_tcp(uiobuf, IPV6_VERSION); +} + +static void +lxpr_format_udp(lxpr_uiobuf_t *uiobuf, ushort_t ipver) +{ + int i, sl = 0; + connf_t *connfp; + conn_t *connp; + netstack_t *ns; + ip_stack_t *ipst; + + ASSERT(ipver == IPV4_VERSION || ipver == IPV6_VERSION); + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, " sl local_address rem_address" + " st tx_queue rx_queue tr tm->when retrnsmt uid" + " timeout inode ref pointer drops\n"); + } else { + lxpr_uiobuf_printf(uiobuf, " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt " + "uid timeout inode ref pointer drops\n"); + } + /* + * Due to differences between the Linux and illumos UDP + * implementations, some data will be omitted from the output here. 
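+ * Note that, as on Linux, addresses are rendered as native-endian hex + * words; on a little-endian (x86) machine, for example, a local address + * of 127.0.0.1 port 53 would read 0100007F:0035.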
+ * + * Valid fields: + * - local_address + * - remote_address + * - st: limited + * - uid + * + * Omitted/invalid fields + * - tx_queue + * - rx_queue + * - tr + * - tm->when + * - retrnsmt + * - timeout + * - inode + */ + + ns = netstack_get_current(); + if (ns == NULL) + return; + ipst = ns->netstack_ip; + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_UDPCONN)) != NULL) { + udp_t *udp; + int state = 0; + vattr_t attr; + sonode_t *so = (sonode_t *)connp->conn_upper_handle; + vnode_t *vp = (so != NULL) ? so->so_vnode : NULL; + if (connp->conn_ipversion != ipver) + continue; + udp = connp->conn_udp; + if (ipver == IPV4_VERSION) { + lxpr_uiobuf_printf(uiobuf, + "%4d: %08X:%04X %08X:%04X ", + ++sl, + connp->conn_laddr_v4, + ntohs(connp->conn_lport), + connp->conn_faddr_v4, + ntohs(connp->conn_fport)); + } else { + lxpr_uiobuf_printf(uiobuf, "%4d: " + "%08X%08X%08X%08X:%04X " + "%08X%08X%08X%08X:%04X ", + ++sl, + connp->conn_laddr_v6.s6_addr32[0], + connp->conn_laddr_v6.s6_addr32[1], + connp->conn_laddr_v6.s6_addr32[2], + connp->conn_laddr_v6.s6_addr32[3], + ntohs(connp->conn_lport), + connp->conn_faddr_v6.s6_addr32[0], + connp->conn_faddr_v6.s6_addr32[1], + connp->conn_faddr_v6.s6_addr32[2], + connp->conn_faddr_v6.s6_addr32[3], + ntohs(connp->conn_fport)); + } + + switch (udp->udp_state) { + case TS_UNBND: + case TS_IDLE: + state = 7; + break; + case TS_DATA_XFER: + state = 1; + break; + } + + /* fetch the simulated inode for the socket */ + if (vp == NULL || + VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0) + attr.va_nodeid = 0; + + lxpr_uiobuf_printf(uiobuf, + "%02X %08X:%08X %02X:%08X %08X " + "%5u %8d %lu %d %p %d\n", + state, + 0, 0, /* rx/tx queue */ + 0, 0, /* tr, when */ + 0, /* retrans */ + connp->conn_cred->cr_uid, + 0, /* timeout */ + /* inode, ref, pointer, drops */ + (ino_t)attr.va_nodeid, 0, NULL, 0); + } + } + netstack_rele(ns); +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_udp(uiobuf, IPV4_VERSION); +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp6(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_format_udp(uiobuf, IPV6_VERSION); +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + sonode_t *so; + zoneid_t zoneid = getzoneid(); + + lxpr_uiobuf_printf(uiobuf, "Num RefCount Protocol Flags Type " + "St Inode Path\n"); + + mutex_enter(&socklist.sl_lock); + for (so = socklist.sl_list; so != NULL; + so = _SOTOTPI(so)->sti_next_so) { + vnode_t *vp = so->so_vnode; + vattr_t attr; + sotpi_info_t *sti; + const char *name = NULL; + int status = 0; + int type = 0; + int flags = 0; + + /* Only process active sonodes in this zone */ + if (so->so_count == 0 || so->so_zoneid != zoneid) + continue; + + /* + * Grab the inode, if possible. + * This must be done before entering so_lock. 
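+ * (presumably because a VOP_GETATTR call may block, and blocking + * while holding so_lock would be unsafe)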
+ */ + if (vp == NULL || + VOP_GETATTR(vp, &attr, 0, CRED(), NULL) != 0) + attr.va_nodeid = 0; + + mutex_enter(&so->so_lock); + sti = _SOTOTPI(so); + + if (sti->sti_laddr_sa != NULL && + sti->sti_laddr_len > 0) { + name = sti->sti_laddr_sa->sa_data; + } else if (sti->sti_faddr_sa != NULL && + sti->sti_faddr_len > 0) { + name = sti->sti_faddr_sa->sa_data; + } + + /* + * Derived from enum values in Linux kernel source: + * include/uapi/linux/net.h + */ + if ((so->so_state & SS_ISDISCONNECTING) != 0) { + status = 4; + } else if ((so->so_state & SS_ISCONNECTING) != 0) { + status = 2; + } else if ((so->so_state & SS_ISCONNECTED) != 0) { + status = 3; + } else { + status = 1; + /* Add ACC flag for stream-type server sockets */ + if (so->so_type != SOCK_DGRAM && + sti->sti_laddr_sa != NULL) + flags |= 0x10000; + } + + /* Convert to Linux type */ + switch (so->so_type) { + case SOCK_DGRAM: + type = 2; + break; + case SOCK_SEQPACKET: + type = 5; + break; + default: + type = 1; + } + + lxpr_uiobuf_printf(uiobuf, "%p: %08X %08X %08X %04X %02X %5llu", + so, + so->so_count, + 0, /* proto, always 0 */ + flags, + type, + status, + (ino_t)attr.va_nodeid); + + /* + * Due to shortcomings in the abstract socket emulation, they + * cannot be properly represented here (as @<path>). + * + * This will be the case until they are better implemented. + */ + if (name != NULL) + lxpr_uiobuf_printf(uiobuf, " %s\n", name); + else + lxpr_uiobuf_printf(uiobuf, "\n"); + mutex_exit(&so->so_lock); + } + mutex_exit(&socklist.sl_lock); +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced, unless we're open non-blocking, in which case we return after + * 1ms. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + timestruc_t to; + timestruc_t *tp = NULL; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (lxpr_uiobuf_nonblock(uiobuf)) { + to.tv_sec = 0; + to.tv_nsec = 1000000; /* 1msec */ + tp = &to; + } + + if (ldi_getmsg(lh, &mp, tp) == 0) { + /* + * lx procfs doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
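+ * + * The averages themselves are FSCALE-based fixed-point values. With + * FSHIFT == 8, for example, a stored value of 460 decodes below to + * 460 >> 8 = 1 and ((460 & (FSCALE - 1)) * 100) >> 8 = 79, i.e. a + * displayed load of 1.79.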
*/ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise we'll just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + long total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { + total_mem = physmem * PAGESIZE; + free_mem = freemem * PAGESIZE; + } else { + total_mem = zone->zone_phys_mem_ctl; + free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; + } + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = k_anoninfo.ani_max * PAGESIZE; + used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): read the contents of the "mounts" file. + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the
first in + the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called, which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run through this list without holding + * vfs_list_read_lock(). We keep the list in the same order as the + * vfs_list. + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? + vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * Now we can run through what we've extracted without holding + * vfs_list_read_lock(). + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * Over the years, /proc/partitions has been made considerably smaller -- to + * the point that it really is only major number, minor number, number of + * blocks (which we report as 0), and partition name. We support this only + * because some things want to see it to make sense of /proc/diskstats.
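+ * + * A representative line (major, minor, #blocks, name), with the block + * count always reported as zero and an illustrative major number, + * might be: + * + * 32 0 0 sd0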
*/ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + + kstat_t *ksr; + kstat_t ks0; + int nidx, num, i; + size_t sidx, size; + + ASSERT(lxpnp->lxpr_type == LXPR_PARTITIONS); + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx); + + if (ksr == NULL) + return; + + lxpr_uiobuf_printf(uiobuf, "major minor #blocks name\n\n"); + + for (i = 1; i < nidx; i++) { + kstat_t *ksp = &ksr[i]; + kstat_io_t *kip; + + if (ksp->ks_type != KSTAT_TYPE_IO || + strcmp(ksp->ks_class, "disk") != 0) + continue; + + if ((kip = (kstat_io_t *)lxpr_kstat_read(ksp, B_TRUE, + &size, &num)) == NULL) + continue; + + if (size < sizeof (kstat_io_t)) { + kmem_free(kip, size); + continue; + } + + lxpr_uiobuf_printf(uiobuf, "%4d %7d %10d %s\n", + mod_name_to_major(ksp->ks_module), + ksp->ks_instance, 0, ksp->ks_name); + + kmem_free(kip, size); + } + + kmem_free(ksr, sidx); +} + +/* + * lxpr_read_diskstats(): + * + * See the block comment above the per-device output-generating line for the + * details of the format. + */ +/* ARGSUSED */ +static void +lxpr_read_diskstats(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + kstat_t *ksr; + kstat_t ks0; + int nidx, num, i; + size_t sidx, size; + + ASSERT(lxpnp->lxpr_type == LXPR_DISKSTATS); + + ks0.ks_kid = 0; + ksr = (kstat_t *)lxpr_kstat_read(&ks0, B_FALSE, &sidx, &nidx); + + if (ksr == NULL) + return; + + for (i = 1; i < nidx; i++) { + kstat_t *ksp = &ksr[i]; + kstat_io_t *kip; + + if (ksp->ks_type != KSTAT_TYPE_IO || + strcmp(ksp->ks_class, "disk") != 0) + continue; + + if ((kip = (kstat_io_t *)lxpr_kstat_read(ksp, B_TRUE, + &size, &num)) == NULL) + continue; + + if (size < sizeof (kstat_io_t)) { + kmem_free(kip, size); + continue; + } + + /* + * /proc/diskstats is defined to have one line of output for + * each block device, with each line containing the following + * 14 fields: + * + * 1 - major number + * 2 - minor number + * 3 - device name + * 4 - reads completed successfully + * 5 - reads merged + * 6 - sectors read + * 7 - time spent reading (ms) + * 8 - writes completed + * 9 - writes merged + * 10 - sectors written + * 11 - time spent writing (ms) + * 12 - I/Os currently in progress + * 13 - time spent doing I/Os (ms) + * 14 - weighted time spent doing I/Os (ms) + * + * One small hiccup: we don't actually keep track of time + * spent reading vs. time spent writing -- we keep track of + * time waiting vs. time actually performing I/O. While we + * could divide the total time by the I/O mix (making the + * obviously wrong assumption that I/O operations all take the + * same amount of time), this has the undesirable side-effect + * of the reported times moving backwards. Instead, we report + * the total time (read + write) for all three stats (read, + * write, total). This is also a lie of sorts, but it should be + * more immediately clear to the user that reads and writes are + * each being double-counted as the other.
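+ * + * As an example of the units, 2.5 seconds of accumulated rtime + + * wtime is reported as 2500, since the nanosecond totals below are + * divided by (NANOSEC / MILLISEC), i.e. one million.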
+ */ + lxpr_uiobuf_printf(uiobuf, "%4d %7d %s " + "%llu %llu %llu %llu " + "%llu %llu %llu %llu " + "%llu %llu %llu\n", + mod_name_to_major(ksp->ks_module), + ksp->ks_instance, ksp->ks_name, + (uint64_t)kip->reads, 0LL, + kip->nread / (uint64_t)LXPR_SECTOR_SIZE, + (kip->rtime + kip->wtime) / (uint64_t)(NANOSEC / MILLISEC), + (uint64_t)kip->writes, 0LL, + kip->nwritten / (uint64_t)LXPR_SECTOR_SIZE, + (kip->rtime + kip->wtime) / (uint64_t)(NANOSEC / MILLISEC), + (uint64_t)(kip->rcnt + kip->wcnt), + (kip->rtime + kip->wtime) / (uint64_t)(NANOSEC / MILLISEC), + (kip->rlentime + kip->wlentime) / + (uint64_t)(NANOSEC / MILLISEC)); + + kmem_free(kip, size); + } + + kmem_free(ksr, sidx); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + char *vers; + + vers = lx_get_zone_kern_version(LXPTOZ(lxpnp)); + + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + LX_UNAME_SYSNAME, vers, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + LX_UNAME_VERSION, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + char *lx_kern_version = lx_get_zone_kern_version(LXPTOZ(lxpnp)); + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + if (strncmp(lx_kern_version, "2.4", 3) != 0) { + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + if (strncmp(lx_kern_version, "2.4", 3) != 0) { + lxpr_uiobuf_printf(uiobuf, + "cpu %lu %lu %lu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L); + } else { + lxpr_uiobuf_printf(uiobuf, + "cpu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum); + } + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + if (strncmp(lx_kern_version, "2.4", 3) != 0) { + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu %lu %lu %lu\n", + cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks, + 0L, irq_ticks, 0L); + } else { + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu\n", + cp->cpu_id, + user_ticks, 0L, sys_ticks, idle_ticks); + } + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + if (strncmp(lx_kern_version, "2.4", 3) != 0) { + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); + } else { + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum); + } +} + +/* + * lxpr_read_swaps(): + * + * We don't support swap files or partitions, so just provide a dummy file with + * the necessary header. + */ +/* ARGSUSED */ +static void +lxpr_read_swaps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "Filename " + "Type Size Used Priority\n"); +} + +/* + * inotify tunables exported via /proc. 
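+ * + * These back the Linux sysctl files: + * + * /proc/sys/fs/inotify/max_queued_events + * /proc/sys/fs/inotify/max_user_instances + * /proc/sys/fs/inotify/max_user_watches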
+ */ +extern int inotify_maxevents; +extern int inotify_maxinstances; +extern int inotify_maxwatches; + +static void +lxpr_read_sys_fs_inotify_max_queued_events(lxpr_node_t *lxpnp, + lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_QUEUED_EVENTS); + lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxevents); +} + +static void +lxpr_read_sys_fs_inotify_max_user_instances(lxpr_node_t *lxpnp, + lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_INSTANCES); + lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxinstances); +} + +static void +lxpr_read_sys_fs_inotify_max_user_watches(lxpr_node_t *lxpnp, + lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFY_MAX_USER_WATCHES); + lxpr_uiobuf_printf(uiobuf, "%d\n", inotify_maxwatches); +} + +static void +lxpr_read_sys_kernel_hostname(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_HOSTNAME); + lxpr_uiobuf_printf(uiobuf, "%s\n", uts_nodename()); +} + +static void +lxpr_read_sys_kernel_msgmni(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_MSGMNI); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_msgmni, + curproc->p_zone->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val); +} + +static void +lxpr_read_sys_kernel_ngroups_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_NGROUPS_MAX); + lxpr_uiobuf_printf(uiobuf, "%d\n", ngroups_max); +} + +static void +lxpr_read_sys_kernel_pid_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_PID_MAX); + lxpr_uiobuf_printf(uiobuf, "%d\n", maxpid); +} + +static void +lxpr_read_sys_kernel_shmmax(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + rctl_qty_t val; + + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_SHMMAX); + + mutex_enter(&curproc->p_lock); + val = rctl_enforced_value(rc_zone_shmmax, + curproc->p_zone->zone_rctls, curproc); + mutex_exit(&curproc->p_lock); + + if (val > FOURGB) + val = FOURGB; + + lxpr_uiobuf_printf(uiobuf, "%u\n", (uint_t)val); +} + +static void +lxpr_read_sys_kernel_threads_max(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNEL_THREADS_MAX); + lxpr_uiobuf_printf(uiobuf, "%d\n", curproc->p_zone->zone_nlwps_ctl); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." + */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. 
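+ * (CPUID leaf 0x80000001; the resulting ecx/edx bits are decoded + * against the vendor-specific tables above.)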
*/ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervisor mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? "yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type =
lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return the attributes of the underlying vnode if ATTR_REAL is set, + * but keep fd files with the symlink permissions. + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * restrict attribute information to the owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now get its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink, fifo or socket + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = lxpnp->lxpr_realvp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, which may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_PID_TASK_IDDIR: + vap->va_nlink = TIDDIRFILES; + vap->va_size = TIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + case LXPR_PID_FD_FD: + case LXPR_PID_TID_FD_FD: + /* + * Restore VLNK type for lstat-type activity. + * See lxpr_readlink for more details. + */ + if ((flags & FOLLOW) == 0) + vap->va_type = VLNK; + default: + break; + } + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read-only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_LIMITS: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + case LXPR_PID_TID_FDDIR: + case LXPR_PID_TID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access.
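+ * + * For example, with lxpr_mode == 0444 a caller that is neither the + * owner nor a group member ends up with shift == 6, and a VREAD + * (0400) check below passes because (0444 << 6) still has the 0400 + * bit set.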
*/ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Look up one of the process's task IDs. + */ +static vnode_t * +lxpr_lookup_taskdir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + proc_t *p; + pid_t real_pid; + uint_t tid; + int c; + kthread_t *t; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASKDIR); + + /* + * convert the string rendition of the filename to a thread ID + */ + tid = 0; + while ((c = *comp++) != '\0') { + int otid; + if (c < '0' || c > '9') + return (NULL); + + otid = tid; + tid = 10 * tid + c - '0'; + /* integer overflow */ + if (tid / 10 != otid) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + real_pid = get_real_pid(dlxpnp->lxpr_pid); + p = lxpr_lock(real_pid); + if (p == NULL) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any threads. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + t = lxpr_get_thread(p, tid); + if (t == NULL) { + lxpr_unlock(p); + return (NULL); + } + thread_unlock(t); + + /* + * Allocate and fill in a new lx /proc taskid node.
* Instead of the last arg being an fd, it is a tid. + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_TASK_IDDIR, p, tid); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + lxpr_unlock(p); + return (dp); +} + +/* + * Look up an entry in one of the process's task ID directories. + */ +static vnode_t * +lxpr_lookup_task_tid_dir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + proc_t *p; + pid_t real_pid; + kthread_t *t; + int i; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_TASK_IDDIR); + + /* + * get the proc to work with and lock it + */ + real_pid = get_real_pid(dlxpnp->lxpr_pid); + p = lxpr_lock(real_pid); + if (p == NULL) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any threads. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* need to confirm tid is still there */ + t = lxpr_get_thread(p, dlxpnp->lxpr_desc); + if (t == NULL) { + lxpr_unlock(p); + return (NULL); + } + thread_unlock(t); + + /* + * allocate and fill in the new lx /proc taskid dir node + */ + for (i = 0; i < TIDDIRFILES; i++) { + if (strcmp(tiddir[i].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, tiddir[i].d_type, p, + dlxpnp->lxpr_desc); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + lxpr_unlock(p); + return (dp); + } + } + + lxpr_unlock(p); + return (NULL); +} + +/* + * Look up one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR || + dlxpnp->lxpr_type == LXPR_PID_TID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if (p == NULL) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. + */ + mutex_exit(&p->p_lock); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * Ensure the fd is still kosher; it may have gone away + * between the readdir and the lookup. + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp.
*/ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + /* + * For certain entries (sockets, pipes, etc), Linux expects a + * bogus-named symlink. If that's the case, report the type as + * VNON to bypass link-following elsewhere in the vfs system. + * + * See lxpr_readlink for more details. + */ + if (lxpr_readlink_pid_fd(lxpnp, NULL, 0) == 0) + LXPTOV(lxpnp)->v_type = VNON; + } + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are doing pid lookups. + * + * Don't need to check for "self" as it is implemented as a symlink. + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lx /proc node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lx_procdir, PROCDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sysdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYSDIR); + return (lxpr_lookup_common(dp, comp, NULL, sysdir, SYSDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_kerneldir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_KERNELDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_kerneldir, + SYS_KERNELDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_fsdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FSDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_fsdir, + SYS_FSDIRFILES)); +} + +static vnode_t * +lxpr_lookup_sys_fs_inotifydir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_SYS_FS_INOTIFYDIR); + return (lxpr_lookup_common(dp, comp, NULL, sys_fs_inotifydir, + SYS_FS_INOTIFYDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if
(uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * than that requested then we can't do it in this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes.
We have defined this as the ordering because + * it allows us to more easily keep track of where we are between calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. + */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lx_procdir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, or 0 if zsched, otherwise use the value from + * the proc structure + */ + if (p->p_pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + } else if (p->p_pid == curproc->p_zone->zone_zsched->p_pid) { + pid = 0; + } else { + pid = p->p_pid; + } + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs. + */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * than that requested then we can't do it in this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ?
1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + pid_t find_pid; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + if (lxpnp->lxpr_pid == 1) { + find_pid = curproc->p_zone->zone_proc_initpid; + } else if (lxpnp->lxpr_pid == 0) { + find_pid = curproc->p_zone->zone_zsched->p_pid; + } else { + find_pid = lxpnp->lxpr_pid; + } + p = prfind(find_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_taskdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int tiddirsize = -1; + int tasknum; + pid_t real_pid; + kthread_t *t; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_TASKDIR); + + oresid = uiop->uio_resid; + + real_pid = get_real_pid(lxpnp->lxpr_pid); + p = lxpr_lock(real_pid); + + /* can't read its contents if it died */ + if (p == NULL) { + return (ENOENT); + } + if (p->p_stat == SIDL) { + lxpr_unlock(p); + return (ENOENT); + } + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) + tiddirsize = 0; + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its threads. + */ + mutex_exit(&p->p_lock); + + if (tiddirsize == -1) + tiddirsize = p->p_lwpcnt; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + if ((t = p->p_tlist) == NULL) { + if (eofp != NULL) + *eofp = 1; + goto out; + } + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until all threads have + * been returned. + */ + for (tasknum = 0; (uresid = uiop->uio_resid) > 0; tasknum++) { + int i; + int reclen; + int len; + uint_t emul_tid; + lx_lwp_data_t *lwpd; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the thread list + */ + i = (uoffset / LXPR_SDSIZE) - 2; + if (i < 0 || i >= tiddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (i != tasknum) + goto next; + + lwpd = ttolxlwp(t); + if (lwpd == NULL) { + emul_tid = t->t_tid; + } else { + emul_tid = lwpd->br_pid; + /* + * Convert pid to Linux default of 1 if we're the + * zone's init. + */ + if (emul_tid == curproc->p_zone->zone_proc_initpid) + emul_tid = 1; + } + + dirent->d_ino = lxpr_inode(LXPR_PID_TASK_IDDIR, lxpnp->lxpr_pid, + emul_tid); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", emul_tid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet.
+ */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment of this for + * loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + + if ((t = t->t_forw) == p->p_tlist) { + if (eofp != NULL) + *eofp = 1; + goto out; + } + } + + if (eofp != NULL) + *eofp = 0; + +out: + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + +static int +lxpr_readdir_task_tid_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + pid_t real_pid; + kthread_t *t; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_TASK_IDDIR); + + mutex_enter(&pidlock); + + real_pid = get_real_pid(lxpnp->lxpr_pid); + p = prfind(real_pid); + + /* can't read its contents if it died */ + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + + mutex_exit(&pidlock); + + /* need to confirm tid is still there */ + t = lxpr_get_thread(p, lxpnp->lxpr_desc); + if (t == NULL) { + /* we can't find this specific thread */ + return (ENOENT); + } + thread_unlock(t); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, tiddir, TIDDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize = -1; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR || + lxpnp->lxpr_type == LXPR_PID_TID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) + fddirsize = 0; + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fddirsize == -1) + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until user's request is satisfied or until + * all file descriptors have been examined.
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + +static int +lxpr_readdir_sysdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYSDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sysdir, SYSDIRFILES)); +} + +static int +lxpr_readdir_sys_fsdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FSDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fsdir, + SYS_FSDIRFILES)); +} + +static int +lxpr_readdir_sys_fs_inotifydir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_FS_INOTIFYDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_fs_inotifydir, + SYS_FS_INOTIFYDIRFILES)); +} + +static int +lxpr_readdir_sys_kerneldir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_SYS_KERNELDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, sys_kerneldir, + SYS_KERNELDIRFILES)); +} + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* + * Linux does something very "clever" for /proc/<pid>/fd/<num> entries. + * Open FDs are represented as symlinks, the link contents + * corresponding to the open resource. For plain files or devices, + * this isn't absurd since one can dereference the symlink to query + * the underlying resource. For sockets or pipes, it becomes ugly in a + * hurry. To maintain this human-readable output, those FD symlinks + * point to bogus targets such as "socket:[<inodenum>]". This requires + * circumventing vfs since the stat/lstat behavior on those FD entries + * will be unusual. (A stat must retrieve information about the open + * socket or pipe. It cannot fail because the link contents point to + * an absent file.) + * + * To accomplish this, lxpr_getnode returns a vnode typed VNON for FD + * entries. This bypasses code paths which would normally + * short-circuit on symlinks and allows us to emulate the vfs behavior + * expected by /proc consumers.
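+ * + * As an illustration only (a hypothetical Linux-side consumer; the pid, fd, and inode values are invented): with fd 4 open on a socket whose inode number is 5340, a call such as + * + * n = readlink("/proc/123/fd/4", buf, sizeof (buf) - 1); + * + * succeeds with buf holding "socket:[5340]", while stat(2) on the same entry still describes the open socket itself instead of failing on a dangling link target.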
+ */ + if (vp->v_type != VLNK && lxpnp->lxpr_type != LXPR_PID_FD_FD) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) { + /* + * Special handling possible for /proc/<pid>/fd/<num>. + * Generate <type>:[<inode>] links, if allowed. + */ + if (lxpnp->lxpr_type != LXPR_PID_FD_FD || + lxpr_readlink_pid_fd(lxpnp, bp, buflen) != 0) { + return (error); + } + } + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process or 0 if zsched. + */ + if (curproc->p_pid == + curproc->p_zone->zone_proc_initpid) { + pid = 1; + } else if (curproc->p_pid == + curproc->p_zone->zone_zsched->p_pid) { + pid = 0; + } else { + pid = curproc->p_pid; + } + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * Attempt to create Linux-proc-style fake symlink contents for supported + * /proc/<pid>/fd/<#> entries. + */ +static int +lxpr_readlink_pid_fd(lxpr_node_t *lxpnp, char *bp, size_t len) +{ + const char *format; + vnode_t *rvp = lxpnp->lxpr_realvp; + vattr_t attr; + + switch (rvp->v_type) { + case VSOCK: + format = "socket:[%lu]"; + break; + case VFIFO: + format = "pipe:[%lu]"; + break; + default: + return (-1); + } + + /* Fetch the inode of the underlying vnode */ + if (VOP_GETATTR(rvp, &attr, 0, CRED(), NULL) != 0) + return (-1); + + if (bp != NULL) + (void) snprintf(bp, len, format, (ino_t)attr.va_nodeid); + return (0); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources.
+ */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/brand/lx/sys/lx_audio.h b/usr/src/uts/common/brand/lx/sys/lx_audio.h new file mode 100644 index 0000000000..cbb3431c4b --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_audio.h @@ -0,0 +1,130 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LX_AUDIO_H +#define _LX_AUDIO_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/zone.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * name for this driver + */ +#define LX_AUDIO_DRV "lx_audio" + +/* + * names for the minor nodes this driver exports + */ +#define LXA_MINORNAME_DEVCTL "lx_devctl" +#define LXA_MINORNAME_DSP "lx_dsp" +#define LXA_MINORNAME_MIXER "lx_mixer" + +/* + * minor numbers for the minor nodes this driver exports + */ +#define LXA_MINORNUM_DEVCTL 0 +#define LXA_MINORNUM_DSP 1 +#define LXA_MINORNUM_MIXER 2 +#define LXA_MINORNUM_COUNT 3 + +/* + * driver ioctls + * + * note that we're layering on top of solaris audio devices so we want + * to make sure that our ioctls namespace doesn't conflict with theirs. + * looking in sys/audioio.h and sys/mixer.h we see that they seem to + * use an _IO key of 'A' and 'M', so we'll choose an _IO key of 'a'. + */ + +/* + * administrative ioctls. + * these ioctls are only supported on the DEVCTL minor node + */ +#define LXA_IOC_ZONE_REG (_IOR('a', 0, lxa_zone_reg_t)) +#define LXA_IOC_ZONE_UNREG (_IOR('a', 1, lxa_zone_reg_t)) + + +/* + * audio and mixer device ioctls + * these ioctls are supported on DSP and MIXER minor nodes.
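+ * + * As a hypothetical usage sketch (not part of this interface; fd is an invented descriptor), a process holding an open DSP or MIXER node could fetch the minor number backing its descriptor with the ioctl defined just below: + * + * int minor; + * if (ioctl(fd, LXA_IOC_GETMINORNUM, &minor) != 0) + * return (-1);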
+ */ +#define LXA_IOC_GETMINORNUM (_IOR('a', 20, int)) + +/* + * audio device ioctls. + * these ioctls are supported on DSP minor nodes. + */ +#define LXA_IOC_MMAP_OUTPUT (_IOR('a', 41, int)) +#define LXA_IOC_MMAP_PTR (_IOR('a', 42, int)) +#define LXA_IOC_GET_FRAG_INFO (_IOR('a', 43, lxa_frag_info_t)) +#define LXA_IOC_SET_FRAG_INFO (_IOR('a', 44, lxa_frag_info_t)) + +/* + * mixer device ioctls. + * these ioctls are supported on MIXER minor nodes. + */ +#define LXA_IOC_MIXER_GET_VOL (_IOR('a', 60, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_SET_VOL (_IOR('a', 61, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_GET_MIC (_IOR('a', 62, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_SET_MIC (_IOR('a', 63, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_GET_PCM (_IOR('a', 64, lxa_mixer_levels_t)) +#define LXA_IOC_MIXER_SET_PCM (_IOR('a', 65, lxa_mixer_levels_t)) + +/* command structure for LXA_IOC_ZONE_REG */ +#define LXA_INTSTRLEN 32 +typedef struct lxa_zone_reg { + char lxa_zr_zone_name[ZONENAME_MAX]; + char lxa_zr_inputdev[LXA_INTSTRLEN]; + char lxa_zr_outputdev[LXA_INTSTRLEN]; +} lxa_zone_reg_t; + +/* command structure for LXA_IOC_GET_FRAG_INFO and LXA_IOC_SET_FRAG_INFO */ +typedef struct lxa_frag_info { + int lxa_fi_size; + int lxa_fi_cnt; +} lxa_frag_info_t; + +/* command structure for LXA_IOC_MIXER_GET_* and LXA_IOC_MIXER_SET_* */ +typedef struct lxa_mixer_levels { + int lxa_ml_gain; + int lxa_ml_balance; +} lxa_mixer_levels_t; + +/* verify that a solaris mixer level structure has valid values */ +#define LXA_MIXER_LEVELS_OK(x) (((x)->lxa_ml_gain >= AUDIO_MIN_GAIN) && \ + ((x)->lxa_ml_gain <= AUDIO_MAX_GAIN) && \ + ((x)->lxa_ml_balance >= AUDIO_LEFT_BALANCE) && \ + ((x)->lxa_ml_balance <= AUDIO_RIGHT_BALANCE)) + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUDIO_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs.h b/usr/src/uts/common/brand/lx/sys/lx_autofs.h new file mode 100644 index 0000000000..a6ad86f594 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs.h @@ -0,0 +1,337 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_AUTOFS_H +#define _LX_AUTOFS_H + +/* + * The lx_autofs filesystem exists to emulate the Linux autofs filesystem + * and provide support for the Linux "automount" automounter. + * + * + * + * +++ Linux automounter background. + * + * Linux has two automounters: "amd" and "automount" + * + * 1) "amd" is a userland NFS server. It basically mounts an NFS filesystem + * at an automount point, and it acts as the NFS server for the mount.
When + * an access is done to that NFS filesystem, the access is redirected by the + * kernel to the "amd" process via rpc. "amd" then looks up any information + * required to resolve the requests, mounts real NFS filesystems if + * necessary, and returns. "amd" has its own strange configuration + * mechanism that doesn't seem to be very compatible with Solaris's network + * based automounter map support. + * + * 2) "automount" is the other Linux automounter. It utilizes a kernel + * filesystem (autofs) to provide its functionality. Basically, it mounts + * the autofs filesystem at any automounter controlled mount point. This + * filesystem then intercepts and redirects lookup operations (and only + * lookup ops) to the userland automounter process via a pipe. (The + * pipe to the automounter is established via mount options when the autofs + * filesystem is mounted.) When the automounter receives a request via this + * pipe, it does lookups to whatever backing store it's configured to use, + * does mkdir operations on the autofs filesystem, mounts remote NFS + * filesystems on any leaf directories it just created, and signals the + * autofs filesystem via an ioctl to let it know that the lookup can + * continue. + * + * + * + * +++ Linux autofs (and automount daemon) notes + * + * Since we're mimicking the behavior of the Linux autofs filesystem it's + * important to document some of its observed behavior here since there's + * no doubt that in the future this behavior will change. These comments + * apply to the behavior of the automounter as observed on a system + * running Linux v2.4.21 (autofs is bundled with the Linux kernel). + * + * A) Autofs allows root owned, non-automounter processes to create + * directories in the autofs filesystem. The autofs filesystem treats the + * automounter's process group as special, but it doesn't prevent root + * processes outside of the automounter's process group from creating new + * directories in the autofs filesystem. + * + * B) Autofs doesn't allow creation of any non-directory entries in the + * autofs filesystem. No entity can create files (e.g. /bin/touch or + * VOP_CREATE/VOP_SYMLINK/etc.) The only entries that can exist within + * the autofs filesystem are directories. + * + * C) Autofs only intercepts vop lookup operations. Notably, it does _not_ + * intercept and re-direct vop readdir operations. This means that the + * observed behavior of the Linux automounter can be considerably different + * from that of the Solaris automounter. Specifically, on Solaris if an autofs + * mount point is mounted _without_ the -nobrowse option then if a user does + * an ls operation (which translates into a vop readdir operation) then the + * automounter will intercept that operation and list all the possible + * directories and mount points without actually mounting any filesystems. + * Essentially, all automounter managed mount points on Linux will behave + * like "-nobrowse" mount points on Solaris. Here's an example to + * illustrate this. If /ws was mounted on Solaris without the -nobrowse + * option and an auto_ws yp map was setup as the backing store for this + * mount point, then an "ls /ws" would list all the keys in the map as + * valid directories, but an "ls /ws" on Linux would list an empty + * directory. + * + * D) NFS mounts are performed by the automount process.
When the automount + * process gets a redirected lookup request, it determines _all_ the + * possible remote mount points for that request, creates directory paths + * via mkdir, and mounts the remote filesystems on the newly created paths. + * So for example, if a machine called mcescher exported /var/crash and + * /var/core, an "ls /net/mcescher" would result in the following actions + * being done by the automounter: + * mkdir /net/mcescher + * mkdir /net/mcescher/var + * mkdir /net/mcescher/var/crash + * mkdir /net/mcescher/var/core + * mount mcescher:/var/crash /net/mcescher/var/crash + * mount mcescher:/var/core /net/mcescher/var/core + * once the automounter completed the work above it would signal the autofs + * filesystem (via an ioctl) that the lookup could continue. + * + * E.1) Autofs only redirects vop lookup operations for path entries that + * don't already exist in the autofs filesystem. So for the example above, + * an initial (after the start of the automounter) "ls /net/mcescher" would + * result in a request to the automounter. A subsequent "ls /net/mcescher" + * would not result in a request to the automounter. Even if + * /net/mcescher/var/crash and /net/mcescher/var/core were manually unmounted + * after the initial "ls /net/mcescher", a subsequent "ls /net/mcescher" + * would not result in a new request to the automounter. + * + * E.2) Autofs lookup requests that are sent to the automounter only include + * the root directory path component. So for example, after starting up + * the automounter if a user were to do a "ls /net/mcescher/var/crash", the + * lookup request actually sent to the automounter would just be for + * "mcescher". (The same request as if the user had done "ls /net/mcescher".) + * + * E.3) The two statements above aren't entirely true. The Linux + * autofs filesystem will also redirect lookup operations for leaf + * directories that don't have a filesystem mounted on them. Using the + * example above, if a user did a "ls /net/mcescher", then manually + * unmounted /net/mcescher/var/crash, and then did an "ls + * /net/mcescher/var/crash", this would result in a request for + * "mcescher/var/crash" being sent to the automounter. The strange thing + * (a Linux bug perhaps) is that the automounter won't do anything with this + * request and the lookup will fail. + * + * F) The autofs filesystem communication protocol (what ioctls it supports + * and what data it passes to the automount process) is versioned. The + * source for the userland automount daemon (I looked at version v3.1.7) + * seemed to support two versions of the Linux kernel autofs implementation. + * Both versions supported communication with a pipe and the format of the + * structure passed via this pipe was the same. The difference between the + * two versions was in the functionality supported. (The v3 version has + * additional ioctls to support automount timeouts.) + * + * + * + * +++ lx_autofs notes + * + * 1) In general, the lx_autofs filesystem tries to mimic the behavior of the + * Linux autofs filesystem with the following exceptions: + * + * 1.1) We don't bother to implement the E.3 functionality listed above + * since it doesn't appear to be of any use. + * + * 1.2) We only implement v2 of the automounter protocol since + * implementing v3 would take a _lot_ more work. If this proves to be a + * problem we can re-visit this decision later. (More details about v3 + * support are included in comments below.)
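+ * + * As a rough sketch of the v2 setup (illustrative values, not a definitive trace), the automount daemon creates a pipe and passes one end to the filesystem via the mount options defined later in this header: + * + * (void) pipe(fds); + * (void) snprintf(opts, sizeof (opts), + * "fd=%d,pgrp=%d,minproto=2,maxproto=2", fds[1], getpgrp()); + * (void) mount("automount", "/net", "autofs", 0, opts); + * + * after which it reads lookup requests from fds[0].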
+ * + * 2) In general, the approach taken for lx_autofs is to keep it as simple + * as possible and to minimize its memory usage. To do this all information + * about the contents of the lx_autofs filesystem is mirrored in the + * underlying filesystem that lx_autofs is mounted on and most vop operations + * are simply passed onto this underlying filesystem. This means we don't + * have to implement most of the complex operations that a full filesystem + * normally has to implement. It also means that most of our filesystem state + * (wrt the contents of the filesystem) doesn't actually have to be stored + * in memory; we can simply go to the underlying filesystem to get it when + * it's requested. For the purposes of discussion, we'll call the underlying + * filesystem the "backing store." + * + * The backing store is actually a directory called ".lx_afs" which is created in + * the directory where the lx_autofs filesystem is mounted. When the lx_autofs + * filesystem is unmounted this backing store directory is deleted. If this + * directory exists at mount time (perhaps the system crashed while a previous + * lx_autofs instance was mounted at the same location) it will be deleted. + * There are a few implications of using a backing store worth mentioning. + * + * 2.1) lx_autofs can't be mounted on a read only filesystem. If this + * proves to be a problem we can probably move the location of the + * backing store. + * + * 2.2) If the backing store filesystem runs out of space then the + * automounter process won't be able to create more directories and mount + * new filesystems. Of course, strange failures usually happen when + * filesystems run out of space. + * + * 3) Why aren't we using gfs? gfs has two different usage models. + * + * 3.1) I'm my own filesystem but I'm using gfs to help with managing + * readdir operations. + * + * 3.2) I'm a gfs filesystem and gfs is managing all my vnodes + * + * We're not using the 3.1 interfaces because we don't implement readdir + * ourselves. We pass all readdir operations onto the backing store + * filesystem and utilize its readdir implementation. + * + * We're not using the 3.2 interfaces because they are really designed for + * in memory filesystems where all of the filesystem state is stored in + * memory. They don't lend themselves to filesystems where part of the + * state is in memory and part of the state is on disk. + * + * For more information on gfs take a look at the block comments in the + * top of gfs.c + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that the name of the actual Solaris filesystem is lx_afs and not + * lx_autofs. This is because filesystem names are stupidly limited to 8 + * characters. + */ +#define LX_AUTOFS_NAME "lx_afs" + +/* + * Mount options supported. + */ +#define LX_MNTOPT_FD "fd" +#define LX_MNTOPT_PGRP "pgrp" +#define LX_MNTOPT_MINPROTO "minproto" +#define LX_MNTOPT_MAXPROTO "maxproto" +#define LX_MNTOPT_MODE "mode" +#define LX_MNTOPT_SIZE "size" + +/* Version of the Linux kernel automount protocol we support. */ +#define LX_AUTOFS_PROTO_VERSION 2 + +/* + * Command structure sent to automount process from lx_autofs via a pipe. + * This structure is the same for v2 and v3 of the automount protocol + * (the communication pipe is established at mount time).
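+ * + * A hypothetical daemon-side sketch (descriptor names invented, not part of this interface) of servicing one request with the structure and ioctls below: + * + * lx_autofs_pkt_t pkt; + * (void) read(pipe_fd, &pkt, sizeof (pkt)); + * (create directories and perform mounts for pkt.lap_name) + * (void) ioctl(autofs_fd, LX_AUTOFS_IOC_READY, pkt.lap_id); + * + * with LX_AUTOFS_IOC_FAIL sent instead when the lookup cannot be satisfied.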
+ */ +typedef struct lx_autofs_pkt { + int lap_protover; /* protocol version number */ + int lap_constant; /* always set to 0 */ + int lap_id; /* every pkt must have a unique id */ + int lap_name_len; /* don't include newline or NULL */ + char lap_name[256]; /* path component to lookup */ +} lx_autofs_pkt_t; + +/* + * Ioctls supported (v2 protocol). + */ +#define LX_AUTOFS_IOC_READY 0x00009360 /* arg: int */ +#define LX_AUTOFS_IOC_FAIL 0x00009361 /* arg: int */ +#define LX_AUTOFS_IOC_CATATONIC 0x00009362 /* arg: <none> */ + +/* + * Ioctls not supported (v3 protocol). + * + * Initially we're only going to support v2 of the Linux kernel automount + * protocol. This means that we don't support the following ioctls. + * + * 1) The protocol version ioctl (by not supporting it the automounter + * will assume version 2). + * + * 2) Automounter timeout ioctls. For v3 and later the automounter can + * be started with a timeout option. It will notify the filesystem of + * this timeout and, if any automounter filesystem root directory entry + * is not in use, it will notify the automounter via the LX_AUTOFS_IOC_EXPIRE + * ioctl. For example, if the timeout is 60 seconds, the Linux + * automounter will use the LX_AUTOFS_IOC_EXPIRE ioctl to query for + * timeouts more often than that. (v3.1.7 of the automount daemon would + * perform this ioctl every <timeout>/4 seconds.) Then the autofs + * filesystem will + * report top level directories that aren't in use to the automounter + * via this ioctl. If /net was managed by the automounter and + * there were the following mount points: + * /net/jurassic/var/crash + * /net/mcescher/var/crash + * and no one was looking at any crash dumps on mcescher but someone + * was analyzing a crash dump on jurassic, then after <timeout> seconds + * had passed the autofs filesystem would let the automounter know that + * "mcescher" could be unmounted. (Note the granularity of notification + * is directories in the root of the autofs filesystem.) Here are two + * ideas for how this functionality could be implemented on Solaris: + * + * 2.1) The easy incomplete way. Don't do any in-use detection. Simply + * tell the automounter it can try to unmount the filesystem every time + * the specified timeout passes. If the filesystem is in use then the + * unmount will fail. This would break down for remote hosts with multiple + * mounts. For example, if the automounter had mounted the following + * filesystems: + * /net/jurassic/var/crash + * /net/jurassic/var/core + * and the user was looking at a core file, and the timeout expired, the + * automounter would receive notification to unmount "jurassic". Then + * it would unmount crash (which would succeed) and then try to unmount + * core (which would fail). After that (since the automounter only + * performs mounts for failed lookups in the root autofs directory) + * future access to /net/jurassic/var/crash would result in access + * to an empty autofs directory. We might be able to work around + * this by caching which root autofs directories we've timed out, + * then any access to paths that contain those directories could be + * stalled and we could resend another request to the automounter. + * This could work if the automounter ignores mount failures. + * + * 2.2) The hard correct way. The real difficulty here is detecting + * files in use on other filesystems (say NFS) that have been mounted + * on top of autofs. (Detecting in use autofs vnodes should be easy.)
+ * To do this we would probably have to create a new brand op to intercept + * mount/umount filesystem operations. Then using this entry point we + * could detect mounts of other filesystems on top of lx_autofs. When + * a successful mount finishes we would use the FEM (file event + * monitoring) framework to push a module onto that filesystem and + * intercept VOP operations that allocate/free vnodes in that filesystem. + * (We would also then have to track mount operations on top of that + * filesystem, etc.) This would allow us to properly detect any + * usage of subdirectories of an autofs directory. + */ +#define LX_AUTOFS_IOC_PROTOVER 0x80049363 /* arg: int */ +#define LX_AUTOFS_IOC_EXPIRE 0x81109365 /* arg: lx_autofs_expire * */ +#define LX_AUTOFS_IOC_SETTIMEOUT 0xc0049364 /* arg: ulong_t */ + +typedef struct lx_autofs_expire { + int lap_protover; /* protocol version number */ + int lap_constant; /* always set to 1 */ + int lap_name_len; /* don't include newline or NULL */ + char lap_name[256]; /* path component that has timed out */ +} lx_autofs_expire_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h new file mode 100644 index 0000000000..9c5517b8d5 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_autofs_impl.h @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LX_AUTOFS_IMPL_H +#define _LX_AUTOFS_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/file.h> +#include <sys/id_space.h> +#include <sys/modhash.h> +#include <sys/vnode.h> + +#include <sys/lx_autofs.h> + +/* + * Space key. + * Used to persist data across lx_autofs filesystem module unloads. + */ +#define LX_AUTOFS_SPACE_KEY_UDEV LX_AUTOFS_NAME "_udev" + +/* + * Name of the backing store directory. + */ +#define LX_AUTOFS_BS_DIR "." LX_AUTOFS_NAME + +#define LX_AUTOFS_VFS_ID_HASH_SIZE 15 +#define LX_AUTOFS_VFS_PATH_HASH_SIZE 15 +#define LX_AUTOFS_VFS_VN_HASH_SIZE 15 + +/* + * VFS data object. + */ +typedef struct lx_autofs_vfs { + /* Info about the underlying filesystem and backing store. */ + vnode_t *lav_mvp; + char *lav_bs_name; + vnode_t *lav_bs_vp; + + /* Info about the automounter process managing this filesystem. */ + int lav_fd; + pid_t lav_pgrp; + file_t *lav_fifo_wr; + file_t *lav_fifo_rd; + + /* Each automount request needs a unique id. */ + id_space_t *lav_ids; + + /* All remaining structure members are protected by lav_lock. */ + kmutex_t lav_lock; + + /* Hashes to keep track of outstanding automounter requests.
*/ + mod_hash_t *lav_path_hash; + mod_hash_t *lav_id_hash; + + /* We need to keep track of all our vnodes. */ + vnode_t *lav_root; + mod_hash_t *lav_vn_hash; +} lx_autofs_vfs_t; + +/* + * Structure to keep track of requests sent to the automounter. + */ +typedef struct lx_autofs_lookup_req { + /* Packet that gets sent to the automounter. */ + lx_autofs_pkt_t lalr_pkt; + + /* Reference count. Always updated atomically. */ + uint_t lalr_ref; + + /* + * Fields to keep track of and sync threads waiting on a lookup. + * Fields are protected by lalr_lock. + */ + kmutex_t lalr_lock; + kcondvar_t lalr_cv; + int lalr_complete; +} lx_autofs_lookup_req_t; + +/* + * Generic stack structure. + */ +typedef struct stack_elem { + list_node_t se_list; + caddr_t se_ptr1; + caddr_t se_ptr2; + caddr_t se_ptr3; +} stack_elem_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_AUTOFS_IMPL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_brand.h b/usr/src/uts/common/brand/lx/sys/lx_brand.h new file mode 100644 index 0000000000..882b25743a --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_brand.h @@ -0,0 +1,698 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_BRAND_H +#define _LX_BRAND_H + +#ifndef _ASM +#include <sys/types.h> +#include <sys/cpuvar.h> +#include <sys/zone.h> +#include <sys/ksocket.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define LX_BRANDNAME "lx" + +/* + * Brand uname info + */ +#define LX_UNAME_SYSNAME "Linux" +#define LX_UNAME_RELEASE_2_6 "2.6.18" +#define LX_UNAME_RELEASE_2_4 "2.4.21" +#define LX_UNAME_VERSION "BrandZ virtual linux" +#define LX_UNAME_MACHINE32 "i686" +#define LX_UNAME_MACHINE64 "x86_64" + +#define LX_LIB_PATH32 "/native/usr/lib/lx_brand.so.1" +#define LX_LIB_PATH64 "/native/usr/lib/amd64/lx_brand.so.1" + +#if defined(_LP64) +#define LX_LIB_PATH LX_LIB_PATH64 +#define LX_UNAME_MACHINE LX_UNAME_MACHINE64 +#else +#define LX_LIB_PATH LX_LIB_PATH32 +#define LX_UNAME_MACHINE LX_UNAME_MACHINE32 +#endif + +/* + * This must be large enough for both the 32-bit table and 64-bit table. + */ +#define LX_NSYSCALLS 358 + +/* + * brand(2) subcommands + * + * Everything >= 128 is a brand-specific subcommand. + * > 192 is reserved for in-kernel emulated system calls.
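+ * + * As a hedged illustration (the actual invocations live in the brand library, not this header, and the argument shown is invented), emulation code reaches these subcommands through the brand system call, along the lines of: + * + * (void) syscall(SYS_brand, B_UNSUPPORTED, "unimplemented syscall");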
+ */ +#define B_LPID_TO_SPAIR 128 +#define B_GET_CURRENT_CONTEXT 129 +#define B_EMULATION_DONE 130 +#define B_PTRACE_KERNEL 131 +#define B_SET_AFFINITY_MASK 132 +#define B_GET_AFFINITY_MASK 133 +#define B_PTRACE_CLONE_BEGIN 134 +#define B_PTRACE_STOP_FOR_OPT 135 +#define B_UNSUPPORTED 136 +#define B_STORE_ARGS 137 +#define B_GETPID 138 +#define B_JUMP_TO_LINUX 139 +#define B_SET_THUNK_PID 140 +#define B_EXIT_AS_SIG 141 +#define B_HELPER_WAITID 142 +#define B_HELPER_CLONE 143 +#define B_HELPER_SETGROUPS 144 +#define B_HELPER_SIGQUEUE 145 +#define B_HELPER_TGSIGQUEUE 146 +#define B_SET_NATIVE_STACK 147 + +#ifndef _ASM +/* + * Support for Linux PTRACE_SETOPTIONS handling. + */ +typedef enum lx_ptrace_options { + LX_PTRACE_O_TRACESYSGOOD = 0x0001, + LX_PTRACE_O_TRACEFORK = 0x0002, + LX_PTRACE_O_TRACEVFORK = 0x0004, + LX_PTRACE_O_TRACECLONE = 0x0008, + LX_PTRACE_O_TRACEEXEC = 0x0010, + LX_PTRACE_O_TRACEVFORKDONE = 0x0020, + LX_PTRACE_O_TRACEEXIT = 0x0040, + LX_PTRACE_O_TRACESECCOMP = 0x0080 +} lx_ptrace_options_t; + +#define LX_PTRACE_O_ALL \ + (LX_PTRACE_O_TRACESYSGOOD | LX_PTRACE_O_TRACEFORK | \ + LX_PTRACE_O_TRACEVFORK | LX_PTRACE_O_TRACECLONE | \ + LX_PTRACE_O_TRACEEXEC | LX_PTRACE_O_TRACEVFORKDONE | \ + LX_PTRACE_O_TRACEEXIT | LX_PTRACE_O_TRACESECCOMP) +#endif /* !_ASM */ + +/* siginfo si_status for traced events */ +#define LX_PTRACE_EVENT_FORK 0x100 +#define LX_PTRACE_EVENT_VFORK 0x200 +#define LX_PTRACE_EVENT_CLONE 0x300 +#define LX_PTRACE_EVENT_EXEC 0x400 +#define LX_PTRACE_EVENT_VFORK_DONE 0x500 +#define LX_PTRACE_EVENT_EXIT 0x600 +#define LX_PTRACE_EVENT_SECCOMP 0x700 + +/* + * Brand-private values for the "pr_what" member of lwpstatus, for use with the + * PR_BRAND stop reason. These reasons are validated in lx_stop_notify(); + * update it if you add new reasons here. + */ +#define LX_PR_SYSENTRY 1 +#define LX_PR_SYSEXIT 2 +#define LX_PR_SIGNALLED 3 +#define LX_PR_EVENT 4 + + +#define LX_VERSION_1 1 +#define LX_VERSION LX_VERSION_1 + +#define LX_KERN_VERSION_NUM ZONE_ATTR_BRAND_ATTRS + +/* + * Aux vector containing phdr of Linux executable and ehdr of interpreter + * (if any), both of which are used by lx_librtld_db to ascertain r_debug. + * We repurpose the 3rd brand-specific aux vector slot for the Linux + * AT_SYSINFO_EHDR entry (we modify the a_type in the brand library). + */ +#define AT_SUN_BRAND_LX_PHDR AT_SUN_BRAND_AUX1 +#define AT_SUN_BRAND_LX_INTERP AT_SUN_BRAND_AUX2 +#define AT_SUN_BRAND_LX_SYSINFO_EHDR AT_SUN_BRAND_AUX3 + +/* Aux vector containing hz value */ +#define AT_CLKTCK 17 +/* Aux vector containing vDSO addr */ +#define AT_SYSINFO_EHDR 33 + +/* + * Usermode emulation routines are run on an alternate stack allocated by + * the brand library. Every LWP in a process will incur this overhead beyond + * the regular thread stack: + */ +#define LX_NATIVE_STACK_PAGE_COUNT 64 + +/* + * When returning in a new child process created with vfork(2) (or CLONE_VFORK) + * we discard some of the native stack to prevent corruption of the parent + * emulation state. 
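+ * + * (For scale: with 4K pages, LX_NATIVE_STACK_PAGE_COUNT above works out to 256K of native stack per LWP, and the 0x3000-byte gap below is three such pages.)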
+ */ +#define LX_NATIVE_STACK_VFORK_GAP 0x3000 + +#ifndef _ASM + +extern struct brand lx_brand; + +typedef struct lx_brand_registration { + uint_t lxbr_version; /* version number */ + void *lxbr_handler; /* base address of handler */ + uint32_t lxbr_flags; /* LX_PROC_* registration flags */ +} lx_brand_registration_t; + +typedef struct lx_brand_registration32 { + uint_t lxbr_version; /* version number */ + uint32_t lxbr_handler; /* base address of handler */ + uint32_t lxbr_flags; /* LX_PROC_* registration flags */ +} lx_brand_registration32_t; + +#ifdef __amd64 + +typedef struct lx_regs { + long lxr_fs; + long lxr_rdi; + long lxr_rsi; + long lxr_rbp; + long lxr_rsp; + long lxr_rbx; + long lxr_rdx; + long lxr_rcx; + long lxr_rax; + long lxr_r8; + long lxr_r9; + long lxr_r10; + long lxr_r11; + long lxr_r12; + long lxr_r13; + long lxr_r14; + long lxr_r15; + long lxr_rip; + + long lxr_orig_rax; +} lx_regs_t; + +typedef struct lx_regs32 { + uint32_t lxr_gs; + uint32_t lxr_edi; + uint32_t lxr_esi; + uint32_t lxr_ebp; + uint32_t lxr_esp; + uint32_t lxr_ebx; + uint32_t lxr_edx; + uint32_t lxr_ecx; + uint32_t lxr_eax; + uint32_t lxr_eip; + + uint32_t lxr_orig_eax; +} lx_regs32_t; + +#else /* ! __amd64 */ + +typedef struct lx_regs { + long lxr_gs; + long lxr_edi; + long lxr_esi; + long lxr_ebp; + long lxr_esp; + long lxr_ebx; + long lxr_edx; + long lxr_ecx; + long lxr_eax; + long lxr_eip; + + long lxr_orig_eax; +} lx_regs_t; + +#endif /* __amd64 */ + +#ifdef __amd64 +/* + * The 64-bit native "user_regs_struct" Linux structure. + */ +typedef struct lx_user_regs { + long lxur_r15; + long lxur_r14; + long lxur_r13; + long lxur_r12; + long lxur_rbp; + long lxur_rbx; + long lxur_r11; + long lxur_r10; + long lxur_r9; + long lxur_r8; + long lxur_rax; + long lxur_rcx; + long lxur_rdx; + long lxur_rsi; + long lxur_rdi; + long lxur_orig_rax; + long lxur_rip; + long lxur_xcs; + long lxur_rflags; + long lxur_rsp; + long lxur_xss; + long lxur_xfs_base; + long lxur_xgs_base; + long lxur_xds; + long lxur_xes; + long lxur_xfs; + long lxur_xgs; +} lx_user_regs_t; + +#if defined(_KERNEL) && defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit "user_regs_struct" Linux structure. + */ +typedef struct lx_user_regs32 { + int32_t lxur_ebx; + int32_t lxur_ecx; + int32_t lxur_edx; + int32_t lxur_esi; + int32_t lxur_edi; + int32_t lxur_ebp; + int32_t lxur_eax; + int32_t lxur_xds; + int32_t lxur_xes; + int32_t lxur_xfs; + int32_t lxur_xgs; + int32_t lxur_orig_eax; + int32_t lxur_eip; + int32_t lxur_xcs; + int32_t lxur_eflags; + int32_t lxur_esp; + int32_t lxur_xss; +} lx_user_regs32_t; +#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */ + +#else /* !__amd64 */ +/* + * The 32-bit native "user_regs_struct" Linux structure. + */ +typedef struct lx_user_regs { + long lxur_ebx; + long lxur_ecx; + long lxur_edx; + long lxur_esi; + long lxur_edi; + long lxur_ebp; + long lxur_eax; + long lxur_xds; + long lxur_xes; + long lxur_xfs; + long lxur_xgs; + long lxur_orig_eax; + long lxur_eip; + long lxur_xcs; + long lxur_eflags; + long lxur_esp; + long lxur_xss; +} lx_user_regs_t; +#endif /* __amd64 */ + +#endif /* _ASM */ + +/* + * GDT usage + */ +#define GDT_TLSMIN (GDT_BRANDMIN) +#define GDT_TLSMAX (GDT_TLSMIN + 2) +#define LX_TLSNUM (GDT_TLSMAX - GDT_TLSMIN) + +#ifndef _ASM + +/* + * Stores information needed by the lx linker to launch the main + * lx executable. 
+ */ +typedef struct lx_elf_data64 { + uintptr_t ed_phdr; + uintptr_t ed_phent; + uintptr_t ed_phnum; + uintptr_t ed_entry; + uintptr_t ed_base; + uintptr_t ed_ldentry; +} lx_elf_data64_t; + +typedef struct lx_elf_data32 { + uint32_t ed_phdr; + uint32_t ed_phent; + uint32_t ed_phnum; + uint32_t ed_entry; + uint32_t ed_base; + uint32_t ed_ldentry; +} lx_elf_data32_t; + +#if defined(_LP64) +typedef lx_elf_data64_t lx_elf_data_t; +#else +typedef lx_elf_data32_t lx_elf_data_t; +#endif + +typedef enum lx_proc_flags { + LX_PROC_INSTALL_MODE = 0x01, + LX_PROC_STRICT_MODE = 0x02 +} lx_proc_flags_t; + +#define LX_PROC_ALL (LX_PROC_INSTALL_MODE | LX_PROC_STRICT_MODE) + +#ifdef _KERNEL + +#define LX_RLFAKE_LOCKS 0 +#define LX_RLFAKE_NICE 1 +#define LX_RLFAKE_RTPRIO 2 +#define LX_RLFAKE_RTTIME 3 + +#define LX_RLFAKE_NLIMITS 4 + +#define LX_RLIM64_INFINITY (~0ULL) + +typedef struct { + uint64_t rlim_cur; + uint64_t rlim_max; +} lx_rlimit64_t; + +typedef struct lx_proc_data { + uintptr_t l_handler; /* address of user-space handler */ + pid_t l_ppid; /* pid of originating parent proc */ + uint64_t l_ptrace; /* process being observed with ptrace */ + lx_elf_data_t l_elf_data; /* ELF data for linux executable */ + int l_signal; /* signal to deliver to parent when this */ + /* thread group dies */ + lx_proc_flags_t l_flags; + + lx_rlimit64_t l_fake_limits[LX_RLFAKE_NLIMITS]; +} lx_proc_data_t; + +#endif /* _KERNEL */ + +/* + * A data type big enough to bitmap all possible Linux cpus. + * The bitmap size is defined as 1024 cpus in the Linux 2.4 and 2.6 man pages + * for sched_getaffinity() and sched_setaffinity(). + */ +#define LX_NCPU (1024) +#define LX_AFF_ULONGS (LX_NCPU / (8 * sizeof (ulong_t))) +typedef ulong_t lx_affmask_t[LX_AFF_ULONGS]; + +/* Max. length of kernel version string */ +#define LX_VERS_MAX 16 + +/* + * Flag values for uc_brand_data[0] in the ucontext_t: + */ +#define LX_UC_STACK_NATIVE 0x00001 +#define LX_UC_STACK_BRAND 0x00002 +#define LX_UC_RESTORE_NATIVE_SP 0x00010 +#define LX_UC_FRAME_IS_SYSCALL 0x00100 +#define LX_UC_RESTART_SYSCALL 0x01000 +#define LX_UC_IGNORE_LINK 0x10000 + +#ifdef _KERNEL + +typedef struct lx_lwp_data lx_lwp_data_t; + +/* + * Flag values for "lxpa_flags" on a ptrace(2) accord. + */ +typedef enum lx_accord_flags { + LX_ACC_TOMBSTONE = 0x01 +} lx_accord_flags_t; + +/* + * Flag values for "br_ptrace_flags" in the LWP-specific data. + */ +typedef enum lx_ptrace_state { + LX_PTRACE_SYSCALL = 0x01, + LX_PTRACE_EXITING = 0x02, + LX_PTRACE_STOPPING = 0x04, + LX_PTRACE_INHERIT = 0x08, + LX_PTRACE_STOPPED = 0x10, + LX_PTRACE_PARENT_WAIT = 0x20, + LX_PTRACE_CLDPEND = 0x40, + LX_PTRACE_CLONING = 0x80, + LX_PTRACE_WAITPEND = 0x100 +} lx_ptrace_state_t; + +/* + * A ptrace(2) accord represents the relationship between a tracer LWP and the + * set of LWPs that it is tracing: the tracees. This data structure belongs + * primarily to the tracer, but is reference counted so that it may be freed by + * whoever references it last. + */ +typedef struct lx_ptrace_accord { + kmutex_t lxpa_lock; + uint_t lxpa_refcnt; + lx_accord_flags_t lxpa_flags; + + /* + * The tracer must hold "pidlock" while clearing these fields for + * exclusion of waitid(), etc. + */ + lx_lwp_data_t *lxpa_tracer; + kcondvar_t *lxpa_cvp; + + /* + * The "lxpa_tracees_lock" mutex protects the tracee list. + */ + kmutex_t lxpa_tracees_lock; + list_t lxpa_tracees; +} lx_ptrace_accord_t; + +/* + * These values are stored in the per-LWP data for a tracee when it is attached + * to a tracer.
They record the method that was used to attach. + */ +typedef enum lx_ptrace_attach { + LX_PTA_NONE = 0x00, /* not attached */ + LX_PTA_ATTACH = 0x01, /* due to tracer using PTRACE_ATTACH */ + LX_PTA_TRACEME = 0x02, /* due to child using PTRACE_TRACEME */ + LX_PTA_INHERIT_CLONE = 0x04, /* due to PTRACE_CLONE clone(2) flag */ + LX_PTA_INHERIT_OPTIONS = 0x08 /* due to PTRACE_SETOPTIONS options */ +} lx_ptrace_attach_t; + +typedef enum lx_stack_mode { + LX_STACK_MODE_PREINIT = 0, + LX_STACK_MODE_INIT, + LX_STACK_MODE_NATIVE, + LX_STACK_MODE_BRAND +} lx_stack_mode_t; + +struct lx_pid { + pid_t s_pid; /* the SunOS pid and ... */ + id_t s_tid; /* ... tid pair */ + pid_t l_pid; /* the corresponding linux pid */ + time_t l_start; /* birthday of this pid */ + struct pid *l_pidp; + struct lx_pid *stol_next; /* link in stol hash table */ + struct lx_pid *ltos_next; /* link in ltos hash table */ +}; + +/* + * lx-specific data in the klwp_t + */ +struct lx_lwp_data { + uint_t br_lwp_flags; /* misc. flags */ + klwp_t *br_lwp; /* back pointer to container lwp */ + int br_signal; /* signal to send to parent when */ + /* clone()'ed child terminates */ + int br_exitwhy; /* reason for thread (process) exit */ + int br_exitwhat; /* exit code / killing signal */ + lx_affmask_t br_affinitymask; /* bitmask of CPU sched affinities */ + struct user_desc br_tls[LX_TLSNUM]; + /* descriptors used by libc for TLS */ + ulong_t br_lx_fsbase; /* lx fsbase for 64-bit thread ptr */ + ulong_t br_ntv_fsbase; /* native fsbase 64-bit thread ptr */ + ulong_t br_lx_gsbase; /* lx user-land gsbase */ + ulong_t br_ntv_gsbase; /* native user-land gsbase */ + pid_t br_pid; /* converted pid for this thread */ + pid_t br_tgid; /* thread group ID for this thread */ + pid_t br_ppid; /* parent pid for this thread */ + id_t br_ptid; /* parent tid for this thread */ + void *br_clear_ctidp; /* clone thread id ptr */ + void *br_set_ctidp; /* clone thread id ptr */ + void *br_robust_list; /* robust lock list, if any */ + + /* + * The following struct is used by some system calls to pass extra + * flags into the kernel without impinging on the namespace for + * illumos. + */ + void *br_scall_args; + int br_args_size; /* size in bytes of br_scall_args */ + + boolean_t br_waitid_emulate; + int br_waitid_flags; + + lx_ptrace_state_t br_ptrace_flags; /* ptrace state for this LWP */ + lx_ptrace_options_t br_ptrace_options; /* PTRACE_SETOPTIONS options */ + lx_ptrace_options_t br_ptrace_clone_option; /* current clone(2) type */ + + lx_ptrace_attach_t br_ptrace_attach; /* how did we get attached */ + lx_ptrace_accord_t *br_ptrace_accord; /* accord for this tracer LWP */ + lx_ptrace_accord_t *br_ptrace_tracer; /* accord tracing this LWP */ + list_node_t br_ptrace_linkage; /* linkage for lxpa_tracees list */ + + ushort_t br_ptrace_whystop; /* stop reason, 0 for no stop */ + ushort_t br_ptrace_whatstop; /* stop sub-reason */ + + int32_t br_ptrace_stopsig; /* stop signal, 0 for no signal */ + uintptr_t br_ptrace_stopucp; /* usermode ucontext_t pointer */ + + uint_t br_ptrace_event; + ulong_t br_ptrace_eventmsg; + + int br_syscall_num; /* current system call number */ + boolean_t br_syscall_restart; /* should restart on EINTR */ + + /* + * Store the LX_STACK_MODE for this LWP, and the current extent of the + * native (emulation) stack. This is similar, in principle, to the + * sigaltstack mechanism for signal handling. We also use this mode + * flag to determine how to process system calls from this LWP. 
+ */ + lx_stack_mode_t br_stack_mode; + uintptr_t br_ntv_stack; + uintptr_t br_ntv_stack_current; + + /* + * If this pid is set, we return it with getpid(). This allows the + * thunking server to interpose on the pid returned to the Linux + * syslog software. + */ + pid_t br_lx_thunk_pid; + + /* + * If strict mode is enabled (via LX_STRICT in the environment), any + * call to lx_unsupported() will set this boolean to B_TRUE. This will + * cause us to drop SIGSYS on the LWP as it attempts to return to + * usermode. + */ + boolean_t br_strict_failure; + + /* + * Hold a pre-allocated lx_pid structure to be used during lx_initlwp. + */ + struct lx_pid *br_lpid; +}; + +/* + * Upper limit on br_args_size, low because this value can persist until + * overridden with another value, and the size is given from userland. + */ +#define LX_BR_ARGS_SIZE_MAX (1024) + +/* brand specific data */ +typedef struct lx_zone_data { + char lxzd_kernel_version[LX_VERS_MAX]; + ksocket_t lxzd_ioctl_sock; +} lx_zone_data_t; + +#define BR_CPU_BOUND 0x0001 + +#define ttolxlwp(t) ((struct lx_lwp_data *)ttolwpbrand(t)) +#define lwptolxlwp(l) ((struct lx_lwp_data *)lwptolwpbrand(l)) +#define ttolxproc(t) \ + (((t)->t_procp->p_brand == &lx_brand) ? \ + (struct lx_proc_data *)(t)->t_procp->p_brand_data : NULL) +#define ptolxproc(p) \ + (((p)->p_brand == &lx_brand) ? \ + (struct lx_proc_data *)(p)->p_brand_data : NULL) +#define ztolxzd(z) \ + (((z)->zone_brand == &lx_brand) ? \ + (lx_zone_data_t *)(z)->zone_brand_data : NULL) + +/* Macro for converting to system call arguments. */ +#define LX_ARGS(scall) ((struct lx_##scall##_args *)\ + (ttolxlwp(curthread)->br_scall_args)) + +/* + * Determine the upper bound on the system call number: + */ +#if defined(_LP64) +#define LX_MAX_SYSCALL(lwp) \ + ((lwp_getdatamodel(lwp) == DATAMODEL_NATIVE) ? \ + lx_nsysent64 : lx_nsysent32) +#else +#define LX_MAX_SYSCALL(lwp) lx_nsysent32 +#endif + +extern char *lx_get_zone_kern_version(zone_t *); + +extern void lx_lwp_set_native_stack_current(lx_lwp_data_t *, uintptr_t); +extern void lx_divert(klwp_t *, uintptr_t); +extern int lx_runexe(klwp_t *, void *); +extern void lx_switch_to_native(klwp_t *); +extern int lx_regs_to_userregs(lx_lwp_data_t *, void *); +extern int lx_uc_to_userregs(lx_lwp_data_t *, void *, void *); +extern int lx_userregs_to_regs(lx_lwp_data_t *lwpd, void *); +extern int lx_userregs_to_uc(lx_lwp_data_t *lwpd, void *, void *); + +extern int lx_syscall_enter(void); +extern int lx_syscall_return(klwp_t *, int, long); + +extern void lx_trace_sysenter(int, uintptr_t *); +extern void lx_trace_sysreturn(int, long); + +extern void lx_emulate_user(klwp_t *, int, uintptr_t *); +#if defined(_SYSCALL32_IMPL) +extern void lx_emulate_user32(klwp_t *, int, uintptr_t *); +#endif + +extern int lx_debug; +#define lx_print if (lx_debug) printf + +extern void lx_pid_assign(kthread_t *, struct lx_pid *); +extern void lx_pid_reassign(kthread_t *); +extern void lx_pid_rele(pid_t, id_t); +extern pid_t lx_lpid_to_spair(pid_t, pid_t *, id_t *); +extern pid_t lx_lwp_ppid(klwp_t *, pid_t *, id_t *); +extern void lx_pid_init(void); +extern void lx_pid_fini(void); + +/* + * In-Kernel Linux System Call Description. 
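+ * + * A hypothetical entry in one of the tables declared below (handler name invented for illustration) pairs the Linux name with its in-kernel emulation handler, flags, and argument count: + * + * { "getpid", lx_getpid, 0, 0 },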
+ */ +typedef struct lx_sysent { + char *sy_name; + long (*sy_callc)(); + char sy_flags; + char sy_narg; +} lx_sysent_t; + +#if defined(_LP64) +extern lx_sysent_t lx_sysent64[LX_NSYSCALLS + 1]; +extern int lx_nsysent64; +#endif +extern lx_sysent_t lx_sysent32[LX_NSYSCALLS + 1]; +extern int lx_nsysent32; + +#endif /* _KERNEL */ +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_BRAND_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_fcntl.h b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h new file mode 100644 index 0000000000..759831ea38 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_fcntl.h @@ -0,0 +1,153 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_LX_FCNTL_H +#define _SYS_LX_FCNTL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Lx open/fcntl flags + */ +#define LX_O_RDONLY 00 +#define LX_O_WRONLY 01 +#define LX_O_RDWR 02 +#define LX_O_CREAT 0100 +#define LX_O_EXCL 0200 +#define LX_O_NOCTTY 0400 +#define LX_O_TRUNC 01000 +#define LX_O_APPEND 02000 +#define LX_O_NONBLOCK 04000 +#define LX_O_NDELAY LX_O_NONBLOCK +#define LX_O_SYNC 010000 +#define LX_O_FSYNC LX_O_SYNC +#define LX_O_ASYNC 020000 +#define LX_O_DIRECT 040000 +#define LX_O_LARGEFILE 0100000 +#define LX_O_DIRECTORY 0200000 +#define LX_O_NOFOLLOW 0400000 +/* lx flag for pipe2 */ +#define LX_O_CLOEXEC 02000000 + +#define LX_F_DUPFD 0 +#define LX_F_GETFD 1 +#define LX_F_SETFD 2 +#define LX_F_GETFL 3 +#define LX_F_SETFL 4 +#define LX_F_GETLK 5 +#define LX_F_SETLK 6 +#define LX_F_SETLKW 7 +#define LX_F_SETOWN 8 +#define LX_F_GETOWN 9 +#define LX_F_SETSIG 10 +#define LX_F_GETSIG 11 + +#define LX_F_GETLK64 12 +#define LX_F_SETLK64 13 +#define LX_F_SETLKW64 14 + +#define LX_F_SETLEASE 1024 +#define LX_F_GETLEASE 1025 +#define LX_F_NOTIFY 1026 +#define LX_F_CANCELLK 1029 +#define LX_F_DUPFD_CLOEXEC 1030 +#define LX_F_SETPIPE_SZ 1031 +#define LX_F_GETPIPE_SZ 1032 + +#define LX_F_RDLCK 0 +#define LX_F_WRLCK 1 +#define LX_F_UNLCK 2 + +/* + * Lx flock codes. + */ +#define LX_NAME_MAX 255 +#define LX_LOCK_SH 1 /* shared */ +#define LX_LOCK_EX 2 /* exclusive */ +#define LX_LOCK_NB 4 /* non-blocking */ +#define LX_LOCK_UN 8 /* unlock */ + +/* + * On Linux the constants AT_REMOVEDIR and AT_EACCESS have the same value. + * AT_REMOVEDIR is used only by unlinkat and AT_EACCESS is used only by + * faccessat. 
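+ *
+ * (Hypothetical illustration: a translator therefore has to key off
+ * the system call rather than the flag value alone, along the lines of
+ *
+ *     flag = is_unlinkat ? AT_REMOVEDIR : AT_EACCESS;
+ *
+ * for an incoming 0x200.)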
+ */
+#define LX_AT_FDCWD (-100)
+#define LX_AT_SYMLINK_NOFOLLOW 0x100
+#define LX_AT_REMOVEDIR 0x200
+#define LX_AT_EACCESS 0x200
+#define LX_AT_SYMLINK_FOLLOW 0x400
+#define LX_AT_NO_AUTOMOUNT 0x800
+#define LX_AT_EMPTY_PATH 0x1000
+
+typedef struct lx_flock {
+ short l_type;
+ short l_whence;
+ long l_start;
+ long l_len;
+ int l_pid;
+} lx_flock_t;
+
+typedef struct lx_flock64 {
+ short l_type;
+ short l_whence;
+ long long l_start;
+ long long l_len;
+ int l_pid;
+} lx_flock64_t;
+
+#if defined(_KERNEL)
+
+/*
+ * 64-bit kernel view of 32-bit usermode structs.
+ */
+#pragma pack(4)
+typedef struct lx_flock32 {
+ int16_t l_type;
+ int16_t l_whence;
+ int32_t l_start;
+ int32_t l_len;
+ int32_t l_pid;
+} lx_flock32_t;
+
+typedef struct lx_flock64_32 {
+ int16_t l_type;
+ int16_t l_whence;
+ int64_t l_start;
+ int64_t l_len;
+ int32_t l_pid;
+} lx_flock64_32_t;
+#pragma pack()
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_LX_FCNTL_H */
diff --git a/usr/src/uts/common/brand/lx/sys/lx_futex.h b/usr/src/uts/common/brand/lx/sys/lx_futex.h
new file mode 100644
index 0000000000..a400b3bd83
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/sys/lx_futex.h
@@ -0,0 +1,121 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_LX_FUTEX_H
+#define _SYS_LX_FUTEX_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FUTEX_WAIT 0
+#define FUTEX_WAKE 1
+#define FUTEX_FD 2
+#define FUTEX_REQUEUE 3
+#define FUTEX_CMP_REQUEUE 4
+#define FUTEX_WAKE_OP 5
+#define FUTEX_LOCK_PI 6
+#define FUTEX_UNLOCK_PI 7
+#define FUTEX_TRYLOCK_PI 8
+#define FUTEX_WAIT_BITSET 9
+#define FUTEX_WAKE_BITSET 10
+#define FUTEX_WAIT_REQUEUE_PI 11
+#define FUTEX_CMP_REQUEUE_PI 12
+#define FUTEX_MAX_CMD FUTEX_CMP_REQUEUE_PI
+
+/*
+ * Flags that can be OR'd into a futex operation.
+ */ +#define FUTEX_CMD_MASK 0x007f +#define FUTEX_PRIVATE_FLAG 0x0080 +#define FUTEX_CLOCK_REALTIME 0x0100 + +#define FUTEX_BITSET_MATCH_ANY 0xffffffff +/* + * FUTEX_WAKE_OP operations + */ +#define FUTEX_OP_SET 0 /* *(int *)UADDR2 = OPARG; */ +#define FUTEX_OP_ADD 1 /* *(int *)UADDR2 += OPARG; */ +#define FUTEX_OP_OR 2 /* *(int *)UADDR2 |= OPARG; */ +#define FUTEX_OP_ANDN 3 /* *(int *)UADDR2 &= ~OPARG; */ +#define FUTEX_OP_XOR 4 /* *(int *)UADDR2 ^= OPARG; */ + +/* + * FUTEX_WAKE_OP comparison operations + */ +#define FUTEX_OP_CMP_EQ 0 /* if (oldval == CMPARG) wake */ +#define FUTEX_OP_CMP_NE 1 /* if (oldval != CMPARG) wake */ +#define FUTEX_OP_CMP_LT 2 /* if (oldval < CMPARG) wake */ +#define FUTEX_OP_CMP_LE 3 /* if (oldval <= CMPARG) wake */ +#define FUTEX_OP_CMP_GT 4 /* if (oldval > CMPARG) wake */ +#define FUTEX_OP_CMP_GE 5 /* if (oldval >= CMPARG) wake */ + +/* + * The encoding of the FUTEX_WAKE_OP operation in 32 bits: + * + * +--+-- - --+-- - --+-- - --+-- - --+ + * |S |OP |CMP |OPARG |CMPARG | + * +--+-- - --+-- - --+-- - --+-- - --+ + * |31|30 - 28|27 - 24|23 - 12|11 - 0| + * + * The S bit denotes that the OPARG should be (1 << OPARG) instead of OPARG. + * (Yes, this whole thing is entirely absurd -- see the block comment in + * lx_futex.c for an explanation of this nonsense.) Macros to extract the + * various components from the operation, given the above encoding: + */ +#define FUTEX_OP_OP(x) (((x) >> 28) & 7) +#define FUTEX_OP_CMP(x) (((x) >> 24) & 15) +#define FUTEX_OP_OPARG(x) (((x) >> 31) ? (1 << (((x) << 8) >> 20)) : \ + ((((x) << 8) >> 20))) +#define FUTEX_OP_CMPARG(x) (((x) << 20) >> 20) + +#ifdef _KERNEL + +#define FUTEX_WAITERS 0x80000000 +#define FUTEX_OWNER_DIED 0x40000000 +#define FUTEX_TID_MASK 0x3fffffff + +#define FUTEX_ROBUST_LOCK_PI 1 +#define FUTEX_ROBUST_LIST_LIMIT 2048 + +extern long lx_futex(uintptr_t addr, int cmd, int val, uintptr_t lx_timeout, + uintptr_t addr2, int val2); +extern void lx_futex_init(void); +extern int lx_futex_fini(void); +extern long lx_set_robust_list(void *listp, size_t len); +extern long lx_get_robust_list(pid_t pid, void **listp, size_t *lenp); +extern void lx_futex_robust_exit(uintptr_t addr, uint32_t tid); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_FUTEX_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_impl.h b/usr/src/uts/common/brand/lx/sys/lx_impl.h new file mode 100644 index 0000000000..03b9d43038 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_impl.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _LX_IMPL_H +#define _LX_IMPL_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (lx_systrace_f)(ulong_t, ulong_t, ulong_t, ulong_t, ulong_t, + ulong_t, ulong_t); + + +extern lx_systrace_f *lx_systrace_entry_ptr; +extern lx_systrace_f *lx_systrace_return_ptr; + +extern void lx_brand_systrace_enable(void); +extern void lx_brand_systrace_disable(void); + +extern void lx_unsupported(char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_IMPL_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_ldt.h b/usr/src/uts/common/brand/lx/sys/lx_ldt.h new file mode 100644 index 0000000000..825933e86c --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_ldt.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LINUX_LDT_H +#define _SYS_LINUX_LDT_H + +#include <sys/segments.h> + +#ifdef __cplusplus +extern "C" { +#endif + +struct ldt_info { + uint_t entry_number; + uint_t base_addr; + uint_t limit; + uint_t seg_32bit:1, + contents:2, + read_exec_only:1, + limit_in_pages:1, + seg_not_present:1, + useable:1; +}; + +#define LDT_INFO_EMPTY(info) \ + ((info)->base_addr == 0 && (info)->limit == 0 && \ + (info)->contents == 0 && (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && (info)->useable == 0) + +#if defined(__amd64) +#define SETMODE(desc) (desc)->usd_long = SDP_SHORT; +#else +#define SETMODE(desc) +#endif + +#define LDT_INFO_TO_DESC(info, desc) { \ + USEGD_SETBASE(desc, (info)->base_addr); \ + USEGD_SETLIMIT(desc, (info)->limit); \ + (desc)->usd_type = ((info)->contents << 2) | \ + ((info)->read_exec_only ^ 1) << 1 | 0x10; \ + (desc)->usd_dpl = SEL_UPL; \ + (desc)->usd_p = (info)->seg_not_present ^ 1; \ + (desc)->usd_def32 = (info)->seg_32bit; \ + (desc)->usd_gran = (info)->limit_in_pages; \ + (desc)->usd_avl = (info)->useable; \ + SETMODE(desc); \ +} + +#define DESC_TO_LDT_INFO(desc, info) { \ + bzero((info), sizeof (*(info))); \ + (info)->base_addr = USEGD_GETBASE(desc); \ + (info)->limit = USEGD_GETLIMIT(desc); \ + (info)->seg_not_present = (desc)->usd_p ^ 1; \ + (info)->contents = ((desc)->usd_type >> 2) & 3; \ + (info)->read_exec_only = (((desc)->usd_type >> 1) & 1) ^ 1; \ + (info)->seg_32bit = (desc)->usd_def32; \ + (info)->limit_in_pages = (desc)->usd_gran; \ + (info)->useable = (desc)->usd_avl; \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_LDT_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_misc.h b/usr/src/uts/common/brand/lx/sys/lx_misc.h new file mode 100644 index 
0000000000..a3df150eef --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_misc.h @@ -0,0 +1,92 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS__LX_MISC_H +#define _SYS__LX_MISC_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/lx_brand.h> + +#ifdef _KERNEL + +extern void lx_setrval(klwp_t *, int, int); +extern void lx_exec(); +extern void lx_exitlwp(klwp_t *); +extern void lx_freelwp(klwp_t *); +extern void *lx_lwpdata_alloc(proc_t *); +extern void lx_lwpdata_free(void *); +extern void lx_initlwp(klwp_t *, void *); +extern void lx_forklwp(klwp_t *, klwp_t *); + +extern void lx_set_gdt(int, user_desc_t *); +extern void lx_clear_gdt(int); + +extern longlong_t lx_nosys(); + +extern greg_t lx_fixsegreg(greg_t, model_t); +extern uintptr_t lx_fsbase(klwp_t *, uintptr_t); +extern void lx_exit_with_sig(proc_t *, sigqueue_t *); +extern boolean_t lx_wait_filter(proc_t *, proc_t *); + +typedef enum lx_if_action { + LX_IF_FROMNATIVE, + LX_IF_TONATIVE +} lx_if_action_t; + +/* Linux ARP protocol hardware identifiers */ +#define LX_ARPHRD_ETHER 1 /* Ethernet */ +#define LX_ARPHRD_LOOPBACK 772 /* Loopback */ +#define LX_ARPHRD_VOID 0xffff /* Unknown */ + +extern void lx_ifname_convert(char *, lx_if_action_t); +extern void lx_ifflags_convert(uint64_t *, lx_if_action_t); +extern void lx_stol_hwaddr(const struct sockaddr_dl *, struct sockaddr *, + int *); + +extern boolean_t lx_ptrace_stop(ushort_t); +extern void lx_stop_notify(proc_t *, klwp_t *, ushort_t, ushort_t); +extern void lx_ptrace_init(void); +extern void lx_ptrace_fini(void); +extern int lx_ptrace_kernel(int, pid_t, uintptr_t, uintptr_t); +extern int lx_waitid_helper(idtype_t, id_t, k_siginfo_t *, int, boolean_t *, + int *); +extern void lx_ptrace_exit(proc_t *, klwp_t *); +extern void lx_ptrace_inherit_tracer(lx_lwp_data_t *, lx_lwp_data_t *); +extern int lx_ptrace_stop_for_option(int, boolean_t, ulong_t, uintptr_t); +extern int lx_ptrace_set_clone_inherit(int, boolean_t); +extern int lx_sigcld_repost(proc_t *, sigqueue_t *); +extern int lx_ptrace_issig_stop(proc_t *, klwp_t *); +extern boolean_t lx_ptrace_sig_ignorable(proc_t *, int); + +extern int lx_helper_clone(int64_t *, int, void *, void *, void *); +extern int lx_helper_setgroups(int, gid_t *); +extern int lx_helper_rt_sigqueueinfo(pid_t, int, siginfo_t *); +extern int lx_helper_rt_tgsigqueueinfo(pid_t, pid_t, int, siginfo_t *); + +extern boolean_t lx_vsyscall_iscall(klwp_t *, uintptr_t, int *); +extern void lx_vsyscall_enter(proc_t *, klwp_t *, int); + +extern void lx_check_strict_failure(lx_lwp_data_t *); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS__LX_MISC_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_ptm.h b/usr/src/uts/common/brand/lx/sys/lx_ptm.h new file mode 100644 index 0000000000..74bbc939a3 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_ptm.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PTM_LINUX_H +#define _SYS_PTM_LINUX_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LX_PTM_DRV "lx_ptm" +#define LX_PTM_MINOR_NODE "lx_ptmajor" + +#define LX_PTM_DEV_TO_PTS(dev) (getminor(dev) - 1) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PTM_LINUX_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_sched.h b/usr/src/uts/common/brand/lx/sys/lx_sched.h new file mode 100644 index 0000000000..b0ae748f3c --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_sched.h @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LINUX_SCHED_H +#define _SYS_LINUX_SCHED_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/procset.h> +#include <sys/priocntl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Linux scheduler policies. + */ +#define LX_SCHED_OTHER 0 +#define LX_SCHED_FIFO 1 +#define LX_SCHED_RR 2 + +#define LX_PRI_MAX 99 + +typedef int l_pid_t; + +struct lx_sched_param { + int lx_sched_prio; +}; + +extern int sched_setprocset(procset_t *, l_pid_t); +extern long do_priocntlsys(int, procset_t *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_SCHED_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_siginfo.h b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h new file mode 100644 index 0000000000..9f606b614f --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_siginfo.h @@ -0,0 +1,190 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _LX_SIGINFO_H +#define _LX_SIGINFO_H + +#include <sys/lx_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lx_siginfo_t lsi_code values + * + * LX_SI_ASYNCNL: Sent by asynch name lookup completion + * LX_SI_DETHREAD: Sent by execve() killing subsidiary threads + * LX_SI_SIGIO: Sent by queued SIGIO + * LX_SI_ASYNCIO: Sent by asynchronous I/O completion + * LX_SI_MESGQ: Sent by real time message queue state change + * LX_SI_TIMER: Sent by timer expiration + * LX_SI_QUEUE: Sent by sigqueue + * LX_SI_USER: Sent by kill, sigsend, raise, etc. + * LX_SI_KERNEL: Sent by kernel + * LX_SI_CODE_NOT_EXIST: Error code. When translating from Linux to + * illumos errors, if there is no translation available, this value + * should be used. This value should have no meaning as an si_code in + * illumos or Linux. + * + * At present, LX_SI_ASYNCNL, LX_SI_DETHREAD, and LX_SI_SIGIO are unused by + * BrandZ. + */ +#define LX_SI_CODE_NOT_EXIST (-61) +#define LX_SI_ASYNCNL (-60) +#define LX_SI_DETHREAD (-7) +#define LX_SI_TKILL (-6) +#define LX_SI_SIGIO (-5) +#define LX_SI_ASYNCIO (-4) +#define LX_SI_MESGQ (-3) +#define LX_SI_TIMER (-2) +#define LX_SI_QUEUE (-1) +#define LX_SI_USER (0) +#define LX_SI_KERNEL (0x80) + +#define LX_SI_MAX_SIZE 128 +#define LX_SI_PAD_SIZE_32 ((LX_SI_MAX_SIZE / sizeof (int)) - 3) +#define LX_SI_PAD_SIZE_64 ((LX_SI_MAX_SIZE / sizeof (int)) - 4) + +#if defined(_LP64) +/* + * Because of the odd number (3) of ints before the union, we need to account + * for the smaller padding needed on x64 due to the union being offset to an 8 + * byte boundary. + */ +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_64 +#else +#define LX_SI_PAD_SIZE LX_SI_PAD_SIZE_32 +#endif + +typedef struct lx_siginfo { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE]; + + struct { + pid_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid_t _pid; + lx_uid16_t _uid; + union sigval _sigval; + } _rt; + + struct { + pid_t _pid; + lx_uid16_t _uid; + int _status; + clock_t _utime; + clock_t _stime; + } _sigchld; + + struct { + void *_addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo_t; + +#if defined(_KERNEL) && defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit "lx_siginfo_t" object. 
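+ *
+ * (Both views are intended to occupy exactly LX_SI_MAX_SIZE (128)
+ * bytes. As an illustrative guard -- not present in this change --
+ * one could add
+ *
+ *     CTASSERT(sizeof (lx_siginfo_t) == LX_SI_MAX_SIZE);
+ *
+ * using the compile-time assertion from <sys/debug.h>.)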
+ */ +#pragma pack(4) +typedef struct lx_siginfo32 { + int lsi_signo; + int lsi_errno; + int lsi_code; + union { + int _pad[LX_SI_PAD_SIZE_32]; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + } _kill; + + struct { + uint_t _timer1; + uint_t _timer2; + } _timer; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + union sigval32 _sigval; + } _rt; + + struct { + pid32_t _pid; + lx_uid16_t _uid; + int _status; + clock32_t _utime; + clock32_t _stime; + } _sigchld; + + struct { + caddr32_t _addr; + } _sigfault; + + struct { + int _band; + int _fd; + } _sigpoll; + } _sifields; +} lx_siginfo32_t; +#pragma pack() +#endif /* defined(_KERNEL) && defined(_SYSCALL32_IMPL) */ + +#define lsi_pid _sifields._kill._pid +#define lsi_uid _sifields._kill._uid +#define lsi_status _sifields._sigchld._status +#define lsi_utime _sifields._sigchld._utime +#define lsi_stime _sifields._sigchld._stime +#define lsi_value _sifields._rt._sigval +#define lsi_int _sifields._rt._sigval.sivalx_int +#define lsi_ptr _sifields._rt._sigval.sivalx_ptr +#define lsi_addr _sifields._sigfault._addr +#define lsi_band _sifields._sigpoll._band +#define lsi_fd _sifields._sigpoll._fd + +#ifdef __cplusplus +} +#endif + +#endif /* _LX_SIGINFO_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_syscalls.h b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h new file mode 100644 index 0000000000..0f2977c47c --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_syscalls.h @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_LINUX_SYSCALLS_H +#define _SYS_LINUX_SYSCALLS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +extern long lx_arch_prctl(); +extern long lx_brk(); +extern long lx_chmod(); +extern long lx_chown(); +extern long lx_chown16(); +extern long lx_clock_getres(); +extern long lx_clock_gettime(); +extern long lx_clock_settime(); +extern long lx_fchmod(); +extern long lx_fchmodat(); +extern long lx_fchown(); +extern long lx_fchown16(); +extern long lx_fchownat(); +extern long lx_fcntl(); +extern long lx_fcntl64(); +extern long lx_futex(); +extern long lx_get_robust_list(); +extern long lx_get_thread_area(); +extern long lx_getcpu(); +extern long lx_getdents_32(); +extern long lx_getdents_64(); +extern long lx_getdents64(); +extern long lx_getpid(); +extern long lx_getppid(); +extern long lx_getrandom(); +extern long lx_getrlimit(); +extern long lx_gettid(); +extern long lx_gettimeofday(); +extern long lx_ioctl(); +extern long lx_kill(); +extern long lx_lchown(); +extern long lx_lchown16(); +extern long lx_mkdir(); +extern long lx_mkdirat(); +extern long lx_modify_ldt(); +extern long lx_oldgetrlimit(); +extern long lx_open(); +extern long lx_openat(); +extern long lx_pipe(); +extern long lx_pipe2(); +extern long lx_prlimit64(); +extern long lx_read(); +extern long lx_sched_getparam(); +extern long lx_sched_getscheduler(); +extern long lx_sched_rr_get_interval(); +extern long lx_sched_setparam(); +extern long lx_sched_setscheduler(); +extern long lx_sched_yield(); +extern long lx_set_robust_list(); +extern long lx_set_thread_area(); +extern long lx_set_tid_address(); +extern long lx_setresgid(); +extern long lx_setresgid16(); +extern long lx_setresuid(); +extern long lx_setresuid16(); +extern long lx_setrlimit(); +extern long lx_sysinfo32(); +extern long lx_sysinfo64(); +extern long lx_tgkill(); +extern long lx_time(); +extern long lx_tkill(); +extern long lx_wait4(); +extern long lx_waitid(); +extern long lx_waitpid(); +extern long lx_write(); +extern long lx_xattr(); + +#if defined(_LP64) +/* + * Linux vsyscall addresses: + */ +#define LX_VSYS_gettimeofday (uintptr_t)0xffffffffff600000 +#define LX_VSYS_time (uintptr_t)0xffffffffff600400 +#define LX_VSYS_getcpu (uintptr_t)0xffffffffff600800 + +#define LX_VSYSCALL_ADDR (uintptr_t)0xffffffffff600000 +#define LX_VSYSCALL_SIZE (uintptr_t)0x1000 +/* + * System call numbers for vsyscall revectoring: + */ +#define LX_SYS_gettimeofday 96 +#define LX_SYS_time 201 +#define LX_SYS_getcpu 309 +#endif + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LINUX_SYSCALLS_H */ diff --git a/usr/src/uts/common/brand/lx/sys/lx_types.h b/usr/src/uts/common/brand/lx/sys/lx_types.h new file mode 100644 index 0000000000..922c412020 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sys/lx_types.h @@ -0,0 +1,115 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LX_TYPES_H +#define _SYS_LX_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL + +#define SHRT_MIN (-32768) /* min value of a "short int" */ +#define SHRT_MAX 32767 /* max value of a "short int" */ +#define USHRT_MAX 65535 /* max of "unsigned short int" */ +#define INT_MIN (-2147483647-1) /* min value of an "int" */ +#define INT_MAX 2147483647 /* max value of an "int" */ +#define UINT_MAX 4294967295U /* max value of an "unsigned int" */ + +#if defined(_LP64) +#define LONG_MAX 9223372036854775807L +#define ULONG_MAX 18446744073709551615UL +#else +#define LONG_MAX 2147483647L /* max value of a 32-bit "long int" */ +#define ULONG_MAX 4294967295UL /* max value of a 32-bit "ulong int" */ +#endif + +#endif /* !_KERNEL */ + +#define LX_SYS_UTS_LN 65 + +struct lx_utsname { + char sysname[LX_SYS_UTS_LN]; + char nodename[LX_SYS_UTS_LN]; + char release[LX_SYS_UTS_LN]; + char version[LX_SYS_UTS_LN]; + char machine[LX_SYS_UTS_LN]; + char domainname[LX_SYS_UTS_LN]; +}; + +typedef uint64_t lx_dev_t; +typedef uint16_t lx_dev16_t; +typedef uint32_t lx_ino_t; +typedef uint64_t lx_ino64_t; +typedef uint32_t lx_uid_t; +typedef uint16_t lx_uid16_t; +typedef uint32_t lx_gid_t; +typedef uint16_t lx_gid16_t; +typedef uint32_t lx_off_t; +typedef uint64_t lx_off64_t; +typedef uint32_t lx_blksize_t; +typedef uint32_t lx_blkcnt_t; +typedef uint64_t lx_blkcnt64_t; +typedef uint32_t lx_mode_t; +typedef uint16_t lx_mode16_t; + +#define LX_UID16_TO_UID32(uid16) \ + (((uid16) == (lx_uid16_t)-1) ? ((lx_uid_t)-1) : (lx_uid_t)(uid16)) + +#define LX_GID16_TO_GID32(gid16) \ + (((gid16) == (lx_gid16_t)-1) ? ((lx_gid_t)-1) : (lx_gid_t)(gid16)) + +/* Overflow values default to NFS nobody. */ + +#define UID16_OVERFLOW ((lx_uid16_t)65534) +#define GID16_OVERFLOW ((lx_gid16_t)65534) + +/* + * All IDs with high word non-zero are converted to default overflow values to + * avoid inadvertent truncation to zero (root) (!). + */ +#define LX_UID32_TO_UID16(uid32) \ + ((((uid32) & 0xffff0000) == 0) ? ((lx_uid16_t)(uid32)) : \ + (((uid32) == ((lx_uid_t)-1)) ? ((lx_uid16_t)-1) : UID16_OVERFLOW)) + +#define LX_GID32_TO_GID16(gid32) \ + ((((gid32) & 0xffff0000) == 0) ? ((lx_gid16_t)(gid32)) : \ + (((gid32) == ((lx_gid_t)-1)) ? ((lx_gid16_t)-1) : GID16_OVERFLOW)) + +struct lx_timespec { + time_t ts_sec; + long ts_nsec; +}; + +#define LX_32TO64(lo, hi) \ + ((uint64_t)((uint64_t)(lo) | ((uint64_t)(hi) << 32))) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LX_TYPES_H */ diff --git a/usr/src/uts/common/brand/lx/syscall/lx_brk.c b/usr/src/uts/common/brand/lx/syscall/lx_brk.c new file mode 100644 index 0000000000..19a7577ac0 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_brk.c @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+
+/*
+ * The brk() system call needs to be in-kernel because Linux expects a call to
+ * brk(0) to return the current breakpoint. In Solaris, the process breakpoint
+ * is set up and managed by libc. Due to the way we link our libraries and the
+ * need for Linux to manage its own breakpoint, this has to remain in the
+ * kernel.
+ */
+extern int brk(caddr_t);
+
+long
+lx_brk(caddr_t nva)
+{
+ proc_t *p = curproc;
+ klwp_t *lwp = ttolwp(curthread);
+
+ if (nva != 0) {
+ (void) brk(nva);
+
+ /*
+ * Despite claims to the contrary in the manpage, when Linux
+ * brk() fails, errno is left unchanged.
+ */
+ lwp->lwp_errno = 0;
+ }
+ return ((long)(p->p_brkbase + p->p_brksize));
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chmod.c b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c
new file mode 100644
index 0000000000..66a4e35a63
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_chmod.c
@@ -0,0 +1,70 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/fcntl.h>
+#include <sys/thread.h>
+#include <sys/klwp.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_fcntl.h>
+
+/*
+ * From "uts/common/syscall/chmod.c":
+ */
+extern int fchmodat(int, char *, int, int);
+
+static long
+lx_fchmodat_wrapper(int fd, char *path, int mode, int flag)
+{
+ long rval;
+
+ if (fd == LX_AT_FDCWD) {
+ fd = AT_FDCWD;
+ }
+
+ if ((rval = fchmodat(fd, path, mode, flag)) != 0) {
+ lx_proc_data_t *pd = ttolxproc(curthread);
+ klwp_t *lwp = ttolwp(curthread);
+
+ /*
+ * If the process is in "install mode", return success
+ * if the operation failed due to an absent file.
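+ * (Hypothetical illustration: a Linux package post-install
+ * script that runs "chmod 644 /etc/foo.conf" before that file
+ * has been created would otherwise abort the whole install
+ * on ENOENT.)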
+ */ + if ((pd->l_flags & LX_PROC_INSTALL_MODE) && + lwp->lwp_errno == ENOENT) { + lwp->lwp_errno = 0; + return (0); + } + } + + return (rval); +} + +long +lx_fchmodat(int fd, char *path, int mode) +{ + return (lx_fchmodat_wrapper(fd, path, mode, 0)); +} + +long +lx_fchmod(int fd, int mode) +{ + return (lx_fchmodat_wrapper(fd, NULL, mode, 0)); +} + +long +lx_chmod(char *path, int mode) +{ + return (lx_fchmodat_wrapper(AT_FDCWD, path, mode, 0)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_chown.c b/usr/src/uts/common/brand/lx/syscall/lx_chown.c new file mode 100644 index 0000000000..c0326ee619 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_chown.c @@ -0,0 +1,137 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_types.h> + +/* + * From "uts/common/syscall/chown.c": + */ +extern int fchownat(int, char *, uid_t, gid_t, int); + +long +lx_fchownat_wrapper(int fd, char *path, uid_t uid, gid_t gid, int native_flag) +{ + long rval; + + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + + if ((rval = fchownat(fd, path, uid, gid, native_flag)) != 0) { + lx_proc_data_t *pd = ttolxproc(curthread); + klwp_t *lwp = ttolwp(curthread); + + /* + * If the process is in "install mode", return success + * if the operation failed due to an absent file. + */ + if ((pd->l_flags & LX_PROC_INSTALL_MODE) && + lwp->lwp_errno == ENOENT) { + lwp->lwp_errno = 0; + return (0); + } + } + + return (rval); +} + +long +lx_fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag) +{ + char c; + int native_flag = 0; + + if (copyin(path, &c, sizeof (c)) != 0) { + return (set_errno(EFAULT)); + } + + if (flag & LX_AT_EMPTY_PATH) { + /* + * According to fchownat(2), when AT_EMPTY_PATH is set: "if + * path is an empty string, operate on the file referred to by + * fd". We pass NULL in place of the empty string, which + * causes fchownat() to operate on the fd we passed without an + * additional lookup. + */ + if (c == '\0') { + path = NULL; + } + + flag &= ~LX_AT_EMPTY_PATH; + } else { + /* + * Otherwise, a file with no filename obviously cannot be + * present in the directory. 
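+ * (Linux itself fails an empty-path lookup made without
+ * AT_EMPTY_PATH with ENOENT as well; see path_resolution(7).)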
+ */ + if (c == '\0') { + return (set_errno(ENOENT)); + } + } + + if (flag & LX_AT_SYMLINK_NOFOLLOW) { + flag &= ~LX_AT_SYMLINK_NOFOLLOW; + native_flag |= AT_SYMLINK_NOFOLLOW; + } + + if (flag != 0) { + return (set_errno(EINVAL)); + } + + return (lx_fchownat_wrapper(fd, path, uid, gid, native_flag)); +} + +long +lx_fchown(int fd, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(fd, NULL, uid, gid, 0)); +} + +long +lx_lchown(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, + AT_SYMLINK_NOFOLLOW)); +} + +long +lx_chown(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, uid, gid, 0)); +} + +long +lx_fchown16(int fd, lx_uid16_t uid, lx_gid16_t gid) +{ + return (lx_fchownat_wrapper(fd, NULL, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid), 0)); +} + +long +lx_lchown16(char *path, uid_t uid, gid_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid), AT_SYMLINK_NOFOLLOW)); +} + +long +lx_chown16(char *path, lx_uid16_t uid, lx_gid16_t gid) +{ + return (lx_fchownat_wrapper(AT_FDCWD, path, LX_UID16_TO_UID32(uid), + LX_GID16_TO_GID32(gid), 0)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_clone.c b/usr/src/uts/common/brand/lx/syscall/lx_clone.c new file mode 100644 index 0000000000..50cdeaeab9 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_clone.c @@ -0,0 +1,143 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_ldt.h> +#include <sys/lx_misc.h> +#include <lx_signum.h> +#include <lx_syscall.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> + +/* + * Our lwp has already been created at this point, so this routine is + * responsible for setting up all the state needed to track this as a + * linux cloned thread. 
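+ *
+ * For reference, glibc's pthread_create() passes a clone() flag set
+ * along the lines of
+ *
+ *     CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
+ *     CLONE_THREAD | CLONE_SYSVSEM | CLONE_SETTLS |
+ *     CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID
+ *
+ * so the LX_CLONE_SETTLS and ctid/ptid paths below are the common
+ * case for threaded Linux processes.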
+ */ +/* ARGSUSED */ +int +lx_helper_clone(int64_t *rval, int flags, void *ptidp, void *tls, void *ctidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + struct lx_proc_data *lproc = ttolxproc(curthread); + struct ldt_info info; + struct user_desc descr; + int tls_index; + int entry = -1; + int signo; + + signo = flags & LX_CSIGNAL; + if (signo < 0 || signo > LX_NSIG) + return (set_errno(EINVAL)); + + if (!(flags & LX_CLONE_THREAD)) { + lproc->l_signal = signo; + } else { + if (flags & LX_CLONE_SETTLS) { + if (get_udatamodel() == DATAMODEL_ILP32) { + if (copyin((caddr_t)tls, &info, sizeof (info))) + return (set_errno(EFAULT)); + + if (LDT_INFO_EMPTY(&info)) + return (set_errno(EINVAL)); + + entry = info.entry_number; + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + tls_index = entry - GDT_TLSMIN; + + /* + * Convert the user-space structure into a real + * x86 descriptor and copy it into this LWP's + * TLS array. We also load it into the GDT. + */ + LDT_INFO_TO_DESC(&info, &descr); + bcopy(&descr, &lwpd->br_tls[tls_index], + sizeof (descr)); + lx_set_gdt(entry, &lwpd->br_tls[tls_index]); + } else { + /* + * Set the Linux %fsbase for this LWP. We will + * restore it the next time we return to Linux + * via setcontext()/lx_restorecontext(). + */ + lwpd->br_lx_fsbase = (uintptr_t)tls; + } + } + + lwpd->br_clear_ctidp = + (flags & LX_CLONE_CHILD_CLEARTID) ? ctidp : NULL; + + if (signo && ! (flags & LX_CLONE_DETACH)) + lwpd->br_signal = signo; + else + lwpd->br_signal = 0; + + if (flags & LX_CLONE_THREAD) + lwpd->br_tgid = curthread->t_procp->p_pid; + + if (flags & LX_CLONE_PARENT) + lwpd->br_ppid = 0; + + if ((flags & LX_CLONE_CHILD_SETTID) && (ctidp != NULL) && + (suword32(ctidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + if ((flags & LX_CLONE_PARENT_SETTID) && (ptidp != NULL) && + (suword32(ptidp, lwpd->br_pid) != 0)) { + if (entry >= 0) + lx_clear_gdt(entry); + return (set_errno(EFAULT)); + } + } + + *rval = lwpd->br_pid; + return (0); +} + +long +lx_set_tid_address(int *tidp) +{ + struct lx_lwp_data *lwpd = ttolxlwp(curthread); + long rv; + + lwpd->br_clear_ctidp = tidp; + + if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) { + rv = 1; + } else { + rv = lwpd->br_pid; + } + + return (rv); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_cpu.c b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c new file mode 100644 index 0000000000..ec8b7576d8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_cpu.c @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> + +/* + * We support neither the second argument (NUMA node), nor the third (obsolete + * pre-2.6.24 caching functionality which was ultimately broken). 
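+ *
+ * (A Linux program typically reaches this without a library wrapper,
+ * e.g.:
+ *
+ *     unsigned int cpu;
+ *     (void) syscall(SYS_getcpu, &cpu, NULL, NULL);
+ *
+ * so only the first argument needs to be honored here.)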
+ */ +long +lx_getcpu(unsigned int *cpu, uintptr_t p2, uintptr_t p3) +{ + unsigned int curcpu = curthread->t_cpu->cpu_id; + + if (copyout(&curcpu, cpu, sizeof (curcpu)) != 0) + return (set_errno(EFAULT)); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c new file mode 100644 index 0000000000..d3615c704f --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_fcntl.c @@ -0,0 +1,520 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/filio.h> +#include <sys/fcntl.h> +#include <sys/stat.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> +#include <sys/lx_fcntl.h> +#include <sys/lx_misc.h> + +extern int fcntl(int, int, intptr_t); +extern int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); + +#define LTOS_FLOCK(l, s) \ +{ \ + s->l_type = ltos_type(l->l_type); \ + s->l_whence = l->l_whence; \ + s->l_start = l->l_start; \ + s->l_len = l->l_len; \ + s->l_sysid = 0; /* not defined in linux */ \ + s->l_pid = (pid_t)l->l_pid; \ +} + +#define STOL_FLOCK(s, l) \ +{ \ + l->l_type = stol_type(s->l_type); \ + l->l_whence = s->l_whence; \ + l->l_start = s->l_start; \ + l->l_len = s->l_len; \ + l->l_pid = (int)s->l_pid; \ +} + +static short +ltos_type(short l_type) +{ + switch (l_type) { + case LX_F_RDLCK: + return (F_RDLCK); + case LX_F_WRLCK: + return (F_WRLCK); + case LX_F_UNLCK: + return (F_UNLCK); + default: + return (-1); + } +} + +static short +stol_type(short l_type) +{ + switch (l_type) { + case F_RDLCK: + return (LX_F_RDLCK); + case F_WRLCK: + return (LX_F_WRLCK); + case F_UNLCK: + return (LX_F_UNLCK); + default: + /* can't ever happen */ + return (0); + } +} + +static void +ltos_flock(struct lx_flock *l, struct flock64 *s) +{ + LTOS_FLOCK(l, s) +} + +static void +stol_flock(struct flock64 *s, struct lx_flock *l) +{ + STOL_FLOCK(s, l) +} + +static void +ltos_flock64(struct lx_flock64_32 *l, struct flock64 *s) +{ + LTOS_FLOCK(l, s) +} + +static void +stol_flock64(struct flock64 *s, struct lx_flock64_32 *l) +{ + STOL_FLOCK(s, l) +} + +static int +lx_fcntl_getfl(int fd) +{ + int retval; + int rc; + + retval = fcntl(fd, F_GETFL, 0); + if (ttolwp(curthread)->lwp_errno != 0) + return (ttolwp(curthread)->lwp_errno); + + if ((retval & O_ACCMODE) == O_RDONLY) + rc = LX_O_RDONLY; + else if ((retval & O_ACCMODE) == O_WRONLY) + rc = LX_O_WRONLY; + else + rc = LX_O_RDWR; + /* O_NDELAY != O_NONBLOCK, so we need to check for both */ + if (retval & O_NDELAY) + rc |= LX_O_NDELAY; + if (retval & O_NONBLOCK) + rc |= LX_O_NONBLOCK; + if (retval & O_APPEND) + rc |= LX_O_APPEND; + if (retval & O_SYNC) + rc |= LX_O_SYNC; + if (retval & O_LARGEFILE) + rc |= LX_O_LARGEFILE; + if (retval & FASYNC) + rc |= LX_O_ASYNC; + + return (rc); +} + +static int +lx_fcntl_setfl(int fd, ulong_t arg) +{ + int new_arg; + + new_arg = 0; + /* LX_O_NDELAY == LX_O_NONBLOCK, so we only check for one */ + if (arg & LX_O_NDELAY) + new_arg |= O_NONBLOCK; + if (arg & LX_O_APPEND) + new_arg |= O_APPEND; + if (arg & LX_O_SYNC) + 
new_arg |= O_SYNC; + if (arg & LX_O_LARGEFILE) + new_arg |= O_LARGEFILE; + if (arg & LX_O_ASYNC) + new_arg |= FASYNC; + + return (fcntl(fd, F_SETFL, new_arg)); +} + +static int +lx_fcntl_common(int fd, int cmd, ulong_t arg) +{ + int rc = 0; + pid_t pid; + int error; + int rv; + int32_t flag; + file_t *fp; + + /* + * We depend on the call to fcntl to set the errno if necessary. + */ + ttolwp(curthread)->lwp_errno = 0; + + switch (cmd) { + case LX_F_SETSIG: + case LX_F_GETSIG: + case LX_F_SETLEASE: + case LX_F_GETLEASE: + case LX_F_NOTIFY: + case LX_F_CANCELLK: + case LX_F_SETPIPE_SZ: + case LX_F_GETPIPE_SZ: + { + char buf[80]; + + (void) snprintf(buf, sizeof (buf), + "unsupported fcntl command: %d", cmd); + lx_unsupported(buf); + } + return (set_errno(ENOTSUP)); + + case LX_F_DUPFD: + rc = fcntl(fd, F_DUPFD, arg); + break; + + case LX_F_DUPFD_CLOEXEC: + rc = fcntl(fd, F_DUPFD_CLOEXEC, arg); + break; + + case LX_F_GETFD: + rc = fcntl(fd, F_GETFD, 0); + break; + + case LX_F_SETFD: + rc = fcntl(fd, F_SETFD, arg); + break; + + case LX_F_GETFL: + rc = lx_fcntl_getfl(fd); + break; + + case LX_F_SETFL: + rc = lx_fcntl_setfl(fd, arg); + break; + + case LX_F_SETOWN: + pid = (pid_t)arg; + if (pid == 1) { + /* Setown for the init process uses the real pid. */ + pid = curzone->zone_proc_initpid; + } + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + rv = 0; + + flag = fp->f_flag | get_udatamodel() | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, FIOSETOWN, (intptr_t)&pid, + flag, CRED(), &rv, NULL); + releasef(fd); + if (error != 0) + return (set_errno(error)); + + rc = 0; + break; + + case LX_F_GETOWN: + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + rv = 0; + + flag = fp->f_flag | get_udatamodel() | FKIOCTL; + error = VOP_IOCTL(fp->f_vnode, FIOGETOWN, (intptr_t)&pid, + flag, CRED(), &rv, NULL); + releasef(fd); + if (error != 0) + return (set_errno(error)); + + if (pid == curzone->zone_proc_initpid) { + /* Getown for the init process returns 1. */ + pid = 1; + } + + rc = pid; + break; + + default: + return (set_errno(EINVAL)); + } + + return (rc); +} + +static int +lx_fcntl_lock_cmd_to_s(int lx_cmd) +{ + switch (lx_cmd) { + case LX_F_GETLK: + return (F_GETLK); + case LX_F_SETLK: + return (F_SETLK); + case LX_F_SETLKW: + return (F_SETLKW); + case LX_F_GETLK64: + return (F_GETLK64); + case LX_F_SETLK64: + return (F_SETLK64); + case LX_F_SETLKW64: + return (F_SETLKW64); + default: + VERIFY(0); + /*NOTREACHED*/ + return (0); + } +} + +/* + * This is a pain but we can't re-use the fcntl code for locking since it does + * its own copyin/copyout for the flock struct. Since we have to convert the + * struct we have to do our own copyin/out. Thus we replicate the fcntl code for + * these 3 cmds. Luckily it's not much. 
+ */ +static int +lx_fcntl_lock(int fd, int lx_cmd, void *arg) +{ + int cmd; + int error = 0; + file_t *fp; + vnode_t *vp; + int flag; + offset_t maxoffset; + u_offset_t offset; + model_t datamodel; + lx_flock_t lxflk; + lx_flock64_32_t lxflk64; + struct flock64 bf; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + + maxoffset = MAXOFF_T; + datamodel = DATAMODEL_NATIVE; +#if defined(_SYSCALL32_IMPL) + if ((datamodel = get_udatamodel()) == DATAMODEL_ILP32) + maxoffset = MAXOFF32_T; +#endif + vp = fp->f_vnode; + flag = fp->f_flag; + offset = fp->f_offset; + + cmd = lx_fcntl_lock_cmd_to_s(lx_cmd); + + switch (cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + if (datamodel == DATAMODEL_NATIVE) { + if (copyin(arg, &lxflk, sizeof (lx_flock_t)) != 0) { + error = EFAULT; + break; + } + } +#if defined(_SYSCALL32_IMPL) + else { + lx_flock32_t lxflk32; + + if (copyin(arg, &lxflk32, sizeof (lxflk32)) != 0) { + error = EFAULT; + break; + } + + lxflk.l_type = lxflk32.l_type; + lxflk.l_whence = lxflk32.l_whence; + lxflk.l_start = (off64_t)lxflk32.l_start; + lxflk.l_len = (off64_t)lxflk32.l_len; + lxflk.l_pid = lxflk32.l_pid; + } +#endif /* _SYSCALL32_IMPL */ + + ltos_flock(&lxflk, &bf); + + if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0) + break; + + if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL, + fp->f_cred, NULL)) != 0) + break; + + if (cmd != F_GETLK) + break; + + /* + * The command is GETLK, return result. + */ + stol_flock(&bf, &lxflk); + + /* + * If no lock is found, only the type field is changed. + */ + if (lxflk.l_type == LX_F_UNLCK) { + /* l_type always first entry, always a short */ + if (copyout(&lxflk.l_type, &((lx_flock_t *)arg)->l_type, + sizeof (lxflk.l_type))) + error = EFAULT; + break; + } + + if (bf.l_start > maxoffset || bf.l_len > maxoffset) { + error = EOVERFLOW; + break; + } + + if (datamodel == DATAMODEL_NATIVE) { + if (copyout(&lxflk, arg, sizeof (lxflk)) != 0) { + error = EFAULT; + break; + } + } +#if defined(_SYSCALL32_IMPL) + else { + lx_flock32_t lxflk32; + + if (bf.l_start > MAXOFF32_T || bf.l_len > MAXOFF32_T) { + error = EOVERFLOW; + break; + } + + lxflk32.l_type = lxflk.l_type; + lxflk32.l_whence = lxflk.l_whence; + lxflk32.l_start = lxflk.l_start; + lxflk32.l_len = lxflk.l_len; + lxflk32.l_pid = lxflk.l_pid; + + if (copyout(&lxflk32, arg, sizeof (lxflk32)) != 0) { + error = EFAULT; + break; + } + } +#endif /* _SYSCALL32_IMPL */ + break; + + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + /* + * Large File support is only used for ILP32 apps. + */ + if (datamodel != DATAMODEL_ILP32) { + error = EINVAL; + break; + } + + if (cmd == F_GETLK64) + cmd = F_GETLK; + else if (cmd == F_SETLK64) + cmd = F_SETLK; + else if (cmd == F_SETLKW64) + cmd = F_SETLKW; + + if (copyin(arg, &lxflk64, sizeof (lxflk64)) != 0) { + error = EFAULT; + break; + } + + ltos_flock64(&lxflk64, &bf); + + if ((error = flock_check(vp, &bf, offset, MAXOFFSET_T)) != 0) + break; + + if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL, + fp->f_cred, NULL)) != 0) + break; + + if (cmd != F_GETLK) + break; + + /* + * The command is GETLK, return result. + */ + stol_flock64(&bf, &lxflk64); + + /* + * If no lock is found, only the type field is changed. 
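+ * (This mirrors how a Linux program probes for a conflicting lock:
+ *
+ *     struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
+ *     (void) fcntl(fd, F_GETLK, &fl);
+ *     if (fl.l_type == F_UNLCK)
+ *             ... no conflicting lock held ...
+ *
+ * so in the unlocked case only l_type needs to be copied back.)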
+ */
+ if (lxflk64.l_type == LX_F_UNLCK) {
+ /* l_type always first entry, always a short */
+ if (copyout(&lxflk64.l_type,
+ &((lx_flock64_t *)arg)->l_type,
+ sizeof (lxflk64.l_type)))
+ error = EFAULT;
+ break;
+ }
+
+ if (bf.l_start > maxoffset || bf.l_len > maxoffset) {
+ error = EOVERFLOW;
+ break;
+ }
+
+ if (copyout(&lxflk64, arg, sizeof (lxflk64)) != 0) {
+ error = EFAULT;
+ break;
+ }
+ break;
+ }
+
+ releasef(fd);
+ if (error)
+ return (set_errno(error));
+
+ return (0);
+}
+
+long
+lx_fcntl(int fd, int cmd, intptr_t arg)
+{
+ switch (cmd) {
+ case LX_F_GETLK64:
+ case LX_F_SETLK64:
+ case LX_F_SETLKW64:
+ /* The 64-bit fcntl commands must go through fcntl64(). */
+ return (set_errno(EINVAL));
+
+ case LX_F_GETLK:
+ case LX_F_SETLK:
+ case LX_F_SETLKW:
+ return (lx_fcntl_lock(fd, cmd, (void *)arg));
+
+ default:
+ return (lx_fcntl_common(fd, cmd, arg));
+ }
+}
+
+long
+lx_fcntl64(int fd, int cmd, intptr_t arg)
+{
+ switch (cmd) {
+ case LX_F_GETLK:
+ case LX_F_SETLK:
+ case LX_F_SETLKW:
+ case LX_F_GETLK64:
+ case LX_F_SETLKW64:
+ case LX_F_SETLK64:
+ return (lx_fcntl_lock(fd, cmd, (void *)arg));
+
+ default:
+ return (lx_fcntl_common(fd, cmd, (ulong_t)arg));
+ }
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_futex.c b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
new file mode 100644
index 0000000000..f6a8f33f46
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_futex.c
@@ -0,0 +1,1077 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/debug.h>
+#include <vm/as.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <vm/page.h>
+#include <sys/priv.h>
+#include <sys/mman.h>
+#include <sys/timer.h>
+#include <sys/condvar.h>
+#include <sys/inttypes.h>
+#include <sys/cmn_err.h>
+#include <sys/brand.h>
+#include <sys/lx_brand.h>
+#include <sys/lx_futex.h>
+#include <sys/lx_impl.h>
+
+/*
+ * Futexes are a Linux-specific implementation of inter-process mutexes.
+ * They are designed to use shared memory for simple, uncontested
+ * operations, and rely on the kernel to resolve any contention issues.
+ *
+ * Most of the information in this section comes from the paper "Futexes
+ * Are Tricky", by Ulrich Drepper. This paper is currently available at:
+ * http://people.redhat.com/~drepper/futex.pdf.
+ *
+ * A futex itself is a 4-byte integer, which must be 4-byte aligned. The
+ * value of this integer is expected to be modified using user-level atomic
+ * operations.
The futex(4) design itself does not impose any semantic
+ * constraints on the value stored in the futex; it is up to the
+ * application to define its own protocol.
+ *
+ * When the application decides that kernel intervention is required, it
+ * will use the futex(2) system call. A number of different operations
+ * can be performed on a futex using this system call. Since this
+ * interface has evolved over time, there are several different prototypes
+ * available to the user. Fortunately, there is only a single kernel-level
+ * interface:
+ *
+ * long sys_futex(void *futex1, int cmd, int val1,
+ *	struct timespec *timeout, void *futex2, int val2)
+ *
+ * The kernel-level operations that may be performed on a futex are:
+ *
+ * FUTEX_WAIT
+ *
+ *	Atomically verify that futex1 contains the value val1. If it
+ *	doesn't, return EWOULDBLOCK. If it does contain the expected
+ *	value, the thread will sleep until somebody performs a FUTEX_WAKE
+ *	on the futex. The caller may also specify a timeout, indicating
+ *	the maximum time the thread should sleep. If the timer expires,
+ *	the call returns ETIMEDOUT. If the thread is awoken with a signal,
+ *	the call returns EINTR. Otherwise, the call returns 0.
+ *
+ * FUTEX_WAKE
+ *
+ *	Wake up val1 processes that are waiting on futex1. The call
+ *	returns the number of blocked threads that were woken up.
+ *
+ * FUTEX_WAIT_BITSET/FUTEX_WAKE_BITSET
+ *
+ * Similar to FUTEX_WAIT/FUTEX_WAKE, but each takes an additional argument
+ * denoting a bit vector, with wakers only waking waiters that match
+ * in one or more bits. These semantics are dubious enough, but the
+ * interface has an inconsistency that is glaring even by the
+ * embarrassingly low standards that Linux sets for itself: the timeout
+ * argument to FUTEX_WAIT_BITSET is absolute, not relative as it is for
+ * FUTEX_WAIT. And as if that weren't enough unnecessary complexity,
+ * the caller may specify this absolute timeout to be against either
+ * CLOCK_MONOTONIC or CLOCK_REALTIME -- but only for FUTEX_WAIT_BITSET,
+ * of course!
+ *
+ * FUTEX_WAKE_OP
+ *
+ * The implementation of a condition variable in terms of futexes
+ * actually uses two futexes: one to assure sequential access and one to
+ * represent the condition variable. This implementation gives rise to a
+ * particular performance problem whereby a thread is awoken on the futex
+ * that represents the condition variable only to have to (potentially)
+ * immediately wait on the futex that protects the condition variable.
+ * (Do not confuse the futex that serves to protect the condition variable
+ * with the pthread_mutex_t associated with pthread_cond_t -- which
+ * represents a third futex.) To (over)solve this problem, FUTEX_WAKE_OP
+ * was invented, which performs an atomic compare-and-exchange on a
+ * second address in a specified fashion (that is, with a specified
+ * operation).
Here are the possible operations (OPARG is defined
+ * to be a 12-bit value embedded in the operation):
+ *
+ * - FUTEX_OP_SET: Sets the value at the second address to OPARG
+ * - FUTEX_OP_ADD: Adds OPARG to the value
+ * - FUTEX_OP_OR: OR's the value with OPARG
+ * - FUTEX_OP_ANDN: Performs a negated AND of the value with OPARG
+ * - FUTEX_OP_XOR: XOR's the value with OPARG
+ *
+ * After this compare-and-exchange on the second address, a FUTEX_WAKE is
+ * performed on the first address and -- if the compare-and-exchange
+ * matches a specified result based on a specified comparison operation --
+ * a FUTEX_WAKE is performed on the second address. Here are the possible
+ * comparison operations:
+ *
+ * - FUTEX_OP_CMP_EQ: If old value is CMPARG, wake
+ * - FUTEX_OP_CMP_NE: If old value is not equal to CMPARG, wake
+ * - FUTEX_OP_CMP_LT: If old value is less than CMPARG, wake
+ * - FUTEX_OP_CMP_LE: If old value is less than or equal to CMPARG, wake
+ * - FUTEX_OP_CMP_GT: If old value is greater than CMPARG, wake
+ * - FUTEX_OP_CMP_GE: If old value is greater than or equal to CMPARG, wake
+ *
+ * As a practical matter, the only way that this is used (or, some might
+ * argue, is usable) is by the implementation of pthread_cond_signal(),
+ * which uses FUTEX_WAKE_OP to -- in a single system call -- unlock the
+ * futex that protects the condition variable and wake the futex that
+ * represents the condition variable. The second wake-up is conditional
+ * because the futex that protects the condition variable (rather than the
+ * one that represents it) may or may not have waiters. Given that this
+ * is the use case, FUTEX_WAKE_OP is falsely generic: despite allowing for
+ * five different kinds of operations and six different kinds of
+ * comparison operations, in practice only one is used. (Namely, setting
+ * to 0 and waking if the old value is greater than 1 -- which denotes
+ * that waiters are present and the wakeup should be performed.) Moreover,
+ * because FUTEX_WAKE_OP does not (and cannot) optimize anything in the
+ * case that the pthread_mutex_t associated with the pthread_cond_t is
+ * held at the time of a pthread_cond_signal(), this entire mechanism is
+ * essentially for naught in this case. As one can imagine (and can
+ * verify on just about any source base that uses pthread_cond_signal()),
+ * it is overwhelmingly the common case that the lock associated with the
+ * pthread_cond_t is held at the time of pthread_cond_signal(), assuring
+ * that the problem that all of this complexity was designed to solve
+ * isn't, in fact, solved because the signalled thread simply wakes up
+ * only to block again on the held mutex. Cue a slow clap!
+ *
+ * FUTEX_CMP_REQUEUE
+ *
+ *	If the value stored in futex1 matches the value passed in val2,
+ *	wake up val1 processes that are waiting on futex1. Otherwise,
+ *	return EAGAIN.
+ *
+ *	If there are more than val1 threads waiting on the futex, remove
+ *	the remaining threads from this futex, and requeue them on futex2.
+ *	The caller can limit the number of threads being requeued by
+ *	encoding an integral numerical value in the position usually used
+ *	for the timeout pointer.
+ *
+ *	The call returns the number of blocked threads that were woken up
+ *	or requeued.
+ *
+ * FUTEX_REQUEUE
+ *
+ *	Identical to FUTEX_CMP_REQUEUE except that it does not use val2.
+ *	This command has been declared broken and obsolete, but we still
+ *	need to support it.
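+ *
+ * To make the above concrete, two illustrative sketches (ours, not part
+ * of the original comment). First, the FUTEX_WAKE_OP operation word packs
+ * its four fields as Linux's futex.h does:
+ *
+ *	#define	FUTEX_OP(op, oparg, cmp, cmparg) \
+ *		((((op) & 0xf) << 28) | (((cmp) & 0xf) << 24) | \
+ *		(((oparg) & 0xfff) << 12) | ((cmparg) & 0xfff))
+ *
+ * so the lone real-world use described above -- set to 0, wake if the old
+ * value was greater than 1 -- is FUTEX_OP(FUTEX_OP_SET, 0,
+ * FUTEX_OP_CMP_GT, 1). Second, the classic FUTEX_CMP_REQUEUE consumer is
+ * a condition variable broadcast, which wakes one waiter and requeues the
+ * rest onto the futex inside the associated mutex to avoid a thundering
+ * herd (the field names here are hypothetical):
+ *
+ *	futex(&cond->futex, FUTEX_CMP_REQUEUE, 1, (void *)INT_MAX,
+ *	    &mutex->futex, cond->futex_val);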
+ * + * FUTEX_FD + * + * Return a file descriptor, which can be used to refer to the futex. + * This operation was broken by design, and was blessedly removed in + * Linux 2.6.26 ("because it was inherently racy"); it should go without + * saying that we don't support this operation. + */ + +/* + * This structure is used to track all the threads currently waiting on a + * futex. There is one fwaiter_t for each blocked thread. We store all + * fwaiter_t's in a hash structure, indexed by the memid_t of the integer + * containing the futex's value. + * + * At the moment, all fwaiter_t's for a single futex are simply dumped into + * the hash bucket. If futex contention ever becomes a hot path, we can + * chain a single futex's waiters together. + */ +typedef struct fwaiter { + memid_t fw_memid; /* memid of the user-space futex */ + kcondvar_t fw_cv; /* cond var */ + struct fwaiter *fw_next; /* hash queue */ + struct fwaiter *fw_prev; /* hash queue */ + uint32_t fw_bits; /* bits waiting on */ + volatile int fw_woken; +} fwaiter_t; + +/* + * The structure of the robust_list, as set with the set_robust_list() system + * call. See lx_futex_robust_exit(), below, for details. + */ +typedef struct futex_robust_list { + uintptr_t frl_head; /* list of robust locks held */ + uint64_t frl_offset; /* offset of lock word within a lock */ + uintptr_t frl_pending; /* pending operation */ +} futex_robust_list_t; + +#if defined(_SYSCALL32_IMPL) + +#pragma pack(4) +typedef struct futex_robust_list32 { + uint32_t frl_head; /* list of robust locks held */ + uint32_t frl_offset; /* offset of lock word within a lock */ + uint32_t frl_pending; /* pending operation */ +} futex_robust_list32_t; +#pragma pack() + +#endif + +#define MEMID_COPY(s, d) \ + { (d)->val[0] = (s)->val[0]; (d)->val[1] = (s)->val[1]; } +#define MEMID_EQUAL(s, d) \ + ((d)->val[0] == (s)->val[0] && (d)->val[1] == (s)->val[1]) + +/* Borrowed from the page freelist hash code. */ +#define HASH_SHIFT_SZ 7 +#define HASH_SIZE (1 << HASH_SHIFT_SZ) +#define HASH_FUNC(id) \ + ((((uintptr_t)((id)->val[1]) >> PAGESHIFT) + \ + ((uintptr_t)((id)->val[1]) >> (PAGESHIFT + HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[0]) >> 3) + \ + ((uintptr_t)((id)->val[0]) >> (3 + HASH_SHIFT_SZ)) + \ + ((uintptr_t)((id)->val[0]) >> (3 + 2 * HASH_SHIFT_SZ))) & \ + (HASH_SIZE - 1)) + +static fwaiter_t *futex_hash[HASH_SIZE]; +static kmutex_t futex_hash_lock[HASH_SIZE]; + +static void +futex_hashin(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash_lock[index])); + + fwp->fw_prev = NULL; + fwp->fw_next = futex_hash[index]; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp; + futex_hash[index] = fwp; +} + +static void +futex_hashout(fwaiter_t *fwp) +{ + int index; + + index = HASH_FUNC(&fwp->fw_memid); + ASSERT(MUTEX_HELD(&futex_hash_lock[index])); + + if (fwp->fw_prev) + fwp->fw_prev->fw_next = fwp->fw_next; + if (fwp->fw_next) + fwp->fw_next->fw_prev = fwp->fw_prev; + if (futex_hash[index] == fwp) + futex_hash[index] = fwp->fw_next; + + fwp->fw_prev = NULL; + fwp->fw_next = NULL; +} + +/* + * Go to sleep until somebody does a WAKE operation on this futex, we get a + * signal, or the timeout expires. 
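+ *
+ * For context, the canonical user-level pattern that this WAIT/WAKE pair
+ * serves is a simple lock, sketched here after Drepper's paper (greatly
+ * simplified, and not code from this file):
+ *
+ *	lock:	while (atomic_cas_32(&f, 0, 1) != 0)
+ *			(void) futex(&f, FUTEX_WAIT, 1, NULL, NULL, 0);
+ *	unlock:	f = 0;
+ *		(void) futex(&f, FUTEX_WAKE, 1, NULL, NULL, 0);
+ *
+ * The re-check of the futex value under the hash lock below is what makes
+ * the sleep atomic with respect to a racing unlock/wake.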
+ */ +static int +futex_wait(memid_t *memid, caddr_t addr, + int val, timespec_t *timeout, uint32_t bits) +{ + int err, ret; + int32_t curval; + fwaiter_t fw; + int index; + + fw.fw_woken = 0; + fw.fw_bits = bits; + + MEMID_COPY(memid, &fw.fw_memid); + cv_init(&fw.fw_cv, NULL, CV_DEFAULT, NULL); + + index = HASH_FUNC(&fw.fw_memid); + mutex_enter(&futex_hash_lock[index]); + + if (fuword32(addr, (uint32_t *)&curval)) { + err = set_errno(EFAULT); + goto out; + } + if (curval != val) { + err = set_errno(EWOULDBLOCK); + goto out; + } + + futex_hashin(&fw); + + err = 0; + while ((fw.fw_woken == 0) && (err == 0)) { + ret = cv_waituntil_sig(&fw.fw_cv, &futex_hash_lock[index], + timeout, timechanged); + if (ret < 0) { + err = set_errno(ETIMEDOUT); + } else if (ret == 0) { + /* + * According to signal(7), a futex(2) call with the + * FUTEX_WAIT operation is restartable. + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + err = set_errno(EINTR); + } + } + + /* + * The futex is normally hashed out in wakeup. If we timed out or + * got a signal, we need to hash it out here instead. + */ + if (fw.fw_woken == 0) + futex_hashout(&fw); + +out: + mutex_exit(&futex_hash_lock[index]); + + return (err); +} + +/* + * Wake up to wake_threads threads that are blocked on the futex at memid. + */ +static int +futex_wake(memid_t *memid, int wake_threads, uint32_t mask) +{ + fwaiter_t *fwp, *next; + int index; + int ret = 0; + + index = HASH_FUNC(memid); + + mutex_enter(&futex_hash_lock[index]); + + for (fwp = futex_hash[index]; fwp && ret < wake_threads; fwp = next) { + next = fwp->fw_next; + if (MEMID_EQUAL(&fwp->fw_memid, memid) && + (fwp->fw_bits & mask)) { + futex_hashout(fwp); + fwp->fw_woken = 1; + cv_signal(&fwp->fw_cv); + ret++; + } + } + + mutex_exit(&futex_hash_lock[index]); + + return (ret); +} + +static int +futex_wake_op_execute(int32_t *addr, int32_t val3) +{ + int32_t op = FUTEX_OP_OP(val3); + int32_t cmp = FUTEX_OP_CMP(val3); + int32_t cmparg = FUTEX_OP_CMPARG(val3); + int32_t oparg, oldval, newval; + label_t ljb; + int rval; + + if ((uintptr_t)addr >= KERNELBASE) + return (set_errno(EFAULT)); + + if (on_fault(&ljb)) + return (set_errno(EFAULT)); + + oparg = FUTEX_OP_OPARG(val3); + + do { + oldval = *addr; + newval = oparg; + + switch (op) { + case FUTEX_OP_SET: + break; + + case FUTEX_OP_ADD: + newval += oparg; + break; + + case FUTEX_OP_OR: + newval |= oparg; + break; + + case FUTEX_OP_ANDN: + newval &= ~oparg; + break; + + case FUTEX_OP_XOR: + newval ^= oparg; + break; + + default: + no_fault(); + return (set_errno(EINVAL)); + } + } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval); + + no_fault(); + + switch (cmp) { + case FUTEX_OP_CMP_EQ: + rval = (oldval == cmparg); + break; + + case FUTEX_OP_CMP_NE: + rval = (oldval != cmparg); + break; + + case FUTEX_OP_CMP_LT: + rval = (oldval < cmparg); + break; + + case FUTEX_OP_CMP_LE: + rval = (oldval <= cmparg); + break; + + case FUTEX_OP_CMP_GT: + rval = (oldval > cmparg); + break; + + case FUTEX_OP_CMP_GE: + rval = (oldval >= cmparg); + break; + + default: + return (set_errno(EINVAL)); + } + + return (rval); +} + +static int +futex_wake_op(memid_t *memid, caddr_t addr2, memid_t *memid2, + int wake_threads, int wake_threads2, int val3) +{ + kmutex_t *l1, *l2; + int ret = 0, ret2 = 0, wake; + fwaiter_t *fwp, *next; + int index1, index2; + + index1 = HASH_FUNC(memid); + index2 = HASH_FUNC(memid2); + + if (index1 == index2) { + l1 = &futex_hash_lock[index1]; + l2 = NULL; + } else if (index1 < index2) { + l1 = 
&futex_hash_lock[index1];
+		l2 = &futex_hash_lock[index2];
+	} else {
+		l1 = &futex_hash_lock[index2];
+		l2 = &futex_hash_lock[index1];
+	}
+
+	mutex_enter(l1);
+	if (l2 != NULL)
+		mutex_enter(l2);
+
+	/* LINTED: alignment */
+	if ((wake = futex_wake_op_execute((int32_t *)addr2, val3)) < 0)
+		goto out;
+
+	for (fwp = futex_hash[index1]; fwp; fwp = next) {
+		next = fwp->fw_next;
+		if (!MEMID_EQUAL(&fwp->fw_memid, memid))
+			continue;
+
+		futex_hashout(fwp);
+		fwp->fw_woken = 1;
+		cv_signal(&fwp->fw_cv);
+		if (++ret >= wake_threads) {
+			break;
+		}
+	}
+
+	if (!wake)
+		goto out;
+
+	for (fwp = futex_hash[index2]; fwp; fwp = next) {
+		next = fwp->fw_next;
+		if (!MEMID_EQUAL(&fwp->fw_memid, memid2))
+			continue;
+
+		futex_hashout(fwp);
+		fwp->fw_woken = 1;
+		cv_signal(&fwp->fw_cv);
+		if (++ret2 >= wake_threads2) {
+			break;
+		}
+	}
+
+	ret += ret2;
+out:
+	if (l2 != NULL)
+		mutex_exit(l2);
+	mutex_exit(l1);
+
+	return (ret);
+}
+
+/*
+ * Wake up to wake_threads waiting on the futex at memid. If there are
+ * more than that many threads waiting, requeue the remaining threads on
+ * the futex at requeue_memid.
+ */
+static int
+futex_requeue(memid_t *memid, memid_t *requeue_memid, int wake_threads,
+    ulong_t requeue_threads, caddr_t addr, int *cmpval)
+{
+	fwaiter_t *fwp, *next;
+	int index1, index2;
+	int ret = 0;
+	int32_t curval;
+	kmutex_t *l1, *l2;
+
+	/*
+	 * To ensure that we don't miss a wakeup if the value of cmpval
+	 * changes, we need to grab locks on both the original and new hash
+	 * buckets. To avoid deadlock, we always grab the lower-indexed
+	 * lock first.
+	 */
+	index1 = HASH_FUNC(memid);
+	index2 = HASH_FUNC(requeue_memid);
+
+	if (index1 == index2) {
+		l1 = &futex_hash_lock[index1];
+		l2 = NULL;
+	} else if (index1 < index2) {
+		l1 = &futex_hash_lock[index1];
+		l2 = &futex_hash_lock[index2];
+	} else {
+		l1 = &futex_hash_lock[index2];
+		l2 = &futex_hash_lock[index1];
+	}
+
+	mutex_enter(l1);
+	if (l2 != NULL)
+		mutex_enter(l2);
+
+	if (cmpval != NULL) {
+		if (fuword32(addr, (uint32_t *)&curval)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (curval != *cmpval) {
+			ret = -EAGAIN;
+			goto out;
+		}
+	}
+
+	for (fwp = futex_hash[index1]; fwp; fwp = next) {
+		next = fwp->fw_next;
+		if (!MEMID_EQUAL(&fwp->fw_memid, memid))
+			continue;
+
+		futex_hashout(fwp);
+		if (ret++ < wake_threads) {
+			fwp->fw_woken = 1;
+			cv_signal(&fwp->fw_cv);
+		} else {
+			MEMID_COPY(requeue_memid, &fwp->fw_memid);
+			futex_hashin(fwp);
+
+			if ((ret - wake_threads) >= requeue_threads)
+				break;
+		}
+	}
+
+out:
+	if (l2 != NULL)
+		mutex_exit(l2);
+	mutex_exit(l1);
+
+	if (ret < 0)
+		return (set_errno(-ret));
+	return (ret);
+}
+
+/*
+ * Copy in the relative timeout provided by the application and convert it
+ * to an absolute timeout. Sadly, this is complicated by the different
+ * timeout semantics of FUTEX_WAIT vs. FUTEX_WAIT_BITSET. (Yes, you read
+ * that correctly; FUTEX_WAIT and FUTEX_WAIT_BITSET have different timeout
+ * semantics; see the block comment at the top of the file for commentary
+ * on this inanity.)
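+ *
+ * Concretely (an illustration of ours, not from the original comment): a
+ * Linux caller passes a relative timeout to FUTEX_WAIT but an absolute
+ * one to FUTEX_WAIT_BITSET:
+ *
+ *	futex(f, FUTEX_WAIT, val, &rel, NULL, 0);
+ *	futex(f, FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME, val, &abs,
+ *	    NULL, FUTEX_BITSET_MATCH_ANY);
+ *
+ * Both cases are normalized below to an absolute CLOCK_REALTIME value
+ * suitable for cv_waituntil_sig().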
+ */
+static int
+get_timeout(void *lx_timeout, timestruc_t *timeout, int cmd, int clock)
+{
+	timestruc_t now;
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (copyin(lx_timeout, timeout, sizeof (timestruc_t)))
+			return (EFAULT);
+	}
+#ifdef _SYSCALL32_IMPL
+	else {
+		timestruc32_t timeout32;
+		if (copyin(lx_timeout, &timeout32, sizeof (timestruc32_t)))
+			return (EFAULT);
+		timeout->tv_sec = (time_t)timeout32.tv_sec;
+		timeout->tv_nsec = timeout32.tv_nsec;
+	}
+#endif
+	if (itimerspecfix(timeout))
+		return (EINVAL);
+
+	if (cmd == FUTEX_WAIT) {
+		/*
+		 * We've been given a relative time; add it to the current
+		 * time to derive an absolute time.
+		 */
+		gethrestime(&now);
+		timespecadd(timeout, &now);
+	} else {
+		/*
+		 * This is a FUTEX_WAIT_BITSET operation, which (1) specifies
+		 * the timeout as an absolute rather than a relative timeout
+		 * and (2) allows for different clock types to be specified.
+		 * If the clock is CLOCK_REALTIME, we actually have nothing
+		 * to do -- but if this is CLOCK_MONOTONIC, we need to convert
+		 * our absolute time back into a relative time and then add
+		 * it to our current hrestime to get an absolute CLOCK_REALTIME
+		 * timeout.
+		 */
+		if (clock == CLOCK_MONOTONIC) {
+			/*
+			 * Get our current time, and subtract it from our
+			 * timeout to get the relative value.
+			 */
+			hrt2ts(gethrtime(), &now);
+			timespecsub(timeout, &now);
+
+			/*
+			 * If our timeout is in the past, set it to be 0.
+			 */
+			if (timeout->tv_sec < 0) {
+				timeout->tv_sec = 0;
+				timeout->tv_nsec = 0;
+			}
+
+			/*
+			 * Add the relative time back into the current time.
+			 */
+			gethrestime(&now);
+			timespecadd(timeout, &now);
+		}
+	}
+
+	return (0);
+}
+
+long
+lx_futex(uintptr_t addr, int op, int val, uintptr_t lx_timeout,
+    uintptr_t addr2, int val3)
+{
+	struct as *as = curproc->p_as;
+	memid_t memid, memid2;
+	timestruc_t timeout;
+	timestruc_t *tptr = NULL;
+	int val2 = 0;
+	int rval = 0;
+	int cmd = op & FUTEX_CMD_MASK;
+	int private = op & FUTEX_PRIVATE_FLAG;
+	char dmsg[32];
+
+	/* must be aligned on int boundary */
+	if (addr & 0x3)
+		return (set_errno(EINVAL));
+
+	/* Sanity check the futex command */
+	if (cmd < 0 || cmd > FUTEX_MAX_CMD)
+		return (set_errno(EINVAL));
+
+	if (cmd == FUTEX_FD) {
+		/*
+		 * FUTEX_FD was sentenced to death for grievous crimes of
+		 * semantics against humanity; it has been ripped out of Linux
+		 * and will never be supported by us.
+		 */
+		(void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd);
+		lx_unsupported(dmsg);
+		return (set_errno(ENOSYS));
+	}
+
+	switch (cmd) {
+	case FUTEX_LOCK_PI:
+	case FUTEX_UNLOCK_PI:
+	case FUTEX_TRYLOCK_PI:
+	case FUTEX_WAIT_REQUEUE_PI:
+	case FUTEX_CMP_REQUEUE_PI:
+		/*
+		 * These are operations that we don't currently support, but
+		 * may well need to in the future. For now, callers need to
+		 * deal with these being missing -- but if and as that changes,
+		 * they may well need to be implemented.
+		 */
+		(void) snprintf(dmsg, sizeof (dmsg), "futex 0x%x", cmd);
+		lx_unsupported(dmsg);
+		return (set_errno(ENOSYS));
+	}
+
+	if ((op & FUTEX_CLOCK_REALTIME) && cmd != FUTEX_WAIT_BITSET) {
+		/*
+		 * Linux only allows FUTEX_CLOCK_REALTIME to be set on the
+		 * FUTEX_WAIT_BITSET and FUTEX_WAIT_REQUEUE_PI commands.
+		 */
+		return (set_errno(ENOSYS));
+	}
+
+	/* Copy in the timeout structure from userspace. */
+	if ((cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_BITSET) &&
+	    lx_timeout != NULL) {
+		rval = get_timeout((timespec_t *)lx_timeout, &timeout, cmd,
+		    op & FUTEX_CLOCK_REALTIME ?
CLOCK_REALTIME : + CLOCK_MONOTONIC); + + if (rval != 0) + return (set_errno(rval)); + tptr = &timeout; + } + + switch (cmd) { + case FUTEX_REQUEUE: + case FUTEX_CMP_REQUEUE: + case FUTEX_WAKE_OP: + /* + * lx_timeout is nominally a pointer to a userspace address. + * For several commands, however, it actually contains + * an additional integer parameter. This is horrible, and + * the people who did this to us should be sorry. + */ + val2 = (int)lx_timeout; + } + + /* + * Translate the process-specific, user-space futex virtual + * address(es) to a universal memid. If the private bit is set, we + * can just use our as plus the virtual address, saving quite a bit + * of effort. + */ + if (private) { + memid.val[0] = (uintptr_t)as; + memid.val[1] = (uintptr_t)addr; + } else { + rval = as_getmemid(as, (void *)addr, &memid); + if (rval != 0) + return (set_errno(rval)); + } + + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_WAKE_OP) { + if (addr2 & 0x3) + return (set_errno(EINVAL)); + + if (private) { + memid2.val[0] = (uintptr_t)as; + memid2.val[1] = (uintptr_t)addr2; + } else { + rval = as_getmemid(as, (void *)addr2, &memid2); + if (rval) + return (set_errno(rval)); + } + } + + switch (cmd) { + case FUTEX_WAIT: + rval = futex_wait(&memid, (void *)addr, val, + tptr, FUTEX_BITSET_MATCH_ANY); + break; + + case FUTEX_WAIT_BITSET: + rval = futex_wait(&memid, (void *)addr, val, tptr, val3); + break; + + case FUTEX_WAKE: + rval = futex_wake(&memid, val, FUTEX_BITSET_MATCH_ANY); + break; + + case FUTEX_WAKE_BITSET: + rval = futex_wake(&memid, val, val3); + break; + + case FUTEX_WAKE_OP: + rval = futex_wake_op(&memid, (void *)addr2, &memid2, + val, val2, val3); + break; + + case FUTEX_CMP_REQUEUE: + case FUTEX_REQUEUE: + rval = futex_requeue(&memid, &memid2, val, + val2, (void *)addr2, &val3); + + break; + } + + return (rval); +} + +/* + * Does the dirty work of actually dropping a held robust lock in the event + * of the untimely death of the owner; see lx_futex_robust_exit(), below. + */ +static void +lx_futex_robust_drop(uintptr_t addr, uint32_t tid) +{ + memid_t memid; + uint32_t oldval, newval; + + VERIFY(addr + sizeof (uint32_t) < KERNELBASE); + + do { + fuword32_noerr((void *)addr, &oldval); + + if ((oldval & FUTEX_TID_MASK) != tid) + return; + + newval = (oldval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + } while (atomic_cas_32((uint32_t *)addr, oldval, newval) != oldval); + + /* + * We have now denoted that this lock's owner is dead; we need to + * wake any waiters. + */ + if (as_getmemid(curproc->p_as, (void *)addr, &memid) != 0) + return; + + (void) futex_wake(&memid, 1, FUTEX_BITSET_MATCH_ANY); +} + +/* + * Called when a thread is exiting. The role of the kernel is very clearly + * spelled out in the Linux design document entitled robust-futex-ABI.txt: + * we must (carefully!) iterate over the list of held locks pointed to by + * the robust list head; for each lock, we'll check to see if the calling + * (exiting) thread is the owner, and if so, denote that the lock is dead + * and wake any waiters. (The "pending" field of the head points to a lock + * that is in transition; it should be dropped if held.) If there are any + * errors through here at all (including memory operations), we abort the + * entire operation. 
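+ *
+ * For reference, the user-level layout this walk assumes is the one from
+ * Linux's futex.h (glibc embeds a list node in each robust mutex);
+ * sketched here for illustration:
+ *
+ *	struct robust_list { struct robust_list *next; };
+ *	struct robust_list_head {
+ *		struct robust_list list;		(frl_head points here)
+ *		long futex_offset;			(frl_offset)
+ *		struct robust_list *list_op_pending;	(frl_pending)
+ *	};
+ *
+ * The 32-bit lock word that lx_futex_robust_drop() operates on lives
+ * futex_offset bytes from each list entry (the offset may be negative).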
+ */
+void
+lx_futex_robust_exit(uintptr_t addr, uint32_t tid)
+{
+	futex_robust_list_t list;
+	uintptr_t entry, next;
+	model_t model = get_udatamodel();
+	int length = 0;
+	label_t ljb;
+
+	if (on_fault(&ljb))
+		return;
+
+	if (addr + sizeof (futex_robust_list_t) >= KERNELBASE)
+		goto out;
+
+	if (model == DATAMODEL_NATIVE) {
+		copyin_noerr((void *)addr, &list, sizeof (list));
+	}
+#if defined(_SYSCALL32_IMPL)
+	else {
+		futex_robust_list32_t list32;
+
+		copyin_noerr((void *)addr, &list32, sizeof (list32));
+		list.frl_head = list32.frl_head;
+		list.frl_offset = list32.frl_offset;
+		list.frl_pending = list32.frl_pending;
+	}
+#endif
+
+	/*
+	 * Strip off the PI bit, if any.
+	 */
+	entry = list.frl_head & ~FUTEX_ROBUST_LOCK_PI;
+
+	while (entry != addr && length++ < FUTEX_ROBUST_LIST_LIMIT) {
+		if (entry + list.frl_offset + sizeof (uint32_t) >= KERNELBASE)
+			goto out;
+
+		if (model == DATAMODEL_NATIVE) {
+			fulword_noerr((void *)entry, &next);
+		}
+#if defined(_SYSCALL32_IMPL)
+		else {
+			uint32_t next32;
+			fuword32_noerr((void *)entry, &next32);
+			next = next32;
+		}
+#endif
+
+		/*
+		 * Drop the robust mutex -- but only if our pending lock didn't
+		 * somehow sneak on there.
+		 */
+		if (entry != list.frl_pending)
+			lx_futex_robust_drop(entry + list.frl_offset, tid);
+
+		entry = next & ~FUTEX_ROBUST_LOCK_PI;
+	}
+
+	/*
+	 * Finally, drop the pending lock if there is one.
+	 */
+	if (list.frl_pending != NULL && list.frl_pending +
+	    list.frl_offset + sizeof (uint32_t) < KERNELBASE)
+		lx_futex_robust_drop(list.frl_pending + list.frl_offset, tid);
+
+out:
+	no_fault();
+}
+
+long
+lx_set_robust_list(void *listp, size_t len)
+{
+	proc_t *p = curproc;
+	klwp_t *lwp = ttolwp(curthread);
+	struct lx_lwp_data *lwpd = lwptolxlwp(lwp);
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (len != sizeof (futex_robust_list_t))
+			return (set_errno(EINVAL));
+	}
+#if defined(_SYSCALL32_IMPL)
+	else {
+		if (len != sizeof (futex_robust_list32_t))
+			return (set_errno(EINVAL));
+	}
+#endif
+
+	/*
+	 * To assure that we are serialized with respect to any racing call
+	 * to lx_get_robust_list(), we lock ourselves to set the value. (Note
+	 * that sprunlock() drops p_lock.)
+	 */
+	mutex_enter(&p->p_lock);
+	sprlock_proc(p);
+	lwpd->br_robust_list = listp;
+	sprunlock(p);
+
+	return (0);
+}
+
+long
+lx_get_robust_list(pid_t pid, void **listp, size_t *lenp)
+{
+	model_t model = get_udatamodel();
+	pid_t rpid;
+	id_t rtid;
+	proc_t *rproc;
+	klwp_t *rlwp;
+	lx_lwp_data_t *rlwpd;
+	kthread_t *rthr;
+	void *list;
+	int err = 0;
+
+	if (pid == 0) {
+		/*
+		 * A pid of 0 denotes the current thread; we lock the current
+		 * process even though it isn't strictly necessary (we can't
+		 * race with set_robust_list() because a thread may only set
+		 * its robust list on itself).
+		 */
+		rproc = curproc;
+		rlwpd = lwptolxlwp(ttolwp(curthread));
+		mutex_enter(&curproc->p_lock);
+		sprlock_proc(rproc);
+	} else {
+		if (lx_lpid_to_spair(pid, &rpid, &rtid) != 0 ||
+		    (rproc = sprlock(rpid)) == NULL) {
+			/*
+			 * We couldn't find the specified process.
+			 */
+			return (set_errno(ESRCH));
+		}
+
+		if (rproc->p_model != model ||
+		    (rthr = idtot(rproc, rtid)) == NULL ||
+		    (rlwp = ttolwp(rthr)) == NULL ||
+		    (rlwpd = lwptolxlwp(rlwp)) == NULL) {
+			/*
+			 * The target process does not match our data model, or
+			 * we couldn't find the LWP, or the target process is
+			 * not branded.
+ */ + err = ESRCH; + goto out; + } + } + + if (curproc != rproc && + priv_proc_cred_perm(curproc->p_cred, rproc, NULL, VREAD) != 0) { + /* + * We don't have the permission to examine the target. + */ + err = EPERM; + goto out; + } + + list = rlwpd->br_robust_list; + +out: + sprunlock(rproc); + + if (err != 0) + return (set_errno(err)); + + if (model == DATAMODEL_NATIVE) { + if (sulword(listp, (uintptr_t)list) != 0) + return (set_errno(EFAULT)); + + if (sulword(lenp, sizeof (futex_robust_list_t)) != 0) + return (set_errno(EFAULT)); + } +#if defined(_SYSCALL32_IMPL) + else { + if (suword32(listp, (uint32_t)(uintptr_t)list) != 0) + return (set_errno(EFAULT)); + + if (suword32(lenp, sizeof (futex_robust_list32_t)) != 0) + return (set_errno(EFAULT)); + } +#endif + + return (0); +} + +void +lx_futex_init(void) +{ + int i; + + for (i = 0; i < HASH_SIZE; i++) + mutex_init(&futex_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); + bzero(futex_hash, sizeof (futex_hash)); +} + +int +lx_futex_fini(void) +{ + int i, err; + + err = 0; + for (i = 0; (err == 0) && (i < HASH_SIZE); i++) { + mutex_enter(&futex_hash_lock[i]); + if (futex_hash[i] != NULL) + err = EBUSY; + mutex_exit(&futex_hash_lock[i]); + } + return (err); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getdents.c b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c new file mode 100644 index 0000000000..a41ab19d65 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getdents.c @@ -0,0 +1,349 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/systm.h> +#include <sys/filio.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/inttypes.h> +#include <sys/vnode.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/sunddi.h> + +#include <sys/lx_types.h> +#include <sys/lx_misc.h> + +#define LX_NAMEMAX 256 + +#define LX_GETDENTS_MAX_BUFSZ 65536 + +/* + * Because the Linux dirent has an extra field (d_type), it's possible that + * each entry will be 8 bytes larger due to padding. To prevent overrun during + * translation, the illumos-native buffer is sized pessimistically. + */ +#define LTOS_GETDENTS_BUFSZ(bufsz, datasz) \ + (((bufsz) / (datasz + 8)) * sizeof (struct dirent)) + +/* + * Record must be long enough to house d_name string, null terminator and + * d_type field. It's then padded to nearest 8-byte boundary + */ +#define LX_RECLEN(l, t) \ + ((offsetof(t, d_name) + 2 + (l) + 7) & ~7) + +/* + * Bytes after d_name string until d_reclen should be zeroed. 
+ * Includes zero-terminating d_name
+ */
+#define	LX_ZEROLEN(l, t)	\
+	(LX_RECLEN(l, t) -	\
+	((offsetof(t, d_name) + (l))))
+
+/* The output format of getdents differs if the caller is 32 or 64 bit. */
+struct lx_dirent_32 {
+	uint32_t	d_ino;
+	int32_t		d_off;
+	ushort_t	d_reclen;
+	char		d_name[1];
+	uchar_t		d_type;
+};
+
+struct lx_dirent_64 {
+	uint64_t	d_ino;
+	int64_t		d_off;
+	ushort_t	d_reclen;
+	char		d_name[1];
+	uchar_t		d_type;
+};
+
+static long
+lx_getdents_common(int fd, caddr_t uptr, size_t count,
+    unsigned int lx_size, int (*outcb)(caddr_t, caddr_t, int))
+{
+	vnode_t *vp;
+	file_t *fp;
+	struct uio auio;
+	struct iovec aiov;
+	int error;
+	int sbufsz, lbufsz, bufsz;
+	void *lbuf, *sbuf;
+	size_t outb = 0;
+
+	if (count < lx_size) {
+		return (set_errno(EINVAL));
+	}
+	if ((fp = getf(fd)) == NULL) {
+		return (set_errno(EBADF));
+	}
+	vp = fp->f_vnode;
+	if (vp->v_type != VDIR) {
+		releasef(fd);
+		return (set_errno(ENOTDIR));
+	}
+	if (!(fp->f_flag & FREAD)) {
+		releasef(fd);
+		return (set_errno(EBADF));
+	}
+
+	if (count > LX_GETDENTS_MAX_BUFSZ) {
+		/*
+		 * If the target buffer passed to us is huge, keep the
+		 * translation buffers moderate in size. Iteration will be
+		 * used to fill the request.
+		 */
+		lbufsz = LX_GETDENTS_MAX_BUFSZ;
+		sbufsz = LTOS_GETDENTS_BUFSZ(LX_GETDENTS_MAX_BUFSZ, lx_size);
+	} else if (count < (lx_size + MAXPATHLEN)) {
+		/*
+		 * If the target buffer is tiny, allocate a Linux-format buffer
+		 * big enough to hold at least one max-length row while keeping
+		 * the illumos-format buffer pessimistic in size.
+		 *
+		 * Assuming the buffer is truly tiny, it's likely that the
+		 * result will not fit and an EINVAL will be tossed.
+		 */
+		lbufsz = (lx_size + MAXPATHLEN);
+		sbufsz = MAX((LTOS_GETDENTS_BUFSZ(count, lx_size)),
+		    sizeof (struct dirent));
+	} else {
+		lbufsz = count;
+		sbufsz = LTOS_GETDENTS_BUFSZ(count, lx_size);
+	}
+	bufsz = sbufsz;
+	lbuf = kmem_alloc(lbufsz, KM_SLEEP);
+	sbuf = kmem_alloc(sbufsz, KM_SLEEP);
+
+	aiov.iov_base = sbuf;
+	aiov.iov_len = sbufsz;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_loffset = fp->f_offset;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = sbufsz;
+	auio.uio_fmode = 0;
+	auio.uio_extflg = UIO_COPY_CACHED;
+
+	/*
+	 * Since we use a conservative buffer allocation for the differing
+	 * struct sizing and Linux places fewer limits on getdents buffers in
+	 * general, there's a chance we'll undershoot on the record count.
+	 * When this happens, we can simply repeat the READDIR operation until
+	 * the available records are exhausted or we've filled the user buffer.
+	 */
+	while (1) {
+		int at_eof, res;
+		(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
+		error = VOP_READDIR(vp, &auio, fp->f_cred, &at_eof, NULL, 0);
+		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
+		if (error != 0 || auio.uio_resid == sbufsz) {
+			break;
+		}
+		res = outcb(sbuf, lbuf, bufsz - auio.uio_resid);
+		VERIFY(res <= lbufsz);
+		if (res == 0) {
+			/* no records to copyout from this batch */
+			break;
+		} else if (res > count) {
+			/*
+			 * For very small buffer sizes, it's possible that a
+			 * single record is too large due to a long filename.
+			 */
+			error = EINVAL;
+			break;
+		}
+
+		VERIFY(outb + res <= count);
+		if (copyout(lbuf, (void *)(uptr + outb), res) != 0) {
+			error = EFAULT;
+			break;
+		}
+		outb += res;
+
+		if (at_eof != 0 || (count - outb) < (lx_size + MAXPATHLEN)) {
+			/*
+			 * If there are no records left or the remaining buffer
+			 * space is not large enough to hold a max-length
+			 * filename, do not continue iteration.
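+			 *
+			 * To illustrate the record sizing behind that bound
+			 * (a worked example of ours, not from the original):
+			 * for a 64-bit caller,
+			 * offsetof(struct lx_dirent_64, d_name) is 18, so a
+			 * 5-character name needs
+			 * LX_RECLEN = (18 + 2 + 5 + 7) & ~7 = 32 bytes, of
+			 * which LX_ZEROLEN = 32 - (18 + 5) = 9 trailing bytes
+			 * are zeroed: the NUL, the alignment fill, and the
+			 * d_type byte.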
+ */ + break; + } + + /* + * We undershot the request buffer. + * Reset for another READDIR, taking care not to overshoot. + */ + bufsz = MIN(sbufsz, LTOS_GETDENTS_BUFSZ(count - outb, lx_size)); + auio.uio_resid = bufsz; + aiov.iov_len = bufsz; + aiov.iov_base = sbuf; + } + + kmem_free(lbuf, lbufsz); + kmem_free(sbuf, sbufsz); + + if (error) { + releasef(fd); + return (set_errno(error)); + } + + fp->f_offset = auio.uio_loffset; + releasef(fd); + return (outb); +} + + +static int +lx_getdents_format32(caddr_t sbuf, caddr_t lbuf, int len) +{ + struct dirent *sd; + struct lx_dirent_32 *ld; + int namelen; + int size = 0; + + while (len > 0) { + sd = (struct dirent *)sbuf; + ld = (struct lx_dirent_32 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN(namelen, + struct lx_dirent_32); + /* Zero out any alignment padding and d_type */ + bzero(ld->d_name + namelen, + LX_ZEROLEN(namelen, struct lx_dirent_32)); + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + +static int +lx_getdents_format64(caddr_t sbuf, caddr_t lbuf, int len) +{ + struct dirent *sd; + struct lx_dirent_64 *ld; + int namelen; + int size = 0; + + while (len > 0) { + sd = (struct dirent *)sbuf; + ld = (struct lx_dirent_64 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN(namelen, + struct lx_dirent_64); + /* Zero out any alignment padding and d_type */ + bzero(ld->d_name + namelen, + LX_ZEROLEN(namelen, struct lx_dirent_64)); + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + +long +lx_getdents_32(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent_32), lx_getdents_format32)); +} + +long +lx_getdents_64(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent_64), lx_getdents_format64)); +} + +struct lx_dirent64 { + uint64_t d_ino; + int64_t d_off; + ushort_t d_reclen; + uchar_t d_type; + char d_name[1]; +}; + +#define LX_RECLEN64(namelen) \ + ((offsetof(struct lx_dirent64, d_name) + 1 + (namelen) + 7) & ~7) + +#define LX_ZEROLEN64(namelen) \ + (LX_RECLEN64(namelen) - \ + ((offsetof(struct lx_dirent64, d_name) + (namelen)))) + +static int +lx_getdents64_format(caddr_t sbuf, caddr_t lbuf, int len) +{ + struct dirent *sd; + struct lx_dirent64 *ld; + int namelen; + int size = 0; + + while (len > 0) { + sd = (struct dirent *)sbuf; + ld = (struct lx_dirent64 *)lbuf; + namelen = MIN(strlen(sd->d_name), LX_NAMEMAX - 1); + + ld->d_ino = sd->d_ino; + ld->d_off = sd->d_off; + ld->d_type = 0; + (void) strncpy(ld->d_name, sd->d_name, namelen); + ld->d_name[namelen] = 0; + ld->d_reclen = (ushort_t)LX_RECLEN64(namelen); + /* Zero out any alignment padding */ + bzero(ld->d_name + namelen, LX_ZEROLEN64(namelen)); + + len -= sd->d_reclen; + size += ld->d_reclen; + sbuf += sd->d_reclen; + lbuf += ld->d_reclen; + } + return (size); +} + + +long +lx_getdents64(int fd, caddr_t buf, size_t count) +{ + return (lx_getdents_common(fd, buf, count, + sizeof (struct lx_dirent64), lx_getdents64_format)); +} diff --git 
a/usr/src/uts/common/brand/lx/syscall/lx_getpid.c b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c new file mode 100644 index 0000000000..c2506f52c5 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getpid.c @@ -0,0 +1,79 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* + * return the pid + */ +long +lx_getpid(void) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + long rv; + + if (curproc->p_pid == curproc->p_zone->zone_proc_initpid) { + rv = 1; + } else { + VERIFY(lwpd != NULL); + + if (lwpd->br_lx_thunk_pid != 0) { + rv = lwpd->br_lx_thunk_pid; + } else { + rv = lwpd->br_tgid; + } + } + + return (rv); +} + +/* + * return the parent pid + */ +long +lx_getppid(void) +{ + return (lx_lwp_ppid(ttolwp(curthread), NULL, NULL)); +} + +/* + * return the thread id + */ +long +lx_gettid(void) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + + return (lwpd->br_pid); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c new file mode 100644 index 0000000000..acc4073483 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_getrandom.c @@ -0,0 +1,33 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* + * From "uts/common/syscall/getrandom.c": + */ +extern int getrandom(void *, size_t, int); + +long +lx_getrandom(void *bufp, size_t buflen, int flags) +{ + /* + * According to signal(7), calls to getrandom(2) are restartable. + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + return (getrandom(bufp, buflen, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_id.c b/usr/src/uts/common/brand/lx/syscall/lx_id.c new file mode 100644 index 0000000000..baa41f52fa --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_id.c @@ -0,0 +1,296 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/zone.h> +#include <sys/cred_impl.h> +#include <sys/policy.h> + +typedef ushort_t l_uid16_t; +typedef ushort_t l_gid16_t; +typedef uint_t l_uid_t; +typedef uint_t l_gid_t; + +#define LINUX_UID16_TO_UID32(uid16) \ + (((uid16) == (l_uid16_t)-1) ? ((l_uid_t)-1) : (l_uid_t)(uid16)) + +#define LINUX_GID16_TO_GID32(gid16) \ + (((gid16) == (l_gid16_t)-1) ? ((l_gid_t)-1) : (l_gid_t)(gid16)) + +#define LX_NGROUPS_MAX 32 +extern int setgroups(int, gid_t *); + +/* + * This function is based on setreuid in common/syscall/uid.c and exists + * because illumos does not have a way to explicitly set the saved uid (suid) + * from any other system call. + */ +long +lx_setresuid(l_uid_t ruid, l_uid_t euid, l_uid_t suid) +{ + proc_t *p; + int error = 0; + int do_nocd = 0; + int uidchge = 0; + uid_t oldruid = ruid; + cred_t *cr, *newcr; + zoneid_t zoneid = getzoneid(); + + if ((ruid != -1 && (ruid > MAXUID)) || + (euid != -1 && (euid > MAXUID)) || + (suid != -1 && (suid > MAXUID))) { + error = EINVAL; + goto done; + } + + /* + * Need to pre-allocate the new cred structure before grabbing + * the p_crlock mutex. + */ + newcr = cralloc(); + + p = ttoproc(curthread); + +retry: + mutex_enter(&p->p_crlock); + cr = p->p_cred; + + if (ruid != -1 && + ruid != cr->cr_ruid && ruid != cr->cr_uid && + ruid != cr->cr_suid && secpolicy_allow_setid(cr, ruid, B_FALSE)) { + error = EPERM; + } else if (euid != -1 && + euid != cr->cr_ruid && euid != cr->cr_uid && + euid != cr->cr_suid && secpolicy_allow_setid(cr, euid, B_FALSE)) { + error = EPERM; + } else if (suid != -1 && + suid != cr->cr_ruid && suid != cr->cr_uid && + suid != cr->cr_suid && secpolicy_allow_setid(cr, suid, B_FALSE)) { + error = EPERM; + } else { + if (!uidchge && ruid != -1 && cr->cr_ruid != ruid) { + /* + * The ruid of the process is going to change. In order + * to avoid a race condition involving the + * process count associated with the newly given ruid, + * we increment the count before assigning the + * credential to the process. + * To do that, we'll have to take pidlock, so we first + * release p_crlock. + */ + mutex_exit(&p->p_crlock); + uidchge = 1; + mutex_enter(&pidlock); + upcount_inc(ruid, zoneid); + mutex_exit(&pidlock); + /* + * As we released p_crlock we can't rely on the cr + * we read. So retry the whole thing. + */ + goto retry; + } + crhold(cr); + crcopy_to(cr, newcr); + p->p_cred = newcr; + + if (euid != -1) + newcr->cr_uid = euid; + if (suid != -1) + newcr->cr_suid = suid; + if (ruid != -1) { + oldruid = newcr->cr_ruid; + newcr->cr_ruid = ruid; + ASSERT(ruid != oldruid ? 
uidchge : 1);
+		}
+
+		/*
+		 * A process that gives up its privilege
+		 * must be marked to produce no core dump.
+		 */
+		if ((cr->cr_uid != newcr->cr_uid ||
+		    cr->cr_ruid != newcr->cr_ruid ||
+		    cr->cr_suid != newcr->cr_suid))
+			do_nocd = 1;
+
+		crfree(cr);
+	}
+	mutex_exit(&p->p_crlock);
+
+	/*
+	 * We decrement the number of processes associated with the oldruid
+	 * to match the increment above, even if the ruid of the process
+	 * did not change or an error occurred (oldruid == ruid).
+	 */
+	if (uidchge) {
+		ASSERT(oldruid != -1 && ruid != -1);
+		mutex_enter(&pidlock);
+		upcount_dec(oldruid, zoneid);
+		mutex_exit(&pidlock);
+	}
+
+	if (error == 0) {
+		if (do_nocd) {
+			mutex_enter(&p->p_lock);
+			p->p_flag |= SNOCD;
+			mutex_exit(&p->p_lock);
+		}
+		crset(p, newcr);	/* broadcast to process threads */
+		goto done;
+	}
+	crfree(newcr);
+done:
+	if (error)
+		return (set_errno(error));
+	else
+		return (0);
+}
+
+long
+lx_setresuid16(l_uid16_t ruid16, l_uid16_t euid16, l_uid16_t suid16)
+{
+	long rval;
+
+	rval = lx_setresuid(
+	    LINUX_UID16_TO_UID32(ruid16),
+	    LINUX_UID16_TO_UID32(euid16),
+	    LINUX_UID16_TO_UID32(suid16));
+
+	return (rval);
+}
+
+/*
+ * This function is based on setregid in common/syscall/gid.c
+ */
+long
+lx_setresgid(l_gid_t rgid, l_gid_t egid, l_gid_t sgid)
+{
+	proc_t *p;
+	int error = 0;
+	int do_nocd = 0;
+	cred_t *cr, *newcr;
+
+	if ((rgid != -1 && (rgid > MAXUID)) ||
+	    (egid != -1 && (egid > MAXUID)) ||
+	    (sgid != -1 && (sgid > MAXUID))) {
+		error = EINVAL;
+		goto done;
+	}
+
+	/*
+	 * Need to pre-allocate the new cred structure before grabbing
+	 * the p_crlock mutex.
+	 */
+	newcr = cralloc();
+
+	p = ttoproc(curthread);
+	mutex_enter(&p->p_crlock);
+	cr = p->p_cred;
+
+	if (rgid != -1 &&
+	    rgid != cr->cr_rgid && rgid != cr->cr_gid &&
+	    rgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+		error = EPERM;
+	} else if (egid != -1 &&
+	    egid != cr->cr_rgid && egid != cr->cr_gid &&
+	    egid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+		error = EPERM;
+	} else if (sgid != -1 &&
+	    sgid != cr->cr_rgid && sgid != cr->cr_gid &&
+	    sgid != cr->cr_sgid && secpolicy_allow_setid(cr, -1, B_FALSE)) {
+		error = EPERM;
+	} else {
+		crhold(cr);
+		crcopy_to(cr, newcr);
+		p->p_cred = newcr;
+
+		if (egid != -1)
+			newcr->cr_gid = egid;
+		if (sgid != -1)
+			newcr->cr_sgid = sgid;
+		if (rgid != -1)
+			newcr->cr_rgid = rgid;
+
+		/*
+		 * A process that gives up its privilege
+		 * must be marked to produce no core dump.
+		 */
+		if ((cr->cr_gid != newcr->cr_gid ||
+		    cr->cr_rgid != newcr->cr_rgid ||
+		    cr->cr_sgid != newcr->cr_sgid))
+			do_nocd = 1;
+
+		crfree(cr);
+	}
+	mutex_exit(&p->p_crlock);
+
+	if (error == 0) {
+		if (do_nocd) {
+			mutex_enter(&p->p_lock);
+			p->p_flag |= SNOCD;
+			mutex_exit(&p->p_lock);
+		}
+		crset(p, newcr);	/* broadcast to process threads */
+		goto done;
+	}
+	crfree(newcr);
+done:
+	if (error)
+		return (set_errno(error));
+	else
+		return (0);
+}
+
+long
+lx_setresgid16(l_gid16_t rgid16, l_gid16_t egid16, l_gid16_t sgid16)
+{
+	long rval;
+
+	rval = lx_setresgid(
+	    LINUX_GID16_TO_GID32(rgid16),
+	    LINUX_GID16_TO_GID32(egid16),
+	    LINUX_GID16_TO_GID32(sgid16));
+
+	return (rval);
+}
+
+/*
+ * Linux defines NGROUPS_MAX to be 32, but on illumos it is only 16. We employ
+ * the terrible hack below so that tests may proceed, if only on DEBUG kernels.
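+ *
+ * For example (illustrative, not from the original comment): a Linux
+ * caller doing
+ *
+ *	gid_t list[32];
+ *	(void) setgroups(32, list);
+ *
+ * would get EINVAL from the native setgroups(); with the clamp below, a
+ * DEBUG kernel instead passes along the first ngroups_max (16) entries.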
+ */ +long +lx_helper_setgroups(int ngroups, gid_t *grouplist) +{ +#ifdef DEBUG + if (ngroups > ngroups_max && ngroups <= LX_NGROUPS_MAX) + ngroups = ngroups_max; +#endif /* DEBUG */ + + return (setgroups(ngroups, grouplist)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c new file mode 100644 index 0000000000..b9c24198f9 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_ioctl.c @@ -0,0 +1,1220 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/filio.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/termio.h> +#include <sys/termios.h> +#include <sys/ptyvar.h> +#include <net/if.h> +#include <net/if_dl.h> +#include <sys/sockio.h> +#include <sys/stropts.h> +#include <sys/ptms.h> +#include <sys/cred.h> +#include <sys/cred_impl.h> +#include <sys/sysmacros.h> +#include <sys/lx_misc.h> +#include <sys/lx_ptm.h> +#include <sys/sunddi.h> +#include <sys/thread.h> +#include <sys/proc.h> +#include <sys/session.h> +#include <sys/kmem.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <net/if_arp.h> +#include <sys/ioccom.h> +#include <sys/dtrace.h> +#include <sys/ethernet.h> +#include <sys/dlpi.h> + +/* + * Supported ioctls + */ +#define LX_TCGETS 0x5401 +#define LX_TCSETS 0x5402 +#define LX_TCSETSW 0x5403 +#define LX_TCSETSF 0x5404 +#define LX_TCGETA 0x5405 +#define LX_TCSETA 0x5406 +#define LX_TCSETAW 0x5407 +#define LX_TCSETAF 0x5408 +#define LX_TCSBRK 0x5409 +#define LX_TCXONC 0x540a +#define LX_TCFLSH 0x540b +#define LX_TIOCEXCL 0x540c +#define LX_TIOCNXCL 0x540d +#define LX_TIOCSCTTY 0x540e +#define LX_TIOCGPGRP 0x540f +#define LX_TIOCSPGRP 0x5410 +#define LX_TIOCOUTQ 0x5411 +#define LX_TIOCSTI 0x5412 +#define LX_TIOCGWINSZ 0x5413 +#define LX_TIOCSWINSZ 0x5414 +#define LX_TIOCMGET 0x5415 +#define LX_TIOCMBIS 0x5416 +#define LX_TIOCMBIC 0x5417 +#define LX_TIOCMSET 0x5418 +#define LX_TIOCGSOFTCAR 0x5419 +#define LX_TIOCSSOFTCAR 0x541a +#define LX_FIONREAD 0x541b +#define LX_TIOCPKT 0x5420 +#define LX_FIONBIO 0x5421 +#define LX_TIOCNOTTY 0x5422 +#define LX_TIOCSETD 0x5423 +#define LX_TIOCGETD 0x5424 +#define LX_TCSBRKP 0x5425 +#define LX_TIOCGSID 0x5429 +#define LX_TIOCGPTN 0x80045430 +#define LX_TIOCSPTLCK 0x40045431 +#define LX_FIONCLEX 0x5450 +#define LX_FIOCLEX 0x5451 +#define LX_FIOASYNC 0x5452 +#define LX_FIOSETOWN 0x8901 +#define LX_SIOCSPGRP 0x8902 +#define LX_FIOGETOWN 0x8903 +#define LX_SIOCGPGRP 0x8904 +#define LX_SIOCATMARK 0x8905 +#define LX_SIOCGSTAMP 0x8906 +#define LX_SIOCGIFCONF 0x8912 +#define LX_SIOCGIFFLAGS 0x8913 +#define LX_SIOCSIFFLAGS 0x8914 +#define LX_SIOCGIFADDR 0x8915 +#define LX_SIOCSIFADDR 0x8916 +#define LX_SIOCGIFDSTADDR 0x8917 +#define LX_SIOCSIFDSTADDR 0x8918 +#define LX_SIOCGIFBRDADDR 0x8919 +#define LX_SIOCSIFBRDADDR 0x891a +#define LX_SIOCGIFNETMASK 0x891b +#define LX_SIOCSIFNETMASK 0x891c +#define LX_SIOCGIFMETRIC 0x891d +#define LX_SIOCSIFMETRIC 0x891e +#define LX_SIOCGIFMEM 0x891f +#define LX_SIOCSIFMEM 0x8920 +#define LX_SIOCGIFMTU 0x8921 
+#define LX_SIOCSIFMTU 0x8922 +#define LX_SIOCSIFHWADDR 0x8924 +#define LX_SIOCGIFHWADDR 0x8927 +#define LX_SIOCGIFINDEX 0x8933 +#define LX_SIOCGIFTXQLEN 0x8942 + +#define FLUSER(fp) fp->f_flag | get_udatamodel() +#define FLFAKE(fp) fp->f_flag | FKIOCTL + +/* + * LX_NCC must be different from LX_NCCS since while the termio and termios + * structures may look similar they are fundamentally different sizes and + * have different members. + */ +#define LX_NCC 8 +#define LX_NCCS 19 + +struct lx_termio { + unsigned short c_iflag; /* input mode flags */ + unsigned short c_oflag; /* output mode flags */ + unsigned short c_cflag; /* control mode flags */ + unsigned short c_lflag; /* local mode flags */ + unsigned char c_line; /* line discipline */ + unsigned char c_cc[LX_NCC]; /* control characters */ +}; + +struct lx_termios { + uint32_t c_iflag; /* input mode flags */ + uint32_t c_oflag; /* output mode flags */ + uint32_t c_cflag; /* control mode flags */ + uint32_t c_lflag; /* local mode flags */ + unsigned char c_line; /* line discipline */ + unsigned char c_cc[LX_NCCS]; /* control characters */ +}; + +/* + * c_cc characters which are valid for lx_termio and lx_termios + */ +#define LX_VINTR 0 +#define LX_VQUIT 1 +#define LX_VERASE 2 +#define LX_VKILL 3 +#define LX_VEOF 4 +#define LX_VTIME 5 +#define LX_VMIN 6 +#define LX_VSWTC 7 + +/* + * c_cc characters which are valid for lx_termios + */ +#define LX_VSTART 8 +#define LX_VSTOP 9 +#define LX_VSUSP 10 +#define LX_VEOL 11 +#define LX_VREPRINT 12 +#define LX_VDISCARD 13 +#define LX_VWERASE 14 +#define LX_VLNEXT 15 +#define LX_VEOL2 16 + +/* VSD key for lx_cc information */ +static uint_t lx_ioctl_vsd = 0; + +extern int lx_lpid_to_spair(pid_t l_pid, pid_t *s_pid, id_t *s_tid); + +/* Terminal helpers */ + +static void +l2s_termios(struct lx_termios *l_tios, struct termios *s_tios) +{ + ASSERT((l_tios != NULL) && (s_tios != NULL)); + + bzero(s_tios, sizeof (*s_tios)); + + s_tios->c_iflag = l_tios->c_iflag; + s_tios->c_oflag = l_tios->c_oflag; + s_tios->c_cflag = l_tios->c_cflag; + s_tios->c_lflag = l_tios->c_lflag; + + if (s_tios->c_lflag & ICANON) { + s_tios->c_cc[VEOF] = l_tios->c_cc[LX_VEOF]; + s_tios->c_cc[VEOL] = l_tios->c_cc[LX_VEOL]; + } else { + s_tios->c_cc[VMIN] = l_tios->c_cc[LX_VMIN]; + s_tios->c_cc[VTIME] = l_tios->c_cc[LX_VTIME]; + } + + s_tios->c_cc[VEOL2] = l_tios->c_cc[LX_VEOL2]; + s_tios->c_cc[VERASE] = l_tios->c_cc[LX_VERASE]; + s_tios->c_cc[VKILL] = l_tios->c_cc[LX_VKILL]; + s_tios->c_cc[VREPRINT] = l_tios->c_cc[LX_VREPRINT]; + s_tios->c_cc[VLNEXT] = l_tios->c_cc[LX_VLNEXT]; + s_tios->c_cc[VWERASE] = l_tios->c_cc[LX_VWERASE]; + s_tios->c_cc[VINTR] = l_tios->c_cc[LX_VINTR]; + s_tios->c_cc[VQUIT] = l_tios->c_cc[LX_VQUIT]; + s_tios->c_cc[VSWTCH] = l_tios->c_cc[LX_VSWTC]; + s_tios->c_cc[VSTART] = l_tios->c_cc[LX_VSTART]; + s_tios->c_cc[VSTOP] = l_tios->c_cc[LX_VSTOP]; + s_tios->c_cc[VSUSP] = l_tios->c_cc[LX_VSUSP]; + s_tios->c_cc[VDISCARD] = l_tios->c_cc[LX_VDISCARD]; +} + +static void +l2s_termio(struct lx_termio *l_tio, struct termio *s_tio) +{ + ASSERT((l_tio != NULL) && (s_tio != NULL)); + + bzero(s_tio, sizeof (*s_tio)); + + s_tio->c_iflag = l_tio->c_iflag; + s_tio->c_oflag = l_tio->c_oflag; + s_tio->c_cflag = l_tio->c_cflag; + s_tio->c_lflag = l_tio->c_lflag; + + if (s_tio->c_lflag & ICANON) { + s_tio->c_cc[VEOF] = l_tio->c_cc[LX_VEOF]; + } else { + s_tio->c_cc[VMIN] = l_tio->c_cc[LX_VMIN]; + s_tio->c_cc[VTIME] = l_tio->c_cc[LX_VTIME]; + } + + s_tio->c_cc[VINTR] = l_tio->c_cc[LX_VINTR]; + s_tio->c_cc[VQUIT] = 
l_tio->c_cc[LX_VQUIT]; + s_tio->c_cc[VERASE] = l_tio->c_cc[LX_VERASE]; + s_tio->c_cc[VKILL] = l_tio->c_cc[LX_VKILL]; + s_tio->c_cc[VSWTCH] = l_tio->c_cc[LX_VSWTC]; +} + +static void +termios2lx_cc(struct lx_termios *l_tios, struct lx_cc *lio) +{ + ASSERT((l_tios != NULL) && (lio != NULL)); + + bzero(lio, sizeof (*lio)); + + lio->veof = l_tios->c_cc[LX_VEOF]; + lio->veol = l_tios->c_cc[LX_VEOL]; + lio->vmin = l_tios->c_cc[LX_VMIN]; + lio->vtime = l_tios->c_cc[LX_VTIME]; +} + +static void +termio2lx_cc(struct lx_termio *l_tio, struct lx_cc *lio) +{ + ASSERT((l_tio != NULL) && (lio != NULL)); + + bzero(lio, sizeof (*lio)); + + lio->veof = l_tio->c_cc[LX_VEOF]; + lio->veol = 0; + lio->vmin = l_tio->c_cc[LX_VMIN]; + lio->vtime = l_tio->c_cc[LX_VTIME]; +} + +static void +s2l_termios(struct termios *s_tios, struct lx_termios *l_tios) +{ + ASSERT((s_tios != NULL) && (l_tios != NULL)); + + bzero(l_tios, sizeof (*l_tios)); + + l_tios->c_iflag = s_tios->c_iflag; + l_tios->c_oflag = s_tios->c_oflag; + l_tios->c_cflag = s_tios->c_cflag; + l_tios->c_lflag = s_tios->c_lflag; + + if (s_tios->c_lflag & ICANON) { + l_tios->c_cc[LX_VEOF] = s_tios->c_cc[VEOF]; + l_tios->c_cc[LX_VEOL] = s_tios->c_cc[VEOL]; + } else { + l_tios->c_cc[LX_VMIN] = s_tios->c_cc[VMIN]; + l_tios->c_cc[LX_VTIME] = s_tios->c_cc[VTIME]; + } + + l_tios->c_cc[LX_VEOL2] = s_tios->c_cc[VEOL2]; + l_tios->c_cc[LX_VERASE] = s_tios->c_cc[VERASE]; + l_tios->c_cc[LX_VKILL] = s_tios->c_cc[VKILL]; + l_tios->c_cc[LX_VREPRINT] = s_tios->c_cc[VREPRINT]; + l_tios->c_cc[LX_VLNEXT] = s_tios->c_cc[VLNEXT]; + l_tios->c_cc[LX_VWERASE] = s_tios->c_cc[VWERASE]; + l_tios->c_cc[LX_VINTR] = s_tios->c_cc[VINTR]; + l_tios->c_cc[LX_VQUIT] = s_tios->c_cc[VQUIT]; + l_tios->c_cc[LX_VSWTC] = s_tios->c_cc[VSWTCH]; + l_tios->c_cc[LX_VSTART] = s_tios->c_cc[VSTART]; + l_tios->c_cc[LX_VSTOP] = s_tios->c_cc[VSTOP]; + l_tios->c_cc[LX_VSUSP] = s_tios->c_cc[VSUSP]; + l_tios->c_cc[LX_VDISCARD] = s_tios->c_cc[VDISCARD]; +} + +static void +s2l_termio(struct termio *s_tio, struct lx_termio *l_tio) +{ + ASSERT((s_tio != NULL) && (l_tio != NULL)); + + bzero(l_tio, sizeof (*l_tio)); + + l_tio->c_iflag = s_tio->c_iflag; + l_tio->c_oflag = s_tio->c_oflag; + l_tio->c_cflag = s_tio->c_cflag; + l_tio->c_lflag = s_tio->c_lflag; + + if (s_tio->c_lflag & ICANON) { + l_tio->c_cc[LX_VEOF] = s_tio->c_cc[VEOF]; + } else { + l_tio->c_cc[LX_VMIN] = s_tio->c_cc[VMIN]; + l_tio->c_cc[LX_VTIME] = s_tio->c_cc[VTIME]; + } + + l_tio->c_cc[LX_VINTR] = s_tio->c_cc[VINTR]; + l_tio->c_cc[LX_VQUIT] = s_tio->c_cc[VQUIT]; + l_tio->c_cc[LX_VERASE] = s_tio->c_cc[VERASE]; + l_tio->c_cc[LX_VKILL] = s_tio->c_cc[VKILL]; + l_tio->c_cc[LX_VSWTC] = s_tio->c_cc[VSWTCH]; +} + +static void +set_lx_cc(vnode_t *vp, struct lx_cc *lio) +{ + struct lx_cc *cur; + /* + * Linux expects that the termio/termios control characters are + * preserved more strictly than illumos supports. In order to preserve + * the illusion that the characters are maintained, they are stored as + * vnode-specific data. 
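+	 *
+	 * The underlying conflict (noted here for illustration; see the
+	 * illumos termios.h): VMIN and VTIME share c_cc slots with VEOF
+	 * and VEOL --
+	 *
+	 *	VEOF == VMIN == 4	(holds VEOF if ICANON, else VMIN)
+	 *	VEOL == VTIME == 5	(holds VEOL if ICANON, else VTIME)
+	 *
+	 * -- while Linux gives all four their own slots. The lx_cc copy
+	 * stashed here is what lets us hand back whichever pair the slot
+	 * itself lost.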
+ */ + mutex_enter(&vp->v_vsd_lock); + cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd); + if (cur == NULL) { + cur = kmem_alloc(sizeof (struct lx_cc), KM_SLEEP); + bcopy(lio, cur, sizeof (struct lx_cc)); + (void) vsd_set(vp, lx_ioctl_vsd, cur); + } else { + bcopy(lio, cur, sizeof (struct lx_cc)); + } + mutex_exit(&vp->v_vsd_lock); +} + +static int +get_lx_cc(vnode_t *vp, struct lx_cc *lio) +{ + struct lx_cc *cur; + int rv = 1; + mutex_enter(&vp->v_vsd_lock); + cur = (struct lx_cc *)vsd_get(vp, lx_ioctl_vsd); + if (cur != NULL) { + bcopy(cur, lio, sizeof (*lio)); + rv = 0; + } + mutex_exit(&vp->v_vsd_lock); + return (rv); +} + +/* Socket helpers */ + +typedef struct lx_ifreq32 { + char ifr_name[IFNAMSIZ]; + union { + struct sockaddr ifru_addr; + }; +} lx_ifreq32_t; + +typedef struct lx_ifreq64 { + char ifr_name[IFNAMSIZ]; + union { + struct sockaddr ifru_addr; + /* pad this out to the Linux size */ + uint64_t ifmap[3]; + }; +} lx_ifreq64_t; + +typedef struct lx_ifconf32 { + int32_t if_len; + caddr32_t if_buf; +} lx_ifconf32_t; + +typedef struct lx_ifconf64 { + int32_t if_len; + caddr_t if_buf; +} lx_ifconf64_t; + + +/* Generic translators */ + +static int +ict_pass(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int error = 0; + int rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_fionbio(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp; + int32_t iflag, flags; + int error; + + if (copyin((caddr_t)arg, &iflag, sizeof (iflag))) + return (set_errno(EFAULT)); + + mutex_enter(&fp->f_tlock); + vp = fp->f_vnode; + flags = fp->f_flag; + /* Linux sets NONBLOCK instead of FIONBIO */ + if (iflag) + flags |= FNONBLOCK; + else + flags &= ~FNONBLOCK; + /* push the flag down */ + error = VOP_SETFL(vp, fp->f_flag, flags, fp->f_cred, NULL); + fp->f_flag = flags; + mutex_exit(&fp->f_tlock); + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_fionread(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp; + struct vattr vattr; + int error = 0; + int rv; + /* + * offset is int32_t because that is what FIONREAD is defined in terms + * of. We cap at INT_MAX as in other cases for this ioctl. 
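+	 * For example (illustrative numbers only): with va_size at 5GB
+	 * and f_offset at 1GB, the true residual of 4GB overflows
+	 * int32_t, so the MIN() below reports INT_MAX (2147483647)
+	 * rather than letting the value wrap negative.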
+ */ + int32_t offset; + + vp = fp->f_vnode; + + if (vp->v_type == VREG || vp->v_type == VDIR) { + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vp, &vattr, 0, fp->f_cred, NULL); + if (error != 0) + return (set_errno(error)); + offset = MIN(vattr.va_size - fp->f_offset, INT_MAX); + if (copyout(&offset, (caddr_t)arg, sizeof (offset))) + return (set_errno(EFAULT)); + } else { + error = VOP_IOCTL(vp, FIONREAD, arg, FLUSER(fp), fp->f_cred, + &rv, NULL); + if (error) + return (set_errno(error)); + } + return (0); +} + +/* Terminal-related translators */ + +static int +ict_tcsets(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios; + struct lx_cc lio; + int error, rv; + + ASSERT(cmd == TCSETS || cmd == TCSETSW || cmd == TCSETSF); + + if (copyin((struct lx_termios *)arg, &l_tios, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + termios2lx_cc(&l_tios, &lio); + l2s_termios(&l_tios, &s_tios); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + /* preserve lx_cc */ + set_lx_cc(fp->f_vnode, &lio); + + return (0); +} + +static int +ict_tcseta(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termio l_tio; + struct termio s_tio; + struct lx_cc lio; + int error, rv; + + ASSERT(cmd == TCSETA || cmd == TCSETAW || cmd == TCSETAF); + + if (copyin((struct lx_termio *)arg, &l_tio, sizeof (l_tio)) != 0) + return (set_errno(EFAULT)); + l2s_termio(&l_tio, &s_tio); + termio2lx_cc(&l_tio, &lio); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + /* preserve lx_cc */ + set_lx_cc(fp->f_vnode, &lio); + + return (0); +} + +static int +ict_tcgets_ptm(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios, *s_tiosd; + uint_t s_tiosl; + + /* get termios defaults */ + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_NOTPROM, "ttymodes", (uchar_t **)&s_tiosd, + &s_tiosl) != DDI_SUCCESS) + return (EIO); + ASSERT(s_tiosl == sizeof (*s_tiosd)); + bcopy(s_tiosd, &s_tios, sizeof (s_tios)); + ddi_prop_free(s_tiosd); + + /* Now munge the data to how Linux wants it. */ + s2l_termios(&s_tios, &l_tios); + if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_tcgets_native(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termios l_tios; + struct termios s_tios; + struct lx_cc lio; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tios, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + + /* Now munge the data to how Linux wants it. 
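+	 * The lx ptm master has no termios state of its own, which is
+	 * why the "ttymodes" defaults fetched above stand in for it;
+	 * Linux programs expect TCGETS on /dev/ptmx to succeed. The
+	 * four flag words copy across unchanged, so s2l_termios() only
+	 * has to remap the c_cc indices.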
*/ + s2l_termios(&s_tios, &l_tios); + + /* return preserved lx_cc */ + if (get_lx_cc(fp->f_vnode, &lio) == 0) { + l_tios.c_cc[LX_VEOF] = lio.veof; + l_tios.c_cc[LX_VEOL] = lio.veol; + l_tios.c_cc[LX_VMIN] = lio.vmin; + l_tios.c_cc[LX_VTIME] = lio.vtime; + } + + if (copyout(&l_tios, (struct lx_termios *)arg, sizeof (l_tios)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_tcgets(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + if (getmajor(fp->f_vnode->v_rdev) == ddi_name_to_major(LX_PTM_DRV)) + return (ict_tcgets_ptm(fp, cmd, arg, lxcmd)); + else + return (ict_tcgets_native(fp, cmd, arg, lxcmd)); +} + +static int +ict_tcgeta(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct lx_termio l_tio; + struct termio s_tio; + struct lx_cc lio; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&s_tio, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error) + return (set_errno(error)); + + s2l_termio(&s_tio, &l_tio); + /* return preserved lx_cc */ + if (get_lx_cc(fp->f_vnode, &lio) == 0) { + l_tio.c_cc[LX_VEOF] = lio.veof; + l_tio.c_cc[LX_VMIN] = lio.vmin; + l_tio.c_cc[LX_VTIME] = lio.vtime; + } + + if (copyout(&l_tio, (struct lx_termios *)arg, sizeof (l_tio)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_tiocspgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t lpid, spid, tid; + int error, rv; + + /* Converting to the illumos pid is necessary */ + if (copyin((pid_t *)arg, &lpid, sizeof (lpid)) < 0) + return (set_errno(EFAULT)); + if (lx_lpid_to_spair(lpid, &spid, &tid) < 0) + return (set_errno(EPERM)); + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spid, + fp->f_flag |FKIOCTL, fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_tcsbrkp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int rv, error; + /* use null duration to emulate TCSBRKP */ + int dur = 0; + error = VOP_IOCTL(fp->f_vnode, TCSBRK, (intptr_t)&dur, + FLFAKE(fp), fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_tiocgpgrp(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t spgrp; + int error, rv; + + error = VOP_IOCTL(fp->f_vnode, cmd, (intptr_t)&spgrp, FLFAKE(fp), + fp->f_cred, &rv, NULL); + if (error == 0) { + if (spgrp == curproc->p_zone->zone_proc_initpid) { + spgrp = 1; + } + if (copyout(&spgrp, (caddr_t)arg, sizeof (spgrp))) { + return (set_errno(EFAULT)); + } + } + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_sptlock(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct strioctl istr; + int error, rv; + + istr.ic_cmd = UNLKPT; + istr.ic_len = 0; + istr.ic_timout = 0; + istr.ic_dp = NULL; + error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr, + fp->f_flag |FKIOCTL, fp->f_cred, &rv, NULL); + /* + * The success/fail return values are different between Linux + * and illumos. Linux expects 0 or -1. Illumos can return + * positive number on success. + */ + return ((error != 0) ? set_errno(error) : 0); +} + +static int +ict_gptn(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct strioctl istr; + cred_t *cr; + pt_own_t pto; + int error, rv; + int ptyno; + + /* This operation is only valid for the lx_ptm device. 
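+	 * TIOCGPTN is how a Linux process learns which slave belongs to
+	 * a pty master. A typical consumer looks like this (userland
+	 * sketch, not code from this module; masterfd is assumed to be
+	 * an open /dev/ptmx descriptor):
+	 *
+	 *	char path[32];
+	 *	int ptyno;
+	 *	if (ioctl(masterfd, TIOCGPTN, &ptyno) == 0)
+	 *		(void) snprintf(path, sizeof (path),
+	 *		    "/dev/pts/%d", ptyno);
+	 *
+	 * hence the minor-number arithmetic at the bottom of this
+	 * function.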
*/ + if (getmajor(fp->f_vnode->v_rdev) != ddi_name_to_major(LX_PTM_DRV)) + return (set_errno(ENOTTY)); + + cr = CRED(); + pto.pto_ruid = cr->cr_uid; + pto.pto_rgid = cr->cr_gid; + + istr.ic_cmd = OWNERPT; + istr.ic_len = sizeof (pto); + istr.ic_timout = 0; + istr.ic_dp = (char *)&pto; + error = VOP_IOCTL(fp->f_vnode, I_STR, (intptr_t)&istr, + FLFAKE(fp), fp->f_cred, &rv, NULL); + + if (error) + return (set_errno((error == ENOTTY) ? error: EACCES)); + + ptyno = getminor(fp->f_vnode->v_rdev) - 1; + if (copyout(&ptyno, (caddr_t)arg, sizeof (ptyno))) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_tiocgwinsz(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + int error, rv; + struct winsize winsize; + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + + if (error == EINVAL) { + bzero(&winsize, sizeof (winsize)); + if (copyout(&winsize, (caddr_t)arg, sizeof (winsize))) + return (set_errno(EFAULT)); + } else if (error != 0) { + return (set_errno(error)); + } + + return (0); +} + +static int +ict_tiocsctty(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + pid_t ttysid, mysid; + int error, rv; + proc_t *p = curproc; + + /* getsid */ + mutex_enter(&p->p_splock); + mysid = p->p_sessp->s_sid; + mutex_exit(&p->p_splock); + + /* + * Report success if we already control the tty. + * If no one controls it, TIOCSCTTY will change that later. + */ + error = VOP_IOCTL(fp->f_vnode, TIOCGSID, (intptr_t)&ttysid, + FLFAKE(fp), fp->f_cred, &rv, NULL); + if (error == 0 && ttysid == mysid) + return (0); + + /* + * Need to make sure we're a session leader, otherwise the + * TIOCSCTTY ioctl will fail. + */ + mutex_enter(&pidlock); + if (p->p_sessp->s_sidp != p->p_pidp && !pgmembers(p->p_pid)) { + mutex_exit(&pidlock); + sess_create(); + } else { + mutex_exit(&pidlock); + } + + error = VOP_IOCTL(fp->f_vnode, cmd, 0, FLUSER(fp), + fp->f_cred, &rv, NULL); + return ((error != 0) ? set_errno(error) : 0); +} + +/* Socket-related translators */ + +static int +ict_siocatmark(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + vnode_t *vp = fp->f_vnode; + int error, rv; + /* + * Linux expects a SIOCATMARK of a UDP socket to return ENOTTY, while + * Illumos allows it. Linux prior to 2.6.39 returned EINVAL for this. + */ + if (vp->v_type != VSOCK || VTOSO(vp)->so_type != SOCK_STREAM) + return (set_errno(ENOTTY)); + + error = VOP_IOCTL(fp->f_vnode, cmd, arg, FLUSER(fp), fp->f_cred, &rv, + NULL); + if (error) + return (set_errno(error)); + + return (0); +} + +static int +ict_if_ioctl(vnode_t *vn, int cmd, intptr_t arg, int flags, cred_t *cred) +{ + int error, rv; + lx_zone_data_t *lxzd = ztolxzd(curproc->p_zone); + ksocket_t ks; + + ASSERT(lxzd != NULL); + ks = lxzd->lxzd_ioctl_sock; + + /* + * For ioctls of this type, Illumos is strict about address family + * whereas Linux is lenient. This strictness can be avoided by using + * an internal AF_INET ksocket. 
+ */ + if (ks != NULL) { + error = ksocket_ioctl(ks, cmd, arg, &rv, cred); + } else { + error = VOP_IOCTL(vn, cmd, arg, flags, cred, &rv, NULL); + } + + return (error); +} + +static int +ict_sioghwaddr(file_t *fp, struct lifreq *lreq) +{ + struct sockaddr_dl *sdl = (struct sockaddr_dl *)&lreq->lifr_addr; + struct sockaddr hwaddr; + int error, size; + + error = ict_if_ioctl(fp->f_vnode, SIOCGLIFHWADDR, (intptr_t)lreq, + FLFAKE(fp), fp->f_cred); + + if (error == EADDRNOTAVAIL && + strncmp(lreq->lifr_name, "lo", 2) == 0) { + /* Emulate success on suspected loopbacks */ + sdl->sdl_type = DL_LOOP; + sdl->sdl_alen = ETHERADDRL; + bzero(LLADDR(sdl), sdl->sdl_alen); + error = 0; + } + + if (error == 0) { + bzero(&hwaddr, sizeof (hwaddr)); + lx_stol_hwaddr(sdl, &hwaddr, &size); + bcopy(&hwaddr, &lreq->lifr_addr, + size + sizeof (sdl->sdl_family)); + } + + return (error); +} + +static int +ict_siolifreq(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + struct ifreq req; + struct lifreq lreq; + int error, len; + + /* Convert from Linux ifreq to illumos lifreq */ + if (curproc->p_model == DATAMODEL_LP64) + len = sizeof (lx_ifreq64_t); + else + len = sizeof (lx_ifreq32_t); + if (copyin((struct ifreq *)arg, &req, len) != 0) + return (set_errno(EFAULT)); + bzero(&lreq, sizeof (lreq)); + strncpy(lreq.lifr_name, req.ifr_name, IFNAMSIZ); + bcopy(&req.ifr_ifru, &lreq.lifr_lifru, len - IFNAMSIZ); + lx_ifname_convert(lreq.lifr_name, LX_IF_TONATIVE); + + switch (cmd) { + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + case SIOCGIFMTU: + case SIOCSIFMTU: + /* + * Convert cmd from SIO*IF* to SIO*LIF*. + * This is needed since Linux allows ifreq operations on ipv6 + * sockets where illumos does not. + */ + cmd = ((cmd & IOC_INOUT) | + _IOW('i', ((cmd & 0xff) + 100), struct lifreq)); + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFINDEX: + cmd = SIOCGLIFINDEX; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFFLAGS: + cmd = SIOCGLIFFLAGS; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + if (error == 0) + lx_ifflags_convert(&lreq.lifr_flags, LX_IF_FROMNATIVE); + break; + case SIOCSIFFLAGS: + cmd = SIOCSLIFFLAGS; + lx_ifflags_convert(&lreq.lifr_flags, LX_IF_TONATIVE); + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + break; + case SIOCGIFHWADDR: + error = ict_sioghwaddr(fp, &lreq); + break; + case LX_SIOCGIFTXQLEN: + /* + * Illumos lacks the notion of txqlen. Confirm the provided + * interface is valid with SIOCGLIFINDEX and return a fake + * txqlen of 1. Loopback devices will report txqlen of 0. + */ + if (strncmp(lreq.lifr_name, "lo", 2) == 0) { + lreq.lifr_index = 0; + error = 0; + break; + } + cmd = SIOCGLIFINDEX; + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&lreq, + FLFAKE(fp), fp->f_cred); + if (error == 0) { + /* lifr_index aliases to the qlen field */ + lreq.lifr_index = 1; + } + break; + case LX_SIOCSIFHWADDR: + /* + * We're not going to support SIOCSIFHWADDR, but we need to be + * able to check the result of the copyin first to see if the + * command should have returned EFAULT. 
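+	 * In other words, the copyin at the top of this function has
+	 * already surfaced any EFAULT, so a well-formed
+	 * LX_SIOCSIFHWADDR request simply falls through to the EINVAL
+	 * in the default case below.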
+ */ + default: + error = EINVAL; + } + + if (error != 0) + return (set_errno(error)); + + /* Convert back to a Linux ifreq */ + lx_ifname_convert(lreq.lifr_name, LX_IF_FROMNATIVE); + bzero(&req, sizeof (req)); + strncpy(req.ifr_name, lreq.lifr_name, IFNAMSIZ); + bcopy(&lreq.lifr_lifru, &req.ifr_ifru, len - IFNAMSIZ); + + if (copyout(&req, (struct lifreq *)arg, len) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +static int +ict_siocgifconf32(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_ifconf32_t conf; + lx_ifreq32_t *oreq; + struct ifconf sconf; + int ifcount, error, i, buf_len; + + if (copyin((lx_ifconf32_t *)arg, &conf, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + + /* They want to know how many interfaces there are. */ + if (conf.if_len <= 0 || conf.if_buf == NULL) { + error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM, + (intptr_t)&ifcount, FLFAKE(fp), fp->f_cred); + if (error != 0) + return (set_errno(error)); + + conf.if_len = ifcount * sizeof (lx_ifreq32_t); + + if (copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + return (0); + } else { + ifcount = conf.if_len / sizeof (lx_ifreq32_t); + } + + /* Get interface configuration list. */ + sconf.ifc_len = ifcount * sizeof (struct ifreq); + sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP); + + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp), + fp->f_cred); + if (error != 0) { + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + return (set_errno(error)); + } + + /* Convert data to Linux format & rename interfaces */ + buf_len = ifcount * sizeof (lx_ifreq32_t); + oreq = (lx_ifreq32_t *)kmem_alloc(buf_len, KM_SLEEP); + for (i = 0; i < sconf.ifc_len / sizeof (struct ifreq); i++) { + bcopy(&sconf.ifc_req[i], oreq + i, sizeof (lx_ifreq32_t)); + lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE); + } + conf.if_len = i * sizeof (*oreq); + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + + error = 0; + if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 || + copyout(&conf, (lx_ifconf32_t *)arg, sizeof (conf)) != 0) + error = set_errno(EFAULT); + + kmem_free(oreq, buf_len); + return (error); +} + +static int +ict_siocgifconf64(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + lx_ifconf64_t conf; + lx_ifreq64_t *oreq; + struct ifconf sconf; + int ifcount, error, i, buf_len; + + if (copyin((lx_ifconf64_t *)arg, &conf, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + + /* They want to know how many interfaces there are. */ + if (conf.if_len <= 0 || conf.if_buf == NULL) { + error = ict_if_ioctl(fp->f_vnode, SIOCGIFNUM, + (intptr_t)&ifcount, FLFAKE(fp), fp->f_cred); + if (error != 0) + return (set_errno(error)); + + conf.if_len = ifcount * sizeof (lx_ifreq64_t); + + if (copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0) + return (set_errno(EFAULT)); + return (0); + } else { + ifcount = conf.if_len / sizeof (lx_ifreq64_t); + } + + /* Get interface configuration list. 
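+	 * (The usual two-step dance: a caller that first passed
+	 * if_len == 0 above and learned it needs, say,
+	 * 3 * sizeof (lx_ifreq64_t) for three interfaces retries with
+	 * a buffer of that size and lands here.)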
*/ + sconf.ifc_len = ifcount * sizeof (struct ifreq); + sconf.ifc_req = (struct ifreq *)kmem_alloc(sconf.ifc_len, KM_SLEEP); + + error = ict_if_ioctl(fp->f_vnode, cmd, (intptr_t)&sconf, FLFAKE(fp), + fp->f_cred); + if (error != 0) { + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + return (set_errno(error)); + } + + /* Convert data to Linux format & rename interfaces */ + buf_len = ifcount * sizeof (lx_ifreq64_t); + oreq = (lx_ifreq64_t *)kmem_alloc(buf_len, KM_SLEEP); + for (i = 0; i < sconf.ifc_len / sizeof (struct ifreq); i++) { + bcopy(&sconf.ifc_req[i], oreq + i, sizeof (lx_ifreq64_t)); + lx_ifname_convert(oreq[i].ifr_name, LX_IF_FROMNATIVE); + } + conf.if_len = i * sizeof (*oreq); + kmem_free(sconf.ifc_req, ifcount * sizeof (struct ifreq)); + + error = 0; + if (copyout(oreq, (caddr_t)(uintptr_t)conf.if_buf, conf.if_len) != 0 || + copyout(&conf, (lx_ifconf64_t *)arg, sizeof (conf)) != 0) + error = set_errno(EFAULT); + + kmem_free(oreq, buf_len); + return (error); +} + +static int +ict_siocgifconf(file_t *fp, int cmd, intptr_t arg, int lxcmd) +{ + if (curproc->p_model == DATAMODEL_LP64) + return (ict_siocgifconf64(fp, cmd, arg, lxcmd)); + else + return (ict_siocgifconf32(fp, cmd, arg, lxcmd)); +} + +/* Structure used to define an ioctl translator. */ +typedef struct ioc_cmd_translator { + int ict_lxcmd; + int ict_cmd; + int (*ict_func)(file_t *fp, int cmd, intptr_t arg, int lxcmd); +} ioc_cmd_translator_t; + +#define IOC_CMD_TRANSLATOR_PASS(ioc_cmd_sym) \ + { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass }, + +#define IOC_CMD_TRANSLATOR_FILTER(ioc_cmd_sym, ioct_handler) \ + { (int)LX_##ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler }, + +#define IOC_CMD_TRANSLATOR_CUSTOM(ioc_cmd_sym, ioct_handler) \ + { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ioct_handler }, + +#define IOC_CMD_TRANSLATOR_PTHRU(ioc_cmd_sym) \ + { (int)ioc_cmd_sym, (int)ioc_cmd_sym, ict_pass }, + +#define IOC_CMD_TRANSLATOR_END \ + {0, 0, NULL} + +static ioc_cmd_translator_t ioc_translators[] = { + IOC_CMD_TRANSLATOR_FILTER(FIONBIO, ict_fionbio) + IOC_CMD_TRANSLATOR_FILTER(FIONREAD, ict_fionread) + IOC_CMD_TRANSLATOR_PASS(FIOGETOWN) + IOC_CMD_TRANSLATOR_PASS(FIOASYNC) + + /* streams related */ + IOC_CMD_TRANSLATOR_PASS(TCXONC) + IOC_CMD_TRANSLATOR_PASS(TCFLSH) + IOC_CMD_TRANSLATOR_PASS(TIOCEXCL) + IOC_CMD_TRANSLATOR_PASS(TIOCNXCL) + IOC_CMD_TRANSLATOR_PASS(TIOCSTI) + IOC_CMD_TRANSLATOR_PASS(TIOCSWINSZ) + IOC_CMD_TRANSLATOR_PASS(TIOCMBIS) + IOC_CMD_TRANSLATOR_PASS(TIOCMBIC) + IOC_CMD_TRANSLATOR_PASS(TIOCMSET) + IOC_CMD_TRANSLATOR_PASS(TIOCSETD) + IOC_CMD_TRANSLATOR_PASS(TCSBRK) + + /* terminal related */ + IOC_CMD_TRANSLATOR_PASS(TIOCGETD) + IOC_CMD_TRANSLATOR_PASS(TIOCGSID) + IOC_CMD_TRANSLATOR_PASS(TIOCNOTTY) + IOC_CMD_TRANSLATOR_PASS(TIOCPKT) + + IOC_CMD_TRANSLATOR_FILTER(TCSETS, ict_tcsets) + IOC_CMD_TRANSLATOR_FILTER(TCSETSW, ict_tcsets) + IOC_CMD_TRANSLATOR_FILTER(TCSETSF, ict_tcsets) + IOC_CMD_TRANSLATOR_FILTER(TCSETA, ict_tcseta) + IOC_CMD_TRANSLATOR_FILTER(TCSETAW, ict_tcseta) + IOC_CMD_TRANSLATOR_FILTER(TCSETAF, ict_tcseta) + IOC_CMD_TRANSLATOR_FILTER(TCGETS, ict_tcgets) + IOC_CMD_TRANSLATOR_FILTER(TCGETA, ict_tcgeta) + IOC_CMD_TRANSLATOR_FILTER(TIOCGWINSZ, ict_tiocgwinsz) + IOC_CMD_TRANSLATOR_CUSTOM(LX_TCSBRKP, ict_tcsbrkp) + IOC_CMD_TRANSLATOR_FILTER(TIOCSPGRP, ict_tiocspgrp) + IOC_CMD_TRANSLATOR_FILTER(TIOCGPGRP, ict_tiocgpgrp) + IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCSPTLCK, ict_sptlock) + IOC_CMD_TRANSLATOR_CUSTOM(LX_TIOCGPTN, ict_gptn) + IOC_CMD_TRANSLATOR_FILTER(TIOCSCTTY, ict_tiocsctty) + + /* socket 
related */ + IOC_CMD_TRANSLATOR_PASS(SIOCSPGRP) + IOC_CMD_TRANSLATOR_PASS(SIOCGPGRP) + IOC_CMD_TRANSLATOR_PASS(SIOCGSTAMP) + IOC_CMD_TRANSLATOR_FILTER(SIOCATMARK, ict_siocatmark) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFFLAGS, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCSIFFLAGS, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFADDR, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCSIFADDR, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFDSTADDR, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCSIFDSTADDR, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFBRDADDR, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCSIFBRDADDR, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFNETMASK, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCSIFNETMASK, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMETRIC, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMETRIC, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFMTU, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCSIFMTU, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFHWADDR, ict_siolifreq) + IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCSIFHWADDR, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFINDEX, ict_siolifreq) + IOC_CMD_TRANSLATOR_CUSTOM(LX_SIOCGIFTXQLEN, ict_siolifreq) + IOC_CMD_TRANSLATOR_FILTER(SIOCGIFCONF, ict_siocgifconf) + + /* dtrace related */ + IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADD) + IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_REMOVE) + IOC_CMD_TRANSLATOR_PTHRU(DTRACEHIOC_ADDDOF) + + IOC_CMD_TRANSLATOR_END +}; + +static void +lx_ioctl_vsd_free(void *data) +{ + kmem_free(data, sizeof (struct lx_cc)); +} + +void +lx_ioctl_init() +{ + vsd_create(&lx_ioctl_vsd, lx_ioctl_vsd_free); +} + +void +lx_ioctl_fini() +{ + vsd_destroy(&lx_ioctl_vsd); +} + +long +lx_ioctl(int fdes, int cmd, intptr_t arg) +{ + file_t *fp; + int res = 0; + ioc_cmd_translator_t *ict; + + if (cmd == LX_FIOCLEX || cmd == LX_FIONCLEX) { + res = f_setfd_error(fdes, (cmd == LX_FIOCLEX) ? FD_CLOEXEC : 0); + return ((res != 0) ? set_errno(res) : 0); + } + + if ((fp = getf(fdes)) == NULL) + return (set_errno(EBADF)); + + /* + * Today, none of the ioctls supported by the emulation possess + * overlapping cmd values. Because of that, no type interrogation of + * the fd is done before executing specific ioctl emulation. It's + * assumed that the vnode-specific logic called by the emulation + * function will reject ioctl commands not supported by the fd. + */ + ict = ioc_translators; + while (ict->ict_func != NULL) { + if (ict->ict_lxcmd == cmd) + break; + ict++; + } + + if (ict->ict_func != NULL) { + res = ict->ict_func(fp, ict->ict_cmd, arg, ict->ict_lxcmd); + } else { + res = set_errno(EINVAL); + } + + releasef(fdes); + return (res); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_kill.c b/usr/src/uts/common/brand/lx/syscall/lx_kill.c new file mode 100644 index 0000000000..908d83b6be --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_kill.c @@ -0,0 +1,399 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + + + + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/zone.h> +#include <sys/thread.h> +#include <sys/signal.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <lx_signum.h> +#include <sys/contract/process_impl.h> + +extern int kill(pid_t, int); + +/* + * Check if it is legal to send this signal to the init process. Linux + * kill(2) semantics dictate that no _unhandled_ signal may be sent to pid + * 1. + */ +static int +init_sig_check(int sig, pid_t pid) +{ + proc_t *p; + int rv = 0; + + mutex_enter(&pidlock); + + if (((p = prfind(pid)) == NULL) || (p->p_stat == SIDL)) + rv = ESRCH; + else if (sig && (sigismember(&cantmask, sig) || + (PTOU(p)->u_signal[sig-1] == SIG_DFL) || + (PTOU(p)->u_signal[sig-1] == SIG_IGN))) + rv = EPERM; + + mutex_exit(&pidlock); + + return (rv); +} + +static long +lx_thrkill(pid_t tgid, pid_t pid, int lx_sig, boolean_t tgkill) +{ + kthread_t *t; + proc_t *pp, *cp = curproc; + pid_t initpid; + sigqueue_t *sqp; + int tid = 1; /* default tid */ + int sig, rv; + + /* + * Unlike kill(2), Linux tkill(2) doesn't allow signals to + * be sent to process IDs <= 0 as it doesn't overlay any special + * semantics on the pid. + */ + if ((pid <= 0) || ((lx_sig < 0) || (lx_sig > LX_NSIG)) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + /* + * If the Linux pid is 1, translate the pid to the actual init + * pid for the zone. Note that Linux dictates that no unhandled + * signals may be sent to init, so check for that, too. + * + * Otherwise, extract the tid and real pid from the Linux pid. + */ + initpid = cp->p_zone->zone_proc_initpid; + if (pid == 1) + pid = initpid; + if ((pid == initpid) && ((rv = init_sig_check(sig, pid)) != 0)) + return (set_errno(rv)); + else if (lx_lpid_to_spair(pid, &pid, &tid) < 0) + return (set_errno(ESRCH)); + + if (tgkill && tgid != pid) + return (set_errno(ESRCH)); + + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + + /* + * Find the process for the passed pid... 
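+	 * using the usual handoff: pidlock is held across prfind() so
+	 * the proc_t cannot vanish, p_lock is taken, and only then is
+	 * pidlock dropped before the thread list is inspected.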
+ */ + mutex_enter(&pidlock); + if (((pp = prfind(pid)) == NULL) || (pp->p_stat == SIDL)) { + mutex_exit(&pidlock); + rv = set_errno(ESRCH); + goto free_and_exit; + } + mutex_enter(&pp->p_lock); + mutex_exit(&pidlock); + + /* + * Deny permission to send the signal if either of the following + * is true: + * + * + The signal is SIGCONT and the target pid is not in the same + * session as the sender + * + * + prochasprocperm() shows the user lacks sufficient permission + * to send the signal to the target pid + */ + if (((sig == SIGCONT) && (pp->p_sessp != cp->p_sessp)) || + (!prochasprocperm(pp, cp, CRED()))) { + mutex_exit(&pp->p_lock); + rv = set_errno(EPERM); + goto free_and_exit; + } + + /* check for the tid */ + if ((t = idtot(pp, tid)) == NULL) { + mutex_exit(&pp->p_lock); + rv = set_errno(ESRCH); + goto free_and_exit; + } + + /* a signal of 0 means just check for the existence of the thread */ + if (lx_sig == 0) { + mutex_exit(&pp->p_lock); + rv = 0; + goto free_and_exit; + } + + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = SI_LWP; + sqp->sq_info.si_pid = cp->p_pid; + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(pp, t, sqp); + + mutex_exit(&pp->p_lock); + + return (0); + +free_and_exit: + kmem_free(sqp, sizeof (sigqueue_t)); + return (rv); +} + +long +lx_tgkill(pid_t tgid, pid_t pid, int lx_sig) +{ + return (lx_thrkill(tgid, pid, lx_sig, B_TRUE)); +} + +long +lx_tkill(pid_t pid, int lx_sig) +{ + return (lx_thrkill(0, pid, lx_sig, B_FALSE)); +} + +long +lx_kill(pid_t lx_pid, int lx_sig) +{ + pid_t s_pid, initpid; + sigsend_t v; + zone_t *zone = curproc->p_zone; + struct proc *p; + int err, sig, nfound; + + if ((lx_sig < 0) || (lx_sig > LX_NSIG) || + ((sig = ltos_signo[lx_sig]) < 0)) + return (set_errno(EINVAL)); + + /* + * Since some linux apps rely on init(1M) having PID 1, we + * transparently translate 1 to the real init(1M)'s pid. We then + * check to be sure that it is legal for this process to send this + * signal to init(1M). + */ + initpid = zone->zone_proc_initpid; + if (lx_pid == 1 || lx_pid == -1) { + s_pid = initpid; + } else if (lx_pid == 0) { + s_pid = 0; + } else if (lx_pid > 0) { + if (lx_lpid_to_spair(lx_pid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid that means it doesn't + * exist in this zone. + */ + return (set_errno(ESRCH)); + } + } else { + ASSERT(lx_pid < 0); + if (lx_lpid_to_spair(-lx_pid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid it means that the + * process group leader doesn't exist in this zone. + * In this case assuming that the Linux pid is + * the same as the Solaris pid will get us the + * correct behavior. + */ + s_pid = -lx_pid; + } + } + + if ((s_pid == initpid) && ((err = init_sig_check(sig, s_pid)) != 0)) + return (set_errno(err)); + + /* + * For individual processes, kill() semantics are the same between + * Solaris and Linux. + */ + if (lx_pid >= 0) + return (kill(s_pid, sig)); + + /* + * In Solaris, sending a signal to -pid means "send a signal to + * everyone in process group pid." In Linux it means "send a + * signal to everyone in the group other than init." Sending a + * signal to -1 means "send a signal to every process except init + * and myself." + */ + + bzero(&v, sizeof (v)); + v.sig = sig; + v.checkperm = 1; + v.sicode = SI_USER; + err = 0; + + mutex_enter(&pidlock); + + p = (lx_pid == -1) ? 
practive : pgfind(s_pid); + nfound = 0; + while (err == 0 && p != NULL) { + if ((p->p_zone == zone) && (p->p_stat != SIDL) && + (p->p_pid != initpid) && (lx_pid < -1 || p != curproc)) { + nfound++; + err = sigsendproc(p, &v); + } + + p = (lx_pid == -1) ? p->p_next : p->p_pglink; + } + mutex_exit(&pidlock); + if (nfound == 0) + err = ESRCH; + else if (err == 0 && v.perm == 0) + err = EPERM; + return (err ? set_errno(err) : 0); +} + +/* + * This handles the unusual case where the user sends a non-queueable signal + * through rt_sigqueueinfo. Signals sent with codes that indicate they are + * queuable are sent through the sigqueue syscall via the user level function + * lx_rt_sigqueueinfo(). + */ +int +lx_helper_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo) +{ + proc_t *target_proc; + pid_t s_pid; + zone_t *zone = curproc->p_zone; + sigsend_t send; + int err; + siginfo_t kinfo; + + if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0) + return (set_errno(EFAULT)); + /* Unlike in lx_kill, this process id must be exact, no negatives. */ + if (tgid == 0) + return (set_errno(ESRCH)); + if (tgid < 0) + return (set_errno(EINVAL)); + /* + * Translate init directly, otherwise use the convenient utility + * function to translate. Since we're sending to the whole group, we + * only need the solaris pid, and not the lwp id. + */ + if (tgid == 1) { + s_pid = zone->zone_proc_initpid; + } else { + if (lx_lpid_to_spair(tgid, &s_pid, NULL) != 0) { + /* + * If we didn't find this pid that means it doesn't + * exist in this zone. + */ + return (set_errno(ESRCH)); + } + } + /* + * We shouldn't have queuable signals here, those are sent elsewhere by + * the usermode handler for this emulated call. + */ + if (!SI_CANQUEUE(kinfo.si_code)) { + return (set_errno(EINVAL)); + } + /* Since our signal shouldn't queue, we just call sigsendproc(). */ + bzero(&send, sizeof (send)); + send.sig = sig; + send.checkperm = 1; + send.sicode = kinfo.si_code; + send.value = kinfo.si_value; + + mutex_enter(&pidlock); + target_proc = prfind(s_pid); + err = 0; + if (target_proc != NULL) { + err = sigsendproc(target_proc, &send); + if (err == 0 && send.perm == 0) + err = EPERM; + } else { + err = ESRCH; + } + mutex_exit(&pidlock); + + return (err ? set_errno(err) : 0); +} + +/* + * Unlike the above function, this handles all system calls to rt_tgsigqueue + * regardless of si_code. + */ +int +lx_helper_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *uinfo) +{ + id_t s_tid; + pid_t s_pid; + proc_t *target_proc; + sigqueue_t *sqp; + kthread_t *t; + siginfo_t kinfo; + + if (copyin(uinfo, &kinfo, sizeof (siginfo_t)) != 0) + return (set_errno(EFAULT)); + if (lx_lpid_to_spair(tid, &s_pid, &s_tid) != 0) + return (set_errno(ESRCH)); + /* + * For group leaders, solaris pid == linux pid, so the solaris leader + * pid should be the same as the tgid but since the tgid comes in via + * the syscall we need to check for an invalid value. + */ + if (s_pid != tgid) + return (set_errno(EINVAL)); + + mutex_enter(&pidlock); + target_proc = prfind(s_pid); + if (target_proc != NULL) + mutex_enter(&target_proc->p_lock); + mutex_exit(&pidlock); + + if (target_proc == NULL) { + return (set_errno(ESRCH)); + } + if (sig < 0 || sig >= NSIG) + return (set_errno(EINVAL)); + + /* + * Some code adapted from lwp_kill, duplicated here because we do some + * customization to the sq_info field of sqp. 
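+	 * Specifically, relative to lwp_kill(): si_code comes from the
+	 * caller-supplied siginfo rather than being fixed at SI_LWP,
+	 * and si_pid is filled in with the target's pid rather than
+	 * the sender's.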
+ */ + if ((t = idtot(target_proc, s_tid)) == NULL) { + mutex_exit(&target_proc->p_lock); + return (set_errno(ESRCH)); + } + /* Just checking for existence of the process, not sending a signal. */ + if (sig == 0) { + mutex_exit(&target_proc->p_lock); + return (0); + } + sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); + sqp->sq_info.si_signo = sig; + sqp->sq_info.si_code = kinfo.si_code; + sqp->sq_info.si_pid = target_proc->p_pid; + sqp->sq_info.si_ctid = PRCTID(target_proc); + sqp->sq_info.si_zoneid = getzoneid(); + sqp->sq_info.si_uid = crgetruid(CRED()); + sigaddqa(target_proc, t, sqp); + mutex_exit(&target_proc->p_lock); + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c new file mode 100644 index 0000000000..2f29f56d5f --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_mkdir.c @@ -0,0 +1,38 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/fcntl.h> +#include <sys/lx_fcntl.h> + +/* + * From "uts/common/syscall/mkdir.c": + */ +extern int mkdirat(int, char *, int); + +long +lx_mkdirat(int fd, char *dname, int dmode) +{ + if (fd == LX_AT_FDCWD) { + fd = AT_FDCWD; + } + + return (mkdirat(fd, dname, dmode)); +} + +long +lx_mkdir(char *dname, int dmode) +{ + return (mkdirat(AT_FDCWD, dname, dmode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c new file mode 100644 index 0000000000..aa6e12a7d8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_modify_ldt.c @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/segments.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/sysi86.h> +#include <sys/cmn_err.h> +#include <sys/lx_ldt.h> + +/* + * Read the ldt_info structure in from the Linux app, convert it to an ssd + * structure, and then call setdscr() to do all the heavy lifting. 
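+ *
+ * For reference, Linux calls the incoming structure a struct user_desc
+ * (mirrored here as ldt_info). An illustrative userland invocation,
+ * not taken from this codebase (tls_block is a hypothetical buffer):
+ *
+ *	struct user_desc d = { 0 };
+ *	d.entry_number = 1;
+ *	d.base_addr = (unsigned long)tls_block;
+ *	d.limit = 0xfffff;
+ *	d.seg_32bit = 1;
+ *	modify_ldt(1, &d, sizeof (d));	op 1 lands in write_ldt()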
+ */ +static int +write_ldt(void *data, ulong_t count) +{ + user_desc_t usd; + struct ssd ssd; + struct ldt_info ldt_inf; + proc_t *pp = curthread->t_procp; + int err; + + if (count != sizeof (ldt_inf)) + return (set_errno(EINVAL)); + + if (copyin(data, &ldt_inf, sizeof (ldt_inf))) + return (set_errno(EFAULT)); + + if (ldt_inf.entry_number >= MAXNLDT) + return (set_errno(EINVAL)); + + LDT_INFO_TO_DESC(&ldt_inf, &usd); + usd_to_ssd(&usd, &ssd, SEL_LDT(ldt_inf.entry_number)); + + /* + * Get everyone into a safe state before changing the LDT. + */ + if (!holdlwps(SHOLDFORK1)) + return (set_errno(EINTR)); + + err = setdscr(&ssd); + + /* + * Release the hounds! + */ + mutex_enter(&pp->p_lock); + continuelwps(pp); + mutex_exit(&pp->p_lock); + + return (err ? set_errno(err) : 0); +} + +static int +read_ldt(void *uptr, ulong_t count) +{ + proc_t *pp = curproc; + int bytes; + + if (pp->p_ldt == NULL) + return (0); + + bytes = (pp->p_ldtlimit + 1) * sizeof (user_desc_t); + if (bytes > count) + bytes = count; + + if (copyout(pp->p_ldt, uptr, bytes)) + return (set_errno(EFAULT)); + + return (bytes); +} + +long +lx_modify_ldt(int op, void *data, ulong_t count) +{ + int rval; + + switch (op) { + case 0: + rval = read_ldt(data, count); + break; + + case 1: + rval = write_ldt(data, count); + break; + + default: + rval = set_errno(ENOSYS); + break; + } + + return (rval); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_open.c b/usr/src/uts/common/brand/lx/syscall/lx_open.c new file mode 100644 index 0000000000..3e13cef54f --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_open.c @@ -0,0 +1,221 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. All rights reserved. 
+ */
+
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/filio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/inttypes.h>
+
+#include <sys/lx_types.h>
+#include <sys/lx_fcntl.h>
+#include <sys/lx_misc.h>
+
+extern int fcntl(int, int, intptr_t);
+extern int openat(int, char *, int, int);
+extern int open(char *, int, int);
+extern int close(int);
+extern int ioctl(int, int, intptr_t);
+extern int lookupnameat(char *, enum uio_seg, int, vnode_t **, vnode_t **,
+	vnode_t *);
+
+static int
+ltos_open_flags(uintptr_t p2)
+{
+	int flags;
+
+	if ((p2 & O_ACCMODE) == LX_O_RDONLY)
+		flags = O_RDONLY;
+	else if ((p2 & O_ACCMODE) == LX_O_WRONLY)
+		flags = O_WRONLY;
+	else
+		flags = O_RDWR;
+
+	if (p2 & LX_O_CREAT) {
+		flags |= O_CREAT;
+	}
+
+	if (p2 & LX_O_EXCL)
+		flags |= O_EXCL;
+	if (p2 & LX_O_NOCTTY)
+		flags |= O_NOCTTY;
+	if (p2 & LX_O_TRUNC)
+		flags |= O_TRUNC;
+	if (p2 & LX_O_APPEND)
+		flags |= O_APPEND;
+	if (p2 & LX_O_NONBLOCK)
+		flags |= O_NONBLOCK;
+	if (p2 & LX_O_SYNC)
+		flags |= O_SYNC;
+	if (p2 & LX_O_LARGEFILE)
+		flags |= O_LARGEFILE;
+	if (p2 & LX_O_NOFOLLOW)
+		flags |= O_NOFOLLOW;
+	if (p2 & LX_O_CLOEXEC)
+		flags |= O_CLOEXEC;
+
+	/*
+	 * Linux uses the LX_O_DIRECT flag to do raw, synchronous I/O to the
+	 * device backing the fd in question. Illumos doesn't have similar
+	 * functionality, but we can attempt to simulate it using the flags
+	 * (O_RSYNC|O_SYNC) and directio(3C).
+	 *
+	 * The LX_O_DIRECT flag also requires that the transfer size and
+	 * alignment of I/O buffers be a multiple of the logical block size for
+	 * the underlying file system, but frankly there isn't an easy way to
+	 * support that functionality without doing something like adding an
+	 * fcntl(2) flag to denote LX_O_DIRECT mode.
+	 *
+	 * Since LX_O_DIRECT is merely a performance advisory, we'll just
+	 * emulate what we can and trust that the only applications expecting
+	 * an error when performing I/O from a misaligned buffer, or when
+	 * passing a transfer size that is not a multiple of the underlying
+	 * file system block size, will be test suites.
+	 */
+	if (p2 & LX_O_DIRECT)
+		flags |= (O_RSYNC|O_SYNC);
+
+	return (flags);
+}
+
+static int
+lx_open_postprocess(int fd, int fmode)
+{
+	if (fmode & LX_O_DIRECT) {
+		(void) ioctl(fd, _FIODIRECTIO, DIRECTIO_ON);
+	}
+
+	/*
+	 * Set the ASYNC flag if passed.
+	 */
+	if (fmode & LX_O_ASYNC) {
+		int res;
+
+		if ((res = fcntl(fd, F_SETFL, FASYNC)) != 0) {
+			(void) close(fd);
+			return (res);
+		}
+	}
+
+	return (fd);
+}
+
+long
+lx_openat(int atfd, char *path, int fmode, int cmode)
+{
+	int flags, fd;
+	mode_t mode = 0;
+
+	if (atfd == LX_AT_FDCWD)
+		atfd = AT_FDCWD;
+
+	flags = ltos_open_flags(fmode);
+
+	/*
+	 * We use the FSEARCH flag to make sure this is a directory. We have to
+	 * explicitly add 1 to emulate the FREAD/FWRITE mapping of the OPENMODE
+	 * macro since it won't get set via OPENMODE when FSEARCH is used.
+	 */
+	if (fmode & LX_O_DIRECTORY) {
+		flags |= FSEARCH;
+		flags++;
+	}
+
+	if (flags & O_CREAT)
+		mode = (mode_t)cmode;
+
+	ttolwp(curthread)->lwp_errno = 0;
+	fd = openat(atfd, path, flags, mode);
+	if (ttolwp(curthread)->lwp_errno != 0) {
+		if ((fmode & LX_O_DIRECTORY) &&
+		    ttolwp(curthread)->lwp_errno != ENOTDIR) {
+			/*
+			 * We got an error trying to open a file as a
+			 * directory. We need to determine if we should
+			 * return the original error or ENOTDIR.
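+			 * For example, Linux expects
+			 * open("/etc/passwd", O_RDONLY|O_DIRECTORY) to
+			 * fail with ENOTDIR even if the FSEARCH-based
+			 * open above failed with a different error
+			 * first, so we repeat the lookup below and
+			 * check the vnode type ourselves.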
+ */ + vnode_t *startvp; + vnode_t *vp; + int oerror, error = 0; + + oerror = ttolwp(curthread)->lwp_errno; + + if (atfd == AT_FDCWD) { + /* regular open */ + startvp = NULL; + } else { + char startchar; + + if (copyin(path, &startchar, sizeof (char))) + return (set_errno(oerror)); + + /* if startchar is / then startfd is ignored */ + if (startchar == '/') { + startvp = NULL; + } else { + file_t *startfp; + + if ((startfp = getf(atfd)) == NULL) + return (set_errno(oerror)); + startvp = startfp->f_vnode; + VN_HOLD(startvp); + releasef(atfd); + } + } + + if (lookupnameat(path, UIO_USERSPACE, + (fmode & LX_O_NOFOLLOW) ? NO_FOLLOW : FOLLOW, + NULLVPP, &vp, startvp) != 0) { + if (startvp != NULL) + VN_RELE(startvp); + return (set_errno(oerror)); + } + + if (startvp != NULL) + VN_RELE(startvp); + + if (vp->v_type != VDIR) + error = ENOTDIR; + + VN_RELE(vp); + if (error != 0) + return (set_errno(ENOTDIR)); + + set_errno(oerror); + } + return (ttolwp(curthread)->lwp_errno); + } + + return (lx_open_postprocess(fd, fmode)); +} + +long +lx_open(char *path, int fmode, int cmode) +{ + return (lx_openat(LX_AT_FDCWD, path, fmode, cmode)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_pipe.c b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c new file mode 100644 index 0000000000..fe354a8d54 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_pipe.c @@ -0,0 +1,202 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T. All Rights Reserved. + * + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/zone.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cpuvar.h> +#include <sys/cred.h> +#include <sys/user.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/errno.h> +#include <sys/debug.h> +#include <sys/fs/fifonode.h> +#include <sys/fcntl.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +#define LX_O_NONBLOCK 04000 +#define LX_O_CLOEXEC 02000000 + +/* + * Based on native pipe(2) system call, except that the pipe is half-duplex. + */ +static int +lx_hd_pipe(intptr_t arg, int flags) +{ + vnode_t *vp1, *vp2; + struct file *fp1, *fp2; + int error = 0; + int flag1, flag2, iflags; + int fd1, fd2; + + /* + * Validate allowed flags. + */ + if ((flags & ~(FCLOEXEC|FNONBLOCK)) != 0) { + return (set_errno(EINVAL)); + } + /* + * Allocate and initialize two vnodes. + */ + makepipe(&vp1, &vp2); + + /* + * Allocate and initialize two file table entries and two + * file pointers. 
The first file pointer is open for read and the + * second is open for write. + */ + if ((error = falloc(vp1, FREAD, &fp1, &fd1)) != 0) { + VN_RELE(vp1); + VN_RELE(vp2); + return (set_errno(error)); + } + + if ((error = falloc(vp2, FWRITE, &fp2, &fd2)) != 0) + goto out2; + + /* + * Create two stream heads and attach to each vnode. + */ + if ((error = fifo_stropen(&vp1, FREAD, fp1->f_cred, 0, 0)) != 0) + goto out; + + if ((error = fifo_stropen(&vp2, FWRITE, fp2->f_cred, 0, 0)) != 0) { + (void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0, + fp1->f_cred, NULL); + goto out; + } + + strmate(vp1, vp2); + + VTOF(vp1)->fn_ino = VTOF(vp2)->fn_ino = fifogetid(); + + /* + * Set the O_NONBLOCK flag if requested. + */ + if (flags & FNONBLOCK) { + flag1 = fp1->f_flag; + flag2 = fp2->f_flag; + iflags = flags & FNONBLOCK; + + if ((error = VOP_SETFL(vp1, flag1, iflags, fp1->f_cred, + NULL)) != 0) { + goto out_vop_close; + } + fp1->f_flag |= iflags; + + if ((error = VOP_SETFL(vp2, flag2, iflags, fp2->f_cred, + NULL)) != 0) { + goto out_vop_close; + } + fp2->f_flag |= iflags; + } + + /* + * Return the file descriptors to the user. They now + * point to two different vnodes which have different + * stream heads. + */ + if (copyout(&fd1, &((int *)arg)[0], sizeof (int)) || + copyout(&fd2, &((int *)arg)[1], sizeof (int))) { + error = EFAULT; + goto out_vop_close; + } + + /* + * Now fill in the entries that falloc reserved + */ + mutex_exit(&fp1->f_tlock); + mutex_exit(&fp2->f_tlock); + setf(fd1, fp1); + setf(fd2, fp2); + + /* + * Optionally set the FCLOEXEC flag + */ + if ((flags & FCLOEXEC) != 0) { + f_setfd(fd1, FD_CLOEXEC); + f_setfd(fd2, FD_CLOEXEC); + } + + return (0); +out_vop_close: + (void) VOP_CLOSE(vp1, FREAD, 1, (offset_t)0, fp1->f_cred, NULL); + (void) VOP_CLOSE(vp2, FWRITE, 1, (offset_t)0, fp2->f_cred, NULL); +out: + setf(fd2, NULL); + unfalloc(fp2); +out2: + setf(fd1, NULL); + unfalloc(fp1); + VN_RELE(vp1); + VN_RELE(vp2); + return (set_errno(error)); +} + +/* + * pipe(2) system call. + */ +long +lx_pipe(intptr_t arg) +{ + return (lx_hd_pipe(arg, 0)); +} + +/* + * pipe2(2) system call. + */ +long +lx_pipe2(intptr_t arg, int lxflags) +{ + int flags = 0; + + /* + * Validate allowed flags. + */ + if ((lxflags & ~(LX_O_NONBLOCK | LX_O_CLOEXEC)) != 0) { + return (set_errno(EINVAL)); + } + + /* + * Convert from Linux flags to illumos flags. + */ + if (lxflags & LX_O_NONBLOCK) { + flags |= FNONBLOCK; + } + if (lxflags & LX_O_CLOEXEC) { + flags |= FCLOEXEC; + } + + return (lx_hd_pipe(arg, flags)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c new file mode 100644 index 0000000000..6581ead25b --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_rlimit.c @@ -0,0 +1,575 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/zone.h> +#include <sys/cpuvar.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> +#include <sys/lx_brand.h> + +#define LX_RLIMIT_CPU 0 +#define LX_RLIMIT_FSIZE 1 +#define LX_RLIMIT_DATA 2 +#define LX_RLIMIT_STACK 3 +#define LX_RLIMIT_CORE 4 +#define LX_RLIMIT_RSS 5 +#define LX_RLIMIT_NPROC 6 +#define LX_RLIMIT_NOFILE 7 +#define LX_RLIMIT_MEMLOCK 8 +#define LX_RLIMIT_AS 9 +#define LX_RLIMIT_LOCKS 10 /* NA limit on locks, early 2.4 only */ +#define LX_RLIMIT_SIGPENDING 11 +#define LX_RLIMIT_MSGQUEUE 12 +#define LX_RLIMIT_NICE 13 /* NA ceiling for nice */ +#define LX_RLIMIT_RTPRIO 14 /* NA ceiling on the RT priority */ +#define LX_RLIMIT_RTTIME 15 /* NA cpu limit for RT proc. */ + +#define LX_RLIMIT_NLIMITS 16 + +#define RCTL_INFINITE(x) \ + ((x->rcv_flagaction & RCTL_LOCAL_MAXIMAL) && \ + (x->rcv_flagaction & RCTL_GLOBAL_INFINITE)) + +typedef struct { + ulong_t rlim_cur; + ulong_t rlim_max; +} lx_rlimit_t; + +typedef struct { + uint32_t rlim_cur; + uint32_t rlim_max; +} lx_rlimit32_t; + +/* + * Linux supports many of the same resources that we do, but on Illumos these + * are rctls. Instead of using rlimit, we use rctls for all of the limits. + * This table is used to translate Linux rlimit keys into the Illumos legacy + * rlimit. We then primarily use the rctl/rlimit compatability code to + * manage these. + */ +static int l_to_r[LX_RLIMIT_NLIMITS] = { + RLIMIT_CPU, /* 0 CPU */ + RLIMIT_FSIZE, /* 1 FSIZE */ + RLIMIT_DATA, /* 2 DATA */ + RLIMIT_STACK, /* 3 STACK */ + RLIMIT_CORE, /* 4 CORE */ + -1, /* 5 RSS */ + -1, /* 6 NPROC */ + RLIMIT_NOFILE, /* 7 NOFILE */ + -1, /* 8 MEMLOCK */ + RLIMIT_AS, /* 9 AS */ + -1, /* 10 LOCKS */ + -1, /* 11 SIGPENDING */ + -1, /* 12 MSGQUEUE */ + -1, /* 13 NICE */ + -1, /* 14 RTPRIO */ + -1 /* 15 RTTIME */ +}; + +/* + * Magic value Linux uses to indicate infinity + */ +#define LX_RLIM_INFINITY_N ULONG_MAX + +static void +lx_get_rctl(char *nm, struct rlimit64 *rlp64) +{ + rctl_hndl_t hndl; + rctl_val_t *oval, *nval; + + rlp64->rlim_cur = RLIM_INFINITY; + rlp64->rlim_max = RLIM_INFINITY; + + nval = kmem_alloc(sizeof (rctl_val_t), KM_SLEEP); + mutex_enter(&curproc->p_lock); + + hndl = rctl_hndl_lookup(nm); + oval = NULL; + while ((hndl != -1) && rctl_local_get(hndl, oval, nval, curproc) == 0) { + oval = nval; + switch (nval->rcv_privilege) { + case RCPRIV_BASIC: + if (!RCTL_INFINITE(nval)) + rlp64->rlim_cur = nval->rcv_value; + break; + case RCPRIV_PRIVILEGED: + if (!RCTL_INFINITE(nval)) + rlp64->rlim_max = nval->rcv_value; + break; + } + } + + mutex_exit(&curproc->p_lock); + kmem_free(nval, sizeof (rctl_val_t)); + + if (rlp64->rlim_cur == RLIM_INFINITY && + rlp64->rlim_max != RLIM_INFINITY) + rlp64->rlim_cur = rlp64->rlim_max; +} + +static int +lx_getrlimit_common(int lx_resource, uint64_t *rlim_curp, uint64_t *rlim_maxp) +{ + lx_proc_data_t *pd = ptolxproc(curproc); + int resource; + int64_t cur = -1; + boolean_t cur_inf = B_FALSE; + int64_t max = -1; + boolean_t max_inf = B_FALSE; + struct rlimit64 rlim64; + + if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS) + return (EINVAL); + + switch (lx_resource) { + case LX_RLIMIT_LOCKS: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max; + break; + + case LX_RLIMIT_NICE: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max; + break; + + case LX_RLIMIT_RTPRIO: + rlim64.rlim_cur = 
pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max; + break; + + case LX_RLIMIT_RTTIME: + rlim64.rlim_cur = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur; + rlim64.rlim_max = pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max; + break; + + case LX_RLIMIT_RSS: + /* zone.max-physical-memory */ + rlim64.rlim_cur = rlim64.rlim_max = curzone->zone_phys_mem_ctl; + break; + + case LX_RLIMIT_NPROC: + /* zone.max-lwps */ + rlim64.rlim_cur = rlim64.rlim_max = curzone->zone_nlwps_ctl; + break; + + case LX_RLIMIT_MEMLOCK: + /* zone.max-locked-memory */ + rlim64.rlim_cur = rlim64.rlim_max = + curzone->zone_locked_mem_ctl; + break; + + case LX_RLIMIT_SIGPENDING: + lx_get_rctl("process.max-sigqueue-size", &rlim64); + break; + + case LX_RLIMIT_MSGQUEUE: + lx_get_rctl("process.max-msg-messages", &rlim64); + break; + + default: + resource = l_to_r[lx_resource]; + + mutex_enter(&curproc->p_lock); + (void) rctl_rlimit_get(rctlproc_legacy[resource], curproc, + &rlim64); + mutex_exit(&curproc->p_lock); + break; + } + + + if (rlim64.rlim_cur == RLIM64_INFINITY) { + cur = LX_RLIM_INFINITY_N; + } else { + cur = rlim64.rlim_cur; + } + if (rlim64.rlim_max == RLIM64_INFINITY) { + max = LX_RLIM_INFINITY_N; + } else { + max = rlim64.rlim_max; + } + + if (lx_resource == LX_RLIMIT_STACK && cur > INT_MAX) { + /* + * Stunningly, Linux has somehow managed to confuse the concept + * of a "limit" with that of a "default" -- and the value of + * RLIMIT_STACK is used by NPTL as the _default_ stack size if + * it isn't specified. (!!) Even for a system that prides + * itself on slapdash castles of junk, this is an amazingly + * willful act of incompetence -- and one that is gleefully + * confessed in the pthread_create() man page: "if the + * RLIMIT_STACK soft resource limit at the time the program + * started has any value other than 'unlimited', then it + * determines the default stack size of new threads." A + * typical stack limit for us is 32TB; if it needs to be said, + * setting the default stack size to be 32TB doesn't work so + * well! Of course, glibc dropping a deuce in its pants + * becomes our problem -- so to prevent smelly accidents we + * tell Linux that any stack limit over the old (32-bit) values + * for infinity are just infinitely large. + */ + cur_inf = B_TRUE; + max_inf = B_TRUE; + } + + if (cur_inf) { + *rlim_curp = LX_RLIM64_INFINITY; + } else { + *rlim_curp = cur; + } + + if (max_inf) { + *rlim_maxp = LX_RLIM64_INFINITY; + } else { + *rlim_maxp = max; + } + + return (0); +} + +/* + * This is the 'new' getrlimit, variously called getrlimit or ugetrlimit + * in Linux headers and code. The only difference between this and the old + * getrlimit (variously called getrlimit or old_getrlimit) is the value of + * RLIM_INFINITY, which is smaller for the older version. Modern code will + * use this version by default. 
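+ *
+ * (For 32-bit Linux the old RLIM_INFINITY was 0x7fffffff while the
+ * newer interface uses ULONG_MAX, so the same "unlimited" limit has to
+ * be reported differently depending on which entry point the
+ * application uses.)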
+ */
+long
+lx_getrlimit(int resource, lx_rlimit_t *rlp)
+{
+	int rv;
+	lx_rlimit_t rl;
+	uint64_t rlim_cur, rlim_max;
+
+	rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max);
+	if (rv != 0)
+		return (set_errno(rv));
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (rlim_cur == LX_RLIM64_INFINITY)
+			rl.rlim_cur = LX_RLIM_INFINITY_N;
+		else if (rlim_cur > LX_RLIM_INFINITY_N)
+			rl.rlim_cur = LX_RLIM_INFINITY_N;
+		else
+			rl.rlim_cur = (ulong_t)rlim_cur;
+
+		if (rlim_max == LX_RLIM64_INFINITY)
+			rl.rlim_max = LX_RLIM_INFINITY_N;
+		else if (rlim_max > LX_RLIM_INFINITY_N)
+			rl.rlim_max = LX_RLIM_INFINITY_N;
+		else
+			rl.rlim_max = (ulong_t)rlim_max;
+
+		if (copyout(&rl, rlp, sizeof (rl)) != 0)
+			return (set_errno(EFAULT));
+	}
+#ifdef _SYSCALL32_IMPL
+	else {
+		lx_rlimit32_t rl32;
+
+		if (rlim_cur > UINT_MAX)
+			rl.rlim_cur = UINT_MAX;
+		else
+			rl.rlim_cur = (ulong_t)rlim_cur;
+
+		if (rlim_max > UINT_MAX)
+			rl.rlim_max = UINT_MAX;
+		else
+			rl.rlim_max = (ulong_t)rlim_max;
+
+		rl32.rlim_cur = rl.rlim_cur;
+		rl32.rlim_max = rl.rlim_max;
+
+		if (copyout(&rl32, rlp, sizeof (rl32)) != 0)
+			return (set_errno(EFAULT));
+	}
+#endif
+
+	return (0);
+}
+
+/*
+ * This is the 'old' getrlimit, variously called getrlimit or old_getrlimit
+ * in Linux headers and code. The only difference between this and the new
+ * getrlimit (variously called getrlimit or ugetrlimit) is the value of
+ * RLIM_INFINITY, which is smaller for the older version.
+ *
+ * This is only used for 32-bit code.
+ */
+long
+lx_oldgetrlimit(int resource, lx_rlimit_t *rlp)
+{
+	int rv;
+	lx_rlimit32_t rl32;
+	uint64_t rlim_cur, rlim_max;
+
+	rv = lx_getrlimit_common(resource, &rlim_cur, &rlim_max);
+	if (rv != 0)
+		return (set_errno(rv));
+
+	if (rlim_cur > INT_MAX)
+		rl32.rlim_cur = INT_MAX;
+	else
+		rl32.rlim_cur = (ulong_t)rlim_cur;
+
+	if (rlim_max > INT_MAX)
+		rl32.rlim_max = INT_MAX;
+	else
+		rl32.rlim_max = (ulong_t)rlim_max;
+
+	if (copyout(&rl32, rlp, sizeof (rl32)) != 0)
+		return (set_errno(EFAULT));
+
+	return (0);
+}
+
+static int
+lx_set_rctl(char *nm, struct rlimit64 *rlp64)
+{
+	int err;
+	rctl_hndl_t hndl;
+	rctl_alloc_gp_t *gp;
+
+	gp = rctl_rlimit_set_prealloc(1);
+
+	mutex_enter(&curproc->p_lock);
+
+	hndl = rctl_hndl_lookup(nm);
+
+	/*
+	 * We're not supposed to do this, but since we want all our rctls to
+	 * behave like rlimits, we take advantage of this function to set them
+	 * up this way.
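+	 * (rctl_rlimit_set() was written to back setrlimit(2); driving
+	 * it directly lets a named rctl such as
+	 * "process.max-sigqueue-size" wear rlimit semantics, including
+	 * the RCTL_LOCAL_DENY action passed below.)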
+ */ + err = rctl_rlimit_set(hndl, curproc, rlp64, gp, RCTL_LOCAL_DENY, 0, + CRED()); + + mutex_exit(&curproc->p_lock); + + rctl_prealloc_destroy(gp); + + return (err); +} + +static int +lx_setrlimit_common(int lx_resource, uint64_t rlim_cur, uint64_t rlim_max) +{ + lx_proc_data_t *pd = ptolxproc(curproc); + int err; + int resource; + rctl_alloc_gp_t *gp; + struct rlimit64 rl64; + + if (lx_resource < 0 || lx_resource >= LX_RLIMIT_NLIMITS) + return (EINVAL); + + switch (lx_resource) { + case LX_RLIMIT_LOCKS: + pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_LOCKS].rlim_max = rlim_max; + break; + + case LX_RLIMIT_NICE: + pd->l_fake_limits[LX_RLFAKE_NICE].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_NICE].rlim_max = rlim_max; + break; + + case LX_RLIMIT_RTPRIO: + pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_RTPRIO].rlim_max = rlim_max; + break; + + case LX_RLIMIT_RTTIME: + pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_cur = rlim_cur; + pd->l_fake_limits[LX_RLFAKE_RTTIME].rlim_max = rlim_max; + break; + + case LX_RLIMIT_RSS: + /* + * zone.max-physical-memory + * Since we're emulating the value via a zone rctl, we can't + * set that from within the zone. Lie and say we set the value. + */ + break; + + case LX_RLIMIT_NPROC: + /* + * zone.max-lwps + * Since we're emulating the value via a zone rctl, we can't + * set that from within the zone. Lie and say we set the value. + */ + break; + + case LX_RLIMIT_MEMLOCK: + /* + * zone.max-locked-memory + * Since we're emulating the value via a zone rctl, we can't + * set that from within the zone. Lie and say we set the value. + */ + break; + + case LX_RLIMIT_SIGPENDING: + /* + * On Ubuntu at least, the login and sshd processes expect to + * set this limit to 16k and login will fail if this fails. On + * Illumos we have a system limit of 8k and normally the + * privileged limit is 512. We simply pretend this works to + * allow login to work. + */ + if (rlim_max > 8192) + return (0); + + rl64.rlim_cur = rlim_cur; + rl64.rlim_max = rlim_max; + if ((err = lx_set_rctl("process.max-sigqueue-size", &rl64)) + != 0) + return (set_errno(err)); + break; + + case LX_RLIMIT_MSGQUEUE: + rl64.rlim_cur = rlim_cur; + rl64.rlim_max = rlim_max; + if ((err = lx_set_rctl("process.max-msg-messages", &rl64)) != 0) + return (set_errno(err)); + break; + + default: + resource = l_to_r[lx_resource]; + + /* + * Linux limits the max number of open files to 1m and there is + * a test for this. 
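+		 *
+		 * A hypothetical Linux caller tripping that check (sketch
+		 * only):
+		 *
+		 *	struct rlimit rl = { 1024, 2UL * 1024 * 1024 };
+		 *	setrlimit(RLIMIT_NOFILE, &rl);
+		 *
+		 * fails with EPERM, matching Linux, which rejects limits
+		 * above its open file cap.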
+		 */
+		if (lx_resource == LX_RLIMIT_NOFILE && rlim_max > (1024 * 1024))
+			return (EPERM);
+
+		rl64.rlim_cur = rlim_cur;
+		rl64.rlim_max = rlim_max;
+		gp = rctl_rlimit_set_prealloc(1);
+
+		mutex_enter(&curproc->p_lock);
+		err = rctl_rlimit_set(rctlproc_legacy[resource], curproc,
+		    &rl64, gp, rctlproc_flags[resource],
+		    rctlproc_signals[resource], CRED());
+		mutex_exit(&curproc->p_lock);
+
+		rctl_prealloc_destroy(gp);
+		if (err != 0)
+			return (set_errno(err));
+		break;
+	}
+
+	return (0);
+}
+
+long
+lx_setrlimit(int resource, lx_rlimit_t *rlp)
+{
+	int rv;
+	lx_rlimit_t rl;
+	uint64_t rlim_cur, rlim_max;
+
+	if (get_udatamodel() == DATAMODEL_NATIVE) {
+		if (copyin(rlp, &rl, sizeof (rl)) != 0)
+			return (set_errno(EFAULT));
+	}
+#ifdef _SYSCALL32_IMPL
+	else {
+		lx_rlimit32_t rl32;
+
+		if (copyin(rlp, &rl32, sizeof (rl32)) != 0)
+			return (set_errno(EFAULT));
+
+		rl.rlim_cur = rl32.rlim_cur;
+		rl.rlim_max = rl32.rlim_max;
+	}
+#endif
+
+	if ((rl.rlim_max != LX_RLIM_INFINITY_N &&
+	    rl.rlim_cur == LX_RLIM_INFINITY_N) ||
+	    rl.rlim_cur > rl.rlim_max)
+		return (set_errno(EINVAL));
+
+	if (rl.rlim_cur == LX_RLIM_INFINITY_N)
+		rlim_cur = LX_RLIM64_INFINITY;
+	else
+		rlim_cur = rl.rlim_cur;
+
+	if (rl.rlim_max == LX_RLIM_INFINITY_N)
+		rlim_max = LX_RLIM64_INFINITY;
+	else
+		rlim_max = rl.rlim_max;
+
+	rv = lx_setrlimit_common(resource, rlim_cur, rlim_max);
+	if (rv != 0)
+		return (set_errno(rv));
+	return (0);
+}
+
+/*
+ * From the man page:
+ * The Linux-specific prlimit() system call combines and extends the
+ * functionality of setrlimit() and getrlimit(). It can be used to both set
+ * and get the resource limits of an arbitrary process.
+ *
+ * If pid is 0, then the call applies to the calling process.
+ */
+long
+lx_prlimit64(pid_t pid, int resource, lx_rlimit64_t *nrlp, lx_rlimit64_t *orlp)
+{
+	int rv;
+	lx_rlimit64_t nrl, orl;
+
+	if (pid != 0) {
+		/* XXX TBD if needed */
+		char buf[80];
+
+		(void) snprintf(buf, sizeof (buf),
+		    "setting prlimit %d for another process\n", resource);
+		lx_unsupported(buf);
+		return (set_errno(ENOTSUP));
+	}
+
+	if (orlp != NULL) {
+		/* we first get the current limits */
+		rv = lx_getrlimit_common(resource, &orl.rlim_cur,
+		    &orl.rlim_max);
+		if (rv != 0)
+			return (set_errno(rv));
+	}
+
+	if (nrlp != NULL) {
+		if (copyin(nrlp, &nrl, sizeof (nrl)) != 0)
+			return (set_errno(EFAULT));
+
+		if ((nrl.rlim_max != LX_RLIM64_INFINITY &&
+		    nrl.rlim_cur == LX_RLIM64_INFINITY) ||
+		    nrl.rlim_cur > nrl.rlim_max)
+			return (set_errno(EINVAL));
+
+		rv = lx_setrlimit_common(resource, nrl.rlim_cur, nrl.rlim_max);
+		if (rv != 0)
+			return (set_errno(rv));
+	}
+
+	if (orlp != NULL) {
+		/* now return the original limits, if necessary */
+		if (copyout(&orl, orlp, sizeof (orl)) != 0)
+			return (set_errno(EFAULT));
+	}
+
+	return (0);
+}
diff --git a/usr/src/uts/common/brand/lx/syscall/lx_rw.c b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
new file mode 100644
index 0000000000..b21a81da48
--- /dev/null
+++ b/usr/src/uts/common/brand/lx/syscall/lx_rw.c
@@ -0,0 +1,60 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/vnode.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> + +/* uts/common/syscall/rw.c */ +extern ssize_t read(int fdes, void *cbuf, size_t count); +extern ssize_t write(int fdes, void *cbuf, size_t count); + +long +lx_read(int fd, void *buf, size_t nbyte) +{ + file_t *fp; + vtype_t t; + + if ((fp = getf(fd)) == NULL) + return (set_errno(EBADF)); + t = fp->f_vnode->v_type; + releasef(fd); + + if (t == VDIR) + return (set_errno(EISDIR)); + + /* + * If read(2) returns EINTR, we want to signal that restarting the + * system call is acceptable: + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + return (read(fd, buf, nbyte)); +} + +long +lx_write(int fd, void *buf, size_t nbyte) +{ + /* + * If write(2) returns EINTR, we want to signal that restarting the + * system call is acceptable: + */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + + return (write(fd, buf, nbyte)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sched.c b/usr/src/uts/common/brand/lx/syscall/lx_sched.c new file mode 100644 index 0000000000..0def559e29 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sched.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/cpu.h> +#include <sys/rtpriocntl.h> +#include <sys/tspriocntl.h> +#include <sys/processor.h> +#include <sys/brand.h> +#include <sys/lx_sched.h> +#include <sys/lx_brand.h> + +extern int yield(); +extern long priocntl_common(int, procset_t *, int, caddr_t, caddr_t, uio_seg_t); + +long +lx_sched_yield(void) +{ + yield(); + + return (0); +} + +int +lx_sched_affinity(int cmd, uintptr_t pid, int len, uintptr_t maskp, + int64_t *rval) +{ + pid_t s_pid; + id_t s_tid; + kthread_t *t = curthread; + lx_lwp_data_t *lx_lwp; + + if (cmd != B_GET_AFFINITY_MASK && cmd != B_SET_AFFINITY_MASK) + return (set_errno(EINVAL)); + + /* + * The caller wants to know how large the mask should be. + */ + if (cmd == B_GET_AFFINITY_MASK && len == 0) { + *rval = sizeof (lx_affmask_t); + return (0); + } + + /* + * Otherwise, ensure they have a large enough mask. 
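+	 * (The len == 0 case above is a size probe: the emulation library
+	 * may ask how large an lx_affmask_t is before supplying a real
+	 * buffer.)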
+ */ + if (cmd == B_GET_AFFINITY_MASK && len < sizeof (lx_affmask_t)) { + *rval = -1; + return (set_errno(EINVAL)); + } + + if (pid == 0) { + s_pid = curproc->p_pid; + s_tid = curthread->t_tid; + } else if (lx_lpid_to_spair((pid_t)pid, &s_pid, &s_tid) == -1) { + return (set_errno(ESRCH)); + } + + /* + * For now, we only support manipulating threads in the + * same process. + */ + if (curproc->p_pid != s_pid) + return (set_errno(EPERM)); + + /* + * We must hold the process lock so that the thread list + * doesn't change while we're looking at it. We'll hold + * the lock until we no longer reference the + * corresponding lwp. + */ + + mutex_enter(&curproc->p_lock); + + do { + if (t->t_tid == s_tid) + break; + t = t->t_forw; + } while (t != curthread); + + /* + * If the given PID is in the current thread's process, + * then we _must_ find it in the process's thread list. + */ + ASSERT(t->t_tid == s_tid); + + lx_lwp = t->t_lwp->lwp_brand; + + if (cmd == B_SET_AFFINITY_MASK) { + if (copyin_nowatch((void *)maskp, &lx_lwp->br_affinitymask, + sizeof (lx_affmask_t)) != 0) { + mutex_exit(&curproc->p_lock); + return (set_errno(EFAULT)); + } + + *rval = 0; + } else { + if (copyout_nowatch(&lx_lwp->br_affinitymask, (void *)maskp, + sizeof (lx_affmask_t)) != 0) { + mutex_exit(&curproc->p_lock); + return (set_errno(EFAULT)); + } + + *rval = sizeof (lx_affmask_t); + } + + mutex_exit(&curproc->p_lock); + return (0); +} + +long +lx_sched_setscheduler(l_pid_t pid, int policy, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int prio, maxupri; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + if (policy < 0) { + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the current policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + policy = LX_SCHED_OTHER; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + } else { + return (set_errno(EINVAL)); + } + } + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? 
RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getscheduler(l_pid_t pid) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int policy; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) + policy = LX_SCHED_OTHER; + else if (strcmp(pcinfo.pc_clname, "RT") == 0) + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + else + policy = set_errno(EINVAL); + + return (policy); +} + +long +lx_sched_setparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + procset_t procset; + procset_t procset_cid; + pcparms_t pcparm; + pcinfo_t pcinfo; + struct lx_sched_param sched_param; + tsparms_t *tsp; + int policy; + int prio, maxupri; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + if (copyin(param, &sched_param, sizeof (sched_param))) + return (set_errno(EFAULT)); + + prio = sched_param.lx_sched_prio; + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the current policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (strcmp(pcinfo.pc_clname, "TS") == 0) + policy = LX_SCHED_OTHER; + else if (strcmp(pcinfo.pc_clname, "RT") == 0) + policy = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs == + RT_TQINF ? LX_SCHED_FIFO : LX_SCHED_RR; + else + return (set_errno(EINVAL)); + + bzero(&pcinfo, sizeof (pcinfo)); + bzero(&pcparm, sizeof (pcparm)); + setprocset(&procset_cid, POP_AND, P_PID, 0, P_ALL, 0); + switch (policy) { + case LX_SCHED_FIFO: + case LX_SCHED_RR: + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (prio < 0 || + prio > ((rtinfo_t *)pcinfo.pc_clinfo)->rt_maxpri) + return (set_errno(EINVAL)); + pcparm.pc_cid = pcinfo.pc_cid; + ((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio; + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = + policy == LX_SCHED_RR ? 
RT_TQDEF : RT_TQINF; + break; + + case LX_SCHED_OTHER: + (void) strcpy(pcinfo.pc_clname, "TS"); + (void) do_priocntlsys(PC_GETCID, &procset_cid, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + maxupri = ((tsinfo_t *)pcinfo.pc_clinfo)->ts_maxupri; + if (prio > maxupri || prio < -maxupri) + return (set_errno(EINVAL)); + + pcparm.pc_cid = pcinfo.pc_cid; + tsp = (tsparms_t *)pcparm.pc_clparms; + tsp->ts_upri = prio; + tsp->ts_uprilim = TS_NOCHANGE; + break; + + default: + return (set_errno(EINVAL)); + } + + /* + * finally set scheduling policy and parameters + */ + (void) do_priocntlsys(PC_SETPARMS, &procset, &pcparm); + + return (0); +} + +long +lx_sched_getparam(l_pid_t pid, struct lx_sched_param *param) +{ + klwp_t *lwp = ttolwp(curthread); + struct lx_sched_param local_param; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + tsinfo_t *tsi; + int prio, scale; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + bzero(&pcinfo, sizeof (pcinfo)); + pcinfo.pc_cid = pcparm.pc_cid; + (void) do_priocntlsys(PC_GETCLINFO, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + bzero(&local_param, sizeof (local_param)); + if (strcmp(pcinfo.pc_clname, "TS") == 0) { + /* + * I don't know if we need to do this, coz it can't be + * changed from zero anyway..... + */ + tsi = (tsinfo_t *)pcinfo.pc_clinfo; + prio = ((tsparms_t *)pcparm.pc_clparms)->ts_upri; + scale = tsi->ts_maxupri; + if (scale == 0) + local_param.lx_sched_prio = 0; + else + local_param.lx_sched_prio = -(prio * 20) / scale; + } else if (strcmp(pcinfo.pc_clname, "RT") == 0) { + local_param.lx_sched_prio = + ((rtparms_t *)pcparm.pc_clparms)->rt_pri; + } else { + rv = set_errno(EINVAL); + } + + if (rv == 0) + if (copyout(&local_param, param, sizeof (local_param))) + return (set_errno(EFAULT)); + + return (rv); +} + +long +lx_sched_rr_get_interval(l_pid_t pid, struct timespec *ival) +{ + klwp_t *lwp = ttolwp(curthread); + struct timespec interval; + procset_t procset; + pcparms_t pcparm; + pcinfo_t pcinfo; + int rv; + + if (pid < 0) + return (set_errno(ESRCH)); + + if ((rv = sched_setprocset(&procset, pid))) + return (rv); + + /* + * get the class id + */ + pcparm.pc_cid = PC_CLNULL; + (void) do_priocntlsys(PC_GETPARMS, &procset, &pcparm); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + /* + * get the class info and identify the equivalent linux policy + */ + setprocset(&procset, POP_AND, P_PID, 0, P_ALL, 0); + bzero(&pcinfo, sizeof (pcinfo)); + (void) strcpy(pcinfo.pc_clname, "RT"); + (void) do_priocntlsys(PC_GETCID, &procset, &pcinfo); + if (lwp->lwp_errno) + return (lwp->lwp_errno); + + if (pcparm.pc_cid == pcinfo.pc_cid && + ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs != RT_TQINF) { + interval.tv_sec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqsecs; + interval.tv_nsec = ((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs; + + if (copyout(&interval, ival, sizeof (interval))) + return (set_errno(EFAULT)); + + return (0); + } + + return (set_errno(EINVAL)); +} + +int +sched_setprocset(procset_t *procset, l_pid_t pid) +{ + id_t lid, rid; + idtype_t lidtype, ridtype; + + /* + * define the target lwp + */ + if (pid == 0) { + ridtype = P_ALL; + lidtype = P_PID; + rid = 0; + lid = P_MYID; + } else { + 
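+		/*
+		 * Translate the Linux pid into the illumos process/LWP
+		 * pair emulating it; only LWPs within the current process
+		 * may be targeted.
+		 */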
if (lx_lpid_to_spair(pid, &pid, &lid) < 0) + return (set_errno(ESRCH)); + if (pid != curproc->p_pid) + return (set_errno(ESRCH)); + rid = 0; + ridtype = P_ALL; + lidtype = P_LWPID; + } + setprocset(procset, POP_AND, lidtype, lid, ridtype, rid); + + return (0); +} + +long +do_priocntlsys(int cmd, procset_t *procset, void *arg) +{ + return (priocntl_common(PC_VERSION, procset, cmd, (caddr_t)arg, 0, + UIO_SYSSPACE)); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c new file mode 100644 index 0000000000..449d5882d4 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_sysinfo.c @@ -0,0 +1,218 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#include <vm/anon.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/zone.h> +#include <sys/time.h> + +typedef struct lx_sysinfo { + int64_t si_uptime; /* Seconds since boot */ + uint64_t si_loads[3]; /* 1, 5, and 15 minute avg runq length */ + uint64_t si_totalram; /* Total memory size */ + uint64_t si_freeram; /* Available memory */ + uint64_t si_sharedram; /* Shared memory */ + uint64_t si_bufferram; /* Buffer memory */ + uint64_t si_totalswap; /* Total swap space */ + uint64_t si_freeswap; /* Avail swap space */ + uint16_t si_procs; /* Process count */ + uint16_t si_pad; /* Padding */ + uint64_t si_totalhigh; /* High memory size */ + uint64_t si_freehigh; /* Avail high memory */ + uint32_t si_mem_unit; /* Unit size of memory fields */ +} lx_sysinfo_t; + +#if defined(_SYSCALL32_IMPL) +/* + * 64-bit kernel view of the 32-bit usermode struct. 
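+ * The pack(4) below mirrors the 4-byte field alignment a 32-bit Linux
+ * process sees, so the copyout of this struct lines up field for field
+ * with the usermode definition.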
+ */
+#pragma pack(4)
+typedef struct lx_sysinfo32 {
+	int32_t si_uptime;	/* Seconds since boot */
+	uint32_t si_loads[3];	/* 1, 5, and 15 minute avg runq length */
+	uint32_t si_totalram;	/* Total memory size */
+	uint32_t si_freeram;	/* Available memory */
+	uint32_t si_sharedram;	/* Shared memory */
+	uint32_t si_bufferram;	/* Buffer memory */
+	uint32_t si_totalswap;	/* Total swap space */
+	uint32_t si_freeswap;	/* Avail swap space */
+	uint16_t si_procs;	/* Process count */
+	uint16_t si_pad;	/* Padding */
+	uint32_t si_totalhigh;	/* High memory size */
+	uint32_t si_freehigh;	/* Avail high memory */
+	uint32_t si_mem_unit;	/* Unit size of memory fields */
+	char __si_pad[8];
+} lx_sysinfo32_t;
+#pragma pack()
+#endif
+
+extern pgcnt_t swapfs_minfree;
+
+static void
+lx_sysinfo_common(lx_sysinfo_t *si)
+{
+	zone_t *zone = curthread->t_procp->p_zone;
+	uint64_t zphysmem, zfreemem, ztotswap, zfreeswap;
+
+	si->si_uptime = gethrestime_sec() - zone->zone_boot_time;
+
+	si->si_loads[0] = zone->zone_hp_avenrun[0];
+	si->si_loads[1] = zone->zone_hp_avenrun[1];
+	si->si_loads[2] = zone->zone_hp_avenrun[2];
+
+	/*
+	 * In linux each thread looks like a process, so we conflate the
+	 * two in this stat as well.
+	 */
+	si->si_procs = (int32_t)zone->zone_nlwps;
+
+	/*
+	 * If memory or swap limits are set on the zone, use those, otherwise
+	 * use the system values. physmem and freemem are in pages, but the
+	 * zone values are in bytes. Likewise, ani_max and ani_free are in
+	 * pages.
+	 */
+	if (zone->zone_phys_mem_ctl == UINT64_MAX) {
+		zphysmem = physmem;
+		zfreemem = freemem;
+	} else {
+		zphysmem = btop(zone->zone_phys_mem_ctl);
+		zfreemem = btop(zone->zone_phys_mem_ctl - zone->zone_phys_mem);
+	}
+
+	if (zone->zone_max_swap_ctl == UINT64_MAX) {
+		ztotswap = k_anoninfo.ani_max;
+		zfreeswap = k_anoninfo.ani_free;
+	} else {
+		/*
+		 * See the comment in swapctl for a description of how free is
+		 * calculated within a zone.
+		 */
+		rctl_qty_t used;
+		spgcnt_t avail;
+		uint64_t max;
+
+		avail = MAX((spgcnt_t)(availrmem - swapfs_minfree), 0);
+		max = k_anoninfo.ani_max + k_anoninfo.ani_mem_resv + avail;
+
+		mutex_enter(&zone->zone_mem_lock);
+		ztotswap = btop(zone->zone_max_swap_ctl);
+		used = btop(zone->zone_max_swap);
+		mutex_exit(&zone->zone_mem_lock);
+
+		zfreeswap = MIN(ztotswap, max) - used;
+	}
+
+	/*
+	 * If the maximum memory stat is less than 2^20 pages (i.e. 4GB),
+	 * then we report the result in bytes. Otherwise we use pages.
+	 * Once we start supporting >1TB systems/zones, we'll need a third
+	 * option.
+	 */
+	if (MAX(zphysmem, ztotswap) < 1024 * 1024) {
+		si->si_totalram = ptob(zphysmem);
+		si->si_freeram = ptob(zfreemem);
+		si->si_totalswap = ptob(ztotswap);
+		si->si_freeswap = ptob(zfreeswap);
+		si->si_mem_unit = 1;
+	} else {
+		si->si_totalram = zphysmem;
+		si->si_freeram = zfreemem;
+		si->si_totalswap = ztotswap;
+		si->si_freeswap = zfreeswap;
+		si->si_mem_unit = PAGESIZE;
+	}
+	si->si_bufferram = 0;
+	si->si_sharedram = 0;
+
+	/*
+	 * These two stats refer to high physical memory. If an
+	 * application running in a Linux zone cares about this, then
+	 * either it or we are broken.
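+	 *
+	 * A worked example of the unit scaling above, assuming 4K pages: a
+	 * zone capped at 2GB (2^19 pages) takes the byte-reporting branch
+	 * with si_mem_unit == 1, while an uncapped 64GB system (2^24 pages)
+	 * reports page counts with si_mem_unit == PAGESIZE.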
+ */ + si->si_totalhigh = 0; + si->si_freehigh = 0; +} + +long +lx_sysinfo64(caddr_t sip) +{ + lx_sysinfo_t si; + + bzero(&si, sizeof (si)); + lx_sysinfo_common(&si); + + if (copyout(&si, sip, sizeof (si)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +long +lx_sysinfo32(caddr_t sip) +{ + lx_sysinfo_t si; + lx_sysinfo32_t si32; + int i; + + lx_sysinfo_common(&si); + + /* + * Convert the lx_sysinfo_t into the legacy 32-bit view: + */ + bzero(&si32, sizeof (si32)); + si32.si_uptime = si.si_uptime; + + for (i = 0; i < 3; i++) { + if ((si.si_loads[i]) > 0x7fffffff) + si32.si_loads[i] = 0x7fffffff; + else + si32.si_loads[i] = si.si_loads[i]; + } + + si32.si_procs = si.si_procs; + si32.si_totalram = si.si_totalram; + si32.si_freeram = si.si_freeram; + si32.si_totalswap = si.si_totalswap; + si32.si_freeswap = si.si_freeswap; + si32.si_mem_unit = si.si_mem_unit; + + si32.si_bufferram = si.si_bufferram; + si32.si_sharedram = si.si_sharedram; + + si32.si_totalhigh = si.si_totalhigh; + si32.si_freehigh = si.si_freehigh; + + if (copyout(&si32, sip, sizeof (si32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} +#endif diff --git a/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c new file mode 100644 index 0000000000..48d91b09cc --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_thread_area.c @@ -0,0 +1,196 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/errno.h> +#include <sys/cpuvar.h> +#include <sys/archsystm.h> +#include <sys/proc.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_ldt.h> +#include <sys/lx_misc.h> +#include <sys/x86_archext.h> +#include <sys/controlregs.h> +#include <lx_syscall.h> + +long +lx_arch_prctl(int code, ulong_t addr) +{ +#if defined(__amd64) + klwp_t *lwp = ttolwp(curthread); + lx_lwp_data_t *llwp = lwptolxlwp(lwp); + pcb_t *pcb = &lwp->lwp_pcb; + + switch (code) { + case LX_ARCH_GET_FS: + if (copyout(&llwp->br_lx_fsbase, (void *)addr, + sizeof (llwp->br_lx_fsbase)) != 0) { + return (set_errno(EFAULT)); + } + break; + + case LX_ARCH_SET_FS: + llwp->br_lx_fsbase = addr; + + kpreempt_disable(); + if (pcb->pcb_fsbase != llwp->br_lx_fsbase) { + pcb->pcb_fsbase = llwp->br_lx_fsbase; + + /* + * Ensure we go out via update_sregs. 
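+			 * Setting pcb_rupdate signals the return-to-user
+			 * path (update_sregs) that the segment base values
+			 * held in the PCB have changed and must be reloaded.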
+ */ + pcb->pcb_rupdate = 1; + } + kpreempt_enable(); + break; + + case LX_ARCH_GET_GS: + if (copyout(&llwp->br_lx_gsbase, (void *)addr, + sizeof (llwp->br_lx_gsbase)) != 0) { + return (set_errno(EFAULT)); + } + break; + + case LX_ARCH_SET_GS: + llwp->br_lx_gsbase = addr; + + kpreempt_disable(); + if (pcb->pcb_gsbase != llwp->br_lx_gsbase) { + pcb->pcb_gsbase = llwp->br_lx_gsbase; + + /* + * Ensure we go out via update_sregs. + */ + pcb->pcb_rupdate = 1; + } + kpreempt_enable(); + break; + + default: + return (set_errno(EINVAL)); + } +#endif + + return (0); +} + +long +lx_get_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + + if (fuword32(&inf->entry_number, (uint32_t *)&entry)) + return (set_errno(EFAULT)); + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + /* + * convert the solaris ldt to the linux format expected by the + * caller + */ + DESC_TO_LDT_INFO(dscrp, &ldt_inf); + ldt_inf.entry_number = entry; + + if (copyout(&ldt_inf, inf, sizeof (struct ldt_info))) + return (set_errno(EFAULT)); + + return (0); +} + +long +lx_set_thread_area(struct ldt_info *inf) +{ + struct lx_lwp_data *jlwp = ttolxlwp(curthread); + struct ldt_info ldt_inf; + user_desc_t *dscrp; + int entry; + int i; + + /* Check that casts for accessing the words in user_desc are valid */ + ASSERT(sizeof (user_desc_t) == 8); + + if (copyin(inf, &ldt_inf, sizeof (ldt_inf))) + return (set_errno(EFAULT)); + + entry = ldt_inf.entry_number; + if (entry == -1) { + /* + * Find an empty entry in the tls for this thread. + * The casts assume each user_desc_t entry is 8 bytes. + */ + for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) { + if (((uint_t *)dscrp)[0] == 0 && + ((uint_t *)dscrp)[1] == 0) + break; + } + + if (i < LX_TLSNUM) { + /* + * found one + */ + entry = i + GDT_TLSMIN; + if (suword32(&inf->entry_number, entry)) + return (set_errno(EFAULT)); + } else { + return (set_errno(ESRCH)); + } + } + + if (entry < GDT_TLSMIN || entry > GDT_TLSMAX) + return (set_errno(EINVAL)); + + /* + * convert the linux ldt info to standard intel descriptor + */ + dscrp = jlwp->br_tls + entry - GDT_TLSMIN; + + if (LDT_INFO_EMPTY(&ldt_inf)) { + ((uint_t *)dscrp)[0] = 0; + ((uint_t *)dscrp)[1] = 0; + } else { + LDT_INFO_TO_DESC(&ldt_inf, dscrp); + } + + /* + * update the gdt with the new descriptor + */ + kpreempt_disable(); + + for (i = 0, dscrp = jlwp->br_tls; i < LX_TLSNUM; i++, dscrp++) + lx_set_gdt(GDT_TLSMIN + i, dscrp); + + kpreempt_enable(); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_timer.c b/usr/src/uts/common/brand/lx/syscall/lx_timer.c new file mode 100644 index 0000000000..aa22228808 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_timer.c @@ -0,0 +1,367 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +/* + * The illumos kernel provides two clock backends: CLOCK_REALTIME, the + * adjustable system wall clock; and CLOCK_HIGHRES, the monotonically + * increasing time source that is not subject to drift or adjustment. By + * contrast, the Linux kernel is furnished with an overabundance of narrowly + * differentiated clock types. + * + * Fortunately, most of the commonly used Linux clock types are either similar + * enough to the native clock backends that they can be directly mapped, or + * represent queries to the per-process and per-LWP microstate counters. + */ + +#include <sys/time.h> +#include <sys/systm.h> +#include <sys/cmn_err.h> +#include <sys/lx_impl.h> + +/* + * From "uts/common/os/timer.c": + */ +extern int clock_settime(clockid_t, timespec_t *); +extern int clock_gettime(clockid_t, timespec_t *); +extern int clock_getres(clockid_t, timespec_t *); + + +static int lx_emul_clock_getres(clockid_t, timespec_t *); +static int lx_emul_clock_gettime(clockid_t, timespec_t *); +static int lx_emul_clock_settime(clockid_t, timespec_t *); + +typedef struct lx_clock_backend { + clockid_t lclk_ntv_id; + int (*lclk_clock_getres)(clockid_t, timespec_t *); + int (*lclk_clock_gettime)(clockid_t, timespec_t *); + int (*lclk_clock_settime)(clockid_t, timespec_t *); +} lx_clock_backend_t; + +/* + * NOTE: The Linux man pages state this structure is obsolete and is + * unsupported, so it is declared here for sizing purposes only. + */ +struct lx_timezone { + int tz_minuteswest; /* minutes W of Greenwich */ + int tz_dsttime; /* type of dst correction */ +}; + +/* + * Use the native clock_* system call implementation, but with a translated + * clock identifier: + */ +#define NATIVE(ntv_id) \ + { ntv_id, clock_getres, clock_gettime, clock_settime } + +/* + * This backend is not supported, so we provide an emulation handler: + */ +#define EMUL(ntv_id) \ + { ntv_id, lx_emul_clock_getres, lx_emul_clock_gettime, \ + lx_emul_clock_settime } + +static lx_clock_backend_t lx_clock_backends[] = { + NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC */ + EMUL(CLOCK_PROCESS_CPUTIME_ID), /* LX_CLOCK_PROCESS_CPUTIME_ID */ + EMUL(CLOCK_THREAD_CPUTIME_ID), /* LX_CLOCK_THREAD_CPUTIME_ID */ + NATIVE(CLOCK_HIGHRES), /* LX_CLOCK_MONOTONIC_RAW */ + NATIVE(CLOCK_REALTIME), /* LX_CLOCK_REALTIME_COARSE */ + NATIVE(CLOCK_HIGHRES) /* LX_CLOCK_MONOTONIC_COARSE */ +}; + +#define LX_CLOCK_MAX \ + (sizeof (lx_clock_backends) / sizeof (lx_clock_backends[0])) +#define LX_CLOCK_BACKEND(clk) \ + ((clk) < LX_CLOCK_MAX && (clk) >= 0 ? 
&lx_clock_backends[(clk)] : NULL) + +static int +lx_emul_clock_settime(clockid_t clock, timespec_t *tp) +{ + return (set_errno(EINVAL)); +} + +static int +lx_emul_clock_gettime(clockid_t clock, timespec_t *tp) +{ + timespec_t t; + + switch (clock) { + case CLOCK_PROCESS_CPUTIME_ID: { + proc_t *p = ttoproc(curthread); + hrtime_t snsecs, unsecs; + + /* + * Based on getrusage() in "rusagesys.c": + */ + mutex_enter(&p->p_lock); + unsecs = mstate_aggr_state(p, LMS_USER); + snsecs = mstate_aggr_state(p, LMS_SYSTEM); + mutex_exit(&p->p_lock); + + hrt2ts(unsecs + snsecs, &t); + break; + } + + case CLOCK_THREAD_CPUTIME_ID: { + klwp_t *lwp = ttolwp(curthread); + struct mstate *ms = &lwp->lwp_mstate; + hrtime_t snsecs, unsecs; + + /* + * Based on getrusage_lwp() in "rusagesys.c": + */ + unsecs = ms->ms_acct[LMS_USER]; + snsecs = ms->ms_acct[LMS_SYSTEM] + ms->ms_acct[LMS_TRAP]; + + scalehrtime(&unsecs); + scalehrtime(&snsecs); + + hrt2ts(unsecs + snsecs, &t); + break; + } + + default: + return (set_errno(EINVAL)); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + if (TIMESPEC_OVERFLOW(&t)) { + return (set_errno(EOVERFLOW)); + } + TIMESPEC_TO_TIMESPEC32(&t32, &t); + + if (copyout(&t32, tp, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); + } +#endif + + if (copyout(&t, tp, sizeof (t)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +static int +lx_emul_clock_getres(clockid_t clock, timespec_t *tp) +{ + timespec_t t; + + if (tp == NULL) { + return (0); + } + + switch (clock) { + case CLOCK_PROCESS_CPUTIME_ID: + case CLOCK_THREAD_CPUTIME_ID: + /* + * These clock backends return microstate accounting values for + * the LWP or the entire process. The Linux kernel claims they + * have nanosecond resolution; so will we. 
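+		 * A hypothetical caller thus observes:
+		 *
+		 *	struct timespec ts;
+		 *	clock_getres(CLOCK_PROCESS_CPUTIME_ID, &ts);
+		 *
+		 * yielding ts.tv_sec == 0, ts.tv_nsec == 1.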
+ */ + t.tv_sec = 0; + t.tv_nsec = 1; + break; + + default: + return (set_errno(EINVAL)); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + timespec32_t t32; + + if (TIMESPEC_OVERFLOW(&t)) { + return (set_errno(EOVERFLOW)); + } + TIMESPEC_TO_TIMESPEC32(&t32, &t); + + if (copyout(&t32, tp, sizeof (t32)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); + } +#endif + + if (copyout(&t, tp, sizeof (t)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +static void +lx_clock_unsupported(int clock) +{ + char buf[100]; + + (void) snprintf(buf, sizeof (buf), "unsupported clock: %d", clock); + lx_unsupported(buf); +} + +long +lx_clock_settime(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_settime(backend->lclk_ntv_id, tp)); +} + +long +lx_clock_gettime(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_gettime(backend->lclk_ntv_id, tp)); +} + +long +lx_clock_getres(int clock, timespec_t *tp) +{ + lx_clock_backend_t *backend; + + if (tp == NULL) { + return (0); + } + + if ((backend = LX_CLOCK_BACKEND(clock)) == NULL) { + lx_clock_unsupported(clock); + return (set_errno(EINVAL)); + } + + return (backend->lclk_clock_getres(backend->lclk_ntv_id, tp)); +} + + +long +lx_gettimeofday(struct timeval *tvp, struct lx_timezone *tzp) +{ + struct lx_timezone tz; + + bzero(&tz, sizeof (tz)); + + /* + * We want to be similar to libc which just does a fasttrap to + * gethrestime and simply converts that result. We follow how uniqtime + * does the conversion but we can't use that code since it does some + * extra work which can cause the result to bounce around based on which + * CPU we run on. + */ + if (tvp != NULL) { + struct timeval tv; + timestruc_t ts; + int usec, nsec; + + gethrestime(&ts); + nsec = ts.tv_nsec; + usec = nsec + (nsec >> 2); + usec = nsec + (usec >> 1); + usec = nsec + (usec >> 2); + usec = nsec + (usec >> 4); + usec = nsec - (usec >> 3); + usec = nsec + (usec >> 2); + usec = nsec + (usec >> 3); + usec = nsec + (usec >> 4); + usec = nsec + (usec >> 1); + usec = nsec + (usec >> 6); + usec = usec >> 10; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = usec; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyout(&tv, tvp, sizeof (tv)) != 0) + return (set_errno(EFAULT)); + } +#ifdef _SYSCALL32_IMPL + else { + struct timeval32 tv32; + + if (TIMEVAL_OVERFLOW(&tv)) + return (set_errno(EOVERFLOW)); + TIMEVAL_TO_TIMEVAL32(&tv32, &tv); + + if (copyout(&tv32, tvp, sizeof (tv32))) + return (set_errno(EFAULT)); + } +#endif + } + + /* + * The Linux man page states use of the second parameter is obsolete, + * but gettimeofday(2) should still return EFAULT if it is set + * to a bad non-NULL pointer (sigh...) + */ + if (tzp != NULL && copyout(&tz, tzp, sizeof (tz)) != 0) + return (set_errno(EFAULT)); + + return (0); +} + +/* + * On Linux a bad buffer will set errno to EFAULT, and on Illumos the failure + * mode is documented as "undefined." 
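+ * We follow the Linux behaviour here: if the copyout of the result
+ * fails, the caller gets EFAULT rather than anything undefined.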
+ */ +long +lx_time(time_t *tp) +{ + timestruc_t ts; + struct timeval tv; + + gethrestime(&ts); + tv.tv_sec = ts.tv_sec; + tv.tv_usec = 0; + + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (tp != NULL && + copyout(&tv.tv_sec, tp, sizeof (tv.tv_sec)) != 0) + return (set_errno(EFAULT)); + + return (tv.tv_sec); + } +#ifdef _SYSCALL32_IMPL + else { + struct timeval32 tv32; + + if (TIMEVAL_OVERFLOW(&tv)) + return (set_errno(EOVERFLOW)); + TIMEVAL_TO_TIMEVAL32(&tv32, &tv); + + if (tp != NULL && + copyout(&tv32.tv_sec, tp, sizeof (tv32.tv_sec))) + return (set_errno(EFAULT)); + + return (tv32.tv_sec); + } +#endif +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_wait.c b/usr/src/uts/common/brand/lx/syscall/lx_wait.c new file mode 100644 index 0000000000..e6c5607e43 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_wait.c @@ -0,0 +1,473 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. + */ + +/* + * wait() family of functions. + * + * The first minor difference between the Linux and Solaris family of wait() + * calls is that the values for WNOHANG and WUNTRACED are different. Thankfully, + * the exit status values are identical between the two implementations. + * + * Things get very different and very complicated when we introduce the Linux + * threading model. Under linux, both threads and child processes are + * represented as processes. However, the behavior of wait() with respect to + * each child varies according to the flags given to clone() + * + * SIGCHLD The SIGCHLD signal should be sent on termination + * CLONE_THREAD The child shares the same thread group as the parent + * CLONE_DETACHED The parent receives no notification when the child exits + * + * The following flags control the Linux behavior w.r.t. the above attributes: + * + * __WALL Wait on all children, regardless of type + * __WCLONE Wait only on non-SIGCHLD children + * __WNOTHREAD Don't wait on children of other threads in this group + * + * The following chart shows whether wait() returns when the child exits: + * + * default __WCLONE __WALL + * no SIGCHLD - X X + * SIGCHLD X - X + * + * The following chart shows whether wait() returns when the grandchild exits: + * + * default __WNOTHREAD + * no CLONE_THREAD - - + * CLONE_THREAD X - + * + * The CLONE_DETACHED flag is universal - when the child exits, no state is + * stored and wait() has no effect. + * + * XXX Support the above combination of options, or some reasonable subset that + * covers at least fork() and pthread_create(). 
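+ *
+ * For example (hypothetical sketch), a thread library reaping a child
+ * created without SIGCHLD must, per the first chart above, pass __WCLONE:
+ *
+ *	waitpid(tid, &status, __WCLONE);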
+ */ + +#include <sys/wait.h> +#include <sys/brand.h> +#include <sys/lx_brand.h> +#include <sys/lx_types.h> +#include <sys/lx_siginfo.h> +#include <lx_signum.h> +#include <lx_errno.h> +#include <lx_syscall.h> + +/* + * From "uts/common/os/exit.c" and "uts/common/syscall/rusagesys.c": + */ +extern int waitid(idtype_t, id_t, k_siginfo_t *, int); +extern int rusagesys(int, void *, void *, void *, void *); + +/* + * Convert between Linux options and Solaris options, returning -1 if any + * invalid flags are found. + */ +#define LX_WNOHANG 0x00000001 +#define LX_WUNTRACED 0x00000002 +#define LX_WSTOPPED LX_WUNTRACED +#define LX_WEXITED 0x00000004 +#define LX_WCONTINUED 0x00000008 +#define LX_WNOWAIT 0x01000000 + +#define LX_WNOTHREAD 0x20000000 +#define LX_WALL 0x40000000 +#define LX_WCLONE 0x80000000 + +#define LX_P_ALL 0x0 +#define LX_P_PID 0x1 +#define LX_P_GID 0x2 + +/* + * Split the passed waitpid/waitid options into two separate variables: + * those for the native illumos waitid(2), and the extra Linux-specific + * options we will handle in our brand-specific code. + */ +static int +ltos_options(uintptr_t options, int *native_options, int *extra_options) +{ + int newoptions = 0; + + if (((options) & ~(LX_WNOHANG | LX_WUNTRACED | LX_WEXITED | + LX_WCONTINUED | LX_WNOWAIT | LX_WNOTHREAD | LX_WALL | + LX_WCLONE)) != 0) { + return (-1); + } + + *extra_options = options & (LX_WNOTHREAD | LX_WALL | LX_WCLONE); + + if (options & LX_WNOHANG) + newoptions |= WNOHANG; + if (options & LX_WUNTRACED) + newoptions |= WUNTRACED; + if (options & LX_WEXITED) + newoptions |= WEXITED; + if (options & LX_WCONTINUED) + newoptions |= WCONTINUED; + if (options & LX_WNOWAIT) + newoptions |= WNOWAIT; + + /* + * The trapped option is implicit on Linux. + */ + newoptions |= WTRAPPED; + + *native_options = newoptions; + return (0); +} + +static int +lx_wstat(int code, int status) +{ + int stat = 0; + + switch (code) { + case CLD_EXITED: + stat = status << 8; + break; + case CLD_DUMPED: + stat = lx_stol_signo(status, SIGKILL) | WCOREFLG; + break; + case CLD_KILLED: + stat = lx_stol_signo(status, SIGKILL); + break; + case CLD_TRAPPED: + case CLD_STOPPED: + stat = (lx_stol_status(status, SIGKILL) << 8) | WSTOPFLG; + break; + case CLD_CONTINUED: + stat = WCONTFLG; + break; + } + + return (stat); +} + +static int +lx_call_waitid(idtype_t idtype, id_t id, k_siginfo_t *sip, int native_options, + int extra_options) +{ + lx_lwp_data_t *lwpd = ttolxlwp(curthread); + int error; + + /* + * Our brand-specific waitid helper only understands a subset of + * the possible idtypes. Ensure we keep to that subset here: + */ + if (idtype != P_ALL && idtype != P_PID && idtype != P_PGID) { + return (EINVAL); + } + + /* + * Enable the return of emulated ptrace(2) stop conditions + * through lx_waitid_helper, and stash the Linux-specific + * extra waitid() flags. + */ + lwpd->br_waitid_emulate = B_TRUE; + lwpd->br_waitid_flags = extra_options; + + if ((error = waitid(idtype, id, sip, native_options)) == EINTR) { + /* + * According to signal(7), the wait4(2), waitid(2), and + * waitpid(2) system calls are restartable. 
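+		 * Flag the LWP accordingly, so that the brand's syscall
+		 * return path restarts the call rather than surfacing
+		 * EINTR.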
+ */ + ttolxlwp(curthread)->br_syscall_restart = B_TRUE; + } + + lwpd->br_waitid_emulate = B_FALSE; + lwpd->br_waitid_flags = 0; + + return (error); +} + +long +lx_wait4(uintptr_t p1, uintptr_t p2, uintptr_t p3, uintptr_t p4) +{ + k_siginfo_t info = { 0 }; + idtype_t idtype; + id_t id; + int status = 0; + pid_t pid = (pid_t)p1; + int error; + int native_options, extra_options; + int *statusp = (int *)p2; + void *rup = (void *)p4; + + if (ltos_options(p3, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } + + if (pid > maxpid) { + return (set_errno(ECHILD)); + } + + /* + * While not listed as a valid return code, Linux's wait4(2) does, + * in fact, get an EFAULT if either the status pointer or rusage + * pointer is invalid. Since a failed waitpid should leave child + * process in a state where a future wait4(2) will succeed, we + * check them by copying out the values their buffers originally + * contained. (We need to do this as a failed system call should + * never affect the contents of a passed buffer.) + * + * This will fail if the buffers in question are write-only. + */ + if (statusp != NULL) { + if (copyin(statusp, &status, sizeof (status)) != 0 || + copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } + + /* + * Do the same check for the "struct rusage" pointer, which differs + * in size for 32- and 64-bit processes. + */ + if (rup != NULL) { + struct rusage ru; + void *krup = &ru; + size_t rusz = sizeof (ru); +#if defined(_SYSCALL32_IMPL) + struct rusage32 ru32; + + if (get_udatamodel() != DATAMODEL_NATIVE) { + krup = &ru32; + rusz = sizeof (ru32); + } +#endif + + if (copyin(rup, krup, rusz) != 0 || + copyout(krup, rup, rusz) != 0) { + return (set_errno(EFAULT)); + } + } + + if (pid < -1) { + idtype = P_PGID; + id = -pid; + } else if (pid == -1) { + idtype = P_ALL; + id = 0; + } else if (pid == 0) { + idtype = P_PGID; + mutex_enter(&pidlock); + id = curproc->p_pgrp; + mutex_exit(&pidlock); + } else { + idtype = P_PID; + id = pid; + } + + native_options |= (WEXITED | WTRAPPED); + + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); + } + + /* + * If the WNOHANG flag was specified and no child was found return 0. + */ + if ((native_options & WNOHANG) && info.si_pid == 0) { + return (0); + } + + status = lx_wstat(info.si_code, info.si_status); + + /* + * Unfortunately if this attempt to copy out either the status or the + * rusage fails, the process will be in an inconsistent state as + * subsequent calls to wait for the same child will fail where they + * should succeed on a Linux system. This, however, is rather + * unlikely since we tested the validity of both above. 
+ */ + if (statusp != NULL) { + if (copyout(&status, statusp, sizeof (status)) != 0) { + return (set_errno(EFAULT)); + } + } + + if (rup != NULL) { + if ((error = rusagesys(_RUSAGESYS_GETRUSAGE_CHLD, rup, NULL, + NULL, NULL)) != 0) { + return (set_errno(error)); + } + } + + return (info.si_pid); +} + +long +lx_waitpid(uintptr_t p1, uintptr_t p2, uintptr_t p3) +{ + return (lx_wait4(p1, p2, p3, NULL)); +} + +static int +stol_ksiginfo(k_siginfo_t *sip, uintptr_t lxsip) +{ + lx_siginfo_t lsi; + + bzero(&lsi, sizeof (lsi)); + lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD); + lsi.lsi_code = lx_stol_sigcode(sip->si_code); + lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL); + + switch (lsi.lsi_signo) { + case LX_SIGPOLL: + lsi.lsi_band = sip->si_band; + lsi.lsi_fd = sip->si_fd; + break; + + case LX_SIGCHLD: + lsi.lsi_pid = sip->si_pid; + if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) { + lsi.lsi_status = sip->si_status; + } else { + lsi.lsi_status = lx_stol_status(sip->si_status, + SIGKILL); + } + lsi.lsi_utime = sip->si_utime; + lsi.lsi_stime = sip->si_stime; + break; + + case LX_SIGILL: + case LX_SIGBUS: + case LX_SIGFPE: + case LX_SIGSEGV: + lsi.lsi_addr = sip->si_addr; + break; + + default: + lsi.lsi_pid = sip->si_pid; + lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid); + } + + if (copyout(&lsi, (void *)lxsip, sizeof (lsi)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} + +#if defined(_SYSCALL32_IMPL) +static int +stol_ksiginfo32(k_siginfo_t *sip, uintptr_t lxsip) +{ + lx_siginfo32_t lsi; + + bzero(&lsi, sizeof (lsi)); + lsi.lsi_signo = lx_stol_signo(sip->si_signo, SIGCLD); + lsi.lsi_code = lx_stol_sigcode(sip->si_code); + lsi.lsi_errno = lx_errno(sip->si_errno, EINVAL); + + switch (lsi.lsi_signo) { + case LX_SIGPOLL: + lsi.lsi_band = sip->si_band; + lsi.lsi_fd = sip->si_fd; + break; + + case LX_SIGCHLD: + lsi.lsi_pid = sip->si_pid; + if (sip->si_code <= 0 || sip->si_code == CLD_EXITED) { + lsi.lsi_status = sip->si_status; + } else { + lsi.lsi_status = lx_stol_status(sip->si_status, + SIGKILL); + } + lsi.lsi_utime = sip->si_utime; + lsi.lsi_stime = sip->si_stime; + break; + + case LX_SIGILL: + case LX_SIGBUS: + case LX_SIGFPE: + case LX_SIGSEGV: + lsi.lsi_addr = (caddr32_t)(uintptr_t)sip->si_addr; + break; + + default: + lsi.lsi_pid = sip->si_pid; + lsi.lsi_uid = LX_UID32_TO_UID16(sip->si_uid); + } + + if (copyout(&lsi, (void *)lxsip, sizeof (lsi)) != 0) { + return (set_errno(EFAULT)); + } + + return (0); +} +#endif + +long +lx_waitid(uintptr_t idtype, uintptr_t id, uintptr_t infop, uintptr_t opt) +{ + int error; + int native_options, extra_options; + k_siginfo_t info = { 0 }; + + if (ltos_options(opt, &native_options, &extra_options) == -1) { + return (set_errno(EINVAL)); + } + + if (((opt) & (LX_WEXITED | LX_WSTOPPED | LX_WCONTINUED)) == 0) { + return (set_errno(EINVAL)); + } + + switch (idtype) { + case LX_P_ALL: + idtype = P_ALL; + break; + case LX_P_PID: + idtype = P_PID; + break; + case LX_P_GID: + idtype = P_PGID; + break; + default: + return (set_errno(EINVAL)); + } + + if ((error = lx_call_waitid(idtype, id, &info, native_options, + extra_options)) != 0) { + return (set_errno(error)); + } + + /* + * If the WNOHANG flag was specified and no child was found return 0. 
+ */ + if ((native_options & WNOHANG) && info.si_pid == 0) { + return (0); + } + +#if defined(_SYSCALL32_IMPL) + if (get_udatamodel() != DATAMODEL_NATIVE) { + return (stol_ksiginfo32(&info, infop)); + } else +#endif + { + return (stol_ksiginfo(&info, infop)); + } +} diff --git a/usr/src/uts/common/brand/lx/syscall/lx_xattr.c b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c new file mode 100644 index 0000000000..ea23c3e4b8 --- /dev/null +++ b/usr/src/uts/common/brand/lx/syscall/lx_xattr.c @@ -0,0 +1,31 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/file.h> + +/* + * *xattr() family of functions. + * + * These are currently unimplemented. We return EOPNOTSUPP for now, rather + * than using NOSYS_NO_EQUIV to avoid unwanted stderr output from ls(1). + */ + +long +lx_xattr(void) +{ + return (set_errno(EOPNOTSUPP)); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h new file mode 100644 index 0000000000..a30fdd9c06 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysfs.h @@ -0,0 +1,192 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _LXSYSFS_H +#define _LXSYSFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lx_sysfs.h: declarations, data structures and macros for lx_sysfs + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> +#include <sys/netstack.h> +#include <inet/ip.h> +#include <inet/ip_if.h> + +/* + * Convert a vnode into an lxsys_mnt_t + */ +#define VTOLXSM(vp) ((lxsys_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxsys_node + */ +#define VTOLXS(vp) ((lxsys_node_t *)(vp)->v_data) + +/* + * convert a lxsys_node into a vnode + */ +#define LXSTOV(lxsnp) ((lxsnp)->lxsys_vnode) + +/* + * convert a lxsys_node into zone for fs + */ +#define LXSTOZ(lxsnp) \ + (((lxsys_mnt_t *)(lxsnp)->lxsys_vnode->v_vfsp->vfs_data)->lxsysm_zone) + +#define LXSNSIZ 256 /* max size of lx /sys file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXSYS_SDSIZE 16 + +/* Root sysfs lxsys_instance */ +#define LXSYS_INST_ROOT 0 + +/* + * Node/file types for lx /sys files + * (directories and files contained therein). + */ +typedef enum lxsys_nodetype { + LXSYS_NONE, /* None-type to keep inodes non-zero */ + LXSYS_STATIC, /* Statically defined entries */ + LXSYS_CLASS_NET, /* /sys/class/net/<iface> */ + LXSYS_DEVICES_NET, /* /sys/devices/virtual/net/<iface> */ + LXSYS_MAXTYPE, /* type limit */ +} lxsys_nodetype_t; + +/* + * external dirent characteristics + */ +typedef struct { + unsigned int d_idnum; + char *d_name; +} lxsys_dirent_t; + +typedef struct { + unsigned int dl_instance; + lxsys_dirent_t *dl_list; + int dl_length; +} lxsys_dirlookup_t; + +/* + * This is the lx sysfs private data object + * which is attached to v_data in the vnode structure + */ +struct lxsys_node; +typedef struct lxsys_node lxsys_node_t; +struct lxsys_node { + lxsys_nodetype_t lxsys_type; /* type ID of node */ + unsigned int lxsys_instance; /* instance ID node */ + unsigned int lxsys_endpoint; /* endpoint ID node */ + vnode_t *lxsys_vnode; /* vnode for the node */ + vnode_t *lxsys_parent; /* parent directory */ + vnode_t *lxsys_realvp; /* real vnode */ + lxsys_node_t *lxsys_next; /* next list entry */ + timestruc_t lxsys_time; /* creation time */ + mode_t lxsys_mode; /* file mode bits */ + uid_t lxsys_uid; /* file owner */ + gid_t lxsys_gid; /* file group owner */ + ino_t lxsys_ino; /* node id */ +}; + +/* + * This is the lxsysfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxsys_mnt { + kmutex_t lxsysm_lock; /* protects fields */ + lxsys_node_t *lxsysm_node; /* node at root of sys mount */ + zone_t *lxsysm_zone; /* zone for this mount */ +} lxsys_mnt_t; + +extern vnodeops_t *lxsys_vnodeops; + +typedef struct mounta mounta_t; + +extern void lxsys_initnodecache(); +extern void lxsys_fininodecache(); +extern ino_t lxsys_inode(lxsys_nodetype_t, unsigned int, unsigned int); +extern ino_t 
lxsys_parentinode(lxsys_node_t *); +extern lxsys_node_t *lxsys_getnode(vnode_t *, lxsys_nodetype_t, unsigned int, + unsigned int); +extern lxsys_node_t *lxsys_getnode_static(vnode_t *, unsigned int); +extern void lxsys_freenode(lxsys_node_t *); + +extern netstack_t *lxsys_netstack(lxsys_node_t *); +extern ill_t *lxsys_find_ill(ip_stack_t *, uint_t); + +typedef struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t bufsize; + char *pos; + size_t beg; + int error; +} lxsys_uiobuf_t; + +extern lxsys_uiobuf_t *lxsys_uiobuf_new(uio_t *); +extern void lxsys_uiobuf_free(lxsys_uiobuf_t *); +extern void lxsys_uiobuf_seterr(lxsys_uiobuf_t *, int); +extern int lxsys_uiobuf_flush(lxsys_uiobuf_t *); +extern void lxsys_uiobuf_write(lxsys_uiobuf_t *, const char *, size_t); +extern void lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...); + +#ifdef __cplusplus +} +#endif + +#ifndef islower +#define islower(x) (((unsigned)(x) >= 'a') && ((unsigned)(x) <= 'z')) +#endif +#ifndef toupper +#define toupper(x) (islower(x) ? (x) - 'a' + 'A' : (x)) +#endif + +#endif /* _LXSYSFS_H */ diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c new file mode 100644 index 0000000000..36da7fe546 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_syssubr.c @@ -0,0 +1,461 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * lx_syssubr.c: Various functions for the /sys vnodeops. 
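+ *
+ * These cover the lxsys_node kmem cache, inode numbering, node
+ * allocation and teardown, netstack lookup and the uiobuf helpers
+ * used to render file contents. Inode numbers pack type, instance
+ * and endpoint; e.g. lxsys_inode(LXSYS_CLASS_NET, 4, 0) evaluates to
+ * (2 << 24) | (4 << 8) | 0, i.e. 0x02000400.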
+ */ + +#include <sys/varargs.h> + +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lx_sysfs.h" + +#define LXSYSCACHE_NAME "lxsys_cache" + +static int lxsys_node_constructor(void *, void *, int); +static void lxsys_node_destructor(void *, void *); + +static kmem_cache_t *lxsys_node_cache; + +void +lxsys_initnodecache() +{ + lxsys_node_cache = kmem_cache_create(LXSYSCACHE_NAME, + sizeof (lxsys_node_t), 0, + lxsys_node_constructor, lxsys_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxsys_fininodecache() +{ + kmem_cache_destroy(lxsys_node_cache); +} + +/* ARGSUSED */ +static int +lxsys_node_constructor(void *buf, void *un, int kmflags) +{ + lxsys_node_t *lxsnp = buf; + vnode_t *vp; + + vp = lxsnp->lxsys_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxsys_vnodeops); + vp->v_data = lxsnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxsys_node_destructor(void *buf, void *un) +{ + lxsys_node_t *lxsnp = buf; + + vn_free(LXSTOV(lxsnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxsys node + */ +ino_t +lxsys_inode(lxsys_nodetype_t type, unsigned int instance, + unsigned int endpoint) +{ + /* + * Sysfs Inode format: + * 0000AABBBBCC + * + * AA - TYPE + * BBBB - INSTANCE + * CC - ENDPOINT + */ + ASSERT(instance <= 0xffff); + ASSERT(endpoint <= 0xff); + + return ((ino_t)(type << 24)|(instance << 8)|endpoint); +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxsys_parentinode(lxsys_node_t *lxsnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted on inode so just return our inode number + */ + if (lxsnp->lxsys_type == LXSYS_STATIC && + lxsnp->lxsys_instance == LXSYS_INST_ROOT) { + return (lxsnp->lxsys_ino); + } else { + return (VTOLXS(lxsnp->lxsys_parent)->lxsys_ino); + } +} + +/* + * Allocate a new lxsys node + * + * This also allocates the vnode associated with it + */ +lxsys_node_t * +lxsys_getnode(vnode_t *dp, lxsys_nodetype_t type, unsigned int instance, + unsigned int endpoint) +{ + lxsys_node_t *lxsnp; + vnode_t *vp; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in vop_innactive + */ + lxsnp = kmem_cache_alloc(lxsys_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxsnp->lxsys_type = type; + lxsnp->lxsys_instance = instance; + lxsnp->lxsys_endpoint = endpoint; + lxsnp->lxsys_next = NULL; + lxsnp->lxsys_realvp = NULL; + lxsnp->lxsys_parent = dp; + VN_HOLD(dp); + + lxsnp->lxsys_time = now; + lxsnp->lxsys_uid = lxsnp->lxsys_gid = 0; + lxsnp->lxsys_ino = lxsys_inode(type, instance, endpoint); + + /* initialize the vnode data */ + vp = lxsnp->lxsys_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Default to a directory with open permissions. 
+ * Specific components will override this + */ + if (type == LXSYS_STATIC && instance == LXSYS_INST_ROOT) { + vp->v_flag |= VROOT; + } + vp->v_type = VDIR; + lxsnp->lxsys_mode = 0555; + + return (lxsnp); +} + +lxsys_node_t * +lxsys_getnode_static(vnode_t *dp, unsigned int instance) +{ + lxsys_mnt_t *lxsm = VTOLXSM(dp); + lxsys_node_t *lnp; + + mutex_enter(&lxsm->lxsysm_lock); + lnp = lxsm->lxsysm_node; + while (1) { + if (lnp->lxsys_instance == instance) { + VERIFY(lnp->lxsys_parent == dp); + + VN_HOLD(lnp->lxsys_vnode); + mutex_exit(&lxsm->lxsysm_lock); + return (lnp); + } else if (lnp->lxsys_next == NULL) { + break; + } + lnp = lnp->lxsys_next; + } + + /* + * No persistent node found. + * Create one and add it to the end of the list. + */ + lnp->lxsys_next = lxsys_getnode(dp, LXSYS_STATIC, instance, 0); + lnp = lnp->lxsys_next; + /* Allow mounts on static entries */ + LXSTOV(lnp)->v_flag &= (~VNOMOUNT); + + mutex_exit(&lxsm->lxsysm_lock); + return (lnp); +} + +/* Clean up persistence for static lxsys_node */ +int +lxsys_freenode_static(lxsys_node_t *lnp) +{ + lxsys_node_t *plnp; + vnode_t *vp = LXSTOV(lnp); + lxsys_mnt_t *lxsm = VTOLXSM(vp); + + if (lnp->lxsys_instance == LXSYS_INST_ROOT) { + /* + * The root vnode does not need special cleanup since it + * anchors the list and is freed by lxsys_unmount. + */ + return (0); + } + + mutex_enter(&lxsm->lxsysm_lock); + + /* + * It is possible that a different process acquired a fresh reference + * to this vnode via lookup while we were waiting on the lxsysm_lock. + * To avoid freeing the vnode out from under them, we will double-check + * v_count and bail from the fop_inactive if it was grabbed. + */ + mutex_enter(&vp->v_lock); + if (vp->v_count != 1) { + VERIFY(vp->v_count > 0); + + mutex_exit(&vp->v_lock); + mutex_exit(&lxsm->lxsysm_lock); + return (-1); + } + mutex_exit(&vp->v_lock); + + /* search for the record pointing to lnp */ + plnp = lxsm->lxsysm_node; + while (plnp != NULL && plnp->lxsys_next != lnp) { + plnp = plnp->lxsys_next; + } + /* entry should always be found */ + VERIFY(plnp != NULL); + plnp->lxsys_next = lnp->lxsys_next; + + mutex_exit(&lxsm->lxsysm_lock); + return (0); +} + +/* + * Free the storage obtained from lxsys_getnode(). + */ +void +lxsys_freenode(lxsys_node_t *lxsnp) +{ + vnode_t *vp = LXSTOV(lxsnp); + + VERIFY(vp != NULL); + + if (lxsnp->lxsys_type == LXSYS_STATIC) { + if (lxsys_freenode_static(lxsnp) != 0) { + return; + } + } + + /* + * delete any association with realvp + */ + if (lxsnp->lxsys_realvp != NULL) + VN_RELE(lxsnp->lxsys_realvp); + + /* + * delete any association with parent vp + */ + if (lxsnp->lxsys_parent != NULL) + VN_RELE(lxsnp->lxsys_parent); + + /* + * Release the lxsysnode. + */ + kmem_cache_free(lxsys_node_cache, lxsnp); +} + +/* + * Get the netstack associated with this lxsys mount + */ +netstack_t * +lxsys_netstack(lxsys_node_t *lnp) +{ + zone_t *zone = VTOLXSM(LXSTOV(lnp))->lxsysm_zone; + netstack_t *ns = zone->zone_netstack; + + VERIFY(ns != NULL); + + if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) { + ns = NULL; + } else { + netstack_hold(ns); + } + + return (ns); +} + +ill_t * +lxsys_find_ill(ip_stack_t *ipst, uint_t ifindex) +{ + ill_t *ill; + phyint_t *phyi; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phyi = avl_find(&ipst->ips_phyint_g_list->phyint_list_avl_by_index, + (void *) &ifindex, NULL); + if (phyi != NULL) { + /* + * Since interface information presented via /sys is not + * specific to IPv4 or IPv6, an ill reference from either + * protocol will be adequate. 
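+ * A condemned ill is skipped rather than referenced.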
Check both, starting with IPv4 + * for a valid reference to use. + */ + for (ill = phyi->phyint_illv4; ill != phyi->phyint_illv6; + ill = phyi->phyint_illv6) { + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + if (!ILL_IS_CONDEMNED(ill)) { + ill_refhold_locked(ill); + mutex_exit(&ill->ill_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (ill); + } + mutex_exit(&ill->ill_lock); + } + } + } + rw_exit(&ipst->ips_ill_g_lock); + return (NULL); +} + + +#define LXSYSUIOBUFSZ 4096 + +lxsys_uiobuf_t * +lxsys_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxsys_uiobuf and output buffer */ + int bufsize = LXSYSUIOBUFSZ; + lxsys_uiobuf_t *uiobuf = + kmem_alloc(sizeof (lxsys_uiobuf_t) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->bufsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxsys_uiobuf_free(lxsys_uiobuf_t *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (lxsys_uiobuf_t) + uiobuf->bufsize); +} + +void +lxsys_uiobuf_seterr(lxsys_uiobuf_t *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxsys_uiobuf_flush(lxsys_uiobuf_t *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxsys_uiobuf_write(lxsys_uiobuf_t *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->bufsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxsys_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxsys_uiobuf_printf(lxsys_uiobuf_t *uiobuf, const char *fmt, ...) 
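+/*
+ * Implementation note: the string is rendered into an on-stack buffer
+ * first; a kmem_alloc'd buffer is used only when the formatted length
+ * reaches TYPBUFFSIZE.
+ */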
+{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxsys_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxsys_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c new file mode 100644 index 0000000000..c33e61345e --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvfsops.c @@ -0,0 +1,348 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * lxsysvfsops.c: vfs operations for lx sysfs. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <sys/lx_impl.h> + +#include "lx_sysfs.h" + +/* Module level parameters */ +static int lxsysfstype; +static dev_t lxsysdev; +static kmutex_t lxsys_mount_lock; + +static int lxsys_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxsys_unmount(vfs_t *, int, cred_t *); +static int lxsys_root(vfs_t *, vnode_t **); +static int lxsys_statvfs(vfs_t *, statvfs64_t *); +static int lxsys_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lx_sysfs", + lxsys_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
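+ * lx_sysfs is delivered as a loadable filesystem module: mod_install()
+ * registers the vfsdef_t above, and the VFS framework then calls
+ * lxsys_init() to set up the operation vectors.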
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "lx brand sysfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxsys_node cache + */ + lxsys_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxsysfstype); + vn_freevnodeops(lxsys_vnodeops); + + mutex_destroy(&lxsys_mount_lock); +done: + return (retval); +} + +static int +lxsys_init(int fstype, char *name) +{ + static const fs_operation_def_t lxsys_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxsys_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxsys_unmount }, + VFSNAME_ROOT, { .vfs_root = lxsys_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxsys_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxsys_vnodeops_template[]; + int error; + major_t dev; + + lxsysfstype = fstype; + ASSERT(lxsysfstype != 0); + + mutex_init(&lxsys_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxsys_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxsys_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxsys_vnodeops_template, &lxsys_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxsys_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
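+ * If getudev() cannot supply one, fall back to major 0 rather than
+ * failing module initialization.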
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxsys_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxsysdev = makedevice(dev, 0); + + /* + * Initialise cache for lxsys_nodes + */ + lxsys_initnodecache(); + + return (0); +} + +static int +lxsys_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxsys_mnt_t *lxsys_mnt; + zone_t *zone = curproc->p_zone; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxsys" doesn't make sense + */ + vfs_setresource(vfsp, "lxsys", 0); + + lxsys_mnt = kmem_alloc(sizeof (*lxsys_mnt), KM_SLEEP); + + mutex_enter(&lxsys_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxsys_mount_lock); + kmem_free(lxsys_mnt, sizeof ((*lxsys_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + + mutex_init(&lxsys_mnt->lxsysm_lock, NULL, MUTEX_DEFAULT, NULL); + zone_hold(lxsys_mnt->lxsysm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxsys_mnt->lxsysm_node = lxsys_getnode(mvp, LXSYS_STATIC, + LXSYS_INST_ROOT, 0); + lxsys_mnt->lxsysm_node->lxsys_next = NULL; + + /* Correctly set the fs for the root node */ + lxsys_mnt->lxsysm_node->lxsys_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxsysdev, lxsysfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxsysfstype; + vfsp->vfs_data = (caddr_t)lxsys_mnt; + vfsp->vfs_dev = lxsysdev; + + mutex_exit(&lxsys_mount_lock); + + return (0); +} + +static int +lxsys_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxsys_mnt_t *lxsys_mnt = (lxsys_mnt_t *)vfsp->vfs_data; + lxsys_node_t *lnp; + vnode_t *vp; + int count; + + VERIFY(lxsys_mnt != NULL); + + mutex_enter(&lxsys_mount_lock); + + /* must be root to unmount */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxsys_mount_lock); + return (EPERM); + } + + /* forced unmount is not supported by this fs */ + if (flag & MS_FORCE) { + mutex_exit(&lxsys_mount_lock); + return (ENOTSUP); + } + + /* Ensure that no vnodes are in use on this mount point. 
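+ * Any live static child node transitively holds the root vnode
+ * through its lxsys_parent reference, keeping v_count above 1.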
*/ + lnp = lxsys_mnt->lxsysm_node; + vp = LXSTOV(lnp); + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxsys_mount_lock); + return (EBUSY); + } + + /* + * If there are no references to the root vnode the list of persistent + * static vnodes should be empty + */ + VERIFY(lnp->lxsys_next == NULL); + + (void) dnlc_purge_vfsp(vfsp, 0); + + lxsys_mnt->lxsysm_node = NULL; + lxsys_freenode(lnp); + zone_rele(lxsys_mnt->lxsysm_zone); + vfsp->vfs_data = NULL; + kmem_free(lxsys_mnt, sizeof (*lxsys_mnt)); + + mutex_exit(&lxsys_mount_lock); + + return (0); +} + +static int +lxsys_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxsys_mnt_t *lxsm = (lxsys_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + + VERIFY(lxsm != NULL); + VERIFY(lxsm->lxsysm_node != NULL); + + vp = LXSTOV(lxsm->lxsysm_node); + VN_HOLD(vp); + *vpp = vp; + + return (0); +} + +static int +lxsys_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + dev32_t d32; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)3; + sp->f_ffree = (fsfilcnt64_t)0; /* none */ + sp->f_favail = (fsfilcnt64_t)0; /* none */ + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxsysfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + bzero(sp->f_fstr, sizeof (sp->f_fstr)); + + /* We know f_fstr is 32 chars */ + (void) strcpy(sp->f_fstr, "/sys"); + (void) strcpy(&sp->f_fstr[6], "/sys"); + + return (0); +} diff --git a/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c new file mode 100644 index 0000000000..0383904f83 --- /dev/null +++ b/usr/src/uts/common/brand/lx/sysfs/lx_sysvnops.c @@ -0,0 +1,1182 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * lx_sysfs -- a Linux-compatible /sys for the LX brand + */ + +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/lx_brand.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> +#include <sys/param.h> +#include <sys/utsname.h> +#include <sys/lx_misc.h> +#include <sys/brand.h> +#include <sys/cred_impl.h> +#include <sys/tihdr.h> +#include <sys/sunddi.h> +#include <sys/vnode.h> +#include <sys/netstack.h> +#include <sys/ethernet.h> +#include <inet/ip_arp.h> + +#include "lx_sysfs.h" + +/* + * Pointer to the vnode ops vector for this fs. 
+ * This is instantiated in lxsys_init() in lx_sysvfsops.c + */ +vnodeops_t *lxsys_vnodeops; + +static int lxsys_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxsys_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxsys_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxsys_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxsys_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxsys_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxsys_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxsys_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxsys_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxsys_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxsys_sync(void); +static void lxsys_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxsys_lookup_static(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_class_netdir(lxsys_node_t *, char *); +static vnode_t *lxsys_lookup_devices_virtual_netdir(lxsys_node_t *, char *); + +static int lxsys_read_devices_virtual_net(lxsys_node_t *, lxsys_uiobuf_t *); + +static int lxsys_readdir_static(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_class_netdir(lxsys_node_t *, uio_t *, int *); +static int lxsys_readdir_devices_virtual_netdir(lxsys_node_t *, uio_t *, int *); + +static int lxsys_readlink_class_net(lxsys_node_t *lnp, char *buf, size_t len); + +/* + * The lx /sys vnode operations vector + */ +const fs_operation_def_t lxsys_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxsys_open }, + VOPNAME_CLOSE, { .vop_close = lxsys_close }, + VOPNAME_READ, { .vop_read = lxsys_read }, + VOPNAME_GETATTR, { .vop_getattr = lxsys_getattr }, + VOPNAME_ACCESS, { .vop_access = lxsys_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxsys_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxsys_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxsys_readlink }, + VOPNAME_FSYNC, { .error = lxsys_sync }, + VOPNAME_SEEK, { .error = lxsys_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxsys_inactive }, + VOPNAME_CMP, { .vop_cmp = lxsys_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxsys_realvp }, + NULL, NULL +}; + +/* + * Sysfs Inode format: + * 0000AABBBBCC + * + * AA - TYPE + * BBBB - INSTANCE + * CC - ENDPOINT + * + * Where TYPE is one of: + * 1 - SYS_STATIC + * 2 - SYS_CLASS_NET + * 3 - SYS_DEVICES_NET + * + * Static entries will have assigned INSTANCE identifiers: + * - 0: /sys + * - 1: /sys/class + * - 2: /sys/devices + * - 3: /sys/fs + * - 4: /sys/class/net + * - 5: /sys/devices/virtual + * - 7: /sys/devices/system + * - 8: /sys/fs/cgroup + * - 9: /sys/devices/virtual/net + * + * Dynamic /sys/class/net/<interface> symlinks will use an INSTANCE derived + * from the corresonding ifindex. + * + * Dynamic /sys/devices/virtual/net/<interface>/<entries> directories will use + * an INSTANCE derived from the ifindex and statically assigned ENDPOINT IDs + * for the contained entries. + */ + +#define LXSYS_INST_CLASSDIR 1 +#define LXSYS_INST_DEVICESDIR 2 +#define LXSYS_INST_FSDIR 3 +#define LXSYS_INST_CLASS_NETDIR 4 +#define LXSYS_INST_DEVICES_VIRTUALDIR 5 +#define LXSYS_INST_DEVICES_SYSTEMDIR 6 +#define LXSYS_INST_FS_CGROUPDIR 7 +#define LXSYS_INST_DEVICES_VIRTUAL_NETDIR 8 + +/* + * file contents of an lx /sys directory. 
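+ * Each table maps a static instance to its visible entries; an empty
+ * table (e.g. dirlist_empty for cgroup) lists nothing beyond "." and
+ * "..".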
+ */ +static lxsys_dirent_t dirlist_root[] = { + { LXSYS_INST_CLASSDIR, "class" }, + { LXSYS_INST_DEVICESDIR, "devices" }, + { LXSYS_INST_FSDIR, "fs" } +}; +static lxsys_dirent_t dirlist_empty[] = {}; +static lxsys_dirent_t dirlist_class[] = { + { LXSYS_INST_CLASS_NETDIR, "net" } +}; +static lxsys_dirent_t dirlist_fs[] = { + { LXSYS_INST_FS_CGROUPDIR, "cgroup" } +}; +static lxsys_dirent_t dirlist_devices[] = { + { LXSYS_INST_DEVICES_SYSTEMDIR, "system" }, + { LXSYS_INST_DEVICES_VIRTUALDIR, "virtual" } +}; +static lxsys_dirent_t dirlist_devices_virtual[] = { + { LXSYS_INST_DEVICES_VIRTUAL_NETDIR, "net" } +}; + + +#define LXSYS_ENDP_NET_ADDRESS 1 +#define LXSYS_ENDP_NET_ADDRLEN 2 +#define LXSYS_ENDP_NET_FLAGS 3 +#define LXSYS_ENDP_NET_IFINDEX 4 +#define LXSYS_ENDP_NET_MTU 5 +#define LXSYS_ENDP_NET_TXQLEN 6 +#define LXSYS_ENDP_NET_TYPE 7 + +static lxsys_dirent_t dirlist_devices_virtual_net[] = { + { LXSYS_ENDP_NET_ADDRESS, "address" }, + { LXSYS_ENDP_NET_ADDRLEN, "addr_len" }, + { LXSYS_ENDP_NET_FLAGS, "flags" }, + { LXSYS_ENDP_NET_IFINDEX, "ifindex" }, + { LXSYS_ENDP_NET_MTU, "mtu" }, + { LXSYS_ENDP_NET_TXQLEN, "tx_queue_len" }, + { LXSYS_ENDP_NET_TYPE, "type" } +}; + +#define SYSDIRLISTSZ(l) (sizeof (l) / sizeof ((l)[0])) + +#define SYSDLENT(i, l) { i, l, SYSDIRLISTSZ(l) } +static lxsys_dirlookup_t lxsys_dirlookup[] = { + SYSDLENT(LXSYS_INST_ROOT, dirlist_root), + SYSDLENT(LXSYS_INST_CLASSDIR, dirlist_class), + SYSDLENT(LXSYS_INST_FSDIR, dirlist_fs), + SYSDLENT(LXSYS_INST_FS_CGROUPDIR, dirlist_empty), + SYSDLENT(LXSYS_INST_DEVICESDIR, dirlist_devices), + SYSDLENT(LXSYS_INST_DEVICES_VIRTUALDIR, dirlist_devices_virtual) +}; + + +/* + * Array of lookup functions, indexed by lx /sys file type. + */ +static vnode_t *(*lxsys_lookup_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_lookup_static, /* LXSYS_STATIC */ + lxsys_lookup_class_netdir, /* LXSYS_CLASS_NET */ + lxsys_lookup_devices_virtual_netdir, /* LXSYS_DEVICES_NET */ +}; + +/* + * Array of readdir functions, indexed by /sys file type. + */ +static int (*lxsys_readdir_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + lxsys_readdir_static, /* LXSYS_STATIC */ + lxsys_readdir_class_netdir, /* LXSYS_CLASS_NET */ + lxsys_readdir_devices_virtual_netdir, /* LXSYS_DEVICES_NET */ +}; + +/* + * Array of read functions, indexed by /sys file type. + */ +static int (*lxsys_read_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + NULL, /* LXSYS_STATIC */ + NULL, /* LXSYS_CLASS_NET */ + lxsys_read_devices_virtual_net, /* LXSYS_DEVICES_NET */ +}; + +/* + * Array of readlink functions, indexed by /sys file type. + */ +static int (*lxsys_readlink_function[LXSYS_MAXTYPE])() = { + NULL, /* LXSYS_NONE */ + NULL, /* LXSYS_STATIC */ + lxsys_readlink_class_net, /* LXSYS_CLASS_NET */ + NULL, /* LXSYS_DEVICES_NET */ +}; + + + +/* + * lxsys_open(): Vnode operation for VOP_OPEN() + */ +static int +lxsys_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *vp = *vpp; + lxsys_node_t *lxsnp = VTOLXS(vp); + vnode_t *rvp; + int error = 0; + + /* + * We only allow reading in this file system + */ + if (flag & FWRITE) + return (EROFS); + + /* + * If we are opening an underlying file only allow regular files, + * reject the open for anything else. + * Just do it if we are opening the current or root directory. + */ + if (lxsnp->lxsys_realvp != NULL) { + rvp = lxsnp->lxsys_realvp; + + /* + * Need to hold rvp since VOP_OPEN() may release it. 
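+ * On success the underlying vnode is handed back through *vpp and the
+ * hold on the sysfs vnode itself is dropped.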
+ */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + + return (error); +} + + +/* + * lxsys_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxsys_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + + +/* + * lxsys_read(): Vnode operation for VOP_READ() + * All we currently have in this fs are directories. + */ +/* ARGSUSED */ +static int +lxsys_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxsys_node_t *lnp = VTOLXS(vp); + lxsys_nodetype_t type = lnp->lxsys_type; + int (*rlfunc)(); + int error; + lxsys_uiobuf_t *luio; + + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + if (vp->v_type == VDIR) { + return (EISDIR); + } + + rlfunc = lxsys_read_function[type]; + if (rlfunc != NULL) { + luio = lxsys_uiobuf_new(uiop); + if ((error = rlfunc(lnp, luio)) == 0) { + error = lxsys_uiobuf_flush(luio); + } + lxsys_uiobuf_free(luio); + } else { + error = EIO; + } + + return (error); +} + +/* + * lxsys_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxsys_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxsys_node_t *lxsnp = VTOLXS(vp); + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxsnp->lxsys_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxsnp->lxsys_realvp; + + /* + * limit attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxsnp->lxsys_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxsnp->lxsys_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxsnp->lxsys_uid; + vap->va_gid = lxsnp->lxsys_gid; + vap->va_nodeid = lxsnp->lxsys_ino; + + vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxsys_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxsys_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxsys_node_t *lxsnp = VTOLXS(vp); + int shift = 0; + + /* + * Although our lx sysfs is basically a read only file system, Linux + * expects it to be writable so we can't just error if (mode & VWRITE). + */ + + if (lxsnp->lxsys_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxsnp->lxsys_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. 
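+ *
+ * For example, a caller that is neither owner nor group member
+ * (shift == 6) requesting VEXEC (0100) on a 0444 node is left with
+ * 0100 & ~(0444 << 6) == 0100 and gets EACCES, while VREAD (0400)
+ * is cleared to 0 and allowed.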
+ */ + if (crgetuid(cr) != lxsnp->lxsys_uid) { + shift += 3; + if (!groupmember((uid_t)lxsnp->lxsys_gid, cr)) + shift += 3; + } + + mode &= ~(lxsnp->lxsys_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* + * lxsys_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxsys_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxsys_node_t *lxsnp = VTOLXS(dp); + lxsys_nodetype_t type = lxsnp->lxsys_type; + int error; + + VERIFY(dp->v_type == VDIR); + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxsys_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxsnp->lxsys_parent); + *vpp = lxsnp->lxsys_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxsys_lookup_function[type](lxsnp, comp)); + return ((*vpp == NULL) ? ENOENT : 0); +} + +static vnode_t * +lxsys_lookup_static(lxsys_node_t *ldp, char *comp) +{ + lxsys_dirent_t *dirent = NULL; + int i, len = 0; + + for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) { + if (ldp->lxsys_instance == lxsys_dirlookup[i].dl_instance) { + dirent = lxsys_dirlookup[i].dl_list; + len = lxsys_dirlookup[i].dl_length; + break; + } + } + if (dirent == NULL) { + return (NULL); + } + + for (i = 0; i < len; i++) { + if (strncmp(comp, dirent[i].d_name, MAXPATHLEN) == 0) { + lxsys_nodetype_t node_type = ldp->lxsys_type; + unsigned int node_instance = 0; + lxsys_node_t *lnp; + + switch (dirent[i].d_idnum) { + case LXSYS_INST_CLASS_NETDIR: + node_type = LXSYS_CLASS_NET; + break; + case LXSYS_INST_DEVICES_VIRTUAL_NETDIR: + node_type = LXSYS_DEVICES_NET; + break; + default: + /* Another static node */ + node_instance = dirent[i].d_idnum; + } + if (node_type == LXSYS_STATIC) { + lnp = lxsys_getnode_static(ldp->lxsys_vnode, + node_instance); + } else { + lnp = lxsys_getnode(ldp->lxsys_vnode, + node_type, node_instance, 0); + } + return (lnp->lxsys_vnode); + } + } + return (NULL); +} + +static vnode_t * +lxsys_lookup_class_netdir(lxsys_node_t *ldp, char *comp) +{ + vnode_t *result = NULL; + lxsys_node_t *lnp; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + char ifname[LIFNAMSIZ]; + + if (ldp->lxsys_type != LXSYS_CLASS_NET || + ldp->lxsys_instance != 0) { + /* Lookups only allowed at directory level */ + return (NULL); + } + + (void) strncpy(ifname, comp, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_TONATIVE); + + if ((ns = lxsys_netstack(ldp)) == NULL) { + return (NULL); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name; + phyi = avl_find(phytree, ifname, NULL); + if (phyi != NULL) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + phyi->phyint_ifindex, 0); + result = lnp->lxsys_vnode; + result->v_type = VLNK; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + return (result); +} + +static vnode_t * +lxsys_lookup_devices_virtual_netdir(lxsys_node_t *ldp, char *comp) +{ + lxsys_node_t *lnp; + + if 
(ldp->lxsys_instance == 0) { + /* top-level interface listing */ + vnode_t *result = NULL; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + char ifname[LIFNAMSIZ]; + + (void) strncpy(ifname, comp, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_TONATIVE); + + if ((ns = lxsys_netstack(ldp)) == NULL) { + return (NULL); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_name; + phyi = avl_find(phytree, ifname, NULL); + if (phyi != NULL) { + lnp = lxsys_getnode(ldp->lxsys_vnode, ldp->lxsys_type, + phyi->phyint_ifindex, 0); + result = lnp->lxsys_vnode; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + + return (result); + } else if (ldp->lxsys_endpoint == 0) { + /* interface-level sub-item listing */ + int i, size; + lxsys_dirent_t *dirent; + + size = SYSDIRLISTSZ(dirlist_devices_virtual_net); + for (i = 0; i < size; i++) { + dirent = &dirlist_devices_virtual_net[i]; + if (strncmp(comp, dirent->d_name, LXSNSIZ) == 0) { + lnp = lxsys_getnode(ldp->lxsys_vnode, + ldp->lxsys_type, ldp->lxsys_instance, + dirent->d_idnum); + lnp->lxsys_vnode->v_type = VREG; + lnp->lxsys_mode = 0444; + return (lnp->lxsys_vnode); + } + } + } + + return (NULL); +} + + +static int +lxsys_read_devices_virtual_net(lxsys_node_t *lnp, lxsys_uiobuf_t *luio) +{ + netstack_t *ns; + ill_t *ill; + uint_t ifindex = lnp->lxsys_instance; + uint8_t *addr; + uint64_t flags; + int error = 0; + + if (ifindex == 0 || lnp->lxsys_endpoint == 0) { + return (EISDIR); + } + + if ((ns = lxsys_netstack(lnp)) == NULL) { + return (EIO); + } + + ill = lxsys_find_ill(ns->netstack_ip, ifindex); + if (ill == NULL) { + netstack_rele(ns); + return (EIO); + } + + switch (lnp->lxsys_endpoint) { + case LXSYS_ENDP_NET_ADDRESS: + if (ill->ill_phys_addr_length != ETHERADDRL) { + lxsys_uiobuf_printf(luio, "00:00:00:00:00:00\n"); + break; + } + addr = ill->ill_phys_addr; + lxsys_uiobuf_printf(luio, + "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx\n", + addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); + break; + case LXSYS_ENDP_NET_ADDRLEN: + lxsys_uiobuf_printf(luio, "%u\n", + IS_LOOPBACK(ill) ? ETHERADDRL : ill->ill_phys_addr_length); + break; + case LXSYS_ENDP_NET_FLAGS: + flags = (ill->ill_flags | ill->ill_ipif->ipif_flags | + ill->ill_phyint->phyint_flags) & 0xffff; + lx_ifflags_convert(&flags, LX_IF_FROMNATIVE); + lxsys_uiobuf_printf(luio, "0x%x\n", flags); + break; + case LXSYS_ENDP_NET_IFINDEX: + lxsys_uiobuf_printf(luio, "%u\n", ifindex); + break; + case LXSYS_ENDP_NET_MTU: + lxsys_uiobuf_printf(luio, "%u\n", ill->ill_mtu); + break; + case LXSYS_ENDP_NET_TXQLEN: + /* perpetuate the txqlen lie */ + if (IS_LOOPBACK(ill)) { + lxsys_uiobuf_printf(luio, "0\n"); + } else { + lxsys_uiobuf_printf(luio, "1\n"); + } + break; + case LXSYS_ENDP_NET_TYPE: + lxsys_uiobuf_printf(luio, "%u\n", + IS_LOOPBACK(ill) ? 
LX_ARPHRD_LOOPBACK : + arp_hw_type(ill->ill_mactype)); + break; + default: + error = EIO; + } + + ill_refrele(ill); + netstack_rele(ns); + return (error); +} + +/* + * lxsys_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxsys_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxsys_node_t *lxsnp = VTOLXS(dp); + lxsys_nodetype_t type = lxsnp->lxsys_type; + ssize_t uresid; + off_t uoffset; + int error, leof; + + ASSERT(dp->v_type == VDIR); + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxsys_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXSYS_SDSIZE) + return (ENOENT); + + /* Free lower functions from having to check eofp == NULL */ + if (eofp == NULL) { + eofp = &leof; + } + + return (lxsys_readdir_function[lxsnp->lxsys_type](lxsnp, uiop, eofp)); +} + +static int +lxsys_dirent_out(dirent64_t *d, ushort_t n, struct uio *uio) +{ + int error; + off_t offset = uio->uio_offset; + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset by the + * same amount. But we want uiop->uio_offset to change in increments + * of LXSYS_SDSIZE, which is different from the number of bytes being + * returned to the user. To accomplish this, we set uiop->uio_offset + * separately on success, overriding what uiomove() does. + */ + d->d_off = (off64_t)(offset + LXSYS_SDSIZE); + d->d_reclen = n; + if ((error = uiomove(d, n, UIO_READ, uio)) != 0) { + return (error); + } + uio->uio_offset = offset + LXSYS_SDSIZE; + return (0); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxsys_readdir_common(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp, + lxsys_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Satisfy user request */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXSYS_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxsnp->lxsys_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXSYS_SDSIZE) { + + dirent->d_ino = lxsys_parentinode(lxsnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxsys_inode(LXSYS_STATIC, + dirtab[dirindex].d_idnum, 0); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + *eofp = 1; + return (0); + } + + /* + * If the size of the data to transfer is greater than the + * user-provided buffer, we cannot continue. + */ + if (reclen > uresid) { + /* Error if no entries have been returned yet. 
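+ * A short listing is otherwise fine; the caller resumes at the
+ * advanced offset.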
*/ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (error); + } + } + + /* Have run out of space, but could have just done last table entry */ + *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 1 : 0; + return (0); +} + +static int +lxsys_readdir_subdir(lxsys_node_t *lxsnp, uio_t *uiop, int *eofp, + lxsys_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Satisfy user request */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXSYS_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxsnp->lxsys_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXSYS_SDSIZE) { + + dirent->d_ino = lxsys_parentinode(lxsnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxsys_inode(lxsnp->lxsys_type, + lxsnp->lxsys_instance, dirtab[dirindex].d_idnum); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + *eofp = 1; + return (0); + } + + /* + * If the size of the data to transfer is greater than the + * user-provided buffer, we cannot continue. + */ + if (reclen > uresid) { + /* Error if no entries have been returned yet. */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + return (error); + } + } + + /* Have run out of space, but could have just done last table entry */ + *eofp = (uiop->uio_offset >= ((dirtablen+2) * LXSYS_SDSIZE)) ? 1 : 0; + return (0); +} + +static int +lxsys_readdir_ifaces(lxsys_node_t *ldp, struct uio *uiop, int *eofp, + lxsys_nodetype_t type) +{ + longlong_t bp[DIRENT64_RECLEN(LXSNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid, uresid; + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + int error, i; + + + /* Emit "." and ".." entries */ + oresid = uiop->uio_resid; + error = lxsys_readdir_common(ldp, uiop, eofp, NULL, 0); + if (error != 0 || *eofp == 0) { + return (error); + } + + if ((ns = lxsys_netstack(ldp)) == NULL) { + *eofp = 1; + return (0); + } + ipst = ns->netstack_ip; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index; + phyi = avl_first(phytree); + if (phyi == NULL) { + *eofp = 1; + } + bzero(bp, sizeof (bp)); + + /* + * Skip records we have already passed with the offset. + * This accounts for the two "." and ".." records already seen. 
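+ * For instance, with LXSYS_SDSIZE at 16 an offset of 48 yields
+ * i == 1, so a single interface record is skipped.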
+ */ + for (i = (uiop->uio_offset/LXSYS_SDSIZE) - 2; i > 0; i--) { + if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) { + *eofp = 1; + break; + } + } + + while ((uresid = uiop->uio_resid) > 0 && phyi != NULL) { + uint_t ifindex; + int reclen; + + ifindex = phyi->phyint_ifindex; + (void) strncpy(dirent->d_name, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(dirent->d_name, LX_IF_FROMNATIVE); + dirent->d_ino = lxsys_inode(type, ifindex, 0); + reclen = DIRENT64_RECLEN(strlen(dirent->d_name)); + + if (reclen > uresid) { + if (uresid == oresid) { + /* Not enough space for one record */ + error = EINVAL; + } + break; + } + if ((error = lxsys_dirent_out(dirent, reclen, uiop)) != 0) { + break; + } + + if ((phyi = avl_walk(phytree, phyi, AVL_AFTER)) == NULL) { + *eofp = 1; + break; + } + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + return (error); +} + + +static int +lxsys_readdir_static(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + lxsys_dirent_t *dirent = NULL; + int i, len = 0; + + for (i = 0; i < SYSDIRLISTSZ(lxsys_dirlookup); i++) { + if (lnp->lxsys_instance == lxsys_dirlookup[i].dl_instance) { + dirent = lxsys_dirlookup[i].dl_list; + len = lxsys_dirlookup[i].dl_length; + break; + } + } + + if (dirent == NULL) { + return (ENOTDIR); + } + + return (lxsys_readdir_common(lnp, uiop, eofp, dirent, len)); +} + +static int +lxsys_readdir_class_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + if (lnp->lxsys_type != LXSYS_CLASS_NET || + lnp->lxsys_instance != 0) { + /* + * Since /sys/class/net contains only symlinks, readdir + * operations should not be performed anywhere except the top + * level (instance == 0). + */ + return (ENOTDIR); + } + + return (lxsys_readdir_ifaces(lnp, uiop, eofp, LXSYS_CLASS_NET)); +} + +static int +lxsys_readdir_devices_virtual_netdir(lxsys_node_t *lnp, uio_t *uiop, int *eofp) +{ + int error; + + if (lnp->lxsys_instance == 0) { + /* top-level interface listing */ + error = lxsys_readdir_ifaces(lnp, uiop, eofp, + LXSYS_DEVICES_NET); + } else if (lnp->lxsys_endpoint == 0) { + /* interface-level sub-item listing */ + error = lxsys_readdir_subdir(lnp, uiop, eofp, + dirlist_devices_virtual_net, + SYSDIRLISTSZ(dirlist_devices_virtual_net)); + } else { + /* there shouldn't be subdirs below this */ + error = ENOTDIR; + } + + return (error); +} + +/* + * lxsys_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxsys_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char buf[MAXPATHLEN + 1]; + lxsys_node_t *lnp = VTOLXS(vp); + lxsys_nodetype_t type = lnp->lxsys_type; + int (*rlfunc)(); + int error; + + VERIFY(type > LXSYS_NONE && type < LXSYS_MAXTYPE); + + if (vp->v_type != VLNK) { + return (EINVAL); + } + + rlfunc = lxsys_readlink_function[lnp->lxsys_type]; + if (rlfunc != NULL) { + if ((error = rlfunc(lnp, buf, sizeof (buf))) == 0) { + error = uiomove(buf, strlen(buf), UIO_READ, uiop); + } + } else { + error = EINVAL; + } + + return (error); +} + + +static int +lxsys_readlink_class_net(lxsys_node_t *lnp, char *buf, size_t len) +{ + netstack_t *ns; + ip_stack_t *ipst; + avl_tree_t *phytree; + phyint_t *phyi; + uint_t ifindex; + char ifname[LIFNAMSIZ]; + int error = EINVAL; + + if ((ifindex = lnp->lxsys_instance) == 0) { + return (error); + } + + if ((ns = lxsys_netstack(lnp)) == NULL) { + return (error); + } + ipst = ns->netstack_ip; + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + phytree = &ipst->ips_phyint_g_list->phyint_list_avl_by_index; + phyi = avl_find(phytree, &ifindex, NULL); + if 
(phyi != NULL) { + (void) strncpy(ifname, phyi->phyint_name, LIFNAMSIZ); + lx_ifname_convert(ifname, LX_IF_FROMNATIVE); + (void) snprintf(buf, len, "/sys/devices/virtual/net/%s", + ifname); + error = 0; + } + + rw_exit(&ipst->ips_ill_g_lock); + netstack_rele(ns); + return (error); +} + +/* + * lxsys_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. + */ +/* ARGSUSED */ +static void +lxsys_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxsys_freenode(VTOLXS(vp)); +} + +/* + * lxsys_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxsys_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxsys_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxsys_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxsys_vnodeops) && + (rvp = VTOLXS(vp1)->lxsys_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxsys_vnodeops) && + (rvp = VTOLXS(vp2)->lxsys_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxsys_vnodeops) || + vn_matchops(vp2, lxsys_vnodeops)) + return (vp1 == vp2); + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxsys_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxsys_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXS(vp)->lxsys_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.c b/usr/src/uts/common/brand/sn1/sn1_brand.c index d61928d578..7f99dc5f33 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.c +++ b/usr/src/uts/common/brand/sn1/sn1_brand.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. 
*/ #include <sys/errno.h> @@ -48,37 +49,59 @@ void sn1_setbrand(proc_t *); int sn1_getattr(zone_t *, int, void *, size_t *); int sn1_setattr(zone_t *, int, void *, size_t); int sn1_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t); void sn1_copy_procdata(proc_t *, proc_t *); -void sn1_proc_exit(struct proc *, klwp_t *); +void sn1_proc_exit(struct proc *); void sn1_exec(); -int sn1_initlwp(klwp_t *); +void sn1_initlwp(klwp_t *, void *); void sn1_forklwp(klwp_t *, klwp_t *); void sn1_freelwp(klwp_t *); void sn1_lwpexit(klwp_t *); int sn1_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + long *, int, caddr_t, cred_t *, int *); /* sn1 brand */ struct brand_ops sn1_brops = { - sn1_init_brand_data, - sn1_free_brand_data, - sn1_brandsys, - sn1_setbrand, - sn1_getattr, - sn1_setattr, - sn1_copy_procdata, - sn1_proc_exit, - sn1_exec, - lwp_setrval, - sn1_initlwp, - sn1_forklwp, - sn1_freelwp, - sn1_lwpexit, - sn1_elfexec, - NULL, - NULL, - NSIG, + sn1_init_brand_data, /* b_init_brand_data */ + sn1_free_brand_data, /* b_free_brand_data */ + sn1_brandsys, /* b_brandsys */ + sn1_setbrand, /* b_setbrand */ + sn1_getattr, /* b_getattr */ + sn1_setattr, /* b_setattr */ + sn1_copy_procdata, /* b_copy_procdata */ + sn1_proc_exit, /* b_proc_exit */ + sn1_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + NULL, /* b_lwpdata_alloc */ + NULL, /* b_lwpdata_free */ + sn1_initlwp, /* b_initlwp */ + sn1_forklwp, /* b_forklwp */ + sn1_freelwp, /* b_freelwp */ + sn1_lwpexit, /* b_lwpexit */ + sn1_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + NULL, /* b_psig_to_proc */ + NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_ptrace_exectrap */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_sig_ignorable */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL, /* b_sendsig */ + NULL, /* b_setid_clear */ + NULL /* b_pagefault */ }; #ifdef sparc @@ -94,9 +117,12 @@ struct brand_mach_ops sn1_mops = { struct brand_mach_ops sn1_mops = { sn1_brand_sysenter_callback, + NULL, sn1_brand_int91_callback, sn1_brand_syscall_callback, - sn1_brand_syscall32_callback + sn1_brand_syscall32_callback, + NULL, + NULL }; #else /* ! 
__amd64 */ @@ -104,7 +130,10 @@ struct brand_mach_ops sn1_mops = { struct brand_mach_ops sn1_mops = { sn1_brand_sysenter_callback, NULL, + NULL, sn1_brand_syscall_callback, + NULL, + NULL, NULL }; #endif /* __amd64 */ @@ -115,7 +144,8 @@ struct brand sn1_brand = { BRAND_VER_1, "sn1", &sn1_brops, - &sn1_mops + &sn1_mops, + sizeof (brand_proc_data_t), }; static struct modlbrand modlbrand = { @@ -151,7 +181,7 @@ sn1_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) /*ARGSUSED*/ int sn1_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) { int res; @@ -171,9 +201,9 @@ sn1_copy_procdata(proc_t *child, proc_t *parent) } void -sn1_proc_exit(struct proc *p, klwp_t *l) +sn1_proc_exit(struct proc *p) { - brand_solaris_proc_exit(p, l, &sn1_brand); + brand_solaris_proc_exit(p, &sn1_brand); } void @@ -182,10 +212,10 @@ sn1_exec() brand_solaris_exec(&sn1_brand); } -int -sn1_initlwp(klwp_t *l) +void +sn1_initlwp(klwp_t *l, void *bd) { - return (brand_solaris_initlwp(l, &sn1_brand)); + brand_solaris_initlwp(l, &sn1_brand); } void @@ -221,11 +251,11 @@ sn1_init_brand_data(zone_t *zone) int sn1_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, setid, exec_file, cred, brand_action, &sn1_brand, SN1_BRANDNAME, - SN1_LIB, SN1_LIB32, SN1_LINKER, SN1_LINKER32)); + SN1_LIB, SN1_LIB32)); } int diff --git a/usr/src/uts/common/brand/sn1/sn1_brand.h b/usr/src/uts/common/brand/sn1/sn1_brand.h index b487745e21..fef9dc128b 100644 --- a/usr/src/uts/common/brand/sn1/sn1_brand.h +++ b/usr/src/uts/common/brand/sn1/sn1_brand.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #ifndef _SN1_BRAND_H @@ -37,20 +38,14 @@ extern "C" { #define SN1_VERSION SN1_VERSION_1 #define SN1_LIB_NAME "sn1_brand.so.1" -#define SN1_LINKER_NAME "ld.so.1" #define SN1_LIB32 BRAND_NATIVE_DIR "usr/lib/" SN1_LIB_NAME -#define SN1_LINKER32 "/lib/" SN1_LINKER_NAME - #define SN1_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" SN1_LIB_NAME -#define SN1_LINKER64 "/lib/64/" SN1_LINKER_NAME #if defined(_LP64) #define SN1_LIB SN1_LIB64 -#define SN1_LINKER SN1_LINKER64 #else /* !_LP64 */ #define SN1_LIB SN1_LIB32 -#define SN1_LINKER SN1_LINKER32 #endif /* !_LP64 */ #if defined(_KERNEL) diff --git a/usr/src/uts/common/brand/sngl/sngl_brand.c b/usr/src/uts/common/brand/sngl/sngl_brand.c new file mode 100644 index 0000000000..d5e1808d89 --- /dev/null +++ b/usr/src/uts/common/brand/sngl/sngl_brand.c @@ -0,0 +1,276 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015, Joyent, Inc. All rights reserved. + */ + +#include <sys/errno.h> +#include <sys/exec.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/model.h> +#include <sys/proc.h> +#include <sys/syscall.h> +#include <sys/systm.h> +#include <sys/thread.h> +#include <sys/cmn_err.h> +#include <sys/archsystm.h> +#include <sys/pathname.h> +#include <sys/sunddi.h> + +#include <sys/machbrand.h> +#include <sys/brand.h> +#include "sngl_brand.h" + +char *sngl_emulation_table = NULL; + +void sngl_init_brand_data(zone_t *); +void sngl_free_brand_data(zone_t *); +void sngl_setbrand(proc_t *); +int sngl_getattr(zone_t *, int, void *, size_t *); +int sngl_setattr(zone_t *, int, void *, size_t); +int sngl_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, + uintptr_t, uintptr_t); +void sngl_copy_procdata(proc_t *, proc_t *); +void sngl_proc_exit(struct proc *); +void sngl_exec(); +void sngl_initlwp(klwp_t *, void *); +void sngl_forklwp(klwp_t *, klwp_t *); +void sngl_freelwp(klwp_t *); +void sngl_lwpexit(klwp_t *); +int sngl_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, + long *, int, caddr_t, cred_t *, int *); + +/* SNGL brand */ +struct brand_ops sngl_brops = { + sngl_init_brand_data, /* b_init_brand_data */ + sngl_free_brand_data, /* b_free_brand_data */ + sngl_brandsys, /* b_brandsys */ + sngl_setbrand, /* b_setbrand */ + sngl_getattr, /* b_getattr */ + sngl_setattr, /* b_setattr */ + sngl_copy_procdata, /* b_copy_procdata */ + sngl_proc_exit, /* b_proc_exit */ + sngl_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + NULL, /* b_lwpdata_alloc */ + NULL, /* b_lwpdata_free */ + sngl_initlwp, /* b_initlwp */ + sngl_forklwp, /* b_forklwp */ + sngl_freelwp, /* b_freelwp */ + sngl_lwpexit, /* b_lwpexit */ + sngl_elfexec, /* b_elfexec */ + NULL, /* b_sigset_native_to_brand */ + NULL, /* b_sigset_brand_to_native */ + NULL, /* b_psig_to_proc */ + NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_ptrace_exectrap */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_sig_ignorable */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL, /* b_sendsig */ + NULL, /* b_setid_clear */ + NULL /* b_pagefault */ +}; + +#ifdef __amd64 + +struct brand_mach_ops sngl_mops = { + sngl_brand_sysenter_callback, + sngl_brand_int91_callback, + sngl_brand_syscall_callback, + sngl_brand_syscall32_callback, + NULL +}; + +#else /* ! 
__amd64 */ + +struct brand_mach_ops sngl_mops = { + sngl_brand_sysenter_callback, + NULL, + sngl_brand_syscall_callback, + NULL, + NULL +}; +#endif /* __amd64 */ + +struct brand sngl_brand = { + BRAND_VER_1, + "sngl", + &sngl_brops, + &sngl_mops, + sizeof (brand_proc_data_t), +}; + +static struct modlbrand modlbrand = { + &mod_brandops, /* type of module */ + "SNGL Brand", /* description of module */ + &sngl_brand /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlbrand, NULL +}; + +void +sngl_setbrand(proc_t *p) +{ + brand_solaris_setbrand(p, &sngl_brand); +} + +/*ARGSUSED*/ +int +sngl_getattr(zone_t *zone, int attr, void *buf, size_t *bufsize) +{ + return (EINVAL); +} + +/*ARGSUSED*/ +int +sngl_setattr(zone_t *zone, int attr, void *buf, size_t bufsize) +{ + return (EINVAL); +} + +/*ARGSUSED*/ +int +sngl_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) +{ + int res; + + *rval = 0; + res = brand_solaris_cmd(cmd, arg1, arg2, arg3, &sngl_brand, + SNGL_VERSION); + if (res >= 0) + return (res); + + return (EINVAL); +} + +void +sngl_copy_procdata(proc_t *child, proc_t *parent) +{ + brand_solaris_copy_procdata(child, parent, &sngl_brand); +} + +void +sngl_proc_exit(struct proc *p) +{ + brand_solaris_proc_exit(p, &sngl_brand); +} + +void +sngl_exec() +{ + brand_solaris_exec(&sngl_brand); +} + +void +sngl_initlwp(klwp_t *l, void *bd) +{ + brand_solaris_initlwp(l, &sngl_brand); +} + +void +sngl_forklwp(klwp_t *p, klwp_t *c) +{ + brand_solaris_forklwp(p, c, &sngl_brand); +} + +void +sngl_freelwp(klwp_t *l) +{ + brand_solaris_freelwp(l, &sngl_brand); +} + +void +sngl_lwpexit(klwp_t *l) +{ + brand_solaris_lwpexit(l, &sngl_brand); +} + +void +sngl_free_brand_data(zone_t *zone) +{ +} + +void +sngl_init_brand_data(zone_t *zone) +{ +} + +int +sngl_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, + int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, + int *brand_action) +{ + return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, + setid, exec_file, cred, brand_action, &sngl_brand, SNGL_BRANDNAME, + SNGL_LIB, SNGL_LIB32)); +} + +int +_init(void) +{ + int err; + + /* + * Set up the table to interpose on open. + */ + sngl_emulation_table = kmem_zalloc(NSYSCALL, KM_SLEEP); + sngl_emulation_table[SYS_open] = 1; /* 5 */ + + err = mod_install(&modlinkage); + if (err) { + cmn_err(CE_WARN, "Couldn't install brand module"); + kmem_free(sngl_emulation_table, NSYSCALL); + } + + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (brand_solaris_fini(&sngl_emulation_table, &modlinkage, + &sngl_brand)); +} diff --git a/usr/src/uts/common/brand/sngl/sngl_brand.h b/usr/src/uts/common/brand/sngl/sngl_brand.h new file mode 100644 index 0000000000..7c6cbd5412 --- /dev/null +++ b/usr/src/uts/common/brand/sngl/sngl_brand.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2014, Joyent, Inc. All rights reserved. + */ + +#ifndef _SNGL_BRAND_H +#define _SNGL_BRAND_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/brand.h> + +#define SNGL_BRANDNAME "sngl" + +#define SNGL_VERSION 1 + +#define SNGL_LIB_NAME "sngl_brand.so.1" +#define SNGL_LIB32 "/system/usr/lib/" SNGL_LIB_NAME +#define SNGL_LIB64 "/system/usr/lib/64/" SNGL_LIB_NAME + +#if defined(_LP64) +#define SNGL_LIB SNGL_LIB64 +#else /* !_LP64 */ +#define SNGL_LIB SNGL_LIB32 +#endif /* !_LP64 */ + +#if defined(_KERNEL) + +void sngl_brand_syscall_callback(void); +void sngl_brand_sysenter_callback(void); + +#if defined(__amd64) +void sngl_brand_syscall32_callback(void); +void sngl_brand_int91_callback(void); +#endif /* __amd64 */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SNGL_BRAND_H */ diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.c b/usr/src/uts/common/brand/solaris10/s10_brand.c index f24b864eef..f70f54a788 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.c +++ b/usr/src/uts/common/brand/solaris10/s10_brand.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ #include <sys/errno.h> @@ -51,39 +52,61 @@ void s10_setbrand(proc_t *); int s10_getattr(zone_t *, int, void *, size_t *); int s10_setattr(zone_t *, int, void *, size_t); int s10_brandsys(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t); void s10_copy_procdata(proc_t *, proc_t *); -void s10_proc_exit(struct proc *, klwp_t *); +void s10_proc_exit(struct proc *); void s10_exec(); -int s10_initlwp(klwp_t *); +void s10_initlwp(klwp_t *, void *); void s10_forklwp(klwp_t *, klwp_t *); void s10_freelwp(klwp_t *); void s10_lwpexit(klwp_t *); int s10_elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + long *, int, caddr_t, cred_t *, int *); void s10_sigset_native_to_s10(sigset_t *); void s10_sigset_s10_to_native(sigset_t *); /* s10 brand */ struct brand_ops s10_brops = { - s10_init_brand_data, - s10_free_brand_data, - s10_brandsys, - s10_setbrand, - s10_getattr, - s10_setattr, - s10_copy_procdata, - s10_proc_exit, - s10_exec, - lwp_setrval, - s10_initlwp, - s10_forklwp, - s10_freelwp, - s10_lwpexit, - s10_elfexec, - s10_sigset_native_to_s10, - s10_sigset_s10_to_native, - S10_NSIG, + s10_init_brand_data, /* b_init_brand_data */ + s10_free_brand_data, /* b_free_brand_data */ + s10_brandsys, /* b_brandsys */ + s10_setbrand, /* b_setbrand */ + s10_getattr, /* b_getattr */ + s10_setattr, /* b_setattr */ + s10_copy_procdata, /* b_copy_procdata */ + s10_proc_exit, /* b_proc_exit */ + s10_exec, /* b_exec */ + lwp_setrval, /* b_lwp_setrval */ + NULL, /* b_lwpdata_alloc */ + NULL, /* b_lwpdata_free */ + s10_initlwp, /* b_initlwp */ + s10_forklwp, /* b_forklwp */ + s10_freelwp, /* b_freelwp */ + s10_lwpexit, /* b_lwpexit */ + s10_elfexec, /* b_elfexec */ + s10_sigset_native_to_s10, /* b_sigset_native_to_brand */ + 
s10_sigset_s10_to_native, /* b_sigset_brand_to_native */ + NULL, /* b_psig_to_proc */ + S10_NSIG, /* b_nsig */ + NULL, /* b_exit_with_sig */ + NULL, /* b_wait_filter */ + NULL, /* b_native_exec */ + NULL, /* b_ptrace_exectrap */ + NULL, /* b_map32limit */ + NULL, /* b_stop_notify */ + NULL, /* b_waitid_helper */ + NULL, /* b_sigcld_repost */ + NULL, /* b_issig_stop */ + NULL, /* b_sig_ignorable */ + NULL, /* b_savecontext */ +#if defined(_SYSCALL32_IMPL) + NULL, /* b_savecontext32 */ +#endif + NULL, /* b_restorecontext */ + NULL, /* b_sendsig_stack */ + NULL, /* b_sendsig */ + NULL, /* b_setid_clear */ + NULL /* b_pagefault */ }; #ifdef sparc @@ -99,9 +122,12 @@ struct brand_mach_ops s10_mops = { struct brand_mach_ops s10_mops = { s10_brand_sysenter_callback, + NULL, s10_brand_int91_callback, s10_brand_syscall_callback, - s10_brand_syscall32_callback + s10_brand_syscall32_callback, + NULL, + NULL }; #else /* ! __amd64 */ @@ -109,7 +135,10 @@ struct brand_mach_ops s10_mops = { struct brand_mach_ops s10_mops = { s10_brand_sysenter_callback, NULL, + NULL, s10_brand_syscall_callback, + NULL, + NULL, NULL }; #endif /* __amd64 */ @@ -120,7 +149,8 @@ struct brand s10_brand = { BRAND_VER_1, "solaris10", &s10_brops, - &s10_mops + &s10_mops, + sizeof (brand_proc_data_t), }; static struct modlbrand modlbrand = { @@ -252,7 +282,7 @@ s10_native(void *cmd, void *args) /*ARGSUSED*/ int s10_brandsys(int cmd, int64_t *rval, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) { proc_t *p = curproc; int res; @@ -326,9 +356,9 @@ s10_copy_procdata(proc_t *child, proc_t *parent) } void -s10_proc_exit(struct proc *p, klwp_t *l) +s10_proc_exit(struct proc *p) { - brand_solaris_proc_exit(p, l, &s10_brand); + brand_solaris_proc_exit(p, &s10_brand); } void @@ -337,10 +367,10 @@ s10_exec() brand_solaris_exec(&s10_brand); } -int -s10_initlwp(klwp_t *l) +void +s10_initlwp(klwp_t *l, void *bd) { - return (brand_solaris_initlwp(l, &s10_brand)); + brand_solaris_initlwp(l, &s10_brand); } void @@ -390,11 +420,11 @@ s10_init_brand_data(zone_t *zone) int s10_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { return (brand_solaris_elfexec(vp, uap, args, idatap, level, execsz, setid, exec_file, cred, brand_action, &s10_brand, S10_BRANDNAME, - S10_LIB, S10_LIB32, S10_LINKER, S10_LINKER32)); + S10_LIB, S10_LIB32)); } void diff --git a/usr/src/uts/common/brand/solaris10/s10_brand.h b/usr/src/uts/common/brand/solaris10/s10_brand.h index 11f9853f48..ffef485e12 100644 --- a/usr/src/uts/common/brand/solaris10/s10_brand.h +++ b/usr/src/uts/common/brand/solaris10/s10_brand.h @@ -22,6 +22,7 @@ /* * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
*/ #ifndef _S10_BRAND_H @@ -42,17 +43,12 @@ extern "C" { #define S10_LINKER_NAME "ld.so.1" #define S10_LIB32 BRAND_NATIVE_DIR "usr/lib/" S10_LIB_NAME -#define S10_LINKER32 "/lib/" S10_LINKER_NAME - #define S10_LIB64 BRAND_NATIVE_DIR "usr/lib/64/" S10_LIB_NAME -#define S10_LINKER64 "/lib/64/" S10_LINKER_NAME #if defined(_LP64) #define S10_LIB S10_LIB64 -#define S10_LINKER S10_LINKER64 #else /* !_LP64 */ #define S10_LIB S10_LIB32 -#define S10_LINKER S10_LINKER32 #endif /* !_LP64 */ /* diff --git a/usr/src/uts/common/conf/param.c b/usr/src/uts/common/conf/param.c index 6b3ba51f31..a71be771fd 100644 --- a/usr/src/uts/common/conf/param.c +++ b/usr/src/uts/common/conf/param.c @@ -22,6 +22,7 @@ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. */ @@ -559,8 +560,8 @@ char *isa_list = architecture; static pgcnt_t original_physmem = 0; #define MIN_DEFAULT_MAXUSERS 8u -#define MAX_DEFAULT_MAXUSERS 2048u -#define MAX_MAXUSERS 4096u +#define MAX_DEFAULT_MAXUSERS 10000u +#define MAX_MAXUSERS 20000u void param_preset(void) @@ -572,7 +573,7 @@ void param_calc(int platform_max_nprocs) { /* - * Default to about one "user" per megabyte, taking into + * Default to about one "user" per 8MB, taking into * account both physical and virtual constraints. * Note: 2^20 is a meg; shifting right by (20 - PAGESHIFT) * converts pages to megs without integer overflow. @@ -586,8 +587,9 @@ param_calc(int platform_max_nprocs) if (maxusers == 0) { pgcnt_t physmegs = physmem >> (20 - PAGESHIFT); pgcnt_t virtmegs = vmem_size(heap_arena, VMEM_FREE) >> 20; - maxusers = MIN(MAX(MIN(physmegs, virtmegs), - MIN_DEFAULT_MAXUSERS), MAX_DEFAULT_MAXUSERS); + maxusers = MIN(physmegs, virtmegs) >> 3; /* divide by 8 */ + maxusers = MAX(maxusers, MIN_DEFAULT_MAXUSERS); + maxusers = MIN(maxusers, MAX_DEFAULT_MAXUSERS); } if (maxusers > MAX_MAXUSERS) { maxusers = MAX_MAXUSERS; diff --git a/usr/src/uts/common/crypto/api/kcf_random.c b/usr/src/uts/common/crypto/api/kcf_random.c index bc72fa984a..75072fb686 100644 --- a/usr/src/uts/common/crypto/api/kcf_random.c +++ b/usr/src/uts/common/crypto/api/kcf_random.c @@ -70,6 +70,7 @@ #include <sys/cpuvar.h> #include <sys/taskq.h> #include <rng/fips_random.h> +#include <sys/strlog.h> #define RNDPOOLSIZE 1024 /* Pool size in bytes */ #define MINEXTRACTBYTES 20 @@ -933,7 +934,8 @@ rnd_handler(void *arg) int len = 0; if (!rng_prov_found && rng_ok_to_log) { - cmn_err(CE_WARN, "No randomness provider enabled for " + (void) strlog(0, 0, 0, SL_NOTE, + "No randomness provider enabled for " "/dev/random. Use cryptoadm(1M) to enable a provider."); rng_ok_to_log = B_FALSE; } diff --git a/usr/src/uts/common/crypto/core/kcf_sched.c b/usr/src/uts/common/crypto/core/kcf_sched.c index f461fe048c..8b2760b237 100644 --- a/usr/src/uts/common/crypto/core/kcf_sched.c +++ b/usr/src/uts/common/crypto/core/kcf_sched.c @@ -1027,9 +1027,9 @@ kcfpool_svc(void *arg) case 0: case -1: /* - * Woke up with no work to do. Check - * if this thread should exit. We keep - * at least kcf_minthreads. + * Woke up with no work to do. Check if we + * should lwp_exit() (which won't return). We + * keep at least kcf_minthreads. 
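The param.c change above moves the default from roughly one maxusers per megabyte to one per 8MB, while raising the default ceiling from 2048 to 10000 (and the hard ceiling from 4096 to 20000), so large-memory machines size kernel tables generously without the old formula's early saturation. A worked example of the new calculation; the memory figures are made-up inputs:

/*
 * Worked example of the new default-maxusers computation.  The clamps
 * mirror the hunk above; inputs are in megabytes.
 */
#include <stdio.h>

#define	MIN_DEFAULT_MAXUSERS	8u
#define	MAX_DEFAULT_MAXUSERS	10000u

static unsigned
calc_maxusers(unsigned physmegs, unsigned virtmegs)
{
	unsigned maxusers;

	maxusers = (physmegs < virtmegs ? physmegs : virtmegs) >> 3;
	if (maxusers < MIN_DEFAULT_MAXUSERS)
		maxusers = MIN_DEFAULT_MAXUSERS;	/* floor */
	if (maxusers > MAX_DEFAULT_MAXUSERS)
		maxusers = MAX_DEFAULT_MAXUSERS;	/* ceiling */
	return (maxusers);
}

int
main(void)
{
	/* 4 GB of physical memory, ample kernel heap: 4096 / 8 = 512 */
	printf("4 GB -> maxusers %u\n", calc_maxusers(4096, 65536));
	/* tiny system: clamped up to the 8-user floor */
	printf("32 MB -> maxusers %u\n", calc_maxusers(32, 64));
	/* very large system: capped at 10000 rather than the old 2048 */
	printf("128 TB -> maxusers %u\n",
	    calc_maxusers(134217728u, 134217728u));
	return (0);
}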
*/ if (kcfpool->kp_threads > kcf_minthreads) { KCF_ATOMIC_DECR(kcfpool->kp_threads); diff --git a/usr/src/uts/common/ctf/ctf_mod.c b/usr/src/uts/common/ctf/ctf_mod.c index b34cf400cd..421b922c96 100644 --- a/usr/src/uts/common/ctf/ctf_mod.c +++ b/usr/src/uts/common/ctf/ctf_mod.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/sysmacros.h> #include <sys/modctl.h> #include <sys/debug.h> @@ -117,6 +115,15 @@ ctf_version(int version) /*ARGSUSED*/ ctf_file_t * +ctf_fdcreate_int(int fd, int *errp, ctf_sect_t *ctfp) +{ + if (errp != NULL) + *errp = ENOTSUP; + return (NULL); +} + +/*ARGSUSED*/ +ctf_file_t * ctf_modopen(struct module *mp, int *error) { ctf_sect_t ctfsect, symsect, strsect; diff --git a/usr/src/uts/common/disp/cmt.c b/usr/src/uts/common/disp/cmt.c index 1c5e1f79a9..3ecbf39393 100644 --- a/usr/src/uts/common/disp/cmt.c +++ b/usr/src/uts/common/disp/cmt.c @@ -201,13 +201,15 @@ pg_cmt_cpu_startup(cpu_t *cp) /* * Return non-zero if thread can migrate between "from" and "to" - * without a performance penalty + * without a performance penalty. This is true only if we share a core on + * virtually any CPU; sharing the last-level cache is insufficient to make + * migration possible without penalty. */ int pg_cmt_can_migrate(cpu_t *from, cpu_t *to) { - if (from->cpu_physid->cpu_cacheid == - to->cpu_physid->cpu_cacheid) + if (from->cpu_physid->cpu_coreid == + to->cpu_physid->cpu_coreid) return (1); return (0); } diff --git a/usr/src/uts/common/disp/cpucaps.c b/usr/src/uts/common/disp/cpucaps.c index 46f53faab6..2a4365ff73 100644 --- a/usr/src/uts/common/disp/cpucaps.c +++ b/usr/src/uts/common/disp/cpucaps.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013 Joyent, Inc. All rights reserved. */ #include <sys/disp.h> @@ -74,6 +75,32 @@ * Putting threads on wait queues in random places while running in the * kernel might lead to all kinds of locking problems. * + * Bursting + * ======== + * + * CPU bursting occurs when the CPU usage is over the baseline but under the + * cap. The baseline CPU (zone.cpu-baseline) is set in a multi-tenant + * environment so that we know how much CPU is allocated for a tenant under + * normal utilization. We can then track how much time a zone is spending + * over the "normal" CPU utilization expected for that zone using the + * "above_base_sec" kstat. This kstat is cumulative. + * + * If the zone has a burst limit (zone.cpu-burst-time) then the zone can + * burst for that period of time (in seconds) before the effective cap is + * lowered to the baseline. Once the effective cap is lowered, the zone + * will run at the baseline for the burst limit before the effective cap is + * raised again to the full value. This will allow the zone to burst again. + * We can watch this behavior using the kstats. The "effective" kstat shows + * which cap is being used, the baseline value or the burst value. The + * "burst_limit_sec" shows the value of the zone.cpu-burst-time rctl and the + * "bursting_sec" kstat shows how many seconds the zone has currently been + * bursting. 
When the CPU load is continuously greater than the baseline, + * bursting_sec will increase, up to the burst_limit_sec value, then the + * effective kstat will drop to the baseline and the bursting_sec value will + * decrease until it hits 0, at which time the effective kstat will return to + * the full burst value and the bursting_sec value will begin to increase + * again. + * * Accounting * ========== * @@ -203,18 +230,28 @@ static void caps_update(); */ struct cap_kstat { kstat_named_t cap_value; + kstat_named_t cap_baseline; + kstat_named_t cap_effective; + kstat_named_t cap_burst_limit; + kstat_named_t cap_bursting; kstat_named_t cap_usage; kstat_named_t cap_nwait; kstat_named_t cap_below; kstat_named_t cap_above; + kstat_named_t cap_above_base; kstat_named_t cap_maxusage; kstat_named_t cap_zonename; } cap_kstat = { { "value", KSTAT_DATA_UINT64 }, + { "baseline", KSTAT_DATA_UINT64 }, + { "effective", KSTAT_DATA_UINT64 }, + { "burst_limit_sec", KSTAT_DATA_UINT64 }, + { "bursting_sec", KSTAT_DATA_UINT64 }, { "usage", KSTAT_DATA_UINT64 }, { "nwait", KSTAT_DATA_UINT64 }, { "below_sec", KSTAT_DATA_UINT64 }, { "above_sec", KSTAT_DATA_UINT64 }, + { "above_base_sec", KSTAT_DATA_UINT64 }, { "maxusage", KSTAT_DATA_UINT64 }, { "zonename", KSTAT_DATA_STRING }, }; @@ -311,7 +348,7 @@ cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) cap->cap_below = cap->cap_above = 0; cap->cap_maxusage = 0; cap->cap_usage = 0; - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; waitq_unblock(&cap->cap_waitq); if (CPUCAPS_OFF()) { cpucaps_enabled = B_TRUE; @@ -340,19 +377,21 @@ cap_disable(list_t *l, cpucap_t *cap) ASSERT(CAP_ENABLED(cap)); waitq_block(&cap->cap_waitq); + + /* do this first to avoid race with cap_kstat_update */ + if (cap->cap_kstat != NULL) { + kstat_delete(cap->cap_kstat); + cap->cap_kstat = NULL; + } + list_remove(l, cap); if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) { cpucaps_enabled = B_FALSE; cpucaps_clock_callout = NULL; } - cap->cap_value = 0; + cap->cap_value = cap->cap_chk_value = 0; cap->cap_project = NULL; cap->cap_zone = NULL; - if (cap->cap_kstat != NULL) { - kstat_delete(cap->cap_kstat); - cap->cap_kstat = NULL; - } - } /* @@ -487,6 +526,8 @@ cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) * The waitq_isempty check is performed without the waitq lock. If a new thread * is placed on the waitq right after the check, it will be picked up during the * next invocation of cap_poke_waitq(). + * + * Called once per tick for zones. */ /* ARGSUSED */ static void @@ -494,15 +535,92 @@ cap_poke_waitq(cpucap_t *cap, int64_t gen) { ASSERT(MUTEX_HELD(&caps_lock)); - if (cap->cap_usage >= cap->cap_value) { + if (cap->cap_base != 0) { + /* + * Because of the way usage is calculated and decayed, it's + * possible for the zone to be slightly over its cap, but we + * don't want to count that after we have reduced the effective + * cap to the baseline. That way the zone will be able to + * burst again after the burst_limit has expired. + */ + if (cap->cap_usage > cap->cap_base && + cap->cap_chk_value == cap->cap_value) { + cap->cap_above_base++; + + /* + * If bursting is limited and we've been bursting + * longer than we're supposed to, then set the + * effective cap to the baseline. + */ + if (cap->cap_burst_limit != 0) { + cap->cap_bursting++; + if (cap->cap_bursting >= cap->cap_burst_limit) + cap->cap_chk_value = cap->cap_base; + } + } else if (cap->cap_bursting > 0) { + /* + * We're not bursting now, but we were; decay the + * bursting timer.
+ */ + cap->cap_bursting--; + /* + * Reset the effective cap once we decay to 0 so we + * can burst again. + */ + if (cap->cap_bursting == 0 && + cap->cap_chk_value != cap->cap_value) + cap->cap_chk_value = cap->cap_value; + } + } + + if (cap->cap_usage >= cap->cap_chk_value) { cap->cap_above++; } else { waitq_t *wq = &cap->cap_waitq; cap->cap_below++; - if (!waitq_isempty(wq)) - waitq_runone(wq); + if (!waitq_isempty(wq)) { + int i, ndequeue, p; + + /* + * Since this function is only called once per tick, + * we can hit a situation where we have artificially + * limited the project/zone below its cap. This would + * happen if we have multiple threads queued up but + * only dequeued one thread/tick. To avoid this we + * dequeue multiple threads, calculated based on the + * usage percentage of the cap. It is possible that we + * could dequeue too many threads and some of them + * might be put back on the wait queue quickly, but + * since we know that threads are on the wait queue + * because we're capping, we know that there are unused + * CPU cycles anyway, so this extra work would not + * hurt. Also, the ndequeue number is only an upper + * bound and we might dequeue fewer, depending on how + * many threads are actually in the wait queue. The + * ndequeue values are empirically derived and could be + * adjusted or calculated in another way if necessary. + */ + p = (int)((100 * cap->cap_usage) / cap->cap_chk_value); + if (p >= 98) + ndequeue = 10; + else if (p >= 95) + ndequeue = 20; + else if (p >= 90) + ndequeue = 40; + else if (p >= 85) + ndequeue = 80; + else + ndequeue = 160; + + for (i = 0; i < ndequeue; i++) { + waitq_runone(wq); + if (waitq_isempty(wq)) + break; + } + DTRACE_PROBE2(cpucaps__pokeq, int, p, int, i); + } } } @@ -629,14 +747,14 @@ cap_project_zone_modify_walker(kproject_t *kpj, void *arg) * Remove all projects in this zone without caps * from the capped_projects list. */ - if (project_cap->cap_value == MAX_USAGE) { + if (project_cap->cap_chk_value == MAX_USAGE) { cap_project_disable(kpj); } } else if (CAP_DISABLED(project_cap)) { /* * Add the project to capped_projects list. */ - ASSERT(project_cap->cap_value == 0); + ASSERT(project_cap->cap_chk_value == 0); cap_project_enable(kpj, MAX_USAGE); } mutex_exit(&caps_lock); @@ -746,7 +864,7 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) /* * No state transitions, just change the value */ - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } ASSERT(MUTEX_HELD(&caps_lock)); @@ -757,6 +875,108 @@ cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) } /* + * Set zone's base CPU value to base_val + */ +int +cpucaps_zone_set_base(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= MAXCAP); + if (base_val > MAXCAP) + base_val = MAXCAP; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = base_val * cap_tick_cost; + if (value < 0 || value > cap->cap_value) + value = 0; + + cap->cap_base = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* + * Set zone's maximum burst time in seconds.
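Taken together, the cap_poke_waitq() changes above implement a small per-tick state machine: count time spent above the baseline, throttle the effective cap down to the baseline once the burst limit is exhausted, and decay back to zero before permitting another burst; the same function also now dequeues a usage-scaled batch of waiters instead of one per tick. A user-land model of just the bursting transitions, with plain integers standing in for hrtime-scaled usage and an illustrative driver loop:

/*
 * Model of the per-tick bursting logic in cap_poke_waitq().  Units and
 * the driver loop are illustrative only.
 */
#include <stdio.h>

typedef struct cap {
	long value;		/* configured cap */
	long base;		/* zone.cpu-baseline */
	long burst_limit;	/* ticks of allowed bursting; 0 = unlimited */
	long chk_value;		/* effective cap enforced this tick */
	long usage;		/* current decayed usage */
	long bursting;		/* ticks currently spent bursting */
	long above_base;	/* cumulative ticks above baseline */
} cap_t;

static void
tick(cap_t *c)
{
	if (c->base == 0)
		return;			/* no baseline: nothing to do */
	if (c->usage > c->base && c->chk_value == c->value) {
		c->above_base++;
		if (c->burst_limit != 0) {
			c->bursting++;
			if (c->bursting >= c->burst_limit)
				c->chk_value = c->base;	/* throttle */
		}
	} else if (c->bursting > 0) {
		c->bursting--;				/* decay */
		if (c->bursting == 0 && c->chk_value != c->value)
			c->chk_value = c->value;	/* may burst again */
	}
}

int
main(void)
{
	cap_t c = { .value = 100, .base = 50, .burst_limit = 3,
	    .chk_value = 100, .usage = 80 };

	for (int i = 0; i < 8; i++) {
		tick(&c);
		printf("tick %d: effective=%ld bursting=%ld\n",
		    i, c.chk_value, c.bursting);
		if (c.chk_value == c.base)
			c.usage = 40;	/* now held below the baseline */
	}
	return (0);
}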
A burst time of 0 means that + * the zone can run over its baseline indefinitely. + */ +int +cpucaps_zone_set_burst_time(zone_t *zone, rctl_qty_t base_val) +{ + cpucap_t *cap = NULL; + hrtime_t value; + + ASSERT(base_val <= INT_MAX); + /* Treat the default as 0 - no limit */ + if (base_val == INT_MAX) + base_val = 0; + if (base_val > INT_MAX) + base_val = INT_MAX; + + if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) + return (0); + + if (zone->zone_cpucap == NULL) + cap = cap_alloc(); + + mutex_enter(&caps_lock); + + if (cpucaps_busy) { + mutex_exit(&caps_lock); + return (EBUSY); + } + + /* + * Double-check whether zone->zone_cpucap is NULL, now with caps_lock + * held. If it is still NULL, assign a newly allocated cpucap to it. + */ + if (zone->zone_cpucap == NULL) { + zone->zone_cpucap = cap; + } else if (cap != NULL) { + cap_free(cap); + } + + cap = zone->zone_cpucap; + + value = SEC_TO_TICK(base_val); + if (value < 0) + value = 0; + + cap->cap_burst_limit = value; + + mutex_exit(&caps_lock); + + return (0); +} + +/* * The project is going away so disable its cap. */ void @@ -902,7 +1122,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) if (CAP_DISABLED(cap)) cap_project_enable(kpj, value); else - cap->cap_value = value; + cap->cap_value = cap->cap_chk_value = value; } else if (CAP_ENABLED(cap)) { /* * User requested to drop a cap on the project. If it is part of @@ -910,7 +1130,7 @@ cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) * otherwise disable the cap. */ if (ZONE_IS_CAPPED(kpj->kpj_zone)) { - cap->cap_value = MAX_USAGE; + cap->cap_value = cap->cap_chk_value = MAX_USAGE; } else { cap_project_disable(kpj); } @@ -948,6 +1168,26 @@ cpucaps_zone_get(zone_t *zone) } /* + * Get current zone baseline. + */ +rctl_qty_t +cpucaps_zone_get_base(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(zone->zone_cpucap->cap_base / cap_tick_cost) : 0); +} + +/* + * Get current zone maximum burst time. + */ +rctl_qty_t +cpucaps_zone_get_burst_time(zone_t *zone) +{ + return (zone->zone_cpucap != NULL ? + (rctl_qty_t)(TICK_TO_SEC(zone->zone_cpucap->cap_burst_limit)) : 0); +} + +/* * Charge project of thread t the time thread t spent on CPU since previously * adjusted. * @@ -1045,7 +1285,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) project_cap = kpj->kpj_cpucap; - if (project_cap->cap_usage >= project_cap->cap_value) { + if (project_cap->cap_usage >= project_cap->cap_chk_value) { t->t_schedflag |= TS_PROJWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_PROJWAITQ) { @@ -1059,7 +1299,7 @@ cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) } else { cpucap_t *zone_cap = zone->zone_cpucap; - if (zone_cap->cap_usage >= zone_cap->cap_value) { + if (zone_cap->cap_usage >= zone_cap->cap_chk_value) { t->t_schedflag |= TS_ZONEWAITQ; rc = B_TRUE; } else if (t->t_schedflag & TS_ZONEWAITQ) { @@ -1119,6 +1359,7 @@ cpucaps_enforce(kthread_t *t) /* * Convert internal cap statistics into values exported by cap kstat. + * Note that the kstat is held throughout this function but caps_lock is not. 
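Note the unit handling in these setters: the baseline is stored scaled by cap_tick_cost while the burst limit is stored in ticks via SEC_TO_TICK(), and the getters invert both conversions. A round-trip sketch, where HZ and CAP_TICK_COST are illustrative stand-ins rather than the kernel's actual values:

/*
 * Round-trip sketch of the unit conversions used by the baseline and
 * burst-time setters/getters.  Constants are stand-ins.
 */
#include <stdio.h>

#define	HZ		100		/* illustrative clock rate */
#define	CAP_TICK_COST	1000		/* stand-in scale factor */
#define	SEC_TO_TICK(s)	((s) * HZ)
#define	TICK_TO_SEC(t)	((t) / HZ)

int
main(void)
{
	long long base_pct = 50;	/* zone.cpu-baseline input */
	long long burst_sec = 300;	/* zone.cpu-burst-time input */

	/* the setters scale into internal units... */
	long long cap_base = base_pct * CAP_TICK_COST;
	long long cap_burst_limit = SEC_TO_TICK(burst_sec);

	/* ...and the getters scale back out */
	printf("baseline: %lld -> %lld\n", cap_base,
	    cap_base / CAP_TICK_COST);
	printf("burst: %lld ticks -> %lld sec\n", cap_burst_limit,
	    TICK_TO_SEC(cap_burst_limit));
	return (0);
}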
*/ static int cap_kstat_update(kstat_t *ksp, int rw) @@ -1133,6 +1374,12 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_value.value.ui64 = ROUND_SCALE(cap->cap_value, cap_tick_cost); + capsp->cap_baseline.value.ui64 = + ROUND_SCALE(cap->cap_base, cap_tick_cost); + capsp->cap_effective.value.ui64 = + ROUND_SCALE(cap->cap_chk_value, cap_tick_cost); + capsp->cap_burst_limit.value.ui64 = + ROUND_SCALE(cap->cap_burst_limit, tick_sec); capsp->cap_usage.value.ui64 = ROUND_SCALE(cap->cap_usage, cap_tick_cost); capsp->cap_maxusage.value.ui64 = @@ -1140,6 +1387,10 @@ cap_kstat_update(kstat_t *ksp, int rw) capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); + capsp->cap_above_base.value.ui64 = + ROUND_SCALE(cap->cap_above_base, tick_sec); + capsp->cap_bursting.value.ui64 = + ROUND_SCALE(cap->cap_bursting, tick_sec); kstat_named_setstr(&capsp->cap_zonename, zonename); return (0); diff --git a/usr/src/uts/common/disp/disp.c b/usr/src/uts/common/disp/disp.c index 0c2c0b4993..5f9c2c68a2 100644 --- a/usr/src/uts/common/disp/disp.c +++ b/usr/src/uts/common/disp/disp.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ @@ -105,7 +109,7 @@ static void cpu_resched(cpu_t *cp, pri_t tpri); /* * If this is set, only interrupt threads will cause kernel preemptions. * This is done by changing the value of kpreemptpri. kpreemptpri - * will either be the max sysclass pri + 1 or the min interrupt pri. + * will either be the max sysclass pri or the min interrupt pri. */ int only_intr_kpreempt; @@ -252,7 +256,23 @@ dispinit(void) maxglobpri = cl_maxglobpri; } } - kpreemptpri = (pri_t)v.v_maxsyspri + 1; + + /* + * Historically, kpreemptpri was set to v_maxsyspri + 1 -- which is + * to say, maxclsyspri + 1. However, over time, the system has used + * more and more asynchronous kernel threads, with an increasing number + * of these doing work on direct behalf of higher-level software (e.g., + * network processing). This has led to potential priority inversions: + * threads doing low-priority lengthy kernel work can effectively + * delay kernel-level processing of higher-priority data. To minimize + * such inversions, we set kpreemptpri to be v_maxsyspri; anything in + * the kernel that runs at maxclsyspri will therefore induce kernel + * preemption, and this priority should be used if/when an asynchronous + * thread (or, as is often the case, task queue) is performing a task + * on behalf of higher-level software (or any task that is otherwise + * latency-sensitive). + */ + kpreemptpri = (pri_t)v.v_maxsyspri; if (kpqpri == KPQPRI) kpqpri = kpreemptpri; @@ -2258,7 +2278,7 @@ disp_getbest(disp_t *dp) * placed earlier. */ if (tcp == NULL || - pri >= minclsyspri || + (pri >= minclsyspri && tp->t_procp == &p0) || tp->t_cpu != tcp) break; diff --git a/usr/src/uts/common/disp/thread.c b/usr/src/uts/common/disp/thread.c index f2685af534..ae6c5eef16 100644 --- a/usr/src/uts/common/disp/thread.c +++ b/usr/src/uts/common/disp/thread.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved.
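With cap_kstat_update() exporting the new named values, the bursting behavior can be watched from user land. A libkstat consumer sketch follows; it assumes the zone cap kstats remain published under the "caps" module with names of the form "cpucaps_zone_<id>", as existing cpucaps kstats are. Compile with -lkstat on an illumos system:

/*
 * Sketch: walk the kstat chain and print the new "effective" and
 * "bursting_sec" values for each zone cap.
 */
#include <stdio.h>
#include <string.h>
#include <kstat.h>

int
main(void)
{
	kstat_ctl_t *kc;
	kstat_t *ksp;

	if ((kc = kstat_open()) == NULL) {
		perror("kstat_open");
		return (1);
	}

	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
		kstat_named_t *eff, *burst;

		if (strcmp(ksp->ks_module, "caps") != 0 ||
		    strncmp(ksp->ks_name, "cpucaps_zone", 12) != 0)
			continue;
		if (kstat_read(kc, ksp, NULL) == -1)
			continue;

		eff = kstat_data_lookup(ksp, "effective");
		burst = kstat_data_lookup(ksp, "bursting_sec");
		if (eff != NULL && burst != NULL) {
			printf("%s: effective=%llu bursting_sec=%llu\n",
			    ksp->ks_name,
			    (u_longlong_t)eff->value.ui64,
			    (u_longlong_t)burst->value.ui64);
		}
	}

	(void) kstat_close(kc);
	return (0);
}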
*/ #include <sys/types.h> @@ -75,6 +75,10 @@ #include <sys/cpucaps.h> #include <sys/kiconv.h> +#ifndef STACK_GROWTH_DOWN +#error Stacks do not grow downward; 3b2 zombie attack detected! +#endif + struct kmem_cache *thread_cache; /* cache of free threads */ struct kmem_cache *lwp_cache; /* cache of free lwps */ struct kmem_cache *turnstile_cache; /* cache of free turnstiles */ @@ -372,7 +376,7 @@ thread_create( if (stksize <= sizeof (kthread_t) + PTR24_ALIGN) cmn_err(CE_PANIC, "thread_create: proposed stack size" " too small to hold thread."); -#ifdef STACK_GROWTH_DOWN + stksize -= SA(sizeof (kthread_t) + PTR24_ALIGN - 1); stksize &= -PTR24_ALIGN; /* make thread aligned */ t = (kthread_t *)(stk + stksize); @@ -381,13 +385,6 @@ thread_create( audit_thread_create(t); t->t_stk = stk + stksize; t->t_stkbase = stk; -#else /* stack grows to larger addresses */ - stksize -= SA(sizeof (kthread_t)); - t = (kthread_t *)(stk); - bzero(t, sizeof (kthread_t)); - t->t_stk = stk + sizeof (kthread_t); - t->t_stkbase = stk + stksize + sizeof (kthread_t); -#endif /* STACK_GROWTH_DOWN */ t->t_flag |= T_TALLOCSTK; t->t_swap = stk; } else { @@ -400,13 +397,8 @@ thread_create( * Initialize t_stk to the kernel stack pointer to use * upon entry to the kernel */ -#ifdef STACK_GROWTH_DOWN t->t_stk = stk + stksize; t->t_stkbase = stk; -#else - t->t_stk = stk; /* 3b2-like */ - t->t_stkbase = stk + stksize; -#endif /* STACK_GROWTH_DOWN */ } if (kmem_stackinfo != 0) { @@ -589,6 +581,9 @@ thread_exit(void) if ((t->t_proc_flag & TP_ZTHREAD) != 0) cmn_err(CE_PANIC, "thread_exit: zthread_exit() not called"); + if ((t->t_flag & T_SPLITSTK) != 0) + cmn_err(CE_PANIC, "thread_exit: called when stack is split"); + tsd_exit(); /* Clean up this thread's TSD */ kcpc_passivate(); /* clean up performance counter state */ @@ -1050,6 +1045,8 @@ installctx( ctx->free_op = free; ctx->arg = arg; ctx->next = t->t_ctx; + ctx->save_ts = 0; + ctx->restore_ts = 0; t->t_ctx = ctx; } @@ -1124,9 +1121,12 @@ savectx(kthread_t *t) struct ctxop *ctx; ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->save_op != NULL) + for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) { + if (ctx->save_op != NULL) { + ctx->save_ts = gethrtime_unscaled(); (ctx->save_op)(ctx->arg); + } + } } void @@ -1135,9 +1135,12 @@ restorectx(kthread_t *t) struct ctxop *ctx; ASSERT(t == curthread); - for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) - if (ctx->restore_op != NULL) + for (ctx = t->t_ctx; ctx != 0; ctx = ctx->next) { + if (ctx->restore_op != NULL) { + ctx->restore_ts = gethrtime_unscaled(); (ctx->restore_op)(ctx->arg); + } + } } void @@ -1883,6 +1886,103 @@ thread_change_pri(kthread_t *t, pri_t disp_pri, int front) return (on_rq); } + +/* + * There are occasions in the kernel when we need much more stack than we + * allocate by default, but we do not wish to have that work done + * asynchronously by another thread. To accommodate these scenarios, we allow + * for a split stack (also known as a "segmented stack") whereby a new stack + * is dynamically allocated and the current thread jumps onto it for purposes + * of executing the specified function. After the specified function returns, + * the stack is deallocated and control is returned to the caller. This + * functionality is implemented by thread_splitstack(), below; there are a few + * constraints on its use: + * + * - The caller must be in a context where it is safe to block for memory. 
+ * - The caller cannot be in a t_onfault context + * - The called function must not call thread_exit() while on the split stack + * + * The code will explicitly panic if these constraints are violated. Notably, + * however, thread_splitstack() _can_ be called on a split stack -- there + * is no limit to the level that split stacks can nest. + * + * When the stack is split, it is constructed such that stack backtraces + * from kernel debuggers continue to function -- though note that DTrace's + * stack() action and stackdepth function will only show the stack up to and + * including thread_splitstack_run(); DTrace explicitly bounds itself to + * pointers that exist within the current declared stack as a safety + * mechanism. + */ +void +thread_splitstack(void (*func)(void *), void *arg, size_t stksize) +{ + kthread_t *t = curthread; + caddr_t ostk, ostkbase, stk; + ushort_t otflag; + + if (t->t_onfault != NULL) + panic("thread_splitstack: called with non-NULL t_onfault"); + + ostk = t->t_stk; + ostkbase = t->t_stkbase; + otflag = t->t_flag; + + stksize = roundup(stksize, PAGESIZE); + + if (stksize < default_stksize) + stksize = default_stksize; + + if (stksize == default_stksize) { + stk = (caddr_t)segkp_cache_get(segkp_thread); + } else { + stksize = roundup(stksize, PAGESIZE); + stk = (caddr_t)segkp_get(segkp, stksize, + (KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED)); + } + + /* + * We're going to lock ourselves before we set T_SPLITSTK to assure + * that we're not swapped out in the meantime. (Note that we don't + * bother to set t_swap, as we're not going to be swapped out.) + */ + thread_lock(t); + + if (!(otflag & T_SPLITSTK)) + t->t_flag |= T_SPLITSTK; + + t->t_stk = stk + stksize; + t->t_stkbase = stk; + + thread_unlock(t); + + /* + * Now actually run on the new (split) stack... + */ + thread_splitstack_run(t->t_stk, func, arg); + + /* + * We're back onto our own stack; lock ourselves and restore our + * pre-split state. + */ + thread_lock(t); + + t->t_stk = ostk; + t->t_stkbase = ostkbase; + + if (!(otflag & T_SPLITSTK)) + t->t_flag &= ~T_SPLITSTK; + + thread_unlock(t); + + /* + * Now that we are entirely back on our own stack, call back into + * the platform layer to perform any platform-specific cleanup. + */ + thread_splitstack_cleanup(); + + segkp_release(segkp, stk); +} + /* * Tunable kmem_stackinfo is set, fill the kernel thread stack with a * specific pattern. diff --git a/usr/src/uts/common/disp/thread_intr.c b/usr/src/uts/common/disp/thread_intr.c index 67ccc6922f..c840bdf31a 100644 --- a/usr/src/uts/common/disp/thread_intr.c +++ b/usr/src/uts/common/disp/thread_intr.c @@ -23,19 +23,10 @@ * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - /* - * FILE NOTICE BEGIN - * - * This file should not be modified. If you wish to modify it or have it - * modified, please contact Sun Microsystems at <LFI149367@-sun-.-com-> - * (without anti-spam dashes) - * - * FILE NOTICE END + * Copyright 2015, Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/cpuvar.h> #include <sys/stack.h> #include <vm/seg_kp.h> @@ -44,6 +35,17 @@ #include <sys/sysmacros.h> /* + * Use a slightly larger thread stack size for interrupt threads rather than the + * default. This is useful for cases where the networking stack may do an rx and + * a tx in the context of a single interrupt and when combined with various + * promisc hooks that need memory, can cause us to get dangerously close to the + * edge of the traditional stack sizes. 
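A hypothetical caller sketch for the new interface (job_t and deep_work() are invented for illustration): run a stack-hungry operation on a temporary 64K stack and collect its result, observing the constraints listed above. This is kernel-context code, grounded only in the thread_splitstack() signature shown in the hunk:

/* runs on the temporary stack; must not call thread_exit() */
typedef struct job {
	int j_input;
	int j_result;
} job_t;

static void
deep_work(void *arg)
{
	job_t *jp = arg;

	/* deep recursion or large locals are safe here */
	jp->j_result = jp->j_input * 2;
}

static int
run_deep_work(int input)
{
	job_t job = { .j_input = input };

	/* may block to allocate the stack; returns after deep_work does */
	thread_splitstack(deep_work, &job, 64 * 1024);
	return (job.j_result);
}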
This is only a few pages more than a + * traditional stack and given that we don't have that many interrupt threads, + * the memory costs end up being more than worthwhile. + */ +#define LL_INTR_STKSZ (32 * 1024) + +/* * Create and initialize an interrupt thread. */ static void @@ -51,7 +53,7 @@ thread_create_intr(cpu_t *cp) { kthread_t *tp; - tp = thread_create(NULL, 0, + tp = thread_create(NULL, LL_INTR_STKSZ, (void (*)())thread_create_intr, NULL, 0, &p0, TS_ONPROC, 0); /* @@ -97,9 +99,12 @@ thread_create_intr(cpu_t *cp) } /* - * Allocate a given number of interrupt threads for a given CPU. - * These threads will get freed by cpu_destroy_bound_threads() - * when CPU gets unconfigured. + * Allocate a given number of interrupt threads for a given CPU. These threads + * will get freed by cpu_destroy_bound_threads() when CPU gets unconfigured. + * + * Note, high level interrupts are always serviced using cpu_intr_stack and are + * not allowed to block. Low level interrupts or soft-interrupts use the + * kthread_t's that we create through the calls to thread_create_intr(). */ void cpu_intr_alloc(cpu_t *cp, int n) @@ -110,6 +115,6 @@ cpu_intr_alloc(cpu_t *cp, int n) thread_create_intr(cp); cp->cpu_intr_stack = (caddr_t)segkp_get(segkp, INTR_STACK_SIZE, - KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) + - INTR_STACK_SIZE - SA(MINFRAME); + KPD_HASREDZONE | KPD_NO_ANON | KPD_LOCKED) + + INTR_STACK_SIZE - SA(MINFRAME); } diff --git a/usr/src/uts/common/dtrace/dtrace.c b/usr/src/uts/common/dtrace/dtrace.c index aa4b1d67e0..1a7876ca57 100644 --- a/usr/src/uts/common/dtrace/dtrace.c +++ b/usr/src/uts/common/dtrace/dtrace.c @@ -785,8 +785,8 @@ dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate, if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz)) return (1); - if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz, - vp->v_path, strlen(vp->v_path) + 1)) { + if (DTRACE_INRANGE(addr, sz, vp->v_path, + strlen(vp->v_path) + 1)) { return (1); } @@ -845,7 +845,7 @@ static int dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, dtrace_vstate_t *vstate) { - size_t sz; + size_t sz, strsize; ASSERT(type->dtdt_flags & DIF_TF_BYREF); /* @@ -855,11 +855,24 @@ dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate, if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) return (1); - if (type->dtdt_kind == DIF_TYPE_STRING) - sz = dtrace_strlen(src, - vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1; - else + if (type->dtdt_kind == DIF_TYPE_STRING) { + dtrace_state_t *state = vstate->dtvs_state; + + if (state != NULL) { + strsize = state->dts_options[DTRACEOPT_STRSIZE]; + } else { + /* + * In helper context, we have a NULL state; fall back + * to using the system-wide default for the string size + * in this case. 
+ */ + strsize = dtrace_strsize_default; + } + + sz = dtrace_strlen(src, strsize) + 1; + } else { sz = type->dtdt_size; + } return (dtrace_canload((uintptr_t)src, sz, mstate, vstate)); } @@ -7506,7 +7519,7 @@ dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp) priv = DTRACE_PRIV_ALL; } else { *uidp = crgetuid(cr); - *zoneidp = crgetzoneid(cr); + *zoneidp = crgetzonedid(cr); priv = 0; if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) @@ -8002,7 +8015,7 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv, provider->dtpv_priv.dtpp_flags = priv; if (cr != NULL) { provider->dtpv_priv.dtpp_uid = crgetuid(cr); - provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr); + provider->dtpv_priv.dtpp_zoneid = crgetzonedid(cr); } provider->dtpv_pops = *pops; @@ -8613,6 +8626,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) uint32_t priv; uid_t uid; zoneid_t zoneid; + dtrace_state_t *state = enab->dten_vstate->dtvs_state; ASSERT(MUTEX_HELD(&dtrace_lock)); dtrace_ecb_create_cache = NULL; @@ -8627,8 +8641,22 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab) } dtrace_probekey(desc, &pkey); - dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred, - &priv, &uid, &zoneid); + dtrace_cred2priv(state->dts_cred.dcr_cred, &priv, &uid, &zoneid); + + if ((priv & DTRACE_PRIV_ZONEOWNER) && + state->dts_options[DTRACEOPT_ZONE] != DTRACEOPT_UNSET) { + /* + * If we have the privilege of instrumenting all zones but we + * have been told to instrument but one, we will spoof this up + * depriving ourselves of DTRACE_PRIV_ZONEOWNER for purposes + * of dtrace_match(). (Note that DTRACEOPT_ZONE is not for + * security but rather for performance: it allows the global + * zone to instrument USDT probes in a local zone without + * requiring all zones to be instrumented.) + */ + priv &= ~DTRACE_PRIV_ZONEOWNER; + zoneid = state->dts_options[DTRACEOPT_ZONE]; + } return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab)); diff --git a/usr/src/uts/common/dtrace/sdt_subr.c b/usr/src/uts/common/dtrace/sdt_subr.c index 157acc25fc..3d350ff278 100644 --- a/usr/src/uts/common/dtrace/sdt_subr.c +++ b/usr/src/uts/common/dtrace/sdt_subr.c @@ -97,6 +97,10 @@ static dtrace_pattr_t iscsi_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, }; +/* + * When adding a new provider you must add it before sdt as sdt is a catch all + * for remaining probes. 
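The dtrace_vcanload() fix above guards the helper path, where dtvs_state is NULL. The essential logic, shrunk to a stand-alone helper with mocked types (names and the default value here are stand-ins):

#include <stdio.h>

#define	DTRACEOPT_STRSIZE	0
typedef struct dtrace_state { long long dts_options[8]; } dtrace_state_t;

static long long dtrace_strsize_default = 256;	/* stand-in value */

static long long
effective_strsize(dtrace_state_t *state)
{
	/* helper contexts have no state; use the system-wide default */
	if (state == NULL)
		return (dtrace_strsize_default);
	return (state->dts_options[DTRACEOPT_STRSIZE]);
}

int
main(void)
{
	dtrace_state_t s = { .dts_options = { 1024 } };

	printf("with state: %lld\n", effective_strsize(&s));
	printf("helper (NULL state): %lld\n", effective_strsize(NULL));
	return (0);
}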
+ */ sdt_provider_t sdt_providers[] = { { "vtrace", "__vtrace_", &vtrace_attr }, { "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER }, @@ -117,6 +121,7 @@ sdt_provider_t sdt_providers[] = { { "fc", "__fc_", &fc_attr }, { "srp", "__srp_", &fc_attr }, { "sysevent", "__sysevent_", &stab_attr }, + { "vnd", "__vnd_", &stab_attr }, { "sdt", NULL, &sdt_attr }, { NULL } }; @@ -1151,6 +1156,34 @@ sdt_argdesc_t sdt_args[] = { { "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *", "fc_port_info_t *" }, + { "vnd", "flow-blocked", 0, 0, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "flow-blocked", 1, 1, "uint64_t", "uint64_t" }, + { "vnd", "flow-blocked", 2, 2, "uintptr_t", "uintptr_t" }, + { "vnd", "flow-resumed", 0, 0, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "flow-resumed", 1, 1, "uint64_t", "uint64_t" }, + { "vnd", "flow-resumed", 2, 2, "uintptr_t", "uintptr_t" }, + { "vnd", "drop-in", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-in", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-in", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-in", 3, 3, "const char *", "const char *" }, + { "vnd", "drop-out", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-out", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-out", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-out", 3, 3, "const char *", "const char *" }, + { "vnd", "drop-ctl", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "drop-ctl", 1, 1, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "drop-ctl", 2, 2, "mblk_t *", "etherinfo_t *" }, + { "vnd", "drop-ctl", 3, 3, "const char *", "const char *" }, + { "vnd", "send", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "send", 1, 1, "void *", "csinfo_t *" }, + { "vnd", "send", 2, 2, "void *", "ipinfo_t *" }, + { "vnd", "send", 3, 3, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "send", 4, 4, "mblk_t *", "etherinfo_t *" }, + { "vnd", "recv", 0, 0, "mblk_t *", "pktinfo_t *" }, + { "vnd", "recv", 1, 1, "void *", "csinfo_t *" }, + { "vnd", "recv", 2, 2, "void *", "ipinfo_t *" }, + { "vnd", "recv", 3, 3, "vnd_str_t *", "ifinfo_t *" }, + { "vnd", "recv", 4, 4, "mblk_t *", "etherinfo_t *" }, { NULL } }; diff --git a/usr/src/uts/common/exec/aout/aout.c b/usr/src/uts/common/exec/aout/aout.c index fc45bd9544..5dbb2ed28c 100644 --- a/usr/src/uts/common/exec/aout/aout.c +++ b/usr/src/uts/common/exec/aout/aout.c @@ -22,6 +22,7 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright (c) 2011 Bayard G. Bell. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #include <sys/types.h> @@ -54,7 +55,7 @@ static int aoutexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, - caddr_t exec_file, cred_t *cred, int brand_action); + caddr_t exec_file, cred_t *cred, int *brand_action); static int get_aout_head(struct vnode **vpp, struct exdata *edp, long *execsz, int *isdyn); static int aoutcore(vnode_t *vp, proc_t *pp, cred_t *credp, @@ -130,7 +131,7 @@ _info(struct modinfo *modinfop) static int aoutexec(vnode_t *vp, struct execa *uap, struct uarg *args, struct intpdata *idatap, int level, long *execsz, int setid, - caddr_t exec_file, cred_t *cred, int brand_action) + caddr_t exec_file, cred_t *cred, int *brand_action) { auxv32_t auxflags_auxv32; int error; diff --git a/usr/src/uts/common/exec/elf/elf.c b/usr/src/uts/common/exec/elf/elf.c index 0eeb4e798c..936a3f1da1 100644 --- a/usr/src/uts/common/exec/elf/elf.c +++ b/usr/src/uts/common/exec/elf/elf.c @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -163,12 +163,16 @@ dtrace_safe_phdr(Phdr *phdrp, struct uarg *args, uintptr_t base) } /* - * Map in the executable pointed to by vp. Returns 0 on success. + * Map in the executable pointed to by vp. Returns 0 on success. Note that + * this function currently has the maximum number of arguments allowed by + * modstubs on x86 (MAXNARG)! Do _not_ add to this function signature without + * adding to MAXNARG. (Better yet, do not add to this monster of a function + * signature!) */ int mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, - intptr_t *voffset, caddr_t exec_file, int *interp, caddr_t *bssbase, - caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap) + intptr_t *voffset, caddr_t exec_file, char **interpp, caddr_t *bssbase, + caddr_t *brkbase, size_t *brksize, uintptr_t *lddatap, uintptr_t *minaddrp) { size_t len; struct vattr vat; @@ -180,6 +184,7 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, Phdr *junk = NULL; Phdr *dynphdr = NULL; Phdr *dtrphdr = NULL; + char *interp = NULL; uintptr_t lddata; long execsz; intptr_t minaddr; @@ -187,6 +192,9 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, if (lddatap != NULL) *lddatap = NULL; + if (minaddrp != NULL) + *minaddrp = NULL; + if (error = execpermissions(vp, &vat, args)) { uprintf("%s: Cannot execute %s\n", exec_file, args->pathname); return (error); @@ -212,25 +220,65 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, &junk, &dtrphdr, NULL, bssbase, brkbase, voffset, &minaddr, len, &execsz, brksize)) { uprintf("%s: Cannot map %s\n", exec_file, args->pathname); + if (uphdr != NULL && uphdr->p_flags == 0) + kmem_free(uphdr, sizeof (Phdr)); kmem_free(phdrbase, phdrsize); return (error); } + if (minaddrp != NULL) + *minaddrp = minaddr; + /* - * Inform our caller if the executable needs an interpreter. + * If the executable requires an interpreter, determine its name. */ - *interp = (dynphdr == NULL) ? 
0 : 1; + if (dynphdr != NULL) { + ssize_t resid; + + if (dynphdr->p_filesz > MAXPATHLEN || dynphdr->p_filesz == 0) { + uprintf("%s: Invalid interpreter\n", exec_file); + kmem_free(phdrbase, phdrsize); + return (ENOEXEC); + } + + interp = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + if ((error = vn_rdwr(UIO_READ, vp, interp, dynphdr->p_filesz, + (offset_t)dynphdr->p_offset, UIO_SYSSPACE, 0, + (rlim64_t)0, CRED(), &resid)) != 0 || resid != 0 || + interp[dynphdr->p_filesz - 1] != '\0') { + uprintf("%s: Cannot obtain interpreter pathname\n", + exec_file); + kmem_free(interp, MAXPATHLEN); + kmem_free(phdrbase, phdrsize); + return (error != 0 ? error : ENOEXEC); + } + } /* * If this is a statically linked executable, voffset should indicate * the address of the executable itself (it normally holds the address * of the interpreter). */ - if (ehdr->e_type == ET_EXEC && *interp == 0) + if (ehdr->e_type == ET_EXEC && interp == NULL) *voffset = minaddr; + /* + * If the caller has asked for the interpreter name, return it (it's + * up to the caller to free it); if the caller hasn't asked for it, + * free it ourselves. + */ + if (interpp != NULL) { + *interpp = interp; + } else if (interp != NULL) { + kmem_free(interp, MAXPATHLEN); + } + if (uphdr != NULL) { *uphdr_vaddr = uphdr->p_vaddr; + + if (uphdr->p_flags == 0) + kmem_free(uphdr, sizeof (Phdr)); } else { *uphdr_vaddr = (Addr)-1; } @@ -243,13 +291,13 @@ mapexec_brand(vnode_t *vp, uarg_t *args, Ehdr *ehdr, Addr *uphdr_vaddr, int elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action) + int *brand_action) { caddr_t phdrbase = NULL; caddr_t bssbase = 0; caddr_t brkbase = 0; size_t brksize = 0; - ssize_t dlnsize; + ssize_t dlnsize, nsize = 0; aux_entry_t *aux; int error; ssize_t resid; @@ -273,6 +321,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int hasauxv = 0; int hasdy = 0; int branded = 0; + int dynuphdr = 0; struct proc *p = ttoproc(curthread); struct user *up = PTOU(p); @@ -339,14 +388,40 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, #endif /* _LP64 */ /* - * We delay invoking the brand callback until we've figured out - * what kind of elf binary we're trying to run, 32-bit or 64-bit. - * We do this because now the brand library can just check - * args->to_model to see if the target is 32-bit or 64-bit without - * having do duplicate all the code above. + * We delay invoking the brand callback until we've figured out what + * kind of elf binary we're trying to run, 32-bit or 64-bit. We do this + * because now the brand library can just check args->to_model to see if + * the target is 32-bit or 64-bit without having do duplicate all the + * code above. + * + * We also give the brand a chance to indicate that based on the ELF + * OSABI of the target binary it should become unbranded and optionally + * indicate that it should be treated as existing in a specific prefix. + * + * Note that if a brand opts to go down this route it does not actually + * end up being debranded. In other words, future programs that exec + * will still be considered for branding unless this escape hatch is + * used. Consider the case of lx brand for example. 
If a user runs + * /native/usr/sbin/dtrace -c /bin/ls, the isaexec and normal executable + * of DTrace that's in /native will take this escape hatch and be run + * and interpreted using the normal system call table; however, the + * execution of a non-illumos binary in the form of /bin/ls will still + * be branded and be subject to all of the normal actions of the brand. */ + if ((level < 2) && (*brand_action != EBA_NATIVE) && + (PROC_IS_BRANDED(p)) && (BROP(p)->b_native_exec != NULL)) { + if (BROP(p)->b_native_exec(ehdrp->e_ident[EI_OSABI], + &args->brand_nroot) == B_TRUE) { + ASSERT(ehdrp->e_ident[EI_OSABI]); + *brand_action = EBA_NATIVE; + /* Add one for the trailing '/' in the path */ + if (args->brand_nroot != NULL) + nsize = strlen(args->brand_nroot) + 1; + } + } + if ((level < 2) && - (brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + (*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { error = BROP(p)->b_elfexec(vp, uap, args, idatap, level + 1, execsz, setid, exec_file, cred, brand_action); @@ -417,14 +492,15 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * AT_BASE * AT_FLAGS * AT_PAGESZ + * AT_RANDOM (added in stk_copyout) * AT_SUN_AUXFLAGS * AT_SUN_HWCAP * AT_SUN_HWCAP2 - * AT_SUN_PLATFORM (added in stk_copyout) - * AT_SUN_EXECNAME (added in stk_copyout) + * AT_SUN_PLATFORM (added in stk_copyout) + * AT_SUN_EXECNAME (added in stk_copyout) * AT_NULL * - * total == 9 + * total == 10 */ if (hasdy && hasu) { /* @@ -439,7 +515,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 5 */ - args->auxsize = (9 + 5) * sizeof (aux_entry_t); + args->auxsize = (10 + 5) * sizeof (aux_entry_t); } else if (hasdy) { /* * Has PT_INTERP but no PT_PHDR @@ -449,9 +525,9 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * * total = 2 */ - args->auxsize = (9 + 2) * sizeof (aux_entry_t); + args->auxsize = (10 + 2) * sizeof (aux_entry_t); } else { - args->auxsize = 9 * sizeof (aux_entry_t); + args->auxsize = 10 * sizeof (aux_entry_t); } } else { args->auxsize = 0; @@ -464,13 +540,21 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, if (args->emulator != NULL) args->auxsize += sizeof (aux_entry_t); - if ((brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { + /* + * If this is a native binary that's been given a modified interpreter + * root, inform it that the native system exists at that root. + */ + if (args->brand_nroot != NULL) { + args->auxsize += sizeof (aux_entry_t); + } + + if ((*brand_action != EBA_NATIVE) && (PROC_IS_BRANDED(p))) { branded = 1; /* - * We will be adding 4 entries to the aux vectors. One for - * the the brandname and 3 for the brand specific aux vectors. + * We will be adding 5 entries to the aux vectors. One for + * the the brandname and 4 for the brand specific aux vectors. */ - args->auxsize += 4 * sizeof (aux_entry_t); + args->auxsize += 5 * sizeof (aux_entry_t); } /* Hardware/Software capabilities */ @@ -501,7 +585,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, aux = bigwad->elfargs; /* * Move args to the user's stack. - * This can fill in the AT_SUN_PLATFORM and AT_SUN_EXECNAME aux entries. + * This can fill in the AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM + * aux entries. 
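A hypothetical b_native_exec callback matching the call site above (mybrand_native_exec is invented; ELFOSABI_SOLARIS is the standard ELF value 6). Note that the returned root carries no trailing slash, since elfexec() accounts for and appends the joining '/' itself:

#include <stdio.h>
#include <stdint.h>

typedef enum { B_FALSE, B_TRUE } boolean_t;	/* mock */

#define	ELFOSABI_SOLARIS	6	/* standard ELF OSABI value */

/*
 * Invented callback: treat Solaris-ABI binaries as native and have their
 * PT_INTERP resolved under /native.
 */
static boolean_t
mybrand_native_exec(uint8_t osabi, const char **nrootp)
{
	if (osabi != ELFOSABI_SOLARIS)
		return (B_FALSE);	/* remain branded */

	*nrootp = "/native";		/* no trailing slash */
	return (B_TRUE);
}

int
main(void)
{
	const char *nroot = NULL;

	if (mybrand_native_exec(ELFOSABI_SOLARIS, &nroot) == B_TRUE)
		printf("escape hatch taken, nroot=%s\n", nroot);
	return (0);
}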
*/ if ((error = exec_args(uap, args, idatap, (void **)&aux)) != 0) { if (error == -1) { @@ -528,6 +613,14 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, len, execsz, &brksize)) != 0) goto bad; + if (uphdr != NULL) { + /* + * Our uphdr has been dynamically allocated if (and only if) + * its program header flags are clear. + */ + dynuphdr = (uphdr->p_flags == 0); + } + if (uphdr != NULL && dyphdr == NULL) goto bad; @@ -542,17 +635,22 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, char *p; struct vnode *nvp; - dlnsize = dyphdr->p_filesz; + dlnsize = dyphdr->p_filesz + nsize; if (dlnsize > MAXPATHLEN || dlnsize <= 0) goto bad; + if (nsize != 0) { + bcopy(args->brand_nroot, dlnp, nsize - 1); + dlnp[nsize - 1] = '/'; + } + /* * Read in "interpreter" pathname. */ - if ((error = vn_rdwr(UIO_READ, vp, dlnp, dyphdr->p_filesz, - (offset_t)dyphdr->p_offset, UIO_SYSSPACE, 0, (rlim64_t)0, - CRED(), &resid)) != 0) { + if ((error = vn_rdwr(UIO_READ, vp, dlnp + nsize, + dyphdr->p_filesz, (offset_t)dyphdr->p_offset, UIO_SYSSPACE, + 0, (rlim64_t)0, CRED(), &resid)) != 0) { uprintf("%s: Cannot obtain interpreter pathname\n", exec_file); goto bad; @@ -688,9 +786,10 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, dtrphdr = NULL; - error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, &junk, &junk, + error = mapelfexec(nvp, ehdrp, nphdrs, phdrbase, NULL, &junk, &junk, &dtrphdr, NULL, NULL, NULL, &voffset, NULL, len, execsz, NULL); + if (error || junk != NULL) { VN_RELE(nvp); uprintf("%s: Cannot map %s\n", exec_file, dlnp); @@ -718,8 +817,8 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, if (hasauxv) { int auxf = AF_SUN_HWCAPVERIFY; /* - * Note: AT_SUN_PLATFORM and AT_SUN_EXECNAME were filled in via - * exec_args() + * Note: AT_SUN_PLATFORM, AT_SUN_EXECNAME and AT_RANDOM were + * filled in via exec_args() */ ADDAUX(aux, AT_BASE, voffset) ADDAUX(aux, AT_FLAGS, at_flags) @@ -747,7 +846,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, * malicious user within the zone from crafting a wrapper to * run native suid commands with unsecure libraries interposed. 
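The interpreter-prefixing logic just above, modeled as a stand-alone function with toy buffer handling: the brand root is copied ahead of the PT_INTERP string and joined with a '/'. Because PT_INTERP paths are absolute, the result contains a double slash, which is benign during path resolution:

#include <stdio.h>
#include <string.h>

#define	MAXPATHLEN	1024

static int
prefix_interp(const char *nroot, const char *interp, char *out)
{
	size_t nsize = (nroot != NULL) ? strlen(nroot) + 1 : 0;

	if (nsize + strlen(interp) + 1 > MAXPATHLEN)
		return (-1);			/* mirrors the dlnsize check */

	if (nsize != 0) {
		memcpy(out, nroot, nsize - 1);	/* bcopy(brand_nroot, ...) */
		out[nsize - 1] = '/';		/* the joining slash */
	}
	strcpy(out + nsize, interp);		/* vn_rdwr() into dlnp+nsize */
	return (0);
}

int
main(void)
{
	char buf[MAXPATHLEN];

	if (prefix_interp("/native", "/usr/lib/ld.so.1", buf) == 0)
		printf("%s\n", buf);	/* /native//usr/lib/ld.so.1 */
	return (0);
}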
*/ - if ((brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && + if ((*brand_action == EBA_NATIVE) && (PROC_IS_BRANDED(p) && (setid &= ~EXECSETID_SETID) != 0)) auxf &= ~AF_SUN_SETUGID; @@ -789,6 +888,7 @@ elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, ADDAUX(aux, AT_SUN_BRAND_AUX1, 0) ADDAUX(aux, AT_SUN_BRAND_AUX2, 0) ADDAUX(aux, AT_SUN_BRAND_AUX3, 0) + ADDAUX(aux, AT_SUN_BRAND_AUX4, 0) } ADDAUX(aux, AT_NULL, 0) @@ -896,6 +996,8 @@ bad: if (error == 0) error = ENOEXEC; out: + if (dynuphdr) + kmem_free(uphdr, sizeof (Phdr)); if (phdrbase != NULL) kmem_free(phdrbase, phdrsize); if (cap != NULL) @@ -1182,7 +1284,7 @@ mapelfexec( size_t *brksize) { Phdr *phdr; - int i, prot, error; + int i, prot, error, lastprot = 0; caddr_t addr = NULL; size_t zfodsz; int ptload = 0; @@ -1190,6 +1292,7 @@ mapelfexec( off_t offset; int hsize = ehdr->e_phentsize; caddr_t mintmp = (caddr_t)-1; + uintptr_t lastaddr = NULL; extern int use_brk_lpg; if (ehdr->e_type == ET_DYN) { @@ -1220,13 +1323,11 @@ mapelfexec( } else { *voffset = 0; } + phdr = (Phdr *)phdrbase; for (i = nphdrs; i > 0; i--) { switch (phdr->p_type) { case PT_LOAD: - if ((*dyphdr != NULL) && (*uphdr == NULL)) - return (0); - ptload = 1; prot = PROT_USER; if (phdr->p_flags & PF_R) @@ -1238,6 +1339,34 @@ mapelfexec( addr = (caddr_t)((uintptr_t)phdr->p_vaddr + *voffset); + if ((*dyphdr != NULL) && uphdr != NULL && + (*uphdr == NULL)) { + /* + * The PT_PHDR program header is, strictly + * speaking, optional. If we find that this + * is missing, we will determine the location + * of the program headers based on the address + * of the lowest PT_LOAD segment (namely, this + * one): we subtract the p_offset to get to + * the ELF header and then add back the program + * header offset to get to the program headers. + * We then cons up a Phdr that corresponds to + * the (missing) PT_PHDR, setting the flags + * to 0 to denote that this is artificial and + * should (must) be freed by the caller. + */ + Phdr *cons; + + cons = kmem_zalloc(sizeof (Phdr), KM_SLEEP); + + cons->p_flags = 0; + cons->p_type = PT_PHDR; + cons->p_vaddr = ((uintptr_t)addr - + phdr->p_offset) + ehdr->e_phoff; + + *uphdr = cons; + } + /* * Keep track of the segment with the lowest starting * address. @@ -1245,6 +1374,41 @@ mapelfexec( if (addr < mintmp) mintmp = addr; + /* + * Segments need not correspond to page boundaries: + * they are permitted to share a page. If two PT_LOAD + * segments share the same page, and the permissions + * of the segments differ, the behavior is historically + * that the permissions of the latter segment are used + * for the page that the two segments share. This is + * also historically a non-issue: binaries generated + * by most anything will make sure that two PT_LOAD + * segments with differing permissions don't actually + * share any pages. However, there exist some crazy + * things out there (including at least an obscure + * Portuguese teaching language called G-Portugol) that + * actually do the wrong thing and expect it to work: + * they have a segment with execute permission share + * a page with a subsequent segment that does not + * have execute permissions and expect the resulting + * shared page to in fact be executable. To accommodate + * such broken link editors, we take advantage of a + * latitude explicitly granted to the loader: it is + * permitted to make _any_ PT_LOAD segment executable + * (provided that it is readable or writable). 
If we + * see that we're sharing a page and that the previous + * page was executable, we will add execute permissions + * to our segment. + */ + if (btop(lastaddr) == btop((uintptr_t)addr) && + (phdr->p_flags & (PF_R | PF_W)) && + (lastprot & PROT_EXEC)) { + prot |= PROT_EXEC; + } + + lastaddr = (uintptr_t)addr + phdr->p_filesz; + lastprot = prot; + zfodsz = (size_t)phdr->p_memsz - phdr->p_filesz; offset = phdr->p_offset; @@ -1319,9 +1483,12 @@ mapelfexec( break; case PT_PHDR: - if (ptload) + if (ptload || phdr->p_flags == 0) goto bad; - *uphdr = phdr; + + if (uphdr != NULL) + *uphdr = phdr; + break; case PT_NULL: @@ -2170,7 +2337,7 @@ static struct modlexec modlexec = { extern int elf32exec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, cred_t *cred, - int brand_action); + int *brand_action); extern int elf32core(vnode_t *vp, proc_t *p, cred_t *credp, rlim64_t rlimit, int sig, core_content_t content); diff --git a/usr/src/uts/common/exec/intp/intp.c b/usr/src/uts/common/exec/intp/intp.c index 998c0f1f8e..316f4c07e0 100644 --- a/usr/src/uts/common/exec/intp/intp.c +++ b/usr/src/uts/common/exec/intp/intp.c @@ -22,6 +22,7 @@ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * Copyright 2012 Milan Jurik. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1988 AT&T */ @@ -53,7 +54,7 @@ #include <sys/modctl.h> extern int intpexec(struct vnode *, struct execa *, struct uarg *, - struct intpdata *, int, long *, int, caddr_t, struct cred *, int); + struct intpdata *, int, long *, int, caddr_t, struct cred *, int *); static struct execsw esw = { intpmagicstr, @@ -169,7 +170,7 @@ intpexec( int setid, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { _NOTE(ARGUNUSED(brand_action)) vnode_t *nvp; @@ -180,6 +181,7 @@ intpexec( char *opath; char devfd[19]; /* 32-bit int fits in 10 digits + 8 for "/dev/fd/" */ int fd = -1; + int custom_action = EBA_NONE; if (level) { /* Can't recurse */ error = ENOEXEC; @@ -228,7 +230,7 @@ intpexec( } error = gexec(&nvp, uap, args, &idata, ++level, execsz, exec_file, cred, - EBA_NONE); + &custom_action); if (!error) { /* diff --git a/usr/src/uts/common/exec/java/java.c b/usr/src/uts/common/exec/java/java.c index 7cf9126e8d..4ce223a682 100644 --- a/usr/src/uts/common/exec/java/java.c +++ b/usr/src/uts/common/exec/java/java.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ /* @@ -85,7 +86,7 @@ char *jexec_arg = "-jar"; static int javaexec(vnode_t *vp, struct execa *uap, struct uarg *args, struct intpdata *idatap, int level, long *execsz, int setid, - caddr_t execfile, cred_t *cred, int brand_action) + caddr_t execfile, cred_t *cred, int *brand_action) { struct intpdata idata; int error; diff --git a/usr/src/uts/common/exec/shbin/shbin.c b/usr/src/uts/common/exec/shbin/shbin.c index a6cf129d9d..a431264103 100644 --- a/usr/src/uts/common/exec/shbin/shbin.c +++ b/usr/src/uts/common/exec/shbin/shbin.c @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
*/ #include <sys/types.h> @@ -58,7 +59,7 @@ shbinexec( int setid, caddr_t exec_file, struct cred *cred, - int brand_action); + int *brand_action); #define SHBIN_CNTL(x) ((x)&037) #define SHBINMAGIC_LEN 4 @@ -162,7 +163,7 @@ shbinexec( int setid, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { _NOTE(ARGUNUSED(brand_action)) vnode_t *nvp; diff --git a/usr/src/uts/common/fs/bootfs/bootfs_construct.c b/usr/src/uts/common/fs/bootfs/bootfs_construct.c new file mode 100644 index 0000000000..b909b5d121 --- /dev/null +++ b/usr/src/uts/common/fs/bootfs/bootfs_construct.c @@ -0,0 +1,368 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ + +/* + * This file takes care of reading the boot time modules and constructing them + * into the appropriate series of vnodes. + */ + +#include <sys/conf.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/vfs.h> +#include <sys/sysmacros.h> +#include <sys/stat.h> + +#include <sys/fs/bootfs_impl.h> + +kmem_cache_t *bootfs_node_cache; + +static const vattr_t bootfs_vattr_dir = { + AT_ALL, /* va_mask */ + VDIR, /* va_type */ + S_IFDIR | 0555, /* va_mode */ + 0, /* va_uid */ + 0, /* va_gid */ + 0, /* va_fsid */ + 0, /* va_nodeid */ + 1, /* va_nlink */ + 0, /* va_size */ + 0, /* va_atime */ + 0, /* va_mtime */ + 0, /* va_ctime */ + 0, /* va_rdev */ + 0, /* va_blksize */ + 0, /* va_nblocks */ + 0 /* va_seq */ +}; + +static const vattr_t bootfs_vattr_reg = { + AT_ALL, /* va_mask */ + VREG, /* va_type */ + S_IFREG | 0555, /* va_mode */ + 0, /* va_uid */ + 0, /* va_gid */ + 0, /* va_fsid */ + 0, /* va_nodeid */ + 1, /* va_nlink */ + 0, /* va_size */ + 0, /* va_atime */ + 0, /* va_mtime */ + 0, /* va_ctime */ + 0, /* va_rdev */ + 0, /* va_blksize */ + 0, /* va_nblocks */ + 0 /* va_seq */ +}; + +/*ARGSUSED*/ +int +bootfs_node_constructor(void *buf, void *arg, int kmflags) +{ + bootfs_node_t *bnp = buf; + + bnp->bvn_vnp = vn_alloc(kmflags); + if (bnp->bvn_vnp == NULL) + return (-1); + + return (0); +} + +/*ARGSUSED*/ +void +bootfs_node_destructor(void *buf, void *arg) +{ + bootfs_node_t *bnp = buf; + + vn_free(bnp->bvn_vnp); +} + +static int +bootfs_comparator(const void *a, const void *b) +{ + const bootfs_node_t *lfs, *rfs; + int ret; + + lfs = a; + rfs = b; + + ret = strcmp(lfs->bvn_name, rfs->bvn_name); + if (ret > 0) + ret = 1; + if (ret < 0) + ret = -1; + return (ret); +} + +static void +bootfs_node_init(bootfs_t *bfs, bootfs_node_t *bnp, const struct vattr *vap, + const char *name, size_t namelen) +{ + timestruc_t now; + + vn_reinit(bnp->bvn_vnp); + + bnp->bvn_vnp->v_flag |= VNOSWAP; + bnp->bvn_vnp->v_type = vap->va_type; + bnp->bvn_vnp->v_vfsp = bfs->bfs_vfsp; + bnp->bvn_vnp->v_rdev = 0; + bnp->bvn_vnp->v_data = (caddr_t)bnp; + vn_setops(bnp->bvn_vnp, bootfs_vnodeops); + + bnp->bvn_name = kmem_alloc(namelen + 1, KM_SLEEP); + bcopy(name, bnp->bvn_name, namelen); + bnp->bvn_name[namelen] = '\0'; + if (vap->va_type == VDIR) { + avl_create(&bnp->bvn_dir, bootfs_comparator, + sizeof (bootfs_node_t), + offsetof(bootfs_node_t, bvn_link)); + } + bzero(&bnp->bvn_link, sizeof (avl_node_t)); + 
bcopy(vap, &bnp->bvn_attr, sizeof (vattr_t)); + + gethrestime(&now); + bnp->bvn_attr.va_atime = now; + bnp->bvn_attr.va_ctime = now; + bnp->bvn_attr.va_mtime = now; + bnp->bvn_attr.va_fsid = makedevice(bootfs_major, bfs->bfs_minor); + bnp->bvn_attr.va_nodeid = bfs->bfs_ninode; + bnp->bvn_attr.va_blksize = PAGESIZE; + bfs->bfs_ninode++; + list_insert_tail(&bfs->bfs_nodes, bnp); +} + +static void +bootfs_mkroot(bootfs_t *bfs) +{ + bootfs_node_t *bnp; + + bnp = kmem_cache_alloc(bootfs_node_cache, KM_SLEEP); + bootfs_node_init(bfs, bnp, &bootfs_vattr_dir, "/", 1); + bnp->bvn_vnp->v_flag |= VROOT; + bnp->bvn_parent = bnp; + bfs->bfs_rootvn = bnp; + bfs->bfs_stat.bfss_ndirs.value.ui32++; + vn_exists(bnp->bvn_vnp); +} + +static int +bootfs_mknode(bootfs_t *bfs, bootfs_node_t *parent, bootfs_node_t **outp, + const char *name, size_t namelen, const vattr_t *vap, uintptr_t addr, + uint64_t size) +{ + bootfs_node_t *bnp; + bootfs_node_t sn; + avl_index_t where; + char *buf; + + ASSERT(parent->bvn_attr.va_type == VDIR); + buf = kmem_alloc(namelen + 1, KM_SLEEP); + bcopy(name, buf, namelen); + buf[namelen] = '\0'; + sn.bvn_name = buf; + if ((bnp = avl_find(&parent->bvn_dir, &sn, &where)) != NULL) { + kmem_free(buf, namelen + 1); + /* Directories can collide, files cannot */ + if (vap->va_type == VDIR) { + *outp = bnp; + return (0); + } + return (EEXIST); + } + kmem_free(buf, namelen + 1); + + bnp = kmem_cache_alloc(bootfs_node_cache, KM_SLEEP); + bootfs_node_init(bfs, bnp, vap, name, namelen); + bnp->bvn_parent = parent; + avl_add(&parent->bvn_dir, bnp); + *outp = bnp; + + if (vap->va_type == VDIR) { + parent->bvn_attr.va_size++; + parent->bvn_attr.va_nlink++; + bfs->bfs_stat.bfss_ndirs.value.ui32++; + } else { + bnp->bvn_addr = addr; + bnp->bvn_size = size; + bfs->bfs_stat.bfss_nfiles.value.ui32++; + bfs->bfs_stat.bfss_nbytes.value.ui64 += size; + bnp->bvn_attr.va_nblocks = P2ROUNDUP(size, 512) >> 9; + bnp->bvn_attr.va_size = size; + } + + vn_exists(bnp->bvn_vnp); + + return (0); +} + +/* + * Given the address, size, and path a boot-time module would like, go through + * and create all of the directory entries that are required and then the file + * itself. If someone has passed in a module that has the same name as another + * one, we honor the first one. + */ +static int +bootfs_construct_entry(bootfs_t *bfs, uintptr_t addr, uint64_t size, + const char *mname) +{ + char *sp; + size_t nlen; + int ret; + bootfs_node_t *nbnp; + + const char *p = mname; + bootfs_node_t *bnp = bfs->bfs_rootvn; + + if (*p == '\0') + return (EINVAL); + + for (;;) { + /* First eliminate all leading / characters. */ + while (*p == '/') + p++; + + /* A name with all slashes or ending in a / */ + if (*p == '\0') + return (EINVAL); + + sp = strchr(p, '/'); + if (sp == NULL) + break; + nlen = (ptrdiff_t)sp - (ptrdiff_t)p; + if (strncmp(p, ".", nlen) == 0) { + p = sp + 1; + continue; + } + + if (strncmp(p, "..", nlen) == 0) { + bnp = bnp->bvn_parent; + p = sp + 1; + continue; + } + + VERIFY(bootfs_mknode(bfs, bnp, &nbnp, p, nlen, + &bootfs_vattr_dir, addr, size) == 0); + p = sp + 1; + bnp = nbnp; + } + + nlen = strlen(p); + ret = bootfs_mknode(bfs, bnp, &nbnp, p, nlen, &bootfs_vattr_reg, + addr, size); + if (ret != 0) + return (ret); + + return (0); +} + +/* + * We're going to go through every boot time module and construct the + * appropriate vnodes for them now. 
Because there are very few of these that + * exist, generally on the order of a handful, we're going to create them all + * when the file system is initialized and then tear them all down when the + * module gets unloaded. + * + * The information about the modules is contained in properties on the root of + * the devinfo tree. Specifically there are three properties per module: + * + * - module-size-%d int64_t size, in bytes, of the boot time module. + * - module-addr-%d The address of the boot time module + * - module-name-%d The string name of the boot time module + * + * Note that the module-size and module-addr fields are always 64-bit values + * regardless of being on a 32-bit or 64-bit kernel. module-name is a string + * property. + * + * There is no property that indicates the total number of such modules. Modules + * start at 0 and work their way up incrementally. The first time we can't find + * a module or a property, then we stop. + */ +void +bootfs_construct(bootfs_t *bfs) +{ + uint_t id = 0, ndata; + char paddr[64], psize[64], pname[64], *mname; + dev_info_t *root; + uchar_t *datap; + uint64_t size = 0, addr = 0; + int ret; + + bootfs_mkroot(bfs); + root = ddi_root_node(); + + for (;;) { + if (id == UINT32_MAX) + break; + + if (snprintf(paddr, sizeof (paddr), "module-addr-%d", id) > + sizeof (paddr)) + break; + + if (snprintf(psize, sizeof (paddr), "module-size-%d", id) > + sizeof (paddr)) + break; + + if (snprintf(pname, sizeof (paddr), "module-name-%d", id) > + sizeof (paddr)) + break; + + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, root, + DDI_PROP_DONTPASS, paddr, &datap, &ndata) != + DDI_PROP_SUCCESS) + break; + + if (ndata == 8) + bcopy(datap, &addr, sizeof (uint64_t)); + ddi_prop_free(datap); + if (ndata != 8) + break; + + if (ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, root, + DDI_PROP_DONTPASS, psize, &datap, &ndata) != + DDI_PROP_SUCCESS) + break; + if (ndata == 8) + bcopy(datap, &size, sizeof (uint64_t)); + ddi_prop_free(datap); + if (ndata != 8) + break; + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, root, + DDI_PROP_DONTPASS, pname, &mname) != DDI_PROP_SUCCESS) + break; + + ret = bootfs_construct_entry(bfs, addr, size, mname); + if (ret == EINVAL) + bfs->bfs_stat.bfss_ndiscards.value.ui32++; + if (ret == EEXIST) + bfs->bfs_stat.bfss_ndups.value.ui32++; + ddi_prop_free(mname); + + id++; + } +} + +void +bootfs_destruct(bootfs_t *bfs) +{ + bootfs_node_t *bnp; + + while ((bnp = list_remove_head(&bfs->bfs_nodes)) != NULL) { + ASSERT(bnp->bvn_vnp->v_count == 1); + VN_RELE(bnp->bvn_vnp); + kmem_free(bnp->bvn_name, strlen(bnp->bvn_name) + 1); + kmem_cache_free(bootfs_node_cache, bnp); + } +} diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c new file mode 100644 index 0000000000..e642e86169 --- /dev/null +++ b/usr/src/uts/common/fs/bootfs/bootfs_vfsops.c @@ -0,0 +1,321 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. 
+ */ + +#include <sys/errno.h> +#include <sys/modctl.h> +#include <sys/types.h> +#include <sys/mkdev.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/systm.h> +#include <sys/id_space.h> +#include <sys/cmn_err.h> +#include <sys/ksynch.h> +#include <sys/policy.h> +#include <sys/mount.h> +#include <sys/sysmacros.h> + +#include <sys/fs/bootfs_impl.h> + +/* + * While booting, additional types of modules and files can be passed in to the + * loader. These include the familiar boot archive, as well as a module hash + * and additional modules that are interpreted as files. As part of the handoff + * in early boot, information about these modules is saved as properties on the + * root of the devinfo tree, similar to other boot-time properties. + * + * This file system provides a read-only view of those additional files. Due to + * its limited scope, it has a slightly simpler construction than several other + * file systems. When mounted, it looks for the corresponding properties and + * creates bootfs_node_t's and vnodes for all of the files and + * directories that exist along the way. At this time, only a + * rather small number of files are passed in this way. + * + * This does lead to one behavior that folks used to other file systems might + * find peculiar. Because we do not create and destroy the + * required vnodes on demand, the count on the root vnode does not go up + * with the existence of other vnodes. This means that a bootfs file + * system that is not in use will have all of its vnodes exist with a v_count of + * one. + */ + +major_t bootfs_major; +static int bootfs_fstype; +static id_space_t *bootfs_idspace; +static uint64_t bootfs_nactive; +static kmutex_t bootfs_lock; + +static const char *bootfs_name = "bootfs"; + +static int +bootfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + int ret; + bootfs_t *bfs; + struct pathname dpn; + dev_t fsdev; + + if ((ret = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (ret); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * We indicate that the backing store is bootfs. We don't want to use + * swap, because folks might think that this is putting all the data + * into memory a la tmpfs. Rather these modules are always in memory and + * there's nothing to be done about that. + */ + vfs_setresource(vfsp, bootfs_name, 0); + bfs = kmem_zalloc(sizeof (bootfs_t), KM_NOSLEEP | KM_NORMALPRI); + if (bfs == NULL) + return (ENOMEM); + + ret = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ?
UIO_SYSSPACE : UIO_USERSPACE, &dpn); + if (ret != 0) { + kmem_free(bfs, sizeof (bootfs_t)); + return (ret); + } + + bfs->bfs_minor = id_alloc(bootfs_idspace); + bfs->bfs_kstat = kstat_create_zone("bootfs", bfs->bfs_minor, "bootfs", + "fs", KSTAT_TYPE_NAMED, + sizeof (bootfs_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); + if (bfs->bfs_kstat == NULL) { + id_free(bootfs_idspace, bfs->bfs_minor); + pn_free(&dpn); + kmem_free(bfs, sizeof (bootfs_t)); + return (ENOMEM); + } + bfs->bfs_kstat->ks_data = &bfs->bfs_stat; + + fsdev = makedevice(bootfs_major, bfs->bfs_minor); + bfs->bfs_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)bfs; + vfsp->vfs_fstype = bootfs_fstype; + vfsp->vfs_dev = fsdev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_RDONLY | VFS_NOSETUID | VFS_NOTRUNC | + VFS_UNLINKABLE; + vfs_make_fsid(&vfsp->vfs_fsid, fsdev, bootfs_fstype); + bfs->bfs_mntpath = kmem_alloc(dpn.pn_pathlen + 1, KM_SLEEP); + bcopy(dpn.pn_path, bfs->bfs_mntpath, dpn.pn_pathlen); + bfs->bfs_mntpath[dpn.pn_pathlen] = '\0'; + pn_free(&dpn); + list_create(&bfs->bfs_nodes, sizeof (bootfs_node_t), + offsetof(bootfs_node_t, bvn_alink)); + + kstat_named_init(&bfs->bfs_stat.bfss_nfiles, "nfiles", + KSTAT_DATA_UINT32); + kstat_named_init(&bfs->bfs_stat.bfss_ndirs, "ndirs", + KSTAT_DATA_UINT32); + kstat_named_init(&bfs->bfs_stat.bfss_nbytes, "nbytes", + KSTAT_DATA_UINT64); + kstat_named_init(&bfs->bfs_stat.bfss_ndups, "ndup", + KSTAT_DATA_UINT32); + kstat_named_init(&bfs->bfs_stat.bfss_ndiscards, "ndiscard", + KSTAT_DATA_UINT32); + + bootfs_construct(bfs); + + kstat_install(bfs->bfs_kstat); + + return (0); +} + +static int +bootfs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + int ret; + bootfs_t *bfs = vfsp->vfs_data; + bootfs_node_t *bnp; + + if ((ret = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (ret); + + if (flag & MS_FORCE) + return (ENOTSUP); + + for (bnp = list_head(&bfs->bfs_nodes); bnp != NULL; + bnp = list_next(&bfs->bfs_nodes, bnp)) { + mutex_enter(&bnp->bvn_vnp->v_lock); + if (bnp->bvn_vnp->v_count > 1) { + mutex_exit(&bnp->bvn_vnp->v_lock); + return (EBUSY); + } + mutex_exit(&bnp->bvn_vnp->v_lock); + } + + kstat_delete(bfs->bfs_kstat); + bootfs_destruct(bfs); + list_destroy(&bfs->bfs_nodes); + kmem_free(bfs->bfs_mntpath, strlen(bfs->bfs_mntpath) + 1); + id_free(bootfs_idspace, bfs->bfs_minor); + kmem_free(bfs, sizeof (bootfs_t)); + return (0); +} + +static int +bootfs_root(vfs_t *vfsp, vnode_t **vpp) +{ + bootfs_t *bfs; + + bfs = (bootfs_t *)vfsp->vfs_data; + *vpp = bfs->bfs_rootvn->bvn_vnp; + VN_HOLD(*vpp); + + return (0); +} + +static int +bootfs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + const bootfs_t *bfs = (bootfs_t *)vfsp->vfs_data; + dev32_t d32; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + sbp->f_blocks = bfs->bfs_stat.bfss_nbytes.value.ui64 >> PAGESHIFT; + sbp->f_bfree = 0; + sbp->f_bavail = 0; + + sbp->f_files = bfs->bfs_stat.bfss_nfiles.value.ui32 + + bfs->bfs_stat.bfss_ndirs.value.ui32; + sbp->f_ffree = 0; + sbp->f_favail = 0; + + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strlcpy(sbp->f_basetype, bootfs_name, FSTYPSZ); + bzero(sbp->f_fstr, sizeof (sbp->f_fstr)); + + return (0); +} + +static const fs_operation_def_t bootfs_vfsops_tmpl[] = { + VFSNAME_MOUNT, { .vfs_mount = bootfs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = bootfs_unmount }, + VFSNAME_ROOT, { .vfs_root = bootfs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = bootfs_statvfs }, + NULL, NULL +}; + +static int +bootfs_init(int fstype, char *name) +{ + int ret; + + bootfs_fstype =
fstype; + ASSERT(bootfs_fstype != 0); + + ret = vfs_setfsops(fstype, bootfs_vfsops_tmpl, NULL); + if (ret != 0) + return (ret); + + ret = vn_make_ops(name, bootfs_vnodeops_template, &bootfs_vnodeops); + if (ret != 0) { + (void) vfs_freevfsops_by_type(bootfs_fstype); + return (ret); + } + + bootfs_major = getudev(); + if (bootfs_major == (major_t)-1) { + cmn_err(CE_WARN, "bootfs_init: Can't get unique device number"); + bootfs_major = 0; + } + + bootfs_nactive = 0; + return (0); +} + +static mntopts_t bootfs_mntopts = { + 0, NULL +}; + +static vfsdef_t bootfs_vfsdef = { + VFSDEF_VERSION, + "bootfs", + bootfs_init, + VSW_HASPROTO|VSW_STATS, + &bootfs_mntopts +}; + +static struct modlfs bootfs_modlfs = { + &mod_fsops, "boot-time modules file system", &bootfs_vfsdef +}; + +static struct modlinkage bootfs_modlinkage = { + MODREV_1, &bootfs_modlfs, NULL +}; + +int +_init(void) +{ + bootfs_node_cache = kmem_cache_create("bootfs_node_cache", + sizeof (bootfs_node_t), 0, bootfs_node_constructor, + bootfs_node_destructor, NULL, NULL, NULL, 0); + bootfs_idspace = id_space_create("bootfs_minors", 1, INT32_MAX); + mutex_init(&bootfs_lock, NULL, MUTEX_DEFAULT, NULL); + + return (mod_install(&bootfs_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&bootfs_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + mutex_enter(&bootfs_lock); + if (bootfs_nactive > 0) { + mutex_exit(&bootfs_lock); + return (EBUSY); + } + mutex_exit(&bootfs_lock); + + err = mod_remove(&bootfs_modlinkage); + if (err != 0) + return (err); + + (void) vfs_freevfsops_by_type(bootfs_fstype); + vn_freevnodeops(bootfs_vnodeops); + id_space_destroy(bootfs_idspace); + mutex_destroy(&bootfs_lock); + kmem_cache_destroy(bootfs_node_cache); + return (err); +} diff --git a/usr/src/uts/common/fs/bootfs/bootfs_vnops.c b/usr/src/uts/common/fs/bootfs/bootfs_vnops.c new file mode 100644 index 0000000000..f63d0a4f24 --- /dev/null +++ b/usr/src/uts/common/fs/bootfs/bootfs_vnops.c @@ -0,0 +1,544 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. 
+ */ + +/* + * bootfs vnode operations + */ + +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mman.h> +#include <fs/fs_subr.h> +#include <sys/policy.h> +#include <sys/sysmacros.h> +#include <sys/dirent.h> +#include <sys/uio.h> +#include <vm/pvn.h> +#include <vm/hat.h> +#include <vm/seg_map.h> +#include <vm/seg_vn.h> +#include <sys/vmsystm.h> + +#include <sys/fs/bootfs_impl.h> + +struct vnodeops *bootfs_vnodeops; + +/*ARGSUSED*/ +static int +bootfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + int err; + ssize_t sres = uiop->uio_resid; + bootfs_node_t *bnp = vp->v_data; + + if (vp->v_type == VDIR) + return (EISDIR); + + if (vp->v_type != VREG) + return (EINVAL); + + if (uiop->uio_loffset < 0) + return (EINVAL); + + if (uiop->uio_loffset >= bnp->bvn_size) + return (0); + + err = 0; + while (uiop->uio_resid != 0) { + caddr_t base; + long offset, frem; + ulong_t poff, segoff; + size_t bytes; + int relerr; + + offset = uiop->uio_loffset; + poff = offset & PAGEOFFSET; + bytes = MIN(PAGESIZE - poff, uiop->uio_resid); + + frem = bnp->bvn_size - offset; + if (frem <= 0) { + err = 0; + break; + } + + /* Don't read past EOF */ + bytes = MIN(bytes, frem); + + /* + * Segmaps are likely larger than our page size, so make sure we + * have the proper offset into the resulting segmap data. + */ + segoff = (offset & PAGEMASK) & MAXBOFFSET; + + base = segmap_getmapflt(segkmap, vp, offset & MAXBMASK, bytes, + 1, S_READ); + + err = uiomove(base + segoff + poff, bytes, UIO_READ, uiop); + relerr = segmap_release(segkmap, base, 0); + + if (err == 0) + err = relerr; + + if (err != 0) + break; + } + + /* Even if we had an error in a partial read, return success */ + if (uiop->uio_resid < sres) + err = 0; + + gethrestime(&bnp->bvn_attr.va_atime); + + return (err); +} + +/*ARGSUSED*/ +static int +bootfs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +bootfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + uint32_t mask; + bootfs_node_t *bpn = (bootfs_node_t *)vp->v_data; + + mask = vap->va_mask; + bcopy(&bpn->bvn_attr, vap, sizeof (vattr_t)); + vap->va_mask = mask; + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + int shift = 0; + bootfs_node_t *bpn = (bootfs_node_t *)vp->v_data; + + if (crgetuid(cr) != bpn->bvn_attr.va_uid) { + shift += 3; + if (groupmember(bpn->bvn_attr.va_gid, cr) == 0) + shift += 3; + } + + return (secpolicy_vnode_access2(cr, vp, bpn->bvn_attr.va_uid, + bpn->bvn_attr.va_mode << shift, mode)); +} + +/*ARGSUSED*/ +static int +bootfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + avl_index_t where; + bootfs_node_t sn, *bnp; + bootfs_node_t *bpp = (bootfs_node_t *)dvp->v_data; + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + if (bpp->bvn_attr.va_type != VDIR) + return (ENOTDIR); + + if (*nm == '\0' || strcmp(nm, ".") == 0) {
+ VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + + if (strcmp(nm, "..") == 0) { + VN_HOLD(bpp->bvn_parent->bvn_vnp); + *vpp = bpp->bvn_parent->bvn_vnp; + return (0); + } + + sn.bvn_name = nm; + bnp = avl_find(&bpp->bvn_dir, &sn, &where); + if (bnp == NULL) + return (ENOENT); + + VN_HOLD(bnp->bvn_vnp); + *vpp = bnp->bvn_vnp; + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + bootfs_node_t *bnp = (bootfs_node_t *)vp->v_data; + dirent64_t *dp; + void *buf; + ulong_t bsize, brem; + offset_t coff, roff; + int dlen, ret; + bootfs_node_t *dnp; + boolean_t first = B_TRUE; + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp != NULL) + *eofp = 1; + return (0); + } + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (!(uiop->uio_iov->iov_len > 0)) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + roff = uiop->uio_loffset; + coff = 0; + brem = bsize = uiop->uio_iov->iov_len; + buf = kmem_alloc(bsize, KM_SLEEP); + dp = buf; + + /* + * Recall that offsets here are done based on the name of the dirent + * excluding the null terminator. Therefore `.` is always at 0, `..` is + * always at 1, and then the first real dirent is at 3. This offset is + * what's actually stored when we update the offset in the structure. + */ + if (roff == 0) { + dlen = DIRENT64_RECLEN(1); + if (first == B_TRUE) { + if (dlen > brem) { + kmem_free(buf, bsize); + return (EINVAL); + } + first = B_FALSE; + } + dp->d_ino = (ino64_t)bnp->bvn_attr.va_nodeid; + dp->d_off = 0; + dp->d_reclen = (ushort_t)dlen; + (void) strncpy(dp->d_name, ".", DIRENT64_NAMELEN(dlen)); + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + brem -= dlen; + } + + if (roff <= 1) { + dlen = DIRENT64_RECLEN(2); + if (first == B_TRUE) { + if (dlen > brem) { + kmem_free(buf, bsize); + return (EINVAL); + } + first = B_FALSE; + } + dp->d_ino = (ino64_t)bnp->bvn_parent->bvn_attr.va_nodeid; + dp->d_off = 1; + dp->d_reclen = (ushort_t)dlen; + (void) strncpy(dp->d_name, "..", DIRENT64_NAMELEN(dlen)); + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + brem -= dlen; + } + + coff = 3; + for (dnp = avl_first(&bnp->bvn_dir); dnp != NULL; + dnp = AVL_NEXT(&bnp->bvn_dir, dnp)) { + size_t nlen = strlen(dnp->bvn_name); + + if (roff > coff) { + coff += nlen; + continue; + } + + dlen = DIRENT64_RECLEN(nlen); + if (dlen > brem) { + if (first == B_TRUE) { + kmem_free(buf, bsize); + return (EINVAL); + } + break; + } + first = B_FALSE; + + dp->d_ino = (ino64_t)dnp->bvn_attr.va_nodeid; + dp->d_off = coff; + dp->d_reclen = (ushort_t)dlen; + (void) strncpy(dp->d_name, dnp->bvn_name, + DIRENT64_NAMELEN(dlen)); + dp = (struct dirent64 *)((uintptr_t)dp + dp->d_reclen); + brem -= dlen; + coff += nlen; + } + + ret = uiomove(buf, (bsize - brem), UIO_READ, uiop); + + if (ret == 0) { + if (dnp == NULL) { + coff++; + if (eofp != NULL) + *eofp = 1; + } else if (eofp != NULL) { + *eofp = 0; + } + uiop->uio_loffset = coff; + } + gethrestime(&bnp->bvn_attr.va_atime); + kmem_free(buf, bsize); + return (ret); +} + +/*ARGSUSED*/ +static void +bootfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ +} + +/*ARGSUSED*/ +static int +bootfs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + if (write_lock != 0) + return (EINVAL); + return (0); +} + +/*ARGSUSED*/ +static void +bootfs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ +} + +/*ARGSUSED*/ +static int +bootfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + 
caller_context_t *ct) +{ + bootfs_node_t *bnp = (bootfs_node_t *)vp->v_data; + if (vp->v_type == VDIR) + return (0); + return ((*noffp < 0 || *noffp > bnp->bvn_size ? EINVAL : 0)); +} + +/* + * We need to fill in a single page of a vnode's memory based on the actual data + * from the kernel. We'll use this node's sliding window into physical memory + * and update one page at a time. + */ +/*ARGSUSED*/ +static int +bootfs_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr) +{ + bootfs_node_t *bnp = vp->v_data; + page_t *pp, *fpp; + pfn_t pfn; + + for (;;) { + /* Easy case where the page exists */ + pp = page_lookup(vp, off, rw == S_CREATE ? SE_EXCL : SE_SHARED); + if (pp != NULL) { + if (pl != NULL) { + pl[0] = pp; + pl[1] = NULL; + } else { + page_unlock(pp); + } + return (0); + } + + pp = page_create_va(vp, off, PAGESIZE, PG_EXCL | PG_WAIT, seg, + addr); + + /* + * If we didn't get the page, that means someone else beat us to + * creating this so we need to try again. + */ + if (pp != NULL) + break; + } + + pfn = btop((bnp->bvn_addr + off) & PAGEMASK); + fpp = page_numtopp_nolock(pfn); + + if (ppcopy(fpp, pp) == 0) { + pvn_read_done(pp, B_ERROR); + return (EIO); + } + + if (pl != NULL) { + pvn_plist_init(pp, pl, plsz, off, PAGESIZE, rw); + } else { + pvn_io_done(pp); + } + + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + int err; + bootfs_node_t *bnp = vp->v_data; + + if (off + len > bnp->bvn_size + PAGEOFFSET) + return (EFAULT); + + if (len <= PAGESIZE) + err = bootfs_getapage(vp, (u_offset_t)off, len, protp, pl, + plsz, seg, addr, rw, cr); + else + err = pvn_getpages(bootfs_getapage, vp, (u_offset_t)off, len, + protp, pl, plsz, seg, addr, rw, cr); + + return (err); +} + +/*ARGSUSED*/ +static int +bootfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + int ret; + segvn_crargs_t vn_a; + +#ifdef _ILP32 + if (len > MAXOFF_T) + return (ENOMEM); +#endif + + if (vp->v_flag & VNOMAP) + return (ENOSYS); + + if (off < 0 || off > MAXOFFSET_T - off) + return (ENXIO); + + if (vp->v_type != VREG) + return (ENODEV); + + if (prot & PROT_WRITE) + return (ENOTSUP); + + as_rangelock(as); + ret = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); + if (ret != 0) { + as_rangeunlock(as); + return (ret); + } + + vn_a.vp = vp; + vn_a.offset = (u_offset_t)off; + vn_a.type = flags & MAP_TYPE; + vn_a.prot = prot; + vn_a.maxprot = maxprot; + vn_a.cred = cr; + vn_a.amp = NULL; + vn_a.flags = flags & ~MAP_TYPE; + vn_a.szc = 0; + vn_a.lgrp_mem_policy_flags = 0; + + ret = as_map(as, *addrp, len, segvn_create, &vn_a); + + as_rangeunlock(as); + return (ret); + +} + +/*ARGSUSED*/ +static int +bootfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + +/*ARGSUSED*/ +static int +bootfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + return (0); +} + +static int +bootfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int ret; + + switch (cmd) { + case 
_PC_TIMESTAMP_RESOLUTION: + *valp = 1L; + ret = 0; + break; + default: + ret = fs_pathconf(vp, cmd, valp, cr, ct); + } + + return (ret); +} + +const fs_operation_def_t bootfs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = bootfs_open }, + VOPNAME_CLOSE, { .vop_close = bootfs_close }, + VOPNAME_READ, { .vop_read = bootfs_read }, + VOPNAME_IOCTL, { .vop_ioctl = bootfs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = bootfs_getattr }, + VOPNAME_ACCESS, { .vop_access = bootfs_access }, + VOPNAME_LOOKUP, { .vop_lookup = bootfs_lookup }, + VOPNAME_READDIR, { .vop_readdir = bootfs_readdir }, + VOPNAME_INACTIVE, { .vop_inactive = bootfs_inactive }, + VOPNAME_RWLOCK, { .vop_rwlock = bootfs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = bootfs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = bootfs_seek }, + VOPNAME_GETPAGE, { .vop_getpage = bootfs_getpage }, + VOPNAME_MAP, { .vop_map = bootfs_map }, + VOPNAME_ADDMAP, { .vop_addmap = bootfs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = bootfs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = bootfs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_nosupport }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/dev/sdev_netops.c b/usr/src/uts/common/fs/dev/sdev_netops.c index 4eaf38f484..41441ec52d 100644 --- a/usr/src/uts/common/fs/dev/sdev_netops.c +++ b/usr/src/uts/common/fs/dev/sdev_netops.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -41,8 +42,102 @@ #include <sys/zone.h> #include <sys/dls.h> +static const char *devnet_zpath = "/dev/net/zone/"; struct vnodeops *devnet_vnodeops; +static zoneid_t +devnet_nodetozone(sdev_node_t *dv) +{ + char *zname = NULL, *dup; + zone_t *zone; + int duplen; + zoneid_t zid; + + /* + * If in a non-global zone, always return its zid no matter what the + * node is. + */ + zid = getzoneid(); + if (zid != GLOBAL_ZONEID) + return (zid); + + /* + * If it doesn't have /dev/net/zone/ then it can't be a specific zone + * we're targeting. + */ + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) != 0) + return (GLOBAL_ZONEID); + + if (dv->sdev_vnode->v_type == VDIR) { + zone = zone_find_by_name(dv->sdev_name); + } else { + /* Non-directories have the form /dev/net/zone/%z/%s */ + dup = strdup(dv->sdev_path); + duplen = strlen(dup); + zname = strrchr(dup, '/'); + *zname = '\0'; + zname--; + zname = strrchr(dup, '/'); + zname++; + zone = zone_find_by_name(zname); + kmem_free(dup, duplen + 1); + } + if (zone == NULL) + return (GLOBAL_ZONEID); + zid = zone->zone_id; + zone_rele(zone); + return (zid); +} + +static int +devnet_mkdir(struct sdev_node *ddv, char *name) +{ + sdev_node_t *dv; + struct vattr va; + int ret; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + dv = sdev_cache_lookup(ddv, name); + if (dv != NULL) { + SDEV_SIMPLE_RELE(dv); + return (EEXIST); + } + + va = *sdev_getdefault_attr(VDIR); + gethrestime(&va.va_atime); + va.va_mtime = va.va_atime; + va.va_ctime = va.va_atime; + + ret = sdev_mknode(ddv, name, &dv, &va, NULL, NULL, kcred, SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(dv); + return (0); +} + +/* + * We basically need to walk down the directory path to determine what we + * should do. At the top level of /dev/net, only the directory /dev/net/zone is + * valid, and it is always valid. Following on that, /dev/net/zone/%zonename is + * valid if and only if we can look up that zone name.
If it's not, or it's some other + * name, then it's SDEV_VTOR_INVALID. + */ +static int +devnet_dirvalidate(struct sdev_node *dv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, dv->sdev_path) == 0) + return (SDEV_VTOR_VALID); + + zonep = zone_find_by_name(dv->sdev_name); + if (zonep == NULL) + return (SDEV_VTOR_INVALID); + zone_rele(zonep); + return (SDEV_VTOR_VALID); +} + /* * Check if a net sdev_node is still valid - i.e. it represents a current * network link. @@ -60,11 +155,20 @@ devnet_validate(struct sdev_node *dv) ASSERT(dv->sdev_state == SDEV_READY); - if (dls_mgmt_get_linkid(dv->sdev_name, &linkid) != 0) + if (dv->sdev_vnode->v_type == VDIR) + return (devnet_dirvalidate(dv)); + + if (strncmp(devnet_zpath, dv->sdev_path, strlen(devnet_zpath)) == 0) { + ASSERT(SDEV_IS_GLOBAL(dv)); + zoneid = devnet_nodetozone(dv); + } else { + zoneid = getzoneid(); + } + + if (dls_mgmt_get_linkid_in_zone(dv->sdev_name, &linkid, zoneid) != 0) return (SDEV_VTOR_INVALID); - if (SDEV_IS_GLOBAL(dv)) + if (zoneid == GLOBAL_ZONEID) return (SDEV_VTOR_VALID); - zoneid = getzoneid(); return (zone_check_datalink(&zoneid, linkid) == 0 ? SDEV_VTOR_VALID : SDEV_VTOR_INVALID); } @@ -74,13 +178,14 @@ devnet_validate(struct sdev_node *dv) * a net entry when the node is not found in the cache. */ static int -devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp) +devnet_create_rvp(const char *nm, struct vattr *vap, dls_dl_handle_t *ddhp, + zoneid_t zid) { timestruc_t now; dev_t dev; int error; - if ((error = dls_devnet_open(nm, ddhp, &dev)) != 0) { + if ((error = dls_devnet_open_in_zone(nm, ddhp, &dev, zid)) != 0) { sdcmn_err12(("devnet_create_rvp: not a valid vanity name " "network node: %s\n", nm)); return (error); @@ -116,6 +221,7 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, struct sdev_node *ddv = VTOSDEV(dvp); struct sdev_node *dv = NULL; dls_dl_handle_t ddh = NULL; + zone_t *zone; struct vattr vattr; int nmlen; int error = ENOENT; @@ -123,6 +229,9 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, if (SDEVTOV(ddv)->v_type != VDIR) return (ENOTDIR); + if (!SDEV_IS_GLOBAL(ddv) && crgetzoneid(cred) == GLOBAL_ZONEID) + return (EPERM); + /* * Empty name or ., return node itself. */ @@ -145,6 +254,12 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, rw_enter(&ddv->sdev_contents, RW_WRITER); /* + * ZOMBIED parent does not allow new node creation, bail out early. + */ + if (ddv->sdev_state == SDEV_ZOMBIE) + goto failed; + + /* * directory cache lookup: */ if ((dv = sdev_cache_lookup(ddv, nm)) != NULL) { @@ -153,13 +268,42 @@ devnet_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, goto found; } + if (SDEV_IS_GLOBAL(ddv)) { + /* + * Check for /dev/net/zone + */ + if (strcmp("zone", nm) == 0 && strcmp("/dev/net", + ddv->sdev_path) == 0) { + (void) devnet_mkdir(ddv, nm); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + + /* + * Check for /dev/net/zone/%z. We can't use devnet_zpath due to + * its trailing slash. + */ + if (strcmp("/dev/net/zone", ddv->sdev_path) == 0) { + zone = zone_find_by_name(nm); + if (zone == NULL) + goto failed; + (void) devnet_mkdir(ddv, nm); + zone_rele(zone); + dv = sdev_cache_lookup(ddv, nm); + ASSERT(dv != NULL); + goto found; + } + } else if (strcmp("/dev/net", ddv->sdev_path) != 0) { + goto failed; + } + /* - * ZOMBIED parent does not allow new node creation, bail out early. + * We didn't find what we were looking for. 
What that is depends a lot + * on what directory we're in. */ - if (ddv->sdev_state == SDEV_ZOMBIE) - goto failed; - error = devnet_create_rvp(nm, &vattr, &ddh); + error = devnet_create_rvp(nm, &vattr, &ddh, devnet_nodetozone(ddv)); if (error != 0) goto failed; @@ -219,7 +363,7 @@ devnet_filldir_datalink(datalink_id_t linkid, void *arg) if ((dv = sdev_cache_lookup(ddv, (char *)link)) != NULL) goto found; - if (devnet_create_rvp(link, &vattr, &ddh) != 0) + if (devnet_create_rvp(link, &vattr, &ddh, devnet_nodetozone(arg)) != 0) return (0); ASSERT(ddh != NULL); @@ -244,16 +388,77 @@ found: return (0); } +/* + * Fill in all the entries for the current zone. + */ static void -devnet_filldir(struct sdev_node *ddv) +devnet_fillzone(struct sdev_node *ddv, zoneid_t zid) { - sdev_node_t *dv, *next; datalink_id_t linkid; + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + if (zid == GLOBAL_ZONEID) { + ASSERT(SDEV_IS_GLOBAL(ddv)); + linkid = DATALINK_INVALID_LINKID; + do { + linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, + DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); + if (linkid != DATALINK_INVALID_LINKID) + (void) devnet_filldir_datalink(linkid, ddv); + } while (linkid != DATALINK_INVALID_LINKID); + } else { + (void) zone_datalink_walk(zid, devnet_filldir_datalink, ddv); + } +} + +/* + * Callback for zone_walk when filling up /dev/net/zone/... + */ +static int +devnet_fillzdir_cb(zone_t *zonep, void *arg) +{ + sdev_node_t *ddv = arg; + + ASSERT(RW_WRITE_HELD(&ddv->sdev_contents)); + (void) devnet_mkdir(ddv, zonep->zone_name); + return (0); +} + +/* + * Fill in a directory that isn't the top level /dev/net. + */ +static void +devnet_fillzdir(struct sdev_node *ddv) +{ + zone_t *zonep; + char *path = "/dev/net/zone"; + + if (strcmp(path, ddv->sdev_path) == 0) { + (void) zone_walk(devnet_fillzdir_cb, ddv); + return; + } + + zonep = zone_find_by_name(ddv->sdev_name); + if (zonep == NULL) + return; + devnet_fillzone(ddv, zonep->zone_id); + zone_rele(zonep); +} + +static void +devnet_filldir(struct sdev_node *ddv) +{ + int ret; + sdev_node_t *dv, *next; + ASSERT(RW_READ_HELD(&ddv->sdev_contents)); if (rw_tryupgrade(&ddv->sdev_contents) == NULL) { rw_exit(&ddv->sdev_contents); rw_enter(&ddv->sdev_contents, RW_WRITER); + if (ddv->sdev_state == SDEV_ZOMBIE) { + rw_exit(&ddv->sdev_contents); + return; + } } for (dv = SDEV_FIRST_ENTRY(ddv); dv; dv = next) { @@ -276,31 +481,38 @@ devnet_filldir(struct sdev_node *ddv) if (SDEVTOV(dv)->v_count > 0) continue; + SDEV_HOLD(dv); + + /* + * Clean out everything underneath before we remove ourselves. 
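+	 * sdev_cache_update() below removes only this node itself, so a
+	 * stale directory's cached children have to be cleaned out first.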
+ */ + if (SDEVTOV(ddv)->v_type == VDIR) { + ret = sdev_cleandir(dv, NULL, 0); + ASSERT(ret == 0); + } /* remove the cache node */ (void) sdev_cache_update(ddv, &dv, dv->sdev_name, SDEV_CACHE_DELETE); SDEV_RELE(dv); } + if (strcmp(ddv->sdev_path, "/dev/net") != 0) { + devnet_fillzdir(ddv); + goto done; + } + if (((ddv->sdev_flags & SDEV_BUILD) == 0) && !dls_devnet_rebuild()) goto done; if (SDEV_IS_GLOBAL(ddv)) { - linkid = DATALINK_INVALID_LINKID; - do { - linkid = dls_mgmt_get_next(linkid, DATALINK_CLASS_ALL, - DATALINK_ANY_MEDIATYPE, DLMGMT_ACTIVE); - if (linkid != DATALINK_INVALID_LINKID) - (void) devnet_filldir_datalink(linkid, ddv); - } while (linkid != DATALINK_INVALID_LINKID); + devnet_fillzone(ddv, GLOBAL_ZONEID); + (void) devnet_mkdir(ddv, "zone"); } else { - (void) zone_datalink_walk(getzoneid(), - devnet_filldir_datalink, ddv); + devnet_fillzone(ddv, getzoneid()); } ddv->sdev_flags &= ~SDEV_BUILD; - done: rw_downgrade(&ddv->sdev_contents); } @@ -319,6 +531,9 @@ devnet_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, ASSERT(sdvp); + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + if (uiop->uio_offset == 0) devnet_filldir(sdvp); diff --git a/usr/src/uts/common/fs/dev/sdev_plugin.c b/usr/src/uts/common/fs/dev/sdev_plugin.c new file mode 100644 index 0000000000..885191175f --- /dev/null +++ b/usr/src/uts/common/fs/dev/sdev_plugin.c @@ -0,0 +1,913 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +/* + * Dynamic directory plugin interface for sdev. + * + * The sdev plugin interface provides a means for a dynamic directory based on + * in-kernel state to be created simply. Traditionally, dynamic directories were + * built into sdev itself. While these legacy plugins are useful, it makes more + * sense for these pieces of functionality to live with the individual drivers. + * + * The plugin interface requires consumers to implement three operations and + * provides a series of callbacks that can be made in the context of those + * operations to interrogate the sdev_node_t without having to leak + * implementation details of the sdev_node_t. These operations are: + * + * o spo_validate + * + * Given a particular node, answer whether this + * entry is still valid. Here, plugins should use the name and the dev_t + * associated with the node to verify that it matches something that still + * exists. + * + * o spo_filldir + * + * Fill in all the entries of a directory. Note that some of these entries + * may already exist. + * + * o spo_inactive + * + * The given node is no longer being used. This allows the consumer to + * potentially tear down anything that was being held open related to this. + * Note that this only fires when the given sdev_node_t becomes a zombie. + * + * During these callbacks a consumer is not allowed to register or unregister a + * plugin, especially their own. They may call the sdev_ctx-style functions. All + * callbacks fire in a context where blocking is allowed (e.g. the SPL is below + * LOCK_LEVEL).
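+ *
+ * As an illustrative sketch only (the names are hypothetical, and the
+ * initializer order is assumed from the checks in sdev_plugin_register()
+ * below: version 1, flags, then the validate, filldir, and inactive
+ * callbacks), a consumer might look like:
+ *
+ *	static int
+ *	mydev_validate(sdev_ctx_t ctx)
+ *	{
+ *		return (SDEV_VTOR_VALID);
+ *	}
+ *
+ *	static int
+ *	mydev_filldir(sdev_ctx_t ctx)
+ *	{
+ *		int ret = sdev_plugin_mknod(ctx, "mydev0", S_IFCHR,
+ *		    makedevice(mydev_major, 0));
+ *		return (ret == EEXIST ? 0 : ret);
+ *	}
+ *
+ *	static void
+ *	mydev_inactive(sdev_ctx_t ctx)
+ *	{
+ *	}
+ *
+ *	static sdev_plugin_ops_t mydev_ops = {
+ *		1, 0, mydev_validate, mydev_filldir, mydev_inactive
+ *	};
+ *
+ *	hdl = sdev_plugin_register("mydev", &mydev_ops, NULL);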
+ * + * When a plugin is added, we create its directory in the global zone. By doing + * that, we ensure that something isn't already there and that nothing else can + * come along and try to create something without our knowledge. We only have + * to create it in the GZ and not for all other instances of sdev because, + * first, an instance of sdev that isn't at /dev does not have dynamic + * directories, and second, any instance of sdev present in a non-global zone + * cannot create anything; therefore, if a directory is not present in the + * global zone's instance of sdev, we know we're good to go. + * + * Lock Ordering + * ------------- + * + * The global sdev_plugin_lock must be held before any of the individual + * sdev_plugin_t`sp_lock. Further, once any plugin-related lock has been held, + * it is not legal to take any holds on any sdev_node_t or to grab the + * sdev_node_t`contents_lock in any way. + */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/fs/sdev_impl.h> +#include <sys/fs/sdev_plugin.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/ksynch.h> +#include <sys/sysmacros.h> +#include <sys/list.h> +#include <sys/ctype.h> + +kmutex_t sdev_plugin_lock; +list_t sdev_plugin_list; +kmem_cache_t *sdev_plugin_cache; +struct vnodeops *sdev_plugin_vnops; + +#define SDEV_PLUGIN_NAMELEN 64 + +typedef struct sdev_plugin { + list_node_t sp_link; + char sp_name[SDEV_PLUGIN_NAMELEN]; /* E */ + int sp_nflags; /* E */ + struct vnodeops *sp_vnops; /* E */ + sdev_plugin_ops_t *sp_pops; /* E */ + boolean_t sp_islegacy; /* E */ + int (*sp_lvtor)(sdev_node_t *); /* E */ + kmutex_t sp_lock; /* Protects everything below */ + kcondvar_t sp_nodecv; + size_t sp_nnodes; +} sdev_plugin_t; + +/* ARGSUSED */ +static int +sdev_plugin_cache_constructor(void *buf, void *arg, int tags) +{ + sdev_plugin_t *spp = buf; + mutex_init(&spp->sp_lock, NULL, MUTEX_DRIVER, 0); + cv_init(&spp->sp_nodecv, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +sdev_plugin_cache_destructor(void *buf, void *arg) +{ + sdev_plugin_t *spp = buf; + cv_destroy(&spp->sp_nodecv); + mutex_destroy(&spp->sp_lock); +} + +enum vtype +sdev_ctx_vtype(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_vnode->v_type); +} + +const char * +sdev_ctx_path(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_path); +} + +const char * +sdev_ctx_name(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_name); +} + +/* + * Currently we only support passing through a single flag -- SDEV_IS_GLOBAL. + */ +sdev_ctx_flags_t +sdev_ctx_flags(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + return (sdp->sdev_flags & SDEV_GLOBAL); +} + +/* + * Return some amount of private data specific to the vtype. In the case of a + * character or block device this is the device number. + */ +const void * +sdev_ctx_vtype_data(sdev_ctx_t ctx) +{ + sdev_node_t *sdp = (sdev_node_t *)ctx; + void *ret; + + ASSERT(RW_LOCK_HELD(&sdp->sdev_contents)); + switch (sdp->sdev_vnode->v_type) { + case VCHR: + case VBLK: + ret = (void *)(uintptr_t)(sdp->sdev_vnode->v_rdev); + break; + default: + ret = NULL; + break; + } + + return (ret); +} + +/* + * Use the same rules as zones for a name: alphanumerics plus '-', '_', and '.'.
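+ * For example, "mydev" or "net-0" would be accepted, while "net/0" or a
+ * name lacking a NUL terminator within its buffer would be rejected.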
+ */ +static int +sdev_plugin_name_isvalid(const char *c, int buflen) +{ + int i; + + for (i = 0; i < buflen; i++, c++) { + if (*c == '\0') + return (1); + + if (!isalnum(*c) && *c != '-' && *c != '_' && *c != '.') + return (0); + } + /* Never found a null terminator */ + return (0); +} + +static int +sdev_plugin_mknode(sdev_plugin_t *spp, sdev_node_t *sdvp, char *name, + vattr_t *vap) +{ + int ret; + sdev_node_t *svp; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + ASSERT(spp != NULL); + svp = sdev_cache_lookup(sdvp, name); + if (svp != NULL) { + SDEV_SIMPLE_RELE(svp); + return (EEXIST); + } + + ret = sdev_mknode(sdvp, name, &svp, vap, NULL, NULL, kcred, + SDEV_READY); + if (ret != 0) + return (ret); + SDEV_SIMPLE_RELE(svp); + + return (0); +} + +/* + * Plugin node creation callbacks + */ +int +sdev_plugin_mkdir(sdev_ctx_t ctx, char *name) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(sdvp->sdev_private != NULL); + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); +} + +int +sdev_plugin_mknod(sdev_ctx_t ctx, char *name, mode_t mode, dev_t dev) +{ + sdev_node_t *sdvp; + timestruc_t now; + struct vattr vap; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) + return (EINVAL); + + sdvp = (sdev_node_t *)ctx; + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + if (mode != S_IFCHR && mode != S_IFBLK) + return (EINVAL); + + ASSERT(sdvp->sdev_private != NULL); + + vap = *sdev_getdefault_attr(mode == S_IFCHR ? VCHR : VBLK); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + vap.va_rdev = dev; + vap.va_mode = mode | 0666; + + /* Despite the similar name, this is in fact a different function */ + return (sdev_plugin_mknode(sdvp->sdev_private, sdvp, name, &vap)); + +} + +static int +sdev_plugin_validate(sdev_node_t *sdp) +{ + int ret; + sdev_plugin_t *spp; + + ASSERT(sdp->sdev_private != NULL); + spp = sdp->sdev_private; + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + rw_enter(&sdp->sdev_contents, RW_READER); + ret = spp->sp_pops->spo_validate((uintptr_t)sdp); + rw_exit(&sdp->sdev_contents); + return (ret); +} + +static void +sdev_plugin_validate_dir(sdev_node_t *sdvp) +{ + int ret; + sdev_node_t *svp, *next; + + ASSERT(RW_WRITE_HELD(&sdvp->sdev_contents)); + + for (svp = SDEV_FIRST_ENTRY(sdvp); svp != NULL; svp = next) { + + next = SDEV_NEXT_ENTRY(sdvp, svp); + ASSERT(svp->sdev_state != SDEV_ZOMBIE); + /* skip nodes that aren't ready */ + if (svp->sdev_state == SDEV_INIT) + continue; + + switch (sdev_plugin_validate(svp)) { + case SDEV_VTOR_VALID: + case SDEV_VTOR_SKIP: + continue; + case SDEV_VTOR_INVALID: + case SDEV_VTOR_STALE: + break; + } + + SDEV_HOLD(svp); + + /* + * Clean out everything underneath this node before we + * remove it. 
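+	 * Nodes that fail their validator are purged in full here, so that
+	 * the spo_filldir() callback that follows starts from a clean
+	 * directory.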
+ */ + if (svp->sdev_vnode->v_type == VDIR) { + ret = sdev_cleandir(svp, NULL, 0); + ASSERT(ret == 0); + } + /* remove the cache node */ + (void) sdev_cache_update(sdvp, &svp, svp->sdev_name, + SDEV_CACHE_DELETE); + SDEV_RELE(svp); + } +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred, + int *eofp, caller_context_t *ct_unused, int flags_unused) +{ + int ret; + sdev_node_t *sdvp = VTOSDEV(dvp); + sdev_plugin_t *spp; + + ASSERT(RW_READ_HELD(&sdvp->sdev_contents)); + + /* Sanity check we're not a zombie before we do anything else */ + if (sdvp->sdev_state == SDEV_ZOMBIE) + return (ENOENT); + + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + if (uiop->uio_offset == 0) { + /* + * We upgrade to a write lock and grab the plugin's lock along + * the way. We're almost certainly going to get creation + * callbacks, so this is the only safe way to go. + */ + if (rw_tryupgrade(&sdvp->sdev_contents) == 0) { + rw_exit(&sdvp->sdev_contents); + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_downgrade(&sdvp->sdev_contents); + return (ENOENT); + } + } + + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_downgrade(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + } + + return (devname_readdir_func(dvp, uiop, cred, eofp, 0)); +} + +/* + * If we don't have a callback function that returns a failure, then sdev will + * try to create a node for us which violates all of our basic assertions. To + * work around that we create our own callback for devname_lookup_func which + * always returns ENOENT: at this point, either the node was created by the + * filldir callback or it was not. + */ +/*ARGSUSED*/ +static int +sdev_plugin_vop_lookup_cb(sdev_node_t *ddv, char *nm, void **arg, cred_t *cred, + void *unused, char *unused2) +{ + return (ENOENT); +} + +/* ARGSUSED */ +static int +sdev_plugin_vop_lookup(struct vnode *dvp, char *nm, struct vnode **vpp, + struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred, + caller_context_t *ct, int *direntflags, pathname_t *realpnp) +{ + int ret; + sdev_node_t *sdvp; + sdev_plugin_t *spp; + + /* execute access is required to search the directory */ + if ((ret = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0) + return (ret); + + sdvp = VTOSDEV(dvp); + spp = sdvp->sdev_private; + ASSERT(spp != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + ASSERT(spp->sp_pops != NULL); + + if (crgetzoneid(cred) == GLOBAL_ZONEID && !SDEV_IS_GLOBAL(sdvp)) + return (EPERM); + + /* + * Go straight for the write lock. + */ + rw_enter(&sdvp->sdev_contents, RW_WRITER); + if (sdvp->sdev_state == SDEV_ZOMBIE) { + rw_exit(&sdvp->sdev_contents); + return (ENOENT); + } + sdev_plugin_validate_dir(sdvp); + ret = spp->sp_pops->spo_filldir((uintptr_t)sdvp); + rw_exit(&sdvp->sdev_contents); + if (ret != 0) + return (ret); + + return (devname_lookup_func(sdvp, nm, vpp, cred, + sdev_plugin_vop_lookup_cb, SDEV_VATTR)); +} + +/* + * sdev is not a good citizen. We get inactive callbacks whenever a vnode's + * count goes to zero, even though it isn't necessarily a zombie yet. As such, + * to make things easier for users, we only fire the inactive callback when the + * node becomes a zombie and thus will be torn down here.
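+ * This is also the single place where a plugin's outstanding node count
+ * is dropped; sp_nodecv is signalled so that an unregister waiting for
+ * the count to reach zero can make progress.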
+ */ +static void +sdev_plugin_vop_inactive_cb(struct vnode *dvp) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + + rw_enter(&sdp->sdev_contents, RW_READER); + if (sdp->sdev_state != SDEV_ZOMBIE) { + rw_exit(&sdp->sdev_contents); + return; + } + spp->sp_pops->spo_inactive((uintptr_t)sdp); + mutex_enter(&spp->sp_lock); + VERIFY(spp->sp_nnodes > 0); + spp->sp_nnodes--; + cv_signal(&spp->sp_nodecv); + mutex_exit(&spp->sp_lock); + rw_exit(&sdp->sdev_contents); +} + +/*ARGSUSED*/ +static void +sdev_plugin_vop_inactive(struct vnode *dvp, struct cred *cred, + caller_context_t *ct) +{ + sdev_node_t *sdp = VTOSDEV(dvp); + sdev_plugin_t *spp = sdp->sdev_private; + ASSERT(sdp->sdev_private != NULL); + ASSERT(spp->sp_islegacy == B_FALSE); + devname_inactive_func(dvp, cred, sdev_plugin_vop_inactive_cb); +} + +const fs_operation_def_t sdev_plugin_vnodeops_tbl[] = { + VOPNAME_READDIR, { .vop_readdir = sdev_plugin_vop_readdir }, + VOPNAME_LOOKUP, { .vop_lookup = sdev_plugin_vop_lookup }, + VOPNAME_INACTIVE, { .vop_inactive = sdev_plugin_vop_inactive }, + VOPNAME_CREATE, { .error = fs_nosys }, + VOPNAME_REMOVE, { .error = fs_nosys }, + VOPNAME_MKDIR, { .error = fs_nosys }, + VOPNAME_RMDIR, { .error = fs_nosys }, + VOPNAME_SYMLINK, { .error = fs_nosys }, + VOPNAME_SETSECATTR, { .error = fs_nosys }, + NULL, NULL +}; + +/* + * construct a new template with overrides from vtab + */ +static fs_operation_def_t * +sdev_merge_vtab(const fs_operation_def_t tab[]) +{ + fs_operation_def_t *new; + const fs_operation_def_t *tab_entry; + + /* make a copy of standard vnode ops table */ + new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); + bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); + + /* replace the overrides from tab */ + for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { + fs_operation_def_t *std_entry = new; + while (std_entry->name) { + if (strcmp(tab_entry->name, std_entry->name) == 0) { + std_entry->func = tab_entry->func; + break; + } + std_entry++; + } + } + + return (new); +} + +/* free memory allocated by sdev_merge_vtab */ +static void +sdev_free_vtab(fs_operation_def_t *new) +{ + kmem_free(new, sdev_vnodeops_tbl_size); +} + +/* + * Register a new plugin. + */ +sdev_plugin_hdl_t +sdev_plugin_register(const char *name, sdev_plugin_ops_t *ops, int *errp) +{ + int ret, err; + sdev_plugin_t *spp, *iter; + vnode_t *vp, *nvp; + sdev_node_t *sdp, *slp; + timestruc_t now; + struct vattr vap; + + /* + * Some consumers don't care about why they failed. To keep the code + * simple, we'll just pretend they gave us something. 
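+	 *
+	 * For illustration only (every myplug_* name here is hypothetical),
+	 * a consumer would register along these lines:
+	 *
+	 *	static sdev_plugin_ops_t myplug_ops = {
+	 *		.spo_version = 1,
+	 *		.spo_flags = SDEV_PLUGIN_SUBDIR,
+	 *		.spo_validate = myplug_validate,
+	 *		.spo_filldir = myplug_filldir,
+	 *		.spo_inactive = myplug_inactive
+	 *	};
+	 *
+	 *	sdev_plugin_hdl_t hdl;
+	 *	int err;
+	 *
+	 *	hdl = sdev_plugin_register("myplug", &myplug_ops, &err);
+	 *	if (hdl == NULL)
+	 *		cmn_err(CE_WARN, "myplug: failed to register: %d", err);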
+ */ + if (errp == NULL) + errp = &err; + + if (sdev_plugin_name_isvalid(name, SDEV_PLUGIN_NAMELEN) == 0) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_version != 1) { + *errp = EINVAL; + return (NULL); + } + + if (ops->spo_validate == NULL || ops->spo_filldir == NULL || + ops->spo_inactive == NULL) { + *errp = EINVAL; + return (NULL); + } + + if ((ops->spo_flags & ~SDEV_PLUGIN_FLAGS_MASK) != 0) { + *errp = EINVAL; + return (NULL); + } + + spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP); + (void) strlcpy(spp->sp_name, name, SDEV_PLUGIN_NAMELEN); + + spp->sp_pops = ops; + spp->sp_nflags = SDEV_DYNAMIC | SDEV_VTOR; + if (ops->spo_flags & SDEV_PLUGIN_NO_NCACHE) + spp->sp_nflags |= SDEV_NO_NCACHE; + if (ops->spo_flags & SDEV_PLUGIN_SUBDIR) + spp->sp_nflags |= SDEV_SUBDIR; + spp->sp_vnops = sdev_plugin_vnops; + spp->sp_islegacy = B_FALSE; + spp->sp_lvtor = NULL; + spp->sp_nnodes = 0; + + /* + * Make sure it's unique, nothing exists with this name already, and add + * it to the list. We also need to go through and grab the sdev + * root node as we cannot grab any sdev node locks once we've grabbed + * the sdev_plugin_lock. We effectively assert that if a directory is + * not present in the GZ's /dev, then it doesn't exist in any of the + * local zones. + */ + ret = vn_openat("/dev", UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, -1); + if (ret != 0) { + *errp = ret; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + /* Make sure we have the real vnode */ + if (VOP_REALVP(vp, &nvp, NULL) == 0) { + VN_HOLD(nvp); + VN_RELE(vp); + vp = nvp; + nvp = NULL; + } + VERIFY(vp->v_op == sdev_vnodeops); + sdp = VTOSDEV(vp); + rw_enter(&sdp->sdev_contents, RW_WRITER); + slp = sdev_cache_lookup(sdp, spp->sp_name); + if (slp != NULL) { + SDEV_RELE(slp); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + + mutex_enter(&sdev_plugin_lock); + for (iter = list_head(&sdev_plugin_list); iter != NULL; + iter = list_next(&sdev_plugin_list, iter)) { + if (strcmp(spp->sp_name, iter->sp_name) == 0) { + mutex_exit(&sdev_plugin_lock); + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + *errp = EEXIST; + kmem_cache_free(sdev_plugin_cache, spp); + return (NULL); + } + } + + list_insert_tail(&sdev_plugin_list, spp); + mutex_exit(&sdev_plugin_lock); + + /* + * Now go ahead and create the top level directory for the global zone. + */ + vap = *sdev_getdefault_attr(VDIR); + gethrestime(&now); + vap.va_atime = now; + vap.va_mtime = now; + vap.va_ctime = now; + + (void) sdev_plugin_mknode(spp, sdp, spp->sp_name, &vap); + + rw_exit(&sdp->sdev_contents); + VN_RELE(vp); + + return ((sdev_plugin_hdl_t)spp); +} + +static void +sdev_plugin_unregister_cb(sdev_node_t *rdp, void *arg) +{ + sdev_plugin_t *spp = arg; + sdev_node_t *sdp; + + rw_enter(&rdp->sdev_contents, RW_WRITER); + sdp = sdev_cache_lookup(rdp, spp->sp_name); + /* If it doesn't exist, we're done here */ + if (sdp == NULL) { + rw_exit(&rdp->sdev_contents); + return; + } + + /* + * We first delete the directory before recursively marking everything + * else stale. This ordering should ensure that we don't accidentally + * miss anything. + */ + sdev_cache_update(rdp, &sdp, spp->sp_name, SDEV_CACHE_DELETE); + sdev_stale(sdp); + SDEV_RELE(sdp); + rw_exit(&rdp->sdev_contents); +} + +/* + * Remove a plugin. This will block until everything has become a zombie, thus + * guaranteeing the caller that nothing will call into them again once this call + * returns. 
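+ * The wait is open-ended: if consumers still hold vnodes that belong to the
+ * plugin, we will not return until those holds are released and the nodes
+ * become zombies.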
+ * The plugin may still be called into while unregistration is ongoing, and
+ * other mounts will be blocked for the duration.
+ */
+int
+sdev_plugin_unregister(sdev_plugin_hdl_t hdl)
+{
+	sdev_plugin_t *spp = (sdev_plugin_t *)hdl;
+	if (spp->sp_islegacy)
+		return (EINVAL);
+
+	mutex_enter(&sdev_plugin_lock);
+	list_remove(&sdev_plugin_list, spp);
+	mutex_exit(&sdev_plugin_lock);
+
+	sdev_mnt_walk(sdev_plugin_unregister_cb, spp);
+	mutex_enter(&spp->sp_lock);
+	while (spp->sp_nnodes > 0)
+		cv_wait(&spp->sp_nodecv, &spp->sp_lock);
+	mutex_exit(&spp->sp_lock);
+	kmem_cache_free(sdev_plugin_cache, spp);
+	return (0);
+}
+
+/*
+ * Register an old sdev style plugin to deal with what used to be in the vtab.
+ */
+static int
+sdev_plugin_register_legacy(struct sdev_vop_table *vtp)
+{
+	sdev_plugin_t *spp;
+
+	spp = kmem_cache_alloc(sdev_plugin_cache, KM_SLEEP);
+	(void) strlcpy(spp->sp_name, vtp->vt_name, SDEV_PLUGIN_NAMELEN);
+	spp->sp_islegacy = B_TRUE;
+	spp->sp_pops = NULL;
+	spp->sp_nflags = vtp->vt_flags;
+	spp->sp_lvtor = vtp->vt_vtor;
+	spp->sp_nnodes = 0;
+
+	if (vtp->vt_service != NULL) {
+		fs_operation_def_t *templ;
+		templ = sdev_merge_vtab(vtp->vt_service);
+		if (vn_make_ops(vtp->vt_name,
+		    (const fs_operation_def_t *)templ,
+		    &spp->sp_vnops) != 0) {
+			cmn_err(CE_WARN, "%s: malformed vnode ops\n",
+			    vtp->vt_name);
+			sdev_free_vtab(templ);
+			kmem_cache_free(sdev_plugin_cache, spp);
+			return (1);
+		}
+
+		if (vtp->vt_global_vops) {
+			*(vtp->vt_global_vops) = spp->sp_vnops;
+		}
+
+		sdev_free_vtab(templ);
+	} else {
+		spp->sp_vnops = sdev_vnodeops;
+	}
+
+	/*
+	 * No need to check for EEXIST here. These are loaded as part of
+	 * sdev's initialization function. Further, we don't have to create
+	 * them as that's taken care of in sdev's mount for the GZ.
+	 */
+	mutex_enter(&sdev_plugin_lock);
+	list_insert_tail(&sdev_plugin_list, spp);
+	mutex_exit(&sdev_plugin_lock);
+
+	return (0);
+}
+
+/*
+ * We need to match against the sdev_path, not the sdev_name, since plugins
+ * may only exist directly under /dev.
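+ * For example, a plugin registered as "zvol" with SDEV_SUBDIR set matches
+ * both /dev/zvol itself and paths below it such as /dev/zvol/dsk, while an
+ * unrelated sibling like /dev/zvolx does not match.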
+ */ +static sdev_plugin_t * +sdev_match(sdev_node_t *dv) +{ + int vlen; + const char *path; + sdev_plugin_t *spp; + + if (strlen(dv->sdev_path) <= 5) + return (NULL); + + if (strncmp(dv->sdev_path, "/dev/", 5) != 0) + return (NULL); + path = dv->sdev_path + 5; + + mutex_enter(&sdev_plugin_lock); + + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + if (spp->sp_nflags & SDEV_SUBDIR) { + vlen = strlen(spp->sp_name); + if ((strncmp(spp->sp_name, path, + vlen - 1) == 0) && path[vlen] == '/') { + mutex_exit(&sdev_plugin_lock); + return (spp); + } + + } + } + + mutex_exit(&sdev_plugin_lock); + return (NULL); +} + +void +sdev_set_no_negcache(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + ASSERT(dv->sdev_path); + path = dv->sdev_path + strlen("/dev/"); + + mutex_enter(&sdev_plugin_lock); + for (spp = list_head(&sdev_plugin_list); spp != NULL; + spp = list_next(&sdev_plugin_list, spp)) { + if (strcmp(spp->sp_name, path) == 0) { + if (spp->sp_nflags & SDEV_NO_NCACHE) + dv->sdev_flags |= SDEV_NO_NCACHE; + break; + } + } + mutex_exit(&sdev_plugin_lock); +} + +struct vnodeops * +sdev_get_vop(sdev_node_t *dv) +{ + char *path; + sdev_plugin_t *spp; + + path = dv->sdev_path; + ASSERT(path); + + /* gets the relative path to /dev/ */ + path += 5; + + if ((spp = sdev_match(dv)) != NULL) { + dv->sdev_flags |= spp->sp_nflags; + if (SDEV_IS_PERSIST(dv->sdev_dotdot) && + (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) + dv->sdev_flags |= SDEV_PERSIST; + return (spp->sp_vnops); + } + + /* child inherits the persistence of the parent */ + if (SDEV_IS_PERSIST(dv->sdev_dotdot)) + dv->sdev_flags |= SDEV_PERSIST; + return (sdev_vnodeops); +} + +void * +sdev_get_vtor(sdev_node_t *dv) +{ + sdev_plugin_t *spp; + + if (dv->sdev_private == NULL) { + spp = sdev_match(dv); + if (spp == NULL) + return (NULL); + } else { + spp = dv->sdev_private; + } + + if (spp->sp_islegacy) + return ((void *)spp->sp_lvtor); + else + return ((void *)sdev_plugin_validate); +} + +void +sdev_plugin_nodeready(sdev_node_t *sdp) +{ + sdev_plugin_t *spp; + + ASSERT(RW_WRITE_HELD(&sdp->sdev_contents)); + ASSERT(sdp->sdev_private == NULL); + + spp = sdev_match(sdp); + if (spp == NULL) + return; + if (spp->sp_islegacy) + return; + sdp->sdev_private = spp; + mutex_enter(&spp->sp_lock); + spp->sp_nnodes++; + mutex_exit(&spp->sp_lock); +} + +int +sdev_plugin_init(void) +{ + sdev_vop_table_t *vtp; + fs_operation_def_t *templ; + + sdev_plugin_cache = kmem_cache_create("sdev_plugin", + sizeof (sdev_plugin_t), 0, sdev_plugin_cache_constructor, + sdev_plugin_cache_destructor, NULL, NULL, NULL, 0); + if (sdev_plugin_cache == NULL) + return (1); + mutex_init(&sdev_plugin_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&sdev_plugin_list, sizeof (sdev_plugin_t), + offsetof(sdev_plugin_t, sp_link)); + + /* + * Register all of the legacy vnops + */ + for (vtp = &vtab[0]; vtp->vt_name != NULL; vtp++) + if (sdev_plugin_register_legacy(vtp) != 0) + return (1); + + templ = sdev_merge_vtab(sdev_plugin_vnodeops_tbl); + if (vn_make_ops("sdev_plugin", + (const fs_operation_def_t *)templ, + &sdev_plugin_vnops) != 0) { + sdev_free_vtab(templ); + return (1); + } + + sdev_free_vtab(templ); + return (0); +} diff --git a/usr/src/uts/common/fs/dev/sdev_subr.c b/usr/src/uts/common/fs/dev/sdev_subr.c index b4b27e6285..16439a66a6 100644 --- a/usr/src/uts/common/fs/dev/sdev_subr.c +++ b/usr/src/uts/common/fs/dev/sdev_subr.c @@ 
-20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -150,12 +150,6 @@ vattr_t sdev_vattr_chr = { kmem_cache_t *sdev_node_cache; /* sdev_node cache */ int devtype; /* fstype */ -/* static */ -static struct vnodeops *sdev_get_vop(struct sdev_node *); -static void sdev_set_no_negcache(struct sdev_node *); -static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []); -static void sdev_free_vtab(fs_operation_def_t *); - static void sdev_prof_free(struct sdev_node *dv) { @@ -318,6 +312,7 @@ sdev_nodeinit(struct sdev_node *ddv, char *nm, struct sdev_node **newdv, (void) snprintf(dv->sdev_path, len, "%s/%s", ddv->sdev_path, nm); /* overwritten for VLNK nodes */ dv->sdev_symlink = NULL; + list_link_init(&dv->sdev_plist); vp = SDEVTOV(dv); vn_reinit(vp); @@ -406,6 +401,7 @@ sdev_nodeready(struct sdev_node *dv, struct vattr *vap, struct vnode *avp, } else { dv->sdev_nlink = 1; } + sdev_plugin_nodeready(dv); if (!(SDEV_IS_GLOBAL(dv))) { dv->sdev_origin = (struct sdev_node *)args; @@ -502,37 +498,22 @@ sdev_mkroot(struct vfs *vfsp, dev_t devdev, struct vnode *mvp, return (dv); } -/* directory dependent vop table */ -struct sdev_vop_table { - char *vt_name; /* subdirectory name */ - const fs_operation_def_t *vt_service; /* vnodeops table */ - struct vnodeops *vt_vops; /* constructed vop */ - struct vnodeops **vt_global_vops; /* global container for vop */ - int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */ - int vt_flags; -}; - -/* - * A nice improvement would be to provide a plug-in mechanism - * for this table instead of a const table. - */ -static struct sdev_vop_table vtab[] = -{ - { "pts", devpts_vnodeops_tbl, NULL, &devpts_vnodeops, devpts_validate, +struct sdev_vop_table vtab[] = { + { "pts", devpts_vnodeops_tbl, &devpts_vnodeops, devpts_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate, + { "vt", devvt_vnodeops_tbl, &devvt_vnodeops, devvt_validate, SDEV_DYNAMIC | SDEV_VTOR }, - { "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops, + { "zvol", devzvol_vnodeops_tbl, &devzvol_vnodeops, devzvol_validate, SDEV_ZONED | SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE }, + { "zcons", NULL, NULL, NULL, SDEV_NO_NCACHE }, - { "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate, - SDEV_DYNAMIC | SDEV_VTOR }, + { "net", devnet_vnodeops_tbl, &devnet_vnodeops, devnet_validate, + SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR }, - { "ipnet", devipnet_vnodeops_tbl, NULL, &devipnet_vnodeops, + { "ipnet", devipnet_vnodeops_tbl, &devipnet_vnodeops, devipnet_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_NO_NCACHE }, /* @@ -547,132 +528,14 @@ static struct sdev_vop_table vtab[] = * preventing a mkdir. */ - { "lofi", NULL, NULL, NULL, NULL, + { "lofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { "rlofi", NULL, NULL, NULL, NULL, + { "rlofi", NULL, NULL, NULL, SDEV_ZONED | SDEV_DYNAMIC | SDEV_PERSIST }, - { NULL, NULL, NULL, NULL, NULL, 0} + { NULL, NULL, NULL, NULL, 0} }; -/* - * We need to match off of the sdev_path, not the sdev_name. We are only allowed - * to exist directly under /dev. 
- */ -struct sdev_vop_table * -sdev_match(struct sdev_node *dv) -{ - int vlen; - int i; - const char *path; - - if (strlen(dv->sdev_path) <= 5) - return (NULL); - - if (strncmp(dv->sdev_path, "/dev/", 5) != 0) - return (NULL); - path = dv->sdev_path + 5; - - for (i = 0; vtab[i].vt_name; i++) { - if (strcmp(vtab[i].vt_name, path) == 0) - return (&vtab[i]); - if (vtab[i].vt_flags & SDEV_SUBDIR) { - vlen = strlen(vtab[i].vt_name); - if ((strncmp(vtab[i].vt_name, path, - vlen - 1) == 0) && path[vlen] == '/') - return (&vtab[i]); - } - - } - return (NULL); -} - -/* - * sets a directory's vnodeops if the directory is in the vtab; - */ -static struct vnodeops * -sdev_get_vop(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - char *path; - - path = dv->sdev_path; - ASSERT(path); - - /* gets the relative path to /dev/ */ - path += 5; - - /* gets the vtab entry it matches */ - if ((vtp = sdev_match(dv)) != NULL) { - dv->sdev_flags |= vtp->vt_flags; - if (SDEV_IS_PERSIST(dv->sdev_dotdot) && - (SDEV_IS_PERSIST(dv) || !SDEV_IS_DYNAMIC(dv))) - dv->sdev_flags |= SDEV_PERSIST; - - if (vtp->vt_vops) { - if (vtp->vt_global_vops) - *(vtp->vt_global_vops) = vtp->vt_vops; - - return (vtp->vt_vops); - } - - if (vtp->vt_service) { - fs_operation_def_t *templ; - templ = sdev_merge_vtab(vtp->vt_service); - if (vn_make_ops(vtp->vt_name, - (const fs_operation_def_t *)templ, - &vtp->vt_vops) != 0) { - cmn_err(CE_PANIC, "%s: malformed vnode ops\n", - vtp->vt_name); - /*NOTREACHED*/ - } - if (vtp->vt_global_vops) { - *(vtp->vt_global_vops) = vtp->vt_vops; - } - sdev_free_vtab(templ); - - return (vtp->vt_vops); - } - - return (sdev_vnodeops); - } - - /* child inherits the persistence of the parent */ - if (SDEV_IS_PERSIST(dv->sdev_dotdot)) - dv->sdev_flags |= SDEV_PERSIST; - - return (sdev_vnodeops); -} - -static void -sdev_set_no_negcache(struct sdev_node *dv) -{ - int i; - char *path; - - ASSERT(dv->sdev_path); - path = dv->sdev_path + strlen("/dev/"); - - for (i = 0; vtab[i].vt_name; i++) { - if (strcmp(vtab[i].vt_name, path) == 0) { - if (vtab[i].vt_flags & SDEV_NO_NCACHE) - dv->sdev_flags |= SDEV_NO_NCACHE; - break; - } - } -} - -void * -sdev_get_vtor(struct sdev_node *dv) -{ - struct sdev_vop_table *vtp; - - vtp = sdev_match(dv); - if (vtp) - return ((void *)vtp->vt_vtor); - else - return (NULL); -} /* * Build the base root inode @@ -952,8 +815,11 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) dv->sdev_path = NULL; } - if (!SDEV_IS_GLOBAL(dv)) + if (!SDEV_IS_GLOBAL(dv)) { sdev_prof_free(dv); + if (dv->sdev_vnode->v_type != VLNK && dv->sdev_origin != NULL) + SDEV_RELE(dv->sdev_origin); + } if (SDEVTOV(dv)->v_type == VDIR) { ASSERT(SDEV_FIRST_ENTRY(dv) == NULL); @@ -967,6 +833,7 @@ sdev_nodedestroy(struct sdev_node *dv, uint_t flags) (void) memset((void *)&dv->sdev_instance_data, 0, sizeof (dv->sdev_instance_data)); vn_invalid(SDEVTOV(dv)); + dv->sdev_private = NULL; kmem_cache_free(sdev_node_cache, dv); } @@ -2948,46 +2815,6 @@ sdev_modctl_devexists(const char *path) return (error); } -extern int sdev_vnodeops_tbl_size; - -/* - * construct a new template with overrides from vtab - */ -static fs_operation_def_t * -sdev_merge_vtab(const fs_operation_def_t tab[]) -{ - fs_operation_def_t *new; - const fs_operation_def_t *tab_entry; - - /* make a copy of standard vnode ops table */ - new = kmem_alloc(sdev_vnodeops_tbl_size, KM_SLEEP); - bcopy((void *)sdev_vnodeops_tbl, new, sdev_vnodeops_tbl_size); - - /* replace the overrides from tab */ - for (tab_entry = tab; tab_entry->name != NULL; tab_entry++) { - 
fs_operation_def_t *std_entry = new;
-		while (std_entry->name) {
-			if (strcmp(tab_entry->name, std_entry->name) == 0) {
-				std_entry->func = tab_entry->func;
-				break;
-			}
-			std_entry++;
-		}
-		if (std_entry->name == NULL)
-			cmn_err(CE_NOTE, "sdev_merge_vtab: entry %s unused.",
-			    tab_entry->name);
-	}
-
-	return (new);
-}
-
-/* free memory allocated by sdev_merge_vtab */
-static void
-sdev_free_vtab(fs_operation_def_t *new)
-{
-	kmem_free(new, sdev_vnodeops_tbl_size);
-}
-
 /*
  * a generic setattr() function
  *
diff --git a/usr/src/uts/common/fs/dev/sdev_vfsops.c b/usr/src/uts/common/fs/dev/sdev_vfsops.c
index ea9cb6374a..6f32f47635 100644
--- a/usr/src/uts/common/fs/dev/sdev_vfsops.c
+++ b/usr/src/uts/common/fs/dev/sdev_vfsops.c
@@ -169,7 +169,13 @@ devinit(int fstype, char *name)
 	if ((devmajor = getudev()) == (major_t)-1) {
 		cmn_err(CE_WARN, "%s: can't get unique dev",
 		    sdev_vfssw.name);
-		return (1);
+		return (ENXIO);
+	}
+
+	if (sdev_plugin_init() != 0) {
+		cmn_err(CE_WARN, "%s: failed to init plugin subsystem",
+		    sdev_vfssw.name);
+		return (EIO);
 	}
 
 	/* initialize negative cache */
@@ -332,6 +338,7 @@ sdev_mount(struct vfs *vfsp, struct vnode *mvp, struct mounta *uap,
 		ASSERT(sdev_origins);
 		dv->sdev_flags &= ~SDEV_GLOBAL;
 		dv->sdev_origin = sdev_origins->sdev_root;
+		SDEV_HOLD(dv->sdev_origin);
 	} else {
 		sdev_ncache_setup();
 		rw_enter(&dv->sdev_contents, RW_WRITER);
@@ -504,3 +511,17 @@ sdev_mntinfo_rele(struct sdev_data *mntinfo)
 	SDEVTOV(mntinfo->sdev_root)->v_count--;
 	mutex_exit(&sdev_lock);
 }
+
+void
+sdev_mnt_walk(void (*func)(struct sdev_node *, void *), void *arg)
+{
+	struct sdev_data *mntinfo;
+
+	mutex_enter(&sdev_lock);
+	mntinfo = sdev_mntinfo;
+	while (mntinfo != NULL) {
+		func(mntinfo->sdev_root, arg);
+		mntinfo = mntinfo->sdev_next;
+	}
+	mutex_exit(&sdev_lock);
+}
diff --git a/usr/src/uts/common/fs/dev/sdev_zvolops.c b/usr/src/uts/common/fs/dev/sdev_zvolops.c
index e4c3acf787..495cf450de 100644
--- a/usr/src/uts/common/fs/dev/sdev_zvolops.c
+++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c
@@ -21,7 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
- * Copyright 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc. All rights reserved.
  */
 
 /* vnode ops for the /dev/zvol directory */
@@ -370,8 +370,10 @@ devzvol_create_pool_dirs(struct vnode *dvp)
 		ASSERT(dvp->v_count > 0);
 		rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
 		    NULL, kcred, NULL, 0, NULL);
-		/* should either work, or not be visible from a zone */
-		ASSERT(rc == 0 || rc == ENOENT);
+		/*
+		 * should either work, or return an error if this pool should
+		 * not be visible from the zone or is disallowed there
+		 */
 		if (rc == 0)
 			VN_RELE(vp);
 		pools++;
diff --git a/usr/src/uts/common/fs/dnlc.c b/usr/src/uts/common/fs/dnlc.c
index 25327d2852..c949117da6 100644
--- a/usr/src/uts/common/fs/dnlc.c
+++ b/usr/src/uts/common/fs/dnlc.c
@@ -921,50 +921,6 @@ dnlc_fs_purge1(vnodeops_t *vop)
 }
 
 /*
- * Perform a reverse lookup in the DNLC. This will find the first occurrence of
- * the vnode. If successful, it will return the vnode of the parent, and the
- * name of the entry in the given buffer. If it cannot be found, or the buffer
- * is too small, then it will return NULL. Note that this is a highly
- * inefficient function, since the DNLC is constructed solely for forward
- * lookups.
- */ -vnode_t * -dnlc_reverse_lookup(vnode_t *vp, char *buf, size_t buflen) -{ - nc_hash_t *nch; - ncache_t *ncp; - vnode_t *pvp; - - if (!doingcache) - return (NULL); - - for (nch = nc_hash; nch < &nc_hash[nc_hashsz]; nch++) { - mutex_enter(&nch->hash_lock); - ncp = nch->hash_next; - while (ncp != (ncache_t *)nch) { - /* - * We ignore '..' entries since it can create - * confusion and infinite loops. - */ - if (ncp->vp == vp && !(ncp->namlen == 2 && - 0 == bcmp(ncp->name, "..", 2)) && - ncp->namlen < buflen) { - bcopy(ncp->name, buf, ncp->namlen); - buf[ncp->namlen] = '\0'; - pvp = ncp->dp; - /* VN_HOLD 2 of 2 in this file */ - VN_HOLD_CALLER(pvp); - mutex_exit(&nch->hash_lock); - return (pvp); - } - ncp = ncp->hash_next; - } - mutex_exit(&nch->hash_lock); - } - - return (NULL); -} -/* * Utility routine to search for a cache entry. Return the * ncache entry if found, NULL otherwise. */ diff --git a/usr/src/uts/common/fs/fem.c b/usr/src/uts/common/fs/fem.c index b4e28cc860..5f524def30 100644 --- a/usr/src/uts/common/fs/fem.c +++ b/usr/src/uts/common/fs/fem.c @@ -23,6 +23,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/atomic.h> #include <sys/kmem.h> @@ -33,11 +37,12 @@ #include <sys/systm.h> #include <sys/cmn_err.h> #include <sys/debug.h> - #include <sys/fem.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> +#include <sys/stack.h> +#include <sys/archsystm.h> #define NNODES_DEFAULT 8 /* Default number of nodes in a fem_list */ /* @@ -291,6 +296,536 @@ _op_find(femarg_t *ap, void **fp, int offs0, int offs1) } #endif +/* + * File event monitoring handoffs + * + * File event monitoring relies on being able to inject stack frames between + * vnode consumers and the underlying file systems. This becomes problematic + * when there exist many monitors, as kernel stack depth is finite. The model + * very much encodes this injected frame: the flow of control deliberately + * lies with the monitor, not with the monitoring system. While we could + * conceivably address this by allowing each subsystem to install at most + * one monitor per vnode (and impose on subsystems that they handle any + * of their own consumer multiplexing internally), this in fact exports a + * substantial amount of run-time complexity to deal with an uncommon case + * (and, it must be said, assumes a small number of consuming subsystems). + * To allow our abstraction to remain clean, we instead check our remaining + * stack in every vnext_*() call; if the amount of stack remaining is lower + * than a threshold (fem_stack_needed), we call thread_splitstack() to carry + * on the execution of the monitors and the underlying vnode operation on a + * split stack. Because we can only pass a single argument to our split stack + * function, we must marshal our arguments, the mechanics of which are somewhat + * ornate in terms of the code: to marshal in a type-safe manner, we define a + * baton that is a union of payload structures for each kind of operation, + * loading the per-operation payload explicitly and calling into common handoff + * code that itself calls thread_splitstack(). The function passed to + * thread_splitstack() is a per-entry point function that continues monitor + * processing given the specified (marshalled) arguments. 
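+ *
+ * Taking VOP_FID() as a concrete example: FEM_BATON_1(fid, fid_t *, fidp)
+ * defines its payload, FEM_VNEXT3_DECL(fid, arg0, fidp, ct) defines
+ * fem_handoff_fid(), and vnext_fid() dispatches through
+ * FEM_VNEXT3(fid, func, arg0, fidp, ct), which either calls func directly
+ * or, when FEM_TOODEEP() fires, loads the baton and hands off to the
+ * split stack.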
While this method + * is a little verbose to implement, it has the advantage of being relatively + * robust (that is, broadly type-safe) while imposing minimal burden on each + * vnext_*() entry point. + * + * In terms of the implementation: + * + * - The FEM_BATON_n macros define the per-entry point baton structures + * - The fem_baton_payload_t contains the union of these structures + * - The FEM_VNEXTn_DECL macros declare the post-handoff entry point + * - The FEM_VNEXTn macros constitute the per-handoff entry point + * + * Note that we don't use variadic macros -- we define a variant of these + * macros for each of our relevant argument counts. This may seem overly + * explicit, but it is deliberate: the object here is to minimize the + * future maintenance burden by minimizing the likelihood of introduced + * error -- not to minimize the number of characters in this source file. + */ + +#ifndef STACK_GROWTH_DOWN +#error Downward stack growth assumed. +#endif + +int fem_stack_toodeep; +uintptr_t fem_stack_needed = 8 * 1024; +size_t fem_handoff_stacksize = 128 * 1024; + +#define FEM_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \ + (uintptr_t)curthread->t_stkbase < fem_stack_needed) + +#define FEM_BATON_1(what, t0, l0) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + } fb_##what + +#define FEM_BATON_2(what, t0, l0, t1, l1) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + } fb_##what + +#define FEM_BATON_3(what, t0, l0, t1, l1, t2, l2) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + } fb_##what + +#define FEM_BATON_4(what, t0, l0, t1, l1, t2, l2, t3, l3) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + } fb_##what + +#define FEM_BATON_5(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + } fb_##what + +#define FEM_BATON_6(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + } fb_##what + +#define FEM_BATON_8(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + } fb_##what + +#define FEM_BATON_9(what, t0, l0, t1, l1, t2, l2, t3, l3, t4, l4, t5, l5, \ + t6, l6, t7, l7, t8, l8) \ + struct { \ + void *fb_##what##_arg0; \ + caller_context_t *fb_##what##_ct; \ + t0 fb_##what##_##l0; \ + t1 fb_##what##_##l1; \ + t2 fb_##what##_##l2; \ + t3 fb_##what##_##l3; \ + t4 fb_##what##_##l4; \ + t5 fb_##what##_##l5; \ + t6 fb_##what##_##l6; \ + t7 fb_##what##_##l7; \ + t8 fb_##what##_##l8; \ + } fb_##what + +typedef union { + FEM_BATON_2(open, int, mode, cred_t *, cr); + FEM_BATON_4(close, int, flag, 
int, count, + offset_t, offset, cred_t *, cr); + FEM_BATON_3(read, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_3(write, uio_t *, uiop, int, ioflag, cred_t *, cr); + FEM_BATON_5(ioctl, int, cmd, intptr_t, arg, + int, flag, cred_t *, cr, int *, rvalp); + FEM_BATON_3(setfl, int, oflags, int, nflags, cred_t *, cr); + FEM_BATON_3(getattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(setattr, vattr_t *, vap, int, flags, cred_t *, cr); + FEM_BATON_3(access, int, mode, int, flags, cred_t *, cr); + FEM_BATON_8(lookup, char *, nm, vnode_t **, vpp, + pathname_t *, pnp, int, flags, vnode_t *, rdir, + cred_t *, cr, int *, direntflags, pathname_t *, realpnp); + FEM_BATON_8(create, char *, name, vattr_t *, vap, + vcexcl_t, excl, int, mode, vnode_t **, vpp, + cred_t *, cr, int, flag, vsecattr_t *, vsecp); + FEM_BATON_3(remove, char *, nm, cred_t *, cr, int, flags); + FEM_BATON_4(link, vnode_t *, svp, char *, tnm, + cred_t *, cr, int, flags); + FEM_BATON_5(rename, char *, snm, vnode_t *, tdvp, + char *, tnm, cred_t *, cr, int, flags); + FEM_BATON_6(mkdir, char *, dirname, vattr_t *, vap, + vnode_t **, vpp, cred_t *, cr, int, flags, + vsecattr_t *, vsecp); + FEM_BATON_4(rmdir, char *, nm, vnode_t *, cdir, + cred_t *, cr, int, flags); + FEM_BATON_4(readdir, uio_t *, uiop, cred_t *, cr, + int *, eofp, int, flags); + FEM_BATON_5(symlink, char *, linkname, vattr_t *, vap, + char *, target, cred_t *, cr, int, flags); + FEM_BATON_2(readlink, uio_t *, uiop, cred_t *, cr); + FEM_BATON_2(fsync, int, syncflag, cred_t *, cr); + FEM_BATON_1(inactive, cred_t *, cr); + FEM_BATON_1(fid, fid_t *, fidp); + FEM_BATON_1(rwlock, int, write_lock); + FEM_BATON_1(rwunlock, int, write_lock); + FEM_BATON_2(seek, offset_t, ooff, offset_t *, noffp); + FEM_BATON_1(cmp, vnode_t *, vp2); + FEM_BATON_6(frlock, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, struct flk_callback *, flk_cbp, + cred_t *, cr); + FEM_BATON_5(space, int, cmd, struct flock64 *, bfp, + int, flag, offset_t, offset, cred_t *, cr); + FEM_BATON_1(realvp, vnode_t **, vpp); + FEM_BATON_9(getpage, offset_t, off, size_t, len, + uint_t *, protp, struct page **, plarr, size_t, plsz, + struct seg *, seg, caddr_t, addr, enum seg_rw, rw, + cred_t *, cr); + FEM_BATON_4(putpage, offset_t, off, size_t, len, + int, flags, cred_t *, cr); + FEM_BATON_8(map, offset_t, off, struct as *, as, + caddr_t *, addrp, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(addmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uchar_t, prot, + uchar_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_8(delmap, offset_t, off, struct as *, as, + caddr_t, addr, size_t, len, uint_t, prot, + uint_t, maxprot, uint_t, flags, cred_t *, cr); + FEM_BATON_4(poll, short, events, int, anyyet, + short *, reventsp, struct pollhead **, phpp); + FEM_BATON_3(dump, caddr_t, addr, offset_t, lbdn, offset_t, dblks); + FEM_BATON_3(pathconf, int, cmd, ulong_t *, valp, cred_t *, cr); + FEM_BATON_5(pageio, struct page *, pp, u_offset_t, io_off, + size_t, io_len, int, flags, cred_t *, cr); + FEM_BATON_2(dumpctl, int, action, offset_t *, blkp); + FEM_BATON_4(dispose, struct page *, pp, int, flag, + int, dn, cred_t *, cr); + FEM_BATON_3(setsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_3(getsecattr, vsecattr_t *, vsap, int, flag, cred_t *, cr); + FEM_BATON_4(shrlock, int, cmd, struct shrlock *, shr, + int, flag, cred_t *, cr); + FEM_BATON_3(vnevent, vnevent_t, vnevent, vnode_t *, dvp, char *, cname); + 
FEM_BATON_3(reqzcbuf, enum uio_rw, ioflag, + xuio_t *, xuiop, cred_t *, cr); + FEM_BATON_2(retzcbuf, xuio_t *, xuiop, cred_t *, cr); +} fem_baton_payload_t; + +typedef struct { + fem_baton_payload_t fb_payload; + int (*fb_func)(); + void (*fb_handoff)(); + int fb_rval; +} fem_baton_t; + +static int +fem_handoff(fem_baton_t *bp) +{ + fem_stack_toodeep++; + thread_splitstack(bp->fb_handoff, bp, fem_handoff_stacksize); + + return (bp->fb_rval); +} + +#define FEM_VNEXT3_DECL(what, a0, a1, a2) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2); \ +} + +#define FEM_VNEXT4_DECL(what, a0, a1, a2, a3) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3); \ +} + +#define FEM_VNEXT5_DECL(what, a0, a1, a2, a3, a4) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4); \ +} + +#define FEM_VNEXT6_DECL(what, a0, a1, a2, a3, a4, a5) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5); \ +} + +#define FEM_VNEXT7_DECL(what, a0, a1, a2, a3, a4, a5, a6) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6); \ +} + +#define FEM_VNEXT8_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7); \ +} + +#define FEM_VNEXT10_DECL(what, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9); \ +} + +#define FEM_VNEXT11_DECL(what, a0, a1, a2, a3, a4, a5, 
a6, a7, a8, a9, a10) \ +void \ +fem_handoff_##what(fem_baton_t *bp) \ +{ \ + bp->fb_rval = bp->fb_func( \ + bp->fb_payload.fb_##what.fb_##what##_##a0, \ + bp->fb_payload.fb_##what.fb_##what##_##a1, \ + bp->fb_payload.fb_##what.fb_##what##_##a2, \ + bp->fb_payload.fb_##what.fb_##what##_##a3, \ + bp->fb_payload.fb_##what.fb_##what##_##a4, \ + bp->fb_payload.fb_##what.fb_##what##_##a5, \ + bp->fb_payload.fb_##what.fb_##what##_##a6, \ + bp->fb_payload.fb_##what.fb_##what##_##a7, \ + bp->fb_payload.fb_##what.fb_##what##_##a8, \ + bp->fb_payload.fb_##what.fb_##what##_##a9, \ + bp->fb_payload.fb_##what.fb_##what##_##a10); \ +} + +#define FEM_VNEXT3(what, func, a0, a1, a2) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2)) + +#define FEM_VNEXT4(what, func, a0, a1, a2, a3) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3)) + +#define FEM_VNEXT5(what, func, a0, a1, a2, a3, a4) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4)) + +#define FEM_VNEXT6(what, func, a0, a1, a2, a3, a4, a5) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5)) + +#define FEM_VNEXT7(what, func, a0, a1, a2, a3, a4, a5, a6) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + 
baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6)) + +#define FEM_VNEXT8(what, func, a0, a1, a2, a3, a4, a5, a6, a7) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7)) + +#define FEM_VNEXT10(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)) + +#define FEM_VNEXT11(what, func, a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \ + if (FEM_TOODEEP()) { \ + fem_baton_t *baton; \ + int rval; \ + \ + baton = kmem_alloc(sizeof (fem_baton_t), KM_SLEEP); \ + baton->fb_payload.fb_##what.fb_##what##_##a0 = a0; \ + baton->fb_payload.fb_##what.fb_##what##_##a1 = a1; \ + baton->fb_payload.fb_##what.fb_##what##_##a2 = a2; \ + baton->fb_payload.fb_##what.fb_##what##_##a3 = a3; \ + baton->fb_payload.fb_##what.fb_##what##_##a4 = a4; \ + baton->fb_payload.fb_##what.fb_##what##_##a5 = a5; \ + baton->fb_payload.fb_##what.fb_##what##_##a6 = a6; \ + baton->fb_payload.fb_##what.fb_##what##_##a7 = a7; \ + baton->fb_payload.fb_##what.fb_##what##_##a8 = a8; \ + baton->fb_payload.fb_##what.fb_##what##_##a9 = a9; \ + baton->fb_payload.fb_##what.fb_##what##_##a10 = a10; \ + baton->fb_handoff = fem_handoff_##what; \ + baton->fb_func = func; \ + \ + rval = fem_handoff(baton); \ + kmem_free(baton, sizeof (fem_baton_t)); \ + \ + return (rval); \ + } \ + return (func(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)) + static fem_t * fem_alloc() { @@ -2036,10 +2571,60 @@ static struct fs_operation_def fshead_vfs_spec[] = { * 5. Return by invoking the base operation with the base object. 
* * for each classification, there needs to be at least one "next" operation - * for each "head"operation. - * + * for each "head" operation. Note that we also use the FEM_VNEXTn_DECL macros + * to define the function to run when the stack is split; see the discussion + * on "File event monitoring handoffs", above. */ +FEM_VNEXT4_DECL(open, arg0, mode, cr, ct) +FEM_VNEXT6_DECL(close, arg0, flag, count, offset, cr, ct) +FEM_VNEXT5_DECL(read, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT5_DECL(write, arg0, uiop, ioflag, cr, ct) +FEM_VNEXT7_DECL(ioctl, arg0, cmd, arg, flag, cr, rvalp, ct) +FEM_VNEXT5_DECL(setfl, arg0, oflags, nflags, cr, ct) +FEM_VNEXT5_DECL(getattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(setattr, arg0, vap, flags, cr, ct) +FEM_VNEXT5_DECL(access, arg0, mode, flags, cr, ct) +FEM_VNEXT10_DECL(lookup, arg0, nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp) +FEM_VNEXT10_DECL(create, arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp) +FEM_VNEXT5_DECL(remove, arg0, nm, cr, ct, flags) +FEM_VNEXT6_DECL(link, arg0, svp, tnm, cr, ct, flags) +FEM_VNEXT7_DECL(rename, arg0, snm, tdvp, tnm, cr, ct, flags) +FEM_VNEXT8_DECL(mkdir, arg0, dirname, vap, vpp, cr, ct, flags, vsecp) +FEM_VNEXT6_DECL(rmdir, arg0, nm, cdir, cr, ct, flags) +FEM_VNEXT6_DECL(readdir, arg0, uiop, cr, eofp, ct, flags) +FEM_VNEXT7_DECL(symlink, arg0, linkname, vap, target, cr, ct, flags) +FEM_VNEXT4_DECL(readlink, arg0, uiop, cr, ct) +FEM_VNEXT4_DECL(fsync, arg0, syncflag, cr, ct) +FEM_VNEXT3_DECL(fid, arg0, fidp, ct) +FEM_VNEXT3_DECL(rwlock, arg0, write_lock, ct) +FEM_VNEXT4_DECL(seek, arg0, ooff, noffp, ct) +FEM_VNEXT3_DECL(cmp, arg0, vp2, ct) +FEM_VNEXT8_DECL(frlock, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct) +FEM_VNEXT7_DECL(space, arg0, cmd, bfp, flag, offset, cr, ct) +FEM_VNEXT3_DECL(realvp, arg0, vpp, ct) +FEM_VNEXT11_DECL(getpage, arg0, off, len, protp, plarr, plsz, + seg, addr, rw, cr, ct) +FEM_VNEXT6_DECL(putpage, arg0, off, len, flags, cr, ct) +FEM_VNEXT10_DECL(map, arg0, off, as, addrp, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(addmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT10_DECL(delmap, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct) +FEM_VNEXT6_DECL(poll, arg0, events, anyyet, reventsp, phpp, ct) +FEM_VNEXT5_DECL(dump, arg0, addr, lbdn, dblks, ct) +FEM_VNEXT5_DECL(pathconf, arg0, cmd, valp, cr, ct) +FEM_VNEXT7_DECL(pageio, arg0, pp, io_off, io_len, flags, cr, ct) +FEM_VNEXT4_DECL(dumpctl, arg0, action, blkp, ct) +FEM_VNEXT5_DECL(setsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT5_DECL(getsecattr, arg0, vsap, flag, cr, ct) +FEM_VNEXT6_DECL(shrlock, arg0, cmd, shr, flag, cr, ct) +FEM_VNEXT5_DECL(vnevent, arg0, vnevent, dvp, cname, ct) +FEM_VNEXT5_DECL(reqzcbuf, arg0, ioflag, xuiop, cr, ct) +FEM_VNEXT4_DECL(retzcbuf, arg0, xuiop, cr, ct) + int vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) { @@ -2051,7 +2636,7 @@ vnext_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_open, femop_open); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, cr, ct)); + FEM_VNEXT4(open, func, arg0, mode, cr, ct); } int @@ -2066,7 +2651,7 @@ vnext_close(femarg_t *vf, int flag, int count, offset_t offset, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_close, femop_close); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, flag, count, offset, cr, ct)); + FEM_VNEXT6(close, func, arg0, flag, count, offset, cr, ct); } int @@ -2081,7 +2666,7 @@ 
vnext_read(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_read, femop_read); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(read, func, arg0, uiop, ioflag, cr, ct); } int @@ -2096,7 +2681,7 @@ vnext_write(femarg_t *vf, uio_t *uiop, int ioflag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_write, femop_write); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, ioflag, cr, ct)); + FEM_VNEXT5(write, func, arg0, uiop, ioflag, cr, ct); } int @@ -2111,7 +2696,7 @@ vnext_ioctl(femarg_t *vf, int cmd, intptr_t arg, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_ioctl, femop_ioctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, arg, flag, cr, rvalp, ct)); + FEM_VNEXT7(ioctl, func, arg0, cmd, arg, flag, cr, rvalp, ct); } int @@ -2126,7 +2711,7 @@ vnext_setfl(femarg_t *vf, int oflags, int nflags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setfl, femop_setfl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, oflags, nflags, cr, ct)); + FEM_VNEXT5(setfl, func, arg0, oflags, nflags, cr, ct); } int @@ -2141,7 +2726,7 @@ vnext_getattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getattr, femop_getattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(getattr, func, arg0, vap, flags, cr, ct); } int @@ -2156,7 +2741,7 @@ vnext_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setattr, femop_setattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vap, flags, cr, ct)); + FEM_VNEXT5(setattr, func, arg0, vap, flags, cr, ct); } int @@ -2171,7 +2756,7 @@ vnext_access(femarg_t *vf, int mode, int flags, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_access, femop_access); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, mode, flags, cr, ct)); + FEM_VNEXT5(access, func, arg0, mode, flags, cr, ct); } int @@ -2187,8 +2772,8 @@ vnext_lookup(femarg_t *vf, char *nm, vnode_t **vpp, pathname_t *pnp, vsop_find(vf, &func, int, &arg0, vop_lookup, femop_lookup); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, vpp, pnp, flags, rdir, cr, ct, - direntflags, realpnp)); + FEM_VNEXT10(lookup, func, arg0, nm, vpp, pnp, flags, rdir, cr, ct, + direntflags, realpnp); } int @@ -2204,7 +2789,8 @@ vnext_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl, vsop_find(vf, &func, int, &arg0, vop_create, femop_create); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, name, vap, excl, mode, vpp, cr, flag, ct, vsecp)); + FEM_VNEXT10(create, func, arg0, name, vap, excl, + mode, vpp, cr, flag, ct, vsecp); } int @@ -2219,7 +2805,7 @@ vnext_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct, vsop_find(vf, &func, int, &arg0, vop_remove, femop_remove); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cr, ct, flags)); + FEM_VNEXT5(remove, func, arg0, nm, cr, ct, flags); } int @@ -2234,7 +2820,7 @@ vnext_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_link, femop_link); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, svp, tnm, cr, ct, flags)); + FEM_VNEXT6(link, func, arg0, svp, tnm, cr, ct, flags); } int @@ -2249,7 +2835,7 @@ vnext_rename(femarg_t *vf, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, vsop_find(vf, &func, int, &arg0, 
vop_rename, femop_rename); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, snm, tdvp, tnm, cr, ct, flags)); + FEM_VNEXT7(rename, func, arg0, snm, tdvp, tnm, cr, ct, flags); } int @@ -2264,7 +2850,7 @@ vnext_mkdir(femarg_t *vf, char *dirname, vattr_t *vap, vnode_t **vpp, vsop_find(vf, &func, int, &arg0, vop_mkdir, femop_mkdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, dirname, vap, vpp, cr, ct, flags, vsecp)); + FEM_VNEXT8(mkdir, func, arg0, dirname, vap, vpp, cr, ct, flags, vsecp); } int @@ -2279,7 +2865,7 @@ vnext_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_rmdir, femop_rmdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, nm, cdir, cr, ct, flags)); + FEM_VNEXT6(rmdir, func, arg0, nm, cdir, cr, ct, flags); } int @@ -2294,7 +2880,7 @@ vnext_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp, vsop_find(vf, &func, int, &arg0, vop_readdir, femop_readdir); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, eofp, ct, flags)); + FEM_VNEXT6(readdir, func, arg0, uiop, cr, eofp, ct, flags); } int @@ -2309,7 +2895,7 @@ vnext_symlink(femarg_t *vf, char *linkname, vattr_t *vap, char *target, vsop_find(vf, &func, int, &arg0, vop_symlink, femop_symlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, linkname, vap, target, cr, ct, flags)); + FEM_VNEXT7(symlink, func, arg0, linkname, vap, target, cr, ct, flags); } int @@ -2323,7 +2909,7 @@ vnext_readlink(femarg_t *vf, uio_t *uiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_readlink, femop_readlink); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, uiop, cr, ct)); + FEM_VNEXT4(readlink, func, arg0, uiop, cr, ct); } int @@ -2337,7 +2923,7 @@ vnext_fsync(femarg_t *vf, int syncflag, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fsync, femop_fsync); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, syncflag, cr, ct)); + FEM_VNEXT4(fsync, func, arg0, syncflag, cr, ct); } void @@ -2365,7 +2951,7 @@ vnext_fid(femarg_t *vf, fid_t *fidp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_fid, femop_fid); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, fidp, ct)); + FEM_VNEXT3(fid, func, arg0, fidp, ct); } int @@ -2379,7 +2965,7 @@ vnext_rwlock(femarg_t *vf, int write_lock, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_rwlock, femop_rwlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, write_lock, ct)); + FEM_VNEXT3(rwlock, func, arg0, write_lock, ct); } void @@ -2407,7 +2993,7 @@ vnext_seek(femarg_t *vf, offset_t ooff, offset_t *noffp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_seek, femop_seek); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ooff, noffp, ct)); + FEM_VNEXT4(seek, func, arg0, ooff, noffp, ct); } int @@ -2421,7 +3007,7 @@ vnext_cmp(femarg_t *vf, vnode_t *vp2, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_cmp, femop_cmp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vp2, ct)); + FEM_VNEXT3(cmp, func, arg0, vp2, ct); } int @@ -2437,7 +3023,7 @@ vnext_frlock(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_frlock, femop_frlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct)); + FEM_VNEXT8(frlock, func, arg0, cmd, bfp, flag, offset, flk_cbp, cr, ct); } int 
@@ -2452,7 +3038,7 @@ vnext_space(femarg_t *vf, int cmd, struct flock64 *bfp, int flag, vsop_find(vf, &func, int, &arg0, vop_space, femop_space); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, bfp, flag, offset, cr, ct)); + FEM_VNEXT7(space, func, arg0, cmd, bfp, flag, offset, cr, ct); } int @@ -2466,7 +3052,7 @@ vnext_realvp(femarg_t *vf, vnode_t **vpp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_realvp, femop_realvp); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vpp, ct)); + FEM_VNEXT3(realvp, func, arg0, vpp, ct); } int @@ -2482,8 +3068,8 @@ vnext_getpage(femarg_t *vf, offset_t off, size_t len, uint_t *protp, vsop_find(vf, &func, int, &arg0, vop_getpage, femop_getpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, protp, plarr, plsz, seg, addr, rw, - cr, ct)); + FEM_VNEXT11(getpage, func, arg0, off, len, protp, + plarr, plsz, seg, addr, rw, cr, ct); } int @@ -2498,7 +3084,7 @@ vnext_putpage(femarg_t *vf, offset_t off, size_t len, int flags, vsop_find(vf, &func, int, &arg0, vop_putpage, femop_putpage); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, len, flags, cr, ct)); + FEM_VNEXT6(putpage, func, arg0, off, len, flags, cr, ct); } int @@ -2514,8 +3100,8 @@ vnext_map(femarg_t *vf, offset_t off, struct as *as, caddr_t *addrp, vsop_find(vf, &func, int, &arg0, vop_map, femop_map); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addrp, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(map, func, arg0, off, as, addrp, len, prot, maxprot, flags, + cr, ct); } int @@ -2531,8 +3117,8 @@ vnext_addmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_addmap, femop_addmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(addmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2548,8 +3134,8 @@ vnext_delmap(femarg_t *vf, offset_t off, struct as *as, caddr_t addr, vsop_find(vf, &func, int, &arg0, vop_delmap, femop_delmap); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, off, as, addr, len, prot, maxprot, flags, - cr, ct)); + FEM_VNEXT10(delmap, func, arg0, off, as, addr, len, prot, maxprot, + flags, cr, ct); } int @@ -2564,7 +3150,7 @@ vnext_poll(femarg_t *vf, short events, int anyyet, short *reventsp, vsop_find(vf, &func, int, &arg0, vop_poll, femop_poll); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, events, anyyet, reventsp, phpp, ct)); + FEM_VNEXT6(poll, func, arg0, events, anyyet, reventsp, phpp, ct); } int @@ -2579,7 +3165,7 @@ vnext_dump(femarg_t *vf, caddr_t addr, offset_t lbdn, offset_t dblks, vsop_find(vf, &func, int, &arg0, vop_dump, femop_dump); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, addr, lbdn, dblks, ct)); + FEM_VNEXT5(dump, func, arg0, addr, lbdn, dblks, ct); } int @@ -2594,7 +3180,7 @@ vnext_pathconf(femarg_t *vf, int cmd, ulong_t *valp, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_pathconf, femop_pathconf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, valp, cr, ct)); + FEM_VNEXT5(pathconf, func, arg0, cmd, valp, cr, ct); } int @@ -2609,7 +3195,7 @@ vnext_pageio(femarg_t *vf, struct page *pp, u_offset_t io_off, vsop_find(vf, &func, int, &arg0, vop_pageio, femop_pageio); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, pp, io_off, io_len, flags, cr, ct)); + 
FEM_VNEXT7(pageio, func, arg0, pp, io_off, io_len, flags, cr, ct); } int @@ -2623,7 +3209,7 @@ vnext_dumpctl(femarg_t *vf, int action, offset_t *blkp, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_dumpctl, femop_dumpctl); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, action, blkp, ct)); + FEM_VNEXT4(dumpctl, func, arg0, action, blkp, ct); } void @@ -2653,7 +3239,7 @@ vnext_setsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_setsecattr, femop_setsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(setsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2668,7 +3254,7 @@ vnext_getsecattr(femarg_t *vf, vsecattr_t *vsap, int flag, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_getsecattr, femop_getsecattr); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vsap, flag, cr, ct)); + FEM_VNEXT5(getsecattr, func, arg0, vsap, flag, cr, ct); } int @@ -2683,7 +3269,7 @@ vnext_shrlock(femarg_t *vf, int cmd, struct shrlock *shr, int flag, vsop_find(vf, &func, int, &arg0, vop_shrlock, femop_shrlock); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, cmd, shr, flag, cr, ct)); + FEM_VNEXT6(shrlock, func, arg0, cmd, shr, flag, cr, ct); } int @@ -2698,7 +3284,7 @@ vnext_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *cname, vsop_find(vf, &func, int, &arg0, vop_vnevent, femop_vnevent); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, vnevent, dvp, cname, ct)); + FEM_VNEXT5(vnevent, func, arg0, vnevent, dvp, cname, ct); } int @@ -2713,7 +3299,7 @@ vnext_reqzcbuf(femarg_t *vf, enum uio_rw ioflag, xuio_t *xuiop, cred_t *cr, vsop_find(vf, &func, int, &arg0, vop_reqzcbuf, femop_reqzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, ioflag, xuiop, cr, ct)); + FEM_VNEXT5(reqzcbuf, func, arg0, ioflag, xuiop, cr, ct); } int @@ -2727,7 +3313,7 @@ vnext_retzcbuf(femarg_t *vf, xuio_t *xuiop, cred_t *cr, caller_context_t *ct) vsop_find(vf, &func, int, &arg0, vop_retzcbuf, femop_retzcbuf); ASSERT(func != NULL); ASSERT(arg0 != NULL); - return ((*func)(arg0, xuiop, cr, ct)); + FEM_VNEXT4(retzcbuf, func, arg0, xuiop, cr, ct); } int diff --git a/usr/src/uts/common/fs/fifofs/fifosubr.c b/usr/src/uts/common/fs/fifofs/fifosubr.c index 6e56000ffe..56204c6741 100644 --- a/usr/src/uts/common/fs/fifofs/fifosubr.c +++ b/usr/src/uts/common/fs/fifofs/fifosubr.c @@ -614,9 +614,12 @@ fifo_stropen(vnode_t **vpp, int flag, cred_t *crp, int dotwist, int lockheld) /* * The other end of the pipe is almost closed so * reject any other open on this end of the pipe - * This only happens with a pipe mounted under namefs + * This normally only happens with a pipe mounted under namefs, but + * we can also see an open via proc/fd, which should still succeed. + * To indicate the proc/fd case the FKLYR flag is passed. */ - if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE)) { + if ((fnp->fn_flag & (FIFOCLOSE|ISPIPE)) == (FIFOCLOSE|ISPIPE) && + (flag & FKLYR) == 0) { fifo_cleanup(oldvp, flag); cv_broadcast(&fnp->fn_wait_cv); if (!lockheld) diff --git a/usr/src/uts/common/fs/fifofs/fifovnops.c b/usr/src/uts/common/fs/fifofs/fifovnops.c index ac89e430c7..fee2924093 100644 --- a/usr/src/uts/common/fs/fifofs/fifovnops.c +++ b/usr/src/uts/common/fs/fifofs/fifovnops.c @@ -27,7 +27,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. 
All rights reserved. + */ /* * FIFOFS file system vnode operations. This file system @@ -1832,17 +1834,16 @@ fifo_poll(vnode_t *vp, short events, int anyyet, short *reventsp, } /* - * if we happened to get something, return + * if we happened to get something and we're not edge-triggered, return */ - - if ((*reventsp = (short)retevents) != 0) { + if ((*reventsp = (short)retevents) != 0 && !(events & POLLET)) { mutex_exit(&fnp->fn_lock->flk_lock); return (0); } /* - * If poll() has not found any events yet, set up event cell - * to wake up the poll if a requested event occurs on this + * If poll() has not found any events yet or we're edge-triggered, set + * up event cell to wake up the poll if a requested event occurs on this * pipe/fifo. */ if (!anyyet) { diff --git a/usr/src/uts/common/fs/fs_subr.c b/usr/src/uts/common/fs/fs_subr.c index 744d269716..ec5b145a86 100644 --- a/usr/src/uts/common/fs/fs_subr.c +++ b/usr/src/uts/common/fs/fs_subr.c @@ -25,6 +25,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* @@ -246,6 +247,7 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, int frcmd; int nlmid; int error = 0; + boolean_t skip_lock = B_FALSE; flk_callback_t serialize_callback; int serialize = 0; v_mode_t mode; @@ -265,6 +267,17 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, } break; + case F_OFD_GETLK: + /* + * TBD we do not support remote OFD locks at this time. + */ + if (flag & (F_REMOTELOCK | F_PXFSLOCK)) { + error = EINVAL; + goto done; + } + skip_lock = B_TRUE; + break; + case F_SETLK_NBMAND: /* * Are NBMAND locks allowed on this file? @@ -326,6 +339,20 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, } break; + case F_OFD_SETLK: + case F_OFD_SETLKW: + case F_FLOCK: + case F_FLOCKW: + /* + * TBD we do not support remote OFD locks at this time. + */ + if (flag & (F_REMOTELOCK | F_PXFSLOCK)) { + error = EINVAL; + goto done; + } + skip_lock = B_TRUE; + break; + case F_HASREMOTELOCKS: nlmid = GETNLMID(bfp->l_sysid); if (nlmid != 0) { /* booted as a cluster */ @@ -354,7 +381,8 @@ fs_frlock(register vnode_t *vp, int cmd, struct flock64 *bfp, int flag, flk_cbp = &serialize_callback; } - error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp); + if (!skip_lock) + error = reclock(vp, bfp, frcmd, flag, offset, flk_cbp); done: if (serialize) diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c new file mode 100644 index 0000000000..05ee2c6e09 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_dir.c @@ -0,0 +1,640 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
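 *
 * [Editor's note on the fs_subr.c hunks above] The new F_OFD_GETLK,
 * F_OFD_SETLK/F_OFD_SETLKW and F_FLOCK/F_FLOCKW cases reject remote
 * use (F_REMOTELOCK/F_PXFSLOCK) and set skip_lock so reclock() is
 * bypassed. OFD-style locks belong to the open file description
 * rather than the owning process, which is what makes them usable
 * from multithreaded programs. A hypothetical user-land sketch, for
 * illustration only:
 *
 *	struct flock fl;
 *
 *	bzero(&fl, sizeof (fl));
 *	fl.l_type = F_WRLCK;
 *	fl.l_whence = SEEK_SET;
 *	fl.l_start = 0;
 *	fl.l_len = 0;
 *	if (fcntl(fd, F_OFD_SETLKW, &fl) != 0)
 *		err(1, "F_OFD_SETLKW");
 *
 * Descriptors from two separate open(2) calls conflict even within a
 * single process, while dup(2)ed descriptors share the lock.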
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/stat.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +static int hldir_make_hlnode(hlnode_t *, hlfsmount_t *, vattr_t *, enum de_op, + vnode_t *, hlnode_t **, cred_t *); +static int hldiraddentry(hlnode_t *, hlnode_t *, char *); + + +#define HL_HASH_SIZE 8192 /* must be power of 2 */ +#define HL_MUTEX_SIZE 64 + +static hldirent_t *hl_hashtable[HL_HASH_SIZE]; +static kmutex_t hl_hashmutex[HL_MUTEX_SIZE]; + +#define HL_HASH_INDEX(a) ((a) & (HL_HASH_SIZE-1)) +#define HL_MUTEX_INDEX(a) ((a) & (HL_MUTEX_SIZE-1)) + +#define HYPRLOFS_HASH(tp, name, hash) \ + { \ + char Xc, *Xcp; \ + hash = (uint_t)(uintptr_t)(tp) >> 8; \ + for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++) \ + hash = (hash << 4) + hash + (uint_t)Xc; \ + } + +void +hyprlofs_hash_init(void) +{ + int ix; + + for (ix = 0; ix < HL_MUTEX_SIZE; ix++) + mutex_init(&hl_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL); +} + +static void +hyprlofs_hash_in(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + HYPRLOFS_HASH(h->hld_parent, h->hld_name, hash); + h->hld_hash = hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + h->hld_link = *prevpp; + *prevpp = h; + mutex_exit(hmtx); +} + +/* Remove hldirent *h from the hash list. */ +static void +hyprlofs_hash_out(hldirent_t *h) +{ + uint_t hash; + hldirent_t **prevpp; + kmutex_t *hmtx; + + hash = h->hld_hash; + prevpp = &hl_hashtable[HL_HASH_INDEX(hash)]; + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + while (*prevpp != h) + prevpp = &(*prevpp)->hld_link; + *prevpp = h->hld_link; + mutex_exit(hmtx); +} + +static hldirent_t * +hyprlofs_hash_lookup(char *name, hlnode_t *parent, uint_t hold, + hlnode_t **found) +{ + hldirent_t *l; + uint_t hash; + kmutex_t *hmtx; + hlnode_t *hnp; + + HYPRLOFS_HASH(parent, name, hash); + hmtx = &hl_hashmutex[HL_MUTEX_INDEX(hash)]; + mutex_enter(hmtx); + l = hl_hashtable[HL_HASH_INDEX(hash)]; + while (l) { + if (l->hld_hash == hash && l->hld_parent == parent && + strcmp(l->hld_name, name) == 0) { + /* + * Ensure that the hlnode that we put a hold on is the + * same one that we pass back. Thus the temp. var + * hnp is necessary. + */ + hnp = l->hld_hlnode; + if (hold) { + ASSERT(hnp); + hlnode_hold(hnp); + } + if (found) + *found = hnp; + mutex_exit(hmtx); + return (l); + } else { + l = l->hld_link; + } + } + mutex_exit(hmtx); + return (NULL); +} + +/* + * Search directory 'parent' for entry 'name'. + * + * The calling thread can't hold the write version of the rwlock for the + * directory being searched + * + * On success *foundtp points to the found hlnode with its vnode held. 
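 *
 * [Editor's note] The HYPRLOFS_HASH() macro above mixes the parent
 * directory's address with the entry name; written out as a function
 * (an equivalent sketch, with an invented name, not part of the
 * change):
 *
 *	static uint_t
 *	hl_name_hash(const void *parent, const char *name)
 *	{
 *		uint_t hash = (uint_t)(uintptr_t)parent >> 8;
 *		char c;
 *
 *		while ((c = *name++) != '\0')
 *			hash = (hash << 4) + hash + (uint_t)c;
 *
 *		return (hash);
 *	}
 *
 * Each step computes hash * 17 + c. The hash selects one of the 8192
 * buckets, while the smaller array of 64 mutexes stripes the buckets
 * so unrelated lookups rarely contend on the same lock.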
+ */ +int +hyprlofs_dirlookup(hlnode_t *parent, char *name, hlnode_t **foundtp, cred_t *cr) +{ + int error; + + *foundtp = NULL; + if (parent->hln_type != VDIR) + return (ENOTDIR); + + if ((error = hyprlofs_taccess(parent, VEXEC, cr))) + return (error); + + if (*name == '\0') { + hlnode_hold(parent); + *foundtp = parent; + return (0); + } + + /* + * Search the directory for the matching name. We need the lock + * protecting the hln_dir list so that it doesn't change out from + * underneath us. hyprlofs_hash_lookup() will pass back the hlnode + * with a hold on it. + */ + if (hyprlofs_hash_lookup(name, parent, 1, foundtp) != NULL) { + ASSERT(*foundtp); + return (0); + } + + return (ENOENT); +} + +/* + * Enter a directory entry (either a file or subdir, depending on op) for + * 'name' and 'hp' into directory 'dir' + */ +int +hyprlofs_direnter( + hlfsmount_t *hm, + hlnode_t *dir, /* target directory to make entry in */ + char *name, /* name of entry */ + enum de_op op, /* entry operation */ + vnode_t *realvp, /* real vnode */ + vattr_t *va, + hlnode_t **hpp, /* return hlnode */ + cred_t *cr) +{ + hldirent_t *hdp; + hlnode_t *found = NULL; + hlnode_t *hp; + int error = 0; + char *s; + + /* hln_rwlock is held to serialize direnter and dirdeletes */ + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + /* Don't allow '/' characters in pathname component */ + for (s = name; *s; s++) + if (*s == '/') + return (EACCES); + + if (name[0] == '\0') + panic("hyprlofs_direnter: NULL name"); + + /* + * This might be a "dangling detached directory". It could have been + * removed, but a reference to it kept in u_cwd. Don't bother searching + * it, and with any luck the user will get tired of dealing with us and + * cd to some absolute pathway. This is in ufs, too. + */ + if (dir->hln_nlink == 0) { + return (ENOENT); + } + + /* Search for the entry. Return "found" if it exists. */ + hdp = hyprlofs_hash_lookup(name, dir, 1, &found); + + if (hdp) { + ASSERT(found); + switch (op) { + case DE_CREATE: + case DE_MKDIR: + if (hpp) { + *hpp = found; + error = EEXIST; + } else { + hlnode_rele(found); + } + break; + } + } else { + + /* + * The entry does not exist. Check write perms in dir to see if + * entry can be created. + */ + if ((error = hyprlofs_taccess(dir, VWRITE, cr))) + return (error); + + /* Make new hlnode and directory entry as required. */ + if ((error = hldir_make_hlnode(dir, hm, va, op, realvp, &hp, + cr))) + return (error); + + if ((error = hldiraddentry(dir, hp, name))) { + /* Unmake the inode we just made. */ + rw_enter(&hp->hln_rwlock, RW_WRITER); + if ((hp->hln_type) == VDIR) { + ASSERT(hdp == NULL); + /* cleanup allocs made by hyprlofs_dirinit() */ + hyprlofs_dirtrunc(hp); + } + mutex_enter(&hp->hln_tlock); + hp->hln_nlink = 0; + mutex_exit(&hp->hln_tlock); + gethrestime(&hp->hln_ctime); + rw_exit(&hp->hln_rwlock); + hlnode_rele(hp); + hp = NULL; + } else if (hpp) { + *hpp = hp; + } else { + hlnode_rele(hp); + } + } + + return (error); +} + +/* + * Delete entry hp of name "nm" from dir. Free dir entry space and decrement + * link count on hlnode(s). + */ +int +hyprlofs_dirdelete(hlnode_t *dir, hlnode_t *hp, char *nm, enum dr_op op, + cred_t *cr) +{ + hldirent_t *hpdp; + int error; + size_t namelen; + hlnode_t *hnp; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(RW_WRITE_HELD(&hp->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (nm[0] == '\0') + panic("hyprlofs_dirdelete: NULL name for %p", (void *)hp); + + /* return error if removing . 
or .. */ + if (nm[0] == '.') { + if (nm[1] == '\0') + return (EINVAL); + if (nm[1] == '.' && nm[2] == '\0') + return (EEXIST); /* thus in ufs */ + } + + if ((error = hyprlofs_taccess(dir, VEXEC|VWRITE, cr)) != 0) + return (error); + + if (dir->hln_dir == NULL) + return (ENOENT); + + hpdp = hyprlofs_hash_lookup(nm, dir, 0, &hnp); + if (hpdp == NULL) { + /* + * If it is gone, some other thread got here first! + * Return error ENOENT. + */ + return (ENOENT); + } + + /* + * If the hlnode in the hldirent changed (shouldn't happen since we + * don't support rename) then original is gone, so return that status + * (same as UFS). + */ + if (hp != hnp) + return (ENOENT); + + hyprlofs_hash_out(hpdp); + + /* Take hpdp out of the directory list. */ + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + if (hpdp->hld_prev) { + hpdp->hld_prev->hld_next = hpdp->hld_next; + } + if (hpdp->hld_next) { + hpdp->hld_next->hld_prev = hpdp->hld_prev; + } + + /* + * If the roving slot pointer happens to match hpdp, point it at the + * previous dirent. + */ + if (dir->hln_dir->hld_prev == hpdp) { + dir->hln_dir->hld_prev = hpdp->hld_prev; + } + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + /* hpdp points to the correct directory entry */ + namelen = strlen(hpdp->hld_name) + 1; + + kmem_free(hpdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + hp->hln_ctime = now; + + ASSERT(hp->hln_nlink > 0); + DECR_COUNT(&hp->hln_nlink, &hp->hln_tlock); + if (op == DR_RMDIR && hp->hln_type == VDIR) { + hyprlofs_dirtrunc(hp); + ASSERT(hp->hln_nlink == 0); + } + return (0); +} + +/* + * hyprlofs_dirinit initializes a dir with '.' and '..' entries without + * checking perms and locking + */ +void +hyprlofs_dirinit( + hlnode_t *parent, /* parent of directory to initialize */ + hlnode_t *dir) /* the new directory */ +{ + hldirent_t *dot, *dotdot; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&parent->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + dot = kmem_zalloc(sizeof (hldirent_t) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (hldirent_t) + 3, KM_SLEEP); + + /* Initialize the entries */ + dot->hld_hlnode = dir; + dot->hld_offset = 0; + dot->hld_name = (char *)dot + sizeof (hldirent_t); + dot->hld_name[0] = '.'; + dot->hld_parent = dir; + hyprlofs_hash_in(dot); + + dotdot->hld_hlnode = parent; + dotdot->hld_offset = 1; + dotdot->hld_name = (char *)dotdot + sizeof (hldirent_t); + dotdot->hld_name[0] = '.'; + dotdot->hld_name[1] = '.'; + dotdot->hld_parent = dir; + hyprlofs_hash_in(dotdot); + + /* Initialize directory entry list. */ + dot->hld_next = dotdot; + dot->hld_prev = dotdot; + dotdot->hld_next = NULL; + dotdot->hld_prev = dot; + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + /* + * Since hyprlofs_dirinit is called with both dir and parent being the + * same for the root vnode, we need to increment this before we set + * hln_nlink = 2 below. + */ + INCR_COUNT(&parent->hln_nlink, &parent->hln_tlock); + parent->hln_ctime = now; + + dir->hln_dir = dot; + dir->hln_size = 2 * sizeof (hldirent_t) + 5; /* dot and dotdot */ + dir->hln_dirents = 2; + dir->hln_nlink = 2; +} + + +/* + * hyprlofs_dirtrunc removes all dir entries under this dir. 
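 *
 * [Editor's note on hyprlofs_dirinit() above] The constant in
 * "dir->hln_size = 2 * sizeof (hldirent_t) + 5" is the name storage
 * for the two initial entries: "." occupies 2 bytes and ".." occupies
 * 3, each including its terminating NUL, matching the
 * kmem_zalloc(sizeof (hldirent_t) + 2) and "+ 3" allocations there.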
+ */ +void +hyprlofs_dirtrunc(hlnode_t *dir) +{ + hldirent_t *hdp; + hlnode_t *tp; + size_t namelen; + timestruc_t now; + + ASSERT(RW_WRITE_HELD(&dir->hln_rwlock)); + ASSERT(dir->hln_type == VDIR); + + if (dir->hln_looped) + return; + + for (hdp = dir->hln_dir; hdp; hdp = dir->hln_dir) { + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hdp->hld_hlnode); + + dir->hln_dir = hdp->hld_next; + namelen = strlen(hdp->hld_name) + 1; + + /* + * Adjust the link counts to account for this dir entry removal. + */ + tp = hdp->hld_hlnode; + + ASSERT(tp->hln_nlink > 0); + DECR_COUNT(&tp->hln_nlink, &tp->hln_tlock); + + hyprlofs_hash_out(hdp); + + kmem_free(hdp, sizeof (hldirent_t) + namelen); + dir->hln_size -= (sizeof (hldirent_t) + namelen); + dir->hln_dirents--; + } + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + ASSERT(dir->hln_dir == NULL); + ASSERT(dir->hln_size == 0); + ASSERT(dir->hln_dirents == 0); +} + +static int +hldiraddentry( + hlnode_t *dir, /* target directory to make entry in */ + hlnode_t *hp, /* new hlnode */ + char *name) +{ + hldirent_t *hdp, *hpdp; + size_t namelen, alloc_size; + timestruc_t now; + + /* + * Make sure the parent dir wasn't removed from underneath the caller. + */ + if (dir->hln_dir == NULL) + return (ENOENT); + + /* Check that everything is on the same FS. */ + if (hp->hln_vnode->v_vfsp != dir->hln_vnode->v_vfsp) + return (EXDEV); + + /* Alloc and init dir entry */ + namelen = strlen(name) + 1; + alloc_size = namelen + sizeof (hldirent_t); + hdp = kmem_zalloc(alloc_size, KM_NORMALPRI | KM_NOSLEEP); + if (hdp == NULL) + return (ENOSPC); + + dir->hln_size += alloc_size; + dir->hln_dirents++; + hdp->hld_hlnode = hp; + hdp->hld_parent = dir; + + /* The dir entry and its name were allocated sequentially. */ + hdp->hld_name = (char *)hdp + sizeof (hldirent_t); + (void) strcpy(hdp->hld_name, name); + + hyprlofs_hash_in(hdp); + + /* + * Some utilities expect the size of a directory to remain fairly + * static. For example, a routine which unlinks files between calls to + * readdir(); the size of the dir changes from underneath it and so the + * real dir offset in bytes is invalid. To circumvent this problem, we + * initialize a dir entry with a phony offset, and use this offset to + * determine end of file in hyprlofs_readdir. + */ + hpdp = dir->hln_dir->hld_prev; + /* + * Install at first empty "slot" in directory list. + */ + while (hpdp->hld_next != NULL && (hpdp->hld_next->hld_offset - + hpdp->hld_offset) <= 1) { + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + ASSERT(hpdp->hld_next->hld_offset > hpdp->hld_offset); + hpdp = hpdp->hld_next; + } + hdp->hld_offset = hpdp->hld_offset + 1; + + /* + * If we're at the end of the dirent list and the offset (which is + * necessarily the largest offset in this dir) is more than twice the + * number of dirents, that means the dir is 50% holes. At this point + * we reset the slot pointer back to the beginning of the dir so we + * start using the holes. The idea is that if there are N dirents, + * there must also be N holes, so we can satisfy the next N creates by + * walking at most 2N entries; thus the average cost of a create is + * constant. Note that we use the first dirent's hld_prev as the roving + * slot pointer. This saves a word in every dirent. 
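 *
 * [Editor's sketch of the scheme just described, in miniature; the
 * names are invented and this is not part of the change] Entries
 * carry strictly increasing integer offsets; a create walks forward
 * from the roving pointer to the first gap between neighbours and
 * claims the offset after it:
 *
 *	struct slot {
 *		struct slot *next;
 *		unsigned int off;
 *	};
 *
 *	static unsigned int
 *	claim_slot(struct slot *rover)
 *	{
 *		while (rover->next != NULL &&
 *		    rover->next->off - rover->off <= 1)
 *			rover = rover->next;
 *		return (rover->off + 1);
 *	}
 *
 * When the list has no holes the walk runs to the tail, exactly as in
 * the while loop above.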
+ */ + if (hpdp->hld_next == NULL && hpdp->hld_offset > 2 * dir->hln_dirents) + dir->hln_dir->hld_prev = dir->hln_dir->hld_next; + else + dir->hln_dir->hld_prev = hdp; + + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + hdp->hld_next = hpdp->hld_next; + if (hdp->hld_next) { + hdp->hld_next->hld_prev = hdp; + } + hdp->hld_prev = hpdp; + hpdp->hld_next = hdp; + + ASSERT(hdp->hld_next != hdp); + ASSERT(hdp->hld_prev != hdp); + ASSERT(hpdp->hld_next != hpdp); + ASSERT(hpdp->hld_prev != hpdp); + + gethrestime(&now); + dir->hln_mtime = now; + dir->hln_ctime = now; + + return (0); +} + +static int +hldir_make_hlnode(hlnode_t *dir, hlfsmount_t *hm, vattr_t *va, enum de_op op, + vnode_t *realvp, hlnode_t **newnode, cred_t *cr) +{ + hlnode_t *hp; + enum vtype type; + + ASSERT(va != NULL); + ASSERT(op == DE_CREATE || op == DE_MKDIR); + if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) || + ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) + return (EOVERFLOW); + type = va->va_type; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, va, cr); + + hp->hln_vnode->v_rdev = hp->hln_rdev = NODEV; + hp->hln_vnode->v_type = type; + hp->hln_uid = crgetuid(cr); + + /* + * To determine the gid of the created file: + * If the directory's set-gid bit is set, set the gid to the gid + * of the parent dir, otherwise, use the process's gid. + */ + if (dir->hln_mode & VSGID) + hp->hln_gid = dir->hln_gid; + else + hp->hln_gid = crgetgid(cr); + + /* + * If we're creating a dir and the parent dir has the set-GID bit set, + * set it on the new dir. Otherwise, if the user is neither privileged + * nor a member of the file's new group, clear the file's set-GID bit. + */ + if (dir->hln_mode & VSGID && type == VDIR) + hp->hln_mode |= VSGID; + else { + if ((hp->hln_mode & VSGID) && + secpolicy_vnode_setids_setgids(cr, hp->hln_gid) != 0) + hp->hln_mode &= ~VSGID; + } + + if (va->va_mask & AT_ATIME) + hp->hln_atime = va->va_atime; + if (va->va_mask & AT_MTIME) + hp->hln_mtime = va->va_mtime; + + if (op == DE_MKDIR) { + hyprlofs_dirinit(dir, hp); + hp->hln_looped = 0; + } else { + hp->hln_realvp = realvp; + hp->hln_size = va->va_size; + hp->hln_looped = 1; + } + + *newnode = hp; + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c new file mode 100644 index 0000000000..1d857309f3 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_subr.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/time.h> +#include <sys/cmn_err.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/mode.h> +#include <sys/vfs.h> +#include <sys/cred.h> +#include <sys/kmem.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/fs/hyprlofs_info.h> + +#define MODESHIFT 3 + +/* Initialize a hlnode and add it to file list under mount point. */ +void +hyprlofs_node_init(hlfsmount_t *hm, hlnode_t *h, vattr_t *vap, cred_t *cr) +{ + vnode_t *vp; + timestruc_t now; + + ASSERT(vap != NULL); + + rw_init(&h->hln_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&h->hln_tlock, NULL, MUTEX_DEFAULT, NULL); + h->hln_mode = MAKEIMODE(vap->va_type, vap->va_mode); + h->hln_mask = 0; + h->hln_type = vap->va_type; + h->hln_nodeid = (ino64_t)(uint32_t)((uintptr_t)h >> 3); + h->hln_nlink = 1; + h->hln_size = 0; + + if (cr == NULL) { + h->hln_uid = vap->va_uid; + h->hln_gid = vap->va_gid; + } else { + h->hln_uid = crgetuid(cr); + h->hln_gid = crgetgid(cr); + } + + h->hln_fsid = hm->hlm_dev; + h->hln_rdev = vap->va_rdev; + h->hln_blksize = PAGESIZE; + h->hln_nblocks = 0; + gethrestime(&now); + h->hln_atime = now; + h->hln_mtime = now; + h->hln_ctime = now; + h->hln_seq = 0; + h->hln_dir = NULL; + + h->hln_vnode = vn_alloc(KM_SLEEP); + vp = HLNTOV(h); + vn_setops(vp, hyprlofs_vnodeops); + vp->v_vfsp = hm->hlm_vfsp; + vp->v_type = vap->va_type; + vp->v_rdev = vap->va_rdev; + vp->v_data = (caddr_t)h; + mutex_enter(&hm->hlm_contents); + /* + * Increment the pseudo generation number for this hlnode. Since + * hlnodes are allocated and freed, there really is no particular + * generation number for a new hlnode. Just fake it by using a + * counter in each file system. + */ + h->hln_gen = hm->hlm_gen++; + + /* + * Add new hlnode to end of linked list of hlnodes for this hyprlofs + * Root dir is handled specially in hyprlofs_mount. + */ + if (hm->hlm_rootnode != (hlnode_t *)NULL) { + h->hln_forw = NULL; + h->hln_back = hm->hlm_rootnode->hln_back; + h->hln_back->hln_forw = hm->hlm_rootnode->hln_back = h; + } + mutex_exit(&hm->hlm_contents); + vn_exists(vp); +} + +int +hyprlofs_taccess(void *vtp, int mode, cred_t *cr) +{ + hlnode_t *hp = vtp; + int shift = 0; + + /* Check access based on owner, group and public perms in hlnode. */ + if (crgetuid(cr) != hp->hln_uid) { + shift += MODESHIFT; + if (groupmember(hp->hln_gid, cr) == 0) + shift += MODESHIFT; + } + + return (secpolicy_vnode_access2(cr, HLNTOV(hp), hp->hln_uid, + hp->hln_mode << shift, mode)); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c new file mode 100644 index 0000000000..c582a8cac2 --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vfsops.c @@ -0,0 +1,614 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
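 *
 * [Editor's note on hyprlofs_taccess() at the end of the previous
 * file] MODESHIFT is 3 because permission bits come in
 * owner/group/other triples. Shifting hln_mode left by 3 for a
 * non-owner, and by 6 for a caller in neither class, moves the
 * applicable triple into the owner position that
 * secpolicy_vnode_access2() tests. For example, with mode 0640 a
 * group member is checked against 0640 << 3 = 06400, whose owner
 * triple is 04, i.e. read-only.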
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +/* + * Hyperlofs is a hybrid file system combining features of the tmpfs(7FS) and + * lofs(7FS) file systems. It is modeled on code from both of these file + * systems. + * + * The purpose is to create a high performance name space for files on which + * applications will compute. Given a large number of data files with various + * owners, we want to construct a view onto those files such that only a subset + * is visible to the applications and such that the view can be changed very + * quickly as compute progresses. Entries in the name space are not mounts and + * thus do not appear in the mnttab. Entries in the name space are allowed to + * refer to files on different backing file systems. Intermediate directories + * in the name space exist only in-memory, ala tmpfs. There are no leaf nodes + * in the name space except for entries that refer to backing files ala lofs. + * + * The name space is managed via ioctls issued on the mounted file system and + * is mostly read-only for the compute applications. That is, applications + * cannot create new files in the name space. If a file is unlinked by an + * application, that only removes the file from the name space, the backing + * file remains in place. It is possible for applications to write-through to + * the backing files if the file system is mounted read-write. + * + * The name space is managed via the HYPRLOFS_ADD_ENTRIES, HYPRLOFS_RM_ENTRIES, + * and HYPRLOFS_RM_ALL ioctls on the top-level mount. + * + * The HYPRLOFS_ADD_ENTRIES ioctl specifies path(s) to the backing file(s) and + * the name(s) for the file(s) in the name space. The name(s) may be path(s) + * which will be relative to the root of the mount and thus cannot begin with + * a /. If the name is a path, it does not have to correspond to any backing + * path. The intermediate directories will only exist in the name space. The + * entry(ies) will be added to the name space. + * + * The HYPRLOFS_RM_ENTRIES ioctl specifies the name(s) of the file(s) in the + * name space which should be removed. The name(s) may be path(s) which will + * be relative to the root of the mount and thus cannot begin with a /. The + * named entry(ies) will be removed. + * + * The HYPRLOFS_RM_ALL ioctl will remove all mappings from the name space. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/time.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/stat.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/statvfs.h> +#include <sys/mount.h> +#include <sys/debug.h> +#include <sys/systm.h> +#include <sys/mntent.h> +#include <fs/fs_subr.h> +#include <vm/page.h> +#include <vm/anon.h> +#include <sys/model.h> +#include <sys/policy.h> + +#include <sys/fs/swapnode.h> +#include <sys/fs/hyprlofs_info.h> + +static int hyprlofsfstype; + +/* + * hyprlofs vfs operations. 
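 *
 * [Editor's illustration of the ioctl interface described above; a
 * hypothetical user-land sketch, with field names taken from the
 * handler in hyprlofs_vnops.c later in this diff, and the paths
 * invented]
 *
 *	hyprlofs_entry_t e;
 *	hyprlofs_entries_t ents;
 *	int fd = open("/mnt/hlfs", O_RDONLY);
 *
 *	e.hle_name = "views/a.dat";
 *	e.hle_nlen = strlen(e.hle_name);
 *	e.hle_path = "/data/owner1/a.dat";
 *	e.hle_plen = strlen(e.hle_path);
 *	ents.hle_entries = &e;
 *	ents.hle_len = 1;
 *	if (ioctl(fd, HYPRLOFS_ADD_ENTRIES, &ents) != 0)
 *		err(1, "HYPRLOFS_ADD_ENTRIES");
 *
 * hle_nlen and hle_plen exclude the terminating NUL (the handler
 * appends one after copyin), and the "views/" prefix causes an
 * in-memory intermediate directory to be created on the fly.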
+ */ +static int hyprlofsinit(int, char *); +static int hyprlofs_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *); +static int hyprlofs_unmount(vfs_t *, int, cred_t *); +static int hyprlofs_root(vfs_t *, vnode_t **); +static int hyprlofs_statvfs(vfs_t *, struct statvfs64 *); +static int hyprlofs_vget(vfs_t *, vnode_t **, struct fid *); + +/* + * Loadable module wrapper + */ +#include <sys/modctl.h> + +static mntopts_t hyprlofs_mntopts; + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "hyprlofs", + hyprlofsinit, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, + &hyprlofs_mntopts +}; + +static mntopts_t hyprlofs_mntopts = { + 0, NULL +}; + +/* + * Module linkage information + */ +static struct modlfs modlfs = { + &mod_fsops, "filesystem for hyprlofs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlfs, NULL +}; + +int +_init() +{ + return (mod_install(&modlinkage)); +} + +int +_fini() +{ + int error; + + error = mod_remove(&modlinkage); + if (error) + return (error); + /* + * Tear down the operations vectors + */ + (void) vfs_freevfsops_by_type(hyprlofsfstype); + vn_freevnodeops(hyprlofs_vnodeops); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +/* + * The following are patchable variables limiting the amount of system + * resources hyprlofs can use. + * + * hyprlofs_maxkmem limits the amount of kernel kmem_alloc memory hyprlofs can + * use for it's data structures (e.g. hlnodes, directory entries). It is set + * as a percentage of physical memory which is determined when hyprlofs is + * first used in the system. + * + * hyprlofs_minfree is the minimum amount of swap space that hyprlofs leaves for + * the rest of the system. If the amount of free swap space in the system + * (i.e. anoninfo.ani_free) drops below hyprlofs_minfree, hyprlofs anon + * allocations will fail. + */ +size_t hyprlofs_maxkmem = 0; +size_t hyprlofs_minfree = 0; +size_t hyprlofs_kmemspace; /* bytes of kernel heap used by all hyprlofs */ + +static major_t hyprlofs_major; +static minor_t hyprlofs_minor; +static kmutex_t hyprlofs_minor_lock; + +/* + * initialize global hyprlofs locks and hashes when loading hyprlofs module + */ +static int +hyprlofsinit(int fstype, char *name) +{ + static const fs_operation_def_t hl_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = hyprlofs_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = hyprlofs_unmount }, + VFSNAME_ROOT, { .vfs_root = hyprlofs_root }, + VFSNAME_STATVFS, { .vfs_statvfs = hyprlofs_statvfs }, + VFSNAME_VGET, { .vfs_vget = hyprlofs_vget }, + NULL, NULL + }; + int error; + extern void hyprlofs_hash_init(); + + hyprlofs_hash_init(); + hyprlofsfstype = fstype; + ASSERT(hyprlofsfstype != 0); + + error = vfs_setfsops(fstype, hl_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "hyprlofsinit: bad vfs ops template"); + return (error); + } + + error = vn_make_ops(name, hyprlofs_vnodeops_template, + &hyprlofs_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "hyprlofsinit: bad vnode ops template"); + return (error); + } + + /* + * hyprlofs_minfree is an absolute limit of swap space which still + * allows other processes to execute. Set it if its not patched. 
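 *
 * [Editor's note] "Patched" here means overridden before the module
 * initializes, typically via /etc/system; a hypothetical setting
 * (value for illustration only):
 *
 *	set hyprlofs:hyprlofs_minfree=0x2000
 *
 * When left at zero, the default computed from HYPRLOFSMINFREE just
 * below takes effect.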
+ */ + if (hyprlofs_minfree == 0) + hyprlofs_minfree = btopr(HYPRLOFSMINFREE); + + if ((hyprlofs_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "hyprlofsinit: Can't get unique device number."); + hyprlofs_major = 0; + } + mutex_init(&hyprlofs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +static int +hyprlofs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + hlfsmount_t *hm = NULL; + hlnode_t *hp; + struct pathname dpn; + int error; + vattr_t rattr; + int got_attrs; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (uap->flags & MS_REMOUNT) + return (EBUSY); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* Having the resource be anything but "swap" doesn't make sense. */ + vfs_setresource(vfsp, "swap", 0); + + if ((error = pn_get(uap->dir, + (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, + &dpn)) != 0) + goto out; + + if ((hm = kmem_zalloc(sizeof (hlfsmount_t), + KM_NORMALPRI | KM_NOSLEEP)) == NULL) { + pn_free(&dpn); + error = ENOMEM; + goto out; + } + + /* Get an available minor device number for this mount */ + mutex_enter(&hyprlofs_minor_lock); + do { + hyprlofs_minor = (hyprlofs_minor + 1) & L_MAXMIN32; + hm->hlm_dev = makedevice(hyprlofs_major, hyprlofs_minor); + } while (vfs_devismounted(hm->hlm_dev)); + mutex_exit(&hyprlofs_minor_lock); + + /* + * Set but don't bother entering the mutex since hlfsmount is not on + * the mount list yet. + */ + mutex_init(&hm->hlm_contents, NULL, MUTEX_DEFAULT, NULL); + + hm->hlm_vfsp = vfsp; + + vfsp->vfs_data = (caddr_t)hm; + vfsp->vfs_fstype = hyprlofsfstype; + vfsp->vfs_dev = hm->hlm_dev; + vfsp->vfs_bsize = PAGESIZE; + vfsp->vfs_flag |= VFS_NOTRUNC; + vfs_make_fsid(&vfsp->vfs_fsid, hm->hlm_dev, hyprlofsfstype); + hm->hlm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); + (void) strcpy(hm->hlm_mntpath, dpn.pn_path); + + /* allocate and initialize root hlnode structure */ + bzero(&rattr, sizeof (vattr_t)); + rattr.va_mode = (mode_t)(S_IFDIR | 0777); + rattr.va_type = VDIR; + rattr.va_rdev = 0; + hp = kmem_zalloc(sizeof (hlnode_t), KM_SLEEP); + hyprlofs_node_init(hm, hp, &rattr, cr); + + /* Get the mode, uid, and gid from the underlying mount point. */ + rattr.va_mask = AT_MODE|AT_UID|AT_GID; + got_attrs = VOP_GETATTR(mvp, &rattr, 0, cr, NULL); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + HLNTOV(hp)->v_flag |= VROOT; + + /* + * If the getattr succeeded, use its results, otherwise allow the + * previously set defaults to prevail. 
+ */ + if (got_attrs == 0) { + hp->hln_mode = rattr.va_mode; + hp->hln_uid = rattr.va_uid; + hp->hln_gid = rattr.va_gid; + } + + /* + * Initialize linked list of hlnodes so that the back pointer of the + * root hlnode always points to the last one on the list and the + * forward pointer of the last node is null + */ + hp->hln_back = hp; + hp->hln_forw = NULL; + hp->hln_nlink = 0; + hm->hlm_rootnode = hp; + + hyprlofs_dirinit(hp, hp); + + rw_exit(&hp->hln_rwlock); + + pn_free(&dpn); + error = 0; + +out: + return (error); +} + +static int +hyprlofs_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hnp, *cancel; + vnode_t *vp; + int error; + + if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) + return (error); + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + /* + * forced unmount is not supported by this file system + * and thus, ENOTSUP, is being returned. + */ + if (flag & MS_FORCE) + return (ENOTSUP); + + mutex_enter(&hm->hlm_contents); + + /* + * If there are no open files, only the root node should have a ref cnt. + * With hlm_contents held, nothing can be added or removed. There may + * be some dirty pages. To prevent fsflush from disrupting the unmount, + * put a hold on each node while scanning. If we find a previously + * referenced node, undo the holds we have placed and fail EBUSY. + */ + hnp = hm->hlm_rootnode; + if (HLNTOV(hnp)->v_count > 1) { + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + + for (hnp = hnp->hln_forw; hnp; hnp = hnp->hln_forw) { + if ((vp = HLNTOV(hnp))->v_count > 0) { + cancel = hm->hlm_rootnode->hln_forw; + while (cancel != hnp) { + vp = HLNTOV(cancel); + ASSERT(vp->v_count > 0); + VN_RELE(vp); + cancel = cancel->hln_forw; + } + mutex_exit(&hm->hlm_contents); + return (EBUSY); + } + VN_HOLD(vp); + } + + /* We can drop the mutex now because no one can find this mount */ + mutex_exit(&hm->hlm_contents); + + /* + * Free all alloc'd memory associated with this FS. To do this, we go + * through the file list twice, once to remove all the dir entries, and + * then to remove all the files. + */ + + /* Remove all directory entries */ + for (hnp = hm->hlm_rootnode; hnp; hnp = hnp->hln_forw) { + rw_enter(&hnp->hln_rwlock, RW_WRITER); + if (hnp->hln_type == VDIR) + hyprlofs_dirtrunc(hnp); + rw_exit(&hnp->hln_rwlock); + } + + ASSERT(hm->hlm_rootnode); + + /* + * All links are gone, v_count is keeping nodes in place. VN_RELE + * should make the node disappear, unless somebody is holding pages + * against it. Wait and retry until it disappears. + * + * We re-acquire the lock to prevent others who have a HOLD on a hlnode + * from blowing it away (in hyprlofs_inactive) while we're trying to + * get to it here. Once we have a HOLD on it we know it'll stick around. + */ + mutex_enter(&hm->hlm_contents); + + /* Remove all the files (except the rootnode) backwards. */ + while ((hnp = hm->hlm_rootnode->hln_back) != hm->hlm_rootnode) { + mutex_exit(&hm->hlm_contents); + /* Note we handled the link count in pass 2 above. */ + vp = HLNTOV(hnp); + VN_RELE(vp); + mutex_enter(&hm->hlm_contents); + /* + * It's still there after the RELE. Someone else like pageout + * has a hold on it so wait a bit and then try again. 
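 *
 * [Editor's note] The extra VN_HOLD below keeps this hlnode from
 * disappearing while hlm_contents is dropped across the delay(); the
 * next pass of the loop VN_RELEs it again, so the loop terminates
 * only once the other holder (e.g. pageout) has dropped its hold.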
+ */ + if (hnp == hm->hlm_rootnode->hln_back) { + VN_HOLD(vp); + mutex_exit(&hm->hlm_contents); + delay(hz / 4); + mutex_enter(&hm->hlm_contents); + } + } + mutex_exit(&hm->hlm_contents); + + VN_RELE(HLNTOV(hm->hlm_rootnode)); + + ASSERT(hm->hlm_mntpath); + + kmem_free(hm->hlm_mntpath, strlen(hm->hlm_mntpath) + 1); + + mutex_destroy(&hm->hlm_contents); + kmem_free(hm, sizeof (hlfsmount_t)); + + return (0); +} + +/* Return root hlnode for given vnode */ +static int +hyprlofs_root(vfs_t *vfsp, vnode_t **vpp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = hm->hlm_rootnode; + vnode_t *vp; + + ASSERT(hp); + + vp = HLNTOV(hp); + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +hyprlofs_statvfs(vfs_t *vfsp, struct statvfs64 *sbp) +{ + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + ulong_t blocks; + dev32_t d32; + zoneid_t eff_zid; + struct zone *zp; + + /* + * The FS may have been mounted by the GZ on behalf of the NGZ. In + * that case, the hlfsmount zone_id will be the global zone. We want + * to show the swap cap inside the zone in this case, even though the + * FS was mounted by the GZ. + */ + if (curproc->p_zone->zone_id != GLOBAL_ZONEUNIQID) + zp = curproc->p_zone; + else + zp = hm->hlm_vfsp->vfs_zone; + + if (zp == NULL) + eff_zid = GLOBAL_ZONEUNIQID; + else + eff_zid = zp->zone_id; + + sbp->f_bsize = PAGESIZE; + sbp->f_frsize = PAGESIZE; + + /* + * Find the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); + blocks = (ulong_t)CURRENT_TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + + if (blocks > hyprlofs_minfree) + sbp->f_bfree = blocks - hyprlofs_minfree; + else + sbp->f_bfree = 0; + + sbp->f_bavail = sbp->f_bfree; + + /* + * Total number of blocks is what's available plus what's been used + */ + sbp->f_blocks = (fsblkcnt64_t)(sbp->f_bfree); + + if (eff_zid != GLOBAL_ZONEUNIQID && + zp->zone_max_swap_ctl != UINT64_MAX) { + /* + * If the fs is used by a NGZ with a swap cap, then report the + * capped size. + */ + rctl_qty_t cap, used; + pgcnt_t pgcap, pgused; + + mutex_enter(&zp->zone_mem_lock); + cap = zp->zone_max_swap_ctl; + used = zp->zone_max_swap; + mutex_exit(&zp->zone_mem_lock); + + pgcap = btop(cap); + pgused = btop(used); + + sbp->f_bfree = MIN(pgcap - pgused, sbp->f_bfree); + sbp->f_bavail = sbp->f_bfree; + sbp->f_blocks = MIN(pgcap, sbp->f_blocks); + } + + /* + * This is fairly inaccurate since it doesn't take into account the + * names stored in the directory entries. 
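 *
 * [Editor's worked example of the capped f_bfree computation above]
 * With a 2G zone swap cap and 1.5G charged, at 4K pages:
 * pgcap = btop(2G) = 0x80000 and pgused = btop(1.5G) = 0x60000, so
 * the zone sees at most 0x20000 pages (512M) free, regardless of how
 * much system swap remains.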
+ */ + sbp->f_ffree = sbp->f_files = ptob(availrmem) / + (sizeof (hlnode_t) + sizeof (hldirent_t)); + + sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); + (void) cmpldev(&d32, vfsp->vfs_dev); + sbp->f_fsid = d32; + (void) strcpy(sbp->f_basetype, vfssw[hyprlofsfstype].vsw_name); + (void) strncpy(sbp->f_fstr, hm->hlm_mntpath, sizeof (sbp->f_fstr)); + /* + * ensure null termination + */ + sbp->f_fstr[sizeof (sbp->f_fstr) - 1] = '\0'; + sbp->f_flag = vf_to_stf(vfsp->vfs_flag); + sbp->f_namemax = MAXNAMELEN - 1; + return (0); +} + +static int +hyprlofs_vget(vfs_t *vfsp, vnode_t **vpp, struct fid *fidp) +{ + hlfid_t *hfid; + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vfsp); + hlnode_t *hp = NULL; + + hfid = (hlfid_t *)fidp; + *vpp = NULL; + + mutex_enter(&hm->hlm_contents); + for (hp = hm->hlm_rootnode; hp; hp = hp->hln_forw) { + mutex_enter(&hp->hln_tlock); + if (hp->hln_nodeid == hfid->hlfid_ino) { + /* + * If the gen numbers don't match we know the file + * won't be found since only one hlnode can have this + * number at a time. + */ + if (hp->hln_gen != hfid->hlfid_gen || + hp->hln_nlink == 0) { + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + *vpp = (vnode_t *)HLNTOV(hp); + + VN_HOLD(*vpp); + + if ((hp->hln_mode & S_ISVTX) && + !(hp->hln_mode & (S_IXUSR | S_IFDIR))) { + mutex_enter(&(*vpp)->v_lock); + (*vpp)->v_flag |= VISSWAP; + mutex_exit(&(*vpp)->v_lock); + } + mutex_exit(&hp->hln_tlock); + mutex_exit(&hm->hlm_contents); + return (0); + } + mutex_exit(&hp->hln_tlock); + } + mutex_exit(&hm->hlm_contents); + return (0); +} diff --git a/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c new file mode 100644 index 0000000000..a2064dfa1f --- /dev/null +++ b/usr/src/uts/common/fs/hyprlofs/hyprlofs_vnops.c @@ -0,0 +1,1441 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/t_lock.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/user.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/flock.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/cred.h> +#include <sys/dirent.h> +#include <sys/pathname.h> +#include <sys/fs/hyprlofs.h> +#include <sys/fs/hyprlofs_info.h> +#include <sys/mman.h> +#include <vm/pvn.h> +#include <sys/cmn_err.h> +#include <sys/buf.h> +#include <sys/policy.h> +#include <fs/fs_subr.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +static int hyprlofs_add_entry(vnode_t *, char *, char *, cred_t *, + caller_context_t *); +static int hyprlofs_rm_entry(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_rm_all(vnode_t *, cred_t *, caller_context_t *, int); +static int hyprlofs_remove(vnode_t *, char *, cred_t *, caller_context_t *, + int); +static int hyprlofs_get_all(vnode_t *, intptr_t, cred_t *, caller_context_t *, + int); + +/* + * This is a somewhat arbitrary upper limit on the number of entries we can + * pass in on a single add/rm ioctl call. This is only used to validate that + * the input list looks sane. + */ +#define MAX_IOCTL_PARAMS 100000 + +static int +hyprlofs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) +{ + vnode_t *rvp; + int error; + + rvp = REALVP(*vpp); + + if (VTOHLN(*vpp)->hln_looped == 0) + return (0); + + /* + * looped back, pass through to real vnode. Need to hold new reference + * to vp since VOP_OPEN() may decide to release it. + */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + ASSERT(rvp->v_count > 1); + VN_RELE(rvp); + + return (error); +} + +static int +hyprlofs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) { + cleanlocks(vp, ttoproc(curthread)->p_pid, 0); + cleanshares(vp, ttoproc(curthread)->p_pid); + return (0); + } + + return (VOP_CLOSE(REALVP(vp), flag, count, offset, cr, ct)); +} + +static int +hyprlofs_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + if (vp->v_type == VDIR) + return (EISDIR); + return (VOP_READ(REALVP(vp), uiop, ioflag, cr, ct)); +} + +static int +hyprlofs_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + /* We don't support writing to non-regular files */ + if (vp->v_type != VREG) + return (EINVAL); + + if (vn_is_readonly(vp)) + return (EROFS); + + return (VOP_WRITE(REALVP(vp), uiop, ioflag, cr, ct)); +} + +/* ARGSUSED */ +static int +hyprlofs_ioctl(vnode_t *vp, int cmd, intptr_t data, int flag, + cred_t *cr, int *rvalp, caller_context_t *ct) +{ + int len, cnt, error; + int i; + model_t model; + char path[MAXPATHLEN]; + char nm[MAXPATHLEN]; + + /* We only support the hyprlofs ioctls on the root vnode */ + if (!(vp->v_flag & VROOT)) + return (ENOTTY); + + /* + * Check if managing hyprlofs is allowed. 
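 *
 * [Editor's note] The control interface is doubly narrow: the VROOT
 * check above means ioctls are honored only on the root vnode of the
 * mount, and the secpolicy check below restricts them to sufficiently
 * privileged callers. MAX_IOCTL_PARAMS (defined above) additionally
 * bounds hle_len before the handler sizes its kmem_alloc() of the
 * copied-in entry vector.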
+ */ + if (secpolicy_hyprlofs_control(cr) != 0) + return (EPERM); + + if (cmd == HYPRLOFS_ADD_ENTRIES || cmd == HYPRLOFS_RM_ENTRIES) { + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_entries_t ebuf; + hyprlofs_entry_t *e; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + cnt = ebuf.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry_t) * cnt; + + e = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(ebuf.hle_entries), e, len)) { + kmem_free(e, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e[i].hle_nlen == 0 || + e[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_name, nm, e[i].hle_nlen) + != 0) { + kmem_free(e, len); + return (EFAULT); + } + nm[e[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e[i].hle_plen == 0 || + e[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin(e[i].hle_path, path, + e[i].hle_plen) != 0) { + kmem_free(e, len); + return (EFAULT); + } + path[e[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e, len); + return (error); + } + } + } + + kmem_free(e, len); + return (0); + + } else { + hyprlofs_entries32_t ebuf32; + hyprlofs_entry32_t *e32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + cnt = ebuf32.hle_len; + if (cnt > MAX_IOCTL_PARAMS) + return (EINVAL); + len = sizeof (hyprlofs_entry32_t) * cnt; + + e32 = kmem_alloc(len, KM_SLEEP); + if (copyin((void *)(unsigned long)(ebuf32.hle_entries), + e32, len)) { + kmem_free(e32, len); + return (EFAULT); + } + + for (i = 0; i < cnt; i++) { + if (e32[i].hle_nlen == 0 || + e32[i].hle_nlen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_name, nm, + e32[i].hle_nlen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + nm[e32[i].hle_nlen] = '\0'; + + if (cmd == HYPRLOFS_ADD_ENTRIES) { + if (e32[i].hle_plen == 0 || + e32[i].hle_plen > MAXPATHLEN) + return (EINVAL); + + if (copyin((void *)(unsigned long) + e32[i].hle_path, path, + e32[i].hle_plen) != 0) { + kmem_free(e32, len); + return (EFAULT); + } + path[e32[i].hle_plen] = '\0'; + + if ((error = hyprlofs_add_entry(vp, + path, nm, cr, ct)) != 0) { + kmem_free(e32, len); + return (error); + } + } else { + if ((error = hyprlofs_rm_entry(vp, nm, + cr, ct, flag)) != 0) { + kmem_free(e32, len); + return (error); + } + } + } + + kmem_free(e32, len); + return (0); + } + } + + if (cmd == HYPRLOFS_RM_ALL) { + return (hyprlofs_rm_all(vp, cr, ct, flag)); + } + + if (cmd == HYPRLOFS_GET_ENTRIES) { + return (hyprlofs_get_all(vp, data, cr, ct, flag)); + } + + return (ENOTTY); +} + +static int +hyprlofs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + vattr_t tmp_va; + + if (tp->hln_looped == 1) { + int error; + + if ((error = VOP_GETATTR(REALVP(vp), &tmp_va, flags, cr, + ct)) != 0) + return (error); + } + + mutex_enter(&tp->hln_tlock); + vap->va_type = vp->v_type; + vap->va_mode = tp->hln_mode & MODEMASK; + vap->va_uid = tp->hln_uid; + vap->va_gid = tp->hln_gid; + vap->va_fsid = tp->hln_fsid; + vap->va_nodeid = (ino64_t)tp->hln_nodeid; + vap->va_nlink = tp->hln_nlink; + vap->va_size = (u_offset_t)tp->hln_size; + vap->va_atime = tp->hln_atime; + vap->va_mtime = tp->hln_mtime; + vap->va_ctime = tp->hln_ctime; + vap->va_blksize = 
PAGESIZE; + vap->va_rdev = tp->hln_rdev; + vap->va_seq = tp->hln_seq; + + if (tp->hln_looped == 1) { + vap->va_nblocks = tmp_va.va_nblocks; + } else { + vap->va_nblocks = + (fsblkcnt64_t)btodb(ptob(btopr(vap->va_size))); + } + mutex_exit(&tp->hln_tlock); + return (0); +} + +/*ARGSUSED4*/ +static int +hyprlofs_setattr(vnode_t *vp, vattr_t *vap, int flags, + cred_t *cr, caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error = 0; + vattr_t *get; + long mask; + + /* + * Cannot set these attributes + */ + if ((vap->va_mask & AT_NOSET) || (vap->va_mask & AT_XVATTR)) + return (EINVAL); + + mutex_enter(&tp->hln_tlock); + + get = &tp->hln_attr; + /* + * Change file access modes. Must be owner or have sufficient + * privileges. + */ + error = secpolicy_vnode_setattr(cr, vp, vap, get, flags, + hyprlofs_taccess, tp); + + if (error) + goto out; + + mask = vap->va_mask; + + if (mask & AT_MODE) { + get->va_mode &= S_IFMT; + get->va_mode |= vap->va_mode & ~S_IFMT; + } + + if (mask & AT_UID) + get->va_uid = vap->va_uid; + if (mask & AT_GID) + get->va_gid = vap->va_gid; + if (mask & AT_ATIME) + get->va_atime = vap->va_atime; + if (mask & AT_MTIME) + get->va_mtime = vap->va_mtime; + + if (mask & (AT_UID | AT_GID | AT_MODE | AT_MTIME)) + gethrestime(&tp->hln_ctime); + +out: + mutex_exit(&tp->hln_tlock); + return (error); +} + +static int +hyprlofs_access(vnode_t *vp, int mode, int flags, cred_t *cr, + caller_context_t *ct) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(vp); + int error; + + if (mode & VWRITE) { + if (vp->v_type == VREG && vn_is_readonly(vp)) + return (EROFS); + } + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_ACCESS(REALVP(vp), mode, flags, cr, ct)); + + mutex_enter(&tp->hln_tlock); + error = hyprlofs_taccess(tp, mode, cr); + mutex_exit(&tp->hln_tlock); + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + hlnode_t *tp = (hlnode_t *)VTOHLN(dvp); + hlnode_t *ntp = NULL; + int error; + + if (VTOHLN(dvp)->hln_looped == 1) + return (VOP_LOOKUP(REALVP(dvp), nm, vpp, pnp, flags, rdir, + cr, ct, direntflags, realpnp)); + + if (flags & LOOKUP_XATTR) + return (EINVAL); + + /* Null component name is a synonym for directory being searched. */ + if (*nm == '\0') { + VN_HOLD(dvp); + *vpp = dvp; + return (0); + } + ASSERT(tp); + + if ((error = hyprlofs_dirlookup(tp, nm, &ntp, cr)) == 0) { + ASSERT(ntp); + *vpp = HLNTOV(ntp); + } + return (error); +} + +/* + * Create the loopback from the hyprlofs vnode to the real vnode. + */ +static int +hyprlofs_loopback(vnode_t *dvp, vnode_t *rvp, char *nm, vattr_t *vap, + int mode, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *parent; + hlfsmount_t *tm; + int error; + hlnode_t *oldtp; + vnode_t *vp; + + parent = (hlnode_t *)VTOHLN(dvp); + tm = (hlfsmount_t *)VTOHLM(dvp); + error = 0; + oldtp = NULL; + + if (vap->va_type == VREG && (vap->va_mode & VSVTX)) { + /* we don't support the sticky bit */ + vap->va_mode &= ~VSVTX; + } else if (vap->va_type == VNON) { + return (EINVAL); + } + + /* Null component name is a synonym for directory being searched. 
*/ + if (*nm == '\0') { + VN_HOLD(dvp); + oldtp = parent; + } else { + error = hyprlofs_dirlookup(parent, nm, &oldtp, cr); + } + + if (error == 0) { /* name found */ + ASSERT(oldtp); + + rw_enter(&oldtp->hln_rwlock, RW_WRITER); + + /* + * if create/read-only an existing directory, allow it + */ + if ((oldtp->hln_type == VDIR) && (mode & VWRITE)) + error = EISDIR; + else { + error = hyprlofs_taccess(oldtp, mode, cr); + } + + if (error) { + rw_exit(&oldtp->hln_rwlock); + hlnode_rele(oldtp); + return (error); + } + + vp = HLNTOV(oldtp); + rw_exit(&oldtp->hln_rwlock); + + if (vp->v_type == VREG) { + hlnode_rele(oldtp); + return (EEXIST); + } + + vnevent_create(vp, ct); + return (0); + } + + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_CREATE, rvp, vap, NULL, + cr); + rw_exit(&parent->hln_rwlock); + + return (error); +} + +/* + * Create an in-memory directory based on the add-entry ioctl name. + * If the dir exists, return EEXIST but still also return node in vpp. + */ +static int +hyprlofs_mkdir(vnode_t *dvp, char *nm, vattr_t *va, vnode_t **vpp, cred_t *cr) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + hlfsmount_t *tm = (hlfsmount_t *)VTOHLM(dvp); + int error; + + /* + * Might be dangling directory. Catch it here, because a ENOENT return + * from hyprlofs_dirlookup() is a valid return. + */ + if (parent->hln_nlink == 0) + return (ENOENT); + + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error == 0) { + ASSERT(self); + hlnode_rele(self); + /* We can't loop in under a looped in directory */ + if (self->hln_looped) + return (EACCES); + *vpp = HLNTOV(self); + return (EEXIST); + } + if (error != ENOENT) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + error = hyprlofs_direnter(tm, parent, nm, DE_MKDIR, (vnode_t *)NULL, + va, &self, cr); + rw_exit(&parent->hln_rwlock); + + if (error == 0 || error == EEXIST) { + hlnode_rele(self); + *vpp = HLNTOV(self); + } + + return (error); +} + +/* + * Loop in a file or directory into the namespace. + */ +static int +hyprlofs_add_entry(vnode_t *vp, char *fspath, char *fsname, + cred_t *cr, caller_context_t *ct) +{ + int error; + char *p, *pnm; + vnode_t *realvp, *dvp; + vattr_t va; + + /* + * Get vnode for the real file/dir. We'll have a hold on realvp which + * we won't vn_rele until hyprlofs_inactive. + */ + if ((error = lookupname(fspath, UIO_SYSSPACE, FOLLOW, NULLVPP, + &realvp)) != 0) + return (error); + + /* no devices allowed */ + if (IS_DEVVP(realvp)) { + VN_RELE(realvp); + return (ENODEV); + } + + /* + * realvp may be an AUTOFS node, in which case we perform a VOP_ACCESS + * to trigger the mount of the intended filesystem. This causes a + * loopback mount of the intended filesystem instead of the AUTOFS + * filesystem. + */ + if ((error = VOP_ACCESS(realvp, 0, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* + * We're interested in the top most filesystem. This is specially + * important when fspath is a trigger AUTOFS node, since we're really + * interested in mounting the filesystem AUTOFS mounted as result of + * the VOP_ACCESS() call not the AUTOFS node itself. + */ + if (vn_mountedvfs(realvp) != NULL) { + if ((error = traverse(&realvp)) != 0) { + VN_RELE(realvp); + return (error); + } + } + + va.va_type = VNON; + /* + * If the target name is a path, make sure we have all of the + * intermediate directories, creating them if necessary. 
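 *
 * [Editor's miniature of the component walk below, as a hypothetical
 * user-land sketch with an invented name] Split the relative path on
 * '/', creating each prefix in turn and tolerating ones that already
 * exist:
 *
 *	static int
 *	mkdirs(char *path, mode_t mode)
 *	{
 *		char *p;
 *
 *		for (p = strchr(path, '/'); p != NULL;
 *		    p = strchr(p + 1, '/')) {
 *			*p = '\0';
 *			if (mkdir(path, mode) != 0 && errno != EEXIST) {
 *				*p = '/';
 *				return (-1);
 *			}
 *			*p = '/';
 *		}
 *		return (0);
 *	}
 *
 * The kernel walk below additionally rejects absolute paths, empty
 * components and "..", and descends with hyprlofs_mkdir(), treating
 * EEXIST as success.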
+ */ + dvp = vp; + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') { + VN_RELE(realvp); + return (EINVAL); + } + + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + if (va.va_type == VNON) + /* use the top-level dir as the template va for mkdir */ + if ((error = VOP_GETATTR(vp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) { + VN_RELE(realvp); + return (EINVAL); + } + + if ((error = hyprlofs_mkdir(dvp, pnm, &va, &dvp, cr)) != 0 && + error != EEXIST) { + VN_RELE(realvp); + return (error); + } + + *p = '/'; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') { + VN_RELE(realvp); + return (EINVAL); + } + + /* Now use the real file's va as the template va */ + if ((error = VOP_GETATTR(realvp, &va, 0, cr, NULL)) != 0) { + VN_RELE(realvp); + return (error); + } + + /* Make the vnode */ + error = hyprlofs_loopback(dvp, realvp, pnm, &va, va.va_mode, cr, ct); + if (error != 0) + VN_RELE(realvp); + return (error); +} + +/* + * Remove a looped in file from the namespace. + */ +static int +hyprlofs_rm_entry(vnode_t *dvp, char *fsname, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error; + char *p, *pnm; + hlnode_t *parent; + hlnode_t *fndtp; + + pnm = p = fsname; + + /* path cannot be absolute */ + if (*p == '/') + return (EINVAL); + + /* + * If the target name is a path, get the containing dir and simple + * file name. + */ + parent = (hlnode_t *)VTOHLN(dvp); + for (p = strchr(pnm, '/'); p != NULL; p = strchr(pnm, '/')) { + *p = '\0'; + + /* Path component cannot be empty or relative */ + if (pnm[0] == '\0' || + (pnm[0] == '.' && pnm[1] == '.' && pnm[2] == '\0')) + return (EINVAL); + + if ((error = hyprlofs_dirlookup(parent, pnm, &fndtp, cr)) != 0) + return (error); + + dvp = HLNTOV(fndtp); + parent = fndtp; + pnm = p + 1; + } + + /* The file name is required */ + if (pnm[0] == '\0') + return (EINVAL); + + /* Remove the entry from the parent dir */ + return (hyprlofs_remove(dvp, pnm, cr, ct, flags)); +} + +/* + * Remove all looped in files from the namespace. + */ +static int +hyprlofs_rm_all(vnode_t *dvp, cred_t *cr, caller_context_t *ct, + int flags) +{ + int error = 0; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. 
+ */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively remove contents of this subdir */ + if (fndhp->hln_type == VDIR) { + vnode_t *tvp = HLNTOV(fndhp); + + error = hyprlofs_rm_all(tvp, cr, ct, flags); + if (error != 0) + goto done; + } + } + + /* remove the entry */ + error = hyprlofs_remove(dvp, hdp->hld_name, cr, ct, flags); + if (error != 0) + goto done; + + hdp = hp->hln_dir; + } + +done: + hlnode_rele(hp); + return (error); +} + +/* + * Get a list of all looped in files in the namespace. + */ +static int +hyprlofs_get_all_entries(vnode_t *dvp, hyprlofs_curr_entry_t *hcp, + char *prefix, int *pcnt, int n_max, + cred_t *cr, caller_context_t *ct, int flags) +{ + int error = 0; + int too_big = 0; + int cnt; + int len; + hlnode_t *hp = (hlnode_t *)VTOHLN(dvp); + hldirent_t *hdp; + char *path; + + cnt = *pcnt; + path = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + hlnode_hold(hp); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + goto done; + } + + hdp = hp->hln_dir; + while (hdp) { + hlnode_t *fndhp; + vnode_t *tvp; + + if (strcmp(hdp->hld_name, ".") == 0 || + strcmp(hdp->hld_name, "..") == 0) { + hdp = hdp->hld_next; + continue; + } + + /* This holds the fndhp vnode */ + error = hyprlofs_dirlookup(hp, hdp->hld_name, &fndhp, cr); + if (error != 0) + goto done; + hlnode_rele(fndhp); + + if (fndhp->hln_looped == 0) { + /* recursively get contents of this subdir */ + VERIFY(fndhp->hln_type == VDIR); + tvp = HLNTOV(fndhp); + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, "%s/%s", + prefix, hdp->hld_name); + + error = hyprlofs_get_all_entries(tvp, hcp, path, + &cnt, n_max, cr, ct, flags); + + if (error == E2BIG) { + too_big = 1; + error = 0; + } + if (error != 0) + goto done; + } else { + if (cnt < n_max) { + char *p; + + if (*prefix == '\0') + (void) strlcpy(path, hdp->hld_name, + MAXPATHLEN); + else + (void) snprintf(path, MAXPATHLEN, + "%s/%s", prefix, hdp->hld_name); + + len = strlen(path); + ASSERT(len <= MAXPATHLEN); + if (copyout(path, (void *)(hcp[cnt].hce_name), + len)) { + error = EFAULT; + goto done; + } + + tvp = REALVP(HLNTOV(fndhp)); + if (tvp->v_path == vn_vpath_empty) { + p = "<unknown>"; + } else { + p = tvp->v_path; + } + len = strlen(p); + ASSERT(len <= MAXPATHLEN); + if (copyout(p, (void *)(hcp[cnt].hce_path), + len)) { + error = EFAULT; + goto done; + } + } + + cnt++; + if (cnt > n_max) + too_big = 1; + } + + hdp = hdp->hld_next; + } + +done: + hlnode_rele(hp); + kmem_free(path, MAXPATHLEN); + + *pcnt = cnt; + if (error == 0 && too_big == 1) + error = E2BIG; + + return (error); +} + +/* + * Return a list of all looped in files in the namespace. 
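+ *
+ * A hypothetical userland consumer looks roughly like this (the ioctl
+ * name and the caller-filled hce_name/hce_path buffers are assumed
+ * here, not defined by this file):
+ *
+ *	hyprlofs_curr_entries_t ebuf;
+ *
+ *	ebuf.hce_cnt = limit;		(capacity of the entries array)
+ *	ebuf.hce_entries = entries;	(each entry supplies buffers)
+ *	if (ioctl(fd, HYPRLOFS_GET_ENTRIES, &ebuf) != 0 &&
+ *	    errno == E2BIG) {
+ *		(ebuf.hce_cnt now holds the total count; grow and retry)
+ *	}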
+ */ +static int +hyprlofs_get_all(vnode_t *dvp, intptr_t data, cred_t *cr, caller_context_t *ct, + int flags) +{ + int limit, cnt, error; + model_t model; + hyprlofs_curr_entry_t *e; + + model = get_udatamodel(); + + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + if (copyin((void *)data, &ebuf, sizeof (ebuf))) + return (EFAULT); + limit = ebuf.hce_cnt; + e = ebuf.hce_entries; + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + if (copyin((void *)data, &ebuf32, sizeof (ebuf32))) + return (EFAULT); + + limit = ebuf32.hce_cnt; + e = (hyprlofs_curr_entry_t *)(unsigned long) + (ebuf32.hce_entries); + if (limit > MAX_IOCTL_PARAMS) + return (EINVAL); + } + + cnt = 0; + error = hyprlofs_get_all_entries(dvp, e, "", &cnt, limit, cr, ct, + flags); + + if (error == 0 || error == E2BIG) { + if (model == DATAMODEL_NATIVE) { + hyprlofs_curr_entries_t ebuf; + + ebuf.hce_cnt = cnt; + if (copyout(&ebuf, (void *)data, sizeof (ebuf))) + return (EFAULT); + + } else { + hyprlofs_curr_entries32_t ebuf32; + + ebuf32.hce_cnt = cnt; + if (copyout(&ebuf32, (void *)data, sizeof (ebuf32))) + return (EFAULT); + } + } + + return (error); +} + +/* ARGSUSED3 */ +static int +hyprlofs_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, + int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + int error; + hlnode_t *hp = NULL; + + /* This holds the hp vnode */ + error = hyprlofs_dirlookup(parent, nm, &hp, cr); + if (error) + return (error); + + ASSERT(hp); + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&hp->hln_rwlock, RW_WRITER); + + error = hyprlofs_dirdelete(parent, hp, nm, DR_REMOVE, cr); + + rw_exit(&hp->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_remove(HLNTOV(hp), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(hp); + + return (error); +} + +/* ARGSUSED4 */ +static int +hyprlofs_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, + caller_context_t *ct, int flags) +{ + hlnode_t *parent = (hlnode_t *)VTOHLN(dvp); + hlnode_t *self = NULL; + vnode_t *vp; + int error = 0; + + /* Return error if removing . or .. */ + if (strcmp(nm, ".") == 0) + return (EINVAL); + if (strcmp(nm, "..") == 0) + return (EEXIST); /* Should be ENOTEMPTY */ + error = hyprlofs_dirlookup(parent, nm, &self, cr); + if (error) + return (error); + + rw_enter(&parent->hln_rwlock, RW_WRITER); + rw_enter(&self->hln_rwlock, RW_WRITER); + + vp = HLNTOV(self); + if (vp == dvp || vp == cdir) { + error = EINVAL; + goto done1; + } + if (self->hln_type != VDIR) { + error = ENOTDIR; + goto done1; + } + + /* + * When a dir is looped in, we only remove the in-memory dir, not the + * backing dir. + */ + if (self->hln_looped == 0) { + mutex_enter(&self->hln_tlock); + if (self->hln_nlink > 2) { + mutex_exit(&self->hln_tlock); + error = EEXIST; + goto done1; + } + mutex_exit(&self->hln_tlock); + + if (vn_vfswlock(vp)) { + error = EBUSY; + goto done1; + } + if (vn_mountedvfs(vp) != NULL) { + error = EBUSY; + goto done; + } + + /* + * Check for an empty directory, i.e. only includes entries for + * "." and ".." 
+ */ + if (self->hln_dirents > 2) { + error = EEXIST; /* SIGH should be ENOTEMPTY */ + /* + * Update atime because checking hln_dirents is + * equivalent to reading the directory + */ + gethrestime(&self->hln_atime); + goto done; + } + + error = hyprlofs_dirdelete(parent, self, nm, DR_RMDIR, cr); + } else { + error = hyprlofs_dirdelete(parent, self, nm, DR_REMOVE, cr); + } + +done: + if (self->hln_looped == 0) + vn_vfsunlock(vp); +done1: + rw_exit(&self->hln_rwlock); + rw_exit(&parent->hln_rwlock); + vnevent_rmdir(HLNTOV(self), dvp, nm, ct); + + /* + * We've now dropped the dir link so by rele-ing our vnode we should + * clean up in hyprlofs_inactive. + */ + hlnode_rele(self); + + return (error); +} + +static int +hyprlofs_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hldirent_t *hdp; + int error = 0; + size_t namelen; + struct dirent64 *dp; + ulong_t offset; + ulong_t total_bytes_wanted; + long outcount = 0; + long bufsize; + int reclen; + caddr_t outbuf; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_READDIR(REALVP(vp), uiop, cr, eofp, ct, flags)); + + if (uiop->uio_loffset >= MAXOFF_T) { + if (eofp) + *eofp = 1; + return (0); + } + /* assuming syscall has already called hln_rwlock */ + ASSERT(RW_READ_HELD(&hp->hln_rwlock)); + + if (uiop->uio_iovcnt != 1) + return (EINVAL); + + if (vp->v_type != VDIR) + return (ENOTDIR); + + /* + * There's a window here where someone could have removed + * all the entries in the directory after we put a hold on the + * vnode but before we grabbed the rwlock. Just return. + */ + if (hp->hln_dir == NULL) { + if (hp->hln_nlink) { + panic("empty directory 0x%p", (void *)hp); + /*NOTREACHED*/ + } + return (0); + } + + /* Get space for multiple dir entries */ + total_bytes_wanted = uiop->uio_iov->iov_len; + bufsize = total_bytes_wanted + sizeof (struct dirent64); + outbuf = kmem_alloc(bufsize, KM_SLEEP); + + dp = (struct dirent64 *)((uintptr_t)outbuf); + + offset = 0; + hdp = hp->hln_dir; + while (hdp) { + namelen = strlen(hdp->hld_name); /* no +1 needed */ + offset = hdp->hld_offset; + if (offset >= uiop->uio_offset) { + reclen = (int)DIRENT64_RECLEN(namelen); + if (outcount + reclen > total_bytes_wanted) { + if (!outcount) + /* Buffer too small for any entries. */ + error = EINVAL; + break; + } + ASSERT(hdp->hld_hlnode != NULL); + + /* zero out uninitialized bytes */ + (void) strncpy(dp->d_name, hdp->hld_name, + DIRENT64_NAMELEN(reclen)); + dp->d_reclen = (ushort_t)reclen; + dp->d_ino = (ino64_t)hdp->hld_hlnode->hln_nodeid; + dp->d_off = (offset_t)hdp->hld_offset + 1; + dp = (struct dirent64 *) + ((uintptr_t)dp + dp->d_reclen); + outcount += reclen; + ASSERT(outcount <= bufsize); + } + hdp = hdp->hld_next; + } + + if (!error) + error = uiomove(outbuf, outcount, UIO_READ, uiop); + + if (!error) { + /* + * If we reached the end of the list our offset should now be + * just past the end. 
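+ * Each returned entry carried d_off = hld_offset + 1, so a
+ * caller resuming at the offset we set here picks up with the
+ * first entry at or beyond it on the next call.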
+ */ + if (!hdp) { + offset += 1; + if (eofp) + *eofp = 1; + } else if (eofp) + *eofp = 0; + uiop->uio_offset = offset; + } + gethrestime(&hp->hln_atime); + kmem_free(outbuf, bufsize); + return (error); +} + +static int +hyprlofs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FSYNC(REALVP(vp), syncflag, cr, ct)); + return (0); +} + +/* ARGSUSED */ +static void +hyprlofs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfsmount_t *hm = (hlfsmount_t *)VFSTOHLM(vp->v_vfsp); + + rw_enter(&hp->hln_rwlock, RW_WRITER); + + mutex_enter(&hp->hln_tlock); + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count >= 1); + + /* + * If we don't have the last hold or the link count is non-zero, + * there's nothing to do except drop our hold. + */ + if (vp->v_count > 1 || hp->hln_nlink != 0) { + vp->v_count--; + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + rw_exit(&hp->hln_rwlock); + return; + } + + mutex_exit(&vp->v_lock); + mutex_exit(&hp->hln_tlock); + + /* release hold on the real vnode now */ + if (hp->hln_looped == 1 && hp->hln_realvp != NULL) + VN_RELE(hp->hln_realvp); + + /* Here's our chance to send invalid event while we're between locks */ + vn_invalid(HLNTOV(hp)); + + mutex_enter(&hm->hlm_contents); + if (hp->hln_forw == NULL) + hm->hlm_rootnode->hln_back = hp->hln_back; + else + hp->hln_forw->hln_back = hp->hln_back; + hp->hln_back->hln_forw = hp->hln_forw; + mutex_exit(&hm->hlm_contents); + rw_exit(&hp->hln_rwlock); + rw_destroy(&hp->hln_rwlock); + mutex_destroy(&hp->hln_tlock); + vn_free(HLNTOV(hp)); + kmem_free(hp, sizeof (hlnode_t)); +} + +static int +hyprlofs_fid(vnode_t *vp, struct fid *fidp, caller_context_t *ct) +{ + hlnode_t *hp = (hlnode_t *)VTOHLN(vp); + hlfid_t *hfid; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_FID(REALVP(vp), fidp, ct)); + + if (fidp->fid_len < (sizeof (hlfid_t) - sizeof (ushort_t))) { + fidp->fid_len = sizeof (hlfid_t) - sizeof (ushort_t); + return (ENOSPC); + } + + hfid = (hlfid_t *)fidp; + bzero(hfid, sizeof (hlfid_t)); + hfid->hlfid_len = (int)sizeof (hlfid_t) - sizeof (ushort_t); + + hfid->hlfid_ino = hp->hln_nodeid; + hfid->hlfid_gen = hp->hln_gen; + + return (0); +} + +static int +hyprlofs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_GETPAGE(REALVP(vp), off, len, protp, pl, plsz, seg, addr, + rw, cr, ct)); +} + +int +hyprlofs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, + cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_PUTPAGE(REALVP(vp), off, len, flags, cr, ct)); +} + +static int +hyprlofs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_MAP(REALVP(vp), off, as, addrp, len, prot, maxprot, flags, + cr, ct)); +} + +static int +hyprlofs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap 
*/ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_ADDMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, + size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, + caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_DELMAP(REALVP(vp), off, as, addr, len, prot, maxprot, + flags, cr, ct)); +} + +static int +hyprlofs_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) +{ + /* return EACCES to be consistent with mmap */ + if (VTOHLN(vp)->hln_looped != 1) + return (EACCES); + return (VOP_SPACE(REALVP(vp), cmd, bfp, flag, offset, cr, ct)); +} + +static int +hyprlofs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, + caller_context_t *ct) +{ + if (VTOHLN(vp)->hln_looped == 0) + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); + + return (VOP_SEEK(REALVP(vp), ooff, noffp, ct)); +} + +static int +hyprlofs_rwlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) + return (VOP_RWLOCK(REALVP(vp), write_lock, ct)); + + if (write_lock) { + rw_enter(&hp->hln_rwlock, RW_WRITER); + } else { + rw_enter(&hp->hln_rwlock, RW_READER); + } + return (write_lock); +} + +static void +hyprlofs_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ct) +{ + hlnode_t *hp = VTOHLN(vp); + + if (hp->hln_looped == 1) { + VOP_RWUNLOCK(REALVP(vp), write_lock, ct); + return; + } + + rw_exit(&hp->hln_rwlock); +} + +static int +hyprlofs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + int error; + + if (VTOHLN(vp)->hln_looped == 1) + return (VOP_PATHCONF(REALVP(vp), cmd, valp, cr, ct)); + + switch (cmd) { + case _PC_XATTR_ENABLED: + case _PC_XATTR_EXISTS: + case _PC_SATTR_ENABLED: + case _PC_SATTR_EXISTS: + error = EINVAL; + break; + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + error = 0; + break; + default: + error = fs_pathconf(vp, cmd, valp, cr, ct); + } + return (error); +} + + +struct vnodeops *hyprlofs_vnodeops; + +const fs_operation_def_t hyprlofs_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = hyprlofs_open }, + VOPNAME_CLOSE, { .vop_close = hyprlofs_close }, + VOPNAME_READ, { .vop_read = hyprlofs_read }, + VOPNAME_WRITE, { .vop_write = hyprlofs_write }, + VOPNAME_IOCTL, { .vop_ioctl = hyprlofs_ioctl }, + VOPNAME_GETATTR, { .vop_getattr = hyprlofs_getattr }, + VOPNAME_SETATTR, { .vop_setattr = hyprlofs_setattr }, + VOPNAME_ACCESS, { .vop_access = hyprlofs_access }, + VOPNAME_LOOKUP, { .vop_lookup = hyprlofs_lookup }, + VOPNAME_CREATE, { .error = fs_error }, + VOPNAME_REMOVE, { .vop_remove = hyprlofs_remove }, + VOPNAME_LINK, { .error = fs_error }, + VOPNAME_RENAME, { .error = fs_error }, + VOPNAME_MKDIR, { .error = fs_error }, + VOPNAME_RMDIR, { .vop_rmdir = hyprlofs_rmdir }, + VOPNAME_READDIR, { .vop_readdir = hyprlofs_readdir }, + VOPNAME_SYMLINK, { .error = fs_error }, + VOPNAME_READLINK, { .error = fs_error }, + VOPNAME_FSYNC, { .vop_fsync = hyprlofs_fsync }, + VOPNAME_INACTIVE, { .vop_inactive = hyprlofs_inactive }, + VOPNAME_FID, { .vop_fid = hyprlofs_fid }, + VOPNAME_RWLOCK, { .vop_rwlock = hyprlofs_rwlock }, + VOPNAME_RWUNLOCK, { .vop_rwunlock = hyprlofs_rwunlock }, + VOPNAME_SEEK, { .vop_seek = hyprlofs_seek }, + VOPNAME_SPACE, { .vop_space = hyprlofs_space }, + 
VOPNAME_GETPAGE, { .vop_getpage = hyprlofs_getpage }, + VOPNAME_PUTPAGE, { .vop_putpage = hyprlofs_putpage }, + VOPNAME_MAP, { .vop_map = hyprlofs_map }, + VOPNAME_ADDMAP, { .vop_addmap = hyprlofs_addmap }, + VOPNAME_DELMAP, { .vop_delmap = hyprlofs_delmap }, + VOPNAME_PATHCONF, { .vop_pathconf = hyprlofs_pathconf }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; diff --git a/usr/src/uts/common/fs/lookup.c b/usr/src/uts/common/fs/lookup.c index 55ffb94805..59ec5d1829 100644 --- a/usr/src/uts/common/fs/lookup.c +++ b/usr/src/uts/common/fs/lookup.c @@ -21,6 +21,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. */ @@ -57,6 +58,7 @@ #include <sys/zone.h> #include <sys/dnlc.h> #include <sys/fs/snode.h> +#include <sys/brand.h> /* Controls whether paths are stored with vnodes. */ int vfs_vnode_path = 1; @@ -977,6 +979,96 @@ localpath(char *path, struct vnode *vrootp, cred_t *cr) } /* + * Clean a stale v_path from a vnode. This is only performed if the v_path has + * not been altered since it was found to be stale. + */ +static void +vnode_clear_vpath(vnode_t *vp, char *vpath_old) +{ + mutex_enter(&vp->v_lock); + if (vp->v_path != vn_vpath_empty && vp->v_path == vpath_old) { + vp->v_path = vn_vpath_empty; + mutex_exit(&vp->v_lock); + kmem_free(vpath_old, strlen(vpath_old) + 1); + } else { + mutex_exit(&vp->v_lock); + } +} + +/* + * Validate that a pathname refers to a given vnode. + */ +static int +vnode_valid_pn(vnode_t *vp, vnode_t *vrootp, pathname_t *pn, pathname_t *rpn, + int flags, cred_t *cr) +{ + vnode_t *compvp; + /* + * If we are in a zone or a chroot environment, then we have to + * take additional steps, since the path to the root might not + * be readable with the current credentials, even though the + * process can legitimately access the file. In this case, we + * do the following: + * + * lookuppnvp() with all privileges to get the resolved path. + * call localpath() to get the local portion of the path, and + * continue as normal. + * + * If the conversion to a local path fails, then we continue + * as normal. This is a heuristic to make process object file + * paths available from within a zone. Because lofs doesn't + * support page operations, the vnode stored in the seg_t is + * actually the underlying real vnode, not the lofs node itself. + * Most of the time, the lofs path is the same as the underlying + * vnode (for example, /usr/lib/libc.so.1). + */ + if (vrootp != rootdir) { + char *local = NULL; + + VN_HOLD(rootdir); + if (lookuppnvp(pn, rpn, FOLLOW, NULL, &compvp, rootdir, + rootdir, kcred) == 0) { + local = localpath(rpn->pn_path, vrootp, kcred); + VN_RELE(compvp); + } + + /* + * The original pn was changed through lookuppnvp(). + * Set it to local for next validation attempt. + */ + if (local) { + (void) pn_set(pn, local); + } else { + return (1); + } + } + + /* + * We should have a local path at this point, so start the search from + * the root of the current process. + */ + VN_HOLD(vrootp); + if (vrootp != rootdir) + VN_HOLD(vrootp); + if (lookuppnvp(pn, rpn, FOLLOW | flags, NULL, &compvp, vrootp, vrootp, + cr) == 0) { + /* + * Check to see if the returned vnode is the same as the one we + * expect.
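+ * vn_compare() catches the common case of an identical vnode;
+ * vnode_match() handles filesystems (lofs in particular, per the
+ * comment above) where the same underlying file can be reached
+ * through more than one vnode.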
+ */ + if (vn_compare(vp, compvp) || + vnode_match(vp, compvp, cr)) { + VN_RELE(compvp); + return (0); + } else { + VN_RELE(compvp); + } + } + + return (1); +} + +/* * Given a directory, return the full, resolved path. This looks up "..", * searches for the given vnode in the parent, appends the component, etc. It * is used to implement vnodetopath() and getcwd() when the cached path fails. @@ -995,6 +1087,8 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, char *bufloc; size_t dlen = DIRENT64_RECLEN(MAXPATHLEN); refstr_t *mntpt; + char *vpath_cached; + boolean_t vpath_stale; /* Operation only allowed on directories */ ASSERT(vp->v_type == VDIR); @@ -1088,40 +1182,28 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, * Shortcut: see if this vnode has correct v_path. If so, * we have the work done. */ + vpath_cached = NULL; + vpath_stale = B_FALSE; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { - - if ((err = pn_set(&pn, vp->v_path)) == 0) { - mutex_exit(&vp->v_lock); - rpn.pn_path = rpn.pn_buf; - - /* - * Ensure the v_path pointing to correct vnode - */ - VN_HOLD(vrootp); - if (vrootp != rootdir) - VN_HOLD(vrootp); - if (lookuppnvp(&pn, &rpn, flags, NULL, - &cmpvp, vrootp, vrootp, cr) == 0) { - - if (VN_CMP(vp, cmpvp)) { - VN_RELE(cmpvp); + if (vp->v_path != vn_vpath_empty && + pn_set(&pn, vp->v_path) == 0) { + vpath_cached = vp->v_path; + mutex_exit(&vp->v_lock); + rpn.pn_path = rpn.pn_buf; - complen = strlen(rpn.pn_path); - bufloc -= complen; - if (bufloc < buf) { - err = ERANGE; - goto out; - } - bcopy(rpn.pn_path, bufloc, - complen); - break; - } else { - VN_RELE(cmpvp); - } + /* Ensure the v_path pointing to correct vnode */ + if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, + cr) == 0) { + complen = strlen(rpn.pn_path); + bufloc -= complen; + if (bufloc < buf) { + err = ERANGE; + goto out; } + bcopy(rpn.pn_path, bufloc, complen); + break; } else { - mutex_exit(&vp->v_lock); + vpath_stale = B_TRUE; } } else { mutex_exit(&vp->v_lock); @@ -1166,38 +1248,6 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, } /* - * Try to obtain the path component from dnlc cache - * before searching through the directory. - */ - if ((cmpvp = dnlc_reverse_lookup(vp, dbuf, dlen)) != NULL) { - /* - * If we got parent vnode as a result, - * then the answered path is correct. - */ - if (VN_CMP(cmpvp, pvp)) { - VN_RELE(cmpvp); - complen = strlen(dbuf); - bufloc -= complen; - if (bufloc <= buf) { - err = ENAMETOOLONG; - goto out; - } - bcopy(dbuf, bufloc, complen); - - /* Prepend a slash to the current path */ - *--bufloc = '/'; - - /* And continue with the next component */ - VN_RELE(vp); - vp = pvp; - pvp = NULL; - continue; - } else { - VN_RELE(cmpvp); - } - } - - /* * Search the parent directory for the entry corresponding to * this vnode. */ @@ -1215,6 +1265,11 @@ dirtopath(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, int flags, /* Prepend a slash to the current path. */ *--bufloc = '/'; + /* Clear vp->v_path if it was found to be stale. */ + if (vpath_stale == B_TRUE) { + vnode_clear_vpath(vp, vpath_cached); + } + /* And continue with the next component */ VN_RELE(vp); vp = pvp; @@ -1306,144 +1361,49 @@ vnodetopath_common(vnode_t *vrootp, vnode_t *vp, char *buf, size_t buflen, VN_RELE(vp); } - pn_alloc(&pn); /* - * Check to see if we have a cached path in the vnode. + * Check to see if we have a valid cached path in the vnode. 
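+ * The fast path: copy v_path into a pathname, re-resolve it through
+ * vnode_valid_pn(), and hand back the resolved path on a match. A
+ * missing or stale path falls through to the dirtopath() walk below
+ * (directories only; other vnodes now simply return ENOENT).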
*/ + pn_alloc(&pn); mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { (void) pn_set(&pn, vp->v_path); mutex_exit(&vp->v_lock); - pn_alloc(&rpn); - /* We should only cache absolute paths */ ASSERT(pn.pn_buf[0] == '/'); - /* - * If we are in a zone or a chroot environment, then we have to - * take additional steps, since the path to the root might not - * be readable with the current credentials, even though the - * process can legitmately access the file. In this case, we - * do the following: - * - * lookuppnvp() with all privileges to get the resolved path. - * call localpath() to get the local portion of the path, and - * continue as normal. - * - * If the the conversion to a local path fails, then we continue - * as normal. This is a heuristic to make process object file - * paths available from within a zone. Because lofs doesn't - * support page operations, the vnode stored in the seg_t is - * actually the underlying real vnode, not the lofs node itself. - * Most of the time, the lofs path is the same as the underlying - * vnode (for example, /usr/lib/libc.so.1). - */ - if (vrootp != rootdir) { - char *local = NULL; - VN_HOLD(rootdir); - if (lookuppnvp(&pn, &rpn, FOLLOW, - NULL, &compvp, rootdir, rootdir, kcred) == 0) { - local = localpath(rpn.pn_path, vrootp, - kcred); - VN_RELE(compvp); - } - - /* - * The original pn was changed through lookuppnvp(). - * Set it to local for next validation attempt. - */ - if (local) { - (void) pn_set(&pn, local); - } else { - goto notcached; + pn_alloc(&rpn); + if (vnode_valid_pn(vp, vrootp, &pn, &rpn, flags, cr) == 0) { + /* Return the result, if we're able. */ + if (buflen > rpn.pn_pathlen) { + bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1); + pn_free(&pn); + pn_free(&rpn); + VN_RELE(vrootp); + if (doclose) { + (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, + NULL); + VN_RELE(vp); + } + return (0); } } - /* - * We should have a local path at this point, so start the - * search from the root of the current process. + * A stale v_path will be purged by the later dirtopath lookup. */ - VN_HOLD(vrootp); - if (vrootp != rootdir) - VN_HOLD(vrootp); - ret = lookuppnvp(&pn, &rpn, FOLLOW | flags, NULL, - &compvp, vrootp, vrootp, cr); - if (ret == 0) { - /* - * Check to see if the returned vnode is the same as - * the one we expect. If not, give up. - */ - if (!vn_compare(vp, compvp) && - !vnode_match(vp, compvp, cr)) { - VN_RELE(compvp); - goto notcached; - } - - VN_RELE(compvp); - - /* - * Return the result. - */ - if (buflen <= rpn.pn_pathlen) - goto notcached; - - bcopy(rpn.pn_path, buf, rpn.pn_pathlen + 1); - pn_free(&pn); - pn_free(&rpn); - VN_RELE(vrootp); - if (doclose) { - (void) VOP_CLOSE(vp, FREAD, 1, 0, cr, NULL); - VN_RELE(vp); - } - return (0); - } - -notcached: pn_free(&rpn); } else { mutex_exit(&vp->v_lock); } - pn_free(&pn); if (vp->v_type != VDIR) { - /* - * If we don't have a directory, try to find it in the dnlc via - * reverse lookup. Once this is found, we can use the regular - * directory search to find the full path. 
- */ - if ((pvp = dnlc_reverse_lookup(vp, path, MAXNAMELEN)) != NULL) { - /* - * Check if we have read privilege so, that - * we can lookup the path in the directory - */ - ret = 0; - if ((flags & LOOKUP_CHECKREAD)) { - ret = VOP_ACCESS(pvp, VREAD, 0, cr, NULL); - } - if (ret == 0) { - ret = dirtopath(vrootp, pvp, buf, buflen, - flags, cr); - } - if (ret == 0) { - len = strlen(buf); - if (len + strlen(path) + 1 >= buflen) { - ret = ENAMETOOLONG; - } else { - if (buf[len - 1] != '/') - buf[len++] = '/'; - bcopy(path, buf + len, - strlen(path) + 1); - } - } - - VN_RELE(pvp); - } else - ret = ENOENT; - } else + ret = ENOENT; + } else { ret = dirtopath(vrootp, vp, buf, buflen, flags, cr); + } VN_RELE(vrootp); if (doclose) { diff --git a/usr/src/uts/common/fs/lxproc/lxpr_subr.c b/usr/src/uts/common/fs/lxproc/lxpr_subr.c new file mode 100644 index 0000000000..3c1405d4af --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_subr.c @@ -0,0 +1,524 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#include <sys/varargs.h> +#include <sys/cpuvar.h> +#include <sys/mman.h> +#include <sys/vmsystm.h> +#include <sys/prsystm.h> + +#include "lxproc.h" + +#define LXPRCACHE_NAME "lxpr_cache" + +static int lxpr_node_constructor(void *, void *, int); +static void lxpr_node_destructor(void *, void *); + +static kmem_cache_t *lxpr_node_cache; + +struct lxpr_uiobuf { + uio_t *uiop; + char *buffer; + uint32_t buffsize; + char *pos; + size_t beg; + int error; +}; + +int lxpr_bufsize = 4000; + +struct lxpr_uiobuf * +lxpr_uiobuf_new(uio_t *uiop) +{ + /* Allocate memory for both lxpr_uiobuf and output buffer */ + int bufsize = lxpr_bufsize; + struct lxpr_uiobuf *uiobuf = + kmem_alloc(sizeof (struct lxpr_uiobuf) + bufsize, KM_SLEEP); + + uiobuf->uiop = uiop; + uiobuf->buffer = (char *)&uiobuf[1]; + uiobuf->buffsize = bufsize; + uiobuf->pos = uiobuf->buffer; + uiobuf->beg = 0; + uiobuf->error = 0; + + return (uiobuf); +} + +void +lxpr_uiobuf_free(struct lxpr_uiobuf *uiobuf) +{ + ASSERT(uiobuf != NULL); + ASSERT(uiobuf->pos == uiobuf->buffer); + + kmem_free(uiobuf, sizeof (struct lxpr_uiobuf) + uiobuf->buffsize); +} + +void +lxpr_uiobuf_seek(struct lxpr_uiobuf *uiobuf, offset_t offset) +{ + uiobuf->uiop->uio_offset = (off_t)offset; +} + +void +lxpr_uiobuf_seterr(struct lxpr_uiobuf *uiobuf, int err) +{ + ASSERT(uiobuf->error == 0); + + uiobuf->error = err; +} + +int +lxpr_uiobuf_flush(struct lxpr_uiobuf *uiobuf) +{ + off_t off = uiobuf->uiop->uio_offset; + caddr_t uaddr = uiobuf->buffer; + size_t beg = uiobuf->beg; + size_t size = (uintptr_t)uiobuf->pos - (uintptr_t)uaddr; + + if (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + ASSERT(off >= beg); + + if (beg + size > off && off >= 0) + uiobuf->error = + uiomove(uaddr + (off - beg), size - (off - beg), + UIO_READ, uiobuf->uiop); + + uiobuf->beg += size; + } + + uiobuf->pos = uaddr; + + return (uiobuf->error); +} + +void +lxpr_uiobuf_write(struct lxpr_uiobuf *uiobuf, const char *buf, size_t size) +{ + /* While we can still carry on */ + while (uiobuf->error == 0 && uiobuf->uiop->uio_resid != 0) { + uintptr_t remain = (uintptr_t)uiobuf->buffsize - + ((uintptr_t)uiobuf->pos - (uintptr_t)uiobuf->buffer); + + /* Enough space in buffer? */ + if (remain >= size) { + bcopy(buf, uiobuf->pos, size); + uiobuf->pos += size; + return; + } + + /* Not enough space, so copy all we can and try again */ + bcopy(buf, uiobuf->pos, remain); + uiobuf->pos += remain; + (void) lxpr_uiobuf_flush(uiobuf); + buf += remain; + size -= remain; + } +} + +#define TYPBUFFSIZE 256 + +void +lxpr_uiobuf_printf(struct lxpr_uiobuf *uiobuf, const char *fmt, ...) +{ + va_list args; + char buff[TYPBUFFSIZE]; + int len; + char *buffer; + + /* Can we still do any output */ + if (uiobuf->error != 0 || uiobuf->uiop->uio_resid == 0) + return; + + va_start(args, fmt); + + /* Try using stack allocated buffer */ + len = vsnprintf(buff, TYPBUFFSIZE, fmt, args); + if (len < TYPBUFFSIZE) { + va_end(args); + lxpr_uiobuf_write(uiobuf, buff, len); + return; + } + + /* Not enough space in pre-allocated buffer */ + buffer = kmem_alloc(len + 1, KM_SLEEP); + + /* + * We know we allocated the correct amount of space + * so no check on the return value + */ + (void) vsnprintf(buffer, len+1, fmt, args); + lxpr_uiobuf_write(uiobuf, buffer, len); + va_end(args); + kmem_free(buffer, len+1); +} + +/* + * lxpr_lock(): + * + * Lookup process from pid and return with p_plock and P_PR_LOCK held. 
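+ *
+ * A minimal caller sketch (illustrative only):
+ *
+ *	proc_t *p;
+ *
+ *	if ((p = lxpr_lock(pid)) == NULL)
+ *		return (ENOENT);	(gone, exiting, or still SIDL)
+ *	(... read state protected by p->p_lock ...)
+ *	lxpr_unlock(p);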
*/ +proc_t * +lxpr_lock(pid_t pid) +{ + proc_t *p; + kmutex_t *mp; + + ASSERT(!MUTEX_HELD(&pidlock)); + + for (;;) { + mutex_enter(&pidlock); + + /* + * If the pid is 1, we really want the zone's init process + */ + p = prfind((pid == 1) ? + curproc->p_zone->zone_proc_initpid : pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (NULL); + } + + /* + * p_lock is persistent, but p itself is not -- it could + * vanish during cv_wait(). Load p->p_lock now so we can + * drop it after cv_wait() without referencing p. + */ + mp = &p->p_lock; + mutex_enter(mp); + + mutex_exit(&pidlock); + + if (p->p_flag & SEXITING) { + /* + * This process is exiting -- let it go. + */ + mutex_exit(mp); + return (NULL); + } + + if (!(p->p_proc_flag & P_PR_LOCK)) + break; + + cv_wait(&pr_pid_cv[p->p_slot], mp); + mutex_exit(mp); + } + + p->p_proc_flag |= P_PR_LOCK; + THREAD_KPRI_REQUEST(); + return (p); +} + +/* + * lxpr_unlock() + * + * Unlock a locked process + */ +void +lxpr_unlock(proc_t *p) +{ + ASSERT(p->p_proc_flag & P_PR_LOCK); + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(!MUTEX_HELD(&pidlock)); + + cv_signal(&pr_pid_cv[p->p_slot]); + p->p_proc_flag &= ~P_PR_LOCK; + mutex_exit(&p->p_lock); + THREAD_KPRI_RELEASE(); +} + +void +lxpr_initnodecache() +{ + lxpr_node_cache = kmem_cache_create(LXPRCACHE_NAME, + sizeof (lxpr_node_t), 0, + lxpr_node_constructor, lxpr_node_destructor, NULL, NULL, NULL, 0); +} + +void +lxpr_fininodecache() +{ + kmem_cache_destroy(lxpr_node_cache); +} + +/* ARGSUSED */ +static int +lxpr_node_constructor(void *buf, void *un, int kmflags) +{ + lxpr_node_t *lxpnp = buf; + vnode_t *vp; + + vp = lxpnp->lxpr_vnode = vn_alloc(kmflags); + if (vp == NULL) + return (-1); + + (void) vn_setops(vp, lxpr_vnodeops); + vp->v_data = lxpnp; + + return (0); +} + +/* ARGSUSED */ +static void +lxpr_node_destructor(void *buf, void *un) +{ + lxpr_node_t *lxpnp = buf; + + vn_free(LXPTOV(lxpnp)); +} + +/* + * Calculate an inode number + * + * This takes various bits of info and munges them + * to give the inode number for an lxproc node + */ +ino_t +lxpr_inode(lxpr_nodetype_t type, pid_t pid, int fd) +{ + if (pid == 1) + pid = curproc->p_zone->zone_proc_initpid; + + switch (type) { + case LXPR_PIDDIR: + return (pid + 1); + case LXPR_PROCDIR: + return (maxpid + 2); + case LXPR_PID_FD_FD: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + LXPR_NFILES + fd); + default: + return (maxpid + 2 + + (pid * (LXPR_FD_PERPROC + LXPR_NFILES)) + + type); + } +} + +/* + * Return inode number of parent (directory) + */ +ino_t +lxpr_parentinode(lxpr_node_t *lxpnp) +{ + /* + * If the input node is the root then the parent inode + * is the mounted-on inode, so just return our inode number + */ + if (lxpnp->lxpr_type != LXPR_PROCDIR) + return (VTOLXP(lxpnp->lxpr_parent)->lxpr_ino); + else + return (lxpnp->lxpr_ino); +} + +/* + * Allocate a new lxproc node + * + * This also allocates the vnode associated with it + */ +lxpr_node_t * +lxpr_getnode(vnode_t *dp, lxpr_nodetype_t type, proc_t *p, int fd) +{ + lxpr_node_t *lxpnp; + vnode_t *vp; + user_t *up; + timestruc_t now; + + /* + * Allocate a new node. It is deallocated in lxpr_inactive() + */ + lxpnp = kmem_cache_alloc(lxpr_node_cache, KM_SLEEP); + + /* + * Set defaults (may be overridden below) + */ + gethrestime(&now); + lxpnp->lxpr_type = type; + lxpnp->lxpr_realvp = NULL; + lxpnp->lxpr_parent = dp; + VN_HOLD(dp); + if (p != NULL) { + lxpnp->lxpr_pid = ((p->p_pid == + curproc->p_zone->zone_proc_initpid) ?
1 : p->p_pid); + + lxpnp->lxpr_time = PTOU(p)->u_start; + lxpnp->lxpr_uid = crgetruid(p->p_cred); + lxpnp->lxpr_gid = crgetrgid(p->p_cred); + lxpnp->lxpr_ino = lxpr_inode(type, p->p_pid, fd); + } else { + /* Pretend files without a proc belong to sched */ + lxpnp->lxpr_pid = 0; + lxpnp->lxpr_time = now; + lxpnp->lxpr_uid = lxpnp->lxpr_gid = 0; + lxpnp->lxpr_ino = lxpr_inode(type, 0, 0); + } + + /* initialize the vnode data */ + vp = lxpnp->lxpr_vnode; + vn_reinit(vp); + vp->v_flag = VNOCACHE|VNOMAP|VNOSWAP|VNOMOUNT; + vp->v_vfsp = dp->v_vfsp; + + /* + * Do node specific stuff + */ + switch (type) { + case LXPR_PROCDIR: + vp->v_flag |= VROOT; + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by everyone */ + break; + + case LXPR_PID_CURDIR: + ASSERT(p != NULL); + + /* + * Zombie check. p_stat is officially protected by pidlock, + * but we can't grab pidlock here because we already hold + * p_lock. Luckily if we look at the process exit code + * we see that p_stat only transitions from SRUN to SZOMB + * while p_lock is held. Aside from this, the only other + * p_stat transition that we need to be aware of is + * SIDL to SRUN, but that's not a problem since lxpr_lock() + * ignores nodes in the SIDL state so we'll never get a node + * that isn't already in the SRUN state. + */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = up->u_cdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_ROOTDIR: + ASSERT(p != NULL); + /* Zombie check. See locking comment above */ + if (p->p_stat == SZOMB) { + lxpnp->lxpr_realvp = NULL; + } else { + up = PTOU(p); + lxpnp->lxpr_realvp = + up->u_rdir != NULL ? up->u_rdir : rootdir; + ASSERT(lxpnp->lxpr_realvp != NULL); + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_EXE: + ASSERT(p != NULL); + lxpnp->lxpr_realvp = p->p_exec; + if (lxpnp->lxpr_realvp != NULL) { + VN_HOLD(lxpnp->lxpr_realvp); + } + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; + break; + + case LXPR_SELF: + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0777; /* anyone does anything ! */ + break; + + case LXPR_PID_FD_FD: + ASSERT(p != NULL); + /* lxpr_realvp is set after we return */ + vp->v_type = VLNK; + lxpnp->lxpr_mode = 0700; /* read-write-exe owner only */ + break; + + case LXPR_PID_FDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0500; /* read-search by owner only */ + break; + + case LXPR_PIDDIR: + ASSERT(p != NULL); + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0511; + break; + + case LXPR_NETDIR: + vp->v_type = VDIR; + lxpnp->lxpr_mode = 0555; /* read-search by all */ + break; + + case LXPR_PID_ENV: + case LXPR_PID_MEM: + ASSERT(p != NULL); + /*FALLTHRU*/ + case LXPR_KCORE: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0400; /* read-only by owner only */ + break; + + default: + vp->v_type = VREG; + lxpnp->lxpr_mode = 0444; /* read-only by all */ + break; + } + + return (lxpnp); +} + + +/* + * Free the storage obtained from lxpr_getnode().
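+ * Every node handed out by lxpr_getnode() must eventually come back
+ * through here; normally that happens from lxpr_inactive() once the
+ * vnode's last reference is released.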
+ */ +void +lxpr_freenode(lxpr_node_t *lxpnp) +{ + ASSERT(lxpnp != NULL); + ASSERT(LXPTOV(lxpnp) != NULL); + + /* + * delete any association with realvp + */ + if (lxpnp->lxpr_realvp != NULL) + VN_RELE(lxpnp->lxpr_realvp); + + /* + * delete any association with parent vp + */ + if (lxpnp->lxpr_parent != NULL) + VN_RELE(lxpnp->lxpr_parent); + + /* + * Release the lxprnode. + */ + kmem_cache_free(lxpr_node_cache, lxpnp); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c new file mode 100644 index 0000000000..1bb7bd3823 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vfsops.c @@ -0,0 +1,367 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> +#include <sys/cred.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/proc.h> +#include <sys/stat.h> +#include <sys/statvfs.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/vfs.h> +#include <sys/vfs_opreg.h> +#include <sys/vnode.h> +#include <sys/mode.h> +#include <sys/signal.h> +#include <sys/user.h> +#include <sys/mount.h> +#include <sys/bitmap.h> +#include <sys/kmem.h> +#include <sys/policy.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> + +#include "lxproc.h" + +/* Module level parameters */ +static int lxprocfstype; +static dev_t lxprocdev; +static kmutex_t lxpr_mount_lock; + +int nproc_highbit; /* highbit(v.v_nproc) */ + +static int lxpr_mount(vfs_t *, vnode_t *, mounta_t *, cred_t *); +static int lxpr_unmount(vfs_t *, int, cred_t *); +static int lxpr_root(vfs_t *, vnode_t **); +static int lxpr_statvfs(vfs_t *, statvfs64_t *); +static int lxpr_init(int, char *); + +static vfsdef_t vfw = { + VFSDEF_VERSION, + "lxproc", + lxpr_init, + VSW_ZMOUNT, + NULL +}; + +/* + * Module linkage information for the kernel. 
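+ *
+ * Once the module is loaded (or auto-loaded on first use), the
+ * filesystem mounts by name; an illustrative invocation (mount point
+ * arbitrary) is:
+ *
+ *	mount -F lxproc lxproc /mnt/lxproc
+ *
+ * lxpr_mount() below forces the resource to "lxproc" regardless of
+ * what the caller passes.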
+ */ +extern struct mod_ops mod_fsops; + +static struct modlfs modlfs = { + &mod_fsops, "generic linux procfs", &vfw +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlfs, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int retval; + + /* + * attempt to unload the module + */ + if ((retval = mod_remove(&modlinkage)) != 0) + goto done; + + /* + * destroy lxpr_node cache + */ + lxpr_fininodecache(); + + /* + * clean out the vfsops and vnodeops + */ + (void) vfs_freevfsops_by_type(lxprocfstype); + vn_freevnodeops(lxpr_vnodeops); + + mutex_destroy(&lxpr_mount_lock); +done: + return (retval); +} + +static int +lxpr_init(int fstype, char *name) +{ + static const fs_operation_def_t lxpr_vfsops_template[] = { + VFSNAME_MOUNT, { .vfs_mount = lxpr_mount }, + VFSNAME_UNMOUNT, { .vfs_unmount = lxpr_unmount }, + VFSNAME_ROOT, { .vfs_root = lxpr_root }, + VFSNAME_STATVFS, { .vfs_statvfs = lxpr_statvfs }, + NULL, NULL + }; + extern const fs_operation_def_t lxpr_vnodeops_template[]; + int error; + major_t dev; + + nproc_highbit = highbit(v.v_proc); + lxprocfstype = fstype; + ASSERT(lxprocfstype != 0); + + mutex_init(&lxpr_mount_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * Associate VFS ops vector with this fstype. + */ + error = vfs_setfsops(fstype, lxpr_vfsops_template, NULL); + if (error != 0) { + cmn_err(CE_WARN, "lxpr_init: bad vfs ops template"); + return (error); + } + + /* + * Set up vnode ops vector too. + */ + error = vn_make_ops(name, lxpr_vnodeops_template, &lxpr_vnodeops); + if (error != 0) { + (void) vfs_freevfsops_by_type(fstype); + cmn_err(CE_WARN, "lxpr_init: bad vnode ops template"); + return (error); + } + + /* + * Assign a unique "device" number (used by stat(2)). 
+ */ + if ((dev = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, "lxpr_init: can't get unique device number"); + dev = 0; + } + + /* + * Make the pseudo device + */ + lxprocdev = makedevice(dev, 0); + + /* + * Initialize cache for lxpr_nodes + */ + lxpr_initnodecache(); + + return (0); +} + +static int +lxpr_mount(vfs_t *vfsp, vnode_t *mvp, mounta_t *uap, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt; + zone_t *zone = curproc->p_zone; + ldi_ident_t li; + int err; + + /* + * must be root to mount + */ + if (secpolicy_fs_mount(cr, mvp, vfsp) != 0) + return (EPERM); + + /* + * mount point must be a directory + */ + if (mvp->v_type != VDIR) + return (ENOTDIR); + + if (zone == global_zone) { + zone_t *mntzone; + + mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt)); + zone_rele(mntzone); + if (zone != mntzone) + return (EBUSY); + } + + /* + * Having the resource be anything but "lxproc" doesn't make sense + */ + vfs_setresource(vfsp, "lxproc", 0); + + lxpr_mnt = kmem_alloc(sizeof (*lxpr_mnt), KM_SLEEP); + + if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) { + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + return (err); + } + + lxpr_mnt->lxprm_li = li; + + mutex_enter(&lxpr_mount_lock); + + /* + * Ensure we don't allow overlaying mounts + */ + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count > 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + mutex_exit(&lxpr_mount_lock); + kmem_free(lxpr_mnt, sizeof ((*lxpr_mnt))); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * allocate the first vnode + */ + zone_hold(lxpr_mnt->lxprm_zone = zone); + + /* Arbitrarily set the parent vnode to the mounted over directory */ + lxpr_mnt->lxprm_node = lxpr_getnode(mvp, LXPR_PROCDIR, NULL, 0); + + /* Correctly set the fs for the root node */ + lxpr_mnt->lxprm_node->lxpr_vnode->v_vfsp = vfsp; + + vfs_make_fsid(&vfsp->vfs_fsid, lxprocdev, lxprocfstype); + vfsp->vfs_bsize = DEV_BSIZE; + vfsp->vfs_fstype = lxprocfstype; + vfsp->vfs_data = (caddr_t)lxpr_mnt; + vfsp->vfs_dev = lxprocdev; + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_unmount(vfs_t *vfsp, int flag, cred_t *cr) +{ + lxpr_mnt_t *lxpr_mnt = (lxpr_mnt_t *)vfsp->vfs_data; + vnode_t *vp; + int count; + + ASSERT(lxpr_mnt != NULL); + vp = LXPTOV(lxpr_mnt->lxprm_node); + + mutex_enter(&lxpr_mount_lock); + + /* + * must be root to unmount + */ + if (secpolicy_fs_unmount(cr, vfsp) != 0) { + mutex_exit(&lxpr_mount_lock); + return (EPERM); + } + + /* + * forced unmount is not supported by this file system + */ + if (flag & MS_FORCE) { + mutex_exit(&lxpr_mount_lock); + return (ENOTSUP); + } + + /* + * Ensure that no vnodes are in use on this mount point. 
+ */ + mutex_enter(&vp->v_lock); + count = vp->v_count; + mutex_exit(&vp->v_lock); + if (count > 1) { + mutex_exit(&lxpr_mount_lock); + return (EBUSY); + } + + /* + * purge the dnlc cache for vnode entries + * associated with this file system + */ + count = dnlc_purge_vfsp(vfsp, 0); + + /* + * free up the lxprnode + */ + lxpr_freenode(lxpr_mnt->lxprm_node); + zone_rele(lxpr_mnt->lxprm_zone); + kmem_free(lxpr_mnt, sizeof (*lxpr_mnt)); + + mutex_exit(&lxpr_mount_lock); + + return (0); +} + +static int +lxpr_root(vfs_t *vfsp, vnode_t **vpp) +{ + lxpr_node_t *lxpnp = ((lxpr_mnt_t *)vfsp->vfs_data)->lxprm_node; + vnode_t *vp = LXPTOV(lxpnp); + + VN_HOLD(vp); + *vpp = vp; + return (0); +} + +static int +lxpr_statvfs(vfs_t *vfsp, statvfs64_t *sp) +{ + int n; + dev32_t d32; + extern uint_t nproc; + + n = v.v_proc - nproc; + + bzero((caddr_t)sp, sizeof (*sp)); + sp->f_bsize = DEV_BSIZE; + sp->f_frsize = DEV_BSIZE; + sp->f_blocks = (fsblkcnt64_t)0; + sp->f_bfree = (fsblkcnt64_t)0; + sp->f_bavail = (fsblkcnt64_t)0; + sp->f_files = (fsfilcnt64_t)v.v_proc + 2; + sp->f_ffree = (fsfilcnt64_t)n; + sp->f_favail = (fsfilcnt64_t)n; + (void) cmpldev(&d32, vfsp->vfs_dev); + sp->f_fsid = d32; + /* It is guaranteed that vsw_name will fit in f_basetype */ + (void) strcpy(sp->f_basetype, vfssw[lxprocfstype].vsw_name); + sp->f_flag = vf_to_stf(vfsp->vfs_flag); + sp->f_namemax = 64; /* quite arbitrary */ + + (void) strcpy(sp->f_fstr, "lxproc"); + + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxpr_vnops.c b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c new file mode 100644 index 0000000000..48f4efc1bf --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxpr_vnops.c @@ -0,0 +1,3099 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +/* + * lxproc -- a loosely Linux-compatible /proc + * + * We have -- confusingly -- two implementations of Linux /proc. One is to + * support the LX brand with a Linux /proc entirely compatible with the Linux + * world view; the other -- this one -- is to support native (but Linux-borne) + * programs that wish to view the native system via the Linux /proc model. So + * the aspiration here is to provide something that sufficiently approximates + * the Linux /proc implementation for purposes of offering some compatibility + * for simple Linux /proc readers (e.g., ps/top/htop). However, it is not + * intended to exactly mimic Linux semantics; when choosing between offering + * compatibility and telling the truth, we emphatically pick the truth. 
A + * particular glaring example of this is the Linux notion of "tasks" (that is, + * threads), which -- due to historical misadventures on Linux -- allocate their + * identifiers from the process identifier space. (That is, each thread has in + * effect a pid.) Some Linux /proc readers have come to depend on this + * attribute, and become confused when threads appear with proper identifiers, + * so we simply opt for the pre-2.6 behavior, and do not present the tasks + * directory at all. Similarly, when choosing between offering compatibility + * and remaining consistent with our broader security model, we (obviously) + * choose security over compatibility. In short, this is meant to be a best + * effort -- no more -- and as such, it should not be unified with the much + * more complete Linux /proc implementation found in the LX brand. + */ + +#include <sys/cpupart.h> +#include <sys/cpuvar.h> +#include <sys/session.h> +#include <sys/vmparam.h> +#include <sys/mman.h> +#include <vm/rm.h> +#include <vm/seg_vn.h> +#include <sys/sdt.h> +#include <sys/strlog.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/x86_archext.h> +#include <sys/archsystm.h> +#include <sys/fp.h> +#include <sys/pool_pset.h> +#include <sys/pset.h> +#include <sys/zone.h> +#include <sys/pghw.h> +#include <sys/vfs_opreg.h> + +/* Dependent on procfs */ +extern kthread_t *prchoose(proc_t *); + +#include "lxproc.h" + +extern pgcnt_t swapfs_minfree; +extern time_t boot_time; + +/* + * Pointer to the vnode ops vector for this fs. + * This is instantiated in lxprinit() in lxpr_vfsops.c + */ +vnodeops_t *lxpr_vnodeops; + +static int lxpr_open(vnode_t **, int, cred_t *, caller_context_t *); +static int lxpr_close(vnode_t *, int, int, offset_t, cred_t *, + caller_context_t *); +static int lxpr_read(vnode_t *, uio_t *, int, cred_t *, caller_context_t *); +static int lxpr_getattr(vnode_t *, vattr_t *, int, cred_t *, + caller_context_t *); +static int lxpr_access(vnode_t *, int, int, cred_t *, caller_context_t *); +static int lxpr_lookup(vnode_t *, char *, vnode_t **, + pathname_t *, int, vnode_t *, cred_t *, caller_context_t *, int *, + pathname_t *); +static int lxpr_readdir(vnode_t *, uio_t *, cred_t *, int *, + caller_context_t *, int); +static int lxpr_readlink(vnode_t *, uio_t *, cred_t *, caller_context_t *); +static int lxpr_cmp(vnode_t *, vnode_t *, caller_context_t *); +static int lxpr_realvp(vnode_t *, vnode_t **, caller_context_t *); +static int lxpr_sync(void); +static void lxpr_inactive(vnode_t *, cred_t *, caller_context_t *); + +static vnode_t *lxpr_lookup_procdir(vnode_t *, char *); +static vnode_t *lxpr_lookup_piddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_not_a_dir(vnode_t *, char *); +static vnode_t *lxpr_lookup_fddir(vnode_t *, char *); +static vnode_t *lxpr_lookup_netdir(vnode_t *, char *); + +static int lxpr_readdir_procdir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_piddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_not_a_dir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_fddir(lxpr_node_t *, uio_t *, int *); +static int lxpr_readdir_netdir(lxpr_node_t *, uio_t *, int *); + +static void lxpr_read_invalid(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_empty(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_cpuinfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_isdir(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_fd(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_kmsg(lxpr_node_t *, lxpr_uiobuf_t *, 
ldi_handle_t); +static void lxpr_read_loadavg(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_meminfo(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_mounts(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_partitions(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_uptime(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_version(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_pid_cmdline(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_maps(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_statm(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_pid_status(lxpr_node_t *, lxpr_uiobuf_t *); + +static void lxpr_read_net_arp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_dev_mcast(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_igmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_ip_mr_vif(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_mcfilter(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_netstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_raw(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_route(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rpc(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_rt_cache(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_sockstat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_snmp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_stat(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_tcp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_udp(lxpr_node_t *, lxpr_uiobuf_t *); +static void lxpr_read_net_unix(lxpr_node_t *, lxpr_uiobuf_t *); + +/* + * Simple conversion + */ +#define btok(x) ((x) >> 10) /* bytes to kbytes */ +#define ptok(x) ((x) << (PAGESHIFT - 10)) /* pages to kbytes */ + +/* + * The lxproc vnode operations vector + */ +const fs_operation_def_t lxpr_vnodeops_template[] = { + VOPNAME_OPEN, { .vop_open = lxpr_open }, + VOPNAME_CLOSE, { .vop_close = lxpr_close }, + VOPNAME_READ, { .vop_read = lxpr_read }, + VOPNAME_GETATTR, { .vop_getattr = lxpr_getattr }, + VOPNAME_ACCESS, { .vop_access = lxpr_access }, + VOPNAME_LOOKUP, { .vop_lookup = lxpr_lookup }, + VOPNAME_READDIR, { .vop_readdir = lxpr_readdir }, + VOPNAME_READLINK, { .vop_readlink = lxpr_readlink }, + VOPNAME_FSYNC, { .error = lxpr_sync }, + VOPNAME_SEEK, { .error = lxpr_sync }, + VOPNAME_INACTIVE, { .vop_inactive = lxpr_inactive }, + VOPNAME_CMP, { .vop_cmp = lxpr_cmp }, + VOPNAME_REALVP, { .vop_realvp = lxpr_realvp }, + NULL, NULL +}; + +/* + * file contents of an lxproc directory. 
+ */ +static lxpr_dirent_t lxpr_dir[] = { + { LXPR_CMDLINE, "cmdline" }, + { LXPR_CPUINFO, "cpuinfo" }, + { LXPR_DEVICES, "devices" }, + { LXPR_DMA, "dma" }, + { LXPR_FILESYSTEMS, "filesystems" }, + { LXPR_INTERRUPTS, "interrupts" }, + { LXPR_IOPORTS, "ioports" }, + { LXPR_KCORE, "kcore" }, + { LXPR_KMSG, "kmsg" }, + { LXPR_LOADAVG, "loadavg" }, + { LXPR_MEMINFO, "meminfo" }, + { LXPR_MOUNTS, "mounts" }, + { LXPR_NETDIR, "net" }, + { LXPR_PARTITIONS, "partitions" }, + { LXPR_SELF, "self" }, + { LXPR_STAT, "stat" }, + { LXPR_UPTIME, "uptime" }, + { LXPR_VERSION, "version" } +}; + +#define PROCDIRFILES (sizeof (lxpr_dir) / sizeof (lxpr_dir[0])) + +/* + * Contents of an /lxproc/<pid> directory. + */ +static lxpr_dirent_t piddir[] = { + { LXPR_PID_CMDLINE, "cmdline" }, + { LXPR_PID_CPU, "cpu" }, + { LXPR_PID_CURDIR, "cwd" }, + { LXPR_PID_ENV, "environ" }, + { LXPR_PID_EXE, "exe" }, + { LXPR_PID_MAPS, "maps" }, + { LXPR_PID_MEM, "mem" }, + { LXPR_PID_ROOTDIR, "root" }, + { LXPR_PID_STAT, "stat" }, + { LXPR_PID_STATM, "statm" }, + { LXPR_PID_STATUS, "status" }, + { LXPR_PID_FDDIR, "fd" } +}; + +#define PIDDIRFILES (sizeof (piddir) / sizeof (piddir[0])) + +/* + * contents of /lxproc/net directory + */ +static lxpr_dirent_t netdir[] = { + { LXPR_NET_ARP, "arp" }, + { LXPR_NET_DEV, "dev" }, + { LXPR_NET_DEV_MCAST, "dev_mcast" }, + { LXPR_NET_IGMP, "igmp" }, + { LXPR_NET_IP_MR_CACHE, "ip_mr_cache" }, + { LXPR_NET_IP_MR_VIF, "ip_mr_vif" }, + { LXPR_NET_MCFILTER, "mcfilter" }, + { LXPR_NET_NETSTAT, "netstat" }, + { LXPR_NET_RAW, "raw" }, + { LXPR_NET_ROUTE, "route" }, + { LXPR_NET_RPC, "rpc" }, + { LXPR_NET_RT_CACHE, "rt_cache" }, + { LXPR_NET_SOCKSTAT, "sockstat" }, + { LXPR_NET_SNMP, "snmp" }, + { LXPR_NET_STAT, "stat" }, + { LXPR_NET_TCP, "tcp" }, + { LXPR_NET_UDP, "udp" }, + { LXPR_NET_UNIX, "unix" } +}; + +#define NETDIRFILES (sizeof (netdir) / sizeof (netdir[0])) + +/* + * These are the major signal number differences between Linux and native: + * + * ==================================== + * | Number | Linux | Native | + * | ====== | ========= | ========== | + * | 7 | SIGBUS | SIGEMT | + * | 10 | SIGUSR1 | SIGBUS | + * | 12 | SIGUSR2 | SIGSYS | + * | 16 | SIGSTKFLT | SIGUSR1 | + * | 17 | SIGCHLD | SIGUSR2 | + * | 18 | SIGCONT | SIGCHLD | + * | 19 | SIGSTOP | SIGPWR | + * | 20 | SIGTSTP | SIGWINCH | + * | 21 | SIGTTIN | SIGURG | + * | 22 | SIGTTOU | SIGPOLL | + * | 23 | SIGURG | SIGSTOP | + * | 24 | SIGXCPU | SIGTSTP | + * | 25 | SIGXFSZ | SIGCONT | + * | 26 | SIGVTALARM | SIGTTIN | + * | 27 | SIGPROF | SIGTTOU | + * | 28 | SIGWINCH | SIGVTALARM | + * | 29 | SIGPOLL | SIGPROF | + * | 30 | SIGPWR | SIGXCPU | + * | 31 | SIGSYS | SIGXFSZ | + * ==================================== + * + * Not every Linux signal maps to a native signal, nor does every native + * signal map to a Linux counterpart. However, when signals do map, the + * mapping is unique. 
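+ *
+ * Consumers index the table below by native signal number, as
+ * lxpr_read_pid_status() does; a sketch of the translation is a
+ * bounds-checked lookup along the lines of
+ *
+ *	int lx_sig = (sig > 0 && sig < NSIG) ? lxpr_sigmap[sig] : -1;
+ *
+ * where -1 denotes a native signal with no Linux counterpart (the
+ * illumos-only signals annotated below).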
+ */
+static int
+lxpr_sigmap[NSIG] = {
+	0,
+	LX_SIGHUP,
+	LX_SIGINT,
+	LX_SIGQUIT,
+	LX_SIGILL,
+	LX_SIGTRAP,
+	LX_SIGABRT,
+	LX_SIGSTKFLT,
+	LX_SIGFPE,
+	LX_SIGKILL,
+	LX_SIGBUS,
+	LX_SIGSEGV,
+	LX_SIGSYS,
+	LX_SIGPIPE,
+	LX_SIGALRM,
+	LX_SIGTERM,
+	LX_SIGUSR1,
+	LX_SIGUSR2,
+	LX_SIGCHLD,
+	LX_SIGPWR,
+	LX_SIGWINCH,
+	LX_SIGURG,
+	LX_SIGPOLL,
+	LX_SIGSTOP,
+	LX_SIGTSTP,
+	LX_SIGCONT,
+	LX_SIGTTIN,
+	LX_SIGTTOU,
+	LX_SIGVTALRM,
+	LX_SIGPROF,
+	LX_SIGXCPU,
+	LX_SIGXFSZ,
+	-1,			/* 32: illumos SIGWAITING */
+	-1,			/* 33: illumos SIGLWP */
+	-1,			/* 34: illumos SIGFREEZE */
+	-1,			/* 35: illumos SIGTHAW */
+	-1,			/* 36: illumos SIGCANCEL */
+	-1,			/* 37: illumos SIGLOST */
+	-1,			/* 38: illumos SIGXRES */
+	-1,			/* 39: illumos SIGJVM1 */
+	-1,			/* 40: illumos SIGJVM2 */
+	-1,			/* 41: illumos SIGINFO */
+	LX_SIGRTMIN,		/* 42: illumos _SIGRTMIN */
+	LX_SIGRTMIN + 1,
+	LX_SIGRTMIN + 2,
+	LX_SIGRTMIN + 3,
+	LX_SIGRTMIN + 4,
+	LX_SIGRTMIN + 5,
+	LX_SIGRTMIN + 6,
+	LX_SIGRTMIN + 7,
+	LX_SIGRTMIN + 8,
+	LX_SIGRTMIN + 9,
+	LX_SIGRTMIN + 10,
+	LX_SIGRTMIN + 11,
+	LX_SIGRTMIN + 12,
+	LX_SIGRTMIN + 13,
+	LX_SIGRTMIN + 14,
+	LX_SIGRTMIN + 15,
+	LX_SIGRTMIN + 16,
+	LX_SIGRTMIN + 17,
+	LX_SIGRTMIN + 18,
+	LX_SIGRTMIN + 19,
+	LX_SIGRTMIN + 20,
+	LX_SIGRTMIN + 21,
+	LX_SIGRTMIN + 22,
+	LX_SIGRTMIN + 23,
+	LX_SIGRTMIN + 24,
+	LX_SIGRTMIN + 25,
+	LX_SIGRTMIN + 26,
+	LX_SIGRTMIN + 27,
+	LX_SIGRTMIN + 28,
+	LX_SIGRTMIN + 29,
+	LX_SIGRTMIN + 30,
+	LX_SIGRTMAX
+};
+
+/*
+ * lxpr_open(): Vnode operation for VOP_OPEN()
+ */
+static int
+lxpr_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
+{
+	vnode_t *vp = *vpp;
+	lxpr_node_t *lxpnp = VTOLXP(vp);
+	lxpr_nodetype_t type = lxpnp->lxpr_type;
+	vnode_t *rvp;
+	int error = 0;
+
+	/*
+	 * We only allow reading in this file system
+	 */
+	if (flag & FWRITE)
+		return (EROFS);
+
+	/*
+	 * If we are opening an underlying file, only allow regular files:
+	 * reject the open of a /proc/<pid>/fd entry for anything but a
+	 * regular file.  For the current and root directories we simply
+	 * pass the open through.
+	 */
+	if (lxpnp->lxpr_realvp != NULL) {
+		rvp = lxpnp->lxpr_realvp;
+
+		if (type == LXPR_PID_FD_FD && rvp->v_type != VREG)
+			error = EACCES;
+		else {
+			/*
+			 * Need to hold rvp since VOP_OPEN() may release it.
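+			 * VOP_OPEN() takes a vnode_t ** precisely because
+			 * it may swap in a different vnode (as, e.g., a
+			 * clone open does); on success we hand the
+			 * resulting rvp back to our caller and release the
+			 * lxproc vnode.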
+ */ + VN_HOLD(rvp); + error = VOP_OPEN(&rvp, flag, cr, ct); + if (error) { + VN_RELE(rvp); + } else { + *vpp = rvp; + VN_RELE(vp); + } + } + } + + return (error); +} + + +/* + * lxpr_close(): Vnode operation for VOP_CLOSE() + */ +/* ARGSUSED */ +static int +lxpr_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpr = VTOLXP(vp); + lxpr_nodetype_t type = lxpr->lxpr_type; + + /* + * we should never get here because the close is done on the realvp + * for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR && + type != LXPR_PID_EXE); + + return (0); +} + +static void (*lxpr_read_function[LXPR_NFILES])() = { + lxpr_read_isdir, /* /proc */ + lxpr_read_isdir, /* /proc/<pid> */ + lxpr_read_pid_cmdline, /* /proc/<pid>/cmdline */ + lxpr_read_empty, /* /proc/<pid>/cpu */ + lxpr_read_invalid, /* /proc/<pid>/cwd */ + lxpr_read_empty, /* /proc/<pid>/environ */ + lxpr_read_invalid, /* /proc/<pid>/exe */ + lxpr_read_pid_maps, /* /proc/<pid>/maps */ + lxpr_read_empty, /* /proc/<pid>/mem */ + lxpr_read_invalid, /* /proc/<pid>/root */ + lxpr_read_pid_stat, /* /proc/<pid>/stat */ + lxpr_read_pid_statm, /* /proc/<pid>/statm */ + lxpr_read_pid_status, /* /proc/<pid>/status */ + lxpr_read_isdir, /* /proc/<pid>/fd */ + lxpr_read_fd, /* /proc/<pid>/fd/nn */ + lxpr_read_empty, /* /proc/cmdline */ + lxpr_read_cpuinfo, /* /proc/cpuinfo */ + lxpr_read_empty, /* /proc/devices */ + lxpr_read_empty, /* /proc/dma */ + lxpr_read_empty, /* /proc/filesystems */ + lxpr_read_empty, /* /proc/interrupts */ + lxpr_read_empty, /* /proc/ioports */ + lxpr_read_empty, /* /proc/kcore */ + lxpr_read_invalid, /* /proc/kmsg -- see lxpr_read() */ + lxpr_read_loadavg, /* /proc/loadavg */ + lxpr_read_meminfo, /* /proc/meminfo */ + lxpr_read_mounts, /* /proc/mounts */ + lxpr_read_isdir, /* /proc/net */ + lxpr_read_net_arp, /* /proc/net/arp */ + lxpr_read_net_dev, /* /proc/net/dev */ + lxpr_read_net_dev_mcast, /* /proc/net/dev_mcast */ + lxpr_read_net_igmp, /* /proc/net/igmp */ + lxpr_read_net_ip_mr_cache, /* /proc/net/ip_mr_cache */ + lxpr_read_net_ip_mr_vif, /* /proc/net/ip_mr_vif */ + lxpr_read_net_mcfilter, /* /proc/net/mcfilter */ + lxpr_read_net_netstat, /* /proc/net/netstat */ + lxpr_read_net_raw, /* /proc/net/raw */ + lxpr_read_net_route, /* /proc/net/route */ + lxpr_read_net_rpc, /* /proc/net/rpc */ + lxpr_read_net_rt_cache, /* /proc/net/rt_cache */ + lxpr_read_net_sockstat, /* /proc/net/sockstat */ + lxpr_read_net_snmp, /* /proc/net/snmp */ + lxpr_read_net_stat, /* /proc/net/stat */ + lxpr_read_net_tcp, /* /proc/net/tcp */ + lxpr_read_net_udp, /* /proc/net/udp */ + lxpr_read_net_unix, /* /proc/net/unix */ + lxpr_read_partitions, /* /proc/partitions */ + lxpr_read_invalid, /* /proc/self */ + lxpr_read_stat, /* /proc/stat */ + lxpr_read_uptime, /* /proc/uptime */ + lxpr_read_version, /* /proc/version */ +}; + +/* + * Array of lookup functions, indexed by /lxproc file type. 
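+ * The entries must appear in the same order as the lxpr_nodetype_t
+ * values that index them; a NULL return from any of these handlers
+ * turns into ENOENT in lxpr_lookup().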
+ */ +static vnode_t *(*lxpr_lookup_function[LXPR_NFILES])() = { + lxpr_lookup_procdir, /* /proc */ + lxpr_lookup_piddir, /* /proc/<pid> */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/environ */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/exe */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/maps */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/mem */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/root */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/stat */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/statm */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/status */ + lxpr_lookup_fddir, /* /proc/<pid>/fd */ + lxpr_lookup_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_lookup_not_a_dir, /* /proc/cmdline */ + lxpr_lookup_not_a_dir, /* /proc/cpuinfo */ + lxpr_lookup_not_a_dir, /* /proc/devices */ + lxpr_lookup_not_a_dir, /* /proc/dma */ + lxpr_lookup_not_a_dir, /* /proc/filesystems */ + lxpr_lookup_not_a_dir, /* /proc/interrupts */ + lxpr_lookup_not_a_dir, /* /proc/ioports */ + lxpr_lookup_not_a_dir, /* /proc/kcore */ + lxpr_lookup_not_a_dir, /* /proc/kmsg */ + lxpr_lookup_not_a_dir, /* /proc/loadavg */ + lxpr_lookup_not_a_dir, /* /proc/meminfo */ + lxpr_lookup_not_a_dir, /* /proc/mounts */ + lxpr_lookup_netdir, /* /proc/net */ + lxpr_lookup_not_a_dir, /* /proc/net/arp */ + lxpr_lookup_not_a_dir, /* /proc/net/dev */ + lxpr_lookup_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_lookup_not_a_dir, /* /proc/net/igmp */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_lookup_not_a_dir, /* /proc/net/mcfilter */ + lxpr_lookup_not_a_dir, /* /proc/net/netstat */ + lxpr_lookup_not_a_dir, /* /proc/net/raw */ + lxpr_lookup_not_a_dir, /* /proc/net/route */ + lxpr_lookup_not_a_dir, /* /proc/net/rpc */ + lxpr_lookup_not_a_dir, /* /proc/net/rt_cache */ + lxpr_lookup_not_a_dir, /* /proc/net/sockstat */ + lxpr_lookup_not_a_dir, /* /proc/net/snmp */ + lxpr_lookup_not_a_dir, /* /proc/net/stat */ + lxpr_lookup_not_a_dir, /* /proc/net/tcp */ + lxpr_lookup_not_a_dir, /* /proc/net/udp */ + lxpr_lookup_not_a_dir, /* /proc/net/unix */ + lxpr_lookup_not_a_dir, /* /proc/partitions */ + lxpr_lookup_not_a_dir, /* /proc/self */ + lxpr_lookup_not_a_dir, /* /proc/stat */ + lxpr_lookup_not_a_dir, /* /proc/uptime */ + lxpr_lookup_not_a_dir, /* /proc/version */ +}; + +/* + * Array of readdir functions, indexed by /proc file type. 
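+ * Non-directory node types map to lxpr_readdir_not_a_dir(), which
+ * simply fails with ENOTDIR.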
+ */ +static int (*lxpr_readdir_function[LXPR_NFILES])() = { + lxpr_readdir_procdir, /* /proc */ + lxpr_readdir_piddir, /* /proc/<pid> */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cpu */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/cwd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/environ */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/exe */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/maps */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/mem */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/root */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/stat */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/statm */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/status */ + lxpr_readdir_fddir, /* /proc/<pid>/fd */ + lxpr_readdir_not_a_dir, /* /proc/<pid>/fd/nn */ + lxpr_readdir_not_a_dir, /* /proc/cmdline */ + lxpr_readdir_not_a_dir, /* /proc/cpuinfo */ + lxpr_readdir_not_a_dir, /* /proc/devices */ + lxpr_readdir_not_a_dir, /* /proc/dma */ + lxpr_readdir_not_a_dir, /* /proc/filesystems */ + lxpr_readdir_not_a_dir, /* /proc/interrupts */ + lxpr_readdir_not_a_dir, /* /proc/ioports */ + lxpr_readdir_not_a_dir, /* /proc/kcore */ + lxpr_readdir_not_a_dir, /* /proc/kmsg */ + lxpr_readdir_not_a_dir, /* /proc/loadavg */ + lxpr_readdir_not_a_dir, /* /proc/meminfo */ + lxpr_readdir_not_a_dir, /* /proc/mounts */ + lxpr_readdir_netdir, /* /proc/net */ + lxpr_readdir_not_a_dir, /* /proc/net/arp */ + lxpr_readdir_not_a_dir, /* /proc/net/dev */ + lxpr_readdir_not_a_dir, /* /proc/net/dev_mcast */ + lxpr_readdir_not_a_dir, /* /proc/net/igmp */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/ip_mr_vif */ + lxpr_readdir_not_a_dir, /* /proc/net/mcfilter */ + lxpr_readdir_not_a_dir, /* /proc/net/netstat */ + lxpr_readdir_not_a_dir, /* /proc/net/raw */ + lxpr_readdir_not_a_dir, /* /proc/net/route */ + lxpr_readdir_not_a_dir, /* /proc/net/rpc */ + lxpr_readdir_not_a_dir, /* /proc/net/rt_cache */ + lxpr_readdir_not_a_dir, /* /proc/net/sockstat */ + lxpr_readdir_not_a_dir, /* /proc/net/snmp */ + lxpr_readdir_not_a_dir, /* /proc/net/stat */ + lxpr_readdir_not_a_dir, /* /proc/net/tcp */ + lxpr_readdir_not_a_dir, /* /proc/net/udp */ + lxpr_readdir_not_a_dir, /* /proc/net/unix */ + lxpr_readdir_not_a_dir, /* /proc/partitions */ + lxpr_readdir_not_a_dir, /* /proc/self */ + lxpr_readdir_not_a_dir, /* /proc/stat */ + lxpr_readdir_not_a_dir, /* /proc/uptime */ + lxpr_readdir_not_a_dir, /* /proc/version */ +}; + + +/* + * lxpr_read(): Vnode operation for VOP_READ() + * + * As the format of all the files that can be read in lxproc is human readable + * and not binary structures there do not have to be different read variants + * depending on whether the reading process model is 32- or 64-bit. + */ +/* ARGSUSED */ +static int +lxpr_read(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, + caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + lxpr_uiobuf_t *uiobuf = lxpr_uiobuf_new(uiop); + int error; + + ASSERT(type < LXPR_NFILES); + + if (type == LXPR_KMSG) { + ldi_ident_t li = VTOLXPM(vp)->lxprm_li; + ldi_handle_t ldih; + struct strioctl str; + int rv; + + /* + * Open the zone's console device using the layered driver + * interface. + */ + if ((error = + ldi_open_by_name("/dev/log", FREAD, cr, &ldih, li)) != 0) + return (error); + + /* + * Send an ioctl to the underlying console device, letting it + * know we're interested in getting console messages. 
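+		 * (I_CONSLOG is the log driver ioctl that subscribes a
+		 * stream to console message delivery, much as syslogd
+		 * does on /dev/log.)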
+ */ + str.ic_cmd = I_CONSLOG; + str.ic_timout = 0; + str.ic_len = 0; + str.ic_dp = NULL; + if ((error = ldi_ioctl(ldih, I_STR, + (intptr_t)&str, FKIOCTL, cr, &rv)) != 0) + return (error); + + lxpr_read_kmsg(lxpnp, uiobuf, ldih); + + if ((error = ldi_close(ldih, FREAD, cr)) != 0) + return (error); + } else { + lxpr_read_function[type](lxpnp, uiobuf); + } + + error = lxpr_uiobuf_flush(uiobuf); + lxpr_uiobuf_free(uiobuf); + + return (error); +} + +/* + * lxpr_read_invalid(), lxpr_read_isdir(), lxpr_read_empty() + * + * Various special case reads: + * - trying to read a directory + * - invalid file (used to mean a file that should be implemented, + * but isn't yet) + * - empty file + * - wait to be able to read a file that will never have anything to read + */ +/* ARGSUSED */ +static void +lxpr_read_isdir(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EISDIR); +} + +/* ARGSUSED */ +static void +lxpr_read_invalid(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_seterr(uiobuf, EINVAL); +} + +/* ARGSUSED */ +static void +lxpr_read_empty(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_pid_cmdline(): + * + * This is not precisely compatible with Linux: the Linux cmdline returns argv + * with the correct separation using \0 between the arguments, but we cannot do + * that without copying the real argv from the correct process context. This + * is too difficult to attempt so we pretend that the entire cmdline is just + * argv[0]. This is good enough for ps and htop to display correctly, but might + * cause some other things not to work correctly. + */ +static void +lxpr_read_pid_cmdline(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + char *buf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_CMDLINE); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + buf = PTOU(p)->u_argv != 0 ? 
PTOU(p)->u_psargs : PTOU(p)->u_comm; + + lxpr_uiobuf_write(uiobuf, buf, strlen(buf) + 1); + lxpr_unlock(p); +} + +/* + * lxpr_read_pid_maps(): memory map file + */ +static void +lxpr_read_pid_maps(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + struct seg *seg; + char *buf; + int buflen = MAXPATHLEN; + struct print_data { + caddr_t saddr; + caddr_t eaddr; + int type; + char prot[5]; + uint32_t offset; + vnode_t *vp; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + struct print_data *pbuf; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_MAPS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + if (as == &kas) { + lxpr_unlock(p); + return; + } + + mutex_exit(&p->p_lock); + + /* Iterate over all segments in the address space */ + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + for (seg = AS_SEGFIRST(as); seg != NULL; seg = AS_SEGNEXT(as, seg)) { + vnode_t *vp; + uint_t protbits; + + pbuf = kmem_alloc(sizeof (*pbuf), KM_SLEEP); + + pbuf->saddr = seg->s_base; + pbuf->eaddr = seg->s_base+seg->s_size; + pbuf->type = SEGOP_GETTYPE(seg, seg->s_base); + + /* + * Cheat and only use the protection bits of the first page + * in the segment + */ + (void) strncpy(pbuf->prot, "----", sizeof (pbuf->prot)); + (void) SEGOP_GETPROT(seg, seg->s_base, 0, &protbits); + + if (protbits & PROT_READ) pbuf->prot[0] = 'r'; + if (protbits & PROT_WRITE) pbuf->prot[1] = 'w'; + if (protbits & PROT_EXEC) pbuf->prot[2] = 'x'; + if (pbuf->type & MAP_SHARED) pbuf->prot[3] = 's'; + else if (pbuf->type & MAP_PRIVATE) pbuf->prot[3] = 'p'; + + if (seg->s_ops == &segvn_ops && + SEGOP_GETVP(seg, seg->s_base, &vp) == 0 && + vp != NULL && vp->v_type == VREG) { + VN_HOLD(vp); + pbuf->vp = vp; + } else { + pbuf->vp = NULL; + } + + pbuf->offset = (uint32_t)SEGOP_GETOFFSET(seg, pbuf->saddr); + + pbuf->next = NULL; + *print_tail = pbuf; + print_tail = &pbuf->next; + } + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + buf = kmem_alloc(buflen, KM_SLEEP); + + /* print the data we've extracted */ + pbuf = print_head; + while (pbuf != NULL) { + struct print_data *pbuf_next; + vattr_t vattr; + + int maj = 0; + int min = 0; + u_longlong_t inode = 0; + + *buf = '\0'; + if (pbuf->vp != NULL) { + vattr.va_mask = AT_FSID | AT_NODEID; + if (VOP_GETATTR(pbuf->vp, &vattr, 0, CRED(), + NULL) == 0) { + maj = getmajor(vattr.va_fsid); + min = getminor(vattr.va_fsid); + inode = vattr.va_nodeid; + } + (void) vnodetopath(NULL, pbuf->vp, buf, buflen, CRED()); + VN_RELE(pbuf->vp); + } + + if (*buf != '\0') { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld %s\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode, buf); + } else { + lxpr_uiobuf_printf(uiobuf, + "%08x-%08x %s %08x %02d:%03d %lld\n", + pbuf->saddr, pbuf->eaddr, pbuf->prot, pbuf->offset, + maj, min, inode); + } + + pbuf_next = pbuf->next; + kmem_free(pbuf, sizeof (*pbuf)); + pbuf = pbuf_next; + } + + kmem_free(buf, buflen); +} + +/* + * lxpr_read_pid_statm(): memory status file + */ +static void +lxpr_read_pid_statm(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + struct as *as; + size_t vsize; + size_t rss; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATM); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + as = p->p_as; + + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = 
btopr(as->a_resvsize); + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + + lxpr_uiobuf_printf(uiobuf, + "%lu %lu %lu %lu %lu %lu %lu\n", + vsize, rss, 0l, rss, 0l, 0l, 0l); +} + +/* + * lxpr_read_pid_status(): status file + */ +static void +lxpr_read_pid_status(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + user_t *up; + cred_t *cr; + const gid_t *groups; + int ngroups; + struct as *as; + char *status; + pid_t pid, ppid; + size_t vsize; + size_t rss; + k_sigset_t current, ignore, handle; + int i, lx_sig; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STATUS); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Convert pid to the Linux default of 1 if we're the zone's init + * process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; + ppid = 0; /* parent pid for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) + ? curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + status = "S (sleeping)"; + break; + case TS_RUN: + case TS_ONPROC: + status = "R (running)"; + break; + case TS_ZOMB: + status = "Z (zombie)"; + break; + case TS_STOPPED: + status = "T (stopped)"; + break; + default: + status = "! (unknown)"; + break; + } + thread_unlock(t); + } else { + /* + * there is a hole in the exit code, where a proc can have + * no threads but it is yet to be flagged SZOMB. 
We will + * assume we are about to become a zombie + */ + status = "Z (zombie)"; + } + + up = PTOU(p); + mutex_enter(&p->p_crlock); + crhold(cr = p->p_cred); + mutex_exit(&p->p_crlock); + + lxpr_uiobuf_printf(uiobuf, + "Name:\t%s\n" + "State:\t%s\n" + "Tgid:\t%d\n" + "Pid:\t%d\n" + "PPid:\t%d\n" + "TracerPid:\t%d\n" + "Uid:\t%u\t%u\t%u\t%u\n" + "Gid:\t%u\t%u\t%u\t%u\n" + "FDSize:\t%d\n" + "Groups:\t", + up->u_comm, + status, + pid, /* thread group id - same as pid */ + pid, + ppid, + 0, + crgetruid(cr), crgetuid(cr), crgetsuid(cr), crgetuid(cr), + crgetrgid(cr), crgetgid(cr), crgetsgid(cr), crgetgid(cr), + p->p_fno_ctl); + + ngroups = crgetngroups(cr); + groups = crgetgroups(cr); + for (i = 0; i < ngroups; i++) { + lxpr_uiobuf_printf(uiobuf, + "%u ", + groups[i]); + } + crfree(cr); + + as = p->p_as; + if ((p->p_stat != SZOMB) && !(p->p_flag & SSYS) && (as != &kas)) { + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "\n" + "VmSize:\t%8lu kB\n" + "VmLck:\t%8lu kB\n" + "VmRSS:\t%8lu kB\n" + "VmData:\t%8lu kB\n" + "VmStk:\t%8lu kB\n" + "VmExe:\t%8lu kB\n" + "VmLib:\t%8lu kB", + btok(vsize), + 0l, + ptok(rss), + 0l, + btok(p->p_stksize), + ptok(rss), + 0l); + } + + sigemptyset(¤t); + sigemptyset(&ignore); + sigemptyset(&handle); + + for (i = 1; i < NSIG; i++) { + lx_sig = lxpr_sigmap[i]; + + if ((lx_sig > 0) && (lx_sig <= LX_NSIG)) { + if (sigismember(&p->p_sig, i)) + sigaddset(¤t, lx_sig); + + if (up->u_signal[i - 1] == SIG_IGN) + sigaddset(&ignore, lx_sig); + else if (up->u_signal[i - 1] != SIG_DFL) + sigaddset(&handle, lx_sig); + } + } + + lxpr_uiobuf_printf(uiobuf, + "\n" + "SigPnd:\t%08x%08x\n" + "SigBlk:\t%08x%08x\n" + "SigIgn:\t%08x%08x\n" + "SigCgt:\t%08x%08x\n" + "CapInh:\t%016x\n" + "CapPrm:\t%016x\n" + "CapEff:\t%016x\n", + current.__sigbits[1], current.__sigbits[0], + 0, 0, /* signals blocked on per thread basis */ + ignore.__sigbits[1], ignore.__sigbits[0], + handle.__sigbits[1], handle.__sigbits[0], + /* Can't do anything with linux capabilities */ + 0, + 0, + 0); + + lxpr_unlock(p); +} + + +/* + * lxpr_read_pid_stat(): pid stat file + */ +static void +lxpr_read_pid_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + proc_t *p; + kthread_t *t; + struct as *as; + char stat; + pid_t pid, ppid, pgpid, spid; + gid_t psgid; + dev_t psdev; + size_t rss, vsize; + int nice, pri; + caddr_t wchan; + processorid_t cpu; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_STAT); + + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) { + lxpr_uiobuf_seterr(uiobuf, EINVAL); + return; + } + + pid = p->p_pid; + + /* + * Set Linux defaults if we're the zone's init process + */ + if (pid == curproc->p_zone->zone_proc_initpid) { + pid = 1; /* PID for init */ + ppid = 0; /* parent PID for init is 0 */ + pgpid = 0; /* process group for init is 0 */ + psgid = (gid_t)-1; /* credential GID for init is -1 */ + spid = 0; /* session id for init is 0 */ + psdev = 0; /* session device for init is 0 */ + } else { + /* + * Make sure not to reference parent PIDs that reside outside + * the zone + */ + ppid = ((p->p_flag & SZONETOP) ? 
+ curproc->p_zone->zone_zsched->p_pid : p->p_ppid); + + /* + * Convert ppid to the Linux default of 1 if our parent is the + * zone's init process + */ + if (ppid == curproc->p_zone->zone_proc_initpid) + ppid = 1; + + pgpid = p->p_pgrp; + + mutex_enter(&p->p_splock); + mutex_enter(&p->p_sessp->s_lock); + spid = p->p_sessp->s_sid; + psdev = p->p_sessp->s_dev; + if (p->p_sessp->s_cred) + psgid = crgetgid(p->p_sessp->s_cred); + else + psgid = crgetgid(p->p_cred); + + mutex_exit(&p->p_sessp->s_lock); + mutex_exit(&p->p_splock); + } + + t = prchoose(p); + if (t != NULL) { + switch (t->t_state) { + case TS_SLEEP: + stat = 'S'; break; + case TS_RUN: + case TS_ONPROC: + stat = 'R'; break; + case TS_ZOMB: + stat = 'Z'; break; + case TS_STOPPED: + stat = 'T'; break; + default: + stat = '!'; break; + } + + if (CL_DONICE(t, NULL, 0, &nice) != 0) + nice = 0; + + pri = t->t_pri; + wchan = t->t_wchan; + cpu = t->t_cpu->cpu_id; + thread_unlock(t); + } else { + /* Only zombies have no threads */ + stat = 'Z'; + nice = 0; + pri = 0; + wchan = 0; + cpu = 0; + } + as = p->p_as; + mutex_exit(&p->p_lock); + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + vsize = as->a_resvsize; + rss = rm_asrss(as); + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + + lxpr_uiobuf_printf(uiobuf, + "%d (%s) %c %d %d %d %d %d " + "%lu %lu %lu %lu %lu " + "%lu %lu %ld %ld " + "%d %d %d " + "%lu " + "%lu " + "%lu %ld %llu " + "%lu %lu %u " + "%lu %lu " + "%lu %lu %lu %lu " + "%lu " + "%lu %lu " + "%d " + "%d" + "\n", + pid, PTOU(p)->u_comm, stat, ppid, pgpid, spid, psdev, psgid, + 0l, 0l, 0l, 0l, 0l, /* flags, minflt, cminflt, majflt, cmajflt */ + p->p_utime, p->p_stime, p->p_cutime, p->p_cstime, + pri, nice, p->p_lwpcnt, + 0l, /* itrealvalue (time before next SIGALRM) */ + PTOU(p)->u_ticks, + vsize, rss, p->p_vmem_ctl, + 0l, 0l, USRSTACK, /* startcode, endcode, startstack */ + 0l, 0l, /* kstkesp, kstkeip */ + 0l, 0l, 0l, 0l, /* signal, blocked, sigignore, sigcatch */ + wchan, + 0l, 0l, /* nswap, cnswap */ + 0, /* exit_signal */ + cpu); + + lxpr_unlock(p); +} + +/* ARGSUSED */ +static void +lxpr_read_net_arp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, "Inter-| Receive " + " | Transmit\n"); + lxpr_uiobuf_printf(uiobuf, " face |bytes packets errs drop fifo" + " frame compressed multicast|bytes packets errs drop fifo" + " colls carrier compressed\n"); + + /* + * Data about each interface should go here, but that shouldn't be added + * unless there is an lxproc reader that actually makes use of it (and + * doesn't need anything else that we refuse to provide)... 
+ */ +} + +/* ARGSUSED */ +static void +lxpr_read_net_dev_mcast(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_igmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_ip_mr_vif(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_mcfilter(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_netstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_raw(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_route(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rpc(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_rt_cache(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_sockstat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_snmp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_tcp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_udp(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* ARGSUSED */ +static void +lxpr_read_net_unix(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ +} + +/* + * lxpr_read_kmsg(): read the contents of the kernel message queue. We + * translate this into the reception of console messages for this zone; each + * read copies out a single zone console message, or blocks until the next one + * is produced. + */ + +#define LX_KMSG_PRI "<0>" + +static void +lxpr_read_kmsg(lxpr_node_t *lxpnp, struct lxpr_uiobuf *uiobuf, ldi_handle_t lh) +{ + mblk_t *mp; + + ASSERT(lxpnp->lxpr_type == LXPR_KMSG); + + if (ldi_getmsg(lh, &mp, NULL) == 0) { + /* + * lxproc doesn't like successive reads to the same file + * descriptor unless we do an explicit rewind each time. + */ + lxpr_uiobuf_seek(uiobuf, 0); + + lxpr_uiobuf_printf(uiobuf, "%s%s", LX_KMSG_PRI, + mp->b_cont->b_rptr); + + freemsg(mp); + } +} + +/* + * lxpr_read_loadavg(): read the contents of the "loadavg" file. We do just + * enough for uptime and other simple lxproc readers to work + */ +extern int nthread; + +static void +lxpr_read_loadavg(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ulong_t avenrun1; + ulong_t avenrun5; + ulong_t avenrun15; + ulong_t avenrun1_cs; + ulong_t avenrun5_cs; + ulong_t avenrun15_cs; + int loadavg[3]; + int *loadbuf; + cpupart_t *cp; + zone_t *zone = LXPTOZ(lxpnp); + + uint_t nrunnable = 0; + rctl_qty_t nlwps; + + ASSERT(lxpnp->lxpr_type == LXPR_LOADAVG); + + mutex_enter(&cpu_lock); + + /* + * Need to add up values over all CPU partitions. If pools are active, + * only report the values of the zone's partition, which by definition + * includes the current CPU. 
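+	 *
+	 * The averages themselves are fixed-point values scaled by
+	 * FSCALE (1 << FSHIFT); the decoding below splits each into an
+	 * integer part and hundredths.  Assuming FSHIFT is 8, a raw
+	 * value of 300 decodes as 300 >> 8 = 1 and
+	 * ((300 & (FSCALE - 1)) * 100) >> 8 = 17, i.e. a load of 1.17.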
+ */ + if (pool_pset_enabled()) { + psetid_t psetid = zone_pset_get(curproc->p_zone); + + ASSERT(curproc->p_zone != &zone0); + cp = CPU->cpu_part; + + nrunnable = cp->cp_nrunning + cp->cp_nrunnable; + (void) cpupart_get_loadavg(psetid, &loadavg[0], 3); + loadbuf = &loadavg[0]; + } else { + cp = cp_list_head; + do { + nrunnable += cp->cp_nrunning + cp->cp_nrunnable; + } while ((cp = cp->cp_next) != cp_list_head); + + loadbuf = zone == global_zone ? + &avenrun[0] : zone->zone_avenrun; + } + + /* + * If we're in the non-global zone, we'll report the total number of + * LWPs in the zone for the "nproc" parameter of /proc/loadavg, + * otherwise will just use nthread (which will include kernel threads, + * but should be good enough for lxproc). + */ + nlwps = zone == global_zone ? nthread : zone->zone_nlwps; + + mutex_exit(&cpu_lock); + + avenrun1 = loadbuf[0] >> FSHIFT; + avenrun1_cs = ((loadbuf[0] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun5 = loadbuf[1] >> FSHIFT; + avenrun5_cs = ((loadbuf[1] & (FSCALE-1)) * 100) >> FSHIFT; + avenrun15 = loadbuf[2] >> FSHIFT; + avenrun15_cs = ((loadbuf[2] & (FSCALE-1)) * 100) >> FSHIFT; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d %ld.%02d %d/%d %d\n", + avenrun1, avenrun1_cs, + avenrun5, avenrun5_cs, + avenrun15, avenrun15_cs, + nrunnable, nlwps, 0); +} + +/* + * lxpr_read_meminfo(): read the contents of the "meminfo" file. + */ +static void +lxpr_read_meminfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + zone_t *zone = LXPTOZ(lxpnp); + int global = zone == global_zone; + long total_mem, free_mem, total_swap, used_swap; + + ASSERT(lxpnp->lxpr_type == LXPR_MEMINFO); + + if (global || zone->zone_phys_mem_ctl == UINT64_MAX) { + total_mem = physmem * PAGESIZE; + free_mem = freemem * PAGESIZE; + } else { + total_mem = zone->zone_phys_mem_ctl; + free_mem = zone->zone_phys_mem_ctl - zone->zone_phys_mem; + } + + if (global || zone->zone_max_swap_ctl == UINT64_MAX) { + total_swap = k_anoninfo.ani_max * PAGESIZE; + used_swap = k_anoninfo.ani_phys_resv * PAGESIZE; + } else { + mutex_enter(&zone->zone_mem_lock); + total_swap = zone->zone_max_swap_ctl; + used_swap = zone->zone_max_swap; + mutex_exit(&zone->zone_mem_lock); + } + + lxpr_uiobuf_printf(uiobuf, + " total: used: free: shared: buffers: cached:\n" + "Mem: %8lu %8lu %8lu %8u %8u %8u\n" + "Swap: %8lu %8lu %8lu\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "MemShared: %8u kB\n" + "Buffers: %8u kB\n" + "Cached: %8u kB\n" + "SwapCached:%8u kB\n" + "Active: %8u kB\n" + "Inactive: %8u kB\n" + "HighTotal: %8u kB\n" + "HighFree: %8u kB\n" + "LowTotal: %8u kB\n" + "LowFree: %8u kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n", + total_mem, total_mem - free_mem, free_mem, 0, 0, 0, + total_swap, used_swap, total_swap - used_swap, + btok(total_mem), /* MemTotal */ + btok(free_mem), /* MemFree */ + 0, /* MemShared */ + 0, /* Buffers */ + 0, /* Cached */ + 0, /* SwapCached */ + 0, /* Active */ + 0, /* Inactive */ + 0, /* HighTotal */ + 0, /* HighFree */ + btok(total_mem), /* LowTotal */ + btok(free_mem), /* LowFree */ + btok(total_swap), /* SwapTotal */ + btok(total_swap - used_swap)); /* SwapFree */ +} + +/* + * lxpr_read_mounts(): + */ +/* ARGSUSED */ +static void +lxpr_read_mounts(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + struct vfs *vfsp; + struct vfs *vfslist; + zone_t *zone = LXPTOZ(lxpnp); + struct print_data { + refstr_t *vfs_mntpt; + refstr_t *vfs_resource; + uint_t vfs_flag; + int vfs_fstype; + struct print_data *next; + } *print_head = NULL; + struct print_data **print_tail = &print_head; + 
struct print_data *printp; + + vfs_list_read_lock(); + + if (zone == global_zone) { + vfsp = vfslist = rootvfs; + } else { + vfsp = vfslist = zone->zone_vfslist; + /* + * If the zone has a root entry, it will be the first in + * the list. If it doesn't, we conjure one up. + */ + if (vfslist == NULL || strcmp(refstr_value(vfsp->vfs_mntpt), + zone->zone_rootpath) != 0) { + struct vfs *tvfsp; + /* + * The root of the zone is not a mount point. The vfs + * we want to report is that of the zone's root vnode. + */ + tvfsp = zone->zone_rootvp->v_vfsp; + + lxpr_uiobuf_printf(uiobuf, + "/ / %s %s 0 0\n", + vfssw[tvfsp->vfs_fstype].vsw_name, + tvfsp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + + } + if (vfslist == NULL) { + vfs_list_unlock(); + return; + } + } + + /* + * Later on we have to do a lookupname, which can end up causing + * another vfs_list_read_lock() to be called. Which can lead to a + * deadlock. To avoid this, we extract the data we need into a local + * list, then we can run this list without holding vfs_list_read_lock() + * We keep the list in the same order as the vfs_list + */ + do { + /* Skip mounts we shouldn't show */ + if (vfsp->vfs_flag & VFS_NOMNTTAB) { + goto nextfs; + } + + printp = kmem_alloc(sizeof (*printp), KM_SLEEP); + refstr_hold(vfsp->vfs_mntpt); + printp->vfs_mntpt = vfsp->vfs_mntpt; + refstr_hold(vfsp->vfs_resource); + printp->vfs_resource = vfsp->vfs_resource; + printp->vfs_flag = vfsp->vfs_flag; + printp->vfs_fstype = vfsp->vfs_fstype; + printp->next = NULL; + + *print_tail = printp; + print_tail = &printp->next; + +nextfs: + vfsp = (zone == global_zone) ? + vfsp->vfs_next : vfsp->vfs_zone_next; + + } while (vfsp != vfslist); + + vfs_list_unlock(); + + /* + * now we can run through what we've extracted without holding + * vfs_list_read_lock() + */ + printp = print_head; + while (printp != NULL) { + struct print_data *printp_next; + const char *resource; + char *mntpt; + struct vnode *vp; + int error; + + mntpt = (char *)refstr_value(printp->vfs_mntpt); + resource = refstr_value(printp->vfs_resource); + + if (mntpt != NULL && mntpt[0] != '\0') + mntpt = ZONE_PATH_TRANSLATE(mntpt, zone); + else + mntpt = "-"; + + error = lookupname(mntpt, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); + + if (error != 0) + goto nextp; + + if (!(vp->v_flag & VROOT)) { + VN_RELE(vp); + goto nextp; + } + VN_RELE(vp); + + if (resource != NULL && resource[0] != '\0') { + if (resource[0] == '/') { + resource = ZONE_PATH_VISIBLE(resource, zone) ? + ZONE_PATH_TRANSLATE(resource, zone) : + mntpt; + } + } else { + resource = "-"; + } + + lxpr_uiobuf_printf(uiobuf, + "%s %s %s %s 0 0\n", + resource, mntpt, vfssw[printp->vfs_fstype].vsw_name, + printp->vfs_flag & VFS_RDONLY ? "ro" : "rw"); + +nextp: + printp_next = printp->next; + refstr_rele(printp->vfs_mntpt); + refstr_rele(printp->vfs_resource); + kmem_free(printp, sizeof (*printp)); + printp = printp_next; + + } +} + +/* + * lxpr_read_partitions(): + * + * We don't support partitions in a local zone because it requires access to + * physical devices. But we need to fake up enough of the file to show that we + * have no partitions. + */ +/* ARGSUSED */ +static void +lxpr_read_partitions(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "major minor #blocks name rio rmerge rsect ruse " + "wio wmerge wsect wuse running use aveq\n\n"); +} + +/* + * lxpr_read_version(): read the contents of the "version" file. Note that + * we don't lie here -- we don't pretend that we're Linux. 
If lxproc is to + * be used in a Linux-branded zone, there will need to be a mount option to + * indicate that Linux should be more fully mimicked. + */ +/* ARGSUSED */ +static void +lxpr_read_version(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + lxpr_uiobuf_printf(uiobuf, + "%s version %s (%s version %d.%d.%d) " + "#%s SMP %s\n", + utsname.sysname, utsname.release, +#if defined(__GNUC__) + "gcc", + __GNUC__, + __GNUC_MINOR__, + __GNUC_PATCHLEVEL__, +#else + "Sun C", + __SUNPRO_C / 0x100, + (__SUNPRO_C & 0xff) / 0x10, + __SUNPRO_C & 0xf, +#endif + utsname.version, + "00:00:00 00/00/00"); +} + +/* + * lxpr_read_stat(): read the contents of the "stat" file. + * + */ +/* ARGSUSED */ +static void +lxpr_read_stat(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t sys_cum = 0; + ulong_t user_cum = 0; + ulong_t irq_cum = 0; + ulong_t cpu_nrunnable_cum = 0; + ulong_t w_io_cum = 0; + + ulong_t pgpgin_cum = 0; + ulong_t pgpgout_cum = 0; + ulong_t pgswapout_cum = 0; + ulong_t pgswapin_cum = 0; + ulong_t intr_cum = 0; + ulong_t pswitch_cum = 0; + ulong_t forks_cum = 0; + hrtime_t msnsecs[NCMSTATES]; + + /* temporary variable since scalehrtime modifies data in place */ + hrtime_t tmptime; + + ASSERT(lxpnp->lxpr_type == LXPR_STAT); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + /* Calculate cumulative stats */ + cp = cpstart = CPU->cpu_part->cp_cpulist; + do { + int i; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_cum += NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_cum += NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_cum += NSEC_TO_TICK(msnsecs[CMS_USER]); + + pgpgin_cum += CPU_STATS(cp, vm.pgpgin); + pgpgout_cum += CPU_STATS(cp, vm.pgpgout); + pgswapin_cum += CPU_STATS(cp, vm.pgswapin); + pgswapout_cum += CPU_STATS(cp, vm.pgswapout); + + cpu_nrunnable_cum += cp->cpu_disp->disp_nrunnable; + w_io_cum += CPU_STATS(cp, sys.iowait); + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_cum += NSEC_TO_TICK(tmptime); + } + + for (i = 0; i < PIL_MAX; i++) + intr_cum += CPU_STATS(cp, sys.intr[i]); + + pswitch_cum += CPU_STATS(cp, sys.pswitch); + forks_cum += CPU_STATS(cp, sys.sysfork); + forks_cum += CPU_STATS(cp, sys.sysvfork); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + lxpr_uiobuf_printf(uiobuf, "cpu %lu %lu %lu %lu %lu %lu %lu\n", + user_cum, 0L, sys_cum, idle_cum, 0L, irq_cum, 0L); + + /* Do per processor stats */ + do { + int i; + + ulong_t idle_ticks; + ulong_t sys_ticks; + ulong_t user_ticks; + ulong_t irq_ticks = 0; + + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. 
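+		 *
+		 * Each cpuN line below carries the same columns as the
+		 * summary line: user, nice, system, idle, iowait, irq
+		 * and softirq ticks, with the fields we cannot supply
+		 * fixed at zero.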
+ */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + get_cpu_mstate(cp, msnsecs); + + idle_ticks = NSEC_TO_TICK(msnsecs[CMS_IDLE]); + sys_ticks = NSEC_TO_TICK(msnsecs[CMS_SYSTEM]); + user_ticks = NSEC_TO_TICK(msnsecs[CMS_USER]); + + for (i = 0; i < NCMSTATES; i++) { + tmptime = cp->cpu_intracct[i]; + scalehrtime(&tmptime); + irq_ticks += NSEC_TO_TICK(tmptime); + } + + lxpr_uiobuf_printf(uiobuf, + "cpu%d %lu %lu %lu %lu %lu %lu %lu\n", + cp->cpu_id, user_ticks, 0L, sys_ticks, idle_ticks, + 0L, irq_ticks, 0L); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); + + lxpr_uiobuf_printf(uiobuf, + "page %lu %lu\n" + "swap %lu %lu\n" + "intr %lu\n" + "ctxt %lu\n" + "btime %lu\n" + "processes %lu\n" + "procs_running %lu\n" + "procs_blocked %lu\n", + pgpgin_cum, pgpgout_cum, + pgswapin_cum, pgswapout_cum, + intr_cum, + pswitch_cum, + boot_time, + forks_cum, + cpu_nrunnable_cum, + w_io_cum); +} + +/* + * lxpr_read_uptime(): read the contents of the "uptime" file. + * + * format is: "%.2lf, %.2lf",uptime_secs, idle_secs + * Use fixed point arithmetic to get 2 decimal places + */ +/* ARGSUSED */ +static void +lxpr_read_uptime(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + cpu_t *cp, *cpstart; + int pools_enabled; + ulong_t idle_cum = 0; + ulong_t cpu_count = 0; + ulong_t idle_s; + ulong_t idle_cs; + ulong_t up_s; + ulong_t up_cs; + hrtime_t birthtime; + hrtime_t centi_sec = 10000000; /* 10^7 */ + + ASSERT(lxpnp->lxpr_type == LXPR_UPTIME); + + /* Calculate cumulative stats */ + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * Don't count CPUs that aren't even in the system + * or aren't up yet. + */ + if ((cp->cpu_flags & CPU_EXISTS) == 0) { + continue; + } + + idle_cum += CPU_STATS(cp, sys.cpu_ticks_idle); + idle_cum += CPU_STATS(cp, sys.cpu_ticks_wait); + cpu_count += 1; + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + mutex_exit(&cpu_lock); + + /* Getting the Zone zsched process startup time */ + birthtime = LXPTOZ(lxpnp)->zone_zsched->p_mstart; + up_cs = (gethrtime() - birthtime) / centi_sec; + up_s = up_cs / 100; + up_cs %= 100; + + ASSERT(cpu_count > 0); + idle_cum /= cpu_count; + idle_s = idle_cum / hz; + idle_cs = idle_cum % hz; + idle_cs *= 100; + idle_cs /= hz; + + lxpr_uiobuf_printf(uiobuf, + "%ld.%02d %ld.%02d\n", up_s, up_cs, idle_s, idle_cs); +} + +static const char *amd_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", + "nx", NULL, "mmxext", NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", "3dnowext", "3dnow" +}; + +static const char *amd_x_ecx[] = { + "lahf_lm", NULL, "svm", NULL, + "altmovcr8" +}; + +static const char *tm_x_edx[] = { + "recovery", "longrun", NULL, "lrti" +}; + +/* + * Intel calls no-execute "xd" in its docs, but Linux still reports it as "nx." 
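+ *
+ * In these tables a NULL entry marks a feature bit for which Linux
+ * prints no flag name; the flag-printing loops in lxpr_read_cpuinfo()
+ * skip such bits.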
+ */ +static const char *intc_x_edx[] = { + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + "nx", NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, "lm", NULL, NULL +}; + +static const char *intc_edx[] = { + "fpu", "vme", "de", "pse", + "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", + "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", + NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", + "ht", "tm", "ia64", "pbe" +}; + +/* + * "sse3" on linux is called "pni" (Prescott New Instructions). + */ +static const char *intc_ecx[] = { + "pni", NULL, NULL, "monitor", + "ds_cpl", NULL, NULL, "est", + "tm2", NULL, "cid", NULL, + NULL, "cx16", "xtpr" +}; + +static void +lxpr_read_cpuinfo(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + int i; + uint32_t bits; + cpu_t *cp, *cpstart; + int pools_enabled; + const char **fp; + char brandstr[CPU_IDSTRLEN]; + struct cpuid_regs cpr; + int maxeax; + int std_ecx, std_edx, ext_ecx, ext_edx; + + ASSERT(lxpnp->lxpr_type == LXPR_CPUINFO); + + mutex_enter(&cpu_lock); + pools_enabled = pool_pset_enabled(); + + cp = cpstart = CPU; + do { + /* + * This returns the maximum eax value for standard cpuid + * functions in eax. + */ + cpr.cp_eax = 0; + (void) cpuid_insn(cp, &cpr); + maxeax = cpr.cp_eax; + + /* + * Get standard x86 feature flags. + */ + cpr.cp_eax = 1; + (void) cpuid_insn(cp, &cpr); + std_ecx = cpr.cp_ecx; + std_edx = cpr.cp_edx; + + /* + * Now get extended feature flags. + */ + cpr.cp_eax = 0x80000001; + (void) cpuid_insn(cp, &cpr); + ext_ecx = cpr.cp_ecx; + ext_edx = cpr.cp_edx; + + (void) cpuid_getbrandstr(cp, brandstr, CPU_IDSTRLEN); + + lxpr_uiobuf_printf(uiobuf, + "processor\t: %d\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n" + "stepping\t: %d\n" + "cpu MHz\t\t: %u.%03u\n", + cp->cpu_id, cpuid_getvendorstr(cp), cpuid_getfamily(cp), + cpuid_getmodel(cp), brandstr, cpuid_getstep(cp), + (uint32_t)(cpu_freq_hz / 1000000), + ((uint32_t)(cpu_freq_hz / 1000)) % 1000); + + lxpr_uiobuf_printf(uiobuf, "cache size\t: %u KB\n", + getl2cacheinfo(cp, NULL, NULL, NULL) / 1024); + + if (is_x86_feature(x86_featureset, X86FSET_HTT)) { + /* + * 'siblings' is used for HT-style threads + */ + lxpr_uiobuf_printf(uiobuf, + "physical id\t: %lu\n" + "siblings\t: %u\n", + pg_plat_hw_instance_id(cp, PGHW_CHIP), + cpuid_get_ncpu_per_chip(cp)); + } + + /* + * Since we're relatively picky about running on older hardware, + * we can be somewhat cavalier about the answers to these ones. + * + * In fact, given the hardware we support, we just say: + * + * fdiv_bug : no (if we're on a 64-bit kernel) + * hlt_bug : no + * f00f_bug : no + * coma_bug : no + * wp : yes (write protect in supervsr mode) + */ + lxpr_uiobuf_printf(uiobuf, + "fdiv_bug\t: %s\n" + "hlt_bug \t: no\n" + "f00f_bug\t: no\n" + "coma_bug\t: no\n" + "fpu\t\t: %s\n" + "fpu_exception\t: %s\n" + "cpuid level\t: %d\n" + "flags\t\t:", +#if defined(__i386) + fpu_pentium_fdivbug ? "yes" : "no", +#else + "no", +#endif /* __i386 */ + fpu_exists ? "yes" : "no", fpu_exists ? 
"yes" : "no", + maxeax); + + for (bits = std_edx, fp = intc_edx, i = 0; + i < sizeof (intc_edx) / sizeof (intc_edx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + /* + * name additional features where appropriate + */ + switch (x86_vendor) { + case X86_VENDOR_Intel: + for (bits = ext_edx, fp = intc_x_edx, i = 0; + i < sizeof (intc_x_edx) / sizeof (intc_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_AMD: + for (bits = ext_edx, fp = amd_x_edx, i = 0; + i < sizeof (amd_x_edx) / sizeof (amd_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + for (bits = ext_ecx, fp = amd_x_ecx, i = 0; + i < sizeof (amd_x_ecx) / sizeof (amd_x_ecx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + + case X86_VENDOR_TM: + for (bits = ext_edx, fp = tm_x_edx, i = 0; + i < sizeof (tm_x_edx) / sizeof (tm_x_edx[0]); + fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + break; + default: + break; + } + + for (bits = std_ecx, fp = intc_ecx, i = 0; + i < sizeof (intc_ecx) / sizeof (intc_ecx[0]); fp++, i++) + if ((bits & (1 << i)) != 0 && *fp) + lxpr_uiobuf_printf(uiobuf, " %s", *fp); + + lxpr_uiobuf_printf(uiobuf, "\n\n"); + + if (pools_enabled) + cp = cp->cpu_next_part; + else + cp = cp->cpu_next; + } while (cp != cpstart); + + mutex_exit(&cpu_lock); +} + +/* ARGSUSED */ +static void +lxpr_read_fd(lxpr_node_t *lxpnp, lxpr_uiobuf_t *uiobuf) +{ + ASSERT(lxpnp->lxpr_type == LXPR_PID_FD_FD); + lxpr_uiobuf_seterr(uiobuf, EFAULT); +} + +/* + * lxpr_getattr(): Vnode operation for VOP_GETATTR() + */ +static int +lxpr_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + register lxpr_node_t *lxpnp = VTOLXP(vp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + extern uint_t nproc; + int error; + + /* + * Return attributes of underlying vnode if ATTR_REAL + * + * but keep fd files with the symlink permissions + */ + if (lxpnp->lxpr_realvp != NULL && (flags & ATTR_REAL)) { + vnode_t *rvp = lxpnp->lxpr_realvp; + + /* + * withold attribute information to owner or root + */ + if ((error = VOP_ACCESS(rvp, 0, 0, cr, ct)) != 0) { + return (error); + } + + /* + * now its attributes + */ + if ((error = VOP_GETATTR(rvp, vap, flags, cr, ct)) != 0) { + return (error); + } + + /* + * if it's a file in lx /proc/pid/fd/xx then set its + * mode and keep it looking like a symlink + */ + if (type == LXPR_PID_FD_FD) { + vap->va_mode = lxpnp->lxpr_mode; + vap->va_type = vp->v_type; + vap->va_size = 0; + vap->va_nlink = 1; + } + return (0); + } + + /* Default attributes, that may be overridden below */ + bzero(vap, sizeof (*vap)); + vap->va_atime = vap->va_mtime = vap->va_ctime = lxpnp->lxpr_time; + vap->va_nlink = 1; + vap->va_type = vp->v_type; + vap->va_mode = lxpnp->lxpr_mode; + vap->va_fsid = vp->v_vfsp->vfs_dev; + vap->va_blksize = DEV_BSIZE; + vap->va_uid = lxpnp->lxpr_uid; + vap->va_gid = lxpnp->lxpr_gid; + vap->va_nodeid = lxpnp->lxpr_ino; + + switch (type) { + case LXPR_PROCDIR: + vap->va_nlink = nproc + 2 + PROCDIRFILES; + vap->va_size = (nproc + 2 + PROCDIRFILES) * LXPR_SDSIZE; + break; + case LXPR_PIDDIR: + vap->va_nlink = PIDDIRFILES; + vap->va_size = PIDDIRFILES * LXPR_SDSIZE; + break; + case LXPR_SELF: + vap->va_uid = crgetruid(curproc->p_cred); + vap->va_gid = crgetrgid(curproc->p_cred); + break; + default: + break; + } + 
+ vap->va_nblocks = (fsblkcnt64_t)btod(vap->va_size); + return (0); +} + +/* + * lxpr_access(): Vnode operation for VOP_ACCESS() + */ +static int +lxpr_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) +{ + lxpr_node_t *lxpnp = VTOLXP(vp); + int shift = 0; + proc_t *tp; + + /* lx /proc is a read only file system */ + if (mode & VWRITE) + return (EROFS); + + /* + * If this is a restricted file, check access permissions. + */ + switch (lxpnp->lxpr_type) { + case LXPR_PIDDIR: + return (0); + case LXPR_PID_CURDIR: + case LXPR_PID_ENV: + case LXPR_PID_EXE: + case LXPR_PID_MAPS: + case LXPR_PID_MEM: + case LXPR_PID_ROOTDIR: + case LXPR_PID_FDDIR: + case LXPR_PID_FD_FD: + if ((tp = lxpr_lock(lxpnp->lxpr_pid)) == NULL) + return (ENOENT); + if (tp != curproc && secpolicy_proc_access(cr) != 0 && + priv_proc_cred_perm(cr, tp, NULL, mode) != 0) { + lxpr_unlock(tp); + return (EACCES); + } + lxpr_unlock(tp); + default: + break; + } + + if (lxpnp->lxpr_realvp != NULL) { + /* + * For these we use the underlying vnode's accessibility. + */ + return (VOP_ACCESS(lxpnp->lxpr_realvp, mode, flags, cr, ct)); + } + + /* If user is root allow access regardless of permission bits */ + if (secpolicy_proc_access(cr) == 0) + return (0); + + /* + * Access check is based on only one of owner, group, public. If not + * owner, then check group. If not a member of the group, then check + * public access. + */ + if (crgetuid(cr) != lxpnp->lxpr_uid) { + shift += 3; + if (!groupmember((uid_t)lxpnp->lxpr_gid, cr)) + shift += 3; + } + + mode &= ~(lxpnp->lxpr_mode << shift); + + if (mode == 0) + return (0); + + return (EACCES); +} + +/* ARGSUSED */ +static vnode_t * +lxpr_lookup_not_a_dir(vnode_t *dp, char *comp) +{ + return (NULL); +} + +/* + * lxpr_lookup(): Vnode operation for VOP_LOOKUP() + */ +/* ARGSUSED */ +static int +lxpr_lookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pathp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the lookup + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict lookup permission to owner or root + */ + if ((error = lxpr_access(dp, VEXEC, 0, cr, ct)) != 0) { + return (error); + } + + /* + * Just return the parent vnode if that's where we are trying to go. + */ + if (strcmp(comp, "..") == 0) { + VN_HOLD(lxpnp->lxpr_parent); + *vpp = lxpnp->lxpr_parent; + return (0); + } + + /* + * Special handling for directory searches. Note: null component name + * denotes that the current directory is being searched. + */ + if ((dp->v_type == VDIR) && (*comp == '\0' || strcmp(comp, ".") == 0)) { + VN_HOLD(dp); + *vpp = dp; + return (0); + } + + *vpp = (lxpr_lookup_function[type](dp, comp)); + return ((*vpp == NULL) ? 
ENOENT : 0); +} + +/* + * Do a sequential search on the given directory table + */ +static vnode_t * +lxpr_lookup_common(vnode_t *dp, char *comp, proc_t *p, + lxpr_dirent_t *dirtab, int dirtablen) +{ + lxpr_node_t *lxpnp; + int count; + + for (count = 0; count < dirtablen; count++) { + if (strcmp(dirtab[count].d_name, comp) == 0) { + lxpnp = lxpr_getnode(dp, dirtab[count].d_type, p, 0); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + return (dp); + } + } + return (NULL); +} + +static vnode_t * +lxpr_lookup_piddir(vnode_t *dp, char *comp) +{ + proc_t *p; + + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PIDDIR); + + p = lxpr_lock(VTOLXP(dp)->lxpr_pid); + if (p == NULL) + return (NULL); + + dp = lxpr_lookup_common(dp, comp, p, piddir, PIDDIRFILES); + + lxpr_unlock(p); + + return (dp); +} + +/* + * Lookup one of the process's open files. + */ +static vnode_t * +lxpr_lookup_fddir(vnode_t *dp, char *comp) +{ + lxpr_node_t *dlxpnp = VTOLXP(dp); + lxpr_node_t *lxpnp; + vnode_t *vp = NULL; + proc_t *p; + file_t *fp; + uint_t fd; + int c; + uf_entry_t *ufp; + uf_info_t *fip; + + ASSERT(dlxpnp->lxpr_type == LXPR_PID_FDDIR); + + /* + * convert the string rendition of the filename + * to a file descriptor + */ + fd = 0; + while ((c = *comp++) != '\0') { + int ofd; + if (c < '0' || c > '9') + return (NULL); + + ofd = fd; + fd = 10*fd + c - '0'; + /* integer overflow */ + if (fd / 10 != ofd) + return (NULL); + } + + /* + * get the proc to work with and lock it + */ + p = lxpr_lock(dlxpnp->lxpr_pid); + if ((p == NULL)) + return (NULL); + + /* + * If the process is a zombie or system process + * it can't have any open files. + */ + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) { + lxpr_unlock(p); + return (NULL); + } + + /* + * get us a fresh node/vnode + */ + lxpnp = lxpr_getnode(dp, LXPR_PID_FD_FD, p, fd); + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we dereference into fi_list. + */ + mutex_exit(&p->p_lock); + + /* + * get open file info + */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fd < fip->fi_nfiles) { + UF_ENTER(ufp, fip, fd); + /* + * ensure the fd is still kosher. + * it may have gone between the readdir and + * the lookup + */ + if (fip->fi_list[fd].uf_file == NULL) { + mutex_exit(&fip->fi_lock); + UF_EXIT(ufp); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } + + if ((fp = ufp->uf_file) != NULL) + vp = fp->f_vnode; + UF_EXIT(ufp); + } + mutex_exit(&fip->fi_lock); + + if (vp == NULL) { + mutex_enter(&p->p_lock); + lxpr_unlock(p); + lxpr_freenode(lxpnp); + return (NULL); + } else { + /* + * Fill in the lxpr_node so future references will be able to + * find the underlying vnode. The vnode is held on the realvp. + */ + lxpnp->lxpr_realvp = vp; + VN_HOLD(lxpnp->lxpr_realvp); + } + + mutex_enter(&p->p_lock); + lxpr_unlock(p); + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); +} + +static vnode_t * +lxpr_lookup_netdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_NETDIR); + + dp = lxpr_lookup_common(dp, comp, NULL, netdir, NETDIRFILES); + + return (dp); +} + +static vnode_t * +lxpr_lookup_procdir(vnode_t *dp, char *comp) +{ + ASSERT(VTOLXP(dp)->lxpr_type == LXPR_PROCDIR); + + /* + * We know all the names of files & dirs in our file system structure + * except those that are pid names. These change as pids are created/ + * deleted etc., so we just look for a number as the first char to see + * if we are we doing pid lookups. 
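+ * (Any name beginning with a digit is treated as a pid; anything
+ * else falls through to the fixed-name table handled at the bottom.)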
+ * + * Don't need to check for "self" as it is implemented as a symlink + */ + if (*comp >= '0' && *comp <= '9') { + pid_t pid = 0; + lxpr_node_t *lxpnp = NULL; + proc_t *p; + int c; + + while ((c = *comp++) != '\0') + pid = 10 * pid + c - '0'; + + /* + * Can't continue if the process is still loading or it doesn't + * really exist yet (or maybe it just died!) + */ + p = lxpr_lock(pid); + if (p == NULL) + return (NULL); + + if (secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + lxpr_unlock(p); + return (NULL); + } + + /* + * allocate and fill in a new lxpr node + */ + lxpnp = lxpr_getnode(dp, LXPR_PIDDIR, p, 0); + + lxpr_unlock(p); + + dp = LXPTOV(lxpnp); + ASSERT(dp != NULL); + + return (dp); + } + + /* Lookup fixed names */ + return (lxpr_lookup_common(dp, comp, NULL, lxpr_dir, PROCDIRFILES)); +} + +/* + * lxpr_readdir(): Vnode operation for VOP_READDIR() + */ +/* ARGSUSED */ +static int +lxpr_readdir(vnode_t *dp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + lxpr_node_t *lxpnp = VTOLXP(dp); + lxpr_nodetype_t type = lxpnp->lxpr_type; + ssize_t uresid; + off_t uoffset; + int error; + + ASSERT(dp->v_type == VDIR); + ASSERT(type < LXPR_NFILES); + + /* + * we should never get here because the readdir + * is done on the realvp for these nodes + */ + ASSERT(type != LXPR_PID_FD_FD && + type != LXPR_PID_CURDIR && + type != LXPR_PID_ROOTDIR); + + /* + * restrict readdir permission to owner or root + */ + if ((error = lxpr_access(dp, VREAD, 0, cr, ct)) != 0) + return (error); + + uoffset = uiop->uio_offset; + uresid = uiop->uio_resid; + + /* can't do negative reads */ + if (uoffset < 0 || uresid <= 0) + return (EINVAL); + + /* can't read directory entries that don't exist! */ + if (uoffset % LXPR_SDSIZE) + return (ENOENT); + + return (lxpr_readdir_function[lxpnp->lxpr_type](lxpnp, uiop, eofp)); +} + +/* ARGSUSED */ +static int +lxpr_readdir_not_a_dir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + return (ENOTDIR); +} + +/* + * This has the common logic for returning directory entries + */ +static int +lxpr_readdir_common(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp, + lxpr_dirent_t *dirtab, int dirtablen) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + + oresid = uiop->uio_resid; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Satisfy user request + */ + while ((uresid = uiop->uio_resid) > 0) { + int dirindex; + off_t uoffset; + int reclen; + int error; + + uoffset = uiop->uio_offset; + dirindex = (uoffset / LXPR_SDSIZE) - 2; + + if (uoffset == 0) { + + dirent->d_ino = lxpnp->lxpr_ino; + dirent->d_name[0] = '.'; + dirent->d_name[1] = '\0'; + reclen = DIRENT64_RECLEN(1); + + } else if (uoffset == LXPR_SDSIZE) { + + dirent->d_ino = lxpr_parentinode(lxpnp); + dirent->d_name[0] = '.'; + dirent->d_name[1] = '.'; + dirent->d_name[2] = '\0'; + reclen = DIRENT64_RECLEN(2); + + } else if (dirindex >= 0 && dirindex < dirtablen) { + int slen = strlen(dirtab[dirindex].d_name); + + dirent->d_ino = lxpr_inode(dirtab[dirindex].d_type, + lxpnp->lxpr_pid, 0); + + VERIFY(slen < LXPNSIZ); + (void) strcpy(dirent->d_name, dirtab[dirindex].d_name); + reclen = DIRENT64_RECLEN(slen); + + } else { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the 
size of the data to transfer is greater + * than that requested then we can't do it in this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) { + return (EINVAL); + } + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); + + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + /* Have run out of space, but could have just done last table entry */ + if (eofp) { + *eofp = + (uiop->uio_offset >= ((dirtablen+2) * LXPR_SDSIZE)) ? 1 : 0; + } + return (0); +} + + +static int +lxpr_readdir_procdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + zoneid_t zoneid; + pid_t pid; + int error; + int ceof; + + ASSERT(lxpnp->lxpr_type == LXPR_PROCDIR); + + oresid = uiop->uio_resid; + zoneid = LXPTOZ(lxpnp)->zone_id; + + /* + * We return directory entries in the order: "." and ".." then the + * unique lxproc files, then the directories corresponding to the + * running processes. We have defined this as the ordering because + * it allows us to more easily keep track of where we are between calls + * to getdents(). If the number of processes changes between calls + * then we can't lose track of where we are in the lxproc files. + */ + + /* Do the fixed entries */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, lxpr_dir, + PROCDIRFILES); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + return (error); + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* Do the process entries */ + while ((uresid = uiop->uio_resid) > 0) { + proc_t *p; + int len; + int reclen; + int i; + + uoffset = uiop->uio_offset; + + /* + * Stop when entire proc table has been examined. + */ + i = (uoffset / LXPR_SDSIZE) - 2 - PROCDIRFILES; + if (i < 0 || i >= v.v_proc) { + /* Run out of table entries */ + if (eofp) { + *eofp = 1; + } + return (0); + } + mutex_enter(&pidlock); + + /* + * Skip indices for which there is no pid_entry, PIDs for + * which there is no corresponding process, a PID of 0, + * and anything the security policy doesn't allow + * us to look at. + */ + if ((p = pid_entry(i)) == NULL || p->p_stat == SIDL || + p->p_pid == 0 || + secpolicy_basic_procinfo(CRED(), p, curproc) != 0) { + mutex_exit(&pidlock); + goto next; + } + mutex_exit(&pidlock); + + /* + * Convert pid to the Linux default of 1 if we're the zone's + * init process, otherwise use the value from the proc + * structure + */ + pid = ((p->p_pid != curproc->p_zone->zone_proc_initpid) ? + p->p_pid : 1); + + /* + * If this /proc was mounted in the global zone, view + * all procs; otherwise, only view zone member procs.
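+ * For example, with lxproc mounted in a non-global zone, a global-zone + * pid is skipped here and never appears in the listing; its directory + * slot is still consumed via the "next" label below.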
+ */ + if (zoneid != GLOBAL_ZONEID && p->p_zone->zone_id != zoneid) { + goto next; + } + + ASSERT(p->p_stat != 0); + + dirent->d_ino = lxpr_inode(LXPR_PIDDIR, pid, 0); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", pid); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + /* + * if the size of the data to transfer is greater + * than that requested then we can't do it in this transfer. + */ + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + return (EINVAL); + break; + } + + /* + * uiomove() updates both uiop->uio_resid and uiop->uio_offset + * by the same amount. But we want uiop->uio_offset to change + * in increments of LXPR_SDSIZE, which is different from the + * number of bytes being returned to the user. So we set + * uiop->uio_offset separately, in the increment step of this for + * loop, ignoring what uiomove() does. + */ + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + return (error); +next: + uiop->uio_offset = uoffset + LXPR_SDSIZE; + } + + if (eofp != NULL) { + *eofp = (uiop->uio_offset >= + ((v.v_proc + PROCDIRFILES + 2) * LXPR_SDSIZE)) ? 1 : 0; + } + + return (0); +} + +static int +lxpr_readdir_piddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + proc_t *p; + + ASSERT(lxpnp->lxpr_type == LXPR_PIDDIR); + + /* can't read its contents if it died */ + mutex_enter(&pidlock); + + p = prfind((lxpnp->lxpr_pid == 1) ? + curproc->p_zone->zone_proc_initpid : lxpnp->lxpr_pid); + + if (p == NULL || p->p_stat == SIDL) { + mutex_exit(&pidlock); + return (ENOENT); + } + mutex_exit(&pidlock); + + return (lxpr_readdir_common(lxpnp, uiop, eofp, piddir, PIDDIRFILES)); +} + +static int +lxpr_readdir_netdir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + ASSERT(lxpnp->lxpr_type == LXPR_NETDIR); + return (lxpr_readdir_common(lxpnp, uiop, eofp, netdir, NETDIRFILES)); +} + +static int +lxpr_readdir_fddir(lxpr_node_t *lxpnp, uio_t *uiop, int *eofp) +{ + /* bp holds one dirent64 structure */ + longlong_t bp[DIRENT64_RECLEN(LXPNSIZ) / sizeof (longlong_t)]; + dirent64_t *dirent = (dirent64_t *)bp; + ssize_t oresid; /* save a copy for testing later */ + ssize_t uresid; + off_t uoffset; + int error; + int ceof; + proc_t *p; + int fddirsize = -1; + uf_info_t *fip; + + ASSERT(lxpnp->lxpr_type == LXPR_PID_FDDIR); + + oresid = uiop->uio_resid; + + /* can't read its contents if it died */ + p = lxpr_lock(lxpnp->lxpr_pid); + if (p == NULL) + return (ENOENT); + + if ((p->p_stat == SZOMB) || (p->p_flag & SSYS) || (p->p_as == &kas)) + fddirsize = 0; + + /* + * Drop p_lock, but keep the process P_PR_LOCK'd to prevent it from + * going away while we iterate over its fi_list. + */ + mutex_exit(&p->p_lock); + + /* Get open file info */ + fip = (&(p)->p_user.u_finfo); + mutex_enter(&fip->fi_lock); + + if (fddirsize == -1) + fddirsize = fip->fi_nfiles; + + /* Do the fixed entries (in this case just "." & "..") */ + error = lxpr_readdir_common(lxpnp, uiop, &ceof, 0, 0); + + /* Finished if we got an error or if we couldn't do all the table */ + if (error != 0 || ceof == 0) + goto out; + + /* clear out the dirent buffer */ + bzero(bp, sizeof (bp)); + + /* + * Loop until the user's request is satisfied or until + * all file descriptors have been examined.
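+ * As in the other readdir routines, each directory slot is LXPR_SDSIZE + * bytes of offset; e.g. (illustrative numbers) offset 48 is slot 3, + * which after "." and ".." maps to fd 1 in the computation below.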
+ */ + for (; (uresid = uiop->uio_resid) > 0; + uiop->uio_offset = uoffset + LXPR_SDSIZE) { + int reclen; + int fd; + int len; + + uoffset = uiop->uio_offset; + + /* + * Stop at the end of the fd list + */ + fd = (uoffset / LXPR_SDSIZE) - 2; + if (fd < 0 || fd >= fddirsize) { + if (eofp) { + *eofp = 1; + } + goto out; + } + + if (fip->fi_list[fd].uf_file == NULL) + continue; + + dirent->d_ino = lxpr_inode(LXPR_PID_FD_FD, lxpnp->lxpr_pid, fd); + len = snprintf(dirent->d_name, LXPNSIZ, "%d", fd); + ASSERT(len < LXPNSIZ); + reclen = DIRENT64_RECLEN(len); + + dirent->d_off = (off64_t)(uoffset + LXPR_SDSIZE); + dirent->d_reclen = (ushort_t)reclen; + + if (reclen > uresid) { + /* + * Error if no entries have been returned yet. + */ + if (uresid == oresid) + error = EINVAL; + goto out; + } + + if ((error = uiomove((caddr_t)dirent, reclen, UIO_READ, + uiop)) != 0) + goto out; + } + + if (eofp != NULL) { + *eofp = + (uiop->uio_offset >= ((fddirsize+2) * LXPR_SDSIZE)) ? 1 : 0; + } + +out: + mutex_exit(&fip->fi_lock); + mutex_enter(&p->p_lock); + lxpr_unlock(p); + return (error); +} + + +/* + * lxpr_readlink(): Vnode operation for VOP_READLINK() + */ +/* ARGSUSED */ +static int +lxpr_readlink(vnode_t *vp, uio_t *uiop, cred_t *cr, caller_context_t *ct) +{ + char bp[MAXPATHLEN + 1]; + size_t buflen = sizeof (bp); + lxpr_node_t *lxpnp = VTOLXP(vp); + vnode_t *rvp = lxpnp->lxpr_realvp; + pid_t pid; + int error = 0; + + /* must be a symbolic link file */ + if (vp->v_type != VLNK) + return (EINVAL); + + /* Try to produce a symlink name for anything that has a realvp */ + if (rvp != NULL) { + if ((error = lxpr_access(vp, VREAD, 0, CRED(), ct)) != 0) + return (error); + if ((error = vnodetopath(NULL, rvp, bp, buflen, CRED())) != 0) + return (error); + } else { + switch (lxpnp->lxpr_type) { + case LXPR_SELF: + /* + * Convert pid to the Linux default of 1 if we're the + * zone's init process + */ + pid = ((curproc->p_pid != + curproc->p_zone->zone_proc_initpid) + ? curproc->p_pid : 1); + + /* + * Don't need to check result as every possible int + * will fit within MAXPATHLEN bytes. + */ + (void) snprintf(bp, buflen, "%d", pid); + break; + case LXPR_PID_CURDIR: + case LXPR_PID_ROOTDIR: + case LXPR_PID_EXE: + return (EACCES); + default: + /* + * Need to return error so that nothing thinks + * that the symlink is empty and hence "." + */ + return (EINVAL); + } + } + + /* copy the link data to user space */ + return (uiomove(bp, strlen(bp), UIO_READ, uiop)); +} + +/* + * lxpr_inactive(): Vnode operation for VOP_INACTIVE() + * Vnode is no longer referenced, deallocate the file + * and all its resources. 
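+ * (This is the complement of lxpr_getnode(): lxpr_freenode() is + * expected to tear down the node, including any hold taken on + * lxpr_realvp at lookup time.)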
+ */ +/* ARGSUSED */ +static void +lxpr_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + lxpr_freenode(VTOLXP(vp)); +} + +/* + * lxpr_sync(): Vnode operation for VOP_SYNC() + */ +static int +lxpr_sync() +{ + /* + * Nothing to sync but this function must never fail + */ + return (0); +} + +/* + * lxpr_cmp(): Vnode operation for VOP_CMP() + */ +static int +lxpr_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) +{ + vnode_t *rvp; + + while (vn_matchops(vp1, lxpr_vnodeops) && + (rvp = VTOLXP(vp1)->lxpr_realvp) != NULL) { + vp1 = rvp; + } + + while (vn_matchops(vp2, lxpr_vnodeops) && + (rvp = VTOLXP(vp2)->lxpr_realvp) != NULL) { + vp2 = rvp; + } + + if (vn_matchops(vp1, lxpr_vnodeops) || vn_matchops(vp2, lxpr_vnodeops)) + return (vp1 == vp2); + + return (VOP_CMP(vp1, vp2, ct)); +} + +/* + * lxpr_realvp(): Vnode operation for VOP_REALVP() + */ +static int +lxpr_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) +{ + vnode_t *rvp; + + if ((rvp = VTOLXP(vp)->lxpr_realvp) != NULL) { + vp = rvp; + if (VOP_REALVP(vp, &rvp, ct) == 0) + vp = rvp; + } + + *vpp = vp; + return (0); +} diff --git a/usr/src/uts/common/fs/lxproc/lxproc.h b/usr/src/uts/common/fs/lxproc/lxproc.h new file mode 100644 index 0000000000..e3718372e0 --- /dev/null +++ b/usr/src/uts/common/fs/lxproc/lxproc.h @@ -0,0 +1,277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Joyent, Inc. All rights reserved. 
+ */ + +#ifdef _LXPROC_BRANDED_H +#error Attempted to include native lxproc.h after branded lx_proc.h +#endif + +#ifndef _LXPROC_H +#define _LXPROC_H +#define _LXPROC_NATIVE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * lxproc.h: declarations, data structures and macros for lxprocfs + */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/policy.h> +#include <sys/debug.h> +#include <sys/dirent.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/var.h> +#include <sys/user.h> +#include <sys/t_lock.h> +#include <sys/sysmacros.h> +#include <sys/cred.h> +#include <sys/priv.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/statvfs.h> +#include <sys/cmn_err.h> +#include <sys/zone.h> +#include <sys/uio.h> +#include <sys/utsname.h> +#include <sys/dnlc.h> +#include <sys/atomic.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> +#include <vm/as.h> +#include <vm/anon.h> + +#define LX_SIGHUP 1 +#define LX_SIGINT 2 +#define LX_SIGQUIT 3 +#define LX_SIGILL 4 +#define LX_SIGTRAP 5 +#define LX_SIGABRT 6 +#define LX_SIGIOT 6 +#define LX_SIGBUS 7 +#define LX_SIGFPE 8 +#define LX_SIGKILL 9 +#define LX_SIGUSR1 10 +#define LX_SIGSEGV 11 +#define LX_SIGUSR2 12 +#define LX_SIGPIPE 13 +#define LX_SIGALRM 14 +#define LX_SIGTERM 15 +#define LX_SIGSTKFLT 16 +#define LX_SIGCHLD 17 +#define LX_SIGCONT 18 +#define LX_SIGSTOP 19 +#define LX_SIGTSTP 20 +#define LX_SIGTTIN 21 +#define LX_SIGTTOU 22 +#define LX_SIGURG 23 +#define LX_SIGXCPU 24 +#define LX_SIGXFSZ 25 +#define LX_SIGVTALRM 26 +#define LX_SIGPROF 27 +#define LX_SIGWINCH 28 +#define LX_SIGIO 29 +#define LX_SIGPOLL LX_SIGIO +#define LX_SIGPWR 30 +#define LX_SIGSYS 31 +#define LX_SIGUNUSED 31 + +#define LX_NSIG 64 /* Linux _NSIG */ + +#define LX_SIGRTMIN 32 +#define LX_SIGRTMAX LX_NSIG + +/* + * Convert a vnode into an lxpr_mnt_t + */ +#define VTOLXPM(vp) ((lxpr_mnt_t *)(vp)->v_vfsp->vfs_data) + +/* + * convert a vnode into an lxpr_node + */ +#define VTOLXP(vp) ((lxpr_node_t *)(vp)->v_data) + +/* + * convert a lxprnode into a vnode + */ +#define LXPTOV(lxpnp) ((lxpnp)->lxpr_vnode) + +/* + * convert a lxpr_node into zone for fs + */ +#define LXPTOZ(lxpnp) \ + (((lxpr_mnt_t *)(lxpnp)->lxpr_vnode->v_vfsp->vfs_data)->lxprm_zone) + +#define LXPNSIZ 256 /* max size of lx /proc file name entries */ + +/* + * Pretend that a directory entry takes 16 bytes + */ +#define LXPR_SDSIZE 16 + +/* + * Node/file types for lx /proc files + * (directories and files contained therein). 
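+ * + * Note that these values are also used to index the per-type dispatch + * tables in lxpr_vnops.c (e.g. lxpr_readdir_function[]), so entries + * here must stay in step with those tables.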
+ */ +typedef enum lxpr_nodetype { + LXPR_PROCDIR, /* /proc */ + LXPR_PIDDIR, /* /proc/<pid> */ + LXPR_PID_CMDLINE, /* /proc/<pid>/cmdline */ + LXPR_PID_CPU, /* /proc/<pid>/cpu */ + LXPR_PID_CURDIR, /* /proc/<pid>/cwd */ + LXPR_PID_ENV, /* /proc/<pid>/environ */ + LXPR_PID_EXE, /* /proc/<pid>/exe */ + LXPR_PID_MAPS, /* /proc/<pid>/maps */ + LXPR_PID_MEM, /* /proc/<pid>/mem */ + LXPR_PID_ROOTDIR, /* /proc/<pid>/root */ + LXPR_PID_STAT, /* /proc/<pid>/stat */ + LXPR_PID_STATM, /* /proc/<pid>/statm */ + LXPR_PID_STATUS, /* /proc/<pid>/status */ + LXPR_PID_FDDIR, /* /proc/<pid>/fd */ + LXPR_PID_FD_FD, /* /proc/<pid>/fd/nn */ + LXPR_CMDLINE, /* /proc/cmdline */ + LXPR_CPUINFO, /* /proc/cpuinfo */ + LXPR_DEVICES, /* /proc/devices */ + LXPR_DMA, /* /proc/dma */ + LXPR_FILESYSTEMS, /* /proc/filesystems */ + LXPR_INTERRUPTS, /* /proc/interrupts */ + LXPR_IOPORTS, /* /proc/ioports */ + LXPR_KCORE, /* /proc/kcore */ + LXPR_KMSG, /* /proc/kmsg */ + LXPR_LOADAVG, /* /proc/loadavg */ + LXPR_MEMINFO, /* /proc/meminfo */ + LXPR_MOUNTS, /* /proc/mounts */ + LXPR_NETDIR, /* /proc/net */ + LXPR_NET_ARP, /* /proc/net/arp */ + LXPR_NET_DEV, /* /proc/net/dev */ + LXPR_NET_DEV_MCAST, /* /proc/net/dev_mcast */ + LXPR_NET_IGMP, /* /proc/net/igmp */ + LXPR_NET_IP_MR_CACHE, /* /proc/net/ip_mr_cache */ + LXPR_NET_IP_MR_VIF, /* /proc/net/ip_mr_vif */ + LXPR_NET_MCFILTER, /* /proc/net/mcfilter */ + LXPR_NET_NETSTAT, /* /proc/net/netstat */ + LXPR_NET_RAW, /* /proc/net/raw */ + LXPR_NET_ROUTE, /* /proc/net/route */ + LXPR_NET_RPC, /* /proc/net/rpc */ + LXPR_NET_RT_CACHE, /* /proc/net/rt_cache */ + LXPR_NET_SOCKSTAT, /* /proc/net/sockstat */ + LXPR_NET_SNMP, /* /proc/net/snmp */ + LXPR_NET_STAT, /* /proc/net/stat */ + LXPR_NET_TCP, /* /proc/net/tcp */ + LXPR_NET_UDP, /* /proc/net/udp */ + LXPR_NET_UNIX, /* /proc/net/unix */ + LXPR_PARTITIONS, /* /proc/partitions */ + LXPR_SELF, /* /proc/self */ + LXPR_STAT, /* /proc/stat */ + LXPR_UPTIME, /* /proc/uptime */ + LXPR_VERSION, /* /proc/version */ + LXPR_NFILES /* number of lx /proc file types */ +} lxpr_nodetype_t; + +/* + * Number of fds allowed for in the inode number calculation + * per process (if a process has more fds then inode numbers + * may be duplicated) + */ +#define LXPR_FD_PERPROC 2000 + +/* + * external dirent characteristics + */ +#define LXPRMAXNAMELEN 14 +typedef struct { + lxpr_nodetype_t d_type; + char d_name[LXPRMAXNAMELEN]; +} lxpr_dirent_t; + +/* + * This is the lxprocfs private data object + * which is attached to v_data in the vnode structure + */ +typedef struct lxpr_node { + lxpr_nodetype_t lxpr_type; /* type of this node */ + vnode_t *lxpr_vnode; /* vnode for the node */ + vnode_t *lxpr_parent; /* parent directory */ + vnode_t *lxpr_realvp; /* real vnode, file in dirs */ + timestruc_t lxpr_time; /* creation etc time for file */ + mode_t lxpr_mode; /* file mode bits */ + uid_t lxpr_uid; /* file owner */ + gid_t lxpr_gid; /* file group owner */ + pid_t lxpr_pid; /* pid of proc referred to */ + ino_t lxpr_ino; /* node id */ +} lxpr_node_t; + +struct zone; /* forward declaration */ + +/* + * This is the lxprocfs private data object + * which is attached to vfs_data in the vfs structure + */ +typedef struct lxpr_mnt { + lxpr_node_t *lxprm_node; /* node at root of proc mount */ + struct zone *lxprm_zone; /* zone for this mount */ + ldi_ident_t lxprm_li; /* ident for ldi */ +} lxpr_mnt_t; + +extern vnodeops_t *lxpr_vnodeops; +extern int nproc_highbit; /* highbit(v.v_nproc) */ + +typedef struct mounta mounta_t; + +extern void lxpr_initnodecache(); +extern 
void lxpr_fininodecache(); +extern void lxpr_initrootnode(lxpr_node_t **, vfs_t *); +extern ino_t lxpr_inode(lxpr_nodetype_t, pid_t, int); +extern ino_t lxpr_parentinode(lxpr_node_t *); +extern lxpr_node_t *lxpr_getnode(vnode_t *, lxpr_nodetype_t, proc_t *, int); +extern void lxpr_freenode(lxpr_node_t *); + +typedef struct lxpr_uiobuf lxpr_uiobuf_t; +extern lxpr_uiobuf_t *lxpr_uiobuf_new(uio_t *); +extern void lxpr_uiobuf_free(lxpr_uiobuf_t *); +extern int lxpr_uiobuf_flush(lxpr_uiobuf_t *); +extern void lxpr_uiobuf_seek(lxpr_uiobuf_t *, offset_t); +extern void lxpr_uiobuf_write(lxpr_uiobuf_t *, const char *, size_t); +extern void lxpr_uiobuf_printf(lxpr_uiobuf_t *, const char *, ...); +extern void lxpr_uiobuf_seterr(lxpr_uiobuf_t *, int); + +proc_t *lxpr_lock(pid_t); +void lxpr_unlock(proc_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LXPROC_H */ diff --git a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c index 207a708771..2176dcb9de 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* diff --git a/usr/src/uts/common/fs/nfs/nfs3_vnops.c b/usr/src/uts/common/fs/nfs/nfs3_vnops.c index b7354c168a..cd1e08d5d7 100644 --- a/usr/src/uts/common/fs/nfs/nfs3_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs3_vnops.c @@ -29,7 +29,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -3353,10 +3353,9 @@ nfs3rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { diff --git a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c index ce0c9485a6..3ee41939ac 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c +++ b/usr/src/uts/common/fs/nfs/nfs4_srv_ns.c @@ -22,6 +22,7 @@ /* * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/systm.h> @@ -178,12 +179,12 @@ pseudo_exportfs(vnode_t *vp, fid_t *fid, struct exp_visible *vis_head, kex = &exi->exi_export; kex->ex_flags = EX_PSEUDO; - vpathlen = vp->v_path ? strlen(vp->v_path) : 0; + vpathlen = strlen(vp->v_path); kex->ex_pathlen = vpathlen + strlen(PSEUDOFS_SUFFIX); kex->ex_path = kmem_alloc(kex->ex_pathlen + 1, KM_SLEEP); if (vpathlen) - (void) strcpy(kex->ex_path, vp->v_path); + (void) strncpy(kex->ex_path, vp->v_path, vpathlen); (void) strcpy(kex->ex_path + vpathlen, PSEUDOFS_SUFFIX); /* Transfer the secinfo data from exdata to this new pseudo node */ diff --git a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c index 151cb62403..55f6c95289 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vfsops.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. 
*/ /* diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index d6bf384a8b..2b3fdfdd55 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -34,7 +34,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -8061,8 +8061,9 @@ link_call: * vnode if it already existed. */ if (error == 0) { - vnode_t *tvp; + vnode_t *tvp, *tovp; rnode4_t *trp; + /* * Notify the vnode. Each link is represented by * a different vnode in nfsv4. @@ -8075,23 +8076,20 @@ link_call: vnevent_rename_dest(tvp, ndvp, nnm, ct); } - /* - * if the source and destination directory are not the - * same notify the destination directory. - */ - if (VTOR4(odvp) != VTOR4(ndvp)) { - trp = VTOR4(ndvp); - tvp = ndvp; - if (IS_SHADOW(ndvp, trp)) - tvp = RTOV4(trp); - vnevent_rename_dest_dir(tvp, ct); - } - trp = VTOR4(ovp); - tvp = ovp; + tovp = ovp; if (IS_SHADOW(ovp, trp)) + tovp = RTOV4(trp); + + vnevent_rename_src(tovp, odvp, onm, ct); + + trp = VTOR4(ndvp); + tvp = ndvp; + + if (IS_SHADOW(ndvp, trp)) tvp = RTOV4(trp); - vnevent_rename_src(tvp, odvp, onm, ct); + + vnevent_rename_dest_dir(tvp, tovp, nnm, ct); } if (nvp) { diff --git a/usr/src/uts/common/fs/nfs/nfs_auth.c b/usr/src/uts/common/fs/nfs/nfs_auth.c index 3410340581..a8bcfdf438 100644 --- a/usr/src/uts/common/fs/nfs/nfs_auth.c +++ b/usr/src/uts/common/fs/nfs/nfs_auth.c @@ -22,6 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ #include <sys/param.h> @@ -559,11 +560,16 @@ retry: *access = res.ares.auth_perm; *srv_uid = res.ares.auth_srv_uid; *srv_gid = res.ares.auth_srv_gid; - *srv_gids_cnt = res.ares.auth_srv_gids.len; - *srv_gids = kmem_alloc(*srv_gids_cnt * sizeof (gid_t), - KM_SLEEP); - bcopy(res.ares.auth_srv_gids.val, *srv_gids, - *srv_gids_cnt * sizeof (gid_t)); + + if ((*srv_gids_cnt = res.ares.auth_srv_gids.len) != 0) { + *srv_gids = kmem_alloc(*srv_gids_cnt * + sizeof (gid_t), KM_SLEEP); + bcopy(res.ares.auth_srv_gids.val, *srv_gids, + *srv_gids_cnt * sizeof (gid_t)); + } else { + *srv_gids = NULL; + } + break; case NFSAUTH_DR_EFAIL: @@ -1114,9 +1120,13 @@ wait: if (gid != NULL) *gid = p->auth_srv_gid; if (ngids != NULL && gids != NULL) { - *ngids = p->auth_srv_ngids; - *gids = kmem_alloc(*ngids * sizeof (gid_t), KM_SLEEP); - bcopy(p->auth_srv_gids, *gids, *ngids * sizeof (gid_t)); + if ((*ngids = p->auth_srv_ngids) != 0) { + size_t sz = *ngids * sizeof (gid_t); + *gids = kmem_alloc(sz, KM_SLEEP); + bcopy(p->auth_srv_gids, *gids, sz); + } else { + *gids = NULL; + } } access = p->auth_access; diff --git a/usr/src/uts/common/fs/nfs/nfs_server.c b/usr/src/uts/common/fs/nfs/nfs_server.c index 7e94c62734..dcc64def59 100644 --- a/usr/src/uts/common/fs/nfs/nfs_server.c +++ b/usr/src/uts/common/fs/nfs/nfs_server.c @@ -23,6 +23,7 @@ * Copyright (c) 2011 Bayard G. Bell. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc.
*/ /* @@ -2570,6 +2571,9 @@ nfs_srvinit(void) { int error; + if (getzoneid() != GLOBAL_ZONEID) + return (EACCES); + error = nfs_exportinit(); if (error != 0) return (error); @@ -3208,7 +3212,7 @@ nfs_getflabel(vnode_t *vp, struct exportinfo *exi) char *path; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { zone = zone_find_by_any_path(vp->v_path, B_FALSE); mutex_exit(&vp->v_lock); } else { diff --git a/usr/src/uts/common/fs/nfs/nfs_vfsops.c b/usr/src/uts/common/fs/nfs/nfs_vfsops.c index 57b21778b4..ffd5380a86 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vfsops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. * * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All rights reserved. diff --git a/usr/src/uts/common/fs/nfs/nfs_vnops.c b/usr/src/uts/common/fs/nfs/nfs_vnops.c index 1a1082bcb8..8c321c8aa3 100644 --- a/usr/src/uts/common/fs/nfs/nfs_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs_vnops.c @@ -26,7 +26,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -2688,11 +2688,9 @@ nfsrename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, if (nvp) vnevent_rename_dest(nvp, ndvp, nnm, ct); - if (odvp != ndvp) - vnevent_rename_dest_dir(ndvp, ct); - ASSERT(ovp != NULL); vnevent_rename_src(ovp, odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, ovp, nnm, ct); } if (nvp) { diff --git a/usr/src/uts/common/fs/pcfs/pc_dir.c b/usr/src/uts/common/fs/pcfs/pc_dir.c index a9ee604b7c..21a0b1a4bd 100644 --- a/usr/src/uts/common/fs/pcfs/pc_dir.c +++ b/usr/src/uts/common/fs/pcfs/pc_dir.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/errno.h> #include <sys/systm.h> @@ -822,10 +826,10 @@ top: return (error); } } -out: - vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); - if (dp != tdp) { - vnevent_rename_dest_dir(PCTOV(tdp), ctp); + + if (error == 0) { + vnevent_rename_src(PCTOV(pcp), PCTOV(dp), snm, ctp); + vnevent_rename_dest_dir(PCTOV(tdp), PCTOV(pcp), tnm, ctp); } VN_RELE(PCTOV(pcp)); diff --git a/usr/src/uts/common/fs/portfs/port.c b/usr/src/uts/common/fs/portfs/port.c index 14be8cbbae..11b7386269 100644 --- a/usr/src/uts/common/fs/portfs/port.c +++ b/usr/src/uts/common/fs/portfs/port.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. 
+ */ #include <sys/types.h> #include <sys/systm.h> @@ -1381,12 +1383,18 @@ portnowait: if (model == DATAMODEL_NATIVE) { eventsz = sizeof (port_event_t); - kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp = NULL; + } else { + kevp = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { @@ -1423,12 +1431,18 @@ portnowait: port_event32_t *kevp32; eventsz = sizeof (port_event32_t); - kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); - if (kevp32 == NULL) { - if (nmax > pp->port_max_list) - nmax = pp->port_max_list; - kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + + if (nmax == 0) { + kevp32 = NULL; + } else { + kevp32 = kmem_alloc(eventsz * nmax, KM_NOSLEEP); + if (kevp32 == NULL) { + if (nmax > pp->port_max_list) + nmax = pp->port_max_list; + kevp32 = kmem_alloc(eventsz * nmax, KM_SLEEP); + } } + results = kevp32; lev = NULL; /* start with first event in the queue */ for (nevents = 0; nevents < nmax; ) { diff --git a/usr/src/uts/common/fs/portfs/port_vnops.c b/usr/src/uts/common/fs/portfs/port_vnops.c index b2f5088e06..ab95c0a1f8 100644 --- a/usr/src/uts/common/fs/portfs/port_vnops.c +++ b/usr/src/uts/common/fs/portfs/port_vnops.c @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #include <sys/types.h> #include <sys/vnode.h> #include <sys/vfs_opreg.h> @@ -294,14 +298,10 @@ port_poll(vnode_t *vp, short events, int anyyet, short *reventsp, levents |= POLLOUT; levents &= events; *reventsp = levents; - if (levents == 0) { - if (!anyyet) { - *phpp = &pp->port_pollhd; - portq->portq_flags |= - events & POLLIN ? PORTQ_POLLIN : 0; - portq->portq_flags |= - events & POLLOUT ? PORTQ_POLLOUT : 0; - } + if ((levents == 0 && !anyyet) || (events & POLLET)) { + *phpp = &pp->port_pollhd; + portq->portq_flags |= events & POLLIN ? PORTQ_POLLIN : 0; + portq->portq_flags |= events & POLLOUT ? PORTQ_POLLOUT : 0; } mutex_exit(&portq->portq_mutex); return (0); diff --git a/usr/src/uts/common/fs/proc/prargv.c b/usr/src/uts/common/fs/proc/prargv.c new file mode 100644 index 0000000000..a4ad82d661 --- /dev/null +++ b/usr/src/uts/common/fs/proc/prargv.c @@ -0,0 +1,441 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/sysmacros.h> +#include <vm/as.h> + +/* + * Safely read a contiguous region of memory from 'addr' in the address space + * of a particular process into the supplied kernel buffer (*buf, sz). + * Partially mapped regions will result in a partial read terminating at the + * first hole in the address space. The number of bytes actually read is + * returned to the caller via 'rdsz'. 
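+ * + * Reads are issued in page-sized, page-aligned chunks; e.g. + * (illustrative numbers, 4K pages) a 6000-byte read starting 512 bytes + * into a page becomes a 3584-byte chunk followed by a 2416-byte chunk, + * stopping early if a chunk faults with ENXIO.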
+ */ +static int +prreadbuf(proc_t *p, uintptr_t ustart, uint8_t *buf, size_t sz, size_t *rdsz) +{ + int error = 0; + size_t rem = sz; + off_t pos = 0; + + if (rdsz != NULL) + *rdsz = 0; + + while (rem != 0) { + uintptr_t addr = ustart + pos; + size_t len = MIN(rem, PAGESIZE - (addr & PAGEOFFSET)); + + if ((error = uread(p, buf + pos, len, addr)) != 0) { + if (error == ENXIO) { + /* + * ENXIO from uread() indicates that the page + * does not exist. This will simply be a + * partial read. + */ + error = 0; + } + break; + } + + rem -= len; + pos += len; + } + + if (rdsz != NULL) + *rdsz = pos; + + return (error); +} + +/* + * Attempt to read the argument vector (argv) from this process. The caller + * must hold the p_lock mutex, and have marked the process P_PR_LOCK (e.g. via + * prlock or lx_prlock). + * + * The caller must provide a buffer (buf, buflen). We will concatenate each + * argument string (including the NUL terminator) into this buffer. The number + * of characters written to this buffer (including the final NUL terminator) + * will be stored in 'slen'. + */ +int +prreadargv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *argv = NULL; + size_t argvsz = 0; + int i; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_argv == NULL) { + /* + * Return the regular psargs string to the caller. + */ + bcopy(up->u_psargs, buf, MIN(bufsz, sizeof (up->u_psargs))); + buf[bufsz - 1] = '\0'; + *slen = strlen(buf) + 1; + + return (0); + } + + /* + * Allocate space to store argv array. + */ + argvsz = up->u_argc * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + argv = kmem_alloc(argvsz, KM_SLEEP); + + /* + * Extract the argv array from the target process. Drop p_lock + * while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + if ((error = prreadbuf(p, up->u_argv, (uint8_t *)argv, argvsz, + NULL)) != 0) { + kmem_free(argv, argvsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each argument string from the pointers in the argv array. + */ + pos = 0; + for (i = 0; i < up->u_argc; i++) { + size_t rdsz, trysz; + uintptr_t arg; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + arg = (uintptr_t)((caddr32_t *)argv)[i]; + } else { + arg = (uintptr_t)argv[i]; + } +#else + arg = (uintptr_t)argv[i]; +#endif + + /* + * Stop trying to read arguments if we reach a NULL + * pointer in the vector. + */ + if (arg == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual argument strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this argument. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, arg, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. 
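+ * If none is found on the first pass, the code below retries once with + * the full remaining buffer; e.g. a 200-character argument is fetched + * by the initial 80-byte probe plus one wider read.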
+ */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(argv, argvsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} + +/* + * Similar to prreadargv() except that it reads the env vector. This is + * slightly more complex because there is no count for the env vector that + * corresponds to u_argc. + */ +int +prreadenvv(proc_t *p, char *buf, size_t bufsz, size_t *slen) +{ + int error; + user_t *up; + struct as *as; + size_t pos = 0; + caddr_t *envp = NULL; + uintptr_t tmpp = NULL; + size_t envpsz = 0, rdsz = 0; + int i; + int cnt, bound; + + VERIFY(MUTEX_HELD(&p->p_lock)); + VERIFY(p->p_proc_flag & P_PR_LOCK); + + up = PTOU(p); + as = p->p_as; + + if ((p->p_flag & SSYS) || as == &kas || up->u_envp == NULL) { + /* + * Return empty string. + */ + buf[0] = '\0'; + *slen = 1; + + return (0); + } + + /* + * Drop p_lock while we do I/O to avoid deadlock with the clock thread. + */ + mutex_exit(&p->p_lock); + + /* + * We first have to count how many env entries we have. This is + * somewhat painful. We extract the env entries from the target process + * one entry at a time. Stop trying to read env entries if we reach a + * NULL pointer in the vector or hit our upper bound (which we take + * as bufsz/4) to ensure we don't run off. + */ + rdsz = (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + bound = (int)(bufsz / 4); + for (cnt = 0, tmpp = up->u_envp; cnt < bound; cnt++, tmpp += rdsz) { + caddr_t tmp = NULL; + + if ((error = prreadbuf(p, tmpp, (uint8_t *)&tmp, rdsz, + NULL)) != 0) { + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + if (tmp == NULL) + break; + } + if (cnt == 0) { + /* Return empty string. */ + buf[0] = '\0'; + *slen = 1; + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (0); + } + + /* + * Allocate space to store env array. + */ + envpsz = cnt * (p->p_model == DATAMODEL_ILP32 ? + sizeof (caddr32_t) : sizeof (caddr_t)); + envp = kmem_alloc(envpsz, KM_SLEEP); + + /* + * Extract the env array from the target process. + */ + if ((error = prreadbuf(p, up->u_envp, (uint8_t *)envp, envpsz, + NULL)) != 0) { + kmem_free(envp, envpsz); + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + return (-1); + } + + /* + * Read each env string from the pointers in the env array. + */ + pos = 0; + for (i = 0; i < cnt; i++) { + size_t rdsz, trysz; + uintptr_t ev; + off_t j; + boolean_t found_nul; + boolean_t do_retry = B_TRUE; + +#ifdef _SYSCALL32_IMPL + if (p->p_model == DATAMODEL_ILP32) { + ev = (uintptr_t)((caddr32_t *)envp)[i]; + } else { + ev = (uintptr_t)envp[i]; + } +#else + ev = (uintptr_t)envp[i]; +#endif + + /* + * Stop trying to read env entries if we reach a NULL + * pointer in the vector.
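+ * (The counting pass above already bounded the vector at bufsz/4 + * entries, so this loop is finite even for a corrupt environment.)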
+ */ + if (ev == NULL) + break; + + /* + * Stop reading if we have read the maximum length + * we can return to the user. + */ + if (pos >= bufsz) + break; + + /* + * Initially we try a short read, on the assumption that + * most individual env strings are less than 80 + * characters long. + */ + if ((trysz = MIN(80, bufsz - pos - 1)) < 80) { + /* + * We don't have room in the target buffer for even + * an entire short read, so there is no need to retry + * with a longer read. + */ + do_retry = B_FALSE; + } + +retry: + /* + * Read string data for this env var. Leave room + * in the buffer for a final NUL terminator. + */ + if ((error = prreadbuf(p, ev, (uint8_t *)&buf[pos], trysz, + &rdsz)) != 0) { + /* + * There was a problem reading this string + * from the process. Give up. + */ + break; + } + + /* + * Find the NUL terminator. + */ + found_nul = B_FALSE; + for (j = 0; j < rdsz; j++) { + if (buf[pos + j] == '\0') { + found_nul = B_TRUE; + break; + } + } + + if (!found_nul && do_retry) { + /* + * We did not find a NUL terminator, but this + * was a first pass short read. Try once more + * with feeling. + */ + trysz = bufsz - pos - 1; + do_retry = B_FALSE; + goto retry; + } + + /* + * Commit the string we read to the buffer. + */ + pos += j + 1; + if (!found_nul && pos < bufsz) { + /* + * A NUL terminator was not found; add one. + */ + buf[pos++] = '\0'; + } + } + + /* + * Ensure the entire string is NUL-terminated. + */ + buf[bufsz - 1] = '\0'; + + mutex_enter(&p->p_lock); + VERIFY(p->p_proc_flag & P_PR_LOCK); + kmem_free(envp, envpsz); + + /* + * If the operation was a success, return the copied string length + * to the caller. + */ + *slen = (error == 0) ? pos : 0; + + return (error); +} diff --git a/usr/src/uts/common/fs/proc/prcontrol.c b/usr/src/uts/common/fs/proc/prcontrol.c index a73a64a4a4..7e99d23b97 100644 --- a/usr/src/uts/common/fs/proc/prcontrol.c +++ b/usr/src/uts/common/fs/proc/prcontrol.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -1481,7 +1481,7 @@ pr_setsig(prnode_t *pnp, siginfo_t *sip) } else if (t->t_state == TS_STOPPED && sig == SIGKILL) { /* If SIGKILL, set stopped lwp running */ p->p_stopsig = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; t->t_dtrace_stop = 0; setrun_locked(t); } @@ -2276,9 +2276,17 @@ pr_szoneid(proc_t *p, zoneid_t zoneid, cred_t *cr) return (EPERM); if (zoneid != GLOBAL_ZONEID && zoneid != p->p_zone->zone_id) return (EINVAL); - if ((zptr = zone_find_by_id(zoneid)) == NULL) - return (EINVAL); + /* + * We cannot hold p_lock when we call zone_find_by_id since that can + * lead to a deadlock. zone_find_by_id() takes zonehash_lock. + * zone_enter() can hold the zonehash_lock and needs p_lock when it + * calls task_join. + */ mutex_exit(&p->p_lock); + if ((zptr = zone_find_by_id(zoneid)) == NULL) { + mutex_enter(&p->p_lock); + return (EINVAL); + } mutex_enter(&p->p_crlock); oldcred = p->p_cred; crhold(oldcred); diff --git a/usr/src/uts/common/fs/proc/prdata.h b/usr/src/uts/common/fs/proc/prdata.h index 8ea516bf82..72f26b3c05 100644 --- a/usr/src/uts/common/fs/proc/prdata.h +++ b/usr/src/uts/common/fs/proc/prdata.h @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _SYS_PROC_PRDATA_H @@ -123,6 +123,7 @@ typedef enum prnodetype { #if defined(__i386) || defined(__amd64) PR_LDT, /* /proc/<pid>/ldt */ #endif + PR_ARGV, /* /proc/<pid>/argv */ PR_USAGE, /* /proc/<pid>/usage */ PR_LUSAGE, /* /proc/<pid>/lusage */ PR_PAGEDATA, /* /proc/<pid>/pagedata */ @@ -347,6 +348,8 @@ extern int pr_unset(proc_t *, long); extern void pr_sethold(prnode_t *, sigset_t *); extern void pr_setfault(proc_t *, fltset_t *); extern int prusrio(proc_t *, enum uio_rw, struct uio *, int); +extern int prreadargv(proc_t *, char *, size_t, size_t *); +extern int prreadenvv(proc_t *, char *, size_t, size_t *); extern int prwritectl(vnode_t *, struct uio *, cred_t *); extern int prlock(prnode_t *, int); extern void prunmark(proc_t *); diff --git a/usr/src/uts/common/fs/proc/prsubr.c b/usr/src/uts/common/fs/proc/prsubr.c index 7801fd0ac8..284bf8cb88 100644 --- a/usr/src/uts/common/fs/proc/prsubr.c +++ b/usr/src/uts/common/fs/proc/prsubr.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -201,6 +201,7 @@ prchoose(proc_t *p) case PR_SYSEXIT: case PR_SIGNALLED: case PR_FAULTED: + case PR_BRAND: /* * Make an lwp calling exit() be the * last lwp seen in the process. diff --git a/usr/src/uts/common/fs/proc/prvnops.c b/usr/src/uts/common/fs/proc/prvnops.c index 411c9b8b0b..276f54ae3a 100644 --- a/usr/src/uts/common/fs/proc/prvnops.c +++ b/usr/src/uts/common/fs/proc/prvnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -96,6 +96,11 @@ struct prdirect { #define PRSDSIZE (sizeof (struct prdirect)) /* + * Maximum length of the /proc/$$/argv file: + */ +int prmaxargvlen = 4096; + +/* * Directory characteristics. */ typedef struct prdirent { @@ -166,6 +171,8 @@ static prdirent_t piddir[] = { { PR_LDT, 27 * sizeof (prdirent_t), sizeof (prdirent_t), "ldt" }, #endif + { PR_ARGV, 28 * sizeof (prdirent_t), sizeof (prdirent_t), + "argv" }, }; #define NPIDDIRFILES (sizeof (piddir) / sizeof (piddir[0]) - 2) @@ -582,6 +589,7 @@ static int pr_read_inval(), pr_read_as(), pr_read_status(), #if defined(__x86) pr_read_ldt(), #endif + pr_read_argv(), pr_read_usage(), pr_read_lusage(), pr_read_pagedata(), pr_read_watch(), pr_read_lwpstatus(), pr_read_lwpsinfo(), pr_read_lwpusage(), pr_read_xregs(), pr_read_priv(), @@ -610,6 +618,7 @@ static int (*pr_read_function[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage, /* /proc/<pid>/usage */ pr_read_lusage, /* /proc/<pid>/lusage */ pr_read_pagedata, /* /proc/<pid>/pagedata */ @@ -672,6 +681,41 @@ pr_uioread(void *base, long count, uio_t *uiop) } static int +pr_read_argv(prnode_t *pnp, uio_t *uiop) +{ + char *args; + int error; + size_t asz = prmaxargvlen, sz; + + /* + * Allocate a scratch buffer for collection of the process arguments. 
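+ * + * From userland the result reads as NUL-separated strings; a + * hypothetical consumer might do, in outline, + * + * fd = open("/proc/1234/argv", O_RDONLY); + * n = read(fd, buf, sizeof (buf)); + * + * and then split 'buf' on the '\0' boundaries.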
+ */ + args = kmem_alloc(asz, KM_SLEEP); + + ASSERT(pnp->pr_type == PR_ARGV); + + if ((error = prlock(pnp, ZNO)) != 0) { + kmem_free(args, asz); + return (error); + } + + if ((error = prreadargv(pnp->pr_common->prc_proc, args, asz, + &sz)) != 0) { + prunlock(pnp); + kmem_free(args, asz); + return (error); + } + + prunlock(pnp); + + error = pr_uioread(args, sz, uiop); + + kmem_free(args, asz); + + return (error); +} + +static int pr_read_as(prnode_t *pnp, uio_t *uiop) { int error; @@ -1767,6 +1811,7 @@ static int (*pr_read_function_32[PR_NFILES])() = { #if defined(__x86) pr_read_ldt, /* /proc/<pid>/ldt */ #endif + pr_read_argv, /* /proc/<pid>/argv */ pr_read_usage_32, /* /proc/<pid>/usage */ pr_read_lusage_32, /* /proc/<pid>/lusage */ pr_read_pagedata_32, /* /proc/<pid>/pagedata */ @@ -2686,6 +2731,103 @@ prread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) #endif } +/* + * We make pr_write_psinfo_fname() somewhat simpler by asserting at compile + * time that PRFNSZ has the same definition as MAXCOMLEN. + */ +#if PRFNSZ != MAXCOMLEN +#error PRFNSZ/MAXCOMLEN mismatch +#endif + +static int +pr_write_psinfo_fname(prnode_t *pnp, uio_t *uiop) +{ + char fname[PRFNSZ]; + int offset = offsetof(psinfo_t, pr_fname), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_fname); +#endif + + /* + * If this isn't a write to pr_fname (or if the size doesn't match + * PRFNSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRFNSZ) + return (0); + + if ((error = uiomove(fname, PRFNSZ, UIO_WRITE, uiop)) != 0) + return (error); + + fname[PRFNSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(fname, pnp->pr_common->prc_proc->p_user.u_comm, PRFNSZ); + + prunlock(pnp); + + return (0); +} + +/* + * We make pr_write_psinfo_psargs() somewhat simpler by asserting at compile + * time that PRARGSZ has the same definition as PSARGSZ. + */ +#if PRARGSZ != PSARGSZ +#error PRARGSZ/PSARGSZ mismatch +#endif + +static int +pr_write_psinfo_psargs(prnode_t *pnp, uio_t *uiop) +{ + char psargs[PRARGSZ]; + int offset = offsetof(psinfo_t, pr_psargs), error; + +#ifdef _SYSCALL32_IMPL + if (curproc->p_model != DATAMODEL_LP64) + offset = offsetof(psinfo32_t, pr_psargs); +#endif + + /* + * If this isn't a write to pr_psargs (or if the size doesn't match + * PRARGSZ) return. + */ + if (uiop->uio_offset != offset || uiop->uio_resid != PRARGSZ) + return (0); + + if ((error = uiomove(psargs, PRARGSZ, UIO_WRITE, uiop)) != 0) + return (error); + + psargs[PRARGSZ - 1] = '\0'; + + if ((error = prlock(pnp, ZNO)) != 0) + return (error); + + bcopy(psargs, pnp->pr_common->prc_proc->p_user.u_psargs, PRARGSZ); + + prunlock(pnp); + + return (0); +} + +int +pr_write_psinfo(prnode_t *pnp, uio_t *uiop) +{ + int error; + + if ((error = pr_write_psinfo_fname(pnp, uiop)) != 0) + return (error); + + if ((error = pr_write_psinfo_psargs(pnp, uiop)) != 0) + return (error); + + return (0); +} + + /* ARGSUSED */ static int prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) @@ -2764,6 +2906,9 @@ prwrite(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct) uiop->uio_resid = resid; return (error); + case PR_PSINFO: + return (pr_write_psinfo(pnp, uiop)); + default: return ((vp->v_type == VDIR)? 
EISDIR : EBADF); } @@ -3047,6 +3192,13 @@ prgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, case PR_AUXV: vap->va_size = __KERN_NAUXV_IMPL * PR_OBJSIZE(auxv32_t, auxv_t); break; + case PR_ARGV: + if ((p->p_flag & SSYS) || p->p_as == &kas) { + vap->va_size = PSARGSZ; + } else { + vap->va_size = prmaxargvlen; + } + break; #if defined(__x86) case PR_LDT: mutex_exit(&p->p_lock); @@ -3222,6 +3374,7 @@ praccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: p = pr_p_lock(pnp); mutex_exit(&pr_pidlock); if (p == NULL) @@ -3307,6 +3460,7 @@ static vnode_t *(*pr_lookup_function[PR_NFILES])() = { #if defined(__x86) pr_lookup_notdir, /* /proc/<pid>/ldt */ #endif + pr_lookup_notdir, /* /proc/<pid>/argv */ pr_lookup_notdir, /* /proc/<pid>/usage */ pr_lookup_notdir, /* /proc/<pid>/lusage */ pr_lookup_notdir, /* /proc/<pid>/pagedata */ @@ -4546,11 +4700,15 @@ prgetnode(vnode_t *dp, prnodetype_t type) break; case PR_PSINFO: + pnp->pr_mode = 0644; /* readable by all + owner can write */ + break; + case PR_LPSINFO: case PR_LWPSINFO: case PR_USAGE: case PR_LUSAGE: case PR_LWPUSAGE: + case PR_ARGV: pnp->pr_mode = 0444; /* read-only by all */ break; @@ -4656,6 +4814,7 @@ static int (*pr_readdir_function[PR_NFILES])() = { #if defined(__x86) pr_readdir_notdir, /* /proc/<pid>/ldt */ #endif + pr_readdir_notdir, /* /proc/<pid>/argv */ pr_readdir_notdir, /* /proc/<pid>/usage */ pr_readdir_notdir, /* /proc/<pid>/lusage */ pr_readdir_notdir, /* /proc/<pid>/pagedata */ @@ -4805,6 +4964,7 @@ pr_readdir_piddir(prnode_t *pnp, uio_t *uiop, int *eofp) case PR_PROCDIR: case PR_PSINFO: case PR_USAGE: + case PR_ARGV: break; default: continue; @@ -6010,7 +6170,7 @@ prpoll(vnode_t *vp, short events, int anyyet, short *reventsp, } *reventsp = revents; - if (!anyyet && revents == 0) { + if ((!anyyet && revents == 0) || (events & POLLET)) { /* * Arrange to wake up the polling lwp when * the target process/lwp stops or terminates diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c index 703e26ea61..682f1d867b 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -501,6 +502,9 @@ sonode_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + return (0); } @@ -654,6 +658,10 @@ sonode_fini(struct sonode *so) if (so->so_filter_top != NULL) sof_sonode_cleanup(so); + /* Clean up any remnants of krecv callbacks */ + so->so_krecv_cb = NULL; + so->so_krecv_arg = NULL; + ASSERT(list_is_empty(&so->so_acceptq_list)); ASSERT(list_is_empty(&so->so_acceptq_defer)); ASSERT(!list_link_active(&so->so_acceptq_node)); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 0be628f329..40c368736d 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -128,7 +128,7 @@ so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, { int error; - SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_BIND(so, name, namelen, flags, cr)); ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD); @@ -586,11 +586,6 @@ so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp)); - if ((so->so_mode & SM_SENDFILESUPP) == 0) { - SO_UNBLOCK_FALLBACK(so); - return (EOPNOTSUPP); - } - error = so_sendmblk_impl(so, msg, fflag, cr, mpp, so->so_filter_top, B_FALSE); @@ -653,7 +648,7 @@ so_getsockname(struct sonode *so, struct sockaddr *addr, { int error; - SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); if (so->so_filter_active == 0 || (error = sof_filter_getsockname(so, addr, addrlen, cr)) < 0) @@ -702,7 +697,7 @@ so_getsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_getsockopt(so, option_name, optval, optlenp, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr)); if ((so->so_filter_active == 0 || @@ -791,7 +786,7 @@ so_setsockopt(struct sonode *so, int level, int option_name, if (level == SOL_FILTER) return (sof_setsockopt(so, option_name, optval, optlen, cr)); - SO_BLOCK_FALLBACK(so, + SO_BLOCK_FALLBACK_SAFE(so, SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); /* X/Open requires this check */ @@ -953,6 +948,13 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, if (!list_is_empty(&so->so_acceptq_list)) *reventsp |= (POLLIN|POLLRDNORM) & events; + /* + * If we're looking for POLLRDHUP, indicate it if we have sent the + * last rx signal for the socket. 
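+ * (POLLRDHUP lets a poller distinguish a peer's write-side shutdown + * from ordinary readability: a hypothetical caller passes POLLRDHUP in + * 'events' and treats its presence in 'revents' as a half-close.)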
+ */ + if ((events & POLLRDHUP) && (state & SS_SENTLASTREADSIG)) + *reventsp |= POLLRDHUP; + /* Data */ /* so_downcalls is null for sctp */ if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) { @@ -988,14 +990,20 @@ so_poll(struct sonode *so, short events, int anyyet, short *reventsp, *reventsp |= POLLHUP; } - if (!*reventsp && !anyyet) { + if ((!*reventsp && !anyyet) || (events & POLLET)) { /* Check for read events again, but this time under lock */ if (events & (POLLIN|POLLRDNORM)) { mutex_enter(&so->so_lock); if (SO_HAVE_DATA(so) || !list_is_empty(&so->so_acceptq_list)) { + if (events & POLLET) { + so->so_pollev |= SO_POLLEV_IN; + *phpp = &so->so_poll_list; + } + mutex_exit(&so->so_lock); *reventsp |= (POLLIN|POLLRDNORM) & events; + return (0); } else { so->so_pollev |= SO_POLLEV_IN; @@ -1316,6 +1324,26 @@ so_queue_msg_impl(struct sonode *so, mblk_t *mp, } } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + boolean_t cont; + so_krecv_f func = so->so_krecv_cb; + void *arg = so->so_krecv_arg; + + mutex_exit(&so->so_lock); + cont = func(so, mp, msg_size, flags & MSG_OOB, arg); + mutex_enter(&so->so_lock); + if (cont == B_TRUE) { + space_left = so->so_rcvbuf; + } else { + so->so_rcv_queued = so->so_rcvlowat; + *errorp = ENOSPC; + space_left = -1; + } + goto done_unlock; + } + mutex_exit(&so->so_lock); + if (flags & MSG_OOB) { so_queue_oob(so, mp, msg_size); mutex_enter(&so->so_lock); @@ -1594,6 +1622,13 @@ so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, return (ENOTCONN); } + mutex_enter(&so->so_lock); + if (so->so_krecv_cb != NULL) { + mutex_exit(&so->so_lock); + return (EOPNOTSUPP); + } + mutex_exit(&so->so_lock); + if (msg->msg_flags & MSG_PEEK) msg->msg_flags &= ~MSG_WAITALL; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index 957c8f93b4..9604ea5dba 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -24,6 +24,7 @@ */ /* * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -2276,9 +2277,9 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) fbfunc = sp->sp_smod_info->smod_proto_fallback_func; /* - * Cannot fallback if the socket has active filters + * Cannot fallback if the socket has active filters or a krecv callback. */ - if (so->so_filter_active > 0) + if (so->so_filter_active > 0 || so->so_krecv_cb != NULL) return (EINVAL); switch (so->so_family) { @@ -2456,3 +2457,50 @@ out: return (error); } + +int +so_krecv_set(sonode_t *so, so_krecv_f cb, void *arg) +{ + int ret; + + if (cb == NULL && arg != NULL) + return (EINVAL); + + SO_BLOCK_FALLBACK(so, so_krecv_set(so, cb, arg)); + + mutex_enter(&so->so_lock); + if (so->so_state & SS_FALLBACK_COMP) { + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + return (ENOTSUP); + } + + ret = so_lock_read(so, 0); + VERIFY(ret == 0); + /* + * Other consumers may actually care about getting extant data delivered + * to them, when they come along, they should figure out the best API + * for that. 
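+ * + * A hypothetical consumer would register with + * + * (void) so_krecv_set(so, my_rx, my_arg); + * + * where my_rx() consumes each mblk and returns B_TRUE to keep data + * flowing, or B_FALSE to assert flow control until so_krecv_unblock() + * is called.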
+ */ + so_rcv_flush(so); + + so->so_krecv_cb = cb; + so->so_krecv_arg = arg; + + so_unlock_read(so); + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + + return (0); +} + +void +so_krecv_unblock(sonode_t *so) +{ + mutex_enter(&so->so_lock); + VERIFY(so->so_krecv_cb != NULL); + + so->so_rcv_queued = 0; + (void) so_check_flow_control(so); + mutex_exit(&so->so_lock); +} diff --git a/usr/src/uts/common/fs/sockfs/sockfilter.c b/usr/src/uts/common/fs/sockfs/sockfilter.c index 971523945e..7dca6ae6fc 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter.c +++ b/usr/src/uts/common/fs/sockfs/sockfilter.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/systm.h> @@ -246,6 +247,18 @@ sof_setsockopt_impl(struct sonode *so, int option_name, /* Module loaded OK, so there must be an ops vector */ ASSERT(ent->sofe_mod != NULL); + + /* + * Check again to confirm the attach is ok: fail if the module + * is not SOF_ATT_SAFE and an unsafe operation has already taken + * place. + */ + if ((ent->sofe_mod->sofm_flags & SOF_ATT_SAFE) == 0 && + so->so_state & SS_FILOP_UNSF) { + sof_instance_destroy(inst); + return (EINVAL); + } + inst->sofi_ops = &ent->sofe_mod->sofm_ops; SOF_STAT_ADD(inst, tot_active_attach, 1); @@ -1444,7 +1457,13 @@ sof_filter_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, * sof_register(version, name, ops, flags) * * Register a socket filter identified by name `name' and which should use - * the ops vector `ops' for event notification. `flags' should be set to 0. + * the ops vector `ops' for event notification. `flags' should be set to 0 + * by default for "unsafe" modules or SOF_ATT_SAFE for "safe" modules. An + * unsafe filter is one that cannot be attached after any socket operation has + * occurred. This is the legacy default. A "safe" filter can be attached even + * after some basic initial socket operations have taken place. This set is + * currently bind, getsockname, getsockopt and setsockopt. The order in which + * a "safe" filter can be attached is more relaxed, and thus more flexible. * On success 0 is returned, otherwise an errno is returned. */ int @@ -1452,14 +1471,13 @@ sof_register(int version, const char *name, const sof_ops_t *ops, int flags) { sof_module_t *mod; - _NOTE(ARGUNUSED(flags)); - if (version != SOF_VERSION) return (EINVAL); mod = kmem_zalloc(sizeof (sof_module_t), KM_SLEEP); mod->sofm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); (void) strcpy(mod->sofm_name, name); + mod->sofm_flags = flags; mod->sofm_ops = *ops; mutex_enter(&sof_module_lock); diff --git a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h index 7f7aece1f1..cf2ad8b20d 100644 --- a/usr/src/uts/common/fs/sockfs/sockfilter_impl.h +++ b/usr/src/uts/common/fs/sockfs/sockfilter_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.
*/ #ifndef _SOCKFS_SOCKFILTER_H @@ -51,6 +52,7 @@ typedef struct sof_kstat sof_kstat_t; struct sof_module { char *sofm_name; + int sofm_flags; sof_ops_t sofm_ops; uint_t sofm_refcnt; list_node_t sofm_node; diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c index 3d5ba2a7e8..3f858afecc 100644 --- a/usr/src/uts/common/fs/sockfs/socknotify.c +++ b/usr/src/uts/common/fs/sockfs/socknotify.c @@ -377,7 +377,7 @@ i_so_notify_last_rx(struct sonode *so, int *pollev, int *sigev) so->so_state |= SS_SENTLASTREADSIG; so->so_pollev &= ~SO_POLLEV_IN; - *pollev |= POLLIN|POLLRDNORM; + *pollev |= POLLIN|POLLRDNORM|POLLRDHUP; *sigev |= SOCKETSIG_READ; return (1); diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 06d76044e5..095d84fc42 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -1879,7 +1880,7 @@ ssize_t soreadfile(file_t *fp, uchar_t *buf, u_offset_t fileoff, int *err, size_t size) { struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec aiov[1]; register vnode_t *vp; int ioflag, rwflag; ssize_t cnt; diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c index a86cda937c..067eaedd7c 100644 --- a/usr/src/uts/common/fs/sockfs/socksyscalls.c +++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c @@ -21,10 +21,10 @@ /* * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ -/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ - #include <sys/types.h> #include <sys/t_lock.h> #include <sys/param.h> @@ -51,6 +51,7 @@ #include <sys/cmn_err.h> #include <sys/vmsystm.h> #include <sys/policy.h> +#include <sys/limits.h> #include <sys/socket.h> #include <sys/socketvar.h> @@ -83,12 +84,6 @@ extern void nl7c_init(void); extern int sockfs_defer_nl7c_init; /* - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - * as there isn't a formal definition of IOV_MAX ??? - */ -#define MSG_MAXIOVLEN 16 - -/* * Kernel component of socket creation. * * The socket library determines which version number to use. @@ -1023,9 +1018,10 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) STRUCT_HANDLE(nmsghdr, umsgptr); struct nmsghdr lmsg; struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; int *flagsp; model_t model; @@ -1068,22 +1064,37 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
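 * The aiov handling above establishes the pattern used throughout this
 * change: iovec arrays of up to IOV_MAX_STACK entries live on the stack,
 * larger counts are kmem_alloc()ed, and every return path below must
 * free both the 32-bit and the expanded arrays when they were
 * heap-allocated.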
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; - if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + + if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1091,15 +1102,28 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1107,6 +1131,9 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1121,12 +1148,20 @@ recvmsg(int sock, struct nmsghdr *msg, int flags) (do_useracc == 0 || useracc(lmsg.msg_control, lmsg.msg_controllen, B_WRITE) != 0)) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } - return (recvit(sock, &lmsg, &auio, flags, + rval = recvit(sock, &lmsg, &auio, flags, STRUCT_FADDR(umsgptr, msg_namelen), - STRUCT_FADDR(umsgptr, msg_controllen), flagsp)); + STRUCT_FADDR(umsgptr, msg_controllen), flagsp); + + if (iovsize != 0) + kmem_free(aiov, iovsize); + + return (rval); } /* @@ -1264,9 +1299,10 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) struct nmsghdr lmsg; STRUCT_DECL(nmsghdr, u_lmsg); struct uio auio; - struct iovec aiov[MSG_MAXIOVLEN]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + ssize_t iovsize = 0; int iovcnt; - ssize_t len; + ssize_t len, rval; int i; model_t model; @@ -1309,7 +1345,7 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovcnt = lmsg.msg_iovlen; - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) { + if (iovcnt <= 0 || iovcnt > IOV_MAX) { /* * Unless this is XPG 4.2 we allow iovcnt == 0 to * be compatible with SunOS 4.X and 4.4BSD. @@ -1318,19 +1354,34 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) return (set_errno(EMSGSIZE)); } + if (iovcnt > IOV_MAX_STACK) { + iovsize = iovcnt * sizeof (struct iovec); + aiov = kmem_alloc(iovsize, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, while ensuring * that they can't move more than 2Gbytes of data in a single call. 
*/ if (model == DATAMODEL_ILP32) { - struct iovec32 aiov32[MSG_MAXIOVLEN]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + ssize_t iov32size; ssize32_t count32; + iov32size = iovcnt * sizeof (struct iovec32); + if (iovsize != 0) + aiov32 = kmem_alloc(iov32size, KM_SLEEP); + if (iovcnt != 0 && - copyin((struct iovec32 *)lmsg.msg_iov, aiov32, - iovcnt * sizeof (struct iovec32))) + copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { @@ -1338,17 +1389,30 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (iovsize != 0) { + kmem_free(aiov32, iov32size); + kmem_free(aiov, iovsize); + } + return (set_errno(EINVAL)); + } + aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (iovsize != 0) + kmem_free(aiov32, iov32size); } else #endif /* _SYSCALL32_IMPL */ if (iovcnt != 0 && copyin(lmsg.msg_iov, aiov, (unsigned)iovcnt * sizeof (struct iovec))) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EFAULT)); } len = 0; @@ -1356,6 +1420,9 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) ssize_t iovlen = aiov[i].iov_len; len += iovlen; if (iovlen < 0 || len < 0) { + if (iovsize != 0) + kmem_free(aiov, iovsize); + return (set_errno(EINVAL)); } } @@ -1366,7 +1433,12 @@ sendmsg(int sock, struct nmsghdr *msg, int flags) auio.uio_segflg = UIO_USERSPACE; auio.uio_limit = 0; - return (sendit(sock, &lmsg, &auio, flags)); + rval = sendit(sock, &lmsg, &auio, flags); + + if (iovsize != 0) + kmem_free(aiov, iovsize); + + return (rval); } ssize_t diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index d33f53f7a6..485e73eb02 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -6272,6 +6272,13 @@ sotpi_poll( if (sti->sti_conn_ind_head != NULL) *reventsp |= (POLLIN|POLLRDNORM) & events; + if (so->so_state & SS_CANTRCVMORE) { + *reventsp |= POLLRDHUP & events; + + if (so->so_state & SS_CANTSENDMORE) + *reventsp |= POLLHUP; + } + if (so->so_state & SS_OOBPEND) *reventsp |= POLLRDBAND & events; diff --git a/usr/src/uts/common/fs/swapfs/swap_subr.c b/usr/src/uts/common/fs/swapfs/swap_subr.c index 74c4302da9..a4d983665b 100644 --- a/usr/src/uts/common/fs/swapfs/swap_subr.c +++ b/usr/src/uts/common/fs/swapfs/swap_subr.c @@ -110,9 +110,11 @@ swapfs_recalc(pgcnt_t pgs) * memory that can be used as swap space should do so by * setting swapfs_desfree at boot time, not swapfs_minfree. * However, swapfs_minfree is tunable by install as a - * workaround for bugid 1147463. + * workaround for bugid 1147463. Note swapfs_minfree is set + * to 1/8th of memory, but clamped at the limit of 256 MB. 
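+	 * For example, a system with 1 GB of physical memory gets a
+	 * 128 MB floor, while an 8 GB system would compute 1 GB and is
+	 * therefore clamped to 256 MB.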
*/ - new_swapfs_minfree = MAX(btopr(2 * 1024 * 1024), pgs >> 3); + new_swapfs_minfree = MIN(MAX(btopr(2 * 1024 * 1024), pgs >> 3), + btopr(256 * 1024 * 1024)); } /* diff --git a/usr/src/uts/common/fs/tmpfs/tmp_dir.c b/usr/src/uts/common/fs/tmpfs/tmp_dir.c index f6621c8097..387cc6ae54 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_dir.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_dir.c @@ -516,7 +516,7 @@ tdirdelete( */ namelen = strlen(tpdp->td_name) + 1; - tmp_memfree(tpdp, sizeof (struct tdirent) + namelen); + kmem_free(tpdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; @@ -549,8 +549,8 @@ tdirinit( ASSERT(RW_WRITE_HELD(&parent->tn_rwlock)); ASSERT(dir->tn_type == VDIR); - dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE); - dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE); + dot = kmem_zalloc(sizeof (struct tdirent) + 2, KM_SLEEP); + dotdot = kmem_zalloc(sizeof (struct tdirent) + 3, KM_SLEEP); /* * Initialize the entries @@ -650,7 +650,7 @@ tdirtrunc(struct tmpnode *dir) tmpfs_hash_out(tdp); - tmp_memfree(tdp, sizeof (struct tdirent) + namelen); + kmem_free(tdp, sizeof (struct tdirent) + namelen); dir->tn_size -= (sizeof (struct tdirent) + namelen); dir->tn_dirents--; } @@ -925,7 +925,7 @@ tdiraddentry( */ namelen = strlen(name) + 1; alloc_size = namelen + sizeof (struct tdirent); - tdp = tmp_memalloc(alloc_size, 0); + tdp = kmem_zalloc(alloc_size, KM_NOSLEEP | KM_NORMALPRI); if (tdp == NULL) return (ENOSPC); @@ -1025,7 +1025,7 @@ tdirmaketnode( ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime))) return (EOVERFLOW); type = va->va_type; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tmpnode_init(tm, tp, va, cred); /* setup normal file/dir's extended attribute directory */ diff --git a/usr/src/uts/common/fs/tmpfs/tmp_subr.c b/usr/src/uts/common/fs/tmpfs/tmp_subr.c index 2e59d28d80..e6e2b392fe 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_subr.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_subr.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -40,9 +41,19 @@ #include <sys/policy.h> #include <sys/fs/tmp.h> #include <sys/fs/tmpnode.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> + +#define KILOBYTE 1024 +#define MEGABYTE (1024 * KILOBYTE) +#define GIGABYTE (1024 * MEGABYTE) #define MODESHIFT 3 +#define VALIDMODEBITS 07777 + +extern pgcnt_t swapfs_minfree; + int tmp_taccess(void *vtp, int mode, struct cred *cred) { @@ -71,7 +82,6 @@ tmp_taccess(void *vtp, int mode, struct cred *cred) * a plain file and you have write access to that file. * Function returns 0 if remove access is granted. */ - int tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, struct cred *cr) @@ -89,111 +99,122 @@ tmp_sticky_remove_access(struct tmpnode *dir, struct tmpnode *entry, } /* - * Allocate zeroed memory if tmpfs_maxkmem has not been exceeded - * or the 'musthave' flag is set. 'musthave' allocations should - * always be subordinate to normal allocations so that tmpfs_maxkmem - * can't be exceeded by more than a few KB. Example: when creating - * a new directory, the tmpnode is a normal allocation; if that - * succeeds, the dirents for "." and ".." are 'musthave' allocations. 
- */ -void * -tmp_memalloc(size_t size, int musthave) -{ - static time_t last_warning; - time_t now; - - if (atomic_add_long_nv(&tmp_kmemspace, size) < tmpfs_maxkmem || - musthave) - return (kmem_zalloc(size, KM_SLEEP)); - - atomic_add_long(&tmp_kmemspace, -size); - now = gethrestime_sec(); - if (last_warning != now) { - last_warning = now; - cmn_err(CE_WARN, "tmp_memalloc: tmpfs over memory limit"); - } - return (NULL); -} - -void -tmp_memfree(void *cp, size_t size) -{ - kmem_free(cp, size); - atomic_add_long(&tmp_kmemspace, -size); -} - -/* * Convert a string containing a number (number of bytes) to a pgcnt_t, * containing the corresponding number of pages. On 32-bit kernels, the * maximum value encoded in 'str' is PAGESIZE * ULONG_MAX, while the value * returned in 'maxpg' is at most ULONG_MAX. * - * If the number is followed by a "k" or "K", the value is converted from - * kilobytes to bytes. If it is followed by an "m" or "M" it is converted - * from megabytes to bytes. If it is not followed by a character it is - * assumed to be in bytes. Multiple letter options are allowed, so for instance - * '2mk' is interpreted as 2gb. + * The number may be followed by a magnitude suffix: "k" or "K" for kilobytes; + * "m" or "M" for megabytes; "g" or "G" for gigabytes. This interface allows + * for an arguably esoteric interpretation of multiple suffix characters: + * namely, they cascade. For example, the caller may specify "2mk", which is + * interpreted as 2 gigabytes. It would seem, at this late stage, that the + * horse has left not only the barn but indeed the country, and possibly the + * entire planetary system. Alternatively, the number may be followed by a + * single '%' sign, indicating the size is a percentage of either the zone's + * swap limit or the system's overall swap size. * * Parse and overflow errors are detected and a non-zero number returned on * error. */ - int tmp_convnum(char *str, pgcnt_t *maxpg) { - uint64_t num = 0, oldnum; + u_longlong_t num = 0; #ifdef _LP64 - uint64_t max_bytes = ULONG_MAX; + u_longlong_t max_bytes = ULONG_MAX; #else - uint64_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; + u_longlong_t max_bytes = PAGESIZE * (uint64_t)ULONG_MAX; #endif char *c; - - if (str == NULL) + const struct convchar { + char *cc_char; + uint64_t cc_factor; + } convchars[] = { + { "kK", KILOBYTE }, + { "mM", MEGABYTE }, + { "gG", GIGABYTE }, + { NULL, 0 } + }; + + if (str == NULL) { return (EINVAL); + } c = str; /* - * Convert str to number + * Convert the initial numeric portion of the input string. */ - while ((*c >= '0') && (*c <= '9')) { - oldnum = num; - num = num * 10 + (*c++ - '0'); - if (oldnum > num) /* overflow */ + if (ddi_strtoull(str, &c, 10, &num) != 0) { + return (EINVAL); + } + + /* + * Handle a size in percent. Anything other than a single percent + * modifier is invalid. We use either the zone's swap limit or the + * system's total available swap size as the initial value. Perform the + * intermediate calculation in pages to avoid overflow. 
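+	 * For example, "size=50%" in a zone with a 512 MB swap cap yields
+	 * a 256 MB limit, while without a zone cap it is half of the
+	 * total available swap at the time of the mount.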
+ */ + if (*c == '\%') { + u_longlong_t cap; + + if (*(c + 1) != '\0') + return (EINVAL); + + if (num > 100) return (EINVAL); + + cap = (u_longlong_t)curproc->p_zone->zone_max_swap_ctl; + if (cap == UINT64_MAX) { + /* + * Use the amount of available physical and memory swap + */ + mutex_enter(&anoninfo_lock); + cap = TOTAL_AVAILABLE_SWAP; + mutex_exit(&anoninfo_lock); + } else { + cap = btop(cap); + } + + num = ptob(cap * num / 100); + goto done; } /* - * Terminate on null + * Apply the (potentially cascading) magnitude suffixes until an + * invalid character is found, or the string comes to an end. */ - while (*c != '\0') { - switch (*c++) { + for (; *c != '\0'; c++) { + int i; + + for (i = 0; convchars[i].cc_char != NULL; i++) { + /* + * Check if this character matches this multiplier + * class: + */ + if (strchr(convchars[i].cc_char, *c) != NULL) { + /* + * Check for overflow: + */ + if (num > max_bytes / convchars[i].cc_factor) { + return (EINVAL); + } + + num *= convchars[i].cc_factor; + goto valid_char; + } + } /* - * convert from kilobytes + * This was not a valid multiplier suffix character. */ - case 'k': - case 'K': - if (num > max_bytes / 1024) /* will overflow */ - return (EINVAL); - num *= 1024; - break; + return (EINVAL); - /* - * convert from megabytes - */ - case 'm': - case 'M': - if (num > max_bytes / (1024 * 1024)) /* will overflow */ - return (EINVAL); - num *= 1024 * 1024; - break; - - default: - return (EINVAL); - } +valid_char: + continue; } +done: /* * Since btopr() rounds up to page granularity, this round-up can * cause an overflow only if 'num' is between (max_bytes - PAGESIZE) @@ -204,3 +225,29 @@ tmp_convnum(char *str, pgcnt_t *maxpg) return (EINVAL); return (0); } + +/* + * Parse an octal mode string for use as the permissions set for the root + * of the tmpfs mount. + */ +int +tmp_convmode(char *str, mode_t *mode) +{ + ulong_t num; + char *c; + + if (str == NULL) { + return (EINVAL); + } + + if (ddi_strtoul(str, &c, 8, &num) != 0) { + return (EINVAL); + } + + if ((num & ~VALIDMODEBITS) != 0) { + return (EINVAL); + } + + *mode = VALIDMODEBITS & num; + return (0); +} diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c index f8a36a528f..3c088c442c 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vfsops.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -55,6 +56,15 @@ static int tmpfsfstype; /* + * tmpfs_mountcount is used to prevent module unloads while there is still + * state from a former mount hanging around. With forced umount support, the + * filesystem module must not be allowed to go away before the last + * VFS_FREEVFS() call has been made. Since this is just an atomic counter, + * there's no need for locking. + */ +static uint32_t tmpfs_mountcount; + +/* * tmpfs vfs operations. 
*/ static int tmpfsinit(int, char *); @@ -64,6 +74,7 @@ static int tmp_unmount(struct vfs *, int, struct cred *); static int tmp_root(struct vfs *, struct vnode **); static int tmp_statvfs(struct vfs *, struct statvfs64 *); static int tmp_vget(struct vfs *, struct vnode **, struct fid *); +static void tmp_freevfs(vfs_t *vfsp); /* * Loadable module wrapper @@ -76,7 +87,7 @@ static vfsdef_t vfw = { VFSDEF_VERSION, "tmpfs", tmpfsinit, - VSW_HASPROTO|VSW_STATS|VSW_ZMOUNT, + VSW_HASPROTO|VSW_CANREMOUNT|VSW_STATS|VSW_ZMOUNT, &tmpfs_proto_opttbl }; @@ -90,7 +101,8 @@ static mntopt_t tmpfs_options[] = { /* Option name Cancel Opt Arg Flags Data */ { MNTOPT_XATTR, xattr_cancel, NULL, MO_DEFAULT, NULL}, { MNTOPT_NOXATTR, noxattr_cancel, NULL, NULL, NULL}, - { "size", NULL, "0", MO_HASVALUE, NULL} + { "size", NULL, "0", MO_HASVALUE, NULL}, + { "mode", NULL, NULL, MO_HASVALUE, NULL} }; @@ -121,6 +133,14 @@ _fini() { int error; + /* + * If a forcibly unmounted instance is still hanging around, we cannot + * allow the module to be unloaded because that would cause panics once + * the VFS framework decides it's time to call into VFS_FREEVFS(). + */ + if (tmpfs_mountcount) + return (EBUSY); + error = mod_remove(&modlinkage); if (error) return (error); @@ -139,14 +159,6 @@ _info(struct modinfo *modinfop) } /* - * The following are patchable variables limiting the amount of system - * resources tmpfs can use. - * - * tmpfs_maxkmem limits the amount of kernel kmem_alloc memory - * tmpfs can use for it's data structures (e.g. tmpnodes, directory entries) - * It is not determined by setting a hard limit but rather as a percentage of - * physical memory which is determined when tmpfs is first used in the system. - * * tmpfs_minfree is the minimum amount of swap space that tmpfs leaves for * the rest of the system. In other words, if the amount of free swap space * in the system (i.e. anoninfo.ani_free) drops below tmpfs_minfree, tmpfs @@ -155,9 +167,7 @@ _info(struct modinfo *modinfop) * There is also a per mount limit on the amount of swap space * (tmount.tm_anonmax) settable via a mount option.
*/ -size_t tmpfs_maxkmem = 0; size_t tmpfs_minfree = 0; -size_t tmp_kmemspace; /* bytes of kernel heap used by all tmpfs */ static major_t tmpfs_major; static minor_t tmpfs_minor; @@ -176,6 +186,7 @@ tmpfsinit(int fstype, char *name) VFSNAME_ROOT, { .vfs_root = tmp_root }, VFSNAME_STATVFS, { .vfs_statvfs = tmp_statvfs }, VFSNAME_VGET, { .vfs_vget = tmp_vget }, + VFSNAME_FREEVFS, { .vfs_freevfs = tmp_freevfs }, NULL, NULL }; int error; @@ -210,27 +221,17 @@ tmpfsinit(int fstype, char *name) tmpfs_minfree = btopr(TMPMINFREE); } - /* - * The maximum amount of space tmpfs can allocate is - * TMPMAXPROCKMEM percent of kernel memory - */ - if (tmpfs_maxkmem == 0) - tmpfs_maxkmem = MAX(PAGESIZE, kmem_maxavail() / TMPMAXFRACKMEM); - if ((tmpfs_major = getudev()) == (major_t)-1) { cmn_err(CE_WARN, "tmpfsinit: Can't get unique device number."); tmpfs_major = 0; } mutex_init(&tmpfs_minor_lock, NULL, MUTEX_DEFAULT, NULL); + tmpfs_mountcount = 0; return (0); } static int -tmp_mount( - struct vfs *vfsp, - struct vnode *mvp, - struct mounta *uap, - struct cred *cr) +tmp_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) { struct tmount *tm = NULL; struct tmpnode *tp; @@ -239,8 +240,9 @@ tmp_mount( pgcnt_t anonmax; struct vattr rattr; int got_attrs; - - char *sizestr; + boolean_t mode_arg = B_FALSE; + mode_t root_mode = 0777; + char *argstr; if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) return (error); @@ -249,7 +251,7 @@ tmp_mount( return (ENOTDIR); mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_OVERLAY) == 0 && + if ((uap->flags & MS_REMOUNT) == 0 && (uap->flags & MS_OVERLAY) == 0 && (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { mutex_exit(&mvp->v_lock); return (EBUSY); } @@ -275,18 +277,45 @@ tmp_mount( * tm_anonmax is set according to the mount arguments * if any. Otherwise, it is set to a maximum value. */ - if (vfs_optionisset(vfsp, "size", &sizestr)) { - if ((error = tmp_convnum(sizestr, &anonmax)) != 0) + if (vfs_optionisset(vfsp, "size", &argstr)) { + if ((error = tmp_convnum(argstr, &anonmax)) != 0) goto out; } else { anonmax = ULONG_MAX; } + /* + * The "mode" mount argument allows the operator to override the + * permissions of the root of the tmpfs mount. + */ + if (vfs_optionisset(vfsp, "mode", &argstr)) { + if ((error = tmp_convmode(argstr, &root_mode)) != 0) { + goto out; + } + mode_arg = B_TRUE; + } + if (error = pn_get(uap->dir, (uap->flags & MS_SYSSPACE) ? UIO_SYSSPACE : UIO_USERSPACE, &dpn)) goto out; - if ((tm = tmp_memalloc(sizeof (struct tmount), 0)) == NULL) { + if (uap->flags & MS_REMOUNT) { + tm = (struct tmount *)VFSTOTM(vfsp); + + /* + * If we change the size so it's less than what is currently + * being used, we allow that. The file system will simply be + * full until enough files have been removed to get below the + * new max.
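+	 *
+	 * With this in place an operator can grow or shrink a live
+	 * tmpfs with something like the following (illustrative
+	 * syntax):
+	 *
+	 *	mount -F tmpfs -o remount,size=512m swap /tmp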
+ */ + mutex_enter(&tm->tm_contents); + tm->tm_anonmax = anonmax; + mutex_exit(&tm->tm_contents); + goto out; + } + + if ((tm = kmem_zalloc(sizeof (struct tmount), + KM_NOSLEEP | KM_NORMALPRI)) == NULL) { pn_free(&dpn); error = ENOMEM; goto out; @@ -318,17 +347,17 @@ tmp_mount( vfsp->vfs_bsize = PAGESIZE; vfsp->vfs_flag |= VFS_NOTRUNC; vfs_make_fsid(&vfsp->vfs_fsid, tm->tm_dev, tmpfsfstype); - tm->tm_mntpath = tmp_memalloc(dpn.pn_pathlen + 1, TMP_MUSTHAVE); + tm->tm_mntpath = kmem_zalloc(dpn.pn_pathlen + 1, KM_SLEEP); (void) strcpy(tm->tm_mntpath, dpn.pn_path); /* * allocate and initialize root tmpnode structure */ bzero(&rattr, sizeof (struct vattr)); - rattr.va_mode = (mode_t)(S_IFDIR | 0777); /* XXX modes */ + rattr.va_mode = (mode_t)(S_IFDIR | root_mode); rattr.va_type = VDIR; rattr.va_rdev = 0; - tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE); + tp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tmpnode_init(tm, tp, &rattr, cr); /* @@ -345,7 +374,14 @@ tmp_mount( * the previously set hardwired defaults to prevail. */ if (got_attrs == 0) { - tp->tn_mode = rattr.va_mode; + if (!mode_arg) { + /* + * Only use the underlying mount point for the + * mode if the "mode" mount argument was not + * provided. + */ + tp->tn_mode = rattr.va_mode; + } tp->tn_uid = rattr.va_uid; tp->tn_gid = rattr.va_gid; } @@ -366,6 +402,7 @@ tmp_mount( pn_free(&dpn); error = 0; + atomic_inc_32(&tmpfs_mountcount); out: if (error == 0) @@ -381,36 +418,107 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) struct tmpnode *tnp, *cancel; struct vnode *vp; int error; + uint_t cnt; + int i; if ((error = secpolicy_fs_unmount(cr, vfsp)) != 0) return (error); - /* - * forced unmount is not supported by this file system - * and thus, ENOTSUP, is being returned. - */ - if (flag & MS_FORCE) - return (ENOTSUP); - mutex_enter(&tm->tm_contents); /* - * If there are no open files, only the root node should have - * a reference count. + * In the normal unmount case (non-forced unmount), if there are no + * open files, only the root node should have a reference count. + * * With tm_contents held, nothing can be added or removed. * There may be some dirty pages. To prevent fsflush from * disrupting the unmount, put a hold on each node while scanning. * If we find a previously referenced node, undo the holds we have * placed and fail EBUSY. + * + * However, in the case of a forced umount, things are a bit different. + * An additional VFS_HOLD is added for each outstanding VN_HOLD to + * ensure that the file system is not cleaned up (tmp_freevfs) until + * the last vfs hold is dropped. This happens in tmp_inactive as the + * vnodes are released. Also, we can't add an additional VN_HOLD in + * this case since that would prevent tmp_inactive from ever being + * called. Finally, we do need to drop the zone ref now (zone_rele_ref) + * so that the zone is not blocked waiting for the final file system + * cleanup. */ tnp = tm->tm_rootnode; - if (TNTOV(tnp)->v_count > 1) { + + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + vfsp->vfs_flag |= VFS_UNMOUNTED; + /* Extra hold which we rele below when we drop the zone ref */ + VFS_HOLD(vfsp); + + for (i = 1; i < cnt; i++) + VFS_HOLD(vfsp); + + /* drop the mutex now because no one can find this mount */ + mutex_exit(&tm->tm_contents); + } else if (cnt > 1) { + mutex_exit(&vp->v_lock); mutex_exit(&tm->tm_contents); return (EBUSY); } + mutex_exit(&vp->v_lock); + /* + * Check for open files. 
An open file causes everything to unwind + * unless this is a forced umount. + */ for (tnp = tnp->tn_forw; tnp; tnp = tnp->tn_forw) { - if ((vp = TNTOV(tnp))->v_count > 0) { + vp = TNTOV(tnp); + mutex_enter(&vp->v_lock); + cnt = vp->v_count; + if (flag & MS_FORCE) { + for (i = 0; i < cnt; i++) + VFS_HOLD(vfsp); + + /* + * In the case of a forced umount don't add an + * additional VN_HOLD on the already held vnodes, like + * we do in the non-forced unmount case. If the + * cnt > 0, then the vnode already has at least one + * hold and we need tmp_inactive to get called when the + * last pre-existing hold on the node is released so + * that we can VFS_RELE the VFS holds we just added. + */ + if (cnt == 0) { + /* directly add VN_HOLD since we have the lock */ + vp->v_count++; + } + + mutex_exit(&vp->v_lock); + + /* + * If the tmpnode has any pages associated with it + * (i.e. if it's a normal file with non-zero size), the + * tmpnode could still be discovered by pageout or + * fsflush via the page vnode pointers. To prevent this + * from interfering with the tmp_freevfs, truncate the + * tmpnode now. + */ + if (tnp->tn_size != 0 && tnp->tn_type == VREG) { + rw_enter(&tnp->tn_rwlock, RW_WRITER); + rw_enter(&tnp->tn_contents, RW_WRITER); + + (void) tmpnode_trunc(tm, tnp, 0); + + rw_exit(&tnp->tn_contents); + rw_exit(&tnp->tn_rwlock); + + ASSERT(tnp->tn_size == 0); + ASSERT(tnp->tn_nblocks == 0); + } + } else if (cnt > 0) { + /* An open file; unwind the holds we've been adding. */ + mutex_exit(&vp->v_lock); cancel = tm->tm_rootnode->tn_forw; while (cancel != tnp) { vp = TNTOV(cancel); @@ -420,14 +528,50 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) } mutex_exit(&tm->tm_contents); return (EBUSY); + } else { + /* directly add a VN_HOLD since we have the lock */ + vp->v_count++; + mutex_exit(&vp->v_lock); } - VN_HOLD(vp); } - /* - * We can drop the mutex now because no one can find this mount - */ - mutex_exit(&tm->tm_contents); + if (flag & MS_FORCE) { + /* + * Drop the zone ref now since we don't know how long it will + * be until the final vfs_rele is called by tmp_inactive. + */ + if (vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, + ZONE_REF_VFS); + vfsp->vfs_zone = 0; + } + /* We can now drop the extra hold we added above. */ + VFS_RELE(vfsp); + } else { + /* + * For the non-forced case, we can drop the mutex now because + * no one can find this mount anymore + */ + vfsp->vfs_flag |= VFS_UNMOUNTED; + mutex_exit(&tm->tm_contents); + } + + return (0); +} + +/* + * Implementation of VFS_FREEVFS() to support forced umounts. This is called by + * the vfs framework after umount and the last VFS_RELE, to trigger the release + * of any resources still associated with the given vfs_t. We only add + * additional VFS_HOLDs during the forced umount case, so this is normally + * called immediately after tmp_unmount. + */ +void +tmp_freevfs(vfs_t *vfsp) +{ + struct tmount *tm = (struct tmount *)VFSTOTM(vfsp); + struct tmpnode *tnp; + struct vnode *vp; /* * Free all kmemalloc'd and anonalloc'd memory associated with @@ -437,6 +581,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) * tmpnode_free which assumes that the directory entry has been * removed before the file. */ + + /* + * Now that we are tearing ourselves down we need to remove the + * UNMOUNTED flag. If we don't, we'll later hit a VN_RELE when we remove + * files from the system causing us to have a negative value.
Doing this + * seems a bit better than trying to set a flag on the tmount that says + * we're tearing down. + */ + vfsp->vfs_flag &= ~VFS_UNMOUNTED; + /* * Remove all directory entries */ @@ -503,15 +657,16 @@ tmp_unmount(struct vfs *vfsp, int flag, struct cred *cr) ASSERT(tm->tm_mntpath); - tmp_memfree(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); + kmem_free(tm->tm_mntpath, strlen(tm->tm_mntpath) + 1); ASSERT(tm->tm_anonmem == 0); mutex_destroy(&tm->tm_contents); mutex_destroy(&tm->tm_renamelck); - tmp_memfree(tm, sizeof (struct tmount)); + kmem_free(tm, sizeof (struct tmount)); - return (0); + /* Allow _fini() to succeed now */ + atomic_dec_32(&tmpfs_mountcount); } /* @@ -614,13 +769,7 @@ tmp_statvfs(struct vfs *vfsp, struct statvfs64 *sbp) * available to tmpfs. This is fairly inaccurate since it doesn't * take into account the names stored in the directory entries. */ - if (tmpfs_maxkmem > tmp_kmemspace) - sbp->f_ffree = (tmpfs_maxkmem - tmp_kmemspace) / - (sizeof (struct tmpnode) + sizeof (struct tdirent)); - else - sbp->f_ffree = 0; - - sbp->f_files = tmpfs_maxkmem / + sbp->f_ffree = sbp->f_files = ptob(availrmem) / (sizeof (struct tmpnode) + sizeof (struct tdirent)); sbp->f_favail = (fsfilcnt64_t)(sbp->f_ffree); (void) cmpldev(&d32, vfsp->vfs_dev); diff --git a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c index 61607a6dcc..464d638db0 100644 --- a/usr/src/uts/common/fs/tmpfs/tmp_vnops.c +++ b/usr/src/uts/common/fs/tmpfs/tmp_vnops.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -584,6 +584,10 @@ tmp_read(struct vnode *vp, struct uio *uiop, int ioflag, cred_t *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support reading non-regular files */ @@ -613,6 +617,10 @@ tmp_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cred, struct tmount *tm = (struct tmount *)VTOTM(vp); int error; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + /* * We don't currently support writing to non-regular files */ @@ -833,6 +841,9 @@ tmp_lookup( struct tmpnode *ntp = NULL; int error; + /* If the filesystem was umounted by force, return immediately. */ + if (dvp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); /* allow cd into @ dir */ if (flags & LOOKUP_XATTR) { @@ -871,8 +882,7 @@ tmp_lookup( return (error); } - xdp = tmp_memalloc(sizeof (struct tmpnode), - TMP_MUSTHAVE); + xdp = kmem_zalloc(sizeof (struct tmpnode), KM_SLEEP); tm = VTOTM(dvp); tmpnode_init(tm, xdp, &tp->tn_attr, NULL); /* @@ -1186,7 +1196,7 @@ tmp_rename( struct tmpnode *toparent; struct tmpnode *fromtp = NULL; /* source tmpnode */ struct tmount *tm = (struct tmount *)VTOTM(odvp); - int error; + int error, err; int samedir = 0; /* set if odvp == ndvp */ struct vnode *realvp; @@ -1262,15 +1272,6 @@ tmp_rename( error = 0; goto done; } - vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); - - /* - * Notify the target directory if not same as - * source directory. - */ - if (ndvp != odvp) { - vnevent_rename_dest_dir(ndvp, ct); - } /* * Unlink from source. 
@@ -1278,7 +1279,7 @@ rw_enter(&fromparent->tn_rwlock, RW_WRITER); rw_enter(&fromtp->tn_rwlock, RW_WRITER); - error = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred); + error = err = tdirdelete(fromparent, fromtp, onm, DR_RENAME, cred); /* * The following handles the case where our source tmpnode was @@ -1293,6 +1294,12 @@ tmp_rename( rw_exit(&fromtp->tn_rwlock); rw_exit(&fromparent->tn_rwlock); + + if (err == 0) { + vnevent_rename_src(TNTOV(fromtp), odvp, onm, ct); + vnevent_rename_dest_dir(ndvp, TNTOV(fromtp), nnm, ct); + } + done: tmpnode_rele(fromtp); mutex_exit(&tm->tm_renamelck); @@ -1459,6 +1466,10 @@ tmp_readdir( int reclen; caddr_t outbuf; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (uiop->uio_loffset >= MAXOFF_T) { if (eofp) *eofp = 1; @@ -1597,7 +1608,7 @@ tmp_symlink( return (error); } len = strlen(tnm) + 1; - cp = tmp_memalloc(len, 0); + cp = kmem_alloc(len, KM_NOSLEEP | KM_NORMALPRI); if (cp == NULL) { tmpnode_rele(self); return (ENOSPC); } @@ -1662,10 +1673,27 @@ top: * there's little to do -- just drop our hold. */ if (vp->v_count > 1 || tp->tn_nlink != 0) { - vp->v_count--; + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) { + /* + * Since the file system was forcibly unmounted, we can + * have a case (v_count == 1, tn_nlink != 0) where this + * file was open so we didn't add an extra hold on the + * file in tmp_unmount. We are counting on the + * interaction of the hold made in tmp_unmount and + * rele-ed in tmp_freevfs so we need to be sure we + * don't decrement in this case. + */ + if (vp->v_count > 1) + vp->v_count--; + } else { + vp->v_count--; + } mutex_exit(&vp->v_lock); mutex_exit(&tp->tn_tlock); rw_exit(&tp->tn_rwlock); + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); return; } @@ -1690,7 +1718,7 @@ top: goto top; } if (tp->tn_type == VLNK) - tmp_memfree(tp->tn_symlink, tp->tn_size + 1); + kmem_free(tp->tn_symlink, tp->tn_size + 1); } /* @@ -1724,7 +1752,11 @@ top: rw_destroy(&tp->tn_rwlock); mutex_destroy(&tp->tn_tlock); vn_free(TNTOV(tp)); - tmp_memfree(tp, sizeof (struct tmpnode)); + kmem_free(tp, sizeof (struct tmpnode)); + + /* If the filesystem was umounted by force, rele the vfs ref */ + if (tm->tm_vfsp->vfs_flag & VFS_UNMOUNTED) + VFS_RELE(tm->tm_vfsp); } /* ARGSUSED2 */ @@ -1846,6 +1878,10 @@ tmp_getapage( struct vnode *pvp; u_offset_t poff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + if (protp != NULL) *protp = PROT_ALL; again: @@ -2067,6 +2103,10 @@ tmp_putapage( u_offset_t offset; u_offset_t tmpoff; + /* If the filesystem was umounted by force, return immediately. */ + if (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED) + return (EIO); + ASSERT(PAGE_LOCKED(pp)); /* Kluster in tmp_klustsize chunks */ diff --git a/usr/src/uts/common/fs/udfs/udf_dir.c b/usr/src/uts/common/fs/udfs/udf_dir.c index c1e2c74a87..def046a0bf 100644 --- a/usr/src/uts/common/fs/udfs/udf_dir.c +++ b/usr/src/uts/common/fs/udfs/udf_dir.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/ #include <sys/types.h> @@ -562,9 +563,8 @@ out: namep, ctp); } - if (sdp != tdp) { - vnevent_rename_dest_dir(ITOV(tdp), ctp); - } + vnevent_rename_dest_dir(ITOV(tdp), ITOV(tip), + namep, ctp); } /* diff --git a/usr/src/uts/common/fs/udfs/udf_vnops.c b/usr/src/uts/common/fs/udfs/udf_vnops.c index 307d3987ed..2d8de23399 100644 --- a/usr/src/uts/common/fs/udfs/udf_vnops.c +++ b/usr/src/uts/common/fs/udfs/udf_vnops.c @@ -911,7 +911,7 @@ udf_rename( caller_context_t *ct, int flags) { - int32_t error = 0; + int32_t error = 0, err; struct udf_vfs *udf_vfsp; struct ud_inode *sip; /* source inode */ struct ud_inode *sdp, *tdp; /* source and target parent inode */ @@ -995,7 +995,6 @@ udf_rename( rw_exit(&tdp->i_rwlock); goto errout; } - vnevent_rename_src(ITOV(sip), sdvp, snm, ct); rw_exit(&tdp->i_rwlock); rw_enter(&sdp->i_rwlock, RW_WRITER); @@ -1006,11 +1005,15 @@ udf_rename( * If the entry has changed just forget about it. Release * the source inode. */ - if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0, + if ((error = err = ud_dirremove(sdp, snm, sip, (struct vnode *)0, DR_RENAME, cr, ct)) == ENOENT) { error = 0; } rw_exit(&sdp->i_rwlock); + + if (err == 0) + vnevent_rename_src(ITOV(sip), sdvp, snm, ct); + errout: ITIMES(sdp); ITIMES(tdp); diff --git a/usr/src/uts/common/fs/ufs/ufs_vnops.c b/usr/src/uts/common/fs/ufs/ufs_vnops.c index fcffd952ed..c77872b11d 100644 --- a/usr/src/uts/common/fs/ufs/ufs_vnops.c +++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1984, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -3367,11 +3367,10 @@ ufs_rename( struct inode *ip = NULL; /* check inode */ struct inode *sdp; /* old (source) parent inode */ struct inode *tdp; /* new (target) parent inode */ - struct vnode *svp = NULL; /* source vnode */ struct vnode *tvp = NULL; /* target vnode, if it exists */ struct vnode *realvp; struct ufsvfs *ufsvfsp; - struct ulockfs *ulp; + struct ulockfs *ulp = NULL; struct ufs_slot slot; timestruc_t now; int error; @@ -3380,7 +3379,7 @@ ufs_rename( krwlock_t *first_lock; krwlock_t *second_lock; krwlock_t *reverse_lock; - int serr, terr; + int terr; sdp = VTOI(sdvp); slot.fbp = NULL; @@ -3389,34 +3388,13 @@ ufs_rename( if (VOP_REALVP(tdvp, &realvp, ct) == 0) tdvp = realvp; + /* Must do this before taking locks in case of DNLC miss */ terr = ufs_eventlookup(tdvp, tnm, cr, &tvp); - serr = ufs_eventlookup(sdvp, snm, cr, &svp); - - if ((serr == 0) && ((terr == 0) || (terr == ENOENT))) { - if (tvp != NULL) - vnevent_rename_dest(tvp, tdvp, tnm, ct); - - /* - * Notify the target directory of the rename event - * if source and target directories are not the same. - */ - if (sdvp != tdvp) - vnevent_rename_dest_dir(tdvp, ct); - - if (svp != NULL) - vnevent_rename_src(svp, sdvp, snm, ct); - } - - if (tvp != NULL) - VN_RELE(tvp); - - if (svp != NULL) - VN_RELE(svp); retry_rename: error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_RENAME_MASK); if (error) - goto out; + goto unlock; if (ulp) TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_RENAME, @@ -3712,6 +3690,9 @@ retry_firstlock: goto errout; } + if (terr == 0 && tvp != NULL) + vnevent_rename_dest(tvp, tdvp, tnm, ct); + /* * Unlink the source. * Remove the source entry. 
ufs_dirremove() checks that the entry @@ -3723,6 +3704,9 @@ retry_firstlock: DR_RENAME, cr)) == ENOENT) error = 0; + vnevent_rename_src(ITOV(sip), sdvp, snm, ct); + vnevent_rename_dest_dir(tdvp, ITOV(sip), tnm, ct); + errout: if (slot.fbp) fbrelse(slot.fbp, S_OTHER); @@ -3732,15 +3716,17 @@ errout: rw_exit(&sdp->i_rwlock); } - VN_RELE(ITOV(sip)); - unlock: + if (tvp != NULL) + VN_RELE(tvp); + if (sip != NULL) + VN_RELE(ITOV(sip)); + if (ulp) { TRANS_END_CSYNC(ufsvfsp, error, issync, TOP_RENAME, trans_size); ufs_lockfs_end(ulp); } -out: return (error); } diff --git a/usr/src/uts/common/fs/vfs.c b/usr/src/uts/common/fs/vfs.c index 6e93d056df..b680b1168c 100644 --- a/usr/src/uts/common/fs/vfs.c +++ b/usr/src/uts/common/fs/vfs.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -235,7 +235,8 @@ fsop_root(vfs_t *vfsp, vnode_t **vpp) * Make sure this root has a path. With lofs, it is possible to have * a NULL mountpoint. */ - if (ret == 0 && vfsp->vfs_mntpt != NULL && (*vpp)->v_path == NULL) { + if (ret == 0 && vfsp->vfs_mntpt != NULL && + (*vpp)->v_path == vn_vpath_empty) { mntpt = vfs_getmntpoint(vfsp); vn_setpath_str(*vpp, refstr_value(mntpt), strlen(refstr_value(mntpt))); @@ -905,6 +906,7 @@ vfs_mountroot(void) vfs_mountfs("mntfs", "/etc/mnttab", "/etc/mnttab"); vfs_mountfs("tmpfs", "/etc/svc/volatile", "/etc/svc/volatile"); vfs_mountfs("objfs", "objfs", OBJFS_ROOT); + vfs_mountfs("bootfs", "bootfs", "/system/boot"); if (getzoneid() == GLOBAL_ZONEID) { vfs_mountfs("sharefs", "sharefs", "/etc/dfs/sharetab"); @@ -3899,6 +3901,8 @@ vfs_to_modname(const char *vfstype) vfstype = "fdfs"; } else if (strncmp(vfstype, "nfs", 3) == 0) { vfstype = "nfs"; + } else if (strcmp(vfstype, "lxproc") == 0) { + vfstype = "lxprocfs"; } return (vfstype); diff --git a/usr/src/uts/common/fs/vnode.c b/usr/src/uts/common/fs/vnode.c index 4abb040de0..9fb1ea702a 100644 --- a/usr/src/uts/common/fs/vnode.c +++ b/usr/src/uts/common/fs/vnode.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -66,6 +66,7 @@ #include <fs/fs_subr.h> #include <sys/taskq.h> #include <fs/fs_reparse.h> +#include <sys/time.h> /* Determine if this vnode is a file that is read-only */ #define ISROFILE(vp) \ @@ -102,6 +103,9 @@ kmutex_t vskstat_tree_lock; /* Global variable which enables/disables the vopstats collection */ int vopstats_enabled = 1; +/* Global used for empty/invalid v_path */ +char *vn_vpath_empty = ""; + /* * forward declarations for internal vnode specific data (vsd) */ @@ -200,6 +204,11 @@ static void (**vsd_destructor)(void *); cr = crgetmapped(cr); \ } +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * Convert stat(2) formats to vnode types and vice versa. (Knows about * numerical order of S_IFMT and vnode types.) 
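/*
 * The vn_vpath_empty changes in the hunks below establish the invariant
 * that v_path is never NULL: an unset path is represented by the shared
 * empty-string sentinel, so consumers compare against it by identity
 * rather than testing for NULL, e.g.:
 *
 *	if (vp->v_path != vn_vpath_empty)
 *		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
 */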
@@ -2284,7 +2293,7 @@ vn_cache_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL); rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL); vp->v_femhead = NULL; /* Must be done before vn_reinit() */ - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; vp->v_mpssdata = NULL; vp->v_vsd = NULL; vp->v_fopdata = NULL; @@ -2331,6 +2340,7 @@ void vn_recycle(vnode_t *vp) { ASSERT(vp->v_pages == NULL); + VERIFY(vp->v_path != NULL); /* * XXX - This really belongs in vn_reinit(), but we have some issues @@ -2353,9 +2363,9 @@ vn_recycle(vnode_t *vp) kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead))); vp->v_femhead = NULL; } - if (vp->v_path) { + if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } if (vp->v_fopdata != NULL) { @@ -2427,9 +2437,10 @@ vn_free(vnode_t *vp) */ ASSERT((vp->v_count == 0) || (vp->v_count == 1)); ASSERT(vp->v_count_dnlc == 0); - if (vp->v_path != NULL) { + VERIFY(vp->v_path != NULL); + if (vp->v_path != vn_vpath_empty) { kmem_free(vp->v_path, strlen(vp->v_path) + 1); - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; } /* If FEM was in use, make sure everything gets cleaned up */ @@ -2516,6 +2527,7 @@ vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct) if (vp == NULL || vp->v_femhead == NULL) { return; } + (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct); (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct); } @@ -2530,12 +2542,13 @@ vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name, } void -vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct) +vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name, + caller_context_t *ct) { if (vp == NULL || vp->v_femhead == NULL) { return; } - (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct); + (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct); } void @@ -2951,7 +2964,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, * the potential for deadlock. 
*/ mutex_enter(&base->v_lock); - if (base->v_path == NULL) { + if (base->v_path == vn_vpath_empty) { mutex_exit(&base->v_lock); return; } @@ -2978,7 +2991,8 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, rpath = kmem_alloc(rpathalloc, KM_SLEEP); mutex_enter(&base->v_lock); - if (base->v_path == NULL || strlen(base->v_path) != rpathlen) { + if (base->v_path == vn_vpath_empty || + strlen(base->v_path) != rpathlen) { mutex_exit(&base->v_lock); kmem_free(rpath, rpathalloc); return; @@ -2992,7 +3006,7 @@ vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp, rpath[rpathlen + plen] = '\0'; mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { mutex_exit(&vp->v_lock); kmem_free(rpath, rpathalloc); } else { @@ -3012,7 +3026,7 @@ vn_setpath_str(struct vnode *vp, const char *str, size_t len) char *buf = kmem_alloc(len + 1, KM_SLEEP); mutex_enter(&vp->v_lock); - if (vp->v_path != NULL) { + if (vp->v_path != vn_vpath_empty) { mutex_exit(&vp->v_lock); kmem_free(buf, len + 1); return; @@ -3036,10 +3050,10 @@ vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len) mutex_enter(&vp->v_lock); tmp = vp->v_path; - vp->v_path = NULL; + vp->v_path = vn_vpath_empty; mutex_exit(&vp->v_lock); vn_setpath(rootdir, dvp, vp, nm, len); - if (tmp != NULL) + if (tmp != vn_vpath_empty) kmem_free(tmp, strlen(tmp) + 1); } @@ -3054,7 +3068,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) int alloc; mutex_enter(&src->v_lock); - if (src->v_path == NULL) { + if (src->v_path == vn_vpath_empty) { mutex_exit(&src->v_lock); return; } @@ -3064,7 +3078,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) mutex_exit(&src->v_lock); buf = kmem_alloc(alloc, KM_SLEEP); mutex_enter(&src->v_lock); - if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) { + if (src->v_path == vn_vpath_empty || strlen(src->v_path) + 1 != alloc) { mutex_exit(&src->v_lock); kmem_free(buf, alloc); return; @@ -3073,7 +3087,7 @@ vn_copypath(struct vnode *src, struct vnode *dst) mutex_exit(&src->v_lock); mutex_enter(&dst->v_lock); - if (dst->v_path != NULL) { + if (dst->v_path != vn_vpath_empty) { mutex_exit(&dst->v_lock); kmem_free(buf, alloc); return; @@ -3231,14 +3245,57 @@ fop_read( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, read, - read_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, read, read_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += len; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + 
atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3250,14 +3307,62 @@ fop_write( cred_t *cr, caller_context_t *ct) { - int err; ssize_t resid_start = uiop->uio_resid; + zone_t *zonep = curzone; + zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats; + + hrtime_t start, lat; + ssize_t len; + int err; + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for VFS write operations. There's no + * actual wait queue for VFS operations. + */ + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + start = gethrtime(); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + } VOPXID_MAP_CR(vp, cr); err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct); - VOPSTATS_UPDATE_IO(vp, write, - write_bytes, (resid_start - uiop->uio_resid)); + len = resid_start - uiop->uio_resid; + + VOPSTATS_UPDATE_IO(vp, write, write_bytes, len); + + if (vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) { + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += len; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + if (lat < VOP_LATENCY_100MS) + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + } + return (err); } @@ -3421,7 +3526,7 @@ fop_lookup( } if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, lookup); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm)); } } @@ -3463,7 +3568,7 @@ fop_create( (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, create); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, name, strlen(name)); } } @@ -3585,7 +3690,7 @@ fop_mkdir( (dvp, dirname, vap, vpp, cr, ct, flags, vsecp); if (ret == 0 && *vpp) { VOPSTATS_UPDATE(*vpp, mkdir); - if ((*vpp)->v_path == NULL) { + if ((*vpp)->v_path == vn_vpath_empty) { vn_setpath(rootdir, dvp, *vpp, dirname, strlen(dirname)); } diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 09d2e1dd8f..d2b584cf02 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -129,6 +129,7 @@ #include <sys/vdev.h> #include <sys/vdev_impl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> #include <sys/multilist.h> #ifdef _KERNEL #include <sys/vmsystm.h> @@ -4277,6 +4278,14 @@ top: rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); + /* + * At this point, this read I/O has 
already missed in the ARC + * and will be going through to the disk. The I/O throttle + * should delay this I/O if this zone is using more than its I/O + * priority allows. + */ + zfs_zone_io_throttle(ZFS_ZONE_IOP_READ); + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 465dfd08b2..4b644f7479 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -617,8 +617,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - if (bonuslen) - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + + if (bonuslen) { + /* + * Absent byzantine on-disk corruption, we fully expect + * our bonuslen to be no more than DN_MAX_BONUSLEN -- + * but we nonetheless explicitly clamp it on the bcopy() + * to prevent any on-disk corruption from becoming + * rampant in-kernel corruption. + */ + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + MIN(bonuslen, DN_MAX_BONUSLEN)); + } + DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 43bcfee91c..bfe5cda5e4 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -1799,7 +1799,6 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) if (!zio_checksum_table[checksum].ci_dedup) dedup_verify = B_TRUE; } - /* * Enable nopwrite if we have a cryptographically secure * checksum that has no known collisions (i.e. SHA-256) diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 56ce2f0c27..c2b1dca1c0 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -38,11 +38,11 @@ #include <sys/sa_impl.h> #include <sys/zfs_context.h> #include <sys/varargs.h> +#include <sys/zfs_zone.h> typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); - dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { @@ -223,6 +223,8 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) if (len == 0) return; + zfs_zone_io_throttle(ZFS_ZONE_IOP_LOGICAL_WRITE); + min_bs = SPA_MINBLOCKSHIFT; max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1; min_ibs = DN_MIN_INDBLKSHIFT; diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 6a20ab3ca2..1222b8933a 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -42,6 +42,7 @@ #include <sys/zio.h> #include <sys/arc.h> #include <sys/sunddi.h> +#include <sys/zfs_zone.h> #include <sys/zfeature.h> #include <sys/policy.h> #include <sys/zfs_znode.h> @@ -1266,7 +1267,7 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, * locks are held. 
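 * (The first delay argument below now comes from zfs_zone_txg_delay()
 * rather than a fixed MSEC2NSEC(10), so the per-zone I/O accounting
 * introduced in this change can choose the value.)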
*/ txg_delay(dd->dd_pool, tx->tx_txg, - MSEC2NSEC(10), MSEC2NSEC(10)); + zfs_zone_txg_delay(), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 8c5d820ede..8ad7c789d3 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -42,6 +42,7 @@ #include <sys/zfs_znode.h> #include <sys/spa_impl.h> #include <sys/dsl_deadlist.h> +#include <sys/zfs_zone.h> #include <sys/bptree.h> #include <sys/zfeature.h> #include <sys/zil_impl.h> diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 9ba9fd3841..02cccae29d 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -63,6 +63,11 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ int zfs_condense_pct = 200; /* + * Never condense any space map. This is for debugging/recovery only. + */ +int zfs_condense_never = 0; + +/* * Condensing a metaslab is not guaranteed to actually reduce the amount of * space used on disk. In particular, a space map uses data in increments of * MAX(1 << ashift, space_map_blksize), so a metaslab might use the @@ -1656,6 +1661,9 @@ metaslab_should_condense(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); + if (zfs_condense_never != 0) + return (B_FALSE); + /* * Use the ms_size_tree range tree, which is ordered by size, to * obtain the largest segment in the free tree. We always condense diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c index 0b5b37f5fb..5a2602271b 100644 --- a/usr/src/uts/common/fs/zfs/sa.c +++ b/usr/src/uts/common/fs/zfs/sa.c @@ -24,6 +24,7 @@ * Portions Copyright 2011 iXsystems, Inc * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ #include <sys/zfs_context.h> @@ -406,15 +407,18 @@ sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; - int i; + int i, size; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; - tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, - KM_SLEEP); - bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + + if ((size = sizeof (sa_attr_type_t) * attr_count) != 0) { + tb->lot_attrs = kmem_alloc(size, KM_SLEEP); + bcopy(attrs, tb->lot_attrs, size); + } + tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 17a18a3199..bde9b0a7ab 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -116,6 +116,7 @@ struct vdev_queue { avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; uint64_t vq_last_offset; + zoneid_t vq_last_zone_id; hrtime_t vq_io_complete_ts; /* time last i/o completed */ kmutex_t vq_lock; }; diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_zone.h b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h new file mode 100644 index 0000000000..f1431b3f55 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zfs_zone.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_ZONE_H +#define _SYS_FS_ZFS_ZONE_H + +#ifdef _KERNEL +#include <sys/isa_defs.h> +#include <sys/types32.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_ZONE_IOP_READ = 0, + ZFS_ZONE_IOP_WRITE, + ZFS_ZONE_IOP_LOGICAL_WRITE, +} zfs_zone_iop_type_t; + +extern void zfs_zone_io_throttle(zfs_zone_iop_type_t); + +extern void zfs_zone_zio_init(zio_t *); +extern void zfs_zone_zio_start(zio_t *); +extern void zfs_zone_zio_done(zio_t *); +extern void zfs_zone_zio_dequeue(zio_t *); +extern void zfs_zone_zio_enqueue(zio_t *); +extern void zfs_zone_report_txg_sync(void *); +extern hrtime_t zfs_zone_txg_delay(); +#ifdef _KERNEL +extern zio_t *zfs_zone_schedule(vdev_queue_t *, zio_priority_t, avl_index_t, + avl_tree_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZONE_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 198dc92387..78c9355438 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -421,7 +421,8 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; uint64_t io_offset; - hrtime_t io_timestamp; + hrtime_t io_timestamp; /* time I/O entered zio pipeline */ + hrtime_t io_dispatched; /* time I/O was dispatched to disk */ avl_node_t io_queue_node; avl_node_t io_offset_node; @@ -450,6 +451,7 @@ struct zio { zio_cksum_report_t *io_cksum_report; uint64_t io_ena; + zoneid_t io_zoneid; /* zone which originated this I/O */ /* Taskq dispatching state */ taskq_ent_t io_tqent; }; diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 191259e75b..915c9bb4b2 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -31,6 +31,7 @@ #include <sys/dsl_pool.h> #include <sys/dsl_scan.h> #include <sys/callb.h> +#include <sys/zfs_zone.h> /* * ZFS Transaction Groups @@ -506,6 +507,8 @@ txg_sync_thread(dsl_pool_t *dp) txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); + zfs_zone_report_txg_sync(dp); + start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index ed4a8b773b..e4cc42452a 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -21,11 +21,13 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Joyent, Inc. All rights reserved. 
*/ #include <sys/zfs_context.h> +#include <sys/zfs_zone.h> #include <sys/spa_impl.h> #include <sys/refcount.h> #include <sys/vdev_disk.h> @@ -44,6 +46,11 @@ extern ldi_ident_t zfs_li; static void vdev_disk_close(vdev_t *); +typedef struct vdev_disk_buf { + buf_t vdb_buf; + zio_t *vdb_io; +} vdev_disk_buf_t; + typedef struct vdev_disk_ldi_cb { list_node_t lcb_next; ldi_callback_id_t lcb_id; @@ -127,6 +134,8 @@ vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie, int ldi_result, void *arg, void *ev_data) { vdev_t *vd = (vdev_t *)arg; + vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; /* * Ignore events other than offline. @@ -586,6 +595,7 @@ static void vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; + vdev_disk_ldi_cb_t *lcb; if (vd->vdev_reopening || dvd == NULL) return; @@ -812,6 +822,8 @@ vdev_disk_io_start(zio_t *zio) bp->b_bufsize = zio->io_size; bp->b_iodone = (int (*)())vdev_disk_io_intr; + zfs_zone_zio_start(zio); + /* ldi_strategy() will return non-zero only on programming errors */ VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); } @@ -821,6 +833,8 @@ vdev_disk_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; + zfs_zone_zio_done(zio); + /* * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if * the device has been removed. If this is the case, then we trigger an diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 79d6e13b3b..4c02013405 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013, Joyent, Inc. All rights reserved. */ /* @@ -33,6 +34,7 @@ #include <sys/zio.h> #include <sys/avl.h> #include <sys/dsl_pool.h> +#include <sys/zfs_zone.h> /* * ZFS I/O Scheduler @@ -141,7 +143,7 @@ uint32_t zfs_vdev_sync_write_min_active = 10; uint32_t zfs_vdev_sync_write_max_active = 10; uint32_t zfs_vdev_async_read_min_active = 1; uint32_t zfs_vdev_async_read_max_active = 3; -uint32_t zfs_vdev_async_write_min_active = 1; +uint32_t zfs_vdev_async_write_min_active = 3; uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; @@ -237,6 +239,8 @@ vdev_queue_init(vdev_t *vd) vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); + vq->vq_last_zone_id = 0; + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { int (*compfn) (const void *, const void *); @@ -274,6 +278,7 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_enqueue(zio); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -289,6 +294,7 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + zfs_zone_zio_dequeue(zio); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); @@ -648,7 +654,11 @@ again: search.io_timestamp = 0; search.io_offset = vq->vq_last_offset + 1; VERIFY3P(avl_find(tree, &search, &idx), ==, NULL); +#ifdef _KERNEL + zio = zfs_zone_schedule(vq, p, idx, tree); +#else zio = avl_nearest(tree, idx, AVL_AFTER); +#endif if (zio == NULL) zio = avl_first(tree); ASSERT3U(zio->io_priority, ==, p); diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c 
b/usr/src/uts/common/fs/zfs/zfs_dir.c index bd7424b55b..2adb297937 100644 --- a/usr/src/uts/common/fs/zfs/zfs_dir.c +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/types.h> @@ -847,9 +848,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on %s is %u, " "should be at least %u", - zp->z_vnode->v_path ? zp->z_vnode->v_path : - "<unknown>", (int)zp->z_links, - zp_is_dir + 1); + zp->z_vnode->v_path != vn_vpath_empty ? + zp->z_vnode->v_path : "<unknown>", + (int)zp->z_links, zp_is_dir + 1); zp->z_links = zp_is_dir + 1; } if (--zp->z_links == zp_is_dir) { diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 692b49611d..47b0b6f650 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -22,8 +22,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska - * Copyright 2015 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. @@ -611,9 +611,10 @@ zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, * Check permissions for special properties. */ switch (prop) { + case ZFS_PROP_DEDUP: case ZFS_PROP_ZONED: /* - * Disallow setting of 'zoned' from within a local zone. + * Disallow setting these properties from within a local zone. 
*/ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); @@ -943,6 +944,9 @@ zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; + if (secpolicy_fs_import(cr) != 0) + return (set_errno(EPERM)); + if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); @@ -2034,7 +2038,8 @@ zfs_ioc_vdev_setfru(zfs_cmd_t *zc) } static int -zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os, + boolean_t cachedpropsonly) { int error = 0; nvlist_t *nv; @@ -2052,7 +2057,8 @@ zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && - dmu_objset_type(os) == DMU_OST_ZVOL) { + dmu_objset_type(os) == DMU_OST_ZVOL && + !cachedpropsonly) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); @@ -2079,11 +2085,24 @@ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { - error = zfs_ioc_objset_stats_impl(zc, os); + error = zfs_ioc_objset_stats_impl(zc, os, cachedpropsonly); dmu_objset_rele(os, FTAG); } @@ -2277,8 +2296,21 @@ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; + nvlist_t *nvl = NULL; + boolean_t cachedpropsonly = B_FALSE; int error; + if (zc->zc_nvlist_src != NULL && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl) != 0)) + return (error); + + if (nvl != NULL) { + (void) nvlist_lookup_boolean_value(nvl, "cachedpropsonly", + &cachedpropsonly); + nvlist_free(nvl); + } + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? 
ESRCH : error); @@ -2307,8 +2339,10 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); - if (error == 0) - error = zfs_ioc_objset_stats_impl(zc, ossnap); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, + ossnap, cachedpropsonly); + } dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { @@ -3018,6 +3052,7 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; + int error; ASSERT(zplprops != NULL); @@ -3061,8 +3096,9 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); - if (norm == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); + if (norm == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); @@ -3071,13 +3107,15 @@ zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, */ if (norm) u8 = 1; - if (u8 == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); - if (sense == ZFS_PROP_UNDEFINED) - VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 8cf83b399a..f9b7986c56 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1950,6 +1951,17 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); + /* + * If we're doing a forced unmount on a dataset which still has + * references and is in a zone, then we need to cleanup the zone + * reference at this point or else the zone will never be able to + * shutdown. + */ + if ((fflag & MS_FORCE) && vfsp->vfs_count > 1 && vfsp->vfs_zone) { + zone_rele_ref(&vfsp->vfs_implp->vi_zone_ref, ZONE_REF_VFS); + vfsp->vfs_zone = NULL; + } + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index bf75097d2a..ddec59ffc9 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -22,11 +22,16 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
+ */ + #include <sys/types.h> #include <sys/param.h> #include <sys/time.h> @@ -662,6 +667,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) rl_t *rl; int max_blksz = zfsvfs->z_max_blksz; int error = 0; + int prev_error; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; @@ -683,6 +689,17 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -735,17 +752,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } /* - * Pre-fault the pages to ensure slow (eg NFS) pages - * don't hold up txg. - * Skip this if uio contains loaned arc_buf. - */ - if ((uio->uio_extflg == UIO_XUIO) && - (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) - xuio = (xuio_t *)uio; - else - uio_prefaultpages(MIN(n, max_blksz), uio); - - /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { @@ -966,7 +972,6 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); - ASSERT(error == 0); } /* * If we are replaying and eof is non zero then force @@ -976,18 +981,20 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; + /* + * Keep track of a possible pre-existing error from a partial + * write via dmu_write_uio_dbuf above. + */ + prev_error = error; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag); dmu_tx_commit(tx); - if (error != 0) + if (prev_error != 0 || error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; - - if (!xuio && n > 0) - uio_prefaultpages(MIN(n, max_blksz), uio); } zfs_range_unlock(rl); @@ -3659,18 +3666,6 @@ top: } } - vnevent_rename_src(ZTOV(szp), sdvp, snm, ct); - if (tzp) - vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct); - - /* - * notify the target directory if it is not the same - * as source directory. 
- */
-	if (tdvp != sdvp) {
-		vnevent_rename_dest_dir(tdvp, ct);
-	}
-
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
@@ -3711,8 +3706,12 @@ top:
 		return (error);
 	}
 
-	if (tzp)	/* Attempt to remove the existing target */
+	if (tzp) {
+		/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+		if (error == 0)
+			vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
+	}
 
 	if (error == 0) {
 		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
@@ -3754,6 +3753,11 @@ top:
 	}
 
 	dmu_tx_commit(tx);
+
+	if (error == 0) {
+		vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
+		vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
+	}
 out:
 	if (zl != NULL)
 		zfs_rename_unlock(&zl);
@@ -4244,6 +4248,8 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
 		    &zp->z_pflags, 8);
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 		    B_TRUE);
+		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
 	}
 
 	dmu_tx_commit(tx);
@@ -4779,10 +4785,6 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
 	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
 	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
 
-	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
-	    vn_has_cached_data(vp))
-		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
 	return (0);
 }
diff --git a/usr/src/uts/common/fs/zfs/zfs_zone.c b/usr/src/uts/common/fs/zfs/zfs_zone.c
new file mode 100644
index 0000000000..4861c64f8e
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_zone.c
@@ -0,0 +1,1336 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * The ZFS/Zone I/O throttle and scheduler attempts to ensure fair access to
+ * ZFS I/O resources for each zone.
+ *
+ * I/O contention can be a major pain point on a multi-tenant system. A single
+ * zone can issue a stream of I/O operations, usually synchronous writes, which
+ * disrupt I/O performance for all other zones. This problem is further
+ * exacerbated by ZFS, which buffers all asynchronous writes in a single TXG,
+ * a set of blocks which are atomically synced to disk. The process of
+ * syncing a TXG can occupy all of a device's I/O bandwidth, thereby starving
+ * out any pending read operations.
+ *
+ * There are two facets to this capability: the throttle and the scheduler.
+ *
+ * Throttle
+ *
+ * The requirements on the throttle are:
+ *
+ * 1) Ensure consistent and predictable I/O latency across all zones.
+ * 2) Sequential and random workloads have very different characteristics,
+ *    so it is a non-starter to track IOPS or throughput.
+ * 3) A zone should be able to use the full disk bandwidth if no other zone
+ *    is actively using the disk.
+ *
+ * The throttle has two components: one to track and account for each zone's
+ * I/O requests, and another to throttle each zone's operations when it
+ * exceeds its fair share of disk I/O.
+ * When the throttle detects that a zone is consuming more than is
+ * appropriate, each read or write system call is delayed by up to 100
+ * microseconds, which we've found is sufficient to allow other zones to
+ * interleave I/O requests during those delays.
+ *
+ * Note: The throttle will delay each logical I/O (as opposed to the physical
+ * I/O which will likely be issued asynchronously), so it may be easier to
+ * think of the I/O throttle delaying each read/write syscall instead of the
+ * actual I/O operation. For each zone, the throttle tracks an ongoing average
+ * of read and write operations performed to determine the overall I/O
+ * utilization for each zone.
+ *
+ * The throttle calculates an I/O utilization metric for each zone using the
+ * following formula:
+ *
+ *     (# of read syscalls) x (Average read latency) +
+ *     (# of write syscalls) x (Average write latency)
+ *
+ * Once each zone has its utilization metric, the I/O throttle will compare I/O
+ * utilization across all zones, and if a zone has a higher-than-average I/O
+ * utilization, system calls from that zone are throttled. That is, if one
+ * zone has a much higher utilization, that zone's delay is increased by 5
+ * microseconds, up to a maximum of 100 microseconds. Conversely, if a zone is
+ * already throttled and has a lower utilization than average, its delay will
+ * be lowered by 5 microseconds.
+ *
+ * The throttle calculation is driven by IO activity, but since IO does not
+ * happen at fixed intervals, timestamps are used to track when the last update
+ * was made and to drive recalculation.
+ *
+ * The throttle recalculates each zone's I/O usage and throttle delay (if any)
+ * on the zfs_zone_adjust_time interval. Overall I/O latency is maintained as
+ * a decayed average which is updated on the zfs_zone_sys_avg_cycle interval.
+ *
+ * Scheduler
+ *
+ * The I/O scheduler manages the vdev queues -- the queues of pending I/Os to
+ * issue to the disks. It only makes scheduling decisions for the two
+ * synchronous I/O queues (read & write).
+ *
+ * The scheduler maintains a count of how many I/Os in the queue are from each
+ * zone, and if one zone has a disproportionately large number of I/Os in the
+ * queue, the scheduler will allow certain I/Os from the underutilized zones
+ * to be "bumped" and pulled from the middle of the queue. This bump allows
+ * zones with a small number of I/Os (so small they may not even be taken into
+ * account by the throttle) to complete quickly instead of waiting behind
+ * dozens of I/Os from other zones.
+ */
+
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_zone.h>
+
+#ifndef _KERNEL
+
+/*
+ * Stubs for when compiling for user-land.
+ */
+
+void
+zfs_zone_io_throttle(zfs_zone_iop_type_t type)
+{
+}
+
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_start(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_done(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_dequeue(zio_t *zp)
+{
+}
+
+void
+zfs_zone_zio_enqueue(zio_t *zp)
+{
+}
+
+/*ARGSUSED*/
+void
+zfs_zone_report_txg_sync(void *dp)
+{
+}
+
+hrtime_t
+zfs_zone_txg_delay()
+{
+	return (MSEC2NSEC(10));
+}
+
+#else
+
+/*
+ * The real code.
+ */
+
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/atomic.h>
+#include <sys/zio.h>
+#include <sys/zone.h>
+#include <sys/avl.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+
+/*
+ * The zone throttle delays read and write operations from certain zones based
+ * on each zone's IO utilization. Once per cycle (defined by
+ * zfs_zone_cycle_time below), the delays for each zone are recalculated based
+ * on the utilization over the previous window.
+ */
+boolean_t zfs_zone_delay_enable = B_TRUE;	/* enable IO throttle */
+uint16_t zfs_zone_delay_step = 5;		/* usec amount to change delay */
+uint16_t zfs_zone_delay_ceiling = 100;		/* usec delay max */
+
+boolean_t zfs_zone_priority_enable = B_TRUE;	/* enable IO priority */
+
+/*
+ * For certain workloads, one zone may be issuing primarily sequential I/O and
+ * another primarily random I/O. The sequential I/O will complete much more
+ * quickly than the random I/O, driving the average system latency for those
+ * operations way down. As a result, the random I/O may be throttled back, even
+ * though the sequential I/O should be throttled to allow the random I/O more
+ * access to the disk.
+ *
+ * This tunable limits the discrepancy between the read and write system
+ * latency. If one becomes excessively high, this tunable prevents the I/O
+ * throttler from exacerbating the imbalance.
+ */
+uint_t zfs_zone_rw_lat_limit = 10;
+
+/*
+ * The I/O throttle will only start delaying zones when it detects disk
+ * utilization has reached a certain level. This tunable controls the
+ * threshold at which the throttle will start delaying zones. When the number
+ * of vdevs is small, the calculation should correspond closely with the %b
+ * column from iostat -- but as the number of vdevs becomes large, it will
+ * correlate less and less to any single device (therefore making it a poor
+ * approximation for the actual I/O utilization on such systems). We
+ * therefore use our derived utilization conservatively: we know that low
+ * derived utilization does indeed correlate to low I/O use -- but that a high
+ * rate of derived utilization does not necessarily alone denote saturation;
+ * where we see a high rate of utilization, we also look for laggard I/Os to
+ * attempt to detect saturation.
+ */
+uint_t zfs_zone_util_threshold = 80;
+uint_t zfs_zone_underutil_threshold = 60;
+
+/*
+ * There are three important tunables here: zfs_zone_laggard_threshold denotes
+ * the threshold at which an I/O is considered to be of notably high latency;
+ * zfs_zone_laggard_recent denotes the number of microseconds before the
+ * current time after which the last laggard is considered to be sufficiently
+ * recent to merit increasing the throttle; zfs_zone_laggard_ancient denotes
+ * the microseconds before the current time before which the last laggard is
+ * considered to be sufficiently old to merit decreasing the throttle. The
+ * most important tunable of these three is the zfs_zone_laggard_threshold: in
+ * modeling data from a large public cloud, this tunable was found to have a
+ * much greater effect on the throttle than the two time-based thresholds.
+ * This must be set high enough to not result in spurious throttling, but not
+ * so high as to allow pathological I/O to persist in the system.
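+ *
+ * In rough terms, with the default values defined just below (hypothetical
+ * timeline, for illustration): an I/O completing in more than 50 ms marks
+ * the disk as laggard. The overutilized verdict additionally requires a
+ * laggard within the last second, while five laggard-free seconds are on
+ * their own enough to reach the underutilized verdict and begin backing
+ * the throttle off.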
+ */
+uint_t zfs_zone_laggard_threshold = 50000;	/* 50 ms */
+uint_t zfs_zone_laggard_recent = 1000000;	/* 1000 ms */
+uint_t zfs_zone_laggard_ancient = 5000000;	/* 5000 ms */
+
+/*
+ * Throughout this subsystem, our timestamps are in microseconds. Our system
+ * average cycle is one second or 1 million microseconds. Our zone counter
+ * update cycle is two seconds or 2 million microseconds. We use a longer
+ * duration for that cycle because some ops can see a little over two seconds
+ * of latency when they are being starved by another zone.
+ */
+uint_t zfs_zone_sys_avg_cycle = 1000000;	/* 1 s */
+uint_t zfs_zone_cycle_time = 2000000;		/* 2 s */
+
+/*
+ * How often the I/O throttle will reevaluate each zone's utilization, in
+ * microseconds. Default is 1/4 sec.
+ */
+uint_t zfs_zone_adjust_time = 250000;		/* 250 ms */
+
+typedef struct {
+	hrtime_t	cycle_start;
+	int		cycle_cnt;
+	hrtime_t	cycle_lat;
+	hrtime_t	sys_avg_lat;
+} sys_lat_cycle_t;
+
+typedef struct {
+	hrtime_t	zi_now;
+	uint_t		zi_avgrlat;
+	uint_t		zi_avgwlat;
+	uint64_t	zi_totpri;
+	uint64_t	zi_totutil;
+	int		zi_active;
+	uint_t		zi_diskutil;
+	boolean_t	zi_underutil;
+	boolean_t	zi_overutil;
+} zoneio_stats_t;
+
+static sys_lat_cycle_t	rd_lat;
+static sys_lat_cycle_t	wr_lat;
+
+/*
+ * Some basic disk stats to determine disk utilization. The utilization info
+ * for all disks on the system is aggregated into these values.
+ *
+ * Overall disk utilization for the current cycle is calculated as:
+ *
+ * ((zfs_disk_rtime - zfs_disk_last_rtime) * 100)
+ * ----------------------------------------------
+ *        ((now - zfs_zone_last_checked) * 1000);
+ */
+kmutex_t	zfs_disk_lock;		/* protects the following: */
+uint_t		zfs_disk_rcnt;		/* Number of outstanding IOs */
+hrtime_t	zfs_disk_rtime = 0;	/* cumulative sum of time performing IO */
+hrtime_t	zfs_disk_rlastupdate = 0; /* time last IO dispatched */
+
+hrtime_t	zfs_disk_last_rtime = 0; /* prev. cycle's zfs_disk_rtime val */
+/* time that we last updated per-zone throttle info */
+hrtime_t	zfs_zone_last_checked = 0;
+hrtime_t	zfs_disk_last_laggard = 0;
+
+/*
+ * Data used to keep track of how often txg sync is running.
+ */
+extern int	zfs_txg_timeout;
+static uint_t	txg_last_check;
+static uint_t	txg_cnt;
+static uint_t	txg_sync_rate;
+
+boolean_t zfs_zone_schedule_enable = B_TRUE;	/* enable IO sched. */
+/*
+ * Threshold for when zio scheduling should kick in.
+ *
+ * This threshold is based on the zfs_vdev_sync_read_max_active value for the
+ * number of I/Os that can be pending on a device. If there are more than the
+ * max_active ops already queued up, beyond those already issued to the vdev,
+ * then use zone-based scheduling to get the next synchronous zio.
+ */
+uint32_t zfs_zone_schedule_thresh = 10;
+
+/*
+ * On each pass of the scheduler we increment the zone's weight (up to this
+ * maximum). The weight is used by the scheduler to prevent starvation so
+ * that zones which haven't been able to do any IO over many iterations
+ * will max out their weight to this value.
+ */
+#define	SCHED_WEIGHT_MAX	20
+
+/*
+ * Tunables for delay throttling when TXG sync is occurring.
+ *
+ * If the zone is performing a write and we're doing above normal TXG
+ * syncing, then throttle for longer than normal. The zone's wait time is
+ * multiplied by the scale (zfs_zone_txg_throttle_scale).
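+ *
+ * For example (hypothetical numbers): a zone currently throttled at 50 usec
+ * per logical write would wait 100 usec per write syscall with the default
+ * scale of 2 while TXG syncs are outpacing the normal rate.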
+ */
+int zfs_zone_txg_throttle_scale = 2;
+hrtime_t zfs_zone_txg_delay_nsec = MSEC2NSEC(20);
+
+typedef struct {
+	int		zq_qdepth;
+	zio_priority_t	zq_queue;
+	int		zq_priority;
+	int		zq_wt;
+	zoneid_t	zq_zoneid;
+} zone_q_bump_t;
+
+/*
+ * This uses gethrtime() but returns a value in usecs.
+ */
+#define	GET_USEC_TIME		(gethrtime() / 1000)
+#define	NANO_TO_MICRO(x)	(x / (NANOSEC / MICROSEC))
+
+/*
+ * Keep track of the zone's ZFS IOPs.
+ *
+ * See the comment on the zfs_zone_io_throttle function for which/how IOPs are
+ * accounted for.
+ *
+ * If the number of ops is >1 then we can just use that value. However,
+ * if the number of ops is <2 then we might have a zone which is trying to do
+ * IO but is not able to get any ops through the system. We don't want to lose
+ * track of this zone so we factor its decayed count into the current count.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed count.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the historical count by the proper number of additional cycles in which no
+ * IO was performed.
+ *
+ * Return a time delta indicating how far into the current cycle we are or 0
+ * if the last IO was more than a cycle ago.
+ */
+static hrtime_t
+compute_historical_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+	hrtime_t delta;
+	int gen_cnt;
+
+	/*
+	 * Check if it's time to recompute a new zone count.
+	 * If we're still collecting data for the current cycle, return
+	 * the time delta.
+	 */
+	delta = unow - cp->cycle_start;
+	if (delta < zfs_zone_cycle_time)
+		return (delta);
+
+	/* A previous cycle is past, compute the new zone count. */
+
+	/*
+	 * Figure out how many generations we have to decay the historical
+	 * count, since multiple cycles may have elapsed since our last IO.
+	 * We depend on int rounding here.
+	 */
+	gen_cnt = (int)(delta / zfs_zone_cycle_time);
+
+	/* If more than 5 cycles since the last IO, reset count. */
+	if (gen_cnt > 5) {
+		cp->zone_avg_cnt = 0;
+	} else {
+		/* Update the count. */
+		int i;
+
+		/*
+		 * If the zone did more than 1 IO, just use its current count
+		 * as the historical value, otherwise decay the historical
+		 * count and factor that into the new historical count. We
+		 * pick a threshold > 1 so that we don't lose track of IO due
+		 * to int rounding.
+		 */
+		if (cp->cycle_cnt > 1)
+			cp->zone_avg_cnt = cp->cycle_cnt;
+		else
+			cp->zone_avg_cnt = cp->cycle_cnt +
+			    (cp->zone_avg_cnt / 2);
+
+		/*
+		 * If more than one generation has elapsed since the last
+		 * update, decay the values further.
+		 */
+		for (i = 1; i < gen_cnt; i++)
+			cp->zone_avg_cnt = cp->zone_avg_cnt / 2;
+	}
+
+	/* A new cycle begins. */
+	cp->cycle_start = unow;
+	cp->cycle_cnt = 0;
+
+	return (0);
+}
+
+/*
+ * Add IO op data to the zone.
+ */
+static void
+add_zone_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op)
+{
+	switch (op) {
+	case ZFS_ZONE_IOP_READ:
+		(void) compute_historical_zone_cnt(unow, &zonep->zone_rd_ops);
+		zonep->zone_rd_ops.cycle_cnt++;
+		break;
+	case ZFS_ZONE_IOP_WRITE:
+		(void) compute_historical_zone_cnt(unow, &zonep->zone_wr_ops);
+		zonep->zone_wr_ops.cycle_cnt++;
+		break;
+	case ZFS_ZONE_IOP_LOGICAL_WRITE:
+		(void) compute_historical_zone_cnt(unow, &zonep->zone_lwr_ops);
+		zonep->zone_lwr_ops.cycle_cnt++;
+		break;
+	}
+}
+
+/*
+ * Use a decaying average to keep track of the overall system latency.
+ *
+ * We want to have the recent activity heavily weighted, but if the
+ * activity decreases or stops, then the average should quickly decay
+ * down to the new value.
+ *
+ * Each cycle (zfs_zone_sys_avg_cycle) we want to update the decayed average.
+ * However, since this calculation is driven by IO activity and since IO does
+ * not happen at fixed intervals, we use a timestamp to see when the last
+ * update was made. If it was more than one cycle ago, then we need to decay
+ * the average by the proper number of additional cycles in which no IO was
+ * performed.
+ *
+ * Return true if we actually computed a new system average.
+ * If we're still within an active cycle there is nothing to do, return false.
+ */
+static boolean_t
+compute_new_sys_avg(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+	hrtime_t delta;
+	int gen_cnt;
+
+	/*
+	 * Check if it's time to recompute a new average.
+	 * If we're still collecting data for the current cycle, return false.
+	 */
+	delta = unow - cp->cycle_start;
+	if (delta < zfs_zone_sys_avg_cycle)
+		return (B_FALSE);
+
+	/* A previous cycle is past, compute a new system average. */
+
+	/*
+	 * Figure out how many generations we have to decay, since multiple
+	 * cycles may have elapsed since our last IO.
+	 * We count on int rounding here.
+	 */
+	gen_cnt = (int)(delta / zfs_zone_sys_avg_cycle);
+
+	/* If more than 5 cycles since the last IO, reset average. */
+	if (gen_cnt > 5) {
+		cp->sys_avg_lat = 0;
+	} else {
+		/* Update the average. */
+		int i;
+
+		cp->sys_avg_lat =
+		    (cp->sys_avg_lat + cp->cycle_lat) / (1 + cp->cycle_cnt);
+
+		/*
+		 * If more than one generation has elapsed since the last
+		 * update, decay the values further.
+		 */
+		for (i = 1; i < gen_cnt; i++)
+			cp->sys_avg_lat = cp->sys_avg_lat / 2;
+	}
+
+	/* A new cycle begins. */
+	cp->cycle_start = unow;
+	cp->cycle_cnt = 0;
+	cp->cycle_lat = 0;
+
+	return (B_TRUE);
+}
+
+static void
+add_sys_iop(hrtime_t unow, int op, int lat)
+{
+	switch (op) {
+	case ZFS_ZONE_IOP_READ:
+		(void) compute_new_sys_avg(unow, &rd_lat);
+		rd_lat.cycle_cnt++;
+		rd_lat.cycle_lat += lat;
+		break;
+	case ZFS_ZONE_IOP_WRITE:
+		(void) compute_new_sys_avg(unow, &wr_lat);
+		wr_lat.cycle_cnt++;
+		wr_lat.cycle_lat += lat;
+		break;
+	}
+}
+
+/*
+ * Get the zone IO counts.
+ */
+static uint_t
+calc_zone_cnt(hrtime_t unow, sys_zio_cntr_t *cp)
+{
+	hrtime_t delta;
+	uint_t cnt;
+
+	if ((delta = compute_historical_zone_cnt(unow, cp)) == 0) {
+		/*
+		 * No activity in the current cycle, we already have the
+		 * historical data so we'll use that.
+		 */
+		cnt = cp->zone_avg_cnt;
+	} else {
+		/*
+		 * If we're less than half way through the cycle then use
+		 * the current count plus half the historical count, otherwise
+		 * just use the current count.
+		 */
+		if (delta < (zfs_zone_cycle_time / 2))
+			cnt = cp->cycle_cnt + (cp->zone_avg_cnt / 2);
+		else
+			cnt = cp->cycle_cnt;
+	}
+
+	return (cnt);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static uint_t
+calc_avg_lat(hrtime_t unow, sys_lat_cycle_t *cp)
+{
+	if (compute_new_sys_avg(unow, cp)) {
+		/*
+		 * No activity in the current cycle, we already have the
+		 * historical data so we'll use that.
+		 */
+		return (cp->sys_avg_lat);
+	} else {
+		/*
+		 * We're within a cycle; weight the current activity higher
+		 * compared to the historical data and use that.
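+		 *
+		 * As a rough worked example (hypothetical numbers): with a
+		 * decayed average of 2000 usec and 4 ops totalling 500 usec
+		 * so far this cycle, this yields (2000 + 500 * 8) /
+		 * (1 + 4 * 8), about 181 usec, quickly tracking the faster
+		 * recent I/O.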
+		 */
+		DTRACE_PROBE3(zfs__zone__calc__wt__avg,
+		    uintptr_t, cp->sys_avg_lat,
+		    uintptr_t, cp->cycle_lat,
+		    uintptr_t, cp->cycle_cnt);
+
+		return ((cp->sys_avg_lat + (cp->cycle_lat * 8)) /
+		    (1 + (cp->cycle_cnt * 8)));
+	}
+}
+
+/*
+ * Account for the current IOP on the zone and for the system as a whole.
+ * The latency parameter is in usecs.
+ */
+static void
+add_iop(zone_t *zonep, hrtime_t unow, zfs_zone_iop_type_t op, hrtime_t lat)
+{
+	/* Add op to zone */
+	add_zone_iop(zonep, unow, op);
+
+	/* Track system latency */
+	if (op != ZFS_ZONE_IOP_LOGICAL_WRITE)
+		add_sys_iop(unow, op, lat);
+}
+
+/*
+ * Calculate and return the total number of read ops, write ops and logical
+ * write ops for the given zone. If the zone has issued operations of any type
+ * return a non-zero value, otherwise return 0.
+ */
+static int
+get_zone_io_cnt(hrtime_t unow, zone_t *zonep, uint_t *rops, uint_t *wops,
+    uint_t *lwops)
+{
+	*rops = calc_zone_cnt(unow, &zonep->zone_rd_ops);
+	*wops = calc_zone_cnt(unow, &zonep->zone_wr_ops);
+	*lwops = calc_zone_cnt(unow, &zonep->zone_lwr_ops);
+
+	DTRACE_PROBE4(zfs__zone__io__cnt, uintptr_t, zonep->zone_id,
+	    uintptr_t, *rops, uintptr_t, *wops, uintptr_t, *lwops);
+
+	return (*rops | *wops | *lwops);
+}
+
+/*
+ * Get the average read/write latency in usecs for the system.
+ */
+static void
+get_sys_avg_lat(hrtime_t unow, uint_t *rlat, uint_t *wlat)
+{
+	*rlat = calc_avg_lat(unow, &rd_lat);
+	*wlat = calc_avg_lat(unow, &wr_lat);
+
+	/*
+	 * In an attempt to improve the accuracy of the throttling algorithm,
+	 * assume that IO operations can't have zero latency. Instead, assume
+	 * a reasonable lower bound for each operation type. If the actual
+	 * observed latencies are non-zero, use those latency values instead.
+	 */
+	if (*rlat == 0)
+		*rlat = 1000;
+	if (*wlat == 0)
+		*wlat = 1000;
+
+	DTRACE_PROBE2(zfs__zone__sys__avg__lat, uintptr_t, *rlat,
+	    uintptr_t, *wlat);
+}
+
+/*
+ * Find disk utilization for each zone and average utilization for all active
+ * zones.
+ */
+static int
+zfs_zone_wait_adjust_calculate_cb(zone_t *zonep, void *arg)
+{
+	zoneio_stats_t *sp = arg;
+	uint_t rops, wops, lwops;
+
+	if (zonep->zone_id == GLOBAL_ZONEID ||
+	    get_zone_io_cnt(sp->zi_now, zonep, &rops, &wops, &lwops) == 0) {
+		zonep->zone_io_util = 0;
+		return (0);
+	}
+
+	zonep->zone_io_util = (rops * sp->zi_avgrlat) +
+	    (wops * sp->zi_avgwlat) + (lwops * sp->zi_avgwlat);
+	sp->zi_totutil += zonep->zone_io_util;
+
+	if (zonep->zone_io_util > 0) {
+		sp->zi_active++;
+		sp->zi_totpri += zonep->zone_zfs_io_pri;
+	}
+
+	/*
+	 * sdt:::zfs-zone-utilization
+	 *
+	 *	arg0: zone ID
+	 *	arg1: read operations observed during time window
+	 *	arg2: physical write operations observed during time window
+	 *	arg3: logical write ops observed during time window
+	 *	arg4: calculated utilization given read and write ops
+	 *	arg5: I/O priority assigned to this zone
+	 */
+	DTRACE_PROBE6(zfs__zone__utilization, uint_t, zonep->zone_id,
+	    uint_t, rops, uint_t, wops, uint_t, lwops,
+	    uint_t, zonep->zone_io_util, uint_t, zonep->zone_zfs_io_pri);
+
+	return (0);
+}
+
+static void
+zfs_zone_delay_inc(zone_t *zonep)
+{
+	if (zonep->zone_io_delay < zfs_zone_delay_ceiling)
+		zonep->zone_io_delay += zfs_zone_delay_step;
+}
+
+static void
+zfs_zone_delay_dec(zone_t *zonep)
+{
+	if (zonep->zone_io_delay > 0)
+		zonep->zone_io_delay -= zfs_zone_delay_step;
+}
+
+/*
+ * For all zones "far enough" away from the average utilization, increase that
+ * zone's delay. Otherwise, reduce its delay.
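+ *
+ * For example (hypothetical numbers): with two active zones of I/O priority
+ * 100 and 50 and a total utilization of 3,000,000, the fair shares come out
+ * to 2,000,000 and 1,000,000 respectively.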
+ */
+static int
+zfs_zone_wait_adjust_delay_cb(zone_t *zonep, void *arg)
+{
+	zoneio_stats_t *sp = arg;
+	uint16_t delay = zonep->zone_io_delay;
+	uint_t fairutil = 0;
+
+	zonep->zone_io_util_above_avg = B_FALSE;
+
+	/*
+	 * Given the calculated total utilization for all zones, calculate the
+	 * fair share of I/O for this zone.
+	 */
+	if (zfs_zone_priority_enable && sp->zi_totpri > 0) {
+		fairutil = (sp->zi_totutil * zonep->zone_zfs_io_pri) /
+		    sp->zi_totpri;
+	} else if (sp->zi_active > 0) {
+		fairutil = sp->zi_totutil / sp->zi_active;
+	}
+
+	/*
+	 * Adjust each IO's delay. If the overall delay becomes too high, avoid
+	 * increasing beyond the ceiling value.
+	 */
+	if (zonep->zone_io_util > fairutil && sp->zi_overutil) {
+		zonep->zone_io_util_above_avg = B_TRUE;
+
+		if (sp->zi_active > 1)
+			zfs_zone_delay_inc(zonep);
+	} else if (zonep->zone_io_util < fairutil || sp->zi_underutil ||
+	    sp->zi_active <= 1) {
+		zfs_zone_delay_dec(zonep);
+	}
+
+	/*
+	 * sdt:::zfs-zone-throttle
+	 *
+	 *	arg0: zone ID
+	 *	arg1: old delay for this zone
+	 *	arg2: new delay for this zone
+	 *	arg3: calculated fair I/O utilization
+	 *	arg4: actual I/O utilization
+	 */
+	DTRACE_PROBE5(zfs__zone__throttle, uintptr_t, zonep->zone_id,
+	    uintptr_t, delay, uintptr_t, zonep->zone_io_delay,
+	    uintptr_t, fairutil, uintptr_t, zonep->zone_io_util);
+
+	return (0);
+}
+
+/*
+ * Examine the utilization between different zones, and adjust the delay for
+ * each zone appropriately.
+ */
+static void
+zfs_zone_wait_adjust(hrtime_t unow, hrtime_t last_checked)
+{
+	zoneio_stats_t stats;
+	hrtime_t laggard_udelta = 0;
+
+	(void) bzero(&stats, sizeof (stats));
+
+	stats.zi_now = unow;
+	get_sys_avg_lat(unow, &stats.zi_avgrlat, &stats.zi_avgwlat);
+
+	if (stats.zi_avgrlat > stats.zi_avgwlat * zfs_zone_rw_lat_limit)
+		stats.zi_avgrlat = stats.zi_avgwlat * zfs_zone_rw_lat_limit;
+	else if (stats.zi_avgrlat * zfs_zone_rw_lat_limit < stats.zi_avgwlat)
+		stats.zi_avgwlat = stats.zi_avgrlat * zfs_zone_rw_lat_limit;
+
+	if (zone_walk(zfs_zone_wait_adjust_calculate_cb, &stats) != 0)
+		return;
+
+	/*
+	 * Calculate disk utilization for the most recent period.
+	 */
+	if (zfs_disk_last_rtime == 0 || unow - last_checked <= 0) {
+		stats.zi_diskutil = 0;
+	} else {
+		stats.zi_diskutil =
+		    ((zfs_disk_rtime - zfs_disk_last_rtime) * 100) /
+		    ((unow - last_checked) * 1000);
+	}
+	zfs_disk_last_rtime = zfs_disk_rtime;
+
+	if (unow > zfs_disk_last_laggard)
+		laggard_udelta = unow - zfs_disk_last_laggard;
+
+	/*
+	 * To minimize porpoising, we have three separate states for our
+	 * assessment of I/O performance: overutilized, underutilized, and
+	 * neither overutilized nor underutilized. We will increment the
+	 * throttle if a zone is using more than its fair share _and_ I/O
+	 * is overutilized; we will decrement the throttle if a zone is using
+	 * less than its fair share _or_ I/O is underutilized.
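+	 *
+	 * As a worked example of the derived disk utilization feeding these
+	 * states (hypothetical numbers, using the formula documented at
+	 * zfs_disk_lock): 200 ms of accumulated disk busy time over a 250 ms
+	 * window yields (200,000,000 * 100) / (250,000 * 1000) = 80%, which
+	 * with the default thresholds is just at the overutilized mark.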
+ */ + stats.zi_underutil = stats.zi_diskutil < zfs_zone_underutil_threshold || + laggard_udelta > zfs_zone_laggard_ancient; + + stats.zi_overutil = stats.zi_diskutil > zfs_zone_util_threshold && + laggard_udelta < zfs_zone_laggard_recent; + + /* + * sdt:::zfs-zone-stats + * + * Statistics observed over the last period: + * + * arg0: average system read latency + * arg1: average system write latency + * arg2: number of active zones + * arg3: total I/O 'utilization' for all zones + * arg4: total I/O priority of all active zones + * arg5: calculated disk utilization + */ + DTRACE_PROBE6(zfs__zone__stats, uintptr_t, stats.zi_avgrlat, + uintptr_t, stats.zi_avgwlat, uintptr_t, stats.zi_active, + uintptr_t, stats.zi_totutil, uintptr_t, stats.zi_totpri, + uintptr_t, stats.zi_diskutil); + + (void) zone_walk(zfs_zone_wait_adjust_delay_cb, &stats); +} + +/* + * Callback used to calculate a zone's IO schedule priority. + * + * We scan the zones looking for ones with ops in the queue. Out of those, + * we pick the one that calculates to the highest schedule priority. + */ +static int +get_sched_pri_cb(zone_t *zonep, void *arg) +{ + int pri; + uint_t cnt; + zone_q_bump_t *qbp = arg; + zio_priority_t p = qbp->zq_queue; + + cnt = zonep->zone_zfs_queued[p]; + if (cnt == 0) { + zonep->zone_zfs_weight = 0; + return (0); + } + + /* + * On each pass, increment the zone's weight. We use this as input + * to the calculation to prevent starvation. The value is reset + * each time we issue an IO for this zone so zones which haven't + * done any IO over several iterations will see their weight max + * out. + */ + if (zonep->zone_zfs_weight < SCHED_WEIGHT_MAX) + zonep->zone_zfs_weight++; + + /* + * This zone's IO priority is the inverse of the number of IOs + * the zone has enqueued * zone's configured priority * weight. + * The queue depth has already been scaled by 10 to avoid problems + * with int rounding. + * + * This means that zones with fewer IOs in the queue will get + * preference unless other zone's assigned priority pulls them + * ahead. The weight is factored in to help ensure that zones + * which haven't done IO in a while aren't getting starved. + */ + pri = (qbp->zq_qdepth / cnt) * + zonep->zone_zfs_io_pri * zonep->zone_zfs_weight; + + /* + * If this zone has a higher priority than what we found so far, + * it becomes the new leading contender. + */ + if (pri > qbp->zq_priority) { + qbp->zq_zoneid = zonep->zone_id; + qbp->zq_priority = pri; + qbp->zq_wt = zonep->zone_zfs_weight; + } + return (0); +} + +/* + * See if we need to bump a zone's zio to the head of the queue. This is only + * done on the two synchronous I/O queues (see the block comment on the + * zfs_zone_schedule function). We get the correct vdev_queue_class_t and + * queue depth from our caller. + * + * For single-threaded synchronous processes a zone cannot get more than + * 1 op into the queue at a time unless the zone is running multiple processes + * in parallel. This can cause an imbalance in performance if there are zones + * with many parallel processes (and ops in the queue) vs. other zones which + * are doing simple single-threaded processes, such as interactive tasks in the + * shell. These zones can get backed up behind a deep queue and their IO + * performance will appear to be very poor as a result. This can make the + * zone work badly for interactive behavior. + * + * The scheduling algorithm kicks in once we start to get a deeper queue. 
Once that occurs, we look at all of the zones to see which one calculates
+ * to the highest priority. We bump that zone's first zio to the head of the
+ * queue.
+ *
+ * We use a counter on the zone so that we can quickly find how many ops each
+ * zone has in the queue without having to search the entire queue itself.
+ * This scales better since the number of zones is expected to be on the
+ * order of 10-100 whereas the queue depth can be in the range of 50-2000.
+ * In addition, since the zio's in the queue only have the zoneid, we would
+ * have to look up the zone for each zio enqueued and that means the overhead
+ * for scanning the queue each time would be much higher.
+ *
+ * In all cases, we fall back to simply pulling the next op off the queue
+ * if something should go wrong.
+ */
+static zio_t *
+get_next_zio(vdev_queue_class_t *vqc, int qdepth, zio_priority_t p,
+    avl_tree_t *tree)
+{
+	zone_q_bump_t qbump;
+	zio_t *zp = NULL, *zphead;
+	int cnt = 0;
+
+	/* To avoid problems with int rounding, scale the queue depth by 10 */
+	qbump.zq_qdepth = qdepth * 10;
+	qbump.zq_priority = 0;
+	qbump.zq_zoneid = 0;
+	qbump.zq_queue = p;
+	(void) zone_walk(get_sched_pri_cb, &qbump);
+
+	zphead = avl_first(tree);
+
+	/* Check if the scheduler didn't pick a zone for some reason!? */
+	if (qbump.zq_zoneid != 0) {
+		for (zp = avl_first(tree); zp != NULL;
+		    zp = avl_walk(tree, zp, AVL_AFTER)) {
+			if (zp->io_zoneid == qbump.zq_zoneid)
+				break;
+			cnt++;
+		}
+	}
+
+	if (zp == NULL) {
+		zp = zphead;
+	} else if (zp != zphead) {
+		/*
+		 * Only fire the probe if we actually picked a different zio
+		 * than the one already at the head of the queue.
+		 */
+		DTRACE_PROBE4(zfs__zone__sched__bump, uint_t, zp->io_zoneid,
+		    uint_t, cnt, int, qbump.zq_priority, int, qbump.zq_wt);
+	}
+
+	return (zp);
+}
+
+/*
+ * Add our zone ID to the zio so we can keep track of which zones are doing
+ * what, even when the current thread processing the zio is not associated
+ * with the zone (e.g. the kernel taskq which pushes out TX groups).
+ */
+void
+zfs_zone_zio_init(zio_t *zp)
+{
+	zone_t	*zonep = curzone;
+
+	zp->io_zoneid = zonep->zone_id;
+}
+
+/*
+ * Track and throttle IO operations per zone. Called from:
+ * - dmu_tx_count_write for (logical) write ops (both dataset and zvol writes
+ *   go through this path)
+ * - arc_read for read ops that miss the ARC (both dataset and zvol)
+ * For each operation, increment that zone's counter based on the type of
+ * operation, then delay the operation, if necessary.
+ *
+ * There are three basic ways that we can see write ops:
+ * 1) An application does write syscalls. Those ops go into a TXG which
+ *    we'll count here. Sometime later a kernel taskq thread (we'll see the
+ *    vdev IO as zone 0) will perform some number of physical writes to commit
+ *    the TXG to disk. Those writes are not associated with the zone which
+ *    made the write syscalls and the number of operations is not correlated
+ *    between the taskq and the zone. We only see logical writes in this
+ *    function, we see the physical writes in the zfs_zone_zio_start and
+ *    zfs_zone_zio_done functions.
+ * 2) An application opens a file with O_SYNC. Each write will result in
+ *    an operation which we'll see here plus a low-level vdev write from
+ *    that zone.
+ * 3) An application does write syscalls followed by an fsync(). We'll
+ *    count the writes going into a TXG here.
We'll also see some number + * (usually much smaller, maybe only 1) of low-level vdev writes from this + * zone when the fsync is performed, plus some other low-level vdev writes + * from the taskq in zone 0 (are these metadata writes?). + * + * 4) In addition to the above, there are misc. system-level writes, such as + * writing out dirty pages to swap, or sync(2) calls, which will be handled + * by the global zone and which we count but don't generally worry about. + * + * Because of the above, we can see writes twice; first because this function + * is always called by a zone thread for logical writes, but then we also will + * count the physical writes that are performed at a low level via + * zfs_zone_zio_start. Without this, it can look like a non-global zone never + * writes (case 1). Depending on when the TXG is synced, the counts may be in + * the same sample bucket or in a different one. + * + * Tracking read operations is simpler due to their synchronous semantics. The + * zfs_read function -- called as a result of a read(2) syscall -- will always + * retrieve the data to be read through arc_read and we only come into this + * function when we have an arc miss. + */ +void +zfs_zone_io_throttle(zfs_zone_iop_type_t type) +{ + zone_t *zonep = curzone; + hrtime_t unow, last_checked; + uint16_t wait; + + unow = GET_USEC_TIME; + + /* + * Only bump the counter for logical writes here. The counters for + * tracking physical IO operations are handled in zfs_zone_zio_done. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, type, 0); + mutex_exit(&zonep->zone_stg_io_lock); + } + + if (!zfs_zone_delay_enable) + return; + + /* + * If the zone's I/O priority is set to zero, don't throttle that zone's + * operations at all. + */ + if (zonep->zone_zfs_io_pri == 0) + return; + + /* + * XXX There's a potential race here in that more than one thread may + * update the zone delays concurrently. The worst outcome is corruption + * of our data to track each zone's IO, so the algorithm may make + * incorrect throttling decisions until the data is refreshed. + */ + last_checked = zfs_zone_last_checked; + if ((unow - last_checked) > zfs_zone_adjust_time) { + zfs_zone_last_checked = unow; + zfs_zone_wait_adjust(unow, last_checked); + } + + if ((wait = zonep->zone_io_delay) > 0) { + /* + * If this is a write and we're doing above normal TXG + * syncing, then throttle for longer than normal. + */ + if (type == ZFS_ZONE_IOP_LOGICAL_WRITE && + (txg_cnt > 1 || txg_sync_rate > 1)) + wait *= zfs_zone_txg_throttle_scale; + + /* + * sdt:::zfs-zone-wait + * + * arg0: zone ID + * arg1: type of IO operation + * arg2: time to delay (in us) + */ + DTRACE_PROBE3(zfs__zone__wait, uintptr_t, zonep->zone_id, + uintptr_t, type, uintptr_t, wait); + + drv_usecwait(wait); + + if (zonep->zone_vfs_stats != NULL) { + atomic_inc_64(&zonep->zone_vfs_stats-> + zv_delay_cnt.value.ui64); + atomic_add_64(&zonep->zone_vfs_stats-> + zv_delay_time.value.ui64, wait); + } + } +} + +/* + * XXX Ignore the pool pointer parameter for now. + * + * Keep track to see if the TXG sync rate is running above the expected rate. + * If so, this implies that we are filling TXG's at a high rate due to a heavy + * write workload. We use this as input into the zone throttle. + * + * This function is called every 5 seconds (zfs_txg_timeout) under a normal + * write load. In this case, the sync rate is going to be 1. 
When there + * is a heavy write load, TXG's fill up fast and the sync thread will write + * the TXG more frequently (perhaps once a second). In this case the rate + * will be > 1. The sync rate is a lagging indicator since it can be up + * to 5 seconds old. We use the txg_cnt to keep track of the rate in the + * current 5 second interval and txg_sync_rate to keep track of the previous + * 5 second interval. In that way we don't have a period (1 or more seconds) + * where the txg_cnt == 0 and we cut back on throttling even though the rate + * is still high. + */ +/*ARGSUSED*/ +void +zfs_zone_report_txg_sync(void *dp) +{ + uint_t now; + + txg_cnt++; + now = (uint_t)(gethrtime() / NANOSEC); + if ((now - txg_last_check) >= zfs_txg_timeout) { + txg_sync_rate = txg_cnt / 2; + txg_cnt = 0; + txg_last_check = now; + } +} + +hrtime_t +zfs_zone_txg_delay() +{ + if (curzone->zone_io_util_above_avg) + return (zfs_zone_txg_delay_nsec); + + return (MSEC2NSEC(10)); +} + +/* + * Called from vdev_disk_io_start when an IO hits the end of the zio pipeline + * and is issued. + * Keep track of start time for latency calculation in zfs_zone_zio_done. + */ +void +zfs_zone_zio_start(zio_t *zp) +{ + zone_t *zonep; + + /* + * I/Os of type ZIO_TYPE_IOCTL are used to flush the disk cache, not for + * an actual I/O operation. Ignore those operations as they relate to + * throttling and scheduling. + */ + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_zfs_lock); + if (zp->io_type == ZIO_TYPE_READ) + kstat_runq_enter(&zonep->zone_zfs_rwstats); + zonep->zone_zfs_weight = 0; + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zp->io_dispatched = gethrtime(); + + if (zfs_disk_rcnt++ != 0) + zfs_disk_rtime += (zp->io_dispatched - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = zp->io_dispatched; + mutex_exit(&zfs_disk_lock); + + zone_rele(zonep); +} + +/* + * Called from vdev_disk_io_done when an IO completes. + * Increment our counter for zone ops. + * Calculate the IO latency avg. for this zone. + */ +void +zfs_zone_zio_done(zio_t *zp) +{ + zone_t *zonep; + hrtime_t now, unow, udelta; + + if (zp->io_type == ZIO_TYPE_IOCTL) + return; + + if (zp->io_dispatched == 0) + return; + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + now = gethrtime(); + unow = NANO_TO_MICRO(now); + udelta = unow - NANO_TO_MICRO(zp->io_dispatched); + + mutex_enter(&zonep->zone_zfs_lock); + + /* + * To calculate the wsvc_t average, keep a cumulative sum of all the + * wait time before each I/O was dispatched. Since most writes are + * asynchronous, only track the wait time for read I/Os. + */ + if (zp->io_type == ZIO_TYPE_READ) { + zonep->zone_zfs_rwstats.reads++; + zonep->zone_zfs_rwstats.nread += zp->io_size; + + zonep->zone_zfs_stats->zz_waittime.value.ui64 += + zp->io_dispatched - zp->io_timestamp; + + kstat_runq_exit(&zonep->zone_zfs_rwstats); + } else { + zonep->zone_zfs_rwstats.writes++; + zonep->zone_zfs_rwstats.nwritten += zp->io_size; + } + + mutex_exit(&zonep->zone_zfs_lock); + + mutex_enter(&zfs_disk_lock); + zfs_disk_rcnt--; + zfs_disk_rtime += (now - zfs_disk_rlastupdate); + zfs_disk_rlastupdate = now; + + if (udelta > zfs_zone_laggard_threshold) + zfs_disk_last_laggard = unow; + + mutex_exit(&zfs_disk_lock); + + if (zfs_zone_delay_enable) { + mutex_enter(&zonep->zone_stg_io_lock); + add_iop(zonep, unow, zp->io_type == ZIO_TYPE_READ ? 
+ ZFS_ZONE_IOP_READ : ZFS_ZONE_IOP_WRITE, udelta); + mutex_exit(&zonep->zone_stg_io_lock); + } + + zone_rele(zonep); + + /* + * sdt:::zfs-zone-latency + * + * arg0: zone ID + * arg1: type of I/O operation + * arg2: I/O latency (in us) + */ + DTRACE_PROBE3(zfs__zone__latency, uintptr_t, zp->io_zoneid, + uintptr_t, zp->io_type, uintptr_t, udelta); +} + +void +zfs_zone_zio_dequeue(zio_t *zp) +{ + zio_priority_t p; + zone_t *zonep; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + ASSERT(zonep->zone_zfs_queued[p] > 0); + if (zonep->zone_zfs_queued[p] == 0) + cmn_err(CE_WARN, "zfs_zone_zio_dequeue: count==0"); + else + zonep->zone_zfs_queued[p]--; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +void +zfs_zone_zio_enqueue(zio_t *zp) +{ + zio_priority_t p; + zone_t *zonep; + + p = zp->io_priority; + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return; + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + if ((zonep = zone_find_by_id(zp->io_zoneid)) == NULL) + return; + + mutex_enter(&zonep->zone_stg_io_lock); + zonep->zone_zfs_queued[p]++; + mutex_exit(&zonep->zone_stg_io_lock); + zone_rele(zonep); +} + +/* + * Called from vdev_queue_io_to_issue. That function is where zio's are listed + * in FIFO order on one of the sync queues, then pulled off (by + * vdev_queue_io_remove) and issued. We potentially do zone-based scheduling + * here to find a zone's zio deeper in the sync queue and issue that instead + * of simply doing FIFO. + * + * We only do zone-based zio scheduling for the two synchronous I/O queues + * (read & write). These queues are normally serviced in FIFO order but we + * may decide to move a zone's zio to the head of the line. A typical I/O + * load will be mostly synchronous reads and some asynchronous writes (which + * are scheduled differently due to transaction groups). There will also be + * some synchronous writes for those apps which want to ensure their data is on + * disk. We want to make sure that a zone with a single-threaded app (e.g. the + * shell) that is doing synchronous I/O (typically reads) isn't penalized by + * other zones which are doing lots of synchronous I/O because they have many + * running threads. + * + * The vq->vq_lock mutex is held when we're executing this function so we + * can safely access the "last zone" variable on the queue. + */ +zio_t * +zfs_zone_schedule(vdev_queue_t *vq, zio_priority_t p, avl_index_t idx, + avl_tree_t *tree) +{ + vdev_queue_class_t *vqc = &vq->vq_class[p]; + uint_t cnt; + zoneid_t last_zone; + zio_t *zio; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + /* Don't change the order on the LBA ordered queues. */ + if (p != ZIO_PRIORITY_SYNC_READ && p != ZIO_PRIORITY_SYNC_WRITE) + return (avl_nearest(tree, idx, AVL_AFTER)); + + /* We depend on p being defined as either 0 or 1 */ + ASSERT(p < 2); + + cnt = avl_numnodes(tree); + last_zone = vq->vq_last_zone_id; + + /* + * If there are only a few zios in the queue then just issue the head. + * If there are more than a few zios already queued up, then use + * scheduling to get the next zio. 
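
The FIFO-versus-scheduled choice that the comment above describes (and that the body of
zfs_zone_schedule() implements just below) reduces to a small predicate. A standalone
restatement follows; MODEL_SCHED_THRESH stands in for zfs_zone_schedule_thresh (value
assumed), and the two selectors are stubs for avl_nearest() and get_next_zio():

    typedef struct model_zio model_zio_t;

    extern model_zio_t *model_queue_head(void);           /* avl_nearest() stand-in */
    extern model_zio_t *model_zone_aware_pick(unsigned);  /* get_next_zio() stand-in */

    #define MODEL_SCHED_THRESH  8   /* assumed value for zfs_zone_schedule_thresh */

    /*
     * Short queues are not worth reordering; FIFO is used until the sync
     * queue is deep enough that zone-aware selection can actually help.
     */
    static model_zio_t *
    model_schedule(int schedule_enable, unsigned queued)
    {
            if (!schedule_enable || queued < MODEL_SCHED_THRESH)
                    return (model_queue_head());
            return (model_zone_aware_pick(queued));
    }
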
+ */ + if (!zfs_zone_schedule_enable || cnt < zfs_zone_schedule_thresh) + zio = avl_nearest(tree, idx, AVL_AFTER); + else + zio = get_next_zio(vqc, cnt, p, tree); + + vq->vq_last_zone_id = zio->io_zoneid; + + /* + * Probe with 4 args; the number of IOs in the queue, the zone that + * was last scheduled off this queue, the zone that was associated + * with the next IO that is scheduled, and which queue (priority). + */ + DTRACE_PROBE4(zfs__zone__sched, uint_t, cnt, uint_t, last_zone, + uint_t, zio->io_zoneid, uint_t, p); + + return (zio); +} + +#endif diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 38b57e0123..f9f93999db 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ @@ -1801,9 +1802,18 @@ zil_close(zilog_t *zilog) if (lwb != NULL) txg = lwb->lwb_max_txg; mutex_exit(&zilog->zl_lock); - if (txg) + + if (zilog_is_dirty(zilog)) { + /* + * If we're dirty, always wait for the current transaction -- + * our lwb_max_txg may be in the past. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (txg) { txg_wait_synced(zilog->zl_dmu_pool, txg); - ASSERT(!zilog_is_dirty(zilog)); + } + + VERIFY(!zilog_is_dirty(zilog)); taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index aa0f2945dd..2fa3213be0 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ #include <sys/sysmacros.h> @@ -39,6 +40,7 @@ #include <sys/ddt.h> #include <sys/blkptr.h> #include <sys/zfeature.h> +#include <sys/zfs_zone.h> /* * ========================================================================== @@ -557,11 +559,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_bookmark = *zb; if (pio != NULL) { + zio->io_zoneid = pio->io_zoneid; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); + } else { + zfs_zone_zio_init(zio); } return (zio); diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index 80888103fe..f681b1dc65 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -25,7 +25,7 @@ * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
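
Zone attribution for a zio is decided exactly once, at creation: the zio_create() change
above tags a root zio via zfs_zone_zio_init() and lets every child inherit its parent's
tag, so the zone id rides the whole zio tree down to the vdev level where
zfs_zone_zio_start()/zfs_zone_zio_done() charge the I/O back to that zone. A toy model
of that propagation, with hypothetical names:

    #include <stddef.h>

    typedef int zoneid_t;                   /* modeled; the kernel type differs */
    extern zoneid_t model_curzone_id(void); /* stands in for zfs_zone_zio_init() */

    typedef struct model_zio {
            zoneid_t io_zoneid;
    } model_zio_t;

    /* Child zios inherit the originating zone; only roots consult curzone. */
    static void
    model_zio_tag(model_zio_t *zio, const model_zio_t *pio)
    {
            if (pio != NULL)
                    zio->io_zoneid = pio->io_zoneid;
            else
                    zio->io_zoneid = model_curzone_id();
    }
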
*/ /* @@ -83,6 +83,7 @@ #include <sys/zvol.h> #include <sys/dumphdr.h> #include <sys/zil_impl.h> +#include <sys/sdt.h> #include <sys/dbuf.h> #include <sys/dmu_tx.h> #include <sys/zfeature.h> @@ -137,6 +138,11 @@ typedef struct zvol_state { #define ZVOL_EXCL 0x4 #define ZVOL_WCE 0x8 +#define VOP_LATENCY_10MS 10000000 +#define VOP_LATENCY_100MS 100000000 +#define VOP_LATENCY_1S 1000000000 +#define VOP_LATENCY_10S 10000000000 + /* * zvol maximum transfer in one DMU tx. */ @@ -1378,6 +1384,9 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; rl_t *rl; int error = 0; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1394,6 +1403,14 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 0); + + mutex_enter(&zonep->zone_vfs_lock); + kstat_runq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_READER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1403,6 +1420,7 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - uio->uio_loffset) bytes = volsize - uio->uio_loffset; + tot_bytes += bytes; error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); if (error) { /* convert checksum errors into IO errors */ @@ -1412,6 +1430,39 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr) } } zfs_range_unlock(rl); + + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.reads++; + zonep->zone_vfs_rwstats.nread += tot_bytes; + kstat_runq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 0, int, + error); + return (error); } @@ -1425,6 +1476,9 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) rl_t *rl; int error = 0; boolean_t sync; + zone_t *zonep = curzone; + uint64_t tot_bytes; + hrtime_t start, lat; zv = zfsdev_get_soft_state(minor, ZSST_ZVOL); if (zv == NULL) @@ -1441,6 +1495,19 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + DTRACE_PROBE3(zvol__uio__start, dev_t, dev, uio_t *, uio, int, 1); + + /* + * For the purposes of VFS kstat consumers, the "waitq" calculation is + * repurposed as the active queue for zvol write operations. There's no + * actual wait queue for zvol operations. 
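
The read-side if/else ladder above (and the identical write-side ladder further down)
maintains cumulative buckets: an operation that took 2s bumps zv_10ms_ops,
zv_100ms_ops, and zv_1s_ops, so each counter reads as "ops that took at least this
long", and a consumer can subtract adjacent buckets to recover a histogram. A compact
equivalent of the ladder (single-threaded sketch; the kernel's atomic increments are
elided):

    #include <stdint.h>

    #define MODEL_LATENCY_10MS   10000000LL
    #define MODEL_LATENCY_100MS  100000000LL
    #define MODEL_LATENCY_1S     1000000000LL
    #define MODEL_LATENCY_10S    10000000000LL

    typedef struct model_vfs_stats {
            uint64_t zv_10ms_ops;
            uint64_t zv_100ms_ops;
            uint64_t zv_1s_ops;
            uint64_t zv_10s_ops;
    } model_vfs_stats_t;

    /* Every threshold the latency meets or exceeds gets its counter bumped. */
    static void
    model_latency_buckets(model_vfs_stats_t *zvp, int64_t lat_ns)
    {
            if (lat_ns >= MODEL_LATENCY_10MS)
                    zvp->zv_10ms_ops++;
            if (lat_ns >= MODEL_LATENCY_100MS)
                    zvp->zv_100ms_ops++;
            if (lat_ns >= MODEL_LATENCY_1S)
                    zvp->zv_1s_ops++;
            if (lat_ns >= MODEL_LATENCY_10S)
                    zvp->zv_10s_ops++;
    }
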
+ */ + mutex_enter(&zonep->zone_vfs_lock); + kstat_waitq_enter(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + start = gethrtime(); + tot_bytes = 0; + sync = !(zv->zv_flags & ZVOL_WCE) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); @@ -1454,6 +1521,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; + tot_bytes += bytes; dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -1471,6 +1539,39 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) zfs_range_unlock(rl); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); + + DTRACE_PROBE4(zvol__uio__done, dev_t, dev, uio_t *, uio, int, 1, int, + error); + + mutex_enter(&zonep->zone_vfs_lock); + zonep->zone_vfs_rwstats.writes++; + zonep->zone_vfs_rwstats.nwritten += tot_bytes; + kstat_waitq_exit(&zonep->zone_vfs_rwstats); + mutex_exit(&zonep->zone_vfs_lock); + + lat = gethrtime() - start; + + if (lat >= VOP_LATENCY_10MS) { + zone_vfs_kstat_t *zvp; + + zvp = zonep->zone_vfs_stats; + if (lat < VOP_LATENCY_100MS) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_1S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + } else if (lat < VOP_LATENCY_10S) { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + } else { + atomic_inc_64(&zvp->zv_10ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_100ms_ops.value.ui64); + atomic_inc_64(&zvp->zv_1s_ops.value.ui64); + atomic_inc_64(&zvp->zv_10s_ops.value.ui64); + } + } + return (error); } diff --git a/usr/src/uts/common/inet/inet_hash.h b/usr/src/uts/common/inet/inet_hash.h new file mode 100644 index 0000000000..a790a797d1 --- /dev/null +++ b/usr/src/uts/common/inet/inet_hash.h @@ -0,0 +1,37 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _INET_INET_HASH_H +#define _INET_INET_HASH_H + +/* + * Common packet hashing routines shared across MAC, UDP, and others. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define INET_PKT_HASH_L2 0x01 +#define INET_PKT_HASH_L3 0x02 +#define INET_PKT_HASH_L4 0x04 + +extern uint64_t inet_pkt_hash(uint_t, mblk_t *, uint8_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_INET_HASH_H */ diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c index 85ee142dfc..c350d67c2d 100644 --- a/usr/src/uts/common/inet/ip/ip_attr.c +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) */ if (ixa->ixa_free_flags & IXA_FREE_CRED) crhold(ixa->ixa_cred); + + /* + * There is no cleanup in progress on this new copy. + */ + ixa->ixa_tcpcleanup = IXATC_IDLE; } /* diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 94b2396147..16a9b7e349 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -25,6 +25,9 @@ * Copyright 2013 Joyent, Inc. * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. 
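
inet_hash.h above exports a single entry point. Judging only from how
udp_srcport_hash() uses it later in this patch, a caller hands it the medium
(DL_ETHER), an mblk_t whose b_rptr sits at the MAC header, and the OR of the layers to
fold in; a return of zero means the packet could not be hashed. A hedged sketch of a
caller; pick_queue() and nqueues are hypothetical:

    #include <sys/types.h>
    #include <sys/stream.h>
    #include <sys/dlpi.h>
    #include <inet/inet_hash.h>

    /*
     * Spread flows across nqueues receive queues by L2+L3+L4 hash,
     * falling back to queue 0 when the packet cannot be parsed.
     */
    static uint_t
    pick_queue(mblk_t *mp, uint_t nqueues)
    {
            uint64_t hash;

            hash = inet_pkt_hash(DL_ETHER, mp,
                INET_PKT_HASH_L2 | INET_PKT_HASH_L3 | INET_PKT_HASH_L4);
            if (hash == 0)
                    return (0);     /* unhashable; use a fixed default */
            return (hash % nqueues);
    }
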
*/ +/* + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + */ /* * This file contains the interface control functions for IP. diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 33a2fa5935..dedb4dadcc 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -163,7 +163,7 @@ ip_squeue_create(pri_t pri) { squeue_t *sqp; - sqp = squeue_create(ip_squeue_worker_wait, pri); + sqp = squeue_create(ip_squeue_worker_wait, pri, B_TRUE); ASSERT(sqp != NULL); if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); diff --git a/usr/src/uts/common/inet/ip/ip_tunables.c b/usr/src/uts/common/inet/ip/ip_tunables.c index 58e3b59ff3..dda05a316d 100644 --- a/usr/src/uts/common/inet/ip/ip_tunables.c +++ b/usr/src/uts/common/inet/ip/ip_tunables.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index c325e8dc26..2ca770ebe9 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid) { espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat", "net", KSTAT_TYPE_NAMED, - sizeof (esp_kstats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_PERSISTENT, stackid); + sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid); if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL) return (B_FALSE); diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index f958ca2261..227d2075f8 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -83,6 +83,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); +static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, + void *)); extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *)); extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *)); @@ -152,6 +160,16 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; char *hook6_loop_out = "ipfilter_hook6_loop_out"; char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +/* vnd IPv4/v6 hook names */ +char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; +char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; +char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; +char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; +char *hook4_vnd_out = "ipfilter_hookvndl3v4_out"; +char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; +char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; +char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. 
*/
@@ -248,6 +266,31 @@ ipf_stack_t *ifs;
 		ifs->ifs_ipf_ipv4 = NULL;
 	}
 
+	/*
+	 * Remove VND hooks
+	 */
+	if (ifs->ifs_ipf_vndl3v4 != NULL) {
+		UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in,
+		    NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in);
+		UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out,
+		    NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out);
+
+		if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0)
+			goto detach_failed;
+		ifs->ifs_ipf_vndl3v4 = NULL;
+	}
+
+	if (ifs->ifs_ipf_vndl3v6 != NULL) {
+		UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in,
+		    NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in);
+		UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out,
+		    NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out);
+
+		if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0)
+			goto detach_failed;
+		ifs->ifs_ipf_vndl3v6 = NULL;
+	}
+
 #undef UNDO_HOOK
 
 #ifdef	IPFDEBUG
@@ -445,6 +488,48 @@ ipf_stack_t *ifs;
 	}
 
 	/*
+	 * Add VND INET hooks
+	 */
+	ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET);
+	if (ifs->ifs_ipf_vndl3v4 == NULL)
+		goto hookup_failed;
+
+	HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in,
+	    hook4_vnd_in, hook4_vnd_in_gz, ifs);
+	HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out,
+	    hook4_vnd_out, hook4_vnd_out_gz, ifs);
+	ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+	    NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0);
+	if (!ifs->ifs_hookvndl3v4_physical_in)
+		goto hookup_failed;
+
+	ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+	    NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0);
+	if (!ifs->ifs_hookvndl3v4_physical_out)
+		goto hookup_failed;
+
+
+	/*
+	 * VND INET6 hooks
+	 */
+	ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6);
+	if (ifs->ifs_ipf_vndl3v6 == NULL)
+		goto hookup_failed;
+
+	HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in,
+	    hook6_vnd_in, hook6_vnd_in_gz, ifs);
+	HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out,
+	    hook6_vnd_out, hook6_vnd_out_gz, ifs);
+	ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+	    NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0);
+	if (!ifs->ifs_hookvndl3v6_physical_in)
+		goto hookup_failed;
+
+	ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+	    NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0);
+	if (!ifs->ifs_hookvndl3v6_physical_out)
+		goto hookup_failed;
+
+	/*
 	 * Reacquire ipf_global, now it is safe.
 	 */
 	WRITE_ENTER(&ifs->ifs_ipf_global);
@@ -1011,7 +1096,6 @@ cred_t *cp;
 		return ENXIO;
 	unit = isp->ipfs_minor;
-
 	/*
 	 * ipf_find_stack returns with a read lock on ifs_ipf_global
 	 */
@@ -2045,6 +2129,42 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg)
 }
 
 /* ------------------------------------------------------------------------ */
+/* Function:    ipf_hookvndl3_in                                            */
+/* Returns:     int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters:  event(I) - pointer to event                                 */
+/*              info(I)  - pointer to hook information for firewalling      */
+/*                                                                          */
+/* The vnd hooks are private hooks to ON. They represent a layer 2          */
+/* datapath generally used to implement virtual machines. The driver sends  */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the            */
+/* traditional packet hook flags.                                           */
+/*                                                                          */
+/* They end up calling the appropriate traditional ip hooks.
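
Attach and detach above must mirror each other exactly: every net_protocol_lookup() is
paired with a net_protocol_release(), and every hook that registered successfully
(net_hook_register() returns 0, remembered as a boolean_t flag) is later torn down via
UNDO_HOOK. Distilled to one family, with the goto labels replaced by a local unwind;
vnd_v4_hooks_attach() is a hypothetical wrapper, not a function in this patch:

    static int
    vnd_v4_hooks_attach(ipf_stack_t *ifs, netid_t id)
    {
            ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET);
            if (ifs->ifs_ipf_vndl3v4 == NULL)
                    return (-1);

            HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in,
                hook4_vnd_in, hook4_vnd_in_gz, ifs);

            /* 0 from net_hook_register() means success; keep it as a flag. */
            ifs->ifs_hookvndl3v4_physical_in =
                (net_hook_register(ifs->ifs_ipf_vndl3v4, NH_PHYSICAL_IN,
                ifs->ifs_ipfhookvndl3v4_in) == 0);
            if (!ifs->ifs_hookvndl3v4_physical_in) {
                    (void) net_protocol_release(ifs->ifs_ipf_vndl3v4);
                    ifs->ifs_ipf_vndl3v4 = NULL;
                    return (-1);
            }
            return (0);
    }
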
*/ +/* ------------------------------------------------------------------------ */ +/*ARGSUSED*/ +int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_in(token, info, arg); +} + +int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_in(token, info, arg); +} + +/*ARGSUSED*/ +int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_out(token, info, arg); +} + +int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_out(token, info, arg); +} + +/* ------------------------------------------------------------------------ */ /* Function: ipf_hook4_loop_in */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf index 6b36f9fdbf..f49e024a72 100644 --- a/usr/src/uts/common/inet/ipf/ipf.conf +++ b/usr/src/uts/common/inet/ipf/ipf.conf @@ -1,3 +1,8 @@ # # name="ipf" parent="pseudo" instance=0; + +# Increase the state table limits. fr_statemax should be ~70% of fr_statesize, +# and both should be prime numbers +fr_statesize=151007; +fr_statemax=113279; diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index a239f1c1ca..9aa2478c6a 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -125,6 +125,10 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_in; hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookvndl3v4_in; + hook_t *ifs_ipfhookvndl3v6_in; + hook_t *ifs_ipfhookvndl3v4_out; + hook_t *ifs_ipfhookvndl3v6_out; /* flags to indicate whether hooks are registered. */ boolean_t ifs_hook4_physical_in; @@ -137,10 +141,16 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookvndl3v4_physical_in; + boolean_t ifs_hookvndl3v6_physical_in; + boolean_t ifs_hookvndl3v4_physical_out; + boolean_t ifs_hookvndl3v6_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_vndl3v4; + net_handle_t ifs_ipf_vndl3v6; /* ip_auth.c */ int ifs_fr_authsize; diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c index c541f4dddc..5d56debc31 100644 --- a/usr/src/uts/common/inet/ipf/solaris.c +++ b/usr/src/uts/common/inet/ipf/solaris.c @@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg) /* * Destroy things for ipf for one stack. */ -/* ARGSUSED */ static void ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs) { diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c new file mode 100644 index 0000000000..6e1171de46 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/datafilt.c @@ -0,0 +1,116 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved. 
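
The ipf.conf comment above asks that fr_statemax be roughly 70% of fr_statesize (the
shipped values, 113279/151007, actually sit at about 75%) and that both be prime. A
quick standalone check of both properties:

    #include <stdio.h>

    static int
    is_prime(long n)
    {
            if (n < 2)
                    return (0);
            for (long d = 2; d * d <= n; d++) {
                    if (n % d == 0)
                            return (0);
            }
            return (1);
    }

    int
    main(void)
    {
            long statesize = 151007, statemax = 113279;

            (void) printf("ratio = %.3f\n", (double)statemax / statesize);
            (void) printf("statesize prime: %d, statemax prime: %d\n",
                is_prime(statesize), is_prime(statemax));
            return (0);
    }

Both values are indeed prime, which keeps the hash distribution even when the table is
indexed modulo its size.
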
+ */ + +/* + * This file implements a socketfilter used to deter TCP connections. + * To defer a connection means to delay the return of accept(3SOCKET) + * until at least one byte is ready to be read(2). This filter may be + * applied automatically or programmatically through the use of + * soconfig(1M) and setsockopt(3SOCKET). + */ + +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/socketvar.h> +#include <sys/sockfilter.h> +#include <sys/note.h> +#include <sys/taskq.h> + +#define DATAFILT_MODULE "datafilt" + +static struct modlmisc dataf_modlmisc = { + &mod_miscops, + "Kernel data-ready socket filter" +}; + +static struct modlinkage dataf_modlinkage = { + MODREV_1, + &dataf_modlmisc, + NULL +}; + +static sof_rval_t +dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph, + void *parg, struct sockaddr *laddr, socklen_t laddrlen, + struct sockaddr *faddr, socklen_t faddrlen, void **cookiep) +{ + _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen, + cookiep)); + return (SOF_RVAL_DEFER); +} + +static void +dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr) +{ + _NOTE(ARGUNUSED(handle, cookie, cr)); +} + +static mblk_t * +dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags, + size_t *lenp) +{ + _NOTE(ARGUNUSED(cookie, flags, lenp)); + + if (mp != NULL && MBLKL(mp) > 0) { + sof_newconn_ready(handle); + sof_bypass(handle); + } + + return (mp); +} + +static sof_ops_t dataf_ops = { + .sofop_attach_passive = dataf_attach_passive_cb, + .sofop_detach = dataf_detach_cb, + .sofop_data_in = dataf_data_in_cb +}; + +int +_init(void) +{ + int err; + + /* + * This module is safe to attach even after some preliminary socket + * setup calls have taken place. See the comment for SOF_ATT_SAFE. + */ + err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops, + SOF_ATT_SAFE); + if (err != 0) + return (err); + if ((err = mod_install(&dataf_modlinkage)) != 0) + (void) sof_unregister(DATAFILT_MODULE); + + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = sof_unregister(DATAFILT_MODULE)) != 0) + return (err); + + return (mod_remove(&dataf_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&dataf_modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/sockmods/socksctp.c b/usr/src/uts/common/inet/sockmods/socksctp.c index bcd8e8d8fa..8b07c9b36e 100644 --- a/usr/src/uts/common/inet/sockmods/socksctp.c +++ b/usr/src/uts/common/inet/sockmods/socksctp.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -403,7 +404,6 @@ sosctp_connect(struct sonode *so, struct sockaddr *name, } if (name == NULL || namelen == 0) { - mutex_exit(&so->so_lock); error = EINVAL; eprintsoline(so, error); goto done; diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 2e08dc359b..1009f0700f 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -23,7 +23,7 @@ */ /* - * Copyright 2012 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. */ /* @@ -61,6 +61,10 @@ * connection are processed on that squeue. The connection ("conn") to * squeue mapping is stored in "conn_t" member "conn_sqp". 
 *
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is
+ * false and it will not have an associated conn_t, which means many aspects of
+ * the system, such as polling and switching squeues, will not be used.
+ *
 * Since the processing of the connection cuts across multiple layers
 * but still allows packets for different connections to be processed on
 * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
@@ -244,7 +248,7 @@ squeue_init(void)
 
 /* ARGSUSED */
 squeue_t *
-squeue_create(clock_t wait, pri_t pri)
+squeue_create(clock_t wait, pri_t pri, boolean_t isip)
 {
 	squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
 
@@ -260,11 +264,36 @@ squeue_create(clock_t wait, pri_t pri)
 	sqp->sq_enter = squeue_enter;
 	sqp->sq_drain = squeue_drain;
+	sqp->sq_isip = isip;
 
 	return (sqp);
 }
 
 /*
+ * We need to kill the threads and then clean up. We should VERIFY that
+ * polling is disabled so we don't have to worry about disassociating from
+ * MAC/IP/etc.
+ */
+void
+squeue_destroy(squeue_t *sqp)
+{
+	kt_did_t worker, poll;
+	mutex_enter(&sqp->sq_lock);
+	VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+	    SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT)));
+	worker = sqp->sq_worker->t_did;
+	poll = sqp->sq_poll_thr->t_did;
+	sqp->sq_state |= SQS_EXIT;
+	cv_signal(&sqp->sq_poll_cv);
+	cv_signal(&sqp->sq_worker_cv);
+	mutex_exit(&sqp->sq_lock);
+
+	thread_join(poll);
+	thread_join(worker);
+	kmem_cache_free(squeue_cache, sqp);
+}
+
+/*
 * Bind squeue worker thread to the specified CPU, given by CPU id.
 * If the CPU id value is -1, bind the worker thread to the value
 * specified in sq_bind field. If a thread is already bound to a
@@ -475,18 +504,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 		 * Handle squeue switching. More details in the
 		 * block comment at the top of the file
 		 */
-		if (connp->conn_sqp == sqp) {
+		if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
 			SQUEUE_DBG_SET(sqp, mp, proc, connp, tag);
-			connp->conn_on_sqp = B_TRUE;
+			if (sqp->sq_isip == B_TRUE)
+				connp->conn_on_sqp = B_TRUE;
 			DTRACE_PROBE3(squeue__proc__start, squeue_t *,
 			    sqp, mblk_t *, mp, conn_t *, connp);
 			(*proc)(connp, mp, sqp, ira);
 			DTRACE_PROBE2(squeue__proc__end, squeue_t *,
 			    sqp, conn_t *, connp);
-			connp->conn_on_sqp = B_FALSE;
+			if (sqp->sq_isip == B_TRUE) {
+				connp->conn_on_sqp = B_FALSE;
+				CONN_DEC_REF(connp);
+			}
 			SQUEUE_DBG_CLEAR(sqp);
-			CONN_DEC_REF(connp);
 		} else {
 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp,
 			    ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -513,7 +545,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 			return;
 		}
 	} else {
-		if (ira != NULL) {
+		if (sqp->sq_isip == B_TRUE && ira != NULL) {
 			mblk_t	*attrmp;
 
 			ASSERT(cnt == 1);
@@ -587,7 +619,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 	if (!(sqp->sq_state & SQS_REENTER) &&
 	    (process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
 	    (sqp->sq_run == curthread) && (cnt == 1) &&
-	    (connp->conn_on_sqp == B_FALSE)) {
+	    (sqp->sq_isip == B_FALSE ||
+	    connp->conn_on_sqp == B_FALSE)) {
 		sqp->sq_state |= SQS_REENTER;
 		mutex_exit(&sqp->sq_lock);
 
@@ -602,15 +635,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 			 * Handle squeue switching.
More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -631,7 +670,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif - if (ira != NULL) { + if (sqp->sq_isip && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -779,7 +818,7 @@ again: mp->b_prev = NULL; /* Is there an ip_recv_attr_t to handle? */ - if (ip_recv_attr_is_mblk(mp)) { + if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) { mblk_t *attrmp = mp; ASSERT(attrmp->b_cont != NULL); @@ -804,20 +843,25 @@ again: /* - * Handle squeue switching. More details in the - * block comment at the top of the file + * Handle squeue switching. More details in the block comment at + * the top of the file. non-IP squeues cannot switch, as there + * is no conn_t. */ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -1051,6 +1095,11 @@ squeue_polling_thread(squeue_t *sqp) cv_wait(async, lock); CALLB_CPR_SAFE_END(&cprinfo, lock); + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | SQS_POLL_THR_QUIESCED); if (ctl_state != 0) { @@ -1076,6 +1125,9 @@ squeue_polling_thread(squeue_t *sqp) (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + /* Only IP related squeues should reach this point */ + VERIFY(sqp->sq_isip == B_TRUE); + poll_again: sq_rx_ring = sqp->sq_rx_ring; sq_get_pkts = sq_rx_ring->rr_rx; @@ -1205,6 +1257,7 @@ squeue_worker_thr_control(squeue_t *sqp) ill_rx_ring_t *rx_ring; ASSERT(MUTEX_HELD(&sqp->sq_lock)); + VERIFY(sqp->sq_isip == B_TRUE); if (sqp->sq_state & SQS_POLL_RESTART) { /* Restart implies a previous quiesce. */ @@ -1316,6 +1369,11 @@ squeue_worker(squeue_t *sqp) for (;;) { for (;;) { + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + /* * If the poll thread has handed control to us * we need to break out of the wait. 
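
With sq_isip and SQS_EXIT in place, a non-TCP/IP consumer (the gsqueue module added
elsewhere in this patch is the intended one) can run a bare squeue through its whole
lifecycle without touching any conn_t or polling machinery. A kernel-context sketch;
the wait value of 0 and the wrapper names are assumptions:

    #include <sys/types.h>
    #include <inet/squeue.h>

    static squeue_t *
    my_sq_create(pri_t pri)
    {
            /* B_FALSE: not an IP squeue, so no conn_t semantics attach. */
            return (squeue_create(0, pri, B_FALSE));
    }

    static void
    my_sq_teardown(squeue_t *sqp)
    {
            /*
             * squeue_destroy() sets SQS_EXIT, wakes both service threads,
             * joins them, and frees the squeue. Its VERIFY insists polling
             * was never enabled, which always holds for non-IP squeues.
             */
            squeue_destroy(sqp);
    }
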
@@ -1412,6 +1470,7 @@
 
 again:
 	sqp = connp->conn_sqp;
+	VERIFY(sqp->sq_isip == B_TRUE);
 	mutex_enter(&sqp->sq_lock);
 
 	if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) {
@@ -1487,6 +1546,7 @@ void
 squeue_synch_exit(conn_t *connp)
 {
 	squeue_t *sqp = connp->conn_sqp;
+	VERIFY(sqp->sq_isip == B_TRUE);
 
 	mutex_enter(&sqp->sq_lock);
 	if (sqp->sq_run == curthread) {
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 1b20e40aca..3488630f2c 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -20,7 +20,7 @@
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent Inc. All rights reserved.
 * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
 */
@@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls;
 * by setting it to 0.
 */
#define	TCP_XMIT_LOWATER	4096
-#define	TCP_XMIT_HIWATER	49152
+#define	TCP_XMIT_HIWATER	128000
#define	TCP_RECV_LOWATER	2048
-#define	TCP_RECV_HIWATER	128000
+#define	TCP_RECV_HIWATER	1048576
 
/*
 * Bind hash list size and hash function. It has to be a power of 2 for
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 5a15aea4de..eec3beddd2 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -22,6 +22,7 @@
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
 */
/* Copyright (c) 1990 Mentat Inc. */
@@ -76,7 +77,8 @@
#include <inet/ipclassifier.h>
#include <sys/squeue_impl.h>
#include <inet/ipnet.h>
-#include <sys/ethernet.h>
+#include <sys/vxlan.h>
+#include <inet/inet_hash.h>
 
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
@@ -346,6 +348,89 @@ void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
 
typedef union T_primitives *t_primp_t;
 
/*
+ * Various protocols that encapsulate UDP have no real use for the source port.
+ * Instead, they want to vary the source port to provide better equal-cost
+ * multipathing and other systems that use fanout. Consider something like
+ * VXLAN. If you're actually sending multiple different streams to a single
+ * host and you don't vary the source port, then the tuple of ( SRC IP, DST IP,
+ * SRC Port, DST Port) will always be the same.
+ *
+ * Here, we return a port to hash this to, if we know how to hash it. If for
+ * some reason we can't perform an L4 hash, then we just return the default
+ * value, usually the default port. After we determine the hash we transform it
+ * so that it's in the range of [ min, max ].
+ *
+ * We'd like to avoid a pull up for the sake of performing the hash. If the
+ * first mblk_t doesn't have the full protocol header, then we just send it to
+ * the default. If for some reason we have an encapsulated packet that has its
+ * protocol header in different parts of an mblk_t, then we'll go with the
+ * default port. This means that if a driver isn't consistent about how it
+ * generates the frames for a given flow, it will not always be consistently
+ * hashed. That should be an uncommon event.
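
The [ min, max ] transform described above is the last two lines of
udp_srcport_hash(); standalone it is just a modulo shift into the port window. (When
max - min + 1 does not divide 2^64 there is a small modulo bias, which is harmless for
load spreading.) Restated:

    #include <stdint.h>

    /*
     * Map an arbitrary 64-bit flow hash into [min, max], as
     * udp_srcport_hash() does for the UDP source port window.
     */
    static uint16_t
    hash_to_port(uint64_t hash, uint16_t min, uint16_t max, uint16_t def)
    {
            uint32_t mod = (uint32_t)max - min + 1;

            if (hash == 0)          /* packet could not be hashed */
                    return (def);
            return ((uint16_t)(hash % mod) + min);
    }
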
+ */ +uint16_t +udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max, + uint16_t def) +{ + size_t szused = 0; + struct ether_header *ether; + struct ether_vlan_header *vether; + ip6_t *ip6h; + ipha_t *ipha; + uint16_t sap; + uint64_t hash; + uint32_t mod; + + ASSERT(min <= max); + + if (type != UDP_HASH_VXLAN) + return (def); + + if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))) + return (def); + + /* + * The following logic is VXLAN specific to get at the header, if we + * have formats, eg. GENEVE, then we should ignore this. + * + * The kernel overlay device often puts a first mblk_t for the data + * which is just the encap. If so, then we're going to use that and try + * to avoid a pull up. + */ + if (MBLKL(mp) == VXLAN_HDR_LEN) { + if (mp->b_cont == NULL) + return (def); + mp = mp->b_cont; + ether = (struct ether_header *)mp->b_rptr; + } else if (MBLKL(mp) < VXLAN_HDR_LEN) { + return (def); + } else { + szused = VXLAN_HDR_LEN; + ether = (struct ether_header *)((uintptr_t)mp->b_rptr + szused); + } + + /* Can we hold a MAC header? */ + if (MBLKL(mp) + szused < sizeof (struct ether_header)) + return (def); + + /* + * We need to lie about the starting offset into the message block for + * convenience. Undo it at the end. We know that inet_pkt_hash() won't + * modify the mblk_t. + */ + mp->b_rptr += szused; + hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 | + INET_PKT_HASH_L3 | INET_PKT_HASH_L4); + mp->b_rptr -= szused; + + if (hash == 0) + return (def); + + mod = max - min + 1; + return ((hash % mod) + min); +} + +/* * Return the next anonymous port in the privileged port range for * bind checking. * @@ -1583,6 +1668,11 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, *i1 = udp->udp_rcvhdr ? 1 : 0; mutex_exit(&connp->conn_lock); return (sizeof (int)); + case UDP_SRCPORT_HASH: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_vxlanhash; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } } mutex_enter(&connp->conn_lock); @@ -1718,6 +1808,26 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, udp->udp_rcvhdr = onoff; mutex_exit(&connp->conn_lock); return (0); + case UDP_SRCPORT_HASH: + /* + * This should have already been verified, but double + * check. + */ + if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { + return (error); + } + + /* First see if the val is something we understand */ + if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN) + return (EINVAL); + + if (!checkonly) { + mutex_enter(&connp->conn_lock); + udp->udp_vxlanhash = *i1; + mutex_exit(&connp->conn_lock); + } + /* Fully handled this option. */ + return (0); } break; } @@ -2001,13 +2111,25 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, uint32_t cksum; udp_t *udp = connp->conn_udp; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t ulp_hdr_len; + uint16_t srcport; data_len = msgdsize(data_mp); ulp_hdr_len = UDPH_SIZE; if (insert_spi) ulp_hdr_len += sizeof (uint32_t); + /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. 
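
udp_do_opt_set() above gates UDP_SRCPORT_HASH behind secpolicy_ip_config(), so only a
suitably privileged consumer (the kernel VXLAN overlay being the intended one) can
enable it; ordinary sockets get EPERM. From such a consumer the option is set like any
other, sketched here assuming UDP_SRCPORT_HASH and UDP_HASH_VXLAN are visible through
the UDP headers this patch touches:

    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/udp.h>    /* assumed home of UDP_SRCPORT_HASH */

    /*
     * Ask UDP to overwrite this socket's source port per packet with an
     * L2+L3+L4 hash of the inner (VXLAN-encapsulated) frame.
     */
    static int
    enable_vxlan_srcport_hash(int s)
    {
            int v = UDP_HASH_VXLAN;

            return (setsockopt(s, IPPROTO_UDP, UDP_SRCPORT_HASH,
                &v, sizeof (v)));
    }
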
+ */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo, ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp); if (mp == NULL) { @@ -2019,7 +2141,11 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length); - udpha->uha_src_port = connp->conn_lport; + if (hash_srcport == B_TRUE) { + udpha->uha_src_port = htons(srcport); + } else { + udpha->uha_src_port = connp->conn_lport; + } udpha->uha_dst_port = dstport; udpha->uha_checksum = 0; udpha->uha_length = htons(data_len); @@ -3194,6 +3320,7 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t pktlen; uint_t alloclen; uint_t copylen; @@ -3202,10 +3329,21 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udpha_t *udpha; uint32_t cksum; ip_pkt_t *ipp; + uint16_t srcport; ASSERT(MUTEX_HELD(&connp->conn_lock)); /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. + */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + + /* * Copy the header template and leave space for an SPI */ copylen = connp->conn_ht_iphc_len; @@ -3303,6 +3441,9 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, *((uint32_t *)(udpha + 1)) = 0; udpha->uha_dst_port = dstport; + if (hash_srcport == B_TRUE) + udpha->uha_src_port = htons(srcport); + return (mp); } diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index c279bb4a21..fec3dec4c9 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -292,6 +293,7 @@ opdes_t udp_opt_arr[] = { }, { UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, +{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 } }; /* diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 6a31ce5c22..3b07abc309 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _UDP_IMPL_H @@ -178,8 +179,11 @@ typedef struct udp_s { udp_issocket : 1, /* socket mode; sockfs is on top */ udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ udp_rcvhdr : 1, /* UDP_RCVHDR option */ + udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */ + /* Because there's only VXLAN, cheat */ + /* and only use a single bit */ - udp_pad_to_bit_31 : 29; + udp_pad_to_bit_31 : 28; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */ diff --git a/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c b/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c index 94323582d6..c2c16e848b 100644 --- a/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c +++ b/usr/src/uts/common/io/1394/targets/av1394/av1394_async.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ /* * av1394 asynchronous module @@ -359,9 +361,10 @@ av1394_async_poll(av1394_inst_t *avp, short events, int anyyet, short *reventsp, AV1394_TNF_ENTER(av1394_async_poll); if (events & POLLIN) { - if (av1394_peekq(rq)) { + if (av1394_peekq(rq)) *reventsp |= POLLIN; - } else if (!anyyet) { + + if ((!*reventsp && !anyyet) || (events & POLLET)) { mutex_enter(&ap->a_mutex); ap->a_pollevents |= POLLIN; *phpp = &ap->a_pollhead; diff --git a/usr/src/uts/common/io/aggr/aggr_port.c b/usr/src/uts/common/io/aggr/aggr_port.c index 00545d2c03..a39110255a 100644 --- a/usr/src/uts/common/io/aggr/aggr_port.c +++ b/usr/src/uts/common/io/aggr/aggr_port.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. */ /* @@ -528,8 +529,13 @@ aggr_port_promisc(aggr_port_t *port, boolean_t on) if (on) { mac_rx_clear(port->lp_mch); + /* We use the promisc callback because without hardware + * rings, we deliver through flows that will cause duplicate + * delivery of packets when we've flipped into this mode + * to compensate for the lack of hardware MAC matching + */ rc = mac_promisc_add(port->lp_mch, MAC_CLIENT_PROMISC_ALL, - aggr_recv_cb, port, &port->lp_mphp, + aggr_recv_promisc_cb, port, &port->lp_mphp, MAC_PROMISC_FLAGS_NO_TX_LOOP); if (rc != 0) { mac_rx_set(port->lp_mch, aggr_recv_cb, port); diff --git a/usr/src/uts/common/io/aggr/aggr_recv.c b/usr/src/uts/common/io/aggr/aggr_recv.c index 2bdb7872e3..0dfe234b70 100644 --- a/usr/src/uts/common/io/aggr/aggr_recv.c +++ b/usr/src/uts/common/io/aggr/aggr_recv.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. */ /* @@ -68,16 +69,27 @@ aggr_recv_lacp(aggr_port_t *port, mac_resource_handle_t mrh, mblk_t *mp) /* * Callback function invoked by MAC service module when packets are - * made available by a MAC port. + * made available by a MAC port, both in promisc_on mode and not. 
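
The aggr change above (and the dispatch function that follows) routes both the normal
MAC callback and the promiscuous callback through one path function, tagging each call
with the path it arrived on; packets whose path disagrees with the port's current
lp_promisc_on are freed, which is what prevents double delivery while the port is in
the flow-based fallback. The shape of the pattern, reduced to its core with model
names:

    #include <stdbool.h>

    typedef struct model_port {
            bool promisc_on;            /* models port->lp_promisc_on */
    } model_port_t;

    extern void model_deliver(model_port_t *, void *pkt);
    extern void model_free(void *pkt);

    /*
     * The same packet can arrive via the normal RX callback and via the
     * promiscuous callback; only the one matching the port's current
     * mode is delivered, so nothing is seen twice upstack.
     */
    static void
    model_recv(model_port_t *port, void *pkt, bool promisc_path)
    {
            if (port->promisc_on != promisc_path) {
                    model_free(pkt);
                    return;
            }
            model_deliver(port, pkt);
    }
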
*/ /* ARGSUSED */ -void -aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, - boolean_t loopback) +static void +aggr_recv_path_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback, boolean_t promisc_path) { aggr_port_t *port = (aggr_port_t *)arg; aggr_grp_t *grp = port->lp_grp; + /* In the case where lp_promisc_on has been turned on to + * compensate for insufficient hardware MAC matching and + * hardware rings are not in use we will fall back to + * using flows for delivery which can result in duplicates + * pushed up the stack. Only respect the chosen path. + */ + if (port->lp_promisc_on != promisc_path) { + freemsgchain(mp); + return; + } + if (grp->lg_lacp_mode == AGGR_LACP_OFF) { aggr_mac_rx(grp->lg_mh, mrh, mp); } else { @@ -161,3 +173,19 @@ aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, } } } + +/* ARGSUSED */ +void +aggr_recv_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback, B_FALSE); +} + +/* ARGSUSED */ +void +aggr_recv_promisc_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp, + boolean_t loopback) +{ + aggr_recv_path_cb(arg, mrh, mp, loopback, B_TRUE); +} diff --git a/usr/src/uts/common/io/axf/ax88172reg.h b/usr/src/uts/common/io/axf/ax88172reg.h new file mode 100644 index 0000000000..8ca6ebc187 --- /dev/null +++ b/usr/src/uts/common/io/axf/ax88172reg.h @@ -0,0 +1,163 @@ +/* + * @(#)ax88172reg.h 1.1 09/06/15 + * Macro definitions for ASIX AX88172 USB to fast ethernet controler + * based on ASIX AX88172/88772 data sheet + * This file is public domain. Coded by M.Murayama (KHF04453@nifty.com) + */ + +#ifndef __AX88172_H__ +#define __AX88172_H__ + +/* + * Vendor command definitions + */ +#define VCMD_READ_SRAM 0x02 +#define VCMD_WRITE_RXSRAM 0x03 +#define VCMD_WRITE_TXSRAM 0x04 +#define VCMD_SOFTWARE_MII_OP 0x06 +#define VCMD_READ_MII_REG 0x07 +#define VCMD_WRITE_MII_REG 0x08 +#define VCMD_READ_MII_OPMODE 0x09 +#define VCMD_HARDWARE_MII_OP 0x0a +#define VCMD_READ_SROM 0x0b +#define VCMD_WRITE_SROM 0x0c +#define VCMD_WRITE_SROM_ENABLE 0x0d +#define VCMD_WRITE_SROM_DISABLE 0x0e +#define VCMD_READ_RXCTRL 0x0f +#define VCMD_WRITE_RXCTRL 0x10 +#define VCMD_READ_IPGS 0x11 +#define VCMD_WRITE_IPG 0x12 +#define VCMD_WRITE_IPG1 0x13 +#define VCMD_WRITE_IPG2 0x14 +#define VCMD_READ_MCAST_FILTER 0x15 +#define VCMD_WRITE_MCAST_FILTER 0x16 +#define VCMD_READ_NODE_ID 0x17 +#define VCMD_READ_PHY_IDS 0x19 +#define VCMD_READ_MEDIUM_STATUS 0x1a +#define VCMD_WRITE_MEDIUM_STATUS 0x1b +#define VCMD_SET_MONITOR_MODE 0x1c +#define VCMD_GET_MONITOR_MODE 0x1d +#define VCMD_READ_GPIO 0x1e +#define VCMD_WRITE_GPIO 0x1f + +/* ax88772 only, currently not supported */ +#define VCMD_WRITE_IPGS_88772 0x12 +#define VCMD_READ_NODE_ID_88772 0x13 +#define VCMD_WRITE_NODE_ID_88772 0x14 +#define VCMD_WRITE_TEST_REG_88772 0x17 +#define VCMD_SOFTWARE_RESET_88772 0x20 +#define VCMD_READ_PHY_SELECT_88772 0x21 +#define VCMD_WRITE_PHY_SELECT_88772 0x22 + + +/* + * Register definitions + */ + +/* Rx control register */ +#define RCR_SO 0x80 /* Start Operation */ +#define RCR_AP_88772 0x20 /* accept physical address from mcast filter */ +#define RCR_AM 0x10 /* accept multicast address */ +#define RCR_AB 0x08 /* accept broadcast address */ +#define RCR_SEP 0x04 /* save error packet */ +#define RCR_AMALL 0x02 /* accept all multicast address */ +#define RCR_PRO 0x01 /* promiscious, all frames received */ + +#define RCR_MFB 0x0300 +#define RCR_MFB_SHIFT 8 +#define RCR_MFB_2K (0U << RCR_MFB_SHIFT) +#define 
RCR_MFB_4K (1U << RCR_MFB_SHIFT) +#define RCR_MFB_8K (2U << RCR_MFB_SHIFT) +#define RCR_MFB_16K (3U << RCR_MFB_SHIFT) + +#define RCR_BITS \ + "\020" \ + "\010SO" \ + "\006AP" \ + "\005AM" \ + "\004AB" \ + "\003SEP" \ + "\002AMALL" \ + "\001PRO" + +/* Medium status register */ +#define MSR_SM 0x1000 /* super mac support */ +#define MSR_SBP 0x0800 /* stop backpressure */ +#define MSR_PS 0x0200 /* port speed in mii mode */ +#define MSR_RE 0x0100 /* rx enable */ +#define MSR_PF 0x0080 /* check only length/type for pause frame */ +#define MSR_JFE 0x0040 /* jumbo frame enable */ +#define MSR_TFC 0x0020 /* tx flow control enable */ +#define MSR_RFC 0x0010 /* rx flow control enable (178) */ +#define MSR_FCEN 0x0010 /* flow control enable (172/772) */ +#define MSR_ENCK 0x0008 /* Enable GTX_CLK and TXC clock output (178) */ +#define MSR_TXABT 0x0004 /* Tx abort allow, always set */ +#define MSR_FDPX 0x0002 /* full duplex */ +#define MSR_GM 0x0001 /* Gigabit mode (178) */ + +#define MSR_BITS \ + "\020" \ + "\015SM" \ + "\014SBP" \ + "\012PS" \ + "\011RE" \ + "\005FCEN" \ + "\004ENCK" \ + "\003TXABT" \ + "\002FDPX" \ + "\001GM" + +/* monitor mode register */ +#define MMR_RWMP 0x04 /* remote wakeup by magic pkt */ +#define MMR_RWLU 0x02 /* remote wakeup by linkup */ +#define MMR_MOM 0x01 /* monitor mode 1:en, 0:dis */ + +#define MMR_BITS \ + "\020" \ + "\003RWMP" \ + "\002RWLU" \ + "\001MOM" + +/* GPIO register */ +#define GPIO_RSE 0x80 /* reload serial eeprom (88772)*/ +#define GPIO_DATA2 0x20 +#define GPIO_EN2 0x10 +#define GPIO_DATA1 0x08 +#define GPIO_EN1 0x04 +#define GPIO_DATA0 0x02 +#define GPIO_EN0 0x01 + +#define GPIO_BITS \ + "\020" \ + "\010RSE" \ + "\006DATA2" \ + "\005EN2" \ + "\004DATA1" \ + "\003EN1" \ + "\002DATA0" \ + "\001EN0" + +/* Software reset register */ +#define SWRST_IPPD 0x40 /* internal phy power down control */ +#define SWRST_IPRL 0x20 /* internal phy reset control */ +#define SWRST_BZ 0x10 /* force Bulk In to return zero-length pkt */ +#define SWRST_PRL 0x08 /* external phy reset pin level */ +#define SWRST_PRTE 0x04 /* external phy tri-state enable */ +#define SWRST_RT 0x02 /* clear frame length error for Bulk-Out */ +#define SWRST_RR 0x01 /* clear frame length error for Bulk-In */ + +#define SWRST_BITS \ + "\020" \ + "\007IPPD" \ + "\006IPRL" \ + "\005BZ" \ + "\004PRL" \ + "\003PRTE" \ + "\002RT" \ + "\001RR" + +/* Software PHY Select Status register */ +#define SPSS_ASEL 0x02 /* 1:auto select 0:manual select */ +#define SPSS_PSEL 0x01 /* 1:intenal phy, 0:external (when ASEL=0) */ + +#endif /* __AX88172_H__ */ diff --git a/usr/src/uts/common/io/axf/axf_usbgem.c b/usr/src/uts/common/io/axf/axf_usbgem.c new file mode 100644 index 0000000000..28963f6849 --- /dev/null +++ b/usr/src/uts/common/io/axf/axf_usbgem.c @@ -0,0 +1,1539 @@ +/* + * axf_usbgem.c : ASIX AX88172/772 USB to Fast Ethernet Driver for Solaris + * + * Copyright (c) 2004-2012 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#pragma ident "@(#)axf_usbgem.c 1.3 12/02/09" + +/* + * Changelog: + */ + +/* + * TODO + * handle RXMODE_ENABLE in set_rx_filter() + */ +/* ======================================================= */ + +/* + * Solaris system header files and macros + */ + +/* minimum kernel headers for drivers */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/byteorder.h> + +/* ethernet stuff */ +#include <sys/ethernet.h> + +/* interface card depend stuff */ +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/usb/usba.h> +#include "usbgem.h" + +/* hardware stuff */ +#include "usbgem_mii.h" +#include "ax88172reg.h" + +char ident[] = "ax88x72 usbnic driver v" VERSION; + +/* + * Useful macros + */ +#define CHECK_AND_JUMP(err, label) if (err != USB_SUCCESS) goto label +#define LE16P(p) ((((uint8_t *)(p))[1] << 8) | ((uint8_t *)(p))[0]) + +#define AX88172(dp) \ + (((struct axf_dev *)(dp)->private)->chip->type == CHIP_TYPE_AX88172) + +#define AX88772(dp) \ + (((struct axf_dev *)(dp)->private)->chip->type == CHIP_TYPE_AX88772) + +/* + * Debugging + */ +#ifdef DEBUG_LEVEL +static int axf_debug = DEBUG_LEVEL; +#define DPRINTF(n, args) if (axf_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Our configration for ax88172 + */ +/* timeouts */ +#define ONESEC (drv_usectohz(1*1000000)) + +/* + * RX/TX buffer size + */ + +/* + * Local device definitions + */ +struct chip_info { + uint16_t vid; /* usb vendor id */ + uint16_t pid; /* usb product id */ + int type; + uint8_t gpio_reset[2]; + uint8_t gpio_speed[2]; + uint8_t gpio_duplex[2]; + char *name; +#define CHIP_TYPE_AX88172 0 +#define CHIP_TYPE_AX88772 1 +#define CHIP_TYPE_AX88178 2 +}; + +#define GPIO_DEFAULT {0x00, 0x15}, {0, 0}, {0, 0} +struct chip_info chiptbl_88x7x[] = { +/* AX88172 */ +{ + /* Planex UE2-100TX, Hawking UF200, TrendNet TU2-ET100 */ + 0x07b8, 0x420a, CHIP_TYPE_AX88172, + + /* + * the default setting covers below: + * gpio bit2 has to be 0 and gpio bit0 has to be 1 + */ + {0, 0}, + {GPIO_EN1, GPIO_DATA1 | GPIO_EN1}, + {0, 0}, + "Planex UE2-100TX", /* tested */ +}, +{ + 0x2001, 0x1a00, CHIP_TYPE_AX88172, + {0x9f, 0x9e}, {0, 0}, {0, 0}, + "D-Link dube100", /* XXX */ +}, +{ + 0x077b, 0x2226, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Linksys USB200M", +}, +{ + 0x0846, 0x1040, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Netgear FA120", +}, +{ + 0x0b95, 0x1720, 
CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Intellinet, ST Lab USB Ethernet", +}, +{ + 0x08dd, 0x90ff, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Billionton Systems, USB2AR", +}, +{ + 0x0557, 0x2009, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "ATEN UC210T", +}, +{ + 0x0411, 0x003d, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Buffalo LUA-U2-KTX", +}, +{ + 0x6189, 0x182d, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Sitecom LN-029 USB 2.0 10/100 Ethernet adapter", +}, +{ + 0x07aa, 0x0017, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "corega FEther USB2-TX", +}, +{ + 0x1189, 0x0893, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "Surecom EP-1427X-2", +}, +{ + 0x1631, 0x6200, CHIP_TYPE_AX88172, + GPIO_DEFAULT, + "goodway corp usb gwusb2e", +}, +/* AX88772 and AX88178 */ +{ + 0x13b1, 0x0018, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "Linksys USB200M rev.2", +}, +{ + 0x1557, 0x7720, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "0Q0 cable ethernet", +}, +{ + 0x07d1, 0x3c05, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "DLink DUB E100 ver B1", +}, +{ + 0x2001, 0x3c05, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "DLink DUB E100 ver B1(2)", +}, +{ + 0x05ac, 0x1402, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "Apple Ethernet USB Adapter", +}, +{ + 0x1737, 0x0039, CHIP_TYPE_AX88178, + {0, 0}, {0, 0}, {0, 0}, + "Linksys USB1000", +}, +{ + 0x0411, 0x006e, CHIP_TYPE_AX88178, + {0, 0}, {0, 0}, {0, 0}, + "Buffalo LUA-U2-KGT/LUA-U2-GT", +}, +{ + 0x04bb, 0x0930, CHIP_TYPE_AX88178, + {0, 0}, {0, 0}, {0, 0}, + "I/O DATA ETG-US2", +}, +{ + 0x050d, 0x5055, CHIP_TYPE_AX88178, + {0, 0}, {0, 0}, {0, 0}, + "Belkin F5D5055", +}, +{ + /* generic ax88772 must be the last entry */ + /* planex UE-200TX-G */ + 0x0b95, 0x7720, CHIP_TYPE_AX88772, + {0, 0}, {0, 0}, {0, 0}, + "ASIX AX88772/AX88178", /* tested */ +}, +}; + +#define CHIPTABLESIZE (sizeof (chiptbl_88x7x) / sizeof (struct chip_info)) + +struct axf_dev { + /* + * Misc HW information + */ + struct chip_info *chip; + uint8_t ipg[3]; + uint8_t gpio; + uint16_t rcr; + uint16_t msr; + uint8_t last_link_state; + boolean_t phy_has_reset; +}; + +/* + * private functions + */ + +/* mii operations */ +static uint16_t axf_mii_read(struct usbgem_dev *, uint_t, int *errp); +static void axf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp); + +/* nic operations */ +static int axf_reset_chip(struct usbgem_dev *); +static int axf_init_chip(struct usbgem_dev *); +static int axf_start_chip(struct usbgem_dev *); +static int axf_stop_chip(struct usbgem_dev *); +static int axf_set_media(struct usbgem_dev *); +static int axf_set_rx_filter(struct usbgem_dev *); +static int axf_get_stats(struct usbgem_dev *); +static void axf_interrupt(struct usbgem_dev *, mblk_t *); + +/* packet operations */ +static mblk_t *axf_tx_make_packet(struct usbgem_dev *, mblk_t *); +static mblk_t *axf_rx_make_packet(struct usbgem_dev *, mblk_t *); + +/* =============================================================== */ +/* + * I/O functions + */ +/* =============================================================== */ +#define OUT(dp, req, val, ix, len, buf, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ (req), \ + /* wValue */ (val), \ + /* wIndex */ (ix), \ + /* wLength */ (len), \ + /* value */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +#define IN(dp, req, val, ix, len, buf, errp, label) \ + if ((*(errp) = usbgem_ctrl_in((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | 
USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ (req), \ + /* wValue */ (val), \ + /* wIndex */ (ix), \ + /* wLength */ (len), \ + /* valuep */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +/* =============================================================== */ +/* + * Hardware manipulation + */ +/* =============================================================== */ +static int +axf_reset_phy(struct usbgem_dev *dp) +{ + uint8_t phys[2]; + uint8_t val8; + int err; + struct axf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + if (AX88172(dp)) { + delay(drv_usectohz(5000)); + IN(dp, VCMD_READ_GPIO, 0, 0, 1, &val8, &err, usberr); + + DPRINTF(0, (CE_CONT, "!%s: %s: gpio 0x%b", + dp->name, __func__, val8, GPIO_BITS)); + + /* reset MII PHY */ + val8 = lp->chip->gpio_reset[1] + | lp->chip->gpio_speed[dp->speed] + | lp->chip->gpio_duplex[dp->full_duplex]; + + OUT(dp, VCMD_WRITE_GPIO, + val8, 0, 0, NULL, &err, usberr); + delay(drv_usectohz(5000)); + + val8 = lp->chip->gpio_reset[0] + | lp->chip->gpio_speed[dp->speed] + | lp->chip->gpio_duplex[dp->full_duplex]; + + OUT(dp, VCMD_WRITE_GPIO, + val8, 0, 0, NULL, &err, usberr); + delay(drv_usectohz(5000)); + } else { + lp->gpio = GPIO_RSE | GPIO_DATA2 | GPIO_EN2; + OUT(dp, VCMD_WRITE_GPIO, lp->gpio, 0, + 0, NULL, &err, usberr); + drv_usecwait(1000); + + OUT(dp, VCMD_WRITE_PHY_SELECT_88772, + dp->mii_phy_addr == 16 ? 1 : 0, 0, 0, NULL, &err, usberr); + + OUT(dp, VCMD_SOFTWARE_RESET_88772, + SWRST_IPPD | SWRST_PRL, 0, 0, NULL, &err, usberr); + delay(drv_usectohz(150*1000)); + OUT(dp, VCMD_SOFTWARE_RESET_88772, + 0, 0, 0, NULL, &err, usberr); + + OUT(dp, VCMD_SOFTWARE_RESET_88772, + dp->mii_phy_addr == 16 ? SWRST_IPRL : SWRST_PRTE, + 0, 0, NULL, &err, usberr); + delay(drv_usectohz(150*1000)); + } + + + return (USB_SUCCESS); + +usberr: + return (USB_FAILURE); +} + +static int +axf_reset_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + + if (AX88172(dp)) { + /* there is no way to reset the nic */ + return (USB_SUCCESS); + } +#ifdef NEVER + OUT(dp, VCMD_SOFTWARE_RESET_88772, + SWRST_RR | SWRST_RT, 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_SOFTWARE_RESET_88772, + 0, 0, 0, NULL, &err, usberr); +usberr: +#endif + return (err); +} + +/* + * Setup ax88172 + */ +static int +axf_init_chip(struct usbgem_dev *dp) +{ + int i; + uint32_t val; + int err = USB_SUCCESS; + uint16_t reg; + uint8_t buf[2]; + uint16_t tmp16; + struct axf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* rx control register: read the default value */ + if (!AX88172(dp)) { + /* clear rx control */ + OUT(dp, VCMD_WRITE_RXCTRL, 0, 0, 0, NULL, &err, usberr); + } + + IN(dp, VCMD_READ_RXCTRL, 0, 0, 2, buf, &err, usberr); + lp->rcr = LE16P(buf); + DPRINTF(0, (CE_CONT, "!%s: %s: rcr(default):%b", + dp->name, __func__, lp->rcr, RCR_BITS)); + + lp->rcr &= ~RCR_SO; + + /* Media status register */ + if (AX88172(dp)) { +#ifdef notdef + lp->msr = MSR_TXABT; +#else + lp->msr = 0; +#endif + } else { + lp->msr = MSR_RE | MSR_TXABT; + } + DPRINTF(0, (CE_CONT, "!%s: %s: msr:%b", + dp->name, __func__, lp->msr, MSR_BITS)); + err = axf_set_media(dp); + CHECK_AND_JUMP(err, usberr); + + /* write IPG0-2 registers */ + if (AX88172(dp)) { + OUT(dp, VCMD_WRITE_IPG, lp->ipg[0], 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_IPG1, lp->ipg[1], 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_IPG2, lp->ipg[2], 0, 0, NULL, &err, usberr); + } else { + /* EMPTY */ + } +#ifdef ENABLE_RX_IN_INIT_CHIP + /* enable 
Rx */ + lp->rcr |= RCR_SO; + OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr); +#endif +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end (%d: %s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); + return (err); +} + +static int +axf_start_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + struct axf_dev *lp = dp->private; +#ifndef ENABLE_RX_IN_INIT_CHIP + /* enable Rx */ + lp->rcr |= RCR_SO; + OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end (%d: %s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); +#endif + return (err); +} + +static int +axf_stop_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + struct axf_dev *lp = dp->private; + + /* Disable Rx */ + lp->rcr &= ~RCR_SO; + OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, 0, NULL, &err, usberr); + + /* + * Restore the factory mac address + * if we have changed the current mac address + */ + if (!AX88172(dp) && + bcmp(dp->dev_addr.ether_addr_octet, + dp->cur_addr.ether_addr_octet, + ETHERADDRL) != 0) { + OUT(dp, VCMD_WRITE_NODE_ID_88772, 0, 0, + ETHERADDRL, dp->dev_addr.ether_addr_octet, &err, usberr); + } +usberr: + return (axf_reset_chip(dp)); +} + +static int +axf_get_stats(struct usbgem_dev *dp) +{ + /* EMPTY */ + return (USB_SUCCESS); +} + +static uint_t +axf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr) +{ + return (usbgem_ether_crc_be(addr) >> (32 - 6)); +} + +static int +axf_set_rx_filter(struct usbgem_dev *dp) +{ + int i; + uint8_t mode; + uint8_t mhash[8]; + uint8_t buf[2]; + uint_t h; + int err = USB_SUCCESS; + struct axf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called, rxmode:%x", + dp->name, __func__, dp->rxmode)); + + if (lp->rcr & RCR_SO) { + /* set promiscuous mode before changing it. */ + OUT(dp, VCMD_WRITE_RXCTRL, + lp->rcr | RCR_PRO, 0, 0, NULL, &err, usberr); + } + + lp->rcr &= ~(RCR_AP_88772 | RCR_AM | RCR_SEP | RCR_AMALL | RCR_PRO); + mode = RCR_AB; /* accept broadcast packets */ + + bzero(mhash, sizeof (mhash)); + + if (dp->rxmode & RXMODE_PROMISC) { + /* promiscuous mode implies all multicast and all physical */ + mode |= RCR_PRO; + } else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 32) { + /* accept all multicast packets */ + mode |= RCR_AMALL; + } else if (dp->mc_count > 0) { + /* + * make a hash table to select only the + * interesting multicast addresses. + */ + mode |= RCR_AM; + for (i = 0; i < dp->mc_count; i++) { + h = dp->mc_list[i].hash; + mhash[h / 8] |= 1 << (h % 8); + } + } + if (AX88172(dp)) { + if (bcmp(dp->dev_addr.ether_addr_octet, + dp->cur_addr.ether_addr_octet, ETHERADDRL) != 0) { + /* + * we use promiscuous mode instead of changing the + * mac address in ax88172 + */ + mode |= RCR_PRO; + } + } else { + OUT(dp, VCMD_WRITE_NODE_ID_88772, 0, 0, + ETHERADDRL, dp->cur_addr.ether_addr_octet, &err, usberr); + } + lp->rcr |= mode; + + /* set multicast hash table */ + if (mode & RCR_AM) { + /* need to set up multicast hash table */ + OUT(dp, VCMD_WRITE_MCAST_FILTER, 0, 0, + sizeof (mhash), mhash, &err, usberr); + } + + /* update rcr */ + OUT(dp, VCMD_WRITE_RXCTRL, lp->rcr, 0, + 0, NULL, &err, usberr); + +#if DEBUG_LEVEL > 1 + /* verify rxctrl reg */ + IN(dp, VCMD_READ_RXCTRL, 0, 0, 2, buf, &err, usberr); + cmn_err(CE_CONT, "!%s: %s: rcr:%b returned", + dp->name, __func__, LE16P(buf), RCR_BITS); +#endif +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end (%d: %s)", + dp->name, __func__, + err, err == USB_SUCCESS ? 
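+ /*
+  * Editor's note: a minimal sketch of how the 64-bit multicast hash
+  * filter populated above works.  axf_mcast_hash() takes the
+  * big-endian CRC32 of the 6-byte MAC address (usbgem_ether_crc_be())
+  * and keeps the top 6 bits, selecting one of 64 buckets; bucket h
+  * sets bit (h % 8) of byte (h / 8) in the 8-byte mhash array:
+  *
+  *	uint_t h = usbgem_ether_crc_be(addr) >> (32 - 6);	/- 0..63 -/
+  *	mhash[h / 8] |= 1 << (h % 8);
+  *
+  * The filter is approximate: colliding addresses share a bucket, so
+  * extra frames can pass and exact filtering remains software's job.
+  */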
"success" : "error")); + return (err); +} + +static int +axf_set_media(struct usbgem_dev *dp) +{ + uint8_t val8; + uint8_t gpio; + uint8_t gpio_old; + int err = USB_SUCCESS; + uint16_t msr; + struct axf_dev *lp = dp->private; + + IN(dp, VCMD_READ_GPIO, 0, 0, 1, &gpio, &err, usberr); + + DPRINTF(0, (CE_CONT, "!%s: %s: called, gpio:%b", + dp->name, __func__, gpio, GPIO_BITS)); + + msr = lp->msr; + gpio_old = gpio; + gpio = lp->chip->gpio_reset[0]; + + /* setup speed */ + if (AX88172(dp)) { + /* EMPTY */ + } else { + msr &= ~(MSR_PS | MSR_GM | MSR_ENCK); + + switch (dp->speed) { + case USBGEM_SPD_1000: + msr |= MSR_GM | MSR_ENCK; + break; + + case USBGEM_SPD_100: + msr |= MSR_PS; + break; + + case USBGEM_SPD_10: + break; + } + } + gpio |= lp->chip->gpio_speed[dp->speed == USBGEM_SPD_100 ? 1 : 0]; + + /* select duplex */ + msr &= ~MSR_FDPX; + if (dp->full_duplex) { + msr |= MSR_FDPX; + + /* select flow control */ + if (AX88172(dp)) { + msr &= ~MSR_FCEN; + switch (dp->flow_control) { + case FLOW_CONTROL_TX_PAUSE: + case FLOW_CONTROL_SYMMETRIC: + case FLOW_CONTROL_RX_PAUSE: + msr |= MSR_FCEN; + break; + } + } else { + msr &= ~(MSR_RFC | MSR_TFC); + switch (dp->flow_control) { + case FLOW_CONTROL_TX_PAUSE: + msr |= MSR_TFC; + break; + + case FLOW_CONTROL_SYMMETRIC: + msr |= MSR_TFC | MSR_RFC; + break; + + case FLOW_CONTROL_RX_PAUSE: + msr |= MSR_RFC; + break; + } + } + } + gpio |= lp->chip->gpio_duplex[dp->full_duplex ? 1 : 0]; + + /* update medium status register */ + lp->msr = msr; + OUT(dp, VCMD_WRITE_MEDIUM_STATUS, lp->msr, 0, + 0, NULL, &err, usberr); + + if (gpio != gpio_old) { + /* LED control required for some products */ + OUT(dp, VCMD_WRITE_GPIO, + gpio, 0, 0, NULL, &err, usberr); + } + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end (%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? 
"success" : "error")); + return (err); +} + +#define FILL_PKT_HEADER(bp, len) { \ + (bp)[0] = (uint8_t)(len); \ + (bp)[1] = (uint8_t)((len) >> 8); \ + (bp)[2] = (uint8_t)(~(len)); \ + (bp)[3] = (uint8_t)((~(len)) >> 8); \ +} + +#define PKT_HEADER_SIZE 4 + +/* + * send/receive packet check + */ +static mblk_t * +axf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + int n; + size_t len; + size_t pkt_size; + mblk_t *new; + mblk_t *tp; + uint8_t *bp; + uint8_t *last_pos; + uint_t align_mask; + size_t header_size; + int pad_size; + + len = msgdsize(mp); + + if (AX88172(dp)) { +#ifdef notdef + align_mask = 63; +#else + align_mask = 511; +#endif + header_size = 0; + + if (len >= ETHERMIN && mp->b_cont == NULL && + (len & align_mask) != 0) { + /* use the mp "as is" */ + return (mp); + } + } else { + align_mask = 511; + header_size = PKT_HEADER_SIZE; + } + + /* + * re-allocate the mp + */ + /* minimum ethernet packet size of ETHERMIN */ + pkt_size = max(len, ETHERMIN); + + if (((pkt_size + header_size) & align_mask) == 0) { + /* padding is required in usb communication */ + pad_size = PKT_HEADER_SIZE; + } else { + pad_size = 0; + } + + if ((new = allocb(header_size + pkt_size + pad_size, 0)) == NULL) { + return (NULL); + } + + bp = new->b_rptr; + if (header_size) { + uint16_t tmp; + + /* add a header */ + tmp = (uint16_t)pkt_size; + FILL_PKT_HEADER(bp, tmp); + bp += header_size; + } + + /* copy contents of the buffer */ + for (tp = mp; tp; tp = tp->b_cont) { + n = tp->b_wptr - tp->b_rptr; + bcopy(tp->b_rptr, bp, n); + bp += n; + } + + /* add pads for ethernet packets */ + last_pos = new->b_rptr + header_size + pkt_size; + while (bp < last_pos) { + *bp++ = 0; + } + + /* add a zero-length pad segment for usb communications */ + if (pad_size) { + /* add a dummy header for zero-length packet */ + FILL_PKT_HEADER(bp, 0); + bp += pad_size; + } + + /* close the payload of the packet */ + new->b_wptr = bp; + + return (new); +} + +static void +axf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n) +{ + int i; + + for (i = 0; i < n; i += 8, bp += 8) { + cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x", + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]); + } +} + +static mblk_t * +axf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + mblk_t *tp; + int rest; + + if (AX88172(dp)) { + return (mp); + } + + tp = mp; + rest = tp->b_wptr - tp->b_rptr; + + if (rest <= PKT_HEADER_SIZE) { + /* + * the usb bulk-in frame doesn't include any valid + * ethernet packets. 
+ */ + return (NULL); + } + + for (;;) { + uint16_t len; + uint16_t cksum; + + /* analyse the header of the received usb frame */ + len = LE16P(tp->b_rptr + 0); + cksum = LE16P(tp->b_rptr + 2); + + /* test if the header is valid */ + if (len + cksum != 0xffff) { + /* discard the whole packet */ + cmn_err(CE_WARN, + "!%s: %s: corrupted header:%04x %04x", + dp->name, __func__, len, cksum); + return (NULL); + } +#if DEBUG_LEVEL > 0 + if (len < ETHERMIN || len > ETHERMAX) { + cmn_err(CE_NOTE, + "!%s: %s: incorrect pktsize:%d", + dp->name, __func__, len); + } +#endif + /* extract an ethernet packet from the bulk-in frame */ + tp->b_rptr += PKT_HEADER_SIZE; + tp->b_wptr = tp->b_rptr + len; + + if (len & 1) { + /* + * skip a trailing pad byte if the packet + * length is odd + */ + len++; + } + rest -= len + PKT_HEADER_SIZE; + + if (rest <= PKT_HEADER_SIZE) { + /* no more valid ethernet packets */ + break; + } + +#if DEBUG_LEVEL > 10 + axf_dump_packet(dp, tp->b_wptr, 18); +#endif + /* allocate a mblk_t header for the next ethernet packet */ + if ((tp->b_next = dupb(mp)) == NULL) { + /* allocation failed; return the packets parsed so far */ + break; + } + tp->b_next->b_rptr = tp->b_rptr + len; + tp = tp->b_next; + } + + return (mp); +} + +/* + * MII Interfaces + */ +static uint16_t +axf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + uint8_t buf[2]; + uint16_t val; + + DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d", + dp->name, __func__, index)); + + /* switch to software MII operation mode */ + OUT(dp, VCMD_SOFTWARE_MII_OP, 0, 0, 0, NULL, errp, usberr); + + /* Read MII register */ + IN(dp, VCMD_READ_MII_REG, dp->mii_phy_addr, index, + 2, buf, errp, usberr); + + /* switch to hardware MII operation mode */ + OUT(dp, VCMD_HARDWARE_MII_OP, 0, 0, 0, NULL, errp, usberr); + + return (LE16P(buf)); + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); + return (0); +} + +static void +axf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp) +{ + uint8_t buf[2]; + + DPRINTF(4, (CE_CONT, "!%s: %s called, reg:%x val:%x", + dp->name, __func__, index, val)); + + /* switch to software MII operation mode */ + OUT(dp, VCMD_SOFTWARE_MII_OP, 0, 0, 0, NULL, errp, usberr); + + /* Write to the specified MII register */ + buf[0] = (uint8_t)val; + buf[1] = (uint8_t)(val >> 8); + OUT(dp, VCMD_WRITE_MII_REG, dp->mii_phy_addr, index, + 2, buf, errp, usberr); + + /* switch to hardware MII operation mode */ + OUT(dp, VCMD_HARDWARE_MII_OP, 0, 0, 0, NULL, errp, usberr); + +usberr: + ; +} + +static void +axf_interrupt(struct usbgem_dev *dp, mblk_t *mp) +{ + uint8_t *bp; + struct axf_dev *lp = dp->private; + + bp = mp->b_rptr; + + DPRINTF(2, (CE_CONT, + "!%s: %s: size:%d, %02x %02x %02x %02x %02x %02x %02x %02x", + dp->name, __func__, mp->b_wptr - mp->b_rptr, + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7])); + + if (lp->last_link_state ^ bp[2]) { + usbgem_mii_update_link(dp); + } + + lp->last_link_state = bp[2]; +} + +/* ======================================================== */ +/* + * OS depend (device driver DKI) routine + */ +/* ======================================================== */ +#ifdef DEBUG_LEVEL +static void +axf_eeprom_dump(struct usbgem_dev *dp, int size) +{ + int i; + int err; + uint8_t w0[2], w1[2], w2[2], w3[2]; + + cmn_err(CE_CONT, "!%s: eeprom dump:", dp->name); + + err = USB_SUCCESS; + + for (i = 0; i < size; i += 4) { + IN(dp, VCMD_READ_SROM, i + 0, 0, 2, w0, &err, usberr); + IN(dp, VCMD_READ_SROM, i + 1, 0, 2, w1, &err, usberr); + IN(dp, VCMD_READ_SROM, i + 2, 0, 2, w2, &err, usberr); + IN(dp, VCMD_READ_SROM, i + 3, 
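+ /*
+  * Editor's note: every MII access above follows the same bracketing
+  * protocol: switch the MAC to software MII operation, perform the
+  * register access, then restore hardware MII operation.  A condensed
+  * sketch of the read side (this combined form is illustrative only,
+  * not a helper that exists in the driver):
+  *
+  *	OUT(dp, VCMD_SOFTWARE_MII_OP, 0, 0, 0, NULL, errp, usberr);
+  *	IN(dp, VCMD_READ_MII_REG, dp->mii_phy_addr, index, 2, buf,
+  *	    errp, usberr);
+  *	OUT(dp, VCMD_HARDWARE_MII_OP, 0, 0, 0, NULL, errp, usberr);
+  *
+  * Restoring hardware mode matters even on the success path, since the
+  * MAC presumably polls the PHY itself while in hardware MII mode.
+  */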
0, 2, w3, &err, usberr); + cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x", + i, + (w0[1] << 8) | w0[0], + (w1[1] << 8) | w1[0], + (w2[1] << 8) | w2[0], + (w3[1] << 8) | w3[0]); + } +usberr: + ; +} +#endif + +static int +axf_attach_chip(struct usbgem_dev *dp) +{ + uint8_t phys[2]; + int err; + uint_t vcmd; + int ret; +#ifdef CONFIG_FULLSIZE_VLAN + uint8_t maxpktsize[2]; + uint16_t vlan_pktsize; +#endif +#ifdef DEBUG_LEVEL + uint8_t val8; +#endif + struct axf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s enter", dp->name, __func__)); + + ret = USB_SUCCESS; + /* + * the mac address in EEPROM has been loaded into the ID registers. + */ + vcmd = AX88172(dp) ? VCMD_READ_NODE_ID : VCMD_READ_NODE_ID_88772; + IN(dp, vcmd, 0, 0, + ETHERADDRL, dp->dev_addr.ether_addr_octet, &err, usberr); + + /* + * setup IPG values + */ + lp->ipg[0] = 0x15; + lp->ipg[1] = 0x0c; + lp->ipg[2] = 0x12; + + /* + * We cannot scan for the phy because the nic returns an undefined + * value, i.e. residual garbage, when no MII phy is at the + * specified index. + */ +#ifdef DEBUG_LEVELx + if (lp->chip->vid == 0x07b8 && lp->chip->pid == 0x420a) { + /* + * restore the original phy address of brain + * damaged Planex UE2-100TX + */ + OUT(dp, VCMD_WRITE_SROM_ENABLE, 0, 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM, 0x11, 0xe004, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM_DISABLE, 0, 0, 0, NULL, &err, usberr); + } +#endif + if (AX88172(dp)) { + IN(dp, VCMD_READ_PHY_IDS, 0, 0, 2, &phys, &err, usberr); + dp->mii_phy_addr = phys[1]; + DPRINTF(0, (CE_CONT, "!%s: %s: phys_addr:%d %d", + dp->name, __func__, phys[0], phys[1])); + } else { + /* use built-in phy */ + dp->mii_phy_addr = 0x10; + } + + dp->misc_flag |= USBGEM_VLAN; +#ifdef CONFIG_FULLSIZE_VLAN + if (AX88172(dp) || AX88772(dp)) { + /* check max packet size in srom */ + IN(dp, VCMD_READ_SROM, 0x10, 0, 2, maxpktsize, &err, usberr); + vlan_pktsize = ETHERMAX + ETHERFCSL + 4 /* VTAG_SIZE */; + + if (LE16P(maxpktsize) < vlan_pktsize) { + cmn_err(CE_CONT, + "!%s: %s: max packet size in srom is too small, " + "changing %d -> %d; power cycle the device", + dp->name, __func__, + LE16P(maxpktsize), vlan_pktsize); + OUT(dp, VCMD_WRITE_SROM_ENABLE, + 0, 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM, 0x10, + vlan_pktsize, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM_DISABLE, + 0, 0, 0, NULL, &err, usberr); + + /* need to power off the device */ + ret = USB_FAILURE; + } + } +#endif +#ifdef DEBUG_LEVEL + IN(dp, VCMD_READ_GPIO, 0, 0, 1, &val8, &err, usberr); + cmn_err(CE_CONT, + "!%s: %s: ipg 0x%02x 0x%02x 0x%02x, gpio 0x%b", + dp->name, __func__, lp->ipg[0], lp->ipg[1], lp->ipg[2], + val8, GPIO_BITS); +#endif + /* fix rx buffer size */ + if (!AX88172(dp)) { + dp->rx_buf_len = 2048; + } + +#if DEBUG_LEVEL > 0 + axf_eeprom_dump(dp, 0x20); +#endif + return (ret); + +usberr: + cmn_err(CE_WARN, "%s: %s: usb error detected (%d)", + dp->name, __func__, err); + return (USB_FAILURE); +} + +static int +axf_scan_phy(struct usbgem_dev *dp) +{ + int i; + int err; + uint16_t val; + int phy_addr_saved; + struct axf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + phy_addr_saved = dp->mii_phy_addr; + + /* special probe routine for unreliable MII addr */ +#define PROBE_PAT \ + (MII_ABILITY_100BASE_TX_FD | \ + MII_ABILITY_100BASE_TX | \ + MII_ABILITY_10BASE_T_FD | \ + MII_ABILITY_10BASE_T) + + for (i = 0; i < 32; i++) { + dp->mii_phy_addr = i; + axf_mii_write(dp, MII_AN_ADVERT, 0, &err); + if (err != USBGEM_SUCCESS) { + break; + } + 
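+ /*
+  * Editor's note: MII_AN_ADVERT serves as a scratch register here
+  * because it is read/write on any compliant PHY.  For each candidate
+  * address the loop writes 0 (just above) and verifies the readback
+  * below, then writes PROBE_PAT and verifies that too; a floating
+  * address returning bus garbage will fail one of the two checks,
+  * while a real PHY passes both.
+  */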
val = axf_mii_read(dp, MII_AN_ADVERT, &err); + if (err != USBGEM_SUCCESS) { + break; + } + if (val != 0) { + DPRINTF(0, (CE_CONT, "!%s: %s: index:%d, val %b != 0", + dp->name, __func__, i, val, MII_ABILITY_BITS)); + continue; + } + + axf_mii_write(dp, MII_AN_ADVERT, PROBE_PAT, &err); + if (err != USBGEM_SUCCESS) { + break; + } + val = axf_mii_read(dp, MII_AN_ADVERT, &err); + if (err != USBGEM_SUCCESS) { + break; + } + if ((val & MII_ABILITY_TECH) != PROBE_PAT) { + DPRINTF(0, (CE_CONT, "!%s: %s: " + "index:%d, pat:%x != val:%b", + dp->name, __func__, i, + PROBE_PAT, val, MII_ABILITY_BITS)); + continue; + } + + /* found */ + dp->mii_phy_addr = phy_addr_saved; + return (i); + } +#undef PROBE_PAT + if (i == 32) { + cmn_err(CE_CONT, "!%s: %s: no mii phy found", + dp->name, __func__); + } else { + cmn_err(CE_CONT, "!%s: %s: i/o error while scanning phy", + dp->name, __func__); + } + dp->mii_phy_addr = phy_addr_saved; + return (-1); +} + +static int +axf_mii_probe(struct usbgem_dev *dp) +{ + int my_guess; + int err; + uint8_t old_11th[2]; + uint8_t new_11th[2]; + struct axf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + (void) axf_reset_phy(dp); + lp->phy_has_reset = B_TRUE; + + if (AX88172(dp)) { + my_guess = axf_scan_phy(dp); + if (my_guess >= 0 && my_guess < 32 && + my_guess != dp->mii_phy_addr) { + /* + * phy addr in srom is wrong, need to fix it + */ + IN(dp, VCMD_READ_SROM, + 0x11, 0, 2, old_11th, &err, usberr); + + new_11th[0] = my_guess; + new_11th[1] = old_11th[1]; + + OUT(dp, VCMD_WRITE_SROM_ENABLE, + 0, 0, 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM, + 0x11, LE16P(new_11th), 0, NULL, &err, usberr); + OUT(dp, VCMD_WRITE_SROM_DISABLE, + 0, 0, 0, NULL, &err, usberr); +#if 1 + /* XXX - read back, but it doesn't work, why? */ + delay(drv_usectohz(1000*1000)); + IN(dp, VCMD_READ_SROM, + 0x11, 0, 2, new_11th, &err, usberr); +#endif + cmn_err(CE_NOTE, "!%s: %s: phy addr in srom fixed: " + "%04x -> %04x", + dp->name, __func__, + LE16P(old_11th), LE16P(new_11th)); + return (USBGEM_FAILURE); +usberr: + cmn_err(CE_NOTE, + "!%s: %s: failed to patch phy addr, " + "current: %04x", + dp->name, __func__, LE16P(old_11th)); + return (USBGEM_FAILURE); + } + } + return (usbgem_mii_probe_default(dp)); +} + +static int +axf_mii_init(struct usbgem_dev *dp) +{ + struct axf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + if (!lp->phy_has_reset) { + (void) axf_reset_phy(dp); + } + + /* prepare to reset phy on the next reconnect or resume */ + lp->phy_has_reset = B_FALSE; + + return (USB_SUCCESS); +} + +static int +axfattach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i; + ddi_iblock_cookie_t c; + int ret; + int revid; + int unit; + int vid; + int pid; + struct chip_info *p; + int len; + const char *drv_name; + struct usbgem_dev *dp; + void *base; + struct usbgem_conf *ugcp; + struct axf_dev *lp; + + unit = ddi_get_instance(dip); + drv_name = ddi_driver_name(dip); + + DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d", + drv_name, unit, __func__, cmd)); + + if (cmd == DDI_ATTACH) { + /* + * Check if the chip is supported. 
+ */ + vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "usb-vendor-id", -1); + pid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "usb-product-id", -1); + revid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "usb-revision-id", -1); + + for (i = 0, p = chiptbl_88x7x; i < CHIPTABLESIZE; i++, p++) { + if (p->vid == vid && p->pid == pid) { + /* found */ + cmn_err(CE_CONT, "!%s%d: %s " + "(vid: 0x%04x, did: 0x%04x, revid: 0x%02x)", + drv_name, unit, p->name, vid, pid, revid); + goto chip_found; + } + } + + /* Not found */ + cmn_err(CE_WARN, "!%s: %s: wrong usb venid/prodid (0x%x, 0x%x)", + drv_name, __func__, vid, pid); + + /* assume 88772 */ + p = &chiptbl_88x7x[CHIPTABLESIZE - 1]; +chip_found: + /* + * construct the usbgem configuration + */ + ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP); + + /* name */ + /* + * softmac requires that ppa is the instance number + * of the device, otherwise it hangs searching for the device. + */ + sprintf(ugcp->usbgc_name, "%s%d", drv_name, unit); + ugcp->usbgc_ppa = unit; + + ugcp->usbgc_ifnum = 0; + ugcp->usbgc_alt = 0; + + ugcp->usbgc_tx_list_max = 64; + + ugcp->usbgc_rx_header_len = 0; + ugcp->usbgc_rx_list_max = 64; + + /* timeout parameters */ + ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT; + ugcp->usbgc_tx_timeout_interval = USBGEM_TX_TIMEOUT_INTERVAL; + + /* flow control */ + /* + * XXX - flow control caused link down frequently under + * heavy traffic + */ + ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE; + + /* MII timeout parameters */ + ugcp->usbgc_mii_link_watch_interval = ONESEC; + ugcp->usbgc_mii_an_watch_interval = ONESEC/5; + ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */ + ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT; /* 5 sec */ + ugcp->usbgc_mii_an_wait = 0; + ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT; + + ugcp->usbgc_mii_an_delay = ONESEC/10; + ugcp->usbgc_mii_linkdown_action = MII_ACTION_RSA; + ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET; + ugcp->usbgc_mii_dont_reset = B_FALSE; + ugcp->usbgc_mii_hw_link_detection = B_TRUE; + ugcp->usbgc_mii_stop_mac_on_linkdown = B_FALSE; + + /* I/O methods */ + + /* mac operation */ + ugcp->usbgc_attach_chip = &axf_attach_chip; + ugcp->usbgc_reset_chip = &axf_reset_chip; + ugcp->usbgc_init_chip = &axf_init_chip; + ugcp->usbgc_start_chip = &axf_start_chip; + ugcp->usbgc_stop_chip = &axf_stop_chip; + ugcp->usbgc_multicast_hash = &axf_mcast_hash; + + ugcp->usbgc_set_rx_filter = &axf_set_rx_filter; + ugcp->usbgc_set_media = &axf_set_media; + ugcp->usbgc_get_stats = &axf_get_stats; + ugcp->usbgc_interrupt = &axf_interrupt; + + /* packet operation */ + ugcp->usbgc_tx_make_packet = &axf_tx_make_packet; + ugcp->usbgc_rx_make_packet = &axf_rx_make_packet; + + /* mii operations */ + ugcp->usbgc_mii_probe = &axf_mii_probe; + ugcp->usbgc_mii_init = &axf_mii_init; + ugcp->usbgc_mii_config = &usbgem_mii_config_default; + ugcp->usbgc_mii_read = &axf_mii_read; + ugcp->usbgc_mii_write = &axf_mii_write; + + /* mtu */ + ugcp->usbgc_min_mtu = ETHERMTU; + ugcp->usbgc_max_mtu = ETHERMTU; + ugcp->usbgc_default_mtu = ETHERMTU; + + lp = kmem_zalloc(sizeof (struct axf_dev), KM_SLEEP); + lp->chip = p; + lp->last_link_state = 0; + lp->phy_has_reset = B_FALSE; + + dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct axf_dev)); + + kmem_free(ugcp, sizeof (*ugcp)); + + if (dp != NULL) { + return (DDI_SUCCESS); + } + +err_free_mem: + kmem_free(lp, sizeof (struct axf_dev)); +err_close_pipe: +err: + return (DDI_FAILURE); + } + + if (cmd == DDI_RESUME) { 
return (usbgem_resume(dip)); + } + + return (DDI_FAILURE); +} + +static int +axfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int ret; + + if (cmd == DDI_DETACH) { + ret = usbgem_do_detach(dip); + if (ret != DDI_SUCCESS) { + return (DDI_FAILURE); + } + return (DDI_SUCCESS); + } + if (cmd == DDI_SUSPEND) { + return (usbgem_suspend(dip)); + } + return (DDI_FAILURE); +} + +/* ======================================================== */ +/* + * OS depend (loadable streams driver) routine + */ +/* ======================================================== */ +#ifdef USBGEM_CONFIG_GLDv3 +USBGEM_STREAM_OPS(axf_ops, axfattach, axfdetach); +#else +static struct module_info axfminfo = { + 0, /* mi_idnum */ + "axf", /* mi_idname */ + 0, /* mi_minpsz */ + ETHERMTU, /* mi_maxpsz */ + ETHERMTU*128, /* mi_hiwat */ + 1, /* mi_lowat */ +}; + +static struct qinit axfrinit = { + (int (*)()) NULL, /* qi_putp */ + usbgem_rsrv, /* qi_srvp */ + usbgem_open, /* qi_qopen */ + usbgem_close, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &axfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct qinit axfwinit = { + usbgem_wput, /* qi_putp */ + usbgem_wsrv, /* qi_srvp */ + (int (*)()) NULL, /* qi_qopen */ + (int (*)()) NULL, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &axfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct streamtab axf_info = { + &axfrinit, /* st_rdinit */ + &axfwinit, /* st_wrinit */ + NULL, /* st_muxrinit */ + NULL /* st_muxwrinit */ +}; + +static struct cb_ops cb_axf_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + &axf_info, /* cb_stream */ + D_NEW|D_MP /* cb_flag */ +}; + +static struct dev_ops axf_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + usbgem_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + axfattach, /* devo_attach */ + axfdetach, /* devo_detach */ + nodev, /* devo_reset */ + &cb_axf_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + usbgem_power, /* devo_power */ +#if DEVO_REV >= 4 + usbgem_quiesce, /* devo_quiesce */ +#endif +}; +#endif + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module. 
This one is a driver */ + ident, + &axf_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +/* ======================================================== */ +/* + * _init : done + */ +/* ======================================================== */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!axf: _init: called")); + + status = usbgem_mod_init(&axf_ops, "axf"); + if (status != DDI_SUCCESS) { + return (status); + } + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) { + usbgem_mod_fini(&axf_ops); + } + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!axf: _fini: called")); + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) { + usbgem_mod_fini(&axf_ops); + } + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/cons.c b/usr/src/uts/common/io/cons.c index 6ef1b0b9f7..495ae93cf9 100644 --- a/usr/src/uts/common/io/cons.c +++ b/usr/src/uts/common/io/cons.c @@ -21,6 +21,7 @@ /* * Copyright (c) 1982, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* @@ -53,6 +54,7 @@ #include <sys/vnode.h> #include <sys/uio.h> #include <sys/stat.h> +#include <sys/limits.h> #include <sys/console.h> #include <sys/consdev.h> @@ -414,14 +416,24 @@ cnwrite(dev_t dev, struct uio *uio, struct cred *cred) */ if (vsconsvp != NULL && vsconsvp->v_stream != NULL) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; + + if (uio->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uio->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } /* * strwrite modifies uio so need to make copy. */ - (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + (void) uiodup(uio, &uiod.d_uio, uiod.d_iov, uio->uio_iovcnt); (void) strwrite(vsconsvp, &uiod.d_uio, cred); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); } if (rconsvp->v_stream != NULL) diff --git a/usr/src/uts/common/io/devpoll.c b/usr/src/uts/common/io/devpoll.c index a3fcbbba03..764636e218 100644 --- a/usr/src/uts/common/io/devpoll.c +++ b/usr/src/uts/common/io/devpoll.c @@ -25,6 +25,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -45,6 +46,8 @@ #include <sys/devpoll.h> #include <sys/rctl.h> #include <sys/resource.h> +#include <sys/schedctl.h> +#include <sys/epoll.h> #define RESERVED 1 @@ -237,7 +240,8 @@ dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) * stale entries! 
*/ static int -dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp) +dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, + pollcache_t *pcp, nfds_t nfds, int *fdcntp) { int start, ostart, end; int fdcnt, fd; @@ -247,7 +251,10 @@ dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp) boolean_t no_wrap; pollhead_t *php; polldat_t *pdp; + pollfd_t *pfdp; + epoll_event_t *epoll; int error = 0; + short mask = POLLRDHUP | POLLWRBAND; ASSERT(MUTEX_HELD(&pcp->pc_lock)); if (pcp->pc_bitmap == NULL) { @@ -257,6 +264,14 @@ dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp) */ return (error); } + + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + pfdp = NULL; + epoll = (epoll_event_t *)dpbuf; + } else { + pfdp = (pollfd_t *)dpbuf; + epoll = NULL; + } retry: start = ostart = pcp->pc_mapstart; end = pcp->pc_mapend; @@ -316,11 +331,32 @@ repoll: * polling a closed fd. Hope this will remind * user to do a POLLREMOVE. */ - pfdp[fdcnt].fd = fd; - pfdp[fdcnt].revents = POLLNVAL; - fdcnt++; + if (pfdp != NULL) { + pfdp[fdcnt].fd = fd; + pfdp[fdcnt].revents = POLLNVAL; + fdcnt++; + continue; + } + + /* + * In the epoll compatibility case, we actually + * perform the implicit removal to remain + * closer to the epoll semantics. + */ + ASSERT(epoll != NULL); + + pdp->pd_fp = NULL; + pdp->pd_events = 0; + + if (php != NULL) { + pollhead_delete(php, pdp); + pdp->pd_php = NULL; + } + + BT_CLEAR(pcp->pc_bitmap, fd); continue; } + if (fp != pdp->pd_fp) { /* * user is polling on a cached fd which was @@ -376,9 +412,69 @@ repoll: } if (revent != 0) { - pfdp[fdcnt].fd = fd; - pfdp[fdcnt].events = pdp->pd_events; - pfdp[fdcnt].revents = revent; + if (pfdp != NULL) { + pfdp[fdcnt].fd = fd; + pfdp[fdcnt].events = pdp->pd_events; + pfdp[fdcnt].revents = revent; + } else { + epoll_event_t *ep = &epoll[fdcnt]; + + ASSERT(epoll != NULL); + ep->data.u64 = pdp->pd_epolldata; + + /* + * If any of the event bits are set for + * which poll and epoll representations + * differ, swizzle in the native epoll + * values. + */ + if (revent & mask) { + ep->events = (revent & ~mask) | + ((revent & POLLRDHUP) ? + EPOLLRDHUP : 0) | + ((revent & POLLWRBAND) ? + EPOLLWRBAND : 0); + } else { + ep->events = revent; + } + + /* + * We define POLLWRNORM to be POLLOUT, + * but epoll has separate definitions + * for them; if POLLOUT is set and the + * user has asked for EPOLLWRNORM, set + * that as well. + */ + if ((revent & POLLOUT) && + (pdp->pd_events & EPOLLWRNORM)) { + ep->events |= EPOLLWRNORM; + } + } + + /* + * If POLLET is set, clear the bit in the + * bitmap -- which effectively latches the + * edge on a pollwakeup() from the driver. + */ + if (pdp->pd_events & POLLET) + BT_CLEAR(pcp->pc_bitmap, fd); + + /* + * If POLLONESHOT is set, perform the implicit + * POLLREMOVE. + */ + if (pdp->pd_events & POLLONESHOT) { + pdp->pd_fp = NULL; + pdp->pd_events = 0; + + if (php != NULL) { + pollhead_delete(php, pdp); + pdp->pd_php = NULL; + } + + BT_CLEAR(pcp->pc_bitmap, fd); + } + fdcnt++; } else if (php != NULL) { /* @@ -392,7 +488,7 @@ repoll: * in bitmap. */ if ((pdp->pd_php != NULL) && - ((pcp->pc_flag & T_POLLWAKE) == 0)) { + ((pcp->pc_flag & PC_POLLWAKE) == 0)) { BT_CLEAR(pcp->pc_bitmap, fd); } if (pdp->pd_php == NULL) { @@ -473,11 +569,15 @@ dpopen(dev_t *devp, int flag, int otyp, cred_t *credp) /* * allocate a pollcache skeleton here. Delay allocating bitmap * structures until dpwrite() time, since we don't know the - * optimal size yet. + * optimal size yet. 
We also delay setting the pid until either + * dpwrite() or attempt to poll on the instance, allowing parents + * to create instances of /dev/poll for their children. (In the + * epoll compatibility case, this check isn't performed to maintain + * semantic compatibility.) */ pcp = pcache_alloc(); dpep->dpe_pcache = pcp; - pcp->pc_pid = curproc->p_pid; + pcp->pc_pid = -1; *devp = makedevice(getmajor(*devp), minordev); /* clone the driver */ mutex_enter(&devpoll_lock); ASSERT(minordev < dptblsize); @@ -499,7 +599,9 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) dp_entry_t *dpep; pollcache_t *pcp; pollfd_t *pollfdp, *pfdp; - int error; + dvpoll_epollfd_t *epfdp; + uintptr_t limit; + int error, size; ssize_t uiosize; nfds_t pollfdnum; struct pollhead *php = NULL; @@ -515,11 +617,23 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) ASSERT(dpep != NULL); mutex_exit(&devpoll_lock); pcp = dpep->dpe_pcache; - if (curproc->p_pid != pcp->pc_pid) { - return (EACCES); + + if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) && + curproc->p_pid != pcp->pc_pid) { + if (pcp->pc_pid != -1) + return (EACCES); + + pcp->pc_pid = curproc->p_pid; } + + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + size = sizeof (dvpoll_epollfd_t); + } else { + size = sizeof (pollfd_t); + } + uiosize = uiop->uio_resid; - pollfdnum = uiosize / sizeof (pollfd_t); + pollfdnum = uiosize / size; mutex_enter(&curproc->p_lock); if (pollfdnum > (uint_t)rctl_enforced_value( rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) { @@ -534,6 +648,7 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) * each polled fd to the cached set. */ pollfdp = kmem_alloc(uiosize, KM_SLEEP); + limit = (uintptr_t)pollfdp + (pollfdnum * size); /* * Although /dev/poll uses the write(2) interface to cache fds, it's @@ -555,9 +670,27 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) mutex_enter(&dpep->dpe_lock); dpep->dpe_writerwait++; while (dpep->dpe_refcnt != 0) { + /* + * We need to do a bit of a dance here: we need to drop + * our dpe_lock and grab the pc_lock to broadcast the pc_cv to + * kick any DP_POLL/DP_PPOLL sleepers. + */ + mutex_exit(&dpep->dpe_lock); + mutex_enter(&pcp->pc_lock); + pcp->pc_flag |= PC_WRITEWANTED; + cv_broadcast(&pcp->pc_cv); + mutex_exit(&pcp->pc_lock); + mutex_enter(&dpep->dpe_lock); + + if (dpep->dpe_refcnt == 0) + break; + if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { dpep->dpe_writerwait--; mutex_exit(&dpep->dpe_lock); + mutex_enter(&pcp->pc_lock); + pcp->pc_flag &= ~PC_WRITEWANTED; + mutex_exit(&pcp->pc_lock); kmem_free(pollfdp, uiosize); return (set_errno(EINTR)); } @@ -565,24 +698,103 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) dpep->dpe_writerwait--; dpep->dpe_flag |= DP_WRITER_PRESENT; dpep->dpe_refcnt++; + mutex_exit(&dpep->dpe_lock); mutex_enter(&pcp->pc_lock); + pcp->pc_flag &= ~PC_WRITEWANTED; + if (pcp->pc_bitmap == NULL) { pcache_create(pcp, pollfdnum); } - for (pfdp = pollfdp; pfdp < pollfdp + pollfdnum; pfdp++) { + for (pfdp = pollfdp; (uintptr_t)pfdp < limit; + pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) { fd = pfdp->fd; - if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) + if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) { + /* + * epoll semantics demand that we return EBADF if our + * specified fd is invalid. 
+ */ + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + error = EBADF; + break; + } + + continue; + } + + pdp = pcache_lookup_fd(pcp, fd); if (pfdp->events != POLLREMOVE) { + + fp = NULL; + if (pdp == NULL) { + /* + * If we're in epoll compatibility mode, check + * that the fd is valid before allocating + * anything for it; epoll semantics demand that + * we return EBADF if our specified fd is + * invalid. + */ + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + if ((fp = getf(fd)) == NULL) { + error = EBADF; + break; + } + + /* + * To (greatly) reduce EEXIST false + * positives, we denote that this + * fp has been epoll()'d; see below. + */ + fp->f_flag2 |= FEPOLLED; + } + pdp = pcache_alloc_fd(0); pdp->pd_fd = fd; pdp->pd_pcache = pcp; pcache_insert_fd(pcp, pdp, pollfdnum); + } else { + /* + * epoll semantics demand that we error out if + * a file descriptor is added twice, which we + * detect (imperfectly) by checking if we both + * have the file descriptor cached and the + * file pointer that corresponds to the file + * descriptor matches our cached value. (If + * there is a pointer mismatch, the file + * descriptor was closed without being removed. + * The converse is clearly not true, however, + * so to narrow the window by which a spurious + * EEXIST may be returned, we also check if + * this fp has been added to an epoll control + * descriptor in the past; if it hasn't, we + * know that this is due to fp reuse -- it's + * not a true EEXIST case.) (By performing this + * additional check, we limit the window of + * spurious EEXIST to situations where a single + * file descriptor is being used across two or + * more epoll control descriptors -- and even + * then, the file descriptor must be closed and + * reused in a relatively tight time span.) + */ + if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) && + pdp->pd_fp != NULL && + (fp = getf(fd)) != NULL && + fp == pdp->pd_fp && + (fp->f_flag2 & FEPOLLED)) { + error = EEXIST; + releasef(fd); + break; + } + } + + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + /* LINTED pointer alignment */ + epfdp = (dvpoll_epollfd_t *)pfdp; + pdp->pd_epolldata = epfdp->dpep_data; + } + + ASSERT(pdp->pd_fd == fd); + ASSERT(pdp->pd_pcache == pcp); + if (fd >= pcp->pc_mapsize) { @@ -593,7 +805,7 @@ if (fd > pcp->pc_mapend) { pcp->pc_mapend = fd; } - if ((fp = getf(fd)) == NULL) { + if (fp == NULL && (fp = getf(fd)) == NULL) { /* * The fd is not valid. Since we can't pass * this error back in the write() call, set @@ -609,7 +821,7 @@ * same poll events. */ if ((pdp->pd_events == pfdp->events) && - (pdp->pd_fp != NULL)) { + (pdp->pd_fp == fp)) { /* * the events are already cached */ @@ -665,7 +877,17 @@ } releasef(fd); } else { - if (pdp == NULL) { + if (pdp == NULL || pdp->pd_fp == NULL) { + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + /* + * As with the add case (above), epoll + * semantics demand that we error out + * in this case. 
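+ * (Editor's aside on the EEXIST heuristic in the add path above: if
+ * fd 5 is added, closed, and then reused by a freshly opened file, a
+ * new file_t will normally not carry FEPOLLED, so a matching pointer
+ * alone does not trigger EEXIST -- that is the fp-reuse case the
+ * comment above describes.  Only a file_t that was itself run through
+ * /dev/poll, then closed and reused across control descriptors in a
+ * tight window, can still yield a spurious EEXIST.)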
+ */ + error = ENOENT; + break; + } + continue; } ASSERT(pdp->pd_fd == fd); @@ -690,6 +912,17 @@ dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) return (error); } +#define DP_SIGMASK_RESTORE(ksetp) { \ + if (ksetp != NULL) { \ + mutex_enter(&p->p_lock); \ + if (lwp->lwp_cursig == 0) { \ + t->t_hold = lwp->lwp_sigoldmask; \ + t->t_flag &= ~T_TOMASK; \ + } \ + mutex_exit(&p->p_lock); \ + } \ +} + /*ARGSUSED*/ static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) @@ -701,7 +934,7 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) int error = 0; STRUCT_DECL(dvpoll, dvpoll); - if (cmd == DP_POLL) { + if (cmd == DP_POLL || cmd == DP_PPOLL) { /* do this now, before we sleep on DP_WRITER_PRESENT */ now = gethrtime(); } @@ -713,10 +946,39 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) mutex_exit(&devpoll_lock); ASSERT(dpep != NULL); pcp = dpep->dpe_pcache; - if (curproc->p_pid != pcp->pc_pid) - return (EACCES); mutex_enter(&dpep->dpe_lock); + + if (cmd == DP_EPOLLCOMPAT) { + if (dpep->dpe_refcnt != 0) { + /* + * We can't turn on epoll compatibility while there + * are outstanding operations. + */ + mutex_exit(&dpep->dpe_lock); + return (EBUSY); + } + + /* + * epoll compatibility is a one-way street: there's no way + * to turn it off for a particular open. + */ + dpep->dpe_flag |= DP_ISEPOLLCOMPAT; + mutex_exit(&dpep->dpe_lock); + + return (0); + } + + if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) && + curproc->p_pid != pcp->pc_pid) { + if (pcp->pc_pid != -1) { + mutex_exit(&dpep->dpe_lock); + return (EACCES); + } + + pcp->pc_pid = curproc->p_pid; + } + while ((dpep->dpe_flag & DP_WRITER_PRESENT) || (dpep->dpe_writerwait != 0)) { if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { @@ -729,15 +991,36 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) switch (cmd) { case DP_POLL: + case DP_PPOLL: { pollstate_t *ps; nfds_t nfds; int fdcnt = 0; + size_t size, fdsize, dpsize; hrtime_t deadline = 0; + k_sigset_t *ksetp = NULL; + k_sigset_t kset; + sigset_t set; + kthread_t *t = curthread; + klwp_t *lwp = ttolwp(t); + struct proc *p = ttoproc(curthread); STRUCT_INIT(dvpoll, mode); - error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), - STRUCT_SIZE(dvpoll)); + + /* + * The dp_setp member is only required/consumed for DP_PPOLL, + * which otherwise uses the same structure as DP_POLL. + */ + if (cmd == DP_POLL) { + dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) - + (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds); + } else { + ASSERT(cmd == DP_PPOLL); + dpsize = STRUCT_SIZE(dvpoll); + } + + error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize); + if (error) { DP_REFRELE(dpep); return (EFAULT); @@ -755,6 +1038,52 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) deadline += now; } + if (cmd == DP_PPOLL) { + void *setp = STRUCT_FGETP(dvpoll, dp_setp); + + if (setp != NULL) { + if (copyin(setp, &set, sizeof (set))) { + DP_REFRELE(dpep); + return (EFAULT); + } + + sigutok(&set, &kset); + ksetp = &kset; + + mutex_enter(&p->p_lock); + schedctl_finish_sigblock(t); + lwp->lwp_sigoldmask = t->t_hold; + t->t_hold = *ksetp; + t->t_flag |= T_TOMASK; + + /* + * Like ppoll() with a non-NULL sigset, we'll + * call cv_reltimedwait_sig() just to check for + * signals. This call will return immediately + * with either 0 (signalled) or -1 (no signal). 
+ * There are some conditions whereby we can + * get 0 from cv_reltimedwait_sig() without + * a true signal (e.g., a directed stop), so + * we restore our signal mask in the unlikely + * event that lwp_cursig is 0. + */ + if (!cv_reltimedwait_sig(&t->t_delay_cv, + &p->p_lock, 0, TR_CLOCK_TICK)) { + if (lwp->lwp_cursig == 0) { + t->t_hold = lwp->lwp_sigoldmask; + t->t_flag &= ~T_TOMASK; + } + + mutex_exit(&p->p_lock); + + DP_REFRELE(dpep); + return (EINTR); + } + + mutex_exit(&p->p_lock); + } + } + if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) { /* * We are just using DP_POLL to sleep, so @@ -762,17 +1091,29 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) * Do not check for signals if we have a zero timeout. */ DP_REFRELE(dpep); - if (deadline == 0) + if (deadline == 0) { + DP_SIGMASK_RESTORE(ksetp); return (0); + } + mutex_enter(&curthread->t_delay_lock); while ((error = cv_timedwait_sig_hrtime(&curthread->t_delay_cv, &curthread->t_delay_lock, deadline)) > 0) continue; mutex_exit(&curthread->t_delay_lock); + + DP_SIGMASK_RESTORE(ksetp); + return (error == 0 ? EINTR : 0); } + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + size = nfds * (fdsize = sizeof (epoll_event_t)); + } else { + size = nfds * (fdsize = sizeof (pollfd_t)); + } + /* * XXX It would be nice not to have to alloc each time, but it * requires another per thread structure hook. This can be @@ -782,37 +1123,45 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) curthread->t_pollstate = pollstate_create(); ps = curthread->t_pollstate; } - if (ps->ps_dpbufsize < nfds) { - struct proc *p = ttoproc(curthread); + + if (ps->ps_dpbufsize < size) { /* - * The maximum size should be no large than - * current maximum open file count. + * If nfds is larger than twice the current maximum + * open file count, we'll silently clamp it. This + * only limits our exposure to allocating an + * inordinate amount of kernel memory; it doesn't + * otherwise affect the semantics. (We have this + * check at twice the maximum instead of merely the + * maximum because some applications pass an nfds that + * is only slightly larger than their limit.) */ mutex_enter(&p->p_lock); - if (nfds > p->p_fno_ctl) { - mutex_exit(&p->p_lock); - DP_REFRELE(dpep); - return (EINVAL); + if ((nfds >> 1) > p->p_fno_ctl) { + nfds = p->p_fno_ctl; + size = nfds * fdsize; } mutex_exit(&p->p_lock); - kmem_free(ps->ps_dpbuf, sizeof (pollfd_t) * - ps->ps_dpbufsize); - ps->ps_dpbuf = kmem_zalloc(sizeof (pollfd_t) * - nfds, KM_SLEEP); - ps->ps_dpbufsize = nfds; + + if (ps->ps_dpbufsize < size) { + kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); + ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP); + ps->ps_dpbufsize = size; + } } mutex_enter(&pcp->pc_lock); for (;;) { - pcp->pc_flag = 0; - error = dp_pcache_poll(ps->ps_dpbuf, pcp, nfds, &fdcnt); + pcp->pc_flag &= ~PC_POLLWAKE; + + error = dp_pcache_poll(dpep, ps->ps_dpbuf, + pcp, nfds, &fdcnt); if (fdcnt > 0 || error != 0) break; /* * A pollwake has happened since we polled cache. 
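 * (Editor's aside: the PC_WRITEWANTED handling below implements a
 * writer-preference handoff -- a sleeping DP_POLL/DP_PPOLL caller that
 * finds the flag set drops its dpe_refcnt, broadcasts on dpe_cv, waits
 * for the dpwrite() caller to drain, and only then retakes pc_lock and
 * rescans via dp_pcache_poll() so that it reports post-write state.)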
*/ - if (pcp->pc_flag & T_POLLWAKE) + if (pcp->pc_flag & PC_POLLWAKE) continue; /* @@ -822,8 +1171,40 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) /* immediate timeout; do not check signals */ break; } - error = cv_timedwait_sig_hrtime(&pcp->pc_cv, - &pcp->pc_lock, deadline); + + if (!(pcp->pc_flag & PC_WRITEWANTED)) { + error = cv_timedwait_sig_hrtime(&pcp->pc_cv, + &pcp->pc_lock, deadline); + } else { + error = 1; + } + + if (error > 0 && (pcp->pc_flag & PC_WRITEWANTED)) { + /* + * We've been kicked off of our cv because a + * writer wants in. We're going to drop our + * reference count and then wait until the + * writer is gone -- at which point we'll + * reacquire the pc_lock and call into + * dp_pcache_poll() to get the updated state. + */ + mutex_exit(&pcp->pc_lock); + + mutex_enter(&dpep->dpe_lock); + dpep->dpe_refcnt--; + cv_broadcast(&dpep->dpe_cv); + + while ((dpep->dpe_flag & DP_WRITER_PRESENT) || + (dpep->dpe_writerwait != 0)) { + error = cv_wait_sig_swap(&dpep->dpe_cv, + &dpep->dpe_lock); + } + + dpep->dpe_refcnt++; + mutex_exit(&dpep->dpe_lock); + mutex_enter(&pcp->pc_lock); + } + /* * If we were awakened by a signal or timeout * then break the loop, else poll again. @@ -837,9 +1218,11 @@ dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) } mutex_exit(&pcp->pc_lock); + DP_SIGMASK_RESTORE(ksetp); + if (error == 0 && fdcnt > 0) { - if (copyout(ps->ps_dpbuf, STRUCT_FGETP(dvpoll, - dp_fds), sizeof (pollfd_t) * fdcnt)) { + if (copyout(ps->ps_dpbuf, + STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) { DP_REFRELE(dpep); return (EFAULT); } @@ -901,10 +1284,25 @@ static int dppoll(dev_t dev, short events, int anyyet, short *reventsp, struct pollhead **phpp) { + minor_t minor; + dp_entry_t *dpep; + + minor = getminor(dev); + + mutex_enter(&devpoll_lock); + dpep = devpolltbl[minor]; + ASSERT(dpep != NULL); + mutex_exit(&devpoll_lock); + /* * Polling on a /dev/poll fd is not fully supported yet. */ - *reventsp = POLLERR; + if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { + /* no error in epoll compat. mode */ + *reventsp = 0; + } else { + *reventsp = POLLERR; + } return (0); } diff --git a/usr/src/uts/common/io/dld/dld_drv.c b/usr/src/uts/common/io/dld/dld_drv.c index 40cbe86170..62bc4a8ecf 100644 --- a/usr/src/uts/common/io/dld/dld_drv.c +++ b/usr/src/uts/common/io/dld/dld_drv.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent Inc. 
*/ /* @@ -701,7 +702,8 @@ drv_ioc_prop_common(dld_ioc_macprop_t *prop, intptr_t arg, boolean_t set, err = EACCES; goto done; } - err = dls_devnet_setzid(dlh, dzp->diz_zid); + err = dls_devnet_setzid(dlh, dzp->diz_zid, + dzp->diz_transient); } else { kprop->pr_perm_flags = MAC_PROP_PERM_RW; (*(zoneid_t *)kprop->pr_val) = dls_devnet_getzid(dlh); @@ -865,7 +867,7 @@ drv_ioc_rename(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) return (err); if ((err = dls_devnet_rename(dir->dir_linkid1, dir->dir_linkid2, - dir->dir_link)) != 0) + dir->dir_link, dir->dir_zoneinit)) != 0) return (err); if (dir->dir_linkid2 == DATALINK_INVALID_LINKID) @@ -1376,7 +1378,8 @@ static dld_ioc_modentry_t dld_ioc_modtable[] = { {SIMNET_IOC, "simnet", 0, NULL, 0}, {BRIDGE_IOC, "bridge", 0, NULL, 0}, {IPTUN_IOC, "iptun", 0, NULL, 0}, - {IBPART_IOC, "ibp", -1, NULL, 0} + {IBPART_IOC, "ibp", -1, NULL, 0}, + {OVERLAY_IOC, "overlay", 0, NULL, 0} }; #define DLDIOC_CNT \ (sizeof (dld_ioc_modtable) / sizeof (dld_ioc_modentry_t)) diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index a438e43d91..79c3d8260a 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -41,7 +41,7 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_bind_req, proto_unbind_req, proto_promiscon_req, proto_promiscoff_req, proto_enabmulti_req, proto_disabmulti_req, proto_physaddr_req, proto_setphysaddr_req, proto_udqos_req, proto_req, proto_capability_req, - proto_notify_req, proto_passive_req; + proto_notify_req, proto_passive_req, proto_exclusive_req; static void proto_capability_advertise(dld_str_t *, mblk_t *); static int dld_capab_poll_disable(dld_str_t *, dld_capab_poll_t *); @@ -121,6 +121,9 @@ dld_proto(dld_str_t *dsp, mblk_t *mp) case DL_PASSIVE_REQ: proto_passive_req(dsp, mp); break; + case DL_EXCLUSIVE_REQ: + proto_exclusive_req(dsp, mp); + break; default: proto_req(dsp, mp); break; @@ -605,6 +608,10 @@ proto_promiscon_req(dld_str_t *dsp, mblk_t *mp) new_flags |= DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + new_flags |= DLS_PROMISC_RX_ONLY; + break; + default: dl_err = DL_NOTSUPPORTED; goto failed2; @@ -692,12 +699,24 @@ proto_promiscoff_req(dld_str_t *dsp, mblk_t *mp) new_flags &= ~DLS_PROMISC_PHYS; break; + case DL_PROMISC_RX_ONLY: + if (!(dsp->ds_promisc & DLS_PROMISC_RX_ONLY)) { + dl_err = DL_NOTENAB; + goto failed; + } + new_flags &= ~DLS_PROMISC_RX_ONLY; + break; + default: dl_err = DL_NOTSUPPORTED; mac_perim_exit(mph); goto failed; } + /* DLS_PROMISC_RX_ONLY can't be a solo flag */ + if (new_flags == DLS_PROMISC_RX_ONLY) + new_flags = 0; + /* * Adjust channel promiscuity. */ @@ -1295,7 +1314,8 @@ proto_passive_req(dld_str_t *dsp, mblk_t *mp) * If we've already become active by issuing an active primitive, * then it's too late to try to become passive. 
*/ - if (dsp->ds_passivestate == DLD_ACTIVE) { + if (dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE) { dl_err = DL_OUTSTATE; goto failed; } @@ -1354,7 +1374,12 @@ dld_capab_direct(dld_str_t *dsp, void *data, uint_t flags) dls_rx_set(dsp, (dls_rx_t)direct->di_rx_cf, direct->di_rx_ch); - direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + if (direct->di_flags & DI_DIRECT_RAW) { + direct->di_tx_df = + (uintptr_t)str_mdata_raw_fastpath_put; + } else { + direct->di_tx_df = (uintptr_t)str_mdata_fastpath_put; + } direct->di_tx_dh = dsp; direct->di_tx_cb_df = (uintptr_t)mac_client_tx_notify; direct->di_tx_cb_dh = dsp->ds_mch; @@ -1516,8 +1541,9 @@ dld_capab(dld_str_t *dsp, uint_t type, void *data, uint_t flags) * completes. So we limit the check to DLD_ENABLE case. */ if ((flags == DLD_ENABLE && type != DLD_CAPAB_PERIM) && - (dsp->ds_sap != ETHERTYPE_IP || - !check_mod_above(dsp->ds_rq, "ip"))) { + ((dsp->ds_sap != ETHERTYPE_IP || + !check_mod_above(dsp->ds_rq, "ip")) && + !check_mod_above(dsp->ds_rq, "vnd"))) { return (ENOTSUP); } @@ -1599,9 +1625,15 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) } /* - * Direct capability negotiation interface between IP and DLD + * Direct capability negotiation interface between IP/VND and DLD. Note + * that for vnd we only allow the case where the media type is the + * native media type so we know that there are no transformations that + * would have to happen to the mac header that it receives. */ - if (dsp->ds_sap == ETHERTYPE_IP && check_mod_above(dsp->ds_rq, "ip")) { + if ((dsp->ds_sap == ETHERTYPE_IP && + check_mod_above(dsp->ds_rq, "ip")) || + (check_mod_above(dsp->ds_rq, "vnd") && + dsp->ds_mip->mi_media == dsp->ds_mip->mi_nativemedia)) { dld_capable = B_TRUE; subsize += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); @@ -1720,3 +1752,36 @@ dld_capabilities_disable(dld_str_t *dsp) if (dsp->ds_polling) (void) dld_capab_poll_disable(dsp, NULL); } + +static void +proto_exclusive_req(dld_str_t *dsp, mblk_t *mp) +{ + int ret = 0; + t_uscalar_t dl_err; + mac_perim_handle_t mph; + + if (dsp->ds_passivestate != DLD_UNINITIALIZED) { + dl_err = DL_OUTSTATE; + goto failed; + } + + if (MBLKL(mp) < DL_EXCLUSIVE_REQ_SIZE) { + dl_err = DL_BADPRIM; + goto failed; + } + + mac_perim_enter_by_mh(dsp->ds_mh, &mph); + ret = dls_exclusive_set(dsp, B_TRUE); + mac_perim_exit(mph); + + if (ret != 0) { + dl_err = DL_SYSERR; + goto failed; + } + + dsp->ds_passivestate = DLD_EXCLUSIVE; + dlokack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ); + return; +failed: + dlerrorack(dsp->ds_wq, mp, DL_EXCLUSIVE_REQ, dl_err, (t_uscalar_t)ret); +} diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c index 6f0d0b9a6c..f5308e70ff 100644 --- a/usr/src/uts/common/io/dld/dld_str.c +++ b/usr/src/uts/common/io/dld/dld_str.c @@ -854,6 +854,77 @@ i_dld_ether_header_update_tag(mblk_t *mp, uint_t pri, uint16_t vid, return (mp); } +static boolean_t +i_dld_raw_ether_check(dld_str_t *dsp, mac_header_info_t *mhip, mblk_t **mpp) +{ + mblk_t *mp = *mpp; + mblk_t *newmp; + uint_t pri, vid, dvid; + + dvid = mac_client_vid(dsp->ds_mch); + + /* + * Discard the packet if this is a VLAN stream but the VID in + * the packet is not correct. + */ + vid = VLAN_ID(mhip->mhi_tci); + if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) + return (B_FALSE); + + /* + * Discard the packet if this packet is a tagged packet + * but both pri and VID are 0. 
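+ * (Editor's aside: a worked example of the TCI checks in this
+ * function.  A tagged frame with TCI 0x0000 -- priority 0, VID 0 --
+ * is discarded by the test just below; TCI 0x6000 carries priority 3
+ * with VID 0 and is accepted; and on a VLAN stream (dvid !=
+ * VLAN_ID_NONE) any frame that arrives already carrying a VID is
+ * discarded by the test above rather than re-tagged.)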
+ */ + pri = VLAN_PRI(mhip->mhi_tci); + if (mhip->mhi_istagged && !mhip->mhi_ispvid && pri == 0 && + vid == VLAN_ID_NONE) + return (B_FALSE); + + /* + * Update the priority bits to the per-stream priority if + * priority is not set in the packet. Update the VID for + * packets on a VLAN stream. + */ + pri = (pri == 0) ? dsp->ds_pri : 0; + if ((pri != 0) || (dvid != VLAN_ID_NONE)) { + if ((newmp = i_dld_ether_header_update_tag(mp, pri, + dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { + return (B_FALSE); + } + *mpp = newmp; + } + + return (B_TRUE); +} + +mac_tx_cookie_t +str_mdata_raw_fastpath_put(dld_str_t *dsp, mblk_t *mp, uintptr_t f_hint, + uint16_t flag) +{ + boolean_t is_ethernet = (dsp->ds_mip->mi_media == DL_ETHER); + mac_header_info_t mhi; + mac_tx_cookie_t cookie; + + if (mac_vlan_header_info(dsp->ds_mh, mp, &mhi) != 0) + goto discard; + + if (is_ethernet) { + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) + goto discard; + } + + if ((cookie = DLD_TX(dsp, mp, f_hint, flag)) != NULL) { + DLD_SETQFULL(dsp); + } + return (cookie); +discard: + /* TODO: bump kstat? */ + freemsg(mp); + return (NULL); +} + + + /* * M_DATA put (IP fast-path mode) */ @@ -902,7 +973,6 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) mblk_t *bp, *newmp; size_t size; mac_header_info_t mhi; - uint_t pri, vid, dvid; uint_t max_sdu; /* @@ -948,38 +1018,8 @@ str_mdata_raw_put(dld_str_t *dsp, mblk_t *mp) goto discard; if (is_ethernet) { - dvid = mac_client_vid(dsp->ds_mch); - - /* - * Discard the packet if this is a VLAN stream but the VID in - * the packet is not correct. - */ - vid = VLAN_ID(mhi.mhi_tci); - if ((dvid != VLAN_ID_NONE) && (vid != VLAN_ID_NONE)) - goto discard; - - /* - * Discard the packet if this packet is a tagged packet - * but both pri and VID are 0. - */ - pri = VLAN_PRI(mhi.mhi_tci); - if (mhi.mhi_istagged && !mhi.mhi_ispvid && pri == 0 && - vid == VLAN_ID_NONE) + if (i_dld_raw_ether_check(dsp, &mhi, &mp) == B_FALSE) goto discard; - - /* - * Update the priority bits to the per-stream priority if - * priority is not set in the packet. Update the VID for - * packets on a VLAN stream. - */ - pri = (pri == 0) ? dsp->ds_pri : 0; - if ((pri != 0) || (dvid != VLAN_ID_NONE)) { - if ((newmp = i_dld_ether_header_update_tag(mp, pri, - dvid, dsp->ds_dlp->dl_tagmode)) == NULL) { - goto discard; - } - mp = newmp; - } } if (DLD_TX(dsp, mp, 0, 0) != NULL) { diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index 92993ada58..9fa649943c 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. */ /* @@ -248,19 +248,44 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) { int err = 0; uint32_t old_flags = dsp->ds_promisc; + uint32_t new_type = new_flags & ~DLS_PROMISC_RX_ONLY; mac_client_promisc_type_t mptype = MAC_CLIENT_PROMISC_ALL; + uint16_t mac_flags = 0; ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); ASSERT(!(new_flags & ~(DLS_PROMISC_SAP | DLS_PROMISC_MULTI | - DLS_PROMISC_PHYS))); + DLS_PROMISC_PHYS | DLS_PROMISC_RX_ONLY))); + + /* + * Asking us just to turn on DLS_PROMISC_RX_ONLY is not valid. + */ + if (new_flags == DLS_PROMISC_RX_ONLY) + return (EINVAL); /* * If the user has only requested DLS_PROMISC_MULTI then we need to make * sure that they don't see all packets. 
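 * (Editor's aside: a worked mapping for the flag translation below --
 * for new_flags = DLS_PROMISC_SAP | DLS_PROMISC_RX_ONLY, new_type
 * reduces to DLS_PROMISC_SAP, so mac_flags becomes
 * MAC_PROMISC_FLAGS_NO_TX_LOOP | MAC_PROMISC_FLAGS_NO_PHYS: the client
 * sees all SAPs without physical-level promiscuity and without its own
 * transmitted frames looped back.)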
*/ - if (new_flags == DLS_PROMISC_MULTI) + if (new_type == DLS_PROMISC_MULTI) mptype = MAC_CLIENT_PROMISC_MULTI; + /* + * Look at new flags and figure out the correct mac promisc flags. + * If we've only requested DLS_PROMISC_SAP and not _MULTI or _PHYS, + * don't turn on physical promisc mode. + */ + if (new_flags & DLS_PROMISC_RX_ONLY) + mac_flags |= MAC_PROMISC_FLAGS_NO_TX_LOOP; + if (new_type == DLS_PROMISC_SAP) + mac_flags |= MAC_PROMISC_FLAGS_NO_PHYS; + + /* + * There are three cases we care about here with respect to MAC. Going + * from nothing to something, something to nothing, something to + * something where we need to change how we're getting stuff from mac. + * In the last case, as long as they're not equal, we need to assume + * something has changed and do something about it. + */ if (dsp->ds_promisc == 0 && new_flags != 0) { /* * If only DLS_PROMISC_SAP, we don't turn on the @@ -268,9 +293,7 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, - (new_flags != DLS_PROMISC_SAP) ? 0 : - MAC_PROMISC_FLAGS_NO_PHYS); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) { dsp->ds_promisc = old_flags; return (err); @@ -296,19 +319,13 @@ dls_promisc(dld_str_t *dsp, uint32_t new_flags) MAC_CLIENT_PROMISC_ALL, dls_rx_vlan_promisc, dsp, &dsp->ds_vlan_mph, MAC_PROMISC_FLAGS_NO_PHYS); } - } else if (dsp->ds_promisc == DLS_PROMISC_SAP && new_flags != 0 && - new_flags != dsp->ds_promisc) { - /* - * If the old flag is PROMISC_SAP, but the current flag has - * changed to some new non-zero value, we need to turn the - * physical promiscuous mode. - */ + } else if (new_flags != 0 && new_flags != old_flags) { ASSERT(dsp->ds_mph != NULL); mac_promisc_remove(dsp->ds_mph); /* Honors both after-remove and before-add semantics! */ dsp->ds_promisc = new_flags; err = mac_promisc_add(dsp->ds_mch, mptype, - dls_rx_promisc, dsp, &dsp->ds_mph, 0); + dls_rx_promisc, dsp, &dsp->ds_mph, mac_flags); if (err != 0) dsp->ds_promisc = old_flags; } else { @@ -629,6 +646,22 @@ boolean_t dls_accept_promisc(dld_str_t *dsp, mac_header_info_t *mhip, dls_rx_t *ds_rx, void **ds_rx_arg, boolean_t loopback) { + if (dsp->ds_promisc == 0) { + /* + * If there are active walkers of the mi_promisc_list when + * promiscuousness is disabled, ds_promisc will be cleared, + * but the DLS will remain on the mi_promisc_list until the + * walk is completed. If we do not recognize this case here, + * we won't properly execute the ds_promisc case in the common + * accept routine -- and we will potentially accept a packet + * that has originated with this DLS (which in turn can + * induce recursion and death by stack overflow). If + * ds_promisc is zero, we know that we are in this window -- + * and we refuse to accept the packet. + */ + return (B_FALSE); + } + return (dls_accept_common(dsp, mhip, ds_rx, ds_rx_arg, B_TRUE, loopback)); } @@ -659,7 +692,10 @@ dls_mac_active_set(dls_link_t *dlp) * Set the function to start receiving packets. */ mac_rx_set(dlp->dl_mch, i_dls_link_rx, dlp); + } else if (dlp->dl_exclusive == B_TRUE) { + return (EBUSY); } + dlp->dl_nactive++; return (0); } @@ -685,7 +721,11 @@ dls_active_set(dld_str_t *dsp) if (dsp->ds_passivestate == DLD_PASSIVE) return (0); - /* If we're already active, then there's nothing more to do. 
*/ + if (dsp->ds_dlp->dl_exclusive == B_TRUE && + dsp->ds_passivestate != DLD_EXCLUSIVE) + return (EBUSY); + + /* If we're already active, we need to check the link's exclusivity */ if ((dsp->ds_nactive == 0) && ((err = dls_mac_active_set(dsp->ds_dlp)) != 0)) { /* except for ENXIO all other errors are mapped to EBUSY */ @@ -694,7 +734,8 @@ dls_active_set(dld_str_t *dsp) return (err); } - dsp->ds_passivestate = DLD_ACTIVE; + dsp->ds_passivestate = dsp->ds_dlp->dl_exclusive == B_TRUE ? + DLD_EXCLUSIVE : DLD_ACTIVE; dsp->ds_nactive++; return (0); } @@ -725,7 +766,32 @@ dls_active_clear(dld_str_t *dsp, boolean_t all) if (dsp->ds_nactive != 0) return; - ASSERT(dsp->ds_passivestate == DLD_ACTIVE); + ASSERT(dsp->ds_passivestate == DLD_ACTIVE || + dsp->ds_passivestate == DLD_EXCLUSIVE); dls_mac_active_clear(dsp->ds_dlp); + /* + * We verify below to ensure that no other part of DLS has mucked with + * our exclusive state. + */ + if (dsp->ds_passivestate == DLD_EXCLUSIVE) + VERIFY(dls_exclusive_set(dsp, B_FALSE) == 0); dsp->ds_passivestate = DLD_UNINITIALIZED; } + +int +dls_exclusive_set(dld_str_t *dsp, boolean_t enable) +{ + ASSERT(MAC_PERIM_HELD(dsp->ds_mh)); + + if (enable == B_FALSE) { + dsp->ds_dlp->dl_exclusive = B_FALSE; + return (0); + } + + if (dsp->ds_dlp->dl_nactive != 0) + return (EBUSY); + + dsp->ds_dlp->dl_exclusive = B_TRUE; + + return (0); +} diff --git a/usr/src/uts/common/io/dls/dls_link.c b/usr/src/uts/common/io/dls/dls_link.c index 6b92a81e77..c5363e8194 100644 --- a/usr/src/uts/common/io/dls/dls_link.c +++ b/usr/src/uts/common/io/dls/dls_link.c @@ -602,6 +602,7 @@ i_dls_link_destroy(dls_link_t *dlp) dlp->dl_mip = NULL; dlp->dl_unknowns = 0; dlp->dl_nonip_cnt = 0; + dlp->dl_exclusive = B_FALSE; kmem_cache_free(i_dls_link_cachep, dlp); } diff --git a/usr/src/uts/common/io/dls/dls_mgmt.c b/usr/src/uts/common/io/dls/dls_mgmt.c index 049c4bd757..6111d62475 100644 --- a/usr/src/uts/common/io/dls/dls_mgmt.c +++ b/usr/src/uts/common/io/dls/dls_mgmt.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2013 Joyent, Inc. All rights reserved. */ /* @@ -105,12 +106,13 @@ typedef struct dls_devnet_s { zoneid_t dd_zid; /* current zone */ boolean_t dd_prop_loaded; taskqid_t dd_prop_taskid; + boolean_t dd_transient; /* link goes away when zone does */ } dls_devnet_t; static int i_dls_devnet_create_iptun(const char *, const char *, datalink_id_t *); static int i_dls_devnet_destroy_iptun(datalink_id_t); -static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t); +static int i_dls_devnet_setzid(dls_devnet_t *, zoneid_t, boolean_t, boolean_t); static int dls_devnet_unset(const char *, datalink_id_t *, boolean_t); /*ARGSUSED*/ @@ -145,7 +147,12 @@ dls_zone_remove(datalink_id_t linkid, void *arg) dls_devnet_t *ddp; if (dls_devnet_hold_tmp(linkid, &ddp) == 0) { - (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID); + /* + * Don't bother moving transient links back to the global zone + * since we will simply delete them in dls_devnet_unset. 
+ */ + if (!ddp->dd_transient) + (void) dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE); dls_devnet_rele_tmp(ddp); } return (0); @@ -526,6 +533,27 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = getzoneid(); + + if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, + sizeof (retval))) == 0) { + *linkid = retval.lr_linkid; + } + return (err); +} + +int +dls_mgmt_get_linkid_in_zone(const char *link, datalink_id_t *linkid, + zoneid_t zid) +{ + dlmgmt_door_getlinkid_t getlinkid; + dlmgmt_getlinkid_retval_t retval; + int err; + + ASSERT(getzoneid() == GLOBAL_ZONEID || zid == getzoneid()); + getlinkid.ld_cmd = DLMGMT_CMD_GETLINKID; + (void) strlcpy(getlinkid.ld_link, link, MAXLINKNAMELEN); + getlinkid.ld_zoneid = zid; if ((err = i_dls_mgmt_upcall(&getlinkid, sizeof (getlinkid), &retval, sizeof (retval))) == 0) { @@ -534,6 +562,7 @@ dls_mgmt_get_linkid(const char *link, datalink_id_t *linkid) return (err); } + datalink_id_t dls_mgmt_get_next(datalink_id_t linkid, datalink_class_t class, datalink_media_t dmedia, uint32_t flags) @@ -740,12 +769,23 @@ dls_devnet_stat_update(kstat_t *ksp, int rw) * Create the "link" kstats. */ static void -dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid) +dls_devnet_stat_create(dls_devnet_t *ddp, zoneid_t zoneid, zoneid_t newzoneid) { kstat_t *ksp; + char *nm; + char kname[MAXLINKNAMELEN]; + + if (zoneid != newzoneid) { + ASSERT(zoneid == GLOBAL_ZONEID); + (void) snprintf(kname, sizeof (kname), "z%d_%s", newzoneid, + ddp->dd_linkname); + nm = kname; + } else { + nm = ddp->dd_linkname; + } - if (dls_stat_create("link", 0, ddp->dd_linkname, zoneid, - dls_devnet_stat_update, ddp, &ksp) == 0) { + if (dls_stat_create("link", 0, nm, zoneid, + dls_devnet_stat_update, ddp, &ksp, newzoneid) == 0) { ASSERT(ksp != NULL); if (zoneid == ddp->dd_owner_zid) { ASSERT(ddp->dd_ksp == NULL); @@ -765,12 +805,12 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) { if (zoneid == ddp->dd_owner_zid) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } } else { if (ddp->dd_zone_ksp != NULL) { - kstat_delete(ddp->dd_zone_ksp); + dls_stat_delete(ddp->dd_zone_ksp); ddp->dd_zone_ksp = NULL; } } @@ -781,15 +821,25 @@ dls_devnet_stat_destroy(dls_devnet_t *ddp, zoneid_t zoneid) * and create the new set using the new name. */ static void -dls_devnet_stat_rename(dls_devnet_t *ddp) +dls_devnet_stat_rename(dls_devnet_t *ddp, boolean_t zoneinit) { if (ddp->dd_ksp != NULL) { - kstat_delete(ddp->dd_ksp); + dls_stat_delete(ddp->dd_ksp); ddp->dd_ksp = NULL; } - /* We can't rename a link while it's assigned to a non-global zone. */ + if (zoneinit && ddp->dd_zone_ksp != NULL) { + dls_stat_delete(ddp->dd_zone_ksp); + ddp->dd_zone_ksp = NULL; + } + /* + * We can't rename a link while it's assigned to a non-global zone + * unless we're first initializing the zone while readying it. + */ ASSERT(ddp->dd_zone_ksp == NULL); - dls_devnet_stat_create(ddp, ddp->dd_owner_zid); + dls_devnet_stat_create(ddp, ddp->dd_owner_zid, + (zoneinit ? 
ddp->dd_zid : ddp->dd_owner_zid));
+	if (zoneinit)
+		dls_devnet_stat_create(ddp, ddp->dd_zid, ddp->dd_zid);
 }
 
 /*
@@ -878,7 +928,8 @@ done:
 	rw_exit(&i_dls_devnet_lock);
 	if (err == 0) {
 		if (zoneid != GLOBAL_ZONEID &&
-		    (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE)) != 0)
+		    (err = i_dls_devnet_setzid(ddp, zoneid, B_FALSE,
+		    B_FALSE)) != 0)
 			(void) dls_devnet_unset(macname, &linkid, B_TRUE);
 		/*
 		 * The kstat subsystem holds its own locks (rather perimeter)
 		 * lock hierarchy is kstat locks -> i_dls_devnet_lock.
 		 */
 		if (stat_create)
-			dls_devnet_stat_create(ddp, zoneid);
+			dls_devnet_stat_create(ddp, zoneid, zoneid);
 		if (ddpp != NULL)
 			*ddpp = ddp;
 	}
@@ -924,17 +975,78 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
 	ASSERT(ddp->dd_ref != 0);
 	if ((ddp->dd_ref != 1) || (!wait &&
 	    (ddp->dd_tref != 0 || ddp->dd_prop_taskid != NULL))) {
-		mutex_exit(&ddp->dd_mutex);
-		rw_exit(&i_dls_devnet_lock);
-		return (EBUSY);
+		int zstatus = 0;
+
+		/*
+		 * There are a couple of alternatives that might be going on
+		 * here; a) the zone is shutting down and it has a transient
+		 * link assigned, in which case we want to clean it up instead
+		 * of moving it back to the global zone, or b) it's possible
+		 * that we're trying to clean up an orphaned vnic that was
+		 * delegated to a zone and which wasn't cleaned up properly
+		 * when the zone went away.  Check for either of these cases
+		 * before we simply return EBUSY.
+		 *
+		 * zstatus indicates which situation we are dealing with:
+		 *	 0 - means return EBUSY
+		 *	 1 - means case (a), clean up transient link
+		 *	-1 - means case (b), orphaned VNIC
+		 */
+		if (ddp->dd_ref > 1 && ddp->dd_zid != GLOBAL_ZONEID) {
+			zone_t	*zp;
+
+			if ((zp = zone_find_by_id(ddp->dd_zid)) == NULL) {
+				zstatus = -1;
+			} else {
+				if (ddp->dd_transient) {
+					zone_status_t s = zone_status_get(zp);
+
+					if (s >= ZONE_IS_SHUTTING_DOWN)
+						zstatus = 1;
+				}
+				zone_rele(zp);
+			}
+		}
+
+		if (zstatus == 0) {
+			mutex_exit(&ddp->dd_mutex);
+			rw_exit(&i_dls_devnet_lock);
+			return (EBUSY);
+		}
+
+		/*
+		 * We want to delete the link, so reset the ref count to 1.
+		 */
+		if (zstatus == -1)
+			/* Log a warning, but continue in this case */
+			cmn_err(CE_WARN, "clear orphaned datalink: %s\n",
+			    ddp->dd_linkname);
+		ddp->dd_ref = 1;
 	}
 
 	ddp->dd_flags |= DD_CONDEMNED;
 	ddp->dd_ref--;
 	*id = ddp->dd_linkid;
 
-	if (ddp->dd_zid != GLOBAL_ZONEID)
-		(void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE);
+	if (ddp->dd_zid != GLOBAL_ZONEID) {
+		/*
+		 * We need to release the dd_mutex before we try to destroy
+		 * the stat.  When we destroy it, we'll need to grab the lock
+		 * for the kstat, but if there's a concurrent reader of the
+		 * kstat, we'll be blocked on it.  This will lead to deadlock
+		 * because these kstats employ a ks_update function
+		 * (dls_devnet_stat_update) which needs the dd_mutex that we
+		 * currently hold.
+		 *
+		 * Because we've already flagged the dls_devnet_t as
+		 * DD_CONDEMNED and we still have a write lock on
+		 * i_dls_devnet_lock, we should be able to release the
+		 * dd_mutex.
+		 */
+		mutex_exit(&ddp->dd_mutex);
+		dls_devnet_stat_destroy(ddp, ddp->dd_zid);
+		mutex_enter(&ddp->dd_mutex);
+		(void) i_dls_devnet_setzid(ddp, GLOBAL_ZONEID, B_FALSE,
+		    B_FALSE);
+	}
 
 	/*
	 * Remove this dls_devnet_t from the hash table.
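
The block above encodes a lock-ordering rule that is easy to get wrong: dd_mutex must not be held across the kstat deletion, because the kstat's own ks_update callback (dls_devnet_stat_update) re-acquires dd_mutex while the kstat framework's lock is held. The following standalone sketch models that rule with POSIX threads standing in for kernel mutexes and the kstat framework; every name in it is hypothetical, and it only illustrates why the unlock/delete/relock dance avoids the deadlock.

/*
 * Userland model of the deadlock described above, assuming two locks:
 * dd_mutex (the devnet lock) and kstat_lock (stands in for the lock the
 * kstat framework holds around ks_update and kstat_delete).
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dd_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t kstat_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stands in for dls_devnet_stat_update(): called with kstat_lock held. */
static void
stat_update(void)
{
	(void) pthread_mutex_lock(&dd_mutex);
	/* ... snapshot the counters ... */
	(void) pthread_mutex_unlock(&dd_mutex);
}

/* Stands in for a concurrent kstat reader. */
static void *
reader(void *arg)
{
	(void) pthread_mutex_lock(&kstat_lock);
	stat_update();
	(void) pthread_mutex_unlock(&kstat_lock);
	return (arg);
}

/*
 * Stands in for the dls_devnet_unset() path above: the caller holds
 * dd_mutex, but we must drop it before taking kstat_lock (the
 * kstat_delete() analogue); otherwise a reader blocked in stat_update()
 * waiting for dd_mutex would deadlock with us.
 */
static void
destroy_stat(void)
{
	(void) pthread_mutex_unlock(&dd_mutex);
	(void) pthread_mutex_lock(&kstat_lock);		/* kstat_delete() */
	(void) pthread_mutex_unlock(&kstat_lock);
	(void) pthread_mutex_lock(&dd_mutex);
}

int
main(void)
{
	pthread_t t;

	(void) pthread_create(&t, NULL, reader, NULL);
	(void) pthread_mutex_lock(&dd_mutex);
	destroy_stat();
	(void) pthread_mutex_unlock(&dd_mutex);
	(void) pthread_join(t, NULL);
	(void) puts("no deadlock");
	return (0);
}
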
@@ -960,8 +1072,15 @@ dls_devnet_unset(const char *macname, datalink_id_t *id, boolean_t wait)
 		ASSERT(ddp->dd_tref == 0 && ddp->dd_prop_taskid == NULL);
 	}
 
-	if (ddp->dd_linkid != DATALINK_INVALID_LINKID)
+	if (ddp->dd_linkid != DATALINK_INVALID_LINKID) {
+		/*
+		 * See the earlier call in this function for an explanation.
+		 */
+		mutex_exit(&ddp->dd_mutex);
 		dls_devnet_stat_destroy(ddp, ddp->dd_owner_zid);
+		mutex_enter(&ddp->dd_mutex);
+	}
+
 	ddp->dd_prop_loaded = B_FALSE;
 	ddp->dd_linkid = DATALINK_INVALID_LINKID;
@@ -1111,7 +1230,7 @@ dls_devnet_rele(dls_devnet_t *ddp)
 }
 
 static int
-dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
+dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp, zoneid_t zid)
 {
 	char			drv[MAXLINKNAMELEN];
 	uint_t			ppa;
@@ -1121,7 +1240,7 @@ dls_devnet_hold_by_name(const char *link, dls_devnet_t **ddpp)
 	dls_dev_handle_t	ddh;
 	int			err;
 
-	if ((err = dls_mgmt_get_linkid(link, &linkid)) == 0)
+	if ((err = dls_mgmt_get_linkid_in_zone(link, &linkid, zid)) == 0)
 		return (dls_devnet_hold(linkid, ddpp));
 
 	/*
@@ -1261,9 +1380,15 @@ dls_devnet_phydev(datalink_id_t vlanid, dev_t *devp)
  *
  * This case does not change the <link name, linkid> mapping, so the link's
  * kstats need to be updated using the name associated with the given id2.
+ *
+ * The zoneinit parameter is used to allow us to create a VNIC in the global
+ * zone which is assigned to a non-global zone.  Since there is a race
+ * condition in the create process if two VNICs have the same name, we need
+ * to rename it after it has been assigned to the zone.
  */
 int
-dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
+dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link,
+    boolean_t zoneinit)
 {
 	dls_dev_handle_t	ddh = NULL;
 	int			err = 0;
@@ -1313,13 +1438,16 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link)
 	 * is currently accessing the link kstats, or if the link is on-loan
 	 * to a non-global zone.  Then set the DD_KSTAT_CHANGING flag to
 	 * prevent any access to the kstats while we delete and recreate
-	 * kstats below.
+	 * kstats below.  However, we skip this check if we're renaming the
+	 * vnic as part of bringing it up for a zone.
*/ mutex_enter(&ddp->dd_mutex); - if (ddp->dd_ref > 1) { - mutex_exit(&ddp->dd_mutex); - err = EBUSY; - goto done; + if (!zoneinit) { + if (ddp->dd_ref > 1) { + mutex_exit(&ddp->dd_mutex); + err = EBUSY; + goto done; + } } ddp->dd_flags |= DD_KSTAT_CHANGING; @@ -1333,7 +1461,15 @@ dls_devnet_rename(datalink_id_t id1, datalink_id_t id2, const char *link) /* rename mac client name and its flow if exists */ if ((err = mac_open(ddp->dd_mac, &mh)) != 0) goto done; - (void) mac_rename_primary(mh, link); + if (zoneinit) { + char tname[MAXLINKNAMELEN]; + + (void) snprintf(tname, sizeof (tname), "z%d_%s", + ddp->dd_zid, link); + (void) mac_rename_primary(mh, tname); + } else { + (void) mac_rename_primary(mh, link); + } mac_close(mh); goto done; } @@ -1406,7 +1542,7 @@ done: */ rw_exit(&i_dls_devnet_lock); if (err == 0) - dls_devnet_stat_rename(ddp); + dls_devnet_stat_rename(ddp, zoneinit); if (clear_dd_flag) { mutex_enter(&ddp->dd_mutex); @@ -1421,7 +1557,8 @@ done: } static int -i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) +i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop, + boolean_t transient) { int err; mac_perim_handle_t mph; @@ -1454,6 +1591,7 @@ i_dls_devnet_setzid(dls_devnet_t *ddp, zoneid_t new_zoneid, boolean_t setprop) } if ((err = dls_link_setzid(ddp->dd_mac, new_zoneid)) == 0) { ddp->dd_zid = new_zoneid; + ddp->dd_transient = transient; devnet_need_rebuild = B_TRUE; } @@ -1468,7 +1606,7 @@ done: } int -dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) +dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid, boolean_t transient) { dls_devnet_t *ddp; int err; @@ -1490,7 +1628,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) refheld = B_TRUE; } - if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE)) != 0) { + if ((err = i_dls_devnet_setzid(ddh, new_zid, B_TRUE, transient)) != 0) { if (refheld) dls_devnet_rele(ddp); return (err); @@ -1507,7 +1645,7 @@ dls_devnet_setzid(dls_dl_handle_t ddh, zoneid_t new_zid) if (old_zid != GLOBAL_ZONEID) dls_devnet_stat_destroy(ddh, old_zid); if (new_zid != GLOBAL_ZONEID) - dls_devnet_stat_create(ddh, new_zid); + dls_devnet_stat_create(ddh, new_zid, new_zid); return (0); } @@ -1545,15 +1683,19 @@ dls_devnet_islinkvisible(datalink_id_t linkid, zoneid_t zoneid) * Access a vanity naming node. */ int -dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +dls_devnet_open_in_zone(const char *link, dls_dl_handle_t *dhp, dev_t *devp, + zoneid_t zid) { dls_devnet_t *ddp; dls_link_t *dlp; - zoneid_t zid = getzoneid(); + zoneid_t czid = getzoneid(); int err; mac_perim_handle_t mph; - if ((err = dls_devnet_hold_by_name(link, &ddp)) != 0) + if (czid != GLOBAL_ZONEID && czid != zid) + return (ENOENT); + + if ((err = dls_devnet_hold_by_name(link, &ddp, zid)) != 0) return (err); dls_devnet_prop_task_wait(ddp); @@ -1586,6 +1728,12 @@ dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) return (0); } +int +dls_devnet_open(const char *link, dls_dl_handle_t *dhp, dev_t *devp) +{ + return (dls_devnet_open_in_zone(link, dhp, devp, getzoneid())); +} + /* * Close access to a vanity naming node. */ diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index 51e4be7260..82dceff278 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c +++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. 
All rights reserved. */ /* @@ -30,30 +31,33 @@ #include <sys/dld_impl.h> #include <sys/mac_ether.h> -static mac_stat_info_t i_dls_si[] = { - { MAC_STAT_IFSPEED, "ifspeed", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_MULTIRCV, "multircv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTRCV, "brdcstrcv", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_MULTIXMT, "multixmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_BRDCSTXMT, "brdcstxmt", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NORCVBUF, "norcvbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IERRORS, "ierrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_NOXMTBUF, "noxmtbuf", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OERRORS, "oerrors", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_COLLISIONS, "collisions", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_IPACKETS, "ipackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OBYTES, "obytes", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_OPACKETS, "opackets", KSTAT_DATA_UINT32, 0 }, - { MAC_STAT_RBYTES, "rbytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_IPACKETS, "ipackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OBYTES, "obytes64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_OPACKETS, "opackets64", KSTAT_DATA_UINT64, 0 }, - { MAC_STAT_LINK_STATE, "link_state", KSTAT_DATA_UINT32, - (uint64_t)LINK_STATE_UNKNOWN} -}; - -#define STAT_INFO_COUNT (sizeof (i_dls_si) / sizeof (i_dls_si[0])) +/* + * structure for link kstats + */ +typedef struct { + kstat_named_t dk_ifspeed; + kstat_named_t dk_multircv; + kstat_named_t dk_brdcstrcv; + kstat_named_t dk_multixmt; + kstat_named_t dk_brdcstxmt; + kstat_named_t dk_norcvbuf; + kstat_named_t dk_ierrors; + kstat_named_t dk_noxmtbuf; + kstat_named_t dk_oerrors; + kstat_named_t dk_collisions; + kstat_named_t dk_rbytes; + kstat_named_t dk_ipackets; + kstat_named_t dk_obytes; + kstat_named_t dk_opackets; + kstat_named_t dk_rbytes64; + kstat_named_t dk_ipackets64; + kstat_named_t dk_obytes64; + kstat_named_t dk_opackets64; + kstat_named_t dk_link_state; + kstat_named_t dk_link_duplex; + kstat_named_t dk_unknowns; + kstat_named_t dk_zonename; +} dls_kstat_t; /* * Exported functions. 
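
Two consequences of switching to this fixed struct show up in the dls_stat_create() hunk below: the number of named entries falls out of sizeof arithmetic, and the trailing KSTAT_DATA_STRING member (dk_zonename) must add its payload to ks_data_size. Here is a userland sketch of that arithmetic; the kstat_named_t below is a deliberately simplified mock, and the three-field struct is a trimmed stand-in for dls_kstat_t (assumptions for illustration, not the real <sys/kstat.h> definitions).

#include <stdio.h>
#include <string.h>

/* Mock of kstat_named_t: a fixed-size name plus a value union. */
typedef struct {
	char	name[31];
	union {
		unsigned int		ui32;
		unsigned long long	ui64;
		const char		*str;
	} value;
} kstat_named_t;

/* Trimmed stand-in for dls_kstat_t. */
typedef struct {
	kstat_named_t	dk_ipackets64;
	kstat_named_t	dk_opackets64;
	kstat_named_t	dk_zonename;
} mini_dls_kstat_t;

int
main(void)
{
	mini_dls_kstat_t dk;
	const char *zonename = "myzone";

	/* ndata for kstat_create_zone(): one slot per struct member. */
	size_t ndata = sizeof (dk) / sizeof (kstat_named_t);

	/* ks_data_size: the struct itself plus the string stat's payload. */
	size_t data_size = sizeof (dk) + strlen(zonename) + 1;

	(void) strcpy(dk.dk_zonename.name, "zonename");
	dk.dk_zonename.value.str = zonename;

	(void) printf("ndata = %zu, ks_data_size = %zu\n", ndata, data_size);
	return (0);
}
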
@@ -61,42 +65,54 @@ static mac_stat_info_t i_dls_si[] = { int dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) { - kstat_named_t *knp; - uint_t i; - uint64_t val; + dls_kstat_t *dkp = ksp->ks_data; if (rw != KSTAT_READ) return (EACCES); - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - val = mac_stat_get(dlp->dl_mh, i_dls_si[i].msi_stat); - - switch (i_dls_si[i].msi_type) { - case KSTAT_DATA_UINT64: - knp->value.ui64 = val; - break; - case KSTAT_DATA_UINT32: - knp->value.ui32 = (uint32_t)val; - break; - default: - ASSERT(B_FALSE); - } - - knp++; - } + dkp->dk_ifspeed.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_IFSPEED); + dkp->dk_multircv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIRCV); + dkp->dk_brdcstrcv.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTRCV); + dkp->dk_multixmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_MULTIXMT); + dkp->dk_brdcstxmt.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_BRDCSTXMT); + dkp->dk_norcvbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NORCVBUF); + dkp->dk_ierrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_IERRORS); + dkp->dk_noxmtbuf.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_NOXMTBUF); + dkp->dk_oerrors.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OERRORS); + dkp->dk_collisions.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_COLLISIONS); + dkp->dk_rbytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes.value.ui32 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_rbytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_RBYTES); + dkp->dk_ipackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_IPACKETS); + dkp->dk_obytes64.value.ui64 = mac_stat_get(dlp->dl_mh, MAC_STAT_OBYTES); + dkp->dk_opackets64.value.ui64 = mac_stat_get(dlp->dl_mh, + MAC_STAT_OPACKETS); + dkp->dk_link_state.value.ui32 = mac_stat_get(dlp->dl_mh, + MAC_STAT_LINK_STATE); /* * Ethernet specific kstat "link_duplex" */ if (dlp->dl_mip->mi_nativemedia != DL_ETHER) { - knp->value.ui32 = LINK_DUPLEX_UNKNOWN; + dkp->dk_link_duplex.value.ui32 = LINK_DUPLEX_UNKNOWN; } else { - val = mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); - knp->value.ui32 = (uint32_t)val; + dkp->dk_link_duplex.value.ui32 = + (uint32_t)mac_stat_get(dlp->dl_mh, ETHER_STAT_LINK_DUPLEX); } - knp++; - knp->value.ui32 = dlp->dl_unknowns; + + dkp->dk_unknowns.value.ui32 = dlp->dl_unknowns; return (0); } @@ -104,30 +120,66 @@ dls_stat_update(kstat_t *ksp, dls_link_t *dlp, int rw) int dls_stat_create(const char *module, int instance, const char *name, zoneid_t zoneid, int (*update)(struct kstat *, int), void *private, - kstat_t **kspp) + kstat_t **kspp, zoneid_t newzoneid) { kstat_t *ksp; - kstat_named_t *knp; - uint_t i; + zone_t *zone; + dls_kstat_t *dkp; if ((ksp = kstat_create_zone(module, instance, name, "net", - KSTAT_TYPE_NAMED, STAT_INFO_COUNT + 2, 0, zoneid)) == NULL) { + KSTAT_TYPE_NAMED, sizeof (dls_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zoneid)) == NULL) { return (EINVAL); } ksp->ks_update = update; ksp->ks_private = private; + dkp = ksp->ks_data = kmem_zalloc(sizeof (dls_kstat_t), KM_SLEEP); + if ((zone = zone_find_by_id(newzoneid)) != NULL) { + ksp->ks_data_size += strlen(zone->zone_name) + 1; + } - knp = (kstat_named_t *)ksp->ks_data; - for (i = 0; i < STAT_INFO_COUNT; i++) { - kstat_named_init(knp, i_dls_si[i].msi_name, - 
i_dls_si[i].msi_type); - knp++; + kstat_named_init(&dkp->dk_ifspeed, "ifspeed", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_multircv, "multircv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstrcv, "brdcstrcv", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_multixmt, "multixmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_brdcstxmt, "brdcstxmt", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_norcvbuf, "norcvbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ierrors, "ierrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_noxmtbuf, "noxmtbuf", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_oerrors, "oerrors", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_collisions, "collisions", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes, "rbytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_ipackets, "ipackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_obytes, "obytes", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_opackets, "opackets", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_rbytes64, "rbytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_ipackets64, "ipackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_obytes64, "obytes64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_opackets64, "opackets64", KSTAT_DATA_UINT64); + kstat_named_init(&dkp->dk_link_state, "link_state", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_unknowns, "unknowns", KSTAT_DATA_UINT32); + kstat_named_init(&dkp->dk_zonename, "zonename", KSTAT_DATA_STRING); + + if (zone != NULL) { + kstat_named_setstr(&dkp->dk_zonename, zone->zone_name); + zone_rele(zone); } - kstat_named_init(knp++, "link_duplex", KSTAT_DATA_UINT32); - kstat_named_init(knp, "unknowns", KSTAT_DATA_UINT32); kstat_install(ksp); *kspp = ksp; return (0); } + +void +dls_stat_delete(kstat_t *ksp) +{ + void *data; + if (ksp != NULL) { + data = ksp->ks_data; + kstat_delete(ksp); + kmem_free(data, sizeof (dls_kstat_t)); + } +} diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE new file mode 100644 index 0000000000..00aefb6f51 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE @@ -0,0 +1,32 @@ +/* + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ diff --git a/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..ac6d2d1b15 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +DR_SAS DRIVER diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.c b/usr/src/uts/common/io/dr_sas/dr_sas.c new file mode 100644 index 0000000000..5b1dc82938 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.c @@ -0,0 +1,5506 @@ +/* + * dr_sas.c: source for dr_sas driver + * + * MegaRAID device driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Version: + * Author: + * Arun Chandrashekhar + * Manju R + * Rajesh Prabhakaran + * Seokmann Ju + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/file.h> +#include <sys/errno.h> +#include <sys/open.h> +#include <sys/cred.h> +#include <sys/modctl.h> +#include <sys/conf.h> +#include <sys/devops.h> +#include <sys/cmn_err.h> +#include <sys/kmem.h> +#include <sys/stat.h> +#include <sys/mkdev.h> +#include <sys/pci.h> +#include <sys/scsi/scsi.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/atomic.h> +#include <sys/signal.h> +#include <sys/fs/dv_node.h> /* devfs_clean */ + +#include "dr_sas.h" + +/* + * FMA header files + */ +#include <sys/ddifm.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/fm/io/ddi.h> + +/* + * Local static data + */ +static void *drsas_state = NULL; +static int debug_level_g = CL_NONE; + +#pragma weak scsi_hba_open +#pragma weak scsi_hba_close +#pragma weak scsi_hba_ioctl + +static ddi_dma_attr_t drsas_generic_dma_attr = { + DMA_ATTR_V0, /* dma_attr_version */ + 0, /* low DMA address range */ + 0xFFFFFFFFU, /* high DMA address range */ + 0xFFFFFFFFU, /* DMA counter register */ + 8, /* DMA address alignment */ + 0x07, /* DMA burstsizes */ + 1, /* min DMA size */ + 0xFFFFFFFFU, /* max DMA size */ + 0xFFFFFFFFU, /* segment boundary */ + DRSAS_MAX_SGE_CNT, /* dma_attr_sglen */ + 512, /* granularity of device */ + 0 /* bus specific DMA flags */ +}; + +int32_t drsas_max_cap_maxxfer = 0x1000000; + +/* + * cb_ops contains base level routines + */ +static struct cb_ops drsas_cb_ops = { + drsas_open, /* open */ + drsas_close, /* close */ + nodev, /* strategy */ + nodev, /* print */ + nodev, /* dump */ + nodev, /* read */ + nodev, /* write */ + drsas_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + nochpoll, /* poll */ + nodev, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_HOTPLUG, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev /* cb_awrite */ +}; + +/* + * dev_ops contains configuration routines + */ +static struct dev_ops drsas_ops = { + DEVO_REV, /* rev, */ + 0, /* refcnt */ + drsas_getinfo, /* getinfo */ + nulldev, /* identify */ + nulldev, /* probe */ + drsas_attach, /* attach */ + drsas_detach, /* detach */ + drsas_reset, /* reset */ + &drsas_cb_ops, /* char/block ops */ + NULL, /* bus ops */ + NULL, /* power */ + ddi_quiesce_not_supported, /* quiesce */ +}; + +char _depends_on[] = "misc/scsi"; + +static struct modldrv modldrv = { + &mod_driverops, /* module type - driver */ + DRSAS_VERSION, + &drsas_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* ml_rev - must be MODREV_1 */ + &modldrv, /* ml_linkage */ + NULL /* end of driver linkage */ +}; + +static struct ddi_device_acc_attr endian_attr = { + DDI_DEVICE_ATTR_V0, + DDI_STRUCTURE_LE_ACC, + DDI_STRICTORDER_ACC +}; + + +/* + * ************************************************************************** * + * * + * common entry points - for loadable kernel modules * + * * + * ************************************************************************** * + */ + +int +_init(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ret = ddi_soft_state_init(&drsas_state, + sizeof (struct drsas_instance), 0); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init state")); + return (ret); + } + + if ((ret = scsi_hba_init(&modlinkage)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: could not init scsi hba")); + ddi_soft_state_fini(&drsas_state); + return (ret); + } + + ret = mod_install(&modlinkage); + + if 
(ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "dr_sas: mod_install failed")); + scsi_hba_fini(&modlinkage); + ddi_soft_state_fini(&drsas_state); + } + + return (ret); +} + +int +_info(struct modinfo *modinfop) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + int ret; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if ((ret = mod_remove(&modlinkage)) != DDI_SUCCESS) + return (ret); + + scsi_hba_fini(&modlinkage); + + ddi_soft_state_fini(&drsas_state); + + return (ret); +} + + +/* + * ************************************************************************** * + * * + * common entry points - for autoconfiguration * + * * + * ************************************************************************** * + */ + +static int +drsas_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int instance_no; + int nregs; + uint8_t added_isr_f = 0; + uint8_t added_soft_isr_f = 0; + uint8_t create_devctl_node_f = 0; + uint8_t create_scsi_node_f = 0; + uint8_t create_ioc_node_f = 0; + uint8_t tran_alloc_f = 0; + uint8_t irq; + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + uint16_t command; + off_t reglength = 0; + int intr_types = 0; + char *data; + int msi_enable = 0; + + scsi_hba_tran_t *tran; + ddi_dma_attr_t tran_dma_attr; + struct drsas_instance *instance; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* CONSTCOND */ + ASSERT(NO_COMPETING_THREADS); + + instance_no = ddi_get_instance(dip); + + /* + * check to see whether this device is in a DMA-capable slot. + */ + if (ddi_slaveonly(dip) == DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Device in slave-only slot, unused", + instance_no)); + return (DDI_FAILURE); + } + + switch (cmd) { + case DDI_ATTACH: + con_log(CL_DLEVEL1, (CE_NOTE, "dr_sas: DDI_ATTACH")); + /* allocate the soft state for the instance */ + if (ddi_soft_state_zalloc(drsas_state, instance_no) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to allocate soft state", + instance_no)); + + return (DDI_FAILURE); + } + + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + if (instance == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Bad soft state", instance_no)); + + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + bzero((caddr_t)instance, + sizeof (struct drsas_instance)); + + instance->func_ptr = kmem_zalloc( + sizeof (struct drsas_func_ptr), KM_SLEEP); + ASSERT(instance->func_ptr); + + /* Setup the PCI configuration space handles */ + if (pci_config_setup(dip, &instance->pci_handle) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: pci config setup failed ", + instance_no)); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to get registers.")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + vendor_id = pci_config_get16(instance->pci_handle, + PCI_CONF_VENID); + device_id = pci_config_get16(instance->pci_handle, + PCI_CONF_DEVID); + + subsysvid = pci_config_get16(instance->pci_handle, + PCI_CONF_SUBVENID); + subsysid = 
pci_config_get16(instance->pci_handle, + PCI_CONF_SUBSYSID); + + pci_config_put16(instance->pci_handle, PCI_CONF_COMM, + (pci_config_get16(instance->pci_handle, + PCI_CONF_COMM) | PCI_COMM_ME)); + irq = pci_config_get8(instance->pci_handle, + PCI_CONF_ILINE); + + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "0x%x:0x%x 0x%x:0x%x, irq:%d drv-ver:%s", + instance_no, vendor_id, device_id, subsysvid, + subsysid, irq, DRSAS_VERSION)); + + /* enable bus-mastering */ + command = pci_config_get16(instance->pci_handle, + PCI_CONF_COMM); + + if (!(command & PCI_COMM_ME)) { + command |= PCI_COMM_ME; + + pci_config_put16(instance->pci_handle, + PCI_CONF_COMM, command); + + con_log(CL_ANN, (CE_CONT, "dr_sas%d: " + "enable bus-mastering", instance_no)); + } else { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "bus-mastering already set", instance_no)); + } + + /* initialize function pointers */ + if ((device_id == PCI_DEVICE_ID_LSI_2108VDE) || + (device_id == PCI_DEVICE_ID_LSI_2108V)) { + con_log(CL_DLEVEL1, (CE_CONT, "dr_sas%d: " + "2108V/DE detected", instance_no)); + instance->func_ptr->read_fw_status_reg = + read_fw_status_reg_ppc; + instance->func_ptr->issue_cmd = issue_cmd_ppc; + instance->func_ptr->issue_cmd_in_sync_mode = + issue_cmd_in_sync_mode_ppc; + instance->func_ptr->issue_cmd_in_poll_mode = + issue_cmd_in_poll_mode_ppc; + instance->func_ptr->enable_intr = + enable_intr_ppc; + instance->func_ptr->disable_intr = + disable_intr_ppc; + instance->func_ptr->intr_ack = intr_ack_ppc; + } else { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Invalid device detected")); + + pci_config_teardown(&instance->pci_handle); + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + ddi_soft_state_free(drsas_state, instance_no); + + return (DDI_FAILURE); + } + + instance->baseaddress = pci_config_get32( + instance->pci_handle, PCI_CONF_BASE0); + instance->baseaddress &= 0x0fffc; + + instance->dip = dip; + instance->vendor_id = vendor_id; + instance->device_id = device_id; + instance->subsysvid = subsysvid; + instance->subsysid = subsysid; + instance->instance = instance_no; + + /* Initialize FMA */ + instance->fm_capabilities = ddi_prop_get_int( + DDI_DEV_T_ANY, instance->dip, DDI_PROP_DONTPASS, + "fm-capable", DDI_FM_EREPORT_CAPABLE | + DDI_FM_ACCCHK_CAPABLE | DDI_FM_DMACHK_CAPABLE + | DDI_FM_ERRCB_CAPABLE); + + drsas_fm_init(instance); + + /* Initialize Interrupts */ + if ((ddi_dev_regsize(instance->dip, + REGISTER_SET_IO_2108, ®length) != DDI_SUCCESS) || + reglength < MINIMUM_MFI_MEM_SZ) { + return (DDI_FAILURE); + } + if (reglength > DEFAULT_MFI_MEM_SZ) { + reglength = DEFAULT_MFI_MEM_SZ; + con_log(CL_DLEVEL1, (CE_NOTE, + "dr_sas: register length to map is " + "0x%lx bytes", reglength)); + } + if (ddi_regs_map_setup(instance->dip, + REGISTER_SET_IO_2108, &instance->regmap, 0, + reglength, &endian_attr, &instance->regmap_handle) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: couldn't map control registers")); + goto fail_attach; + } + + /* + * Disable Interrupt Now. 
+			 * Setup Software interrupt
+			 */
+			instance->func_ptr->disable_intr(instance);
+
+			msi_enable = 0;
+			if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, 0,
+			    "drsas-enable-msi", &data) == DDI_SUCCESS) {
+				if (strncmp(data, "yes", 3) == 0) {
+					msi_enable = 1;
+					con_log(CL_ANN, (CE_WARN,
+					    "msi_enable = %d ENABLED",
+					    msi_enable));
+				}
+				ddi_prop_free(data);
+			}
+
+			con_log(CL_DLEVEL1, (CE_WARN, "msi_enable = %d",
+			    msi_enable));
+
+			/* Check for all supported interrupt types */
+			if (ddi_intr_get_supported_types(
+			    dip, &intr_types) != DDI_SUCCESS) {
+				con_log(CL_ANN, (CE_WARN,
+				    "ddi_intr_get_supported_types() failed"));
+				goto fail_attach;
+			}
+
+			con_log(CL_DLEVEL1, (CE_NOTE,
+			    "ddi_intr_get_supported_types() ret: 0x%x",
+			    intr_types));
+
+			/* Initialize and Setup Interrupt handler */
+			if (msi_enable && (intr_types & DDI_INTR_TYPE_MSIX)) {
+				if (drsas_add_intrs(instance,
+				    DDI_INTR_TYPE_MSIX) != DDI_SUCCESS) {
+					con_log(CL_ANN, (CE_WARN,
+					    "MSIX interrupt query failed"));
+					goto fail_attach;
+				}
+				instance->intr_type = DDI_INTR_TYPE_MSIX;
+			} else if (msi_enable && (intr_types &
+			    DDI_INTR_TYPE_MSI)) {
+				if (drsas_add_intrs(instance,
+				    DDI_INTR_TYPE_MSI) != DDI_SUCCESS) {
+					con_log(CL_ANN, (CE_WARN,
+					    "MSI interrupt query failed"));
+					goto fail_attach;
+				}
+				instance->intr_type = DDI_INTR_TYPE_MSI;
+			} else if (intr_types & DDI_INTR_TYPE_FIXED) {
+				msi_enable = 0;
+				if (drsas_add_intrs(instance,
+				    DDI_INTR_TYPE_FIXED) != DDI_SUCCESS) {
+					con_log(CL_ANN, (CE_WARN,
+					    "FIXED interrupt query failed"));
+					goto fail_attach;
+				}
+				instance->intr_type = DDI_INTR_TYPE_FIXED;
+			} else {
+				con_log(CL_ANN, (CE_WARN, "Device cannot "
+				    "support either FIXED or MSI/X "
+				    "interrupts"));
+				goto fail_attach;
+			}
+
+			added_isr_f = 1;
+
+			/* setup the mfi based low level driver */
+			if (init_mfi(instance) != DDI_SUCCESS) {
+				con_log(CL_ANN, (CE_WARN, "dr_sas: "
+				    "could not initialize the low level driver"));
+
+				goto fail_attach;
+			}
+
+			/* Initialize all mutexes */
+			INIT_LIST_HEAD(&instance->completed_pool_list);
+			mutex_init(&instance->completed_pool_mtx,
+			    "completed_pool_mtx", MUTEX_DRIVER,
+			    DDI_INTR_PRI(instance->intr_pri));
+
+			mutex_init(&instance->int_cmd_mtx, "int_cmd_mtx",
+			    MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri));
+			cv_init(&instance->int_cmd_cv, NULL, CV_DRIVER, NULL);
+
+			mutex_init(&instance->cmd_pool_mtx, "cmd_pool_mtx",
+			    MUTEX_DRIVER, DDI_INTR_PRI(instance->intr_pri));
+
+			/* Register our soft-isr for highlevel interrupts.
*/ + instance->isr_level = instance->intr_pri; + if (instance->isr_level == HIGH_LEVEL_INTR) { + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, + &instance->soft_intr_id, NULL, NULL, + drsas_softintr, (caddr_t)instance) != + DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + " Software ISR did not register")); + + goto fail_attach; + } + + added_soft_isr_f = 1; + } + + /* Allocate a transport structure */ + tran = scsi_hba_tran_alloc(dip, SCSI_HBA_CANSLEEP); + + if (tran == NULL) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_tran_alloc failed")); + goto fail_attach; + } + + tran_alloc_f = 1; + + instance->tran = tran; + + tran->tran_hba_private = instance; + tran->tran_tgt_init = drsas_tran_tgt_init; + tran->tran_tgt_probe = scsi_hba_probe; + tran->tran_tgt_free = drsas_tran_tgt_free; + tran->tran_init_pkt = drsas_tran_init_pkt; + tran->tran_start = drsas_tran_start; + tran->tran_abort = drsas_tran_abort; + tran->tran_reset = drsas_tran_reset; + tran->tran_getcap = drsas_tran_getcap; + tran->tran_setcap = drsas_tran_setcap; + tran->tran_destroy_pkt = drsas_tran_destroy_pkt; + tran->tran_dmafree = drsas_tran_dmafree; + tran->tran_sync_pkt = drsas_tran_sync_pkt; + tran->tran_bus_config = drsas_tran_bus_config; + + tran_dma_attr = drsas_generic_dma_attr; + tran_dma_attr.dma_attr_sgllen = instance->max_num_sge; + + /* Attach this instance of the hba */ + if (scsi_hba_attach_setup(dip, &tran_dma_attr, tran, 0) + != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, + "scsi_hba_attach failed")); + + goto fail_attach; + } + + /* create devctl node for cfgadm command */ + if (ddi_create_minor_node(dip, "devctl", + S_IFCHR, INST2DEVCTL(instance_no), + DDI_NT_SCSI_NEXUS, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create devctl node.")); + + goto fail_attach; + } + + create_devctl_node_f = 1; + + /* create scsi node for cfgadm command */ + if (ddi_create_minor_node(dip, "scsi", S_IFCHR, + INST2SCSI(instance_no), + DDI_NT_SCSI_ATTACHMENT_POINT, 0) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create scsi node.")); + + goto fail_attach; + } + + create_scsi_node_f = 1; + + (void) sprintf(instance->iocnode, "%d:lsirdctl", + instance_no); + + /* + * Create a node for applications + * for issuing ioctl to the driver. + */ + if (ddi_create_minor_node(dip, instance->iocnode, + S_IFCHR, INST2LSIRDCTL(instance_no), + DDI_PSEUDO, 0) == DDI_FAILURE) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create ioctl node.")); + + goto fail_attach; + } + + create_ioc_node_f = 1; + + /* Create a taskq to handle dr events */ + if ((instance->taskq = ddi_taskq_create(dip, + "drsas_dr_taskq", 1, + TASKQ_DEFAULTPRI, 0)) == NULL) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to create taskq ")); + instance->taskq = NULL; + goto fail_attach; + } + + /* enable interrupt */ + instance->func_ptr->enable_intr(instance); + + /* initiate AEN */ + if (start_mfi_aen(instance)) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: failed to initiate AEN.")); + goto fail_initiate_aen; + } + + con_log(CL_DLEVEL1, (CE_NOTE, + "AEN started for instance %d.", instance_no)); + + /* Finally! We are on the air. 
 */
+		ddi_report_dev(dip);
+
+		if (drsas_check_acc_handle(instance->regmap_handle) !=
+		    DDI_SUCCESS) {
+			goto fail_attach;
+		}
+		if (drsas_check_acc_handle(instance->pci_handle) !=
+		    DDI_SUCCESS) {
+			goto fail_attach;
+		}
+		instance->dr_ld_list =
+		    kmem_zalloc(MRDRV_MAX_LD * sizeof (struct drsas_ld),
+		    KM_SLEEP);
+		break;
+	case DDI_PM_RESUME:
+		con_log(CL_ANN, (CE_NOTE,
+		    "dr_sas: DDI_PM_RESUME"));
+		break;
+	case DDI_RESUME:
+		con_log(CL_ANN, (CE_NOTE,
+		    "dr_sas: DDI_RESUME"));
+		break;
+	default:
+		con_log(CL_ANN, (CE_WARN,
+		    "dr_sas: invalid attach cmd=%x", cmd));
+		return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+
+fail_initiate_aen:
+fail_attach:
+	if (create_devctl_node_f) {
+		ddi_remove_minor_node(dip, "devctl");
+	}
+
+	if (create_scsi_node_f) {
+		ddi_remove_minor_node(dip, "scsi");
+	}
+
+	if (create_ioc_node_f) {
+		ddi_remove_minor_node(dip, instance->iocnode);
+	}
+
+	if (tran_alloc_f) {
+		scsi_hba_tran_free(tran);
+	}
+
+	if (added_soft_isr_f) {
+		ddi_remove_softintr(instance->soft_intr_id);
+	}
+
+	if (added_isr_f) {
+		drsas_rem_intrs(instance);
+	}
+
+	if (instance && instance->taskq) {
+		ddi_taskq_destroy(instance->taskq);
+	}
+
+	drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+	ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+
+	drsas_fm_fini(instance);
+
+	pci_config_teardown(&instance->pci_handle);
+
+	ddi_soft_state_free(drsas_state, instance_no);
+
+	con_log(CL_ANN, (CE_NOTE,
+	    "dr_sas: return failure from drsas_attach"));
+
+	return (DDI_FAILURE);
+}
+
+/*ARGSUSED*/
+static int
+drsas_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resultp)
+{
+	int	rval;
+	int	drsas_minor = getminor((dev_t)arg);
+
+	struct drsas_instance	*instance;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	switch (cmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		instance = (struct drsas_instance *)
+		    ddi_get_soft_state(drsas_state,
+		    MINOR2INST(drsas_minor));
+
+		if (instance == NULL) {
+			*resultp = NULL;
+			rval = DDI_FAILURE;
+		} else {
+			*resultp = instance->dip;
+			rval = DDI_SUCCESS;
+		}
+		break;
+	case DDI_INFO_DEVT2INSTANCE:
+		*resultp = (void *)instance;
+		rval = DDI_SUCCESS;
+		break;
+	default:
+		*resultp = NULL;
+		rval = DDI_FAILURE;
+	}
+
+	return (rval);
+}
+
+static int
+drsas_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	int	instance_no;
+
+	struct drsas_instance	*instance;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* CONSTCOND */
+	ASSERT(NO_COMPETING_THREADS);
+
+	instance_no = ddi_get_instance(dip);
+
+	instance = (struct drsas_instance *)ddi_get_soft_state(drsas_state,
+	    instance_no);
+
+	if (!instance) {
+		con_log(CL_ANN, (CE_WARN,
+		    "dr_sas:%d could not get instance in detach",
+		    instance_no));
+
+		return (DDI_FAILURE);
+	}
+
+	con_log(CL_ANN, (CE_NOTE,
+	    "dr_sas%d: detaching device 0x%4x:0x%4x:0x%4x:0x%4x",
+	    instance_no, instance->vendor_id, instance->device_id,
+	    instance->subsysvid, instance->subsysid));
+
+	switch (cmd) {
+	case DDI_DETACH:
+		con_log(CL_ANN, (CE_NOTE,
+		    "drsas_detach: DDI_DETACH"));
+
+		if (scsi_hba_detach(dip) != DDI_SUCCESS) {
+			con_log(CL_ANN, (CE_WARN,
+			    "dr_sas:%d failed to detach",
+			    instance_no));
+
+			return (DDI_FAILURE);
+		}
+
+		scsi_hba_tran_free(instance->tran);
+
+		flush_cache(instance);
+
+		if (abort_aen_cmd(instance, instance->aen_cmd)) {
+			con_log(CL_ANN, (CE_WARN, "drsas_detach: "
+			    "failed to abort previous AEN command"));
+
+			return (DDI_FAILURE);
+		}
+
+		instance->func_ptr->disable_intr(instance);
+
+		if (instance->isr_level == HIGH_LEVEL_INTR) {
ddi_remove_softintr(instance->soft_intr_id); + } + + drsas_rem_intrs(instance); + + if (instance->taskq) { + ddi_taskq_destroy(instance->taskq); + } + kmem_free(instance->dr_ld_list, MRDRV_MAX_LD + * sizeof (struct drsas_ld)); + free_space_for_mfi(instance); + + drsas_fm_fini(instance); + + pci_config_teardown(&instance->pci_handle); + + kmem_free(instance->func_ptr, + sizeof (struct drsas_func_ptr)); + + ddi_soft_state_free(drsas_state, instance_no); + break; + case DDI_PM_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_PM_SUSPEND")); + + break; + case DDI_SUSPEND: + con_log(CL_ANN, (CE_NOTE, + "drsas_detach: DDI_SUSPEND")); + + break; + default: + con_log(CL_ANN, (CE_WARN, + "invalid detach command:0x%x", cmd)); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * ************************************************************************** * + * * + * common entry points - for character driver types * + * * + * ************************************************************************** * + */ +static int +drsas_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* Check root permissions */ + if (drv_priv(credp) != 0) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: Non-root ioctl access denied!")); + return (EPERM); + } + + /* Verify we are being opened as a character device */ + if (otyp != OTYP_CHR) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: ioctl node must be a char node")); + return (EINVAL); + } + + if (ddi_get_soft_state(drsas_state, MINOR2INST(getminor(*dev))) + == NULL) { + return (ENXIO); + } + + if (scsi_hba_open) { + rval = scsi_hba_open(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_close(dev_t dev, int openflags, int otyp, cred_t *credp) +{ + int rval = 0; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* no need for locks! 
*/ + + if (scsi_hba_close) { + rval = scsi_hba_close(dev, openflags, otyp, credp); + } + + return (rval); +} + +static int +drsas_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int rval = 0; + + struct drsas_instance *instance; + struct drsas_ioctl *ioctl; + struct drsas_aen aen; + int i; + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ddi_get_soft_state(drsas_state, MINOR2INST(getminor(dev))); + + if (instance == NULL) { + /* invalid minor number */ + con_log(CL_ANN, (CE_WARN, "dr_sas: adapter not found.")); + return (ENXIO); + } + + ioctl = (struct drsas_ioctl *)kmem_zalloc(sizeof (struct drsas_ioctl), + KM_SLEEP); + ASSERT(ioctl); + + switch ((uint_t)cmd) { + case DRSAS_IOCTL_FIRMWARE: + for (i = 0; i < sizeof (struct drsas_ioctl); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)ioctl+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "drsas_ioctl " + "ERROR IOCTL copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + if (ioctl->control_code == DRSAS_DRIVER_IOCTL_COMMON) { + rval = handle_drv_ioctl(instance, ioctl, mode); + } else { + rval = handle_mfi_ioctl(instance, ioctl, mode); + } + for (i = 0; i < sizeof (struct drsas_ioctl) - 1; i++) { + if (ddi_copyout((uint8_t *)ioctl+i, + (uint8_t *)arg+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: ddi_copyout " + "failed")); + rval = 1; + break; + } + } + + break; + case DRSAS_IOCTL_AEN: + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyin((uint8_t *)arg+i, + (uint8_t *)&aen+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ERROR AEN copyin")); + kmem_free(ioctl, + sizeof (struct drsas_ioctl)); + return (EFAULT); + } + } + + rval = handle_mfi_aen(instance, &aen); + for (i = 0; i < sizeof (struct drsas_aen); i++) { + if (ddi_copyout((uint8_t *)&aen + i, + (uint8_t *)arg + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "drsas_ioctl: " + "ddi_copyout failed")); + rval = 1; + break; + } + } + + break; + default: + rval = scsi_hba_ioctl(dev, cmd, arg, + mode, credp, rvalp); + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_ioctl: " + "scsi_hba_ioctl called, ret = %x.", rval)); + } + + kmem_free(ioctl, sizeof (struct drsas_ioctl)); + return (rval); +} + +/* + * ************************************************************************** * + * * + * common entry points - for block driver types * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_reset(dev_info_t *dip, ddi_reset_cmd_t cmd) +{ + int instance_no; + + struct drsas_instance *instance; + + instance_no = ddi_get_instance(dip); + instance = (struct drsas_instance *)ddi_get_soft_state + (drsas_state, instance_no); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (!instance) { + con_log(CL_ANN, (CE_WARN, "dr_sas:%d could not get adapter " + "in reset", instance_no)); + return (DDI_FAILURE); + } + + instance->func_ptr->disable_intr(instance); + + con_log(CL_ANN1, (CE_NOTE, "flushing cache for instance %d", + instance_no)); + + flush_cache(instance); + + return (DDI_SUCCESS); +} + + +/* + * ************************************************************************** * + * * + * entry points (SCSI HBA) * + * * + * ************************************************************************** * + */ +/*ARGSUSED*/ +static int +drsas_tran_tgt_init(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *tran, struct scsi_device *sd) +{ + struct drsas_instance 
*instance; + uint16_t tgt = sd->sd_address.a_target; + uint8_t lun = sd->sd_address.a_lun; + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init target %d lun %d", + tgt, lun)); + + instance = ADDR2MR(&sd->sd_address); + + if (ndi_dev_is_persistent_node(tgt_dip) == 0) { + (void) ndi_merge_node(tgt_dip, drsas_name_node); + ddi_set_name_addr(tgt_dip, NULL); + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init in " + "ndi_dev_is_persistent_node DDI_FAILURE t = %d l = %d", + tgt, lun)); + return (DDI_FAILURE); + } + + con_log(CL_ANN1, (CE_NOTE, "drsas_tgt_init dev_dip %p tgt_dip %p", + (void *)instance->dr_ld_list[tgt].dip, (void *)tgt_dip)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == NULL && + strcmp(ddi_driver_name(sd->sd_dev), "sd") == 0) { + instance->dr_ld_list[tgt].dip = tgt_dip; + instance->dr_ld_list[tgt].lun_type = DRSAS_LD_LUN; + } + } + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static void +drsas_tran_tgt_free(dev_info_t *hba_dip, dev_info_t *tgt_dip, + scsi_hba_tran_t *hba_tran, struct scsi_device *sd) +{ + struct drsas_instance *instance; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + + instance = ADDR2MR(&sd->sd_address); + + con_log(CL_ANN1, (CE_NOTE, "tgt_free t = %d l = %d", tgt, lun)); + + if (tgt < MRDRV_MAX_LD && lun == 0) { + if (instance->dr_ld_list[tgt].dip == tgt_dip) { + instance->dr_ld_list[tgt].dip = NULL; + } + } +} + +static dev_info_t * +drsas_find_child(struct drsas_instance *instance, uint16_t tgt, uint8_t lun) +{ + dev_info_t *child = NULL; + char addr[SCSI_MAXNAMELEN]; + char tmp[MAXNAMELEN]; + + (void) sprintf(addr, "%x,%x", tgt, lun); + for (child = ddi_get_child(instance->dip); child; + child = ddi_get_next_sibling(child)) { + + if (drsas_name_node(child, tmp, MAXNAMELEN) != + DDI_SUCCESS) { + continue; + } + + if (strcmp(addr, tmp) == 0) { + break; + } + } + con_log(CL_ANN1, (CE_NOTE, "drsas_find_child: return child = %p", + (void *)child)); + return (child); +} + +static int +drsas_name_node(dev_info_t *dip, char *name, int len) +{ + int tgt, lun; + + tgt = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "target", -1); + con_log(CL_ANN1, (CE_NOTE, + "drsas_name_node: dip %p tgt %d", (void *)dip, tgt)); + if (tgt == -1) { + return (DDI_FAILURE); + } + lun = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "lun", -1); + con_log(CL_ANN1, + (CE_NOTE, "drsas_name_node: tgt %d lun %d", tgt, lun)); + if (lun == -1) { + return (DDI_FAILURE); + } + (void) snprintf(name, len, "%x,%x", tgt, lun); + return (DDI_SUCCESS); +} + +static struct scsi_pkt * +drsas_tran_init_pkt(struct scsi_address *ap, register struct scsi_pkt *pkt, + struct buf *bp, int cmdlen, int statuslen, int tgtlen, + int flags, int (*callback)(), caddr_t arg) +{ + struct scsa_cmd *acmd; + struct drsas_instance *instance; + struct scsi_pkt *new_pkt; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + instance = ADDR2MR(ap); + + /* step #1 : pkt allocation */ + if (pkt == NULL) { + pkt = scsi_hba_pkt_alloc(instance->dip, ap, cmdlen, statuslen, + tgtlen, sizeof (struct scsa_cmd), callback, arg); + if (pkt == NULL) { + return (NULL); + } + + acmd = PKT2CMD(pkt); + + /* + * Initialize the new pkt - we redundantly initialize + * all the fields for illustrative purposes. 
+	 */
+	acmd->cmd_pkt		= pkt;
+	acmd->cmd_flags		= 0;
+	acmd->cmd_scblen	= statuslen;
+	acmd->cmd_cdblen	= cmdlen;
+	acmd->cmd_dmahandle	= NULL;
+	acmd->cmd_ncookies	= 0;
+	acmd->cmd_cookie	= 0;
+	acmd->cmd_cookiecnt	= 0;
+	acmd->cmd_nwin		= 0;
+
+	pkt->pkt_address	= *ap;
+	pkt->pkt_comp		= (void (*)())NULL;
+	pkt->pkt_flags		= 0;
+	pkt->pkt_time		= 0;
+	pkt->pkt_resid		= 0;
+	pkt->pkt_state		= 0;
+	pkt->pkt_statistics	= 0;
+	pkt->pkt_reason		= 0;
+	new_pkt			= pkt;
+	} else {
+		acmd = PKT2CMD(pkt);
+		new_pkt = NULL;
+	}
+
+	/* step #2 : dma allocation/move */
+	if (bp && bp->b_bcount != 0) {
+		if (acmd->cmd_dmahandle == NULL) {
+			if (drsas_dma_alloc(instance, pkt, bp, flags,
+			    callback) == DDI_FAILURE) {
+				if (new_pkt) {
+					scsi_hba_pkt_free(ap, new_pkt);
+				}
+				return ((struct scsi_pkt *)NULL);
+			}
+		} else {
+			if (drsas_dma_move(instance, pkt, bp) == DDI_FAILURE) {
+				return ((struct scsi_pkt *)NULL);
+			}
+		}
+	}
+
+	return (pkt);
+}
+
+static int
+drsas_tran_start(struct scsi_address *ap, register struct scsi_pkt *pkt)
+{
+	uchar_t		cmd_done = 0;
+
+	struct drsas_instance	*instance = ADDR2MR(ap);
+	struct drsas_cmd	*cmd;
+
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d:SCSI CDB[0]=0x%x",
+	    __func__, __LINE__, pkt->pkt_cdbp[0]));
+
+	pkt->pkt_reason	= CMD_CMPLT;
+	*pkt->pkt_scbp = STATUS_GOOD; /* clear arq scsi_status */
+
+	cmd = build_cmd(instance, ap, pkt, &cmd_done);
+
+	/*
+	 * Check whether the command was already completed inside
+	 * build_cmd(); in that case cmd is returned as NULL, *cmd_done is
+	 * set, and the appropriate reason is provided in the pkt_reason
+	 * field.
+	 */
+	if (cmd_done) {
+		pkt->pkt_reason = CMD_CMPLT;
+		pkt->pkt_scbp[0] = STATUS_GOOD;
+		pkt->pkt_state |= STATE_GOT_BUS | STATE_GOT_TARGET
+		    | STATE_SENT_CMD;
+		if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && pkt->pkt_comp) {
+			(*pkt->pkt_comp)(pkt);
+		}
+
+		return (TRAN_ACCEPT);
+	}
+
+	if (cmd == NULL) {
+		return (TRAN_BUSY);
+	}
+
+	if ((pkt->pkt_flags & FLAG_NOINTR) == 0) {
+		if (instance->fw_outstanding > instance->max_fw_cmds) {
+			con_log(CL_ANN, (CE_CONT, "dr_sas:Firmware busy"));
+			return_mfi_pkt(instance, cmd);
+			return (TRAN_BUSY);
+		}
+
+		/* Synchronize the Cmd frame for the controller */
+		(void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle, 0, 0,
+		    DDI_DMA_SYNC_FORDEV);
+
+		instance->func_ptr->issue_cmd(cmd, instance);
+
+	} else {
+		struct drsas_header *hdr = &cmd->frame->hdr;
+
+		cmd->sync_cmd = DRSAS_TRUE;
+
+		instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd);
+
+		pkt->pkt_reason		= CMD_CMPLT;
+		pkt->pkt_statistics	= 0;
+		pkt->pkt_state |= STATE_XFERRED_DATA | STATE_GOT_STATUS;
+
+		switch (ddi_get8(cmd->frame_dma_obj.acc_handle,
+		    &hdr->cmd_status)) {
+		case MFI_STAT_OK:
+			pkt->pkt_scbp[0] = STATUS_GOOD;
+			break;
+
+		case MFI_STAT_SCSI_DONE_WITH_ERROR:
+
+			pkt->pkt_reason		= CMD_CMPLT;
+			pkt->pkt_statistics	= 0;
+
+			((struct scsi_status *)pkt->pkt_scbp)->sts_chk = 1;
+			break;
+
+		case MFI_STAT_DEVICE_NOT_FOUND:
+			pkt->pkt_reason		= CMD_DEV_GONE;
+			pkt->pkt_statistics	= STAT_DISCON;
+			break;
+
+		default:
+			((struct scsi_status *)pkt->pkt_scbp)->sts_busy = 1;
+		}
+
+		return_mfi_pkt(instance, cmd);
+		(void) drsas_common_check(instance, cmd);
+
+		if (pkt->pkt_comp) {
+			(*pkt->pkt_comp)(pkt);
+		}
+
+	}
+
+	return (TRAN_ACCEPT);
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_abort(struct scsi_address *ap, struct scsi_pkt *pkt)
+{
+	con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__));
+
+	/* abort command not supported by H/W */
+
+	return (DDI_FAILURE);
+}
+
+/*ARGSUSED*/
+static int
+drsas_tran_reset(struct
scsi_address *ap, int level) +{ + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* reset command not supported by H/W */ + + return (DDI_FAILURE); + +} + +/*ARGSUSED*/ +static int +drsas_tran_getcap(struct scsi_address *ap, char *cap, int whom) +{ + int rval = 0; + + struct drsas_instance *instance = ADDR2MR(ap); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* we do allow inquiring about capabilities for other targets */ + if (cap == NULL) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + /* Limit to 16MB max transfer */ + rval = drsas_max_cap_maxxfer; + break; + case SCSI_CAP_MSG_OUT: + rval = 1; + break; + case SCSI_CAP_DISCONNECT: + rval = 0; + break; + case SCSI_CAP_SYNCHRONOUS: + rval = 0; + break; + case SCSI_CAP_WIDE_XFER: + rval = 1; + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_UNTAGGED_QING: + rval = 1; + break; + case SCSI_CAP_PARITY: + rval = 1; + break; + case SCSI_CAP_INITIATOR_ID: + rval = instance->init_id; + break; + case SCSI_CAP_ARQ: + rval = 1; + break; + case SCSI_CAP_LINKED_CMDS: + rval = 0; + break; + case SCSI_CAP_RESET_NOTIFICATION: + rval = 1; + break; + case SCSI_CAP_GEOMETRY: + rval = -1; + + break; + default: + con_log(CL_DLEVEL2, (CE_NOTE, "Default cap coming 0x%x", + scsi_hba_lookup_capstr(cap))); + rval = -1; + break; + } + + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_tran_setcap(struct scsi_address *ap, char *cap, int value, int whom) +{ + int rval = 1; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + /* We don't allow setting capabilities for other targets */ + if (cap == NULL || whom == 0) { + return (-1); + } + + switch (scsi_hba_lookup_capstr(cap)) { + case SCSI_CAP_DMA_MAX: + case SCSI_CAP_MSG_OUT: + case SCSI_CAP_PARITY: + case SCSI_CAP_LINKED_CMDS: + case SCSI_CAP_RESET_NOTIFICATION: + case SCSI_CAP_DISCONNECT: + case SCSI_CAP_SYNCHRONOUS: + case SCSI_CAP_UNTAGGED_QING: + case SCSI_CAP_WIDE_XFER: + case SCSI_CAP_INITIATOR_ID: + case SCSI_CAP_ARQ: + /* + * None of these are settable via + * the capability interface. 
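+		 * (For reference, the usual tran_setcap(9E) convention is
+		 * to return 1 when a capability is set as requested, 0 when
+		 * it is recognised but not settable, and -1 when it is
+		 * undefined.  The cases grouped here break with the initial
+		 * rval of 1, so such requests are accepted but have no
+		 * effect.)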
+ */ + break; + case SCSI_CAP_TAGGED_QING: + rval = 1; + break; + case SCSI_CAP_SECTOR_SIZE: + rval = 1; + break; + + case SCSI_CAP_TOTAL_SECTORS: + rval = 1; + break; + default: + rval = -1; + break; + } + + return (rval); +} + +static void +drsas_tran_destroy_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } + + /* free the pkt */ + scsi_hba_pkt_free(ap, pkt); +} + +/*ARGSUSED*/ +static void +drsas_tran_dmafree(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + acmd->cmd_flags &= ~CFLAG_DMAVALID; + + (void) ddi_dma_unbind_handle(acmd->cmd_dmahandle); + + ddi_dma_free_handle(&acmd->cmd_dmahandle); + + acmd->cmd_dmahandle = NULL; + } +} + +/*ARGSUSED*/ +static void +drsas_tran_sync_pkt(struct scsi_address *ap, struct scsi_pkt *pkt) +{ + register struct scsa_cmd *acmd = PKT2CMD(pkt); + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + if (acmd->cmd_flags & CFLAG_DMAVALID) { + (void) ddi_dma_sync(acmd->cmd_dmahandle, acmd->cmd_dma_offset, + acmd->cmd_dma_len, (acmd->cmd_flags & CFLAG_DMASEND) ? + DDI_DMA_SYNC_FORDEV : DDI_DMA_SYNC_FORCPU); + } +} + +/* + * drsas_isr(caddr_t) + * + * The Interrupt Service Routine + * + * Collect status for all completed commands and do callback + * + */ +static uint_t +drsas_isr(struct drsas_instance *instance) +{ + int need_softintr; + uint32_t producer; + uint32_t consumer; + uint32_t context; + + struct drsas_cmd *cmd; + + con_log(CL_ANN1, (CE_NOTE, "chkpnt:%s:%d", __func__, __LINE__)); + + ASSERT(instance); + if ((instance->intr_type == DDI_INTR_TYPE_FIXED) && + !instance->func_ptr->intr_ack(instance)) { + return (DDI_INTR_UNCLAIMED); + } + + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORCPU); + + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (DDI_INTR_UNCLAIMED); + } + + producer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->producer); + consumer = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer); + + con_log(CL_ANN1, (CE_CONT, " producer %x consumer %x ", + producer, consumer)); + if (producer == consumer) { + con_log(CL_ANN1, (CE_WARN, "producer = consumer case")); + return (DDI_INTR_UNCLAIMED); + } + mutex_enter(&instance->completed_pool_mtx); + + while (consumer != producer) { + context = ddi_get32(instance->mfi_internal_dma_obj.acc_handle, + &instance->reply_queue[consumer]); + cmd = instance->cmd_list[context]; + mlist_add_tail(&cmd->list, &instance->completed_pool_list); + + consumer++; + if (consumer == (instance->max_fw_cmds + 1)) { + consumer = 0; + } + } + + mutex_exit(&instance->completed_pool_mtx); + + ddi_put32(instance->mfi_internal_dma_obj.acc_handle, + instance->consumer, consumer); + (void) ddi_dma_sync(instance->mfi_internal_dma_obj.dma_handle, + 0, 0, DDI_DMA_SYNC_FORDEV); + + if (instance->softint_running) { + need_softintr = 0; + } else { + need_softintr = 1; + } + + if 
(instance->isr_level == HIGH_LEVEL_INTR) { + if (need_softintr) { + ddi_trigger_softintr(instance->soft_intr_id); + } + } else { + /* + * Not a high-level interrupt, therefore call the soft level + * interrupt explicitly + */ + (void) drsas_softintr(instance); + } + + return (DDI_INTR_CLAIMED); +} + + +/* + * ************************************************************************** * + * * + * libraries * + * * + * ************************************************************************** * + */ +/* + * get_mfi_pkt : Get a command from the free pool + * After successful allocation, the caller of this routine + * must clear the frame buffer (memset to zero) before + * using the packet further. + * + * ***** Note ***** + * After clearing the frame buffer the context id of the + * frame buffer SHOULD be restored back. + */ +static struct drsas_cmd * +get_mfi_pkt(struct drsas_instance *instance) +{ + mlist_t *head = &instance->cmd_pool_list; + struct drsas_cmd *cmd = NULL; + + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + if (!mlist_empty(head)) { + cmd = mlist_entry(head->next, struct drsas_cmd, list); + mlist_del_init(head->next); + } + if (cmd != NULL) + cmd->pkt = NULL; + mutex_exit(&instance->cmd_pool_mtx); + + return (cmd); +} + +/* + * return_mfi_pkt : Return a cmd to free command pool + */ +static void +return_mfi_pkt(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + mutex_enter(&instance->cmd_pool_mtx); + ASSERT(mutex_owned(&instance->cmd_pool_mtx)); + + mlist_add(&cmd->list, &instance->cmd_pool_list); + + mutex_exit(&instance->cmd_pool_mtx); +} + +/* + * destroy_mfi_frame_pool + */ +static void +destroy_mfi_frame_pool(struct drsas_instance *instance) +{ + int i; + uint32_t max_cmd = instance->max_fw_cmds; + + struct drsas_cmd *cmd; + + /* return all frames to pool */ + for (i = 0; i < max_cmd+1; i++) { + + cmd = instance->cmd_list[i]; + + if (cmd->frame_dma_obj_status == DMA_OBJ_ALLOCATED) + (void) drsas_free_dma_obj(instance, cmd->frame_dma_obj); + + cmd->frame_dma_obj_status = DMA_OBJ_FREED; + } + +} + +/* + * create_mfi_frame_pool + */ +static int +create_mfi_frame_pool(struct drsas_instance *instance) +{ + int i = 0; + int cookie_cnt; + uint16_t max_cmd; + uint16_t sge_sz; + uint32_t sgl_sz; + uint32_t tot_frame_size; + + struct drsas_cmd *cmd; + + max_cmd = instance->max_fw_cmds; + + sge_sz = sizeof (struct drsas_sge64); + + /* calculated the number of 64byte frames required for SGL */ + sgl_sz = sge_sz * instance->max_num_sge; + tot_frame_size = sgl_sz + MRMFI_FRAME_SIZE + SENSE_LENGTH; + + con_log(CL_DLEVEL3, (CE_NOTE, "create_mfi_frame_pool: " + "sgl_sz %x tot_frame_size %x", sgl_sz, tot_frame_size)); + + while (i < max_cmd+1) { + cmd = instance->cmd_list[i]; + + cmd->frame_dma_obj.size = tot_frame_size; + cmd->frame_dma_obj.dma_attr = drsas_generic_dma_attr; + cmd->frame_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + cmd->frame_dma_obj.dma_attr.dma_attr_sgllen = 1; + cmd->frame_dma_obj.dma_attr.dma_attr_align = 64; + + + cookie_cnt = drsas_alloc_dma_obj(instance, &cmd->frame_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC); + + if (cookie_cnt == -1 || cookie_cnt > 1) { + con_log(CL_ANN, (CE_WARN, + "create_mfi_frame_pool: could not alloc.")); + return (DDI_FAILURE); + } + + bzero(cmd->frame_dma_obj.buffer, tot_frame_size); + + cmd->frame_dma_obj_status = DMA_OBJ_ALLOCATED; + cmd->frame = (union drsas_frame *)cmd->frame_dma_obj.buffer; + cmd->frame_phys_addr = 
+ cmd->frame_dma_obj.dma_cookie[0].dmac_address; + + cmd->sense = (uint8_t *)(((unsigned long) + cmd->frame_dma_obj.buffer) + + tot_frame_size - SENSE_LENGTH); + cmd->sense_phys_addr = + cmd->frame_dma_obj.dma_cookie[0].dmac_address + + tot_frame_size - SENSE_LENGTH; + + if (!cmd->frame || !cmd->sense) { + con_log(CL_ANN, (CE_NOTE, + "dr_sas: pci_pool_alloc failed")); + + return (ENOMEM); + } + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->io.context, cmd->index); + i++; + + con_log(CL_DLEVEL3, (CE_NOTE, "[%x]-%x", + cmd->index, cmd->frame_phys_addr)); + } + + return (DDI_SUCCESS); +} + +/* + * free_additional_dma_buffer + */ +static void +free_additional_dma_buffer(struct drsas_instance *instance) +{ + if (instance->mfi_internal_dma_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_internal_dma_obj); + instance->mfi_internal_dma_obj.status = DMA_OBJ_FREED; + } + + if (instance->mfi_evt_detail_obj.status == DMA_OBJ_ALLOCATED) { + (void) drsas_free_dma_obj(instance, + instance->mfi_evt_detail_obj); + instance->mfi_evt_detail_obj.status = DMA_OBJ_FREED; + } +} + +/* + * alloc_additional_dma_buffer + */ +static int +alloc_additional_dma_buffer(struct drsas_instance *instance) +{ + uint32_t reply_q_sz; + uint32_t internal_buf_size = PAGESIZE*2; + + /* max cmds plus 1 + producer & consumer */ + reply_q_sz = sizeof (uint32_t) * (instance->max_fw_cmds + 1 + 2); + + instance->mfi_internal_dma_obj.size = internal_buf_size; + instance->mfi_internal_dma_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_count_max = + 0xFFFFFFFFU; + instance->mfi_internal_dma_obj.dma_attr.dma_attr_sgllen = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_internal_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "dr_sas: could not alloc reply queue")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_internal_dma_obj.buffer, internal_buf_size); + + instance->mfi_internal_dma_obj.status |= DMA_OBJ_ALLOCATED; + + instance->producer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer); + instance->consumer = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 4); + instance->reply_queue = (uint32_t *)((unsigned long) + instance->mfi_internal_dma_obj.buffer + 8); + instance->internal_buf = (caddr_t)(((unsigned long) + instance->mfi_internal_dma_obj.buffer) + reply_q_sz + 8); + instance->internal_buf_dmac_add = + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + + (reply_q_sz + 8); + instance->internal_buf_size = internal_buf_size - + (reply_q_sz + 8); + + /* allocate evt_detail */ + instance->mfi_evt_detail_obj.size = sizeof (struct drsas_evt_detail); + instance->mfi_evt_detail_obj.dma_attr = drsas_generic_dma_attr; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_sgllen = 1; + instance->mfi_evt_detail_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &instance->mfi_evt_detail_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "alloc_additional_dma_buffer: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + bzero(instance->mfi_evt_detail_obj.buffer, + sizeof (struct drsas_evt_detail)); + + instance->mfi_evt_detail_obj.status |= DMA_OBJ_ALLOCATED; + + return 
(DDI_SUCCESS);
+}
+
+/*
+ * free_space_for_mfi
+ */
+static void
+free_space_for_mfi(struct drsas_instance *instance)
+{
+	int		i;
+	uint32_t	max_cmd = instance->max_fw_cmds;
+
+	/* already freed */
+	if (instance->cmd_list == NULL) {
+		return;
+	}
+
+	free_additional_dma_buffer(instance);
+
+	/* first free the MFI frame pool */
+	destroy_mfi_frame_pool(instance);
+
+	/* free all the commands in the cmd_list */
+	for (i = 0; i < instance->max_fw_cmds+1; i++) {
+		kmem_free(instance->cmd_list[i],
+		    sizeof (struct drsas_cmd));
+
+		instance->cmd_list[i] = NULL;
+	}
+
+	/* free the cmd_list buffer itself */
+	kmem_free(instance->cmd_list,
+	    sizeof (struct drsas_cmd *) * (max_cmd+1));
+
+	instance->cmd_list = NULL;
+
+	INIT_LIST_HEAD(&instance->cmd_pool_list);
+}
+
+/*
+ * alloc_space_for_mfi
+ */
+static int
+alloc_space_for_mfi(struct drsas_instance *instance)
+{
+	int		i;
+	uint32_t	max_cmd;
+	size_t		sz;
+
+	struct drsas_cmd	*cmd;
+
+	max_cmd = instance->max_fw_cmds;
+
+	/* reserve 1 more slot for flush_cache */
+	sz = sizeof (struct drsas_cmd *) * (max_cmd+1);
+
+	/*
+	 * instance->cmd_list is an array of struct drsas_cmd pointers.
+	 * Allocate the dynamic array first and then allocate individual
+	 * commands.
+	 */
+	instance->cmd_list = kmem_zalloc(sz, KM_SLEEP);
+	ASSERT(instance->cmd_list);
+
+	for (i = 0; i < max_cmd+1; i++) {
+		instance->cmd_list[i] = kmem_zalloc(sizeof (struct drsas_cmd),
+		    KM_SLEEP);
+		ASSERT(instance->cmd_list[i]);
+	}
+
+	INIT_LIST_HEAD(&instance->cmd_pool_list);
+
+	/* add all the commands to command pool (instance->cmd_pool) */
+	for (i = 0; i < max_cmd; i++) {
+		cmd		= instance->cmd_list[i];
+		cmd->index	= i;
+
+		mlist_add_tail(&cmd->list, &instance->cmd_pool_list);
+	}
+
+	/* the extra slot reserved for flush_cache is not added to the pool */
+	cmd = instance->cmd_list[max_cmd];
+	cmd->index = i;
+
+	/* create a frame pool and assign one frame to each cmd */
+	if (create_mfi_frame_pool(instance)) {
+		con_log(CL_ANN, (CE_NOTE, "error creating frame DMA pool"));
+		return (DDI_FAILURE);
+	}
+
+	/* allocate the additional DMA buffers (reply queue and evt_detail) */
+	if (alloc_additional_dma_buffer(instance)) {
+		con_log(CL_ANN, (CE_NOTE,
+		    "error allocating additional DMA buffer"));
+		return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * get_ctrl_info
+ */
+static int
+get_ctrl_info(struct drsas_instance *instance,
+    struct drsas_ctrl_info *ctrl_info)
+{
+	int	ret = 0;
+
+	struct drsas_cmd	*cmd;
+	struct drsas_dcmd_frame	*dcmd;
+	struct drsas_ctrl_info	*ci;
+
+	cmd = get_mfi_pkt(instance);
+
+	if (!cmd) {
+		con_log(CL_ANN, (CE_WARN,
+		    "Failed to get a cmd for ctrl info"));
+		return (DDI_FAILURE);
+	}
+	/* Clear the frame buffer and assign back the context id */
+	(void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+	ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context,
+	    cmd->index);
+
+	dcmd = &cmd->frame->dcmd;
+
+	ci = (struct drsas_ctrl_info *)instance->internal_buf;
+
+	if (!ci) {
+		con_log(CL_ANN, (CE_WARN,
+		    "Failed to alloc mem for ctrl info"));
+		return_mfi_pkt(instance, cmd);
+		return (DDI_FAILURE);
+	}
+
+	(void) memset(ci, 0, sizeof (struct drsas_ctrl_info));
+
+	/* for( i = 0; i < DCMD_MBOX_SZ; i++ ) dcmd->mbox.b[i] = 0; */
+	(void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ);
+
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD);
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status,
+	    MFI_CMD_STATUS_POLL_MODE);
+	ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1);
+	ddi_put16(cmd->frame_dma_obj.acc_handle,
&dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_ctrl_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->internal_buf_dmac_add); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_ctrl_info)); + + cmd->frame_count = 1; + + if (!instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + ret = 0; + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, + (uint8_t *)ctrl_info, (uint8_t *)ci, + sizeof (struct drsas_ctrl_info), DDI_DEV_AUTOINCR); + } else { + con_log(CL_ANN, (CE_WARN, "get_ctrl_info: Ctrl info failed")); + ret = -1; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = -1; + } + + return (ret); +} + +/* + * abort_aen_cmd + */ +static int +abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort) +{ + int ret = 0; + + struct drsas_cmd *cmd; + struct drsas_abort_frame *abort_fr; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, + "Failed to get a cmd for ctrl info")); + return (DDI_FAILURE); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + abort_fr = &cmd->frame->abort; + + /* prepare and issue the abort frame */ + ddi_put8(cmd->frame_dma_obj.acc_handle, + &abort_fr->cmd, MFI_CMD_OP_ABORT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &abort_fr->cmd_status, + MFI_CMD_STATUS_SYNC_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &abort_fr->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &abort_fr->abort_context, + cmd_to_abort->index); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_lo, cmd_to_abort->frame_phys_addr); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &abort_fr->abort_mfi_phys_addr_hi, 0); + + instance->aen_cmd->abort_aen = 1; + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "abort_aen_cmd: issue_cmd_in_sync_mode failed")); + ret = -1; + } else { + ret = 0; + } + + instance->aen_cmd->abort_aen = 1; + instance->aen_cmd = 0; + + return_mfi_pkt(instance, cmd); + (void) drsas_common_check(instance, cmd); + + return (ret); +} + +/* + * init_mfi + */ +static int +init_mfi(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd; + struct drsas_ctrl_info ctrl_info; + struct drsas_init_frame *init_frame; + struct drsas_init_queue_info *initq_info; + + /* we expect the FW state to be READY */ + if (mfi_state_transition_to_ready(instance)) { + con_log(CL_ANN, (CE_WARN, "dr_sas: F/W is not ready")); + goto fail_ready_state; + } + + /* get various operational parameters from status register */ + instance->max_num_sge = + (instance->func_ptr->read_fw_status_reg(instance) & + 0xFF0000) >> 0x10; + /* + * Reduce the max supported cmds by 1. 
This is to ensure that the + * reply_q_sz (1 more than the max cmd that driver may send) + * does not exceed max cmds that the FW can support + */ + instance->max_fw_cmds = + instance->func_ptr->read_fw_status_reg(instance) & 0xFFFF; + instance->max_fw_cmds = instance->max_fw_cmds - 1; + + instance->max_num_sge = + (instance->max_num_sge > DRSAS_MAX_SGE_CNT) ? + DRSAS_MAX_SGE_CNT : instance->max_num_sge; + + /* create a pool of commands */ + if (alloc_space_for_mfi(instance) != DDI_SUCCESS) + goto fail_alloc_fw_space; + + /* + * Prepare a init frame. Note the init frame points to queue info + * structure. Each frame has SGL allocated after first 64 bytes. For + * this frame - since we don't need any SGL - we use SGL's space as + * queue info structure + */ + cmd = get_mfi_pkt(instance); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + init_frame = (struct drsas_init_frame *)cmd->frame; + initq_info = (struct drsas_init_queue_info *) + ((unsigned long)init_frame + 64); + + (void) memset(init_frame, 0, MRMFI_FRAME_SIZE); + (void) memset(initq_info, 0, sizeof (struct drsas_init_queue_info)); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &initq_info->init_flags, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_entries, instance->max_fw_cmds + 1); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->producer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->consumer_index_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 4); + + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_hi, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &initq_info->reply_queue_start_phys_addr_lo, + instance->mfi_internal_dma_obj.dma_cookie[0].dmac_address + 8); + + ddi_put8(cmd->frame_dma_obj.acc_handle, + &init_frame->cmd, MFI_CMD_OP_INIT); + ddi_put8(cmd->frame_dma_obj.acc_handle, &init_frame->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &init_frame->flags, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_lo, + cmd->frame_phys_addr + 64); + ddi_put32(cmd->frame_dma_obj.acc_handle, + &init_frame->queue_info_new_phys_addr_hi, 0); + + ddi_put32(cmd->frame_dma_obj.acc_handle, &init_frame->data_xfer_len, + sizeof (struct drsas_init_queue_info)); + + cmd->frame_count = 1; + + /* issue the init frame in polled mode */ + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "failed to init firmware")); + goto fail_fw_init; + } + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + goto fail_fw_init; + } + + /* gather misc FW related information */ + if (!get_ctrl_info(instance, &ctrl_info)) { + instance->max_sectors_per_req = ctrl_info.max_request_size; + con_log(CL_ANN1, (CE_NOTE, "product name %s ld present %d", + ctrl_info.product_name, ctrl_info.ld_present_count)); + } else { + instance->max_sectors_per_req = instance->max_num_sge * + PAGESIZE / 512; + } + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + goto 
fail_fw_init; + } + + return (DDI_SUCCESS); + +fail_fw_init: +fail_alloc_fw_space: + + free_space_for_mfi(instance); + +fail_ready_state: + ddi_regs_map_free(&instance->regmap_handle); + +fail_mfi_reg_setup: + return (DDI_FAILURE); +} + +/* + * mfi_state_transition_to_ready : Move the FW to READY state + * + * @reg_set : MFI register set + */ +static int +mfi_state_transition_to_ready(struct drsas_instance *instance) +{ + int i; + uint8_t max_wait; + uint32_t fw_ctrl; + uint32_t fw_state; + uint32_t cur_state; + + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & MFI_STATE_MASK; + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW state = 0x%x", fw_state)); + + while (fw_state != MFI_STATE_READY) { + con_log(CL_ANN, (CE_NOTE, + "mfi_state_transition_to_ready:FW state%x", fw_state)); + + switch (fw_state) { + case MFI_STATE_FAULT: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW in FAULT state!!")); + + return (ENODEV); + case MFI_STATE_WAIT_HANDSHAKE: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW waiting for HANDSHAKE")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + /* WR_IB_MSG_0(MFI_INIT_CLEAR_HANDSHAKE, instance); */ + WR_IB_DOORBELL(MFI_INIT_CLEAR_HANDSHAKE | + MFI_INIT_HOTPLUG, instance); + + max_wait = 2; + cur_state = MFI_STATE_WAIT_HANDSHAKE; + break; + case MFI_STATE_BOOT_MESSAGE_PENDING: + /* set the CLR bit in IMR0 */ + con_log(CL_ANN, (CE_NOTE, + "dr_sas: FW state boot message pending")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_CLEAR_HANDSHAKE|MFI_INIT_HOTPLUG) + * to be set + */ + WR_IB_DOORBELL(MFI_INIT_HOTPLUG, instance); + + max_wait = 10; + cur_state = MFI_STATE_BOOT_MESSAGE_PENDING; + break; + case MFI_STATE_OPERATIONAL: + /* bring it to READY state; assuming max wait 2 secs */ + instance->func_ptr->disable_intr(instance); + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: FW in OPERATIONAL state")); + /* + * PCI_Hot Plug: MFI F/W requires + * (MFI_INIT_READY | MFI_INIT_MFIMODE | MFI_INIT_ABORT) + * to be set + */ + /* WR_IB_DOORBELL(MFI_INIT_READY, instance); */ + WR_IB_DOORBELL(MFI_RESET_FLAGS, instance); + + max_wait = 10; + cur_state = MFI_STATE_OPERATIONAL; + break; + case MFI_STATE_UNDEFINED: + /* this state should not last for more than 2 seconds */ + con_log(CL_ANN, (CE_NOTE, "FW state undefined")); + + max_wait = 2; + cur_state = MFI_STATE_UNDEFINED; + break; + case MFI_STATE_BB_INIT: + max_wait = 2; + cur_state = MFI_STATE_BB_INIT; + break; + case MFI_STATE_FW_INIT: + max_wait = 2; + cur_state = MFI_STATE_FW_INIT; + break; + case MFI_STATE_DEVICE_SCAN: + max_wait = 10; + cur_state = MFI_STATE_DEVICE_SCAN; + break; + default: + con_log(CL_ANN, (CE_NOTE, + "dr_sas: Unknown state 0x%x", fw_state)); + return (ENODEV); + } + + /* the cur_state should not last for more than max_wait secs */ + for (i = 0; i < (max_wait * MILLISEC); i++) { + /* fw_state = RD_OB_MSG_0(instance) & MFI_STATE_MASK; */ + fw_state = + instance->func_ptr->read_fw_status_reg(instance) & + MFI_STATE_MASK; + + if (fw_state == cur_state) { + delay(1 * drv_usectohz(MILLISEC)); + } else { + break; + } + } + + /* return error if fw_state hasn't changed after max_wait */ + if (fw_state == cur_state) { + con_log(CL_ANN, (CE_NOTE, + "FW state hasn't changed in %d secs", max_wait)); + return (ENODEV); + } + }; + + fw_ctrl = RD_IB_DOORBELL(instance); + + con_log(CL_ANN1, (CE_NOTE, + "mfi_state_transition_to_ready:FW ctrl = 0x%x", fw_ctrl)); + + /* + * Write 0xF to the doorbell register to do 
the following. + * - Abort all outstanding commands (bit 0). + * - Transition from OPERATIONAL to READY state (bit 1). + * - Discard (possible) low MFA posted in 64-bit mode (bit-2). + * - Set to release FW to continue running (i.e. BIOS handshake + * (bit 3). + */ + WR_IB_DOORBELL(0xF, instance); + + if (drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + return (ENODEV); + } + return (DDI_SUCCESS); +} + +/* + * get_seq_num + */ +static int +get_seq_num(struct drsas_instance *instance, + struct drsas_evt_log_info *eli) +{ + int ret = DDI_SUCCESS; + + dma_obj_t dcmd_dma_obj; + struct drsas_cmd *cmd; + struct drsas_dcmd_frame *dcmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + cmn_err(CE_WARN, "dr_sas: failed to get a cmd"); + return (ENOMEM); + } + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = sizeof (struct drsas_evt_log_info); + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, + "get_seq_num: could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + (void) memset(dcmd_dma_obj.buffer, 0, + sizeof (struct drsas_evt_log_info)); + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_GET_INFO); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_log_info)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + dcmd_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + cmn_err(CE_WARN, "get_seq_num: " + "failed to issue DRSAS_DCMD_CTRL_EVENT_GET_INFO"); + ret = DDI_FAILURE; + } else { + /* copy the data back into callers buffer */ + ddi_rep_get8(cmd->frame_dma_obj.acc_handle, (uint8_t *)eli, + (uint8_t *)dcmd_dma_obj.buffer, + sizeof (struct drsas_evt_log_info), DDI_DEV_AUTOINCR); + ret = DDI_SUCCESS; + } + + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + ret = DDI_FAILURE; + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) { + ret = DDI_FAILURE; + } + return (ret); +} + +/* + * start_mfi_aen + */ +static int +start_mfi_aen(struct drsas_instance *instance) +{ + int ret = 0; + + struct drsas_evt_log_info eli; + union drsas_evt_class_locale class_locale; + + /* get the latest sequence number from FW */ + (void) memset(&eli, 0, sizeof (struct drsas_evt_log_info)); + + if (get_seq_num(instance, &eli)) { + cmn_err(CE_WARN, 
"start_mfi_aen: failed to get seq num"); + return (-1); + } + + /* register AEN with FW for latest sequence number plus 1 */ + class_locale.members.reserved = 0; + class_locale.members.locale = DR_EVT_LOCALE_ALL; + class_locale.members.class = DR_EVT_CLASS_INFO; + ret = register_mfi_aen(instance, eli.newest_seq_num + 1, + class_locale.word); + + if (ret) { + cmn_err(CE_WARN, "start_mfi_aen: aen registration failed"); + return (-1); + } + + return (ret); +} + +/* + * flush_cache + */ +static void +flush_cache(struct drsas_instance *instance) +{ + struct drsas_cmd *cmd = NULL; + struct drsas_dcmd_frame *dcmd; + uint32_t max_cmd = instance->max_fw_cmds; + + cmd = instance->cmd_list[max_cmd]; + + if (cmd == NULL) + return; + + dcmd = &cmd->frame->dcmd; + + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 0); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_NONE); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_CACHE_FLUSH); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.b[0], + DR_FLUSH_CTRL_CACHE | DR_FLUSH_DISK_CACHE); + + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_poll_mode(instance, cmd)) { + con_log(CL_ANN1, (CE_WARN, + "flush_cache: failed to issue MFI_DCMD_CTRL_CACHE_FLUSH")); + } + con_log(CL_DLEVEL1, (CE_NOTE, "done")); +} + +/* + * service_mfi_aen- Completes an AEN command + * @instance: Adapter soft state + * @cmd: Command to be completed + * + */ +static void +service_mfi_aen(struct drsas_instance *instance, struct drsas_cmd *cmd) +{ + uint32_t seq_num; + struct drsas_evt_detail *evt_detail = + (struct drsas_evt_detail *)instance->mfi_evt_detail_obj.buffer; + int rval = 0; + int tgt = 0; + ddi_acc_handle_t acc_handle; + + acc_handle = cmd->frame_dma_obj.acc_handle; + + cmd->cmd_status = ddi_get8(acc_handle, &cmd->frame->io.cmd_status); + + if (cmd->cmd_status == ENODATA) { + cmd->cmd_status = 0; + } + + /* + * log the MFI AEN event to the sysevent queue so that + * application will get noticed + */ + if (ddi_log_sysevent(instance->dip, DDI_VENDOR_LSI, "LSIMEGA", "SAS", + NULL, NULL, DDI_NOSLEEP) != DDI_SUCCESS) { + int instance_no = ddi_get_instance(instance->dip); + con_log(CL_ANN, (CE_WARN, + "dr_sas%d: Failed to log AEN event", instance_no)); + } + /* + * Check for any ld devices that has changed state. i.e. online + * or offline. 
+	 */
+	con_log(CL_ANN1, (CE_NOTE,
+	    "AEN: code = %x class = %x locale = %x args = %x",
+	    ddi_get32(acc_handle, &evt_detail->code),
+	    evt_detail->cl.members.class,
+	    ddi_get16(acc_handle, &evt_detail->cl.members.locale),
+	    ddi_get8(acc_handle, &evt_detail->arg_type)));
+
+	switch (ddi_get32(acc_handle, &evt_detail->code)) {
+	case DR_EVT_CFG_CLEARED: {
+		for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) {
+			if (instance->dr_ld_list[tgt].dip != NULL) {
+				rval = drsas_service_evt(instance, tgt, 0,
+				    DRSAS_EVT_UNCONFIG_TGT, NULL);
+				con_log(CL_ANN1, (CE_WARN,
+				    "dr_sas: CFG CLEARED AEN rval = %d "
+				    "tgt id = %d", rval, tgt));
+			}
+		}
+		break;
+	}
+
+	case DR_EVT_LD_DELETED: {
+		rval = drsas_service_evt(instance,
+		    ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0,
+		    DRSAS_EVT_UNCONFIG_TGT, NULL);
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: LD DELETED AEN rval = %d "
+		    "tgt id = %d index = %d", rval,
+		    ddi_get16(acc_handle, &evt_detail->args.ld.target_id),
+		    ddi_get8(acc_handle, &evt_detail->args.ld.ld_index)));
+		break;
+	} /* End of DR_EVT_LD_DELETED */
+
+	case DR_EVT_LD_CREATED: {
+		rval = drsas_service_evt(instance,
+		    ddi_get16(acc_handle, &evt_detail->args.ld.target_id), 0,
+		    DRSAS_EVT_CONFIG_TGT, NULL);
+		con_log(CL_ANN1, (CE_WARN, "dr_sas: LD CREATED AEN rval = %d "
+		    "tgt id = %d index = %d", rval,
+		    ddi_get16(acc_handle, &evt_detail->args.ld.target_id),
+		    ddi_get8(acc_handle, &evt_detail->args.ld.ld_index)));
+		break;
+	} /* End of DR_EVT_LD_CREATED */
+	} /* End of Main Switch */
+
+	/* get copy of seq_num and class/locale for re-registration */
+	seq_num = ddi_get32(acc_handle, &evt_detail->seq_num);
+	seq_num++;
+	(void) memset(instance->mfi_evt_detail_obj.buffer, 0,
+	    sizeof (struct drsas_evt_detail));
+
+	ddi_put8(acc_handle, &cmd->frame->dcmd.cmd_status, 0x0);
+	ddi_put32(acc_handle, &cmd->frame->dcmd.mbox.w[0], seq_num);
+
+	instance->aen_seq_num = seq_num;
+
+	cmd->frame_count = 1;
+
+	/* Issue the aen registration frame */
+	instance->func_ptr->issue_cmd(cmd, instance);
+}
+
+/*
+ * complete_cmd_in_sync_mode -	Completes an internal command
+ * @instance:			Adapter soft state
+ * @cmd:			Command to be completed
+ *
+ * The issue_cmd_in_sync_mode() function waits for a command to complete
+ * after it issues a command.  This function wakes up that waiting routine
+ * by broadcasting on the instance's int_cmd_cv condition variable.
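+ *
+ * (An assumed sketch of the waiting side, for context; the actual
+ * issue_cmd_in_sync_mode() implementation lives elsewhere in this file:
+ *
+ *	mutex_enter(&instance->int_cmd_mtx);
+ *	while (cmd->sync_cmd == DRSAS_TRUE)
+ *		cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx);
+ *	mutex_exit(&instance->int_cmd_mtx);
+ *
+ * which is why sync_cmd is cleared before the broadcast below.)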
+ */
+static void
+complete_cmd_in_sync_mode(struct drsas_instance *instance,
+    struct drsas_cmd *cmd)
+{
+	cmd->cmd_status = ddi_get8(cmd->frame_dma_obj.acc_handle,
+	    &cmd->frame->io.cmd_status);
+
+	cmd->sync_cmd = DRSAS_FALSE;
+
+	if (cmd->cmd_status == ENODATA) {
+		cmd->cmd_status = 0;
+	}
+
+	cv_broadcast(&instance->int_cmd_cv);
+}
+
+/*
+ * drsas_softintr - The Software ISR
+ * @param arg	: HBA soft state
+ *
+ * Called directly from the hardware ISR when the latter does not run at a
+ * high interrupt level; otherwise it is triggered as a soft interrupt.
+ */
+static uint_t
+drsas_softintr(struct drsas_instance *instance)
+{
+	struct scsi_pkt		*pkt;
+	struct scsa_cmd		*acmd;
+	struct drsas_cmd	*cmd;
+	struct mlist_head	*pos, *next;
+	mlist_t			process_list;
+	struct drsas_header	*hdr;
+	struct scsi_arq_status	*arqstat;
+
+	con_log(CL_ANN1, (CE_CONT, "drsas_softintr called"));
+
+	ASSERT(instance);
+	mutex_enter(&instance->completed_pool_mtx);
+
+	if (mlist_empty(&instance->completed_pool_list)) {
+		mutex_exit(&instance->completed_pool_mtx);
+		return (DDI_INTR_UNCLAIMED);
+	}
+
+	instance->softint_running = 1;
+
+	INIT_LIST_HEAD(&process_list);
+	mlist_splice(&instance->completed_pool_list, &process_list);
+	INIT_LIST_HEAD(&instance->completed_pool_list);
+
+	mutex_exit(&instance->completed_pool_mtx);
+
+	/* perform all callbacks first, before releasing the SCBs */
+	mlist_for_each_safe(pos, next, &process_list) {
+		cmd = mlist_entry(pos, struct drsas_cmd, list);
+
+		/* synchronize the Cmd frame for the controller */
+		(void) ddi_dma_sync(cmd->frame_dma_obj.dma_handle,
+		    0, 0, DDI_DMA_SYNC_FORCPU);
+
+		if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) !=
+		    DDI_SUCCESS) {
+			drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE);
+			ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST);
+			return (DDI_INTR_UNCLAIMED);
+		}
+
+		hdr = &cmd->frame->hdr;
+
+		/* remove the internal command from the process list */
+		mlist_del_init(&cmd->list);
+
+		switch (ddi_get8(cmd->frame_dma_obj.acc_handle, &hdr->cmd)) {
+		case MFI_CMD_OP_PD_SCSI:
+		case MFI_CMD_OP_LD_SCSI:
+		case MFI_CMD_OP_LD_READ:
+		case MFI_CMD_OP_LD_WRITE:
+			/*
+			 * MFI_CMD_OP_PD_SCSI and MFI_CMD_OP_LD_SCSI
+			 * could have been issued either through an
+			 * IO path or an IOCTL path. If it was via IOCTL,
+			 * we will send it to internal completion.
+			 */
+			if (cmd->sync_cmd == DRSAS_TRUE) {
+				complete_cmd_in_sync_mode(instance, cmd);
+				break;
+			}
+
+			/* regular commands */
+			acmd = cmd->cmd;
+			pkt = CMD2PKT(acmd);
+
+			if (acmd->cmd_flags & CFLAG_DMAVALID) {
+				if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+					(void) ddi_dma_sync(acmd->cmd_dmahandle,
+					    acmd->cmd_dma_offset,
+					    acmd->cmd_dma_len,
+					    DDI_DMA_SYNC_FORCPU);
+				}
+			}
+
+			pkt->pkt_reason		= CMD_CMPLT;
+			pkt->pkt_statistics	= 0;
+			pkt->pkt_state = STATE_GOT_BUS
+			    | STATE_GOT_TARGET | STATE_SENT_CMD
+			    | STATE_XFERRED_DATA | STATE_GOT_STATUS;
+
+			con_log(CL_ANN1, (CE_CONT,
+			    "CDB[0] = %x completed for %s: size %lx context %x",
+			    pkt->pkt_cdbp[0], ((acmd->islogical) ?
"LD" : "PD"), + acmd->cmd_dmacount, hdr->context)); + + if (pkt->pkt_cdbp[0] == SCMD_INQUIRY) { + struct scsi_inquiry *inq; + + if (acmd->cmd_dmacount != 0) { + bp_mapin(acmd->cmd_buf); + inq = (struct scsi_inquiry *) + acmd->cmd_buf->b_un.b_addr; + + /* don't expose physical drives to OS */ + if (acmd->islogical && + (hdr->cmd_status == MFI_STAT_OK)) { + display_scsi_inquiry( + (caddr_t)inq); + } else if ((hdr->cmd_status == + MFI_STAT_OK) && inq->inq_dtype == + DTYPE_DIRECT) { + + display_scsi_inquiry( + (caddr_t)inq); + + /* for physical disk */ + hdr->cmd_status = + MFI_STAT_DEVICE_NOT_FOUND; + } + } + } + + switch (hdr->cmd_status) { + case MFI_STAT_OK: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_CC_IN_PROGRESS: + case MFI_STAT_LD_RECON_IN_PROGRESS: + pkt->pkt_scbp[0] = STATUS_GOOD; + break; + case MFI_STAT_LD_INIT_IN_PROGRESS: + con_log(CL_ANN, + (CE_WARN, "Initialization in Progress")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + case MFI_STAT_SCSI_DONE_WITH_ERROR: + con_log(CL_ANN1, (CE_CONT, "scsi_done error")); + + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + if (pkt->pkt_cdbp[0] == SCMD_TEST_UNIT_READY) { + + con_log(CL_ANN, + (CE_WARN, "TEST_UNIT_READY fail")); + + } else { + pkt->pkt_state |= STATE_ARQ_DONE; + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= + STATE_GOT_BUS | STATE_GOT_TARGET + | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + ddi_rep_get8( + cmd->frame_dma_obj.acc_handle, + (uint8_t *) + &(arqstat->sts_sensedata), + cmd->sense, + acmd->cmd_scblen - + offsetof(struct scsi_arq_status, + sts_sensedata), DDI_DEV_AUTOINCR); + } + break; + case MFI_STAT_LD_OFFLINE: + case MFI_STAT_DEVICE_NOT_FOUND: + con_log(CL_ANN1, (CE_CONT, + "device not found error")); + pkt->pkt_reason = CMD_DEV_GONE; + pkt->pkt_statistics = STAT_DISCON; + break; + case MFI_STAT_LD_LBA_OUT_OF_RANGE: + pkt->pkt_state |= STATE_ARQ_DONE; + pkt->pkt_reason = CMD_CMPLT; + ((struct scsi_status *) + pkt->pkt_scbp)->sts_chk = 1; + + arqstat = (void *)(pkt->pkt_scbp); + arqstat->sts_rqpkt_reason = CMD_CMPLT; + arqstat->sts_rqpkt_resid = 0; + arqstat->sts_rqpkt_state |= STATE_GOT_BUS + | STATE_GOT_TARGET | STATE_SENT_CMD + | STATE_XFERRED_DATA; + *(uint8_t *)&arqstat->sts_rqpkt_status = + STATUS_GOOD; + + arqstat->sts_sensedata.es_valid = 1; + arqstat->sts_sensedata.es_key = + KEY_ILLEGAL_REQUEST; + arqstat->sts_sensedata.es_class = + CLASS_EXTENDED_SENSE; + + /* + * LOGICAL BLOCK ADDRESS OUT OF RANGE: + * ASC: 0x21h; ASCQ: 0x00h; + */ + arqstat->sts_sensedata.es_add_code = 0x21; + arqstat->sts_sensedata.es_qual_code = 0x00; + + break; + + default: + con_log(CL_ANN, (CE_CONT, "Unknown status!")); + pkt->pkt_reason = CMD_TRAN_ERR; + + break; + } + + atomic_add_16(&instance->fw_outstanding, (-1)); + + return_mfi_pkt(instance, cmd); + + (void) drsas_common_check(instance, cmd); + + if (acmd->cmd_dmahandle) { + if (drsas_check_dma_handle( + acmd->cmd_dmahandle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, + DDI_SERVICE_UNAFFECTED); + pkt->pkt_reason = CMD_TRAN_ERR; + pkt->pkt_statistics = 0; + } + } + + /* Call the callback routine */ + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + + break; + case MFI_CMD_OP_SMP: + case MFI_CMD_OP_STP: + complete_cmd_in_sync_mode(instance, cmd); + break; + case MFI_CMD_OP_DCMD: + /* see if got an event 
notification */ + if (ddi_get32(cmd->frame_dma_obj.acc_handle, + &cmd->frame->dcmd.opcode) == + DR_DCMD_CTRL_EVENT_WAIT) { + if ((instance->aen_cmd == cmd) && + (instance->aen_cmd->abort_aen)) { + con_log(CL_ANN, (CE_WARN, + "drsas_softintr: " + "aborted_aen returned")); + } else { + atomic_add_16(&instance->fw_outstanding, + (-1)); + service_mfi_aen(instance, cmd); + } + } else { + complete_cmd_in_sync_mode(instance, cmd); + } + + break; + case MFI_CMD_OP_ABORT: + con_log(CL_ANN, (CE_WARN, "MFI_CMD_OP_ABORT complete")); + /* + * MFI_CMD_OP_ABORT successfully completed + * in the synchronous mode + */ + complete_cmd_in_sync_mode(instance, cmd); + break; + default: + drsas_fm_ereport(instance, DDI_FM_DEVICE_NO_RESPONSE); + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + + if (cmd->pkt != NULL) { + pkt = cmd->pkt; + if (((pkt->pkt_flags & FLAG_NOINTR) == 0) && + pkt->pkt_comp) { + (*pkt->pkt_comp)(pkt); + } + } + con_log(CL_ANN, (CE_WARN, "Cmd type unknown !")); + break; + } + } + + instance->softint_running = 0; + + return (DDI_INTR_CLAIMED); +} + +/* + * drsas_alloc_dma_obj + * + * Allocate the memory and other resources for an dma object. + */ +static int +drsas_alloc_dma_obj(struct drsas_instance *instance, dma_obj_t *obj, + uchar_t endian_flags) +{ + int i; + size_t alen = 0; + uint_t cookie_cnt; + struct ddi_device_acc_attr tmp_endian_attr; + + tmp_endian_attr = endian_attr; + tmp_endian_attr.devacc_attr_endian_flags = endian_flags; + + i = ddi_dma_alloc_handle(instance->dip, &obj->dma_attr, + DDI_DMA_SLEEP, NULL, &obj->dma_handle); + if (i != DDI_SUCCESS) { + + switch (i) { + case DDI_DMA_BADATTR : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- Bad attribute")); + break; + case DDI_DMA_NORESOURCES : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle- No Resources")); + break; + default : + con_log(CL_ANN, (CE_WARN, + "Failed ddi_dma_alloc_handle: " + "unknown status %d", i)); + break; + } + + return (-1); + } + + if ((ddi_dma_mem_alloc(obj->dma_handle, obj->size, &tmp_endian_attr, + DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL, + &obj->buffer, &alen, &obj->acc_handle) != DDI_SUCCESS) || + alen < obj->size) { + + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_mem_alloc")); + + return (-1); + } + + if (ddi_dma_addr_bind_handle(obj->dma_handle, NULL, obj->buffer, + obj->size, DDI_DMA_RDWR | DDI_DMA_STREAMING, DDI_DMA_SLEEP, + NULL, &obj->dma_cookie[0], &cookie_cnt) != DDI_SUCCESS) { + + ddi_dma_mem_free(&obj->acc_handle); + ddi_dma_free_handle(&obj->dma_handle); + + con_log(CL_ANN, (CE_WARN, "Failed : ddi_dma_addr_bind_handle")); + + return (-1); + } + + if (drsas_check_dma_handle(obj->dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + if (drsas_check_acc_handle(obj->acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_LOST); + return (-1); + } + + return (cookie_cnt); +} + +/* + * drsas_free_dma_obj(struct drsas_instance *, dma_obj_t) + * + * De-allocate the memory and other resources for an dma object, which must + * have been alloated by a previous call to drsas_alloc_dma_obj() + */ +static int +drsas_free_dma_obj(struct drsas_instance *instance, dma_obj_t obj) +{ + + if (drsas_check_dma_handle(obj.dma_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + if (drsas_check_acc_handle(obj.acc_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, 
DDI_SERVICE_UNAFFECTED); + return (DDI_FAILURE); + } + + (void) ddi_dma_unbind_handle(obj.dma_handle); + ddi_dma_mem_free(&obj.acc_handle); + ddi_dma_free_handle(&obj.dma_handle); + + return (DDI_SUCCESS); +} + +/* + * drsas_dma_alloc(instance_t *, struct scsi_pkt *, struct buf *, + * int, int (*)()) + * + * Allocate dma resources for a new scsi command + */ +static int +drsas_dma_alloc(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp, int flags, int (*callback)()) +{ + int dma_flags; + int (*cb)(caddr_t); + int i; + + ddi_dma_attr_t tmp_dma_attr = drsas_generic_dma_attr; + struct scsa_cmd *acmd = PKT2CMD(pkt); + + acmd->cmd_buf = bp; + + if (bp->b_flags & B_READ) { + acmd->cmd_flags &= ~CFLAG_DMASEND; + dma_flags = DDI_DMA_READ; + } else { + acmd->cmd_flags |= CFLAG_DMASEND; + dma_flags = DDI_DMA_WRITE; + } + + if (flags & PKT_CONSISTENT) { + acmd->cmd_flags |= CFLAG_CONSISTENT; + dma_flags |= DDI_DMA_CONSISTENT; + } + + if (flags & PKT_DMA_PARTIAL) { + dma_flags |= DDI_DMA_PARTIAL; + } + + dma_flags |= DDI_DMA_REDZONE; + + cb = (callback == NULL_FUNC) ? DDI_DMA_DONTWAIT : DDI_DMA_SLEEP; + + tmp_dma_attr.dma_attr_sgllen = instance->max_num_sge; + tmp_dma_attr.dma_attr_addr_hi = 0xffffffffffffffffull; + + if ((i = ddi_dma_alloc_handle(instance->dip, &tmp_dma_attr, + cb, 0, &acmd->cmd_dmahandle)) != DDI_SUCCESS) { + switch (i) { + case DDI_DMA_BADATTR: + bioerror(bp, EFAULT); + return (DDI_FAILURE); + + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + return (DDI_FAILURE); + + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_alloc_handle: " + "impossible result (0x%x)", i)); + bioerror(bp, EFAULT); + return (DDI_FAILURE); + } + } + + i = ddi_dma_buf_bind_handle(acmd->cmd_dmahandle, bp, dma_flags, + cb, 0, &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies); + + switch (i) { + case DDI_DMA_PARTIAL_MAP: + if ((dma_flags & DDI_DMA_PARTIAL) == 0) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "DDI_DMA_PARTIAL_MAP impossible")); + goto no_dma_cookies; + } + + if (ddi_dma_numwin(acmd->cmd_dmahandle, &acmd->cmd_nwin) == + DDI_FAILURE) { + con_log(CL_ANN, (CE_PANIC, "ddi_dma_numwin failed")); + goto no_dma_cookies; + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + + con_log(CL_ANN, (CE_PANIC, "ddi_dma_getwin failed")); + goto no_dma_cookies; + } + + goto get_dma_cookies; + case DDI_DMA_MAPPED: + acmd->cmd_nwin = 1; + acmd->cmd_dma_len = 0; + acmd->cmd_dma_offset = 0; + +get_dma_cookies: + i = 0; + acmd->cmd_dmacount = 0; + for (;;) { + acmd->cmd_dmacount += + acmd->cmd_dmacookies[i++].dmac_size; + + if (i == instance->max_num_sge || + i == acmd->cmd_ncookies) + break; + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookie = i; + acmd->cmd_cookiecnt = i; + + acmd->cmd_flags |= CFLAG_DMAVALID; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); + case DDI_DMA_NORESOURCES: + bioerror(bp, 0); + break; + case DDI_DMA_NOMAPPING: + bioerror(bp, EFAULT); + break; + case DDI_DMA_TOOBIG: + bioerror(bp, EINVAL); + break; + case DDI_DMA_INUSE: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle:" + " DDI_DMA_INUSE impossible")); + break; + default: + con_log(CL_ANN, (CE_PANIC, "ddi_dma_buf_bind_handle: " + "impossible result (0x%x)", i)); + break; + } + +no_dma_cookies: + 
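+	/*
+	 * Shared failure exit: the bind paths above have already recorded
+	 * an error on the buf via bioerror() where applicable; all that is
+	 * left is to release the DMA handle and clear CFLAG_DMAVALID.
+	 */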
ddi_dma_free_handle(&acmd->cmd_dmahandle); + acmd->cmd_dmahandle = NULL; + acmd->cmd_flags &= ~CFLAG_DMAVALID; + return (DDI_FAILURE); +} + +/* + * drsas_dma_move(struct drsas_instance *, struct scsi_pkt *, struct buf *) + * + * move dma resources to next dma window + * + */ +static int +drsas_dma_move(struct drsas_instance *instance, struct scsi_pkt *pkt, + struct buf *bp) +{ + int i = 0; + + struct scsa_cmd *acmd = PKT2CMD(pkt); + + /* + * If there are no more cookies remaining in this window, + * must move to the next window first. + */ + if (acmd->cmd_cookie == acmd->cmd_ncookies) { + if (acmd->cmd_curwin == acmd->cmd_nwin && acmd->cmd_nwin == 1) { + return (DDI_SUCCESS); + } + + /* at last window, cannot move */ + if (++acmd->cmd_curwin >= acmd->cmd_nwin) { + return (DDI_FAILURE); + } + + if (ddi_dma_getwin(acmd->cmd_dmahandle, acmd->cmd_curwin, + &acmd->cmd_dma_offset, &acmd->cmd_dma_len, + &acmd->cmd_dmacookies[0], &acmd->cmd_ncookies) == + DDI_FAILURE) { + return (DDI_FAILURE); + } + + acmd->cmd_cookie = 0; + } else { + /* still more cookies in this window - get the next one */ + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[0]); + } + + /* get remaining cookies in this window, up to our maximum */ + for (;;) { + acmd->cmd_dmacount += acmd->cmd_dmacookies[i++].dmac_size; + acmd->cmd_cookie++; + + if (i == instance->max_num_sge || + acmd->cmd_cookie == acmd->cmd_ncookies) { + break; + } + + ddi_dma_nextcookie(acmd->cmd_dmahandle, + &acmd->cmd_dmacookies[i]); + } + + acmd->cmd_cookiecnt = i; + + if (bp->b_bcount >= acmd->cmd_dmacount) { + pkt->pkt_resid = bp->b_bcount - acmd->cmd_dmacount; + } else { + pkt->pkt_resid = 0; + } + + return (DDI_SUCCESS); +} + +/* + * build_cmd + */ +static struct drsas_cmd * +build_cmd(struct drsas_instance *instance, struct scsi_address *ap, + struct scsi_pkt *pkt, uchar_t *cmd_done) +{ + uint16_t flags = 0; + uint32_t i; + uint32_t context; + uint32_t sge_bytes; + ddi_acc_handle_t acc_handle; + struct drsas_cmd *cmd; + struct drsas_sge64 *mfi_sgl; + struct scsa_cmd *acmd = PKT2CMD(pkt); + struct drsas_pthru_frame *pthru; + struct drsas_io_frame *ldio; + + /* find out if this is logical or physical drive command. 
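+	 * (MRDRV_IS_LOGICAL() and MAP_DEVICE_ID() are macros over the
+	 * scsi_address; presumably LUN 0 addresses a logical drive and
+	 * anything else is treated as a pass-through physical device,
+	 * with the mapped id used as the frame's target_id below.)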
*/
+	acmd->islogical = MRDRV_IS_LOGICAL(ap);
+	acmd->device_id = MAP_DEVICE_ID(instance, ap);
+	*cmd_done = 0;
+
+	/* get the command packet */
+	if (!(cmd = get_mfi_pkt(instance))) {
+		return (NULL);
+	}
+
+	acc_handle = cmd->frame_dma_obj.acc_handle;
+
+	/* Clear the frame buffer and assign back the context id */
+	(void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame));
+	ddi_put32(acc_handle, &cmd->frame->hdr.context, cmd->index);
+
+	cmd->pkt = pkt;
+	cmd->cmd = acmd;
+
+	/* determine the command direction */
+	if (acmd->cmd_flags & CFLAG_DMASEND) {
+		flags = MFI_FRAME_DIR_WRITE;
+
+		if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+			(void) ddi_dma_sync(acmd->cmd_dmahandle,
+			    acmd->cmd_dma_offset, acmd->cmd_dma_len,
+			    DDI_DMA_SYNC_FORDEV);
+		}
+	} else if (acmd->cmd_flags & ~CFLAG_DMASEND) {
+		flags = MFI_FRAME_DIR_READ;
+
+		if (acmd->cmd_flags & CFLAG_CONSISTENT) {
+			(void) ddi_dma_sync(acmd->cmd_dmahandle,
+			    acmd->cmd_dma_offset, acmd->cmd_dma_len,
+			    DDI_DMA_SYNC_FORCPU);
+		}
+	} else {
+		flags = MFI_FRAME_DIR_NONE;
+	}
+
+	flags |= MFI_FRAME_SGL64;
+
+	switch (pkt->pkt_cdbp[0]) {
+
+	/*
+	 * case SCMD_SYNCHRONIZE_CACHE:
+	 *	flush_cache(instance);
+	 *	return_mfi_pkt(instance, cmd);
+	 *	*cmd_done = 1;
+	 *
+	 *	return (NULL);
+	 */
+
+	case SCMD_READ:
+	case SCMD_WRITE:
+	case SCMD_READ_G1:
+	case SCMD_WRITE_G1:
+		if (acmd->islogical) {
+			ldio = (struct drsas_io_frame *)cmd->frame;
+
+			/*
+			 * prepare the Logical IO frame:
+			 * 2nd bit is zero for all read cmds
+			 */
+			ddi_put8(acc_handle, &ldio->cmd,
+			    (pkt->pkt_cdbp[0] & 0x02) ? MFI_CMD_OP_LD_WRITE
+			    : MFI_CMD_OP_LD_READ);
+			ddi_put8(acc_handle, &ldio->cmd_status, 0x0);
+			ddi_put8(acc_handle, &ldio->scsi_status, 0x0);
+			ddi_put8(acc_handle, &ldio->target_id, acmd->device_id);
+			ddi_put16(acc_handle, &ldio->timeout, 0);
+			ddi_put8(acc_handle, &ldio->reserved_0, 0);
+			ddi_put16(acc_handle, &ldio->pad_0, 0);
+			ddi_put16(acc_handle, &ldio->flags, flags);
+
+			/* Initialize sense information */
+			bzero(cmd->sense, SENSE_LENGTH);
+			ddi_put8(acc_handle, &ldio->sense_len, SENSE_LENGTH);
+			ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_hi, 0);
+			ddi_put32(acc_handle, &ldio->sense_buf_phys_addr_lo,
+			    cmd->sense_phys_addr);
+			ddi_put32(acc_handle, &ldio->start_lba_hi, 0);
+			ddi_put8(acc_handle, &ldio->access_byte,
+			    (acmd->cmd_cdblen != 6) ?
+			    pkt->pkt_cdbp[1] : 0);
+			ddi_put8(acc_handle, &ldio->sge_count,
+			    acmd->cmd_cookiecnt);
+			mfi_sgl = (struct drsas_sge64 *)&ldio->sgl;
+
+			context = ddi_get32(acc_handle, &ldio->context);
+
+			if (acmd->cmd_cdblen == CDB_GROUP0) {
+				ddi_put32(acc_handle, &ldio->lba_count, (
+				    (uint16_t)(pkt->pkt_cdbp[4])));
+
+				ddi_put32(acc_handle, &ldio->start_lba_lo, (
+				    ((uint32_t)(pkt->pkt_cdbp[3])) |
+				    ((uint32_t)(pkt->pkt_cdbp[2]) << 8) |
+				    ((uint32_t)((pkt->pkt_cdbp[1]) & 0x1F)
+				    << 16)));
+			} else if (acmd->cmd_cdblen == CDB_GROUP1) {
+				ddi_put32(acc_handle, &ldio->lba_count, (
+				    ((uint16_t)(pkt->pkt_cdbp[8])) |
+				    ((uint16_t)(pkt->pkt_cdbp[7]) << 8)));
+
+				ddi_put32(acc_handle, &ldio->start_lba_lo, (
+				    ((uint32_t)(pkt->pkt_cdbp[5])) |
+				    ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+			} else if (acmd->cmd_cdblen == CDB_GROUP2) {
+				ddi_put32(acc_handle, &ldio->lba_count, (
+				    ((uint16_t)(pkt->pkt_cdbp[9])) |
+				    ((uint16_t)(pkt->pkt_cdbp[8]) << 8) |
+				    ((uint16_t)(pkt->pkt_cdbp[7]) << 16) |
+				    ((uint16_t)(pkt->pkt_cdbp[6]) << 24)));
+
+				ddi_put32(acc_handle, &ldio->start_lba_lo, (
+				    ((uint32_t)(pkt->pkt_cdbp[5])) |
+				    ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+			} else if (acmd->cmd_cdblen == CDB_GROUP3) {
+				ddi_put32(acc_handle, &ldio->lba_count, (
+				    ((uint16_t)(pkt->pkt_cdbp[13])) |
+				    ((uint16_t)(pkt->pkt_cdbp[12]) << 8) |
+				    ((uint16_t)(pkt->pkt_cdbp[11]) << 16) |
+				    ((uint16_t)(pkt->pkt_cdbp[10]) << 24)));
+
+				/* bytes 6-9 hold the low 32 bits of the LBA */
+				ddi_put32(acc_handle, &ldio->start_lba_lo, (
+				    ((uint32_t)(pkt->pkt_cdbp[9])) |
+				    ((uint32_t)(pkt->pkt_cdbp[8]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[7]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[6]) << 24)));
+
+				/* bytes 2-5 hold the high 32 bits of the LBA */
+				ddi_put32(acc_handle, &ldio->start_lba_hi, (
+				    ((uint32_t)(pkt->pkt_cdbp[5])) |
+				    ((uint32_t)(pkt->pkt_cdbp[4]) << 8) |
+				    ((uint32_t)(pkt->pkt_cdbp[3]) << 16) |
+				    ((uint32_t)(pkt->pkt_cdbp[2]) << 24)));
+			}
+
+			break;
+		}
+		/* FALLTHROUGH for all non-read/write cmds */
+	default:
+
+		switch (pkt->pkt_cdbp[0]) {
+		case SCMD_MODE_SENSE:
+		case SCMD_MODE_SENSE_G1: {
+			union scsi_cdb *cdbp;
+			uint16_t page_code;
+
+			cdbp = (void *)pkt->pkt_cdbp;
+			page_code = (uint16_t)cdbp->cdb_un.sg.scsi[0];
+			switch (page_code) {
+			case 0x3:
+			case 0x4:
+				(void) drsas_mode_sense_build(pkt);
+				return_mfi_pkt(instance, cmd);
+				*cmd_done = 1;
+				return (NULL);
+			}
+			break;
+		}
+		default:
+			break;
+		}
+
+		pthru = (struct drsas_pthru_frame *)cmd->frame;
+
+		/* prepare the DCDB frame */
+		ddi_put8(acc_handle, &pthru->cmd, (acmd->islogical) ?
+ MFI_CMD_OP_LD_SCSI : MFI_CMD_OP_PD_SCSI); + ddi_put8(acc_handle, &pthru->cmd_status, 0x0); + ddi_put8(acc_handle, &pthru->scsi_status, 0x0); + ddi_put8(acc_handle, &pthru->target_id, acmd->device_id); + ddi_put8(acc_handle, &pthru->lun, 0); + ddi_put8(acc_handle, &pthru->cdb_len, acmd->cmd_cdblen); + ddi_put16(acc_handle, &pthru->timeout, 0); + ddi_put16(acc_handle, &pthru->flags, flags); + ddi_put32(acc_handle, &pthru->data_xfer_len, + acmd->cmd_dmacount); + ddi_put8(acc_handle, &pthru->sge_count, acmd->cmd_cookiecnt); + mfi_sgl = (struct drsas_sge64 *)&pthru->sgl; + + bzero(cmd->sense, SENSE_LENGTH); + ddi_put8(acc_handle, &pthru->sense_len, SENSE_LENGTH); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, + cmd->sense_phys_addr); + + context = ddi_get32(acc_handle, &pthru->context); + ddi_rep_put8(acc_handle, (uint8_t *)pkt->pkt_cdbp, + (uint8_t *)pthru->cdb, acmd->cmd_cdblen, DDI_DEV_AUTOINCR); + + break; + } +#ifdef lint + context = context; +#endif + /* prepare the scatter-gather list for the firmware */ + for (i = 0; i < acmd->cmd_cookiecnt; i++, mfi_sgl++) { + ddi_put64(acc_handle, &mfi_sgl->phys_addr, + acmd->cmd_dmacookies[i].dmac_laddress); + ddi_put32(acc_handle, &mfi_sgl->length, + acmd->cmd_dmacookies[i].dmac_size); + } + + sge_bytes = sizeof (struct drsas_sge64)*acmd->cmd_cookiecnt; + + cmd->frame_count = (sge_bytes / MRMFI_FRAME_SIZE) + + ((sge_bytes % MRMFI_FRAME_SIZE) ? 1 : 0) + 1; + + if (cmd->frame_count >= 8) { + cmd->frame_count = 8; + } + + return (cmd); +} + +/* + * issue_mfi_pthru + */ +static int +issue_mfi_pthru(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint_t model; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + dma_obj_t pthru_dma_obj; + struct drsas_pthru_frame *kpthru; + struct drsas_pthru_frame *pthru; + int i; + pthru = &cmd->frame->pthru; + kpthru = (struct drsas_pthru_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + + xferlen = kpthru->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP32")); + xferlen = kpthru->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_pthru: DDI_MODEL_LP64")); + xferlen = kpthru->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kpthru->sgl.sge64[0].phys_addr; +#endif + } + + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + pthru_dma_obj.size = xferlen; + pthru_dma_obj.dma_attr = drsas_generic_dma_attr; + pthru_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + pthru_dma_obj.dma_attr.dma_attr_sgllen = 1; + pthru_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &pthru_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_pthru: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kpthru->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf+i, + (uint8_t 
*)pthru_dma_obj.buffer+i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = pthru_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &pthru->cmd, kpthru->cmd); + ddi_put8(acc_handle, &pthru->sense_len, kpthru->sense_len); + ddi_put8(acc_handle, &pthru->cmd_status, 0); + ddi_put8(acc_handle, &pthru->scsi_status, 0); + ddi_put8(acc_handle, &pthru->target_id, kpthru->target_id); + ddi_put8(acc_handle, &pthru->lun, kpthru->lun); + ddi_put8(acc_handle, &pthru->cdb_len, kpthru->cdb_len); + ddi_put8(acc_handle, &pthru->sge_count, kpthru->sge_count); + ddi_put16(acc_handle, &pthru->timeout, kpthru->timeout); + ddi_put32(acc_handle, &pthru->data_xfer_len, kpthru->data_xfer_len); + + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_hi, 0); + /* pthru->sense_buf_phys_addr_lo = cmd->sense_phys_addr; */ + ddi_put32(acc_handle, &pthru->sense_buf_phys_addr_lo, 0); + + ddi_rep_put8(acc_handle, (uint8_t *)kpthru->cdb, (uint8_t *)pthru->cdb, + pthru->cdb_len, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &pthru->flags, kpthru->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &pthru->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru: fw_ioctl failed")); + } else { + if (xferlen && kpthru->flags & MFI_FRAME_DIR_READ) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)pthru_dma_obj.buffer+i, + (uint8_t *)ubuf+i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_pthru : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kpthru->cmd_status = ddi_get8(acc_handle, &pthru->cmd_status); + kpthru->scsi_status = ddi_get8(acc_handle, &pthru->scsi_status); + + con_log(CL_ANN, (CE_NOTE, "issue_mfi_pthru: cmd_status %x, " + "scsi_status %x", kpthru->cmd_status, kpthru->scsi_status)); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, pthru_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_dcmd + */ +static int +issue_mfi_dcmd(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *ubuf; + uint32_t kphys_addr = 0; + uint32_t xferlen = 0; + uint32_t model; + dma_obj_t dcmd_dma_obj; + struct drsas_dcmd_frame *kdcmd; + struct drsas_dcmd_frame *dcmd; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + int i; + dcmd = &cmd->frame->dcmd; + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_dcmd: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + if (xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + dcmd_dma_obj.size = xferlen; + dcmd_dma_obj.dma_attr = drsas_generic_dma_attr; + 
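+		/*
+		 * Keep the transfer buffer in a single, byte-aligned
+		 * segment below 4GB: its address is handed to the
+		 * firmware through the 32-bit sge32[0].phys_addr field
+		 * set up further down.
+		 */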
dcmd_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + dcmd_dma_obj.dma_attr.dma_attr_sgllen = 1; + dcmd_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &dcmd_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + if (kdcmd->flags & MFI_FRAME_DIR_WRITE) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyin((uint8_t *)ubuf + i, + (uint8_t *)dcmd_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + kphys_addr = dcmd_dma_obj.dma_cookie[0].dmac_address; + } + + ddi_put8(acc_handle, &dcmd->cmd, kdcmd->cmd); + ddi_put8(acc_handle, &dcmd->cmd_status, 0); + ddi_put8(acc_handle, &dcmd->sge_count, kdcmd->sge_count); + ddi_put16(acc_handle, &dcmd->timeout, kdcmd->timeout); + ddi_put32(acc_handle, &dcmd->data_xfer_len, kdcmd->data_xfer_len); + ddi_put32(acc_handle, &dcmd->opcode, kdcmd->opcode); + + ddi_rep_put8(acc_handle, (uint8_t *)kdcmd->mbox.b, + (uint8_t *)dcmd->mbox.b, DCMD_MBOX_SZ, DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &dcmd->flags, kdcmd->flags & ~MFI_FRAME_SGL64); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].length, xferlen); + ddi_put32(acc_handle, &dcmd->sgl.sge32[0].phys_addr, kphys_addr); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_dcmd: fw_ioctl failed")); + } else { + if (xferlen && (kdcmd->flags & MFI_FRAME_DIR_READ)) { + for (i = 0; i < xferlen; i++) { + if (ddi_copyout( + (uint8_t *)dcmd_dma_obj.buffer + i, + (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_dcmd : " + "copy to user space failed")); + return (DDI_FAILURE); + } + } + } + } + + kdcmd->cmd_status = ddi_get8(acc_handle, &dcmd->cmd_status); + + if (xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, dcmd_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * issue_mfi_smp + */ +static int +issue_mfi_smp(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + struct drsas_cmd *cmd, int mode) +{ + void *request_ubuf; + void *response_ubuf; + uint32_t request_xferlen = 0; + uint32_t response_xferlen = 0; + uint_t model; + dma_obj_t request_dma_obj; + dma_obj_t response_dma_obj; + ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle; + struct drsas_smp_frame *ksmp; + struct drsas_smp_frame *smp; + struct drsas_sge32 *sge32; +#ifndef _ILP32 + struct drsas_sge64 *sge64; +#endif + int i; + uint64_t tmp_sas_addr; + + smp = &cmd->frame->smp; + ksmp = (struct drsas_smp_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, 
request_ubuf)); + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_ILP32")); + + sge32 = &ksmp->sgl[0].sge32[0]; + response_xferlen = sge32[0].length; + request_xferlen = sge32[1].length; + con_log(CL_ANN, (CE_NOTE, "issue_mfi_smp: " + "response_xferlen = %x, request_xferlen = %x", + response_xferlen, request_xferlen)); + + response_ubuf = (void *)(ulong_t)sge32[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge32[1].phys_addr; + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: " + "response_ubuf = %p, request_ubuf = %p", + response_ubuf, request_ubuf)); +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: DDI_MODEL_LP64")); + + sge64 = &ksmp->sgl[0].sge64[0]; + response_xferlen = sge64[0].length; + request_xferlen = sge64[1].length; + + response_ubuf = (void *)(ulong_t)sge64[0].phys_addr; + request_ubuf = (void *)(ulong_t)sge64[1].phys_addr; +#endif + } + if (request_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + request_dma_obj.size = request_xferlen; + request_dma_obj.dma_attr = drsas_generic_dma_attr; + request_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + request_dma_obj.dma_attr.dma_attr_sgllen = 1; + request_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &request_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < request_xferlen; i++) { + if (ddi_copyin((uint8_t *)request_ubuf + i, + (uint8_t *)request_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (response_xferlen) { + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + response_dma_obj.size = response_xferlen; + response_dma_obj.dma_attr = drsas_generic_dma_attr; + response_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + response_dma_obj.dma_attr.dma_attr_sgllen = 1; + response_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &response_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < response_xferlen; i++) { + if (ddi_copyin((uint8_t *)response_ubuf + i, + (uint8_t *)response_dma_obj.buffer + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_smp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &smp->cmd, ksmp->cmd); + ddi_put8(acc_handle, &smp->cmd_status, 0); + ddi_put8(acc_handle, &smp->connection_status, 0); + ddi_put8(acc_handle, &smp->sge_count, ksmp->sge_count); + /* smp->context = ksmp->context; */ + ddi_put16(acc_handle, &smp->timeout, ksmp->timeout); + ddi_put32(acc_handle, &smp->data_xfer_len, ksmp->data_xfer_len); + + bcopy((void *)&ksmp->sas_addr, (void *)&tmp_sas_addr, + sizeof (uint64_t)); + ddi_put64(acc_handle, &smp->sas_addr, tmp_sas_addr); + + ddi_put16(acc_handle, &smp->flags, ksmp->flags & ~MFI_FRAME_SGL64); + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { 
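+		/*
+		 * SGE[0] describes the response buffer and SGE[1] the
+		 * request buffer, matching how the user-supplied SGL
+		 * was decoded above.
+		 */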
+		con_log(CL_ANN1, (CE_NOTE,
+		    "issue_mfi_smp: DDI_MODEL_ILP32"));
+
+		sge32 = &smp->sgl[0].sge32[0];
+		ddi_put32(acc_handle, &sge32[0].length, response_xferlen);
+		ddi_put32(acc_handle, &sge32[0].phys_addr,
+		    response_dma_obj.dma_cookie[0].dmac_address);
+		ddi_put32(acc_handle, &sge32[1].length, request_xferlen);
+		ddi_put32(acc_handle, &sge32[1].phys_addr,
+		    request_dma_obj.dma_cookie[0].dmac_address);
+	} else {
+#ifdef _ILP32
+		con_log(CL_ANN1, (CE_NOTE,
+		    "issue_mfi_smp: DDI_MODEL_ILP32"));
+		sge32 = &smp->sgl[0].sge32[0];
+		ddi_put32(acc_handle, &sge32[0].length, response_xferlen);
+		ddi_put32(acc_handle, &sge32[0].phys_addr,
+		    response_dma_obj.dma_cookie[0].dmac_address);
+		ddi_put32(acc_handle, &sge32[1].length, request_xferlen);
+		ddi_put32(acc_handle, &sge32[1].phys_addr,
+		    request_dma_obj.dma_cookie[0].dmac_address);
+#else
+		con_log(CL_ANN1, (CE_NOTE,
+		    "issue_mfi_smp: DDI_MODEL_LP64"));
+		sge64 = &smp->sgl[0].sge64[0];
+		ddi_put32(acc_handle, &sge64[0].length, response_xferlen);
+		ddi_put64(acc_handle, &sge64[0].phys_addr,
+		    response_dma_obj.dma_cookie[0].dmac_address);
+		ddi_put32(acc_handle, &sge64[1].length, request_xferlen);
+		ddi_put64(acc_handle, &sge64[1].phys_addr,
+		    request_dma_obj.dma_cookie[0].dmac_address);
+#endif
+	}
+	con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp : "
+	    "smp->response_xferlen = %d, smp->request_xferlen = %d "
+	    "smp->data_xfer_len = %d", response_xferlen, request_xferlen,
+	    ddi_get32(acc_handle, &smp->data_xfer_len)));
+
+	cmd->sync_cmd = DRSAS_TRUE;
+	cmd->frame_count = 1;
+
+	if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) {
+		con_log(CL_ANN, (CE_WARN,
+		    "issue_mfi_smp: fw_ioctl failed"));
+	} else {
+		con_log(CL_ANN1, (CE_NOTE,
+		    "issue_mfi_smp: copy to user space"));
+
+		if (request_xferlen) {
+			for (i = 0; i < request_xferlen; i++) {
+				if (ddi_copyout(
+				    (uint8_t *)request_dma_obj.buffer +
+				    i, (uint8_t *)request_ubuf + i,
+				    1, mode)) {
+					con_log(CL_ANN, (CE_WARN,
+					    "issue_mfi_smp : copy to user"
+					    " space failed"));
+					return (DDI_FAILURE);
+				}
+			}
+		}
+
+		if (response_xferlen) {
+			for (i = 0; i < response_xferlen; i++) {
+				if (ddi_copyout(
+				    (uint8_t *)response_dma_obj.buffer
+				    + i, (uint8_t *)response_ubuf
+				    + i, 1, mode)) {
+					con_log(CL_ANN, (CE_WARN,
+					    "issue_mfi_smp : copy to "
+					    "user space failed"));
+					return (DDI_FAILURE);
+				}
+			}
+		}
+	}
+
+	ksmp->cmd_status = ddi_get8(acc_handle, &smp->cmd_status);
+	con_log(CL_ANN1, (CE_NOTE, "issue_mfi_smp: smp->cmd_status = %d",
+	    ddi_get8(acc_handle, &smp->cmd_status)));
+
+	if (request_xferlen) {
+		/* free kernel buffer */
+		if (drsas_free_dma_obj(instance, request_dma_obj) !=
+		    DDI_SUCCESS)
+			return (DDI_FAILURE);
+	}
+
+	if (response_xferlen) {
+		/* free kernel buffer */
+		if (drsas_free_dma_obj(instance, response_dma_obj) !=
+		    DDI_SUCCESS)
+			return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * issue_mfi_stp
+ */
+static int
+issue_mfi_stp(struct drsas_instance *instance, struct drsas_ioctl *ioctl,
+    struct drsas_cmd *cmd, int mode)
+{
+	void *fis_ubuf;
+	void *data_ubuf;
+	uint32_t fis_xferlen = 0;
+	uint32_t data_xferlen = 0;
+	uint_t model;
+	dma_obj_t fis_dma_obj;
+	dma_obj_t data_dma_obj;
+	struct drsas_stp_frame *kstp;
+	struct drsas_stp_frame *stp;
+	ddi_acc_handle_t acc_handle = cmd->frame_dma_obj.acc_handle;
+	int i;
+
+	stp = &cmd->frame->stp;
+	kstp = (struct drsas_stp_frame *)&ioctl->frame[0];
+
+	model = ddi_model_convert_from(mode & FMODELS);
+	if (model == DDI_MODEL_ILP32) {
+		con_log(CL_ANN1,
(CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; + } + else + { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_ILP32")); + + fis_xferlen = kstp->sgl.sge32[0].length; + data_xferlen = kstp->sgl.sge32[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge32[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge32[1].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, "issue_mfi_stp: DDI_MODEL_LP64")); + + fis_xferlen = kstp->sgl.sge64[0].length; + data_xferlen = kstp->sgl.sge64[1].length; + + fis_ubuf = (void *)(ulong_t)kstp->sgl.sge64[0].phys_addr; + data_ubuf = (void *)(ulong_t)kstp->sgl.sge64[1].phys_addr; +#endif + } + + + if (fis_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: " + "fis_ubuf = %p fis_xferlen = %x", fis_ubuf, fis_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + fis_dma_obj.size = fis_xferlen; + fis_dma_obj.dma_attr = drsas_generic_dma_attr; + fis_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + fis_dma_obj.dma_attr.dma_attr_sgllen = 1; + fis_dma_obj.dma_attr.dma_attr_align = 1; + + /* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &fis_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp : " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyin((uint8_t *)fis_ubuf + i, + (uint8_t *)fis_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + if (data_xferlen) { + con_log(CL_ANN, (CE_NOTE, "issue_mfi_stp: data_ubuf = %p " + "data_xferlen = %x", data_ubuf, data_xferlen)); + + /* means IOCTL requires DMA */ + /* allocate the data transfer buffer */ + data_dma_obj.size = data_xferlen; + data_dma_obj.dma_attr = drsas_generic_dma_attr; + data_dma_obj.dma_attr.dma_attr_addr_hi = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_count_max = 0xFFFFFFFFU; + data_dma_obj.dma_attr.dma_attr_sgllen = 1; + data_dma_obj.dma_attr.dma_attr_align = 1; + +/* allocate kernel buffer for DMA */ + if (drsas_alloc_dma_obj(instance, &data_dma_obj, + (uchar_t)DDI_STRUCTURE_LE_ACC) != 1) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "could not allocate data transfer buffer.")); + return (DDI_FAILURE); + } + + /* If IOCTL requires DMA WRITE, do ddi_copyin IOCTL data copy */ + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyin((uint8_t *)data_ubuf + i, + (uint8_t *)data_dma_obj.buffer + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: " + "copy from user space failed")); + return (DDI_FAILURE); + } + } + } + + ddi_put8(acc_handle, &stp->cmd, kstp->cmd); + ddi_put8(acc_handle, &stp->cmd_status, 0); + ddi_put8(acc_handle, &stp->connection_status, 0); + ddi_put8(acc_handle, &stp->target_id, kstp->target_id); + ddi_put8(acc_handle, &stp->sge_count, kstp->sge_count); + + ddi_put16(acc_handle, &stp->timeout, kstp->timeout); + ddi_put32(acc_handle, &stp->data_xfer_len, kstp->data_xfer_len); + + ddi_rep_put8(acc_handle, (uint8_t *)kstp->fis, (uint8_t *)stp->fis, 10, + DDI_DEV_AUTOINCR); + + ddi_put16(acc_handle, &stp->flags, kstp->flags & ~MFI_FRAME_SGL64); + 
ddi_put32(acc_handle, &stp->stp_flags, kstp->stp_flags); + ddi_put32(acc_handle, &stp->sgl.sge32[0].length, fis_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[0].phys_addr, + fis_dma_obj.dma_cookie[0].dmac_address); + ddi_put32(acc_handle, &stp->sgl.sge32[1].length, data_xferlen); + ddi_put32(acc_handle, &stp->sgl.sge32[1].phys_addr, + data_dma_obj.dma_cookie[0].dmac_address); + + cmd->sync_cmd = DRSAS_TRUE; + cmd->frame_count = 1; + + if (instance->func_ptr->issue_cmd_in_sync_mode(instance, cmd)) { + con_log(CL_ANN, (CE_WARN, "issue_mfi_stp: fw_ioctl failed")); + } else { + + if (fis_xferlen) { + for (i = 0; i < fis_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)fis_dma_obj.buffer + i, + (uint8_t *)fis_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to " + "user space failed")); + return (DDI_FAILURE); + } + } + } + } + if (data_xferlen) { + for (i = 0; i < data_xferlen; i++) { + if (ddi_copyout( + (uint8_t *)data_dma_obj.buffer + i, + (uint8_t *)data_ubuf + i, 1, mode)) { + con_log(CL_ANN, (CE_WARN, + "issue_mfi_stp : copy to" + " user space failed")); + return (DDI_FAILURE); + } + } + } + + kstp->cmd_status = ddi_get8(acc_handle, &stp->cmd_status); + + if (fis_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, fis_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + if (data_xferlen) { + /* free kernel buffer */ + if (drsas_free_dma_obj(instance, data_dma_obj) != DDI_SUCCESS) + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +/* + * fill_up_drv_ver + */ +static void +fill_up_drv_ver(struct drsas_drv_ver *dv) +{ + (void) memset(dv, 0, sizeof (struct drsas_drv_ver)); + + (void) memcpy(dv->signature, "$LSI LOGIC$", strlen("$LSI LOGIC$")); + (void) memcpy(dv->os_name, "Solaris", strlen("Solaris")); + (void) memcpy(dv->drv_name, "dr_sas", strlen("dr_sas")); + (void) memcpy(dv->drv_ver, DRSAS_VERSION, strlen(DRSAS_VERSION)); + (void) memcpy(dv->drv_rel_date, DRSAS_RELDATE, + strlen(DRSAS_RELDATE)); +} + +/* + * handle_drv_ioctl + */ +static int +handle_drv_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int i; + int rval = DDI_SUCCESS; + int *props = NULL; + void *ubuf; + + uint8_t *pci_conf_buf; + uint32_t xferlen; + uint32_t num_props; + uint_t model; + struct drsas_dcmd_frame *kdcmd; + struct drsas_drv_ver dv; + struct drsas_pci_information pi; + + kdcmd = (struct drsas_dcmd_frame *)&ioctl->frame[0]; + + model = ddi_model_convert_from(mode & FMODELS); + if (model == DDI_MODEL_ILP32) { + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + + xferlen = kdcmd->sgl.sge32[0].length; + + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; + } else { +#ifdef _ILP32 + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_ILP32")); + xferlen = kdcmd->sgl.sge32[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge32[0].phys_addr; +#else + con_log(CL_ANN1, (CE_NOTE, + "handle_drv_ioctl: DDI_MODEL_LP64")); + xferlen = kdcmd->sgl.sge64[0].length; + ubuf = (void *)(ulong_t)kdcmd->sgl.sge64[0].phys_addr; +#endif + } + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "dataBuf=%p size=%d bytes", ubuf, xferlen)); + + switch (kdcmd->opcode) { + case DRSAS_DRIVER_IOCTL_DRIVER_VERSION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_DRIVER_VERSION")); + + fill_up_drv_ver(&dv); + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&dv + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + 
"DRSAS_DRIVER_IOCTL_DRIVER_VERSION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + if (i == xferlen) + kdcmd->cmd_status = 0; + break; + case DRSAS_DRIVER_IOCTL_PCI_INFORMATION: + con_log(CL_ANN1, (CE_NOTE, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMAITON")); + + if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, instance->dip, + 0, "reg", &props, &num_props)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION : " + "ddi_prop_look_int_array failed")); + rval = DDI_FAILURE; + } else { + + pi.busNumber = (props[0] >> 16) & 0xFF; + pi.deviceNumber = (props[0] >> 11) & 0x1f; + pi.functionNumber = (props[0] >> 8) & 0x7; + ddi_prop_free((void *)props); + } + + pci_conf_buf = (uint8_t *)&pi.pciHeaderInfo; + + for (i = 0; i < (sizeof (struct drsas_pci_information) - + offsetof(struct drsas_pci_information, pciHeaderInfo)); + i++) { + pci_conf_buf[i] = + pci_config_get8(instance->pci_handle, i); + } + for (i = 0; i < xferlen; i++) { + if (ddi_copyout((uint8_t *)&pi + i, (uint8_t *)ubuf + i, + 1, mode)) { + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "DRSAS_DRIVER_IOCTL_PCI_INFORMATION" + " : copy to user space failed")); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + } + + if (i == xferlen) + kdcmd->cmd_status = 0; + + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_drv_ioctl: " + "invalid driver specific IOCTL opcode = 0x%x", + kdcmd->opcode)); + kdcmd->cmd_status = 1; + rval = DDI_FAILURE; + break; + } + + return (rval); +} + +/* + * handle_mfi_ioctl + */ +static int +handle_mfi_ioctl(struct drsas_instance *instance, struct drsas_ioctl *ioctl, + int mode) +{ + int rval = DDI_SUCCESS; + + struct drsas_header *hdr; + struct drsas_cmd *cmd; + + cmd = get_mfi_pkt(instance); + + if (!cmd) { + con_log(CL_ANN, (CE_WARN, "dr_sas: " + "failed to get a cmd packet")); + return (DDI_FAILURE); + } + + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + hdr = (struct drsas_header *)&ioctl->frame[0]; + + switch (hdr->cmd) { + case MFI_CMD_OP_DCMD: + rval = issue_mfi_dcmd(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_SMP: + rval = issue_mfi_smp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_STP: + rval = issue_mfi_stp(instance, ioctl, cmd, mode); + break; + case MFI_CMD_OP_LD_SCSI: + case MFI_CMD_OP_PD_SCSI: + rval = issue_mfi_pthru(instance, ioctl, cmd, mode); + break; + default: + con_log(CL_ANN, (CE_WARN, "handle_mfi_ioctl: " + "invalid mfi ioctl hdr->cmd = %d", hdr->cmd)); + rval = DDI_FAILURE; + break; + } + + + return_mfi_pkt(instance, cmd); + if (drsas_common_check(instance, cmd) != DDI_SUCCESS) + rval = DDI_FAILURE; + return (rval); +} + +/* + * AEN + */ +static int +handle_mfi_aen(struct drsas_instance *instance, struct drsas_aen *aen) +{ + int rval = 0; + + rval = register_mfi_aen(instance, instance->aen_seq_num, + aen->class_locale_word); + + aen->cmd_status = (uint8_t)rval; + + return (rval); +} + +static int +register_mfi_aen(struct drsas_instance *instance, uint32_t seq_num, + uint32_t class_locale_word) +{ + int ret_val; + + struct drsas_cmd *cmd, *aen_cmd; + struct drsas_dcmd_frame *dcmd; + union drsas_evt_class_locale curr_aen; + union drsas_evt_class_locale prev_aen; + + /* + * If there an AEN pending already (aen_cmd), check if the + * class_locale of that pending AEN is inclusive of the 
new + * AEN request we currently have. If it is, then we don't have + * to do anything. In other words, whichever events the current + * AEN request is subscribing to, have already been subscribed + * to. + * + * If the old_cmd is _not_ inclusive, then we have to abort + * that command, form a class_locale that is superset of both + * old and current and re-issue to the FW + */ + + curr_aen.word = class_locale_word; + aen_cmd = instance->aen_cmd; + if (aen_cmd) { + prev_aen.word = ddi_get32(aen_cmd->frame_dma_obj.acc_handle, + &aen_cmd->frame->dcmd.mbox.w[1]); + + /* + * A class whose enum value is smaller is inclusive of all + * higher values. If a PROGRESS (= -1) was previously + * registered, then a new registration requests for higher + * classes need not be sent to FW. They are automatically + * included. + * + * Locale numbers don't have such hierarchy. They are bitmap + * values + */ + if ((prev_aen.members.class <= curr_aen.members.class) && + !((prev_aen.members.locale & curr_aen.members.locale) ^ + curr_aen.members.locale)) { + /* + * Previously issued event registration includes + * current request. Nothing to do. + */ + + return (0); + } else { + curr_aen.members.locale |= prev_aen.members.locale; + + if (prev_aen.members.class < curr_aen.members.class) + curr_aen.members.class = prev_aen.members.class; + + ret_val = abort_aen_cmd(instance, aen_cmd); + + if (ret_val) { + con_log(CL_ANN, (CE_WARN, "register_mfi_aen: " + "failed to abort prevous AEN command")); + + return (ret_val); + } + } + } else { + curr_aen.word = class_locale_word; + } + + cmd = get_mfi_pkt(instance); + + if (!cmd) + return (ENOMEM); + /* Clear the frame buffer and assign back the context id */ + (void) memset((char *)&cmd->frame[0], 0, sizeof (union drsas_frame)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &cmd->frame->hdr.context, + cmd->index); + + dcmd = &cmd->frame->dcmd; + + /* for(i = 0; i < DCMD_MBOX_SZ; i++) dcmd->mbox.b[i] = 0; */ + (void) memset(dcmd->mbox.b, 0, DCMD_MBOX_SZ); + + (void) memset(instance->mfi_evt_detail_obj.buffer, 0, + sizeof (struct drsas_evt_detail)); + + /* Prepare DCMD for aen registration */ + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd, MFI_CMD_OP_DCMD); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->cmd_status, 0x0); + ddi_put8(cmd->frame_dma_obj.acc_handle, &dcmd->sge_count, 1); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->flags, + MFI_FRAME_DIR_READ); + ddi_put16(cmd->frame_dma_obj.acc_handle, &dcmd->timeout, 0); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->data_xfer_len, + sizeof (struct drsas_evt_detail)); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->opcode, + DR_DCMD_CTRL_EVENT_WAIT); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[0], seq_num); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->mbox.w[1], + curr_aen.word); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].phys_addr, + instance->mfi_evt_detail_obj.dma_cookie[0].dmac_address); + ddi_put32(cmd->frame_dma_obj.acc_handle, &dcmd->sgl.sge32[0].length, + sizeof (struct drsas_evt_detail)); + + instance->aen_seq_num = seq_num; + + + /* + * Store reference to the cmd used to register for AEN. 
When an + * application wants us to register for AEN, we have to abort this + * cmd and re-register with a new EVENT LOCALE supplied by that app + */ + instance->aen_cmd = cmd; + + cmd->frame_count = 1; + + /* Issue the aen registration frame */ + /* atomic_add_16 (&instance->fw_outstanding, 1); */ + instance->func_ptr->issue_cmd(cmd, instance); + + return (0); +} + +static void +display_scsi_inquiry(caddr_t scsi_inq) +{ +#define MAX_SCSI_DEVICE_CODE 14 + int i; + char inquiry_buf[256] = {0}; + int len; + const char *const scsi_device_types[] = { + "Direct-Access ", + "Sequential-Access", + "Printer ", + "Processor ", + "WORM ", + "CD-ROM ", + "Scanner ", + "Optical Device ", + "Medium Changer ", + "Communications ", + "Unknown ", + "Unknown ", + "Unknown ", + "Enclosure ", + }; + + len = 0; + + len += snprintf(inquiry_buf + len, 265 - len, " Vendor: "); + for (i = 8; i < 16; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Model: "); + + for (i = 16; i < 32; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, " Rev: "); + + for (i = 32; i < 36; i++) { + len += snprintf(inquiry_buf + len, 265 - len, "%c", + scsi_inq[i]); + } + + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + + + i = scsi_inq[0] & 0x1f; + + + len += snprintf(inquiry_buf + len, 265 - len, " Type: %s ", + i < MAX_SCSI_DEVICE_CODE ? scsi_device_types[i] : + "Unknown "); + + + len += snprintf(inquiry_buf + len, 265 - len, + " ANSI SCSI revision: %02x", scsi_inq[2] & 0x07); + + if ((scsi_inq[2] & 0x07) == 1 && (scsi_inq[3] & 0x0f) == 1) { + len += snprintf(inquiry_buf + len, 265 - len, " CCS\n"); + } else { + len += snprintf(inquiry_buf + len, 265 - len, "\n"); + } + + con_log(CL_ANN1, (CE_CONT, inquiry_buf)); +} + +static int +read_fw_status_reg_ppc(struct drsas_instance *instance) +{ + return ((int)RD_OB_SCRATCH_PAD_0(instance)); +} + +static void +issue_cmd_ppc(struct drsas_cmd *cmd, struct drsas_instance *instance) +{ + atomic_add_16(&instance->fw_outstanding, 1); + + /* Issue the command to the FW */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); +} + +/* + * issue_cmd_in_sync_mode + */ +static int +issue_cmd_in_sync_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * (10 * MILLISEC); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: called")); + + cmd->cmd_status = ENODATA; + + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + mutex_enter(&instance->int_cmd_mtx); + + for (i = 0; i < msecs && (cmd->cmd_status == ENODATA); i++) { + cv_wait(&instance->int_cmd_cv, &instance->int_cmd_mtx); + } + + mutex_exit(&instance->int_cmd_mtx); + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_sync_mode_ppc: done")); + + if (i < (msecs -1)) { + return (DDI_SUCCESS); + } else { + return (DDI_FAILURE); + } +} + +/* + * issue_cmd_in_poll_mode + */ +static int +issue_cmd_in_poll_mode_ppc(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int i; + uint16_t flags; + uint32_t msecs = MFI_POLL_TIMEOUT_SECS * MILLISEC; + struct drsas_header *frame_hdr; + + con_log(CL_ANN1, (CE_NOTE, "issue_cmd_in_poll_mode_ppc: called")); + + frame_hdr = (struct drsas_header *)cmd->frame; + ddi_put8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status, + MFI_CMD_STATUS_POLL_MODE); + flags = 
ddi_get16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags); + flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE; + + ddi_put16(cmd->frame_dma_obj.acc_handle, &frame_hdr->flags, flags); + + /* issue the frame using inbound queue port */ + WR_IB_QPORT((cmd->frame_phys_addr) | + (((cmd->frame_count - 1) << 1) | 1), instance); + + /* wait for cmd_status to change from 0xFF */ + for (i = 0; i < msecs && ( + ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE); i++) { + drv_usecwait(MILLISEC); /* wait for 1000 usecs */ + } + + if (ddi_get8(cmd->frame_dma_obj.acc_handle, &frame_hdr->cmd_status) + == MFI_CMD_STATUS_POLL_MODE) { + con_log(CL_ANN, (CE_NOTE, "issue_cmd_in_poll_mode: " + "cmd polling timed out")); + return (DDI_FAILURE); + } + + return (DDI_SUCCESS); +} + +static void +enable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: called")); + + /* WR_OB_DOORBELL_CLEAR(0xFFFFFFFF, instance); */ + WR_OB_DOORBELL_CLEAR(OB_DOORBELL_CLEAR_MASK, instance); + + /* WR_OB_INTR_MASK(~0x80000000, instance); */ + WR_OB_INTR_MASK(~(MFI_REPLY_2108_MESSAGE_INTR_MASK), instance); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); + + con_log(CL_ANN1, (CE_NOTE, "enable_intr_ppc: " + "outbound_intr_mask = 0x%x", mask)); +} + +static void +disable_intr_ppc(struct drsas_instance *instance) +{ + uint32_t mask; + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: called")); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: before : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* WR_OB_INTR_MASK(0xFFFFFFFF, instance); */ + WR_OB_INTR_MASK(OB_INTR_MASK, instance); + + con_log(CL_ANN1, (CE_NOTE, "disable_intr_ppc: after : " + "outbound_intr_mask = 0x%x", RD_OB_INTR_MASK(instance))); + + /* dummy read to force PCI flush */ + mask = RD_OB_INTR_MASK(instance); +#ifdef lint + mask = mask; +#endif +} + +static int +intr_ack_ppc(struct drsas_instance *instance) +{ + uint32_t status; + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: called")); + + /* check if it is our interrupt */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: status = 0x%x", status)); + + if (!(status & MFI_REPLY_2108_MESSAGE_INTR)) { + return (DDI_INTR_UNCLAIMED); + } + + /* clear the interrupt by writing back the same value */ + WR_OB_DOORBELL_CLEAR(status, instance); + + /* dummy READ */ + status = RD_OB_INTR_STATUS(instance); + + con_log(CL_ANN1, (CE_NOTE, "intr_ack_ppc: interrupt cleared")); + + return (DDI_INTR_CLAIMED); +} + +static int +drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd) +{ + int ret = DDI_SUCCESS; + + if (drsas_check_dma_handle(cmd->frame_dma_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_internal_dma_obj.dma_handle) + != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if (drsas_check_dma_handle(instance->mfi_evt_detail_obj.dma_handle) != + DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + if 
(drsas_check_acc_handle(instance->regmap_handle) != DDI_SUCCESS) { + ddi_fm_service_impact(instance->dip, DDI_SERVICE_UNAFFECTED); + + ddi_fm_acc_err_clear(instance->regmap_handle, DDI_FME_VER0); + + if (cmd->pkt != NULL) { + cmd->pkt->pkt_reason = CMD_TRAN_ERR; + cmd->pkt->pkt_statistics = 0; + } + ret = DDI_FAILURE; + } + + return (ret); +} + +/*ARGSUSED*/ +static int +drsas_fm_error_cb(dev_info_t *dip, ddi_fm_error_t *err, const void *impl_data) +{ + /* + * as the driver can always deal with an error in any dma or + * access handle, we can just return the fme_status value. + */ + pci_ereport_post(dip, err, NULL); + return (err->fme_status); +} + +static void +drsas_fm_init(struct drsas_instance *instance) +{ + /* Need to change iblock to priority for new MSI intr */ + ddi_iblock_cookie_t fm_ibc; + + /* Only register with IO Fault Services if we have some capability */ + if (instance->fm_capabilities) { + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_FLAGERR_ACC; + drsas_generic_dma_attr.dma_attr_flags = DDI_DMA_FLAGERR; + + /* + * Register capabilities with IO Fault Services. + * fm_capabilities will be updated to indicate + * capabilities actually supported (not requested.) + */ + + ddi_fm_init(instance->dip, &instance->fm_capabilities, &fm_ibc); + + /* + * Initialize pci ereport capabilities if ereport + * capable (should always be.) + */ + + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_setup(instance->dip); + } + + /* + * Register error callback if error callback capable. + */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_register(instance->dip, + drsas_fm_error_cb, (void*) instance); + } + } else { + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +static void +drsas_fm_fini(struct drsas_instance *instance) +{ + /* Only unregister FMA capabilities if registered */ + if (instance->fm_capabilities) { + /* + * Un-register error callback if error callback capable. 
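+		 * (undoing the ddi_fm_handler_register() done in
+		 * drsas_fm_init)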
+ */ + if (DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + ddi_fm_handler_unregister(instance->dip); + } + + /* + * Release any resources allocated by pci_ereport_setup() + */ + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities) || + DDI_FM_ERRCB_CAP(instance->fm_capabilities)) { + pci_ereport_teardown(instance->dip); + } + + /* Unregister from IO Fault Services */ + ddi_fm_fini(instance->dip); + + /* Adjust access and dma attributes for FMA */ + endian_attr.devacc_attr_access = DDI_DEFAULT_ACC; + drsas_generic_dma_attr.dma_attr_flags = 0; + } +} + +int +drsas_check_acc_handle(ddi_acc_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_acc_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +int +drsas_check_dma_handle(ddi_dma_handle_t handle) +{ + ddi_fm_error_t de; + + if (handle == NULL) { + return (DDI_FAILURE); + } + + ddi_fm_dma_err_get(handle, &de, DDI_FME_VERSION); + + return (de.fme_status); +} + +void +drsas_fm_ereport(struct drsas_instance *instance, char *detail) +{ + uint64_t ena; + char buf[FM_MAX_CLASS]; + + (void) snprintf(buf, FM_MAX_CLASS, "%s.%s", DDI_FM_DEVICE, detail); + ena = fm_ena_generate(0, FM_ENA_FMT1); + if (DDI_FM_EREPORT_CAP(instance->fm_capabilities)) { + ddi_fm_ereport_post(instance->dip, buf, ena, DDI_NOSLEEP, + FM_VERSION, DATA_TYPE_UINT8, FM_EREPORT_VERSION, NULL); + } +} + +static int +drsas_add_intrs(struct drsas_instance *instance, int intr_type) +{ + + dev_info_t *dip = instance->dip; + int avail, actual, count; + int i, flag, ret; + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: intr_type = %x", + intr_type)); + + /* Get number of interrupts */ + ret = ddi_intr_get_nintrs(dip, intr_type, &count); + if ((ret != DDI_SUCCESS) || (count == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_nintrs() failed:" + "ret %d count %d", ret, count)); + + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: count = %d ", count)); + + /* Get number of available interrupts */ + ret = ddi_intr_get_navail(dip, intr_type, &avail); + if ((ret != DDI_SUCCESS) || (avail == 0)) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_navail() failed:" + "ret %d avail %d", ret, avail)); + + return (DDI_FAILURE); + } + con_log(CL_DLEVEL1, (CE_WARN, "drsas_add_intrs: avail = %d ", avail)); + + /* Only one interrupt routine. So limit the count to 1 */ + if (count > 1) { + count = 1; + } + + /* + * Allocate an array of interrupt handlers. Currently we support + * only one interrupt. The framework can be extended later. + */ + instance->intr_size = count * sizeof (ddi_intr_handle_t); + instance->intr_htable = kmem_zalloc(instance->intr_size, KM_SLEEP); + ASSERT(instance->intr_htable); + + flag = ((intr_type == DDI_INTR_TYPE_MSI) || (intr_type == + DDI_INTR_TYPE_MSIX)) ? DDI_INTR_ALLOC_STRICT:DDI_INTR_ALLOC_NORMAL; + + /* Allocate interrupt */ + ret = ddi_intr_alloc(dip, instance->intr_htable, intr_type, 0, + count, &actual, flag); + + if ((ret != DDI_SUCCESS) || (actual == 0)) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "avail = %d", avail)); + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + if (actual < count) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "Requested = %d Received = %d", count, actual)); + } + instance->intr_cnt = actual; + + /* + * Get the priority of the interrupt allocated. 
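+	 * The priority is validated against ddi_intr_get_hilevel_pri()
+	 * below, since high-level interrupt handlers are not supported
+	 * by this driver.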
+ */ + if ((ret = ddi_intr_get_pri(instance->intr_htable[0], + &instance->intr_pri)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "get priority call failed")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + /* + * Test for high level mutex. we don't support them. + */ + if (instance->intr_pri >= ddi_intr_get_hilevel_pri()) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs: " + "High level interrupts not supported.")); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + con_log(CL_DLEVEL1, (CE_NOTE, "drsas_add_intrs: intr_pri = 0x%x ", + instance->intr_pri)); + + /* Call ddi_intr_add_handler() */ + for (i = 0; i < actual; i++) { + ret = ddi_intr_add_handler(instance->intr_htable[i], + (ddi_intr_handler_t *)drsas_isr, (caddr_t)instance, + (caddr_t)(uintptr_t)i); + + if (ret != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "drsas_add_intrs:" + "failed %d", ret)); + + for (i = 0; i < actual; i++) { + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + } + + con_log(CL_DLEVEL1, (CE_WARN, " ddi_intr_add_handler done")); + + if ((ret = ddi_intr_get_cap(instance->intr_htable[0], + &instance->intr_cap)) != DDI_SUCCESS) { + con_log(CL_ANN, (CE_WARN, "ddi_intr_get_cap() failed %d", + ret)); + + /* Free already allocated intr */ + for (i = 0; i < actual; i++) { + (void) ddi_intr_remove_handler( + instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + kmem_free(instance->intr_htable, instance->intr_size); + return (DDI_FAILURE); + } + + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + con_log(CL_ANN, (CE_WARN, "Calling ddi_intr_block _enable")); + + (void) ddi_intr_block_enable(instance->intr_htable, + instance->intr_cnt); + } else { + con_log(CL_ANN, (CE_NOTE, " calling ddi_intr_enable")); + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_enable(instance->intr_htable[i]); + con_log(CL_ANN, (CE_NOTE, "ddi intr enable returns " + "%d", i)); + } + } + + return (DDI_SUCCESS); + +} + + +static void +drsas_rem_intrs(struct drsas_instance *instance) +{ + int i; + + con_log(CL_ANN, (CE_NOTE, "drsas_rem_intrs called")); + + /* Disable all interrupts first */ + if (instance->intr_cap & DDI_INTR_FLAG_BLOCK) { + (void) ddi_intr_block_disable(instance->intr_htable, + instance->intr_cnt); + } else { + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_disable(instance->intr_htable[i]); + } + } + + /* Remove all the handlers */ + + for (i = 0; i < instance->intr_cnt; i++) { + (void) ddi_intr_remove_handler(instance->intr_htable[i]); + (void) ddi_intr_free(instance->intr_htable[i]); + } + + kmem_free(instance->intr_htable, instance->intr_size); +} + +static int +drsas_tran_bus_config(dev_info_t *parent, uint_t flags, + ddi_bus_config_op_t op, void *arg, dev_info_t **childp) +{ + struct drsas_instance *instance; + int config; + int rval; + + char *ptr = NULL; + int tgt, lun; + + con_log(CL_ANN1, (CE_NOTE, "Bus config called for op = %x", op)); + + if ((instance = ddi_get_soft_state(drsas_state, + ddi_get_instance(parent))) == NULL) { + return (NDI_FAILURE); + } + + /* Hold nexus during bus_config */ + ndi_devi_enter(parent, &config); + switch (op) { + case BUS_CONFIG_ONE: { + + /* parse wwid/target name out 
of the name given */
+		if ((ptr = strchr((char *)arg, '@')) == NULL) {
+			rval = NDI_FAILURE;
+			break;
+		}
+		ptr++;
+
+		if (drsas_parse_devname(arg, &tgt, &lun) != 0) {
+			rval = NDI_FAILURE;
+			break;
+		}
+
+		if (lun == 0) {
+			rval = drsas_config_ld(instance, tgt, lun, childp);
+		} else {
+			rval = NDI_FAILURE;
+		}
+
+		break;
+	}
+	case BUS_CONFIG_DRIVER:
+	case BUS_CONFIG_ALL: {
+		rval = drsas_config_all_devices(instance);
+		break;
+	}
+	default:
+		rval = NDI_FAILURE;
+		break;
+	}
+
+	if (rval == NDI_SUCCESS) {
+		rval = ndi_busop_bus_config(parent, flags, op, arg, childp, 0);
+	}
+	ndi_devi_exit(parent, config);
+
+	con_log(CL_ANN1, (CE_NOTE, "drsas_tran_bus_config: rval = %x",
+	    rval));
+	return (rval);
+}
+
+static int
+drsas_config_all_devices(struct drsas_instance *instance)
+{
+	int rval, tgt;
+
+	for (tgt = 0; tgt < MRDRV_MAX_LD; tgt++) {
+		(void) drsas_config_ld(instance, tgt, 0, NULL);
+	}
+
+	rval = NDI_SUCCESS;
+	return (rval);
+}
+
+static int
+drsas_parse_devname(char *devnm, int *tgt, int *lun)
+{
+	char devbuf[SCSI_MAXNAMELEN];
+	char *addr;
+	char *p, *tp, *lp;
+	long num;
+
+	/* Parse dev name and address */
+	(void) strcpy(devbuf, devnm);
+	addr = "";
+	for (p = devbuf; *p != '\0'; p++) {
+		if (*p == '@') {
+			addr = p + 1;
+			*p = '\0';
+		} else if (*p == ':') {
+			*p = '\0';
+			break;
+		}
+	}
+
+	/* Parse target and lun */
+	for (p = tp = addr, lp = NULL; *p != '\0'; p++) {
+		if (*p == ',') {
+			lp = p + 1;
+			*p = '\0';
+			break;
+		}
+	}
+	if (tgt && tp) {
+		if (ddi_strtol(tp, NULL, 0x10, &num)) {
+			return (DDI_FAILURE);
+		}
+		*tgt = (int)num;
+	}
+	if (lun && lp) {
+		if (ddi_strtol(lp, NULL, 0x10, &num)) {
+			return (DDI_FAILURE);
+		}
+		*lun = (int)num;
+	}
+	return (DDI_SUCCESS);
+}
+
+static int
+drsas_config_ld(struct drsas_instance *instance, uint16_t tgt,
+    uint8_t lun, dev_info_t **ldip)
+{
+	struct scsi_device *sd;
+	dev_info_t *child;
+	int rval;
+
+	con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: t = %d l = %d",
+	    tgt, lun));
+
+	if ((child = drsas_find_child(instance, tgt, lun)) != NULL) {
+		if (ldip) {
+			*ldip = child;
+		}
+		con_log(CL_ANN1, (CE_NOTE,
+		    "drsas_config_ld: Child = %p found t = %d l = %d",
+		    (void *)child, tgt, lun));
+		return (NDI_SUCCESS);
+	}
+
+	sd = kmem_zalloc(sizeof (struct scsi_device), KM_SLEEP);
+	sd->sd_address.a_hba_tran = instance->tran;
+	sd->sd_address.a_target = (uint16_t)tgt;
+	sd->sd_address.a_lun = (uint8_t)lun;
+
+	if (scsi_hba_probe(sd, NULL) == SCSIPROBE_EXISTS)
+		rval = drsas_config_scsi_device(instance, sd, ldip);
+	else
+		rval = NDI_FAILURE;
+
+	/* sd_unprobe is blank now.
Free buffer manually */ + if (sd->sd_inq) { + kmem_free(sd->sd_inq, SUN_INQSIZE); + sd->sd_inq = (struct scsi_inquiry *)NULL; + } + + kmem_free(sd, sizeof (struct scsi_device)); + con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d", + rval)); + return (rval); +} + +static int +drsas_config_scsi_device(struct drsas_instance *instance, + struct scsi_device *sd, dev_info_t **dipp) +{ + char *nodename = NULL; + char **compatible = NULL; + int ncompatible = 0; + char *childname; + dev_info_t *ldip = NULL; + int tgt = sd->sd_address.a_target; + int lun = sd->sd_address.a_lun; + int dtype = sd->sd_inq->inq_dtype & DTYPE_MASK; + int rval; + + con_log(CL_ANN1, (CE_WARN, "dr_sas: scsi_device t%dL%d", tgt, lun)); + scsi_hba_nodename_compatible_get(sd->sd_inq, NULL, dtype, + NULL, &nodename, &compatible, &ncompatible); + + if (nodename == NULL) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: Found no compatible driver " + "for t%dL%d", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + childname = (dtype == DTYPE_DIRECT) ? "sd" : nodename; + con_log(CL_ANN1, (CE_WARN, + "dr_sas: Childname = %2s nodename = %s", childname, nodename)); + + /* Create a dev node */ + rval = ndi_devi_alloc(instance->dip, childname, DEVI_SID_NODEID, &ldip); + con_log(CL_ANN1, (CE_WARN, + "dr_sas_config_scsi_device: ndi_devi_alloc rval = %x", rval)); + if (rval == NDI_SUCCESS) { + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "target", tgt) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d target", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + if (ndi_prop_update_int(DDI_DEV_T_NONE, ldip, "lun", lun) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d lun", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + if (ndi_prop_update_string_array(DDI_DEV_T_NONE, ldip, + "compatible", compatible, ncompatible) != + DDI_PROP_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to create " + "property for t%dl%d compatible", tgt, lun)); + rval = NDI_FAILURE; + goto finish; + } + + rval = ndi_devi_online(ldip, NDI_ONLINE_ATTACH); + if (rval != NDI_SUCCESS) { + con_log(CL_ANN1, (CE_WARN, "dr_sas: unable to online " + "t%dl%d", tgt, lun)); + ndi_prop_remove_all(ldip); + (void) ndi_devi_free(ldip); + } else { + con_log(CL_ANN1, (CE_WARN, "dr_sas: online Done :" + "0 t%dl%d", tgt, lun)); + } + + } +finish: + if (dipp) { + *dipp = ldip; + } + + con_log(CL_DLEVEL1, (CE_WARN, + "dr_sas: config_scsi_device rval = %d t%dL%d", + rval, tgt, lun)); + scsi_hba_nodename_compatible_free(nodename, compatible); + return (rval); +} + +/*ARGSUSED*/ +static int +drsas_service_evt(struct drsas_instance *instance, int tgt, int lun, int event, + uint64_t wwn) +{ + struct drsas_eventinfo *mrevt = NULL; + + con_log(CL_ANN1, (CE_NOTE, + "drsas_service_evt called for t%dl%d event = %d", + tgt, lun, event)); + + if ((instance->taskq == NULL) || (mrevt = + kmem_zalloc(sizeof (struct drsas_eventinfo), KM_NOSLEEP)) == NULL) { + return (ENOMEM); + } + + mrevt->instance = instance; + mrevt->tgt = tgt; + mrevt->lun = lun; + mrevt->event = event; + + if ((ddi_taskq_dispatch(instance->taskq, + (void (*)(void *))drsas_issue_evt_taskq, mrevt, DDI_NOSLEEP)) != + DDI_SUCCESS) { + con_log(CL_ANN1, (CE_NOTE, + "dr_sas: Event task failed for t%dl%d event = %d", + tgt, lun, event)); + kmem_free(mrevt, sizeof (struct drsas_eventinfo)); + return (DDI_FAILURE); + } + return (DDI_SUCCESS); +} + +static void +drsas_issue_evt_taskq(struct drsas_eventinfo 
*mrevt)
+{
+	struct drsas_instance *instance = mrevt->instance;
+	dev_info_t *dip, *pdip;
+	int circ1 = 0;
+	char *devname;
+
+	con_log(CL_ANN1, (CE_NOTE, "drsas_issue_evt_taskq: called for"
+	    " tgt %d lun %d event %d",
+	    mrevt->tgt, mrevt->lun, mrevt->event));
+
+	if (mrevt->tgt < MRDRV_MAX_LD && mrevt->lun == 0) {
+		dip = instance->dr_ld_list[mrevt->tgt].dip;
+	} else {
+		return;
+	}
+
+	ndi_devi_enter(instance->dip, &circ1);
+	switch (mrevt->event) {
+	case DRSAS_EVT_CONFIG_TGT:
+		if (dip == NULL) {
+			if (mrevt->lun == 0) {
+				(void) drsas_config_ld(instance, mrevt->tgt,
+				    0, NULL);
+			}
+			con_log(CL_ANN1, (CE_NOTE,
+			    "dr_sas: EVT_CONFIG_TGT called:"
+			    " for tgt %d lun %d event %d",
+			    mrevt->tgt, mrevt->lun, mrevt->event));
+		} else {
+			con_log(CL_ANN1, (CE_NOTE,
+			    "dr_sas: EVT_CONFIG_TGT dip != NULL:"
+			    " for tgt %d lun %d event %d",
+			    mrevt->tgt, mrevt->lun, mrevt->event));
+		}
+		break;
+	case DRSAS_EVT_UNCONFIG_TGT:
+		if (dip) {
+			if (i_ddi_devi_attached(dip)) {
+				pdip = ddi_get_parent(dip);
+
+				devname = kmem_zalloc(MAXNAMELEN + 1, KM_SLEEP);
+				(void) ddi_deviname(dip, devname);
+
+				(void) devfs_clean(pdip, devname + 1,
+				    DV_CLEAN_FORCE);
+				kmem_free(devname, MAXNAMELEN + 1);
+			}
+			(void) ndi_devi_offline(dip, NDI_DEVI_REMOVE);
+			con_log(CL_ANN1, (CE_NOTE,
+			    "dr_sas: EVT_UNCONFIG_TGT called:"
+			    " for tgt %d lun %d event %d",
+			    mrevt->tgt, mrevt->lun, mrevt->event));
+		} else {
+			con_log(CL_ANN1, (CE_NOTE,
+			    "dr_sas: EVT_UNCONFIG_TGT dip == NULL:"
+			    " for tgt %d lun %d event %d",
+			    mrevt->tgt, mrevt->lun, mrevt->event));
+		}
+		break;
+	}
+	kmem_free(mrevt, sizeof (struct drsas_eventinfo));
+	ndi_devi_exit(instance->dip, circ1);
+}
+
+static int
+drsas_mode_sense_build(struct scsi_pkt *pkt)
+{
+	union scsi_cdb *cdbp;
+	uint16_t page_code;
+	struct scsa_cmd *acmd;
+	struct buf *bp;
+	struct mode_header *modehdrp;
+
+	cdbp = (void *)pkt->pkt_cdbp;
+	page_code = cdbp->cdb_un.sg.scsi[0];
+	acmd = PKT2CMD(pkt);
+	bp = acmd->cmd_buf;
+	if (!bp || !bp->b_bcount || !acmd->cmd_dmacount) {
+		con_log(CL_ANN1, (CE_WARN, "Failing MODE SENSE command"));
+		/* ADD pkt statistics as Command failed. */
+		return (0);
+	}
+
+	bp_mapin(bp);
+	bzero(bp->b_un.b_addr, bp->b_bcount);
+
+	switch (page_code) {
+	case 0x3: {
+		struct mode_format *page3p = NULL;
+		modehdrp = (struct mode_header *)(bp->b_un.b_addr);
+		modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH;
+
+		page3p = (void *)((caddr_t)modehdrp +
+		    MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH);
+		page3p->mode_page.code = 0x3;
+		page3p->mode_page.length =
+		    (uchar_t)(sizeof (struct mode_format));
+		page3p->data_bytes_sect = 512;
+		page3p->sect_track = 63;
+		break;
+	}
+	case 0x4: {
+		struct mode_geometry *page4p = NULL;
+		modehdrp = (struct mode_header *)(bp->b_un.b_addr);
+		modehdrp->bdesc_length = MODE_BLK_DESC_LENGTH;
+
+		page4p = (void *)((caddr_t)modehdrp +
+		    MODE_HEADER_LENGTH + MODE_BLK_DESC_LENGTH);
+		page4p->mode_page.code = 0x4;
+		page4p->mode_page.length =
+		    (uchar_t)(sizeof (struct mode_geometry));
+		page4p->heads = 255;
+		page4p->rpm = 10000;
+		break;
+	}
+	default:
+		break;
+	}
+	return (0);
+}
diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.conf b/usr/src/uts/common/io/dr_sas/dr_sas.conf
new file mode 100644
index 0000000000..3792f43ca4
--- /dev/null
+++ b/usr/src/uts/common/io/dr_sas/dr_sas.conf
@@ -0,0 +1,15 @@
+#
+# Copyright (c) 2008-2009, LSI Logic Corporation.
+# All rights reserved.
+#
+# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+# + +# +# dr_sas.conf for sol 10 (and later) for all supported architectures +# +# global definitions + +# MSI specific flag. user can uncomment this line and set flag "yes" to enable MSI +#drsas-enable-msi="yes"; diff --git a/usr/src/uts/common/io/dr_sas/dr_sas.h b/usr/src/uts/common/io/dr_sas/dr_sas.h new file mode 100644 index 0000000000..8f78658edf --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas.h @@ -0,0 +1,1766 @@ +/* + * dr_sas.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_H_ +#define _DR_SAS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/scsi/scsi.h> +#include "dr_sas_list.h" + +/* + * MegaRAID SAS2.0 Driver meta data + */ +#define DRSAS_VERSION "LSIv2.0" +#define DRSAS_RELDATE "Jan 9, 2009" + +#define DRSAS_TRUE 1 +#define DRSAS_FALSE 0 + +/* + * MegaRAID SAS2.0 device id conversion definitions. + */ +#define INST2LSIRDCTL(x) ((x) << INST_MINOR_SHIFT) + +/* + * MegaRAID SAS2.0 supported controllers + */ +#define PCI_DEVICE_ID_LSI_2108VDE 0x0078 +#define PCI_DEVICE_ID_LSI_2108V 0x0079 + +/* + * Register Index for 2108 Controllers. + */ +#define REGISTER_SET_IO_2108 (2) + +#define DRSAS_MAX_SGE_CNT 0x50 + +#define DRSAS_IOCTL_DRIVER 0x12341234 +#define DRSAS_IOCTL_FIRMWARE 0x12345678 +#define DRSAS_IOCTL_AEN 0x87654321 + +#define DRSAS_1_SECOND 1000000 + +/* Dynamic Enumeration Flags */ +#define DRSAS_PD_LUN 1 +#define DRSAS_LD_LUN 0 +#define DRSAS_PD_TGT_MAX 255 +#define DRSAS_GET_PD_MAX(s) ((s)->dr_pd_max) +#define WWN_STRLEN 17 + +/* + * ===================================== + * MegaRAID SAS2.0 MFI firmware definitions + * ===================================== + */ +/* + * MFI stands for MegaRAID SAS2.0 FW Interface. This is just a moniker for + * protocol between the software and firmware. 
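The drsas-enable-msi flag defined in dr_sas.conf above is an ordinary driver property. A hedged sketch of how attach(9E) code might consume it through the stock DDI property routines (whether dr_sas reads it exactly this way is an assumption; dip is the devinfo node passed to attach):

	char *data;
	int enable_msi = 0;		/* illustrative local */

	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
	    "drsas-enable-msi", &data) == DDI_PROP_SUCCESS) {
		if (strcmp(data, "yes") == 0)
			enable_msi = 1;	/* prefer MSI over fixed intrs */
		ddi_prop_free(data);
	}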
Commands are issued using + * "message frames" + */ + +/* + * FW posts its state in upper 4 bits of outbound_msg_0 register + */ +#define MFI_STATE_SHIFT 28 +#define MFI_STATE_MASK ((uint32_t)0xF<<MFI_STATE_SHIFT) +#define MFI_STATE_UNDEFINED ((uint32_t)0x0<<MFI_STATE_SHIFT) +#define MFI_STATE_BB_INIT ((uint32_t)0x1<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT ((uint32_t)0x4<<MFI_STATE_SHIFT) +#define MFI_STATE_WAIT_HANDSHAKE ((uint32_t)0x6<<MFI_STATE_SHIFT) +#define MFI_STATE_FW_INIT_2 ((uint32_t)0x7<<MFI_STATE_SHIFT) +#define MFI_STATE_DEVICE_SCAN ((uint32_t)0x8<<MFI_STATE_SHIFT) +#define MFI_STATE_BOOT_MESSAGE_PENDING ((uint32_t)0x9<<MFI_STATE_SHIFT) +#define MFI_STATE_FLUSH_CACHE ((uint32_t)0xA<<MFI_STATE_SHIFT) +#define MFI_STATE_READY ((uint32_t)0xB<<MFI_STATE_SHIFT) +#define MFI_STATE_OPERATIONAL ((uint32_t)0xC<<MFI_STATE_SHIFT) +#define MFI_STATE_FAULT ((uint32_t)0xF<<MFI_STATE_SHIFT) + +#define MRMFI_FRAME_SIZE 64 + +/* + * During FW init, clear pending cmds & reset state using inbound_msg_0 + * + * ABORT : Abort all pending cmds + * READY : Move from OPERATIONAL to READY state; discard queue info + * MFIMODE : Discard (possible) low MFA posted in 64-bit mode (??) + * CLR_HANDSHAKE: FW is waiting for HANDSHAKE from BIOS or Driver + */ +#define MFI_INIT_ABORT 0x00000001 +#define MFI_INIT_READY 0x00000002 +#define MFI_INIT_MFIMODE 0x00000004 +#define MFI_INIT_CLEAR_HANDSHAKE 0x00000008 +#define MFI_INIT_HOTPLUG 0x00000010 +#define MFI_STOP_ADP 0x00000020 +#define MFI_RESET_FLAGS MFI_INIT_READY|MFI_INIT_MFIMODE|MFI_INIT_ABORT + +/* + * MFI frame flags + */ +#define MFI_FRAME_POST_IN_REPLY_QUEUE 0x0000 +#define MFI_FRAME_DONT_POST_IN_REPLY_QUEUE 0x0001 +#define MFI_FRAME_SGL32 0x0000 +#define MFI_FRAME_SGL64 0x0002 +#define MFI_FRAME_SENSE32 0x0000 +#define MFI_FRAME_SENSE64 0x0004 +#define MFI_FRAME_DIR_NONE 0x0000 +#define MFI_FRAME_DIR_WRITE 0x0008 +#define MFI_FRAME_DIR_READ 0x0010 +#define MFI_FRAME_DIR_BOTH 0x0018 + +/* + * Definition for cmd_status + */ +#define MFI_CMD_STATUS_POLL_MODE 0xFF +#define MFI_CMD_STATUS_SYNC_MODE 0xFF + +/* + * MFI command opcodes + */ +#define MFI_CMD_OP_INIT 0x00 +#define MFI_CMD_OP_LD_READ 0x01 +#define MFI_CMD_OP_LD_WRITE 0x02 +#define MFI_CMD_OP_LD_SCSI 0x03 +#define MFI_CMD_OP_PD_SCSI 0x04 +#define MFI_CMD_OP_DCMD 0x05 +#define MFI_CMD_OP_ABORT 0x06 +#define MFI_CMD_OP_SMP 0x07 +#define MFI_CMD_OP_STP 0x08 + +#define DR_DCMD_CTRL_GET_INFO 0x01010000 + +#define DR_DCMD_CTRL_CACHE_FLUSH 0x01101000 +#define DR_FLUSH_CTRL_CACHE 0x01 +#define DR_FLUSH_DISK_CACHE 0x02 + +#define DR_DCMD_CTRL_SHUTDOWN 0x01050000 +#define DRSAS_ENABLE_DRIVE_SPINDOWN 0x01 + +#define DR_DCMD_CTRL_EVENT_GET_INFO 0x01040100 +#define DR_DCMD_CTRL_EVENT_GET 0x01040300 +#define DR_DCMD_CTRL_EVENT_WAIT 0x01040500 +#define DR_DCMD_LD_GET_PROPERTIES 0x03030000 +#define DR_DCMD_PD_GET_INFO 0x02020000 + +/* + * Solaris Specific MAX values + */ +#define MAX_SGL 24 +/* + * MFI command completion codes + */ +enum MFI_STAT { + MFI_STAT_OK = 0x00, + MFI_STAT_INVALID_CMD = 0x01, + MFI_STAT_INVALID_DCMD = 0x02, + MFI_STAT_INVALID_PARAMETER = 0x03, + MFI_STAT_INVALID_SEQUENCE_NUMBER = 0x04, + MFI_STAT_ABORT_NOT_POSSIBLE = 0x05, + MFI_STAT_APP_HOST_CODE_NOT_FOUND = 0x06, + MFI_STAT_APP_IN_USE = 0x07, + MFI_STAT_APP_NOT_INITIALIZED = 0x08, + MFI_STAT_ARRAY_INDEX_INVALID = 0x09, + MFI_STAT_ARRAY_ROW_NOT_EMPTY = 0x0a, + MFI_STAT_CONFIG_RESOURCE_CONFLICT = 0x0b, + MFI_STAT_DEVICE_NOT_FOUND = 0x0c, + MFI_STAT_DRIVE_TOO_SMALL = 0x0d, + MFI_STAT_FLASH_ALLOC_FAIL = 0x0e, + MFI_STAT_FLASH_BUSY = 
0x0f, + MFI_STAT_FLASH_ERROR = 0x10, + MFI_STAT_FLASH_IMAGE_BAD = 0x11, + MFI_STAT_FLASH_IMAGE_INCOMPLETE = 0x12, + MFI_STAT_FLASH_NOT_OPEN = 0x13, + MFI_STAT_FLASH_NOT_STARTED = 0x14, + MFI_STAT_FLUSH_FAILED = 0x15, + MFI_STAT_HOST_CODE_NOT_FOUNT = 0x16, + MFI_STAT_LD_CC_IN_PROGRESS = 0x17, + MFI_STAT_LD_INIT_IN_PROGRESS = 0x18, + MFI_STAT_LD_LBA_OUT_OF_RANGE = 0x19, + MFI_STAT_LD_MAX_CONFIGURED = 0x1a, + MFI_STAT_LD_NOT_OPTIMAL = 0x1b, + MFI_STAT_LD_RBLD_IN_PROGRESS = 0x1c, + MFI_STAT_LD_RECON_IN_PROGRESS = 0x1d, + MFI_STAT_LD_WRONG_RAID_LEVEL = 0x1e, + MFI_STAT_MAX_SPARES_EXCEEDED = 0x1f, + MFI_STAT_MEMORY_NOT_AVAILABLE = 0x20, + MFI_STAT_MFC_HW_ERROR = 0x21, + MFI_STAT_NO_HW_PRESENT = 0x22, + MFI_STAT_NOT_FOUND = 0x23, + MFI_STAT_NOT_IN_ENCL = 0x24, + MFI_STAT_PD_CLEAR_IN_PROGRESS = 0x25, + MFI_STAT_PD_TYPE_WRONG = 0x26, + MFI_STAT_PR_DISABLED = 0x27, + MFI_STAT_ROW_INDEX_INVALID = 0x28, + MFI_STAT_SAS_CONFIG_INVALID_ACTION = 0x29, + MFI_STAT_SAS_CONFIG_INVALID_DATA = 0x2a, + MFI_STAT_SAS_CONFIG_INVALID_PAGE = 0x2b, + MFI_STAT_SAS_CONFIG_INVALID_TYPE = 0x2c, + MFI_STAT_SCSI_DONE_WITH_ERROR = 0x2d, + MFI_STAT_SCSI_IO_FAILED = 0x2e, + MFI_STAT_SCSI_RESERVATION_CONFLICT = 0x2f, + MFI_STAT_SHUTDOWN_FAILED = 0x30, + MFI_STAT_TIME_NOT_SET = 0x31, + MFI_STAT_WRONG_STATE = 0x32, + MFI_STAT_LD_OFFLINE = 0x33, + /* UNUSED: 0x34 to 0xfe */ + MFI_STAT_INVALID_STATUS = 0xFF +}; + +enum DR_EVT_CLASS { + DR_EVT_CLASS_DEBUG = -2, + DR_EVT_CLASS_PROGRESS = -1, + DR_EVT_CLASS_INFO = 0, + DR_EVT_CLASS_WARNING = 1, + DR_EVT_CLASS_CRITICAL = 2, + DR_EVT_CLASS_FATAL = 3, + DR_EVT_CLASS_DEAD = 4 +}; + +enum DR_EVT_LOCALE { + DR_EVT_LOCALE_LD = 0x0001, + DR_EVT_LOCALE_PD = 0x0002, + DR_EVT_LOCALE_ENCL = 0x0004, + DR_EVT_LOCALE_BBU = 0x0008, + DR_EVT_LOCALE_SAS = 0x0010, + DR_EVT_LOCALE_CTRL = 0x0020, + DR_EVT_LOCALE_CONFIG = 0x0040, + DR_EVT_LOCALE_CLUSTER = 0x0080, + DR_EVT_LOCALE_ALL = 0xffff +}; + +#define DR_EVT_CFG_CLEARED 0x0004 +#define DR_EVT_LD_CREATED 0x008a +#define DR_EVT_LD_DELETED 0x008b +#define DR_EVT_PD_REMOVED_EXT 0x00f8 +#define DR_EVT_PD_INSERTED_EXT 0x00f7 + +enum LD_STATE { + LD_OFFLINE = 0, + LD_PARTIALLY_DEGRADED = 1, + LD_DEGRADED = 2, + LD_OPTIMAL = 3, + LD_INVALID = 0xFF +}; + +enum DRSAS_EVT { + DRSAS_EVT_CONFIG_TGT = 0, + DRSAS_EVT_UNCONFIG_TGT = 1, + DRSAS_EVT_UNCONFIG_SMP = 2 +}; + +#define DMA_OBJ_ALLOCATED 1 +#define DMA_OBJ_REALLOCATED 2 +#define DMA_OBJ_FREED 3 + +/* + * dma_obj_t - Our DMA object + * @param buffer : kernel virtual address + * @param size : size of the data to be allocated + * @param acc_handle : access handle + * @param dma_handle : dma handle + * @param dma_cookie : scatter-gather list + * @param dma_attr : dma attributes for this buffer + * Our DMA object. The caller must initialize the size and dma attributes + * (dma_attr) fields before allocating the resources. 
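 *
 * An illustrative allocation (not from the original source; the access
 * flag handed to drsas_alloc_dma_obj(), which is declared later in this
 * header, is an assumption):
 *
 *	dma_obj_t obj;
 *	bzero(&obj, sizeof (obj));
 *	obj.size = len;
 *	obj.dma_attr = tmpl_dma_attr;	(hypothetical attribute template)
 *	if (drsas_alloc_dma_obj(instance, &obj,
 *	    (uchar_t)DDI_STRUCTURE_LE_ACC) != DDI_SUCCESS)
 *		return (DDI_FAILURE);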
+ */ +typedef struct { + caddr_t buffer; + uint32_t size; + ddi_acc_handle_t acc_handle; + ddi_dma_handle_t dma_handle; + ddi_dma_cookie_t dma_cookie[DRSAS_MAX_SGE_CNT]; + ddi_dma_attr_t dma_attr; + uint8_t status; + uint8_t reserved[3]; +} dma_obj_t; + +struct drsas_eventinfo { + struct drsas_instance *instance; + int tgt; + int lun; + int event; +}; + +struct drsas_ld { + dev_info_t *dip; + uint8_t lun_type; + uint8_t reserved[3]; +}; + +struct drsas_pd { + dev_info_t *dip; + uint8_t lun_type; + uint8_t dev_id; + uint8_t flags; + uint8_t reserved; +}; + +struct drsas_pd_info { + uint16_t deviceId; + uint16_t seqNum; + uint8_t inquiryData[96]; + uint8_t vpdPage83[64]; + uint8_t notSupported; + uint8_t scsiDevType; + uint8_t a; + uint8_t device_speed; + uint32_t mediaerrcnt; + uint32_t other; + uint32_t pred; + uint32_t lastpred; + uint16_t fwState; + uint8_t disabled; + uint8_t linkspwwd; + uint32_t ddfType; + struct { + uint8_t count; + uint8_t isPathBroken; + uint8_t connectorIndex[2]; + uint8_t reserved[4]; + uint64_t sasAddr[2]; + uint8_t reserved2[16]; + } pathInfo; +}; + +typedef struct drsas_instance { + uint32_t *producer; + uint32_t *consumer; + + uint32_t *reply_queue; + dma_obj_t mfi_internal_dma_obj; + + uint8_t init_id; + uint8_t reserved[3]; + + uint16_t max_num_sge; + uint16_t max_fw_cmds; + uint32_t max_sectors_per_req; + + struct drsas_cmd **cmd_list; + + mlist_t cmd_pool_list; + kmutex_t cmd_pool_mtx; + + mlist_t cmd_pend_list; + kmutex_t cmd_pend_mtx; + + dma_obj_t mfi_evt_detail_obj; + struct drsas_cmd *aen_cmd; + + uint32_t aen_seq_num; + uint32_t aen_class_locale_word; + + scsi_hba_tran_t *tran; + + kcondvar_t int_cmd_cv; + kmutex_t int_cmd_mtx; + + kcondvar_t aen_cmd_cv; + kmutex_t aen_cmd_mtx; + + kcondvar_t abort_cmd_cv; + kmutex_t abort_cmd_mtx; + + dev_info_t *dip; + ddi_acc_handle_t pci_handle; + + timeout_id_t timeout_id; + uint32_t unique_id; + uint16_t fw_outstanding; + caddr_t regmap; + ddi_acc_handle_t regmap_handle; + uint8_t isr_level; + ddi_iblock_cookie_t iblock_cookie; + ddi_iblock_cookie_t soft_iblock_cookie; + ddi_softintr_t soft_intr_id; + uint8_t softint_running; + kmutex_t completed_pool_mtx; + mlist_t completed_pool_list; + + caddr_t internal_buf; + uint32_t internal_buf_dmac_add; + uint32_t internal_buf_size; + + uint16_t vendor_id; + uint16_t device_id; + uint16_t subsysvid; + uint16_t subsysid; + int instance; + int baseaddress; + char iocnode[16]; + + int fm_capabilities; + + struct drsas_func_ptr *func_ptr; + /* MSI interrupts specific */ + ddi_intr_handle_t *intr_htable; + int intr_type; + int intr_cnt; + size_t intr_size; + uint_t intr_pri; + int intr_cap; + + ddi_taskq_t *taskq; + struct drsas_ld *dr_ld_list; +} drsas_t; + +struct drsas_func_ptr { + int (*read_fw_status_reg)(struct drsas_instance *); + void (*issue_cmd)(struct drsas_cmd *, struct drsas_instance *); + int (*issue_cmd_in_sync_mode)(struct drsas_instance *, + struct drsas_cmd *); + int (*issue_cmd_in_poll_mode)(struct drsas_instance *, + struct drsas_cmd *); + void (*enable_intr)(struct drsas_instance *); + void (*disable_intr)(struct drsas_instance *); + int (*intr_ack)(struct drsas_instance *); +}; + +/* + * ### Helper routines ### + */ + +/* + * con_log() - console log routine + * @param level : indicates the severity of the message. + * @fparam mt : format string + * + * con_log displays the error messages on the console based on the current + * debug level. Also it attaches the appropriate kernel severity level with + * the message. 
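 *
 * A representative call from the driver body (see drsas_config_ld()
 * earlier in this changeset) is
 *
 *	con_log(CL_ANN1, (CE_NOTE, "drsas_config_ld: return rval = %d",
 *	    rval));
 *
 * where the inner parentheses wrap the complete cmn_err(9F) argument
 * list that the macro pastes in.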
+ * + * + * console messages debug levels + */ +#define CL_NONE 0 /* No debug information */ +#define CL_ANN 1 /* print unconditionally, announcements */ +#define CL_ANN1 2 /* No o/p */ +#define CL_DLEVEL1 3 /* debug level 1, informative */ +#define CL_DLEVEL2 4 /* debug level 2, verbose */ +#define CL_DLEVEL3 5 /* debug level 3, very verbose */ + +#ifdef __SUNPRO_C +#define __func__ "" +#endif + +#define con_log(level, fmt) { if (debug_level_g >= level) cmn_err fmt; } + +/* + * ### SCSA definitions ### + */ +#define PKT2TGT(pkt) ((pkt)->pkt_address.a_target) +#define PKT2LUN(pkt) ((pkt)->pkt_address.a_lun) +#define PKT2TRAN(pkt) ((pkt)->pkt_address.a_hba_tran) +#define ADDR2TRAN(ap) ((ap)->a_hba_tran) + +#define TRAN2MR(tran) ((struct drsas_instance *)(tran)->tran_hba_private) +#define ADDR2MR(ap) (TRAN2MR(ADDR2TRAN(ap))) + +#define PKT2CMD(pkt) ((struct scsa_cmd *)(pkt)->pkt_ha_private) +#define CMD2PKT(sp) ((sp)->cmd_pkt) +#define PKT2REQ(pkt) (&(PKT2CMD(pkt)->request)) + +#define CMD2ADDR(cmd) (&CMD2PKT(cmd)->pkt_address) +#define CMD2TRAN(cmd) (CMD2PKT(cmd)->pkt_address.a_hba_tran) +#define CMD2MR(cmd) (TRAN2MR(CMD2TRAN(cmd))) + +#define CFLAG_DMAVALID 0x0001 /* requires a dma operation */ +#define CFLAG_DMASEND 0x0002 /* Transfer from the device */ +#define CFLAG_CONSISTENT 0x0040 /* consistent data transfer */ + +/* + * ### Data structures for ioctl interface and internal commands ### + */ + +/* + * Data direction flags + */ +#define UIOC_RD 0x00001 +#define UIOC_WR 0x00002 + +#define SCP2HOST(scp) (scp)->device->host /* to host */ +#define SCP2HOSTDATA(scp) SCP2HOST(scp)->hostdata /* to soft state */ +#define SCP2CHANNEL(scp) (scp)->device->channel /* to channel */ +#define SCP2TARGET(scp) (scp)->device->id /* to target */ +#define SCP2LUN(scp) (scp)->device->lun /* to LUN */ + +#define SCSIHOST2ADAP(host) (((caddr_t *)(host->hostdata))[0]) +#define SCP2ADAPTER(scp) \ + ((struct drsas_instance *)SCSIHOST2ADAP(SCP2HOST(scp))) + +#define MRDRV_IS_LOGICAL_SCSA(instance, acmd) \ + (acmd->device_id < MRDRV_MAX_LD) ? 1 : 0 +#define MRDRV_IS_LOGICAL(ap) \ + ((ap->a_target < MRDRV_MAX_LD) && (ap->a_lun == 0)) ?
1 : 0 +#define MAP_DEVICE_ID(instance, ap) \ + (ap->a_target) + +#define HIGH_LEVEL_INTR 1 +#define NORMAL_LEVEL_INTR 0 + +/* + * scsa_cmd - Per-command mr private data + * @param cmd_dmahandle : dma handle + * @param cmd_dmacookies : current dma cookies + * @param cmd_pkt : scsi_pkt reference + * @param cmd_dmacount : dma count + * @param cmd_cookie : next cookie + * @param cmd_ncookies : cookies per window + * @param cmd_cookiecnt : cookies per sub-win + * @param cmd_nwin : number of dma windows + * @param cmd_curwin : current dma window + * @param cmd_dma_offset : current window offset + * @param cmd_dma_len : current window length + * @param cmd_flags : private flags + * @param cmd_cdblen : length of cdb + * @param cmd_scblen : length of scb + * @param cmd_buf : command buffer + * @param channel : channel for scsi sub-system + * @param target : target for scsi sub-system + * @param lun : LUN for scsi sub-system + * + * - Allocated at same time as scsi_pkt by scsi_hba_pkt_alloc(9E) + * - Pointed to by pkt_ha_private field in scsi_pkt + */ +struct scsa_cmd { + ddi_dma_handle_t cmd_dmahandle; + ddi_dma_cookie_t cmd_dmacookies[DRSAS_MAX_SGE_CNT]; + struct scsi_pkt *cmd_pkt; + ulong_t cmd_dmacount; + uint_t cmd_cookie; + uint_t cmd_ncookies; + uint_t cmd_cookiecnt; + uint_t cmd_nwin; + uint_t cmd_curwin; + off_t cmd_dma_offset; + ulong_t cmd_dma_len; + ulong_t cmd_flags; + uint_t cmd_cdblen; + uint_t cmd_scblen; + struct buf *cmd_buf; + ushort_t device_id; + uchar_t islogical; + uchar_t lun; + struct drsas_device *drsas_dev; +}; + + +struct drsas_cmd { + union drsas_frame *frame; + uint32_t frame_phys_addr; + uint8_t *sense; + uint32_t sense_phys_addr; + dma_obj_t frame_dma_obj; + uint8_t frame_dma_obj_status; + + uint32_t index; + uint8_t sync_cmd; + uint8_t cmd_status; + uint16_t abort_aen; + mlist_t list; + uint32_t frame_count; + struct scsa_cmd *cmd; + struct scsi_pkt *pkt; +}; + +#define MAX_MGMT_ADAPTERS 1024 +#define IOC_SIGNATURE "MR-SAS" + +#define IOC_CMD_FIRMWARE 0x0 +#define DRSAS_DRIVER_IOCTL_COMMON 0xF0010000 +#define DRSAS_DRIVER_IOCTL_DRIVER_VERSION 0xF0010100 +#define DRSAS_DRIVER_IOCTL_PCI_INFORMATION 0xF0010200 +#define DRSAS_DRIVER_IOCTL_MRRAID_STATISTICS 0xF0010300 + + +#define DRSAS_MAX_SENSE_LENGTH 32 + +struct drsas_mgmt_info { + + uint16_t count; + struct drsas_instance *instance[MAX_MGMT_ADAPTERS]; + uint16_t map[MAX_MGMT_ADAPTERS]; + int max_index; +}; + +#pragma pack(1) + +/* + * SAS controller properties + */ +struct drsas_ctrl_prop { + uint16_t seq_num; + uint16_t pred_fail_poll_interval; + uint16_t intr_throttle_count; + uint16_t intr_throttle_timeouts; + + uint8_t rebuild_rate; + uint8_t patrol_read_rate; + uint8_t bgi_rate; + uint8_t cc_rate; + uint8_t recon_rate; + + uint8_t cache_flush_interval; + + uint8_t spinup_drv_count; + uint8_t spinup_delay; + + uint8_t cluster_enable; + uint8_t coercion_mode; + uint8_t disk_write_cache_disable; + uint8_t alarm_enable; + + uint8_t reserved[44]; +}; + +/* + * SAS controller information + */ +struct drsas_ctrl_info { + /* PCI device information */ + struct { + uint16_t vendor_id; + uint16_t device_id; + uint16_t sub_vendor_id; + uint16_t sub_device_id; + uint8_t reserved[24]; + } pci; + + /* Host interface information */ + struct { + uint8_t PCIX : 1; + uint8_t PCIE : 1; + uint8_t iSCSI : 1; + uint8_t SAS_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } host_interface; + + /* Device (backend) interface information */ + struct { + uint8_t SPI : 1; + uint8_t 
SAS_3G : 1; + uint8_t SATA_1_5G : 1; + uint8_t SATA_3G : 1; + uint8_t reserved_0 : 4; + uint8_t reserved_1[6]; + uint8_t port_count; + uint64_t port_addr[8]; + } device_interface; + + /* List of components residing in flash. All strings are null terminated */ + uint32_t image_check_word; + uint32_t image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char built_time[16]; + } image_component[8]; + + /* + * List of flash components that have been flashed on the card, but + * are not in use, pending reset of the adapter. This list will be + * empty if a flash operation has not occurred. All strings are null + * terminated + */ + uint32_t pending_image_component_count; + + struct { + char name[8]; + char version[32]; + char build_date[16]; + char build_time[16]; + } pending_image_component[8]; + + uint8_t max_arms; + uint8_t max_spans; + uint8_t max_arrays; + uint8_t max_lds; + + char product_name[80]; + char serial_no[32]; + + /* + * Other physical/controller/operation information. Indicates the + * presence of the hardware + */ + struct { + uint32_t bbu : 1; + uint32_t alarm : 1; + uint32_t nvram : 1; + uint32_t uart : 1; + uint32_t reserved : 28; + } hw_present; + + uint32_t current_fw_time; + + /* Maximum data transfer sizes */ + uint16_t max_concurrent_cmds; + uint16_t max_sge_count; + uint32_t max_request_size; + + /* Logical and physical device counts */ + uint16_t ld_present_count; + uint16_t ld_degraded_count; + uint16_t ld_offline_count; + + uint16_t pd_present_count; + uint16_t pd_disk_present_count; + uint16_t pd_disk_pred_failure_count; + uint16_t pd_disk_failed_count; + + /* Memory size information */ + uint16_t nvram_size; + uint16_t memory_size; + uint16_t flash_size; + + /* Error counters */ + uint16_t mem_correctable_error_count; + uint16_t mem_uncorrectable_error_count; + + /* Cluster information */ + uint8_t cluster_permitted; + uint8_t cluster_active; + uint8_t reserved_1[2]; + + /* Controller capabilities structures */ + struct { + uint32_t raid_level_0 : 1; + uint32_t raid_level_1 : 1; + uint32_t raid_level_5 : 1; + uint32_t raid_level_1E : 1; + uint32_t reserved : 28; + } raid_levels; + + struct { + uint32_t rbld_rate : 1; + uint32_t cc_rate : 1; + uint32_t bgi_rate : 1; + uint32_t recon_rate : 1; + uint32_t patrol_rate : 1; + uint32_t alarm_control : 1; + uint32_t cluster_supported : 1; + uint32_t bbu : 1; + uint32_t spanning_allowed : 1; + uint32_t dedicated_hotspares : 1; + uint32_t revertible_hotspares : 1; + uint32_t foreign_config_import : 1; + uint32_t self_diagnostic : 1; + uint32_t reserved : 19; + } adapter_operations; + + struct { + uint32_t read_policy : 1; + uint32_t write_policy : 1; + uint32_t io_policy : 1; + uint32_t access_policy : 1; + uint32_t reserved : 28; + } ld_operations; + + struct { + uint8_t min; + uint8_t max; + uint8_t reserved[2]; + } stripe_size_operations; + + struct { + uint32_t force_online : 1; + uint32_t force_offline : 1; + uint32_t force_rebuild : 1; + uint32_t reserved : 29; + } pd_operations; + + struct { + uint32_t ctrl_supports_sas : 1; + uint32_t ctrl_supports_sata : 1; + uint32_t allow_mix_in_encl : 1; + uint32_t allow_mix_in_ld : 1; + uint32_t allow_sata_in_cluster : 1; + uint32_t reserved : 27; + } pd_mix_support; + + /* Include the controller properties (changeable items) */ + uint8_t reserved_2[12]; + struct drsas_ctrl_prop properties; + + uint8_t pad[0x800 - 0x640]; +}; + +/* + * ================================== + * MegaRAID SAS2.0 driver definitions + *
================================== + */ +#define MRDRV_MAX_NUM_CMD 1024 + +#define MRDRV_MAX_PD_CHANNELS 2 +#define MRDRV_MAX_LD_CHANNELS 2 +#define MRDRV_MAX_CHANNELS (MRDRV_MAX_PD_CHANNELS + \ + MRDRV_MAX_LD_CHANNELS) +#define MRDRV_MAX_DEV_PER_CHANNEL 128 +#define MRDRV_DEFAULT_INIT_ID -1 +#define MRDRV_MAX_CMD_PER_LUN 1000 +#define MRDRV_MAX_LUN 1 +#define MRDRV_MAX_LD 64 + +#define MRDRV_RESET_WAIT_TIME 300 +#define MRDRV_RESET_NOTICE_INTERVAL 5 + +#define DRSAS_IOCTL_CMD 0 + +/* + * FW can accept both 32 and 64 bit SGLs. We want to allocate 32/64 bit + * SGLs based on the size of dma_addr_t + */ +#define IS_DMA64 (sizeof (dma_addr_t) == 8) + +#define IB_MSG_0_OFF 0x10 /* XScale */ +#define OB_MSG_0_OFF 0x18 /* XScale */ +#define IB_DOORBELL_OFF 0x20 /* XScale & ROC */ +#define OB_INTR_STATUS_OFF 0x30 /* XScale & ROC */ +#define OB_INTR_MASK_OFF 0x34 /* XScale & ROC */ +#define IB_QPORT_OFF 0x40 /* XScale & ROC */ +#define OB_DOORBELL_CLEAR_OFF 0xA0 /* ROC */ +#define OB_SCRATCH_PAD_0_OFF 0xB0 /* ROC */ +#define OB_INTR_MASK 0xFFFFFFFF +#define OB_DOORBELL_CLEAR_MASK 0xFFFFFFFF + +/* + * All MFI register set macros accept drsas_register_set* + */ +#define WR_IB_MSG_0(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_MSG_0_OFF), (v)) + +#define RD_OB_MSG_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_MSG_0_OFF)) + +#define WR_IB_DOORBELL(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF), (v)) + +#define RD_IB_DOORBELL(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_DOORBELL_OFF)) + +#define WR_OB_INTR_STATUS(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF), (v)) + +#define RD_OB_INTR_STATUS(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_STATUS_OFF)) + +#define WR_OB_INTR_MASK(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), (v)) + +#define RD_OB_INTR_MASK(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF)) + +#define WR_IB_QPORT(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + IB_QPORT_OFF), (v)) + +#define WR_OB_DOORBELL_CLEAR(v, instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_DOORBELL_CLEAR_OFF), \ + (v)) + +#define RD_OB_SCRATCH_PAD_0(instance) ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_SCRATCH_PAD_0_OFF)) + +/* + * When FW is in MFI_STATE_READY or MFI_STATE_OPERATIONAL, the state data + * of Outbound Msg Reg 0 indicates max concurrent cmds supported, max SGEs + * supported per cmd and if 64-bit MFAs (M64) is enabled or disabled. + */ +#define MFI_OB_INTR_STATUS_MASK 0x00000002 + +/* + * This MFI_REPLY_2108_MESSAGE_INTR flag is used also + * in enable_intr_ppc also. Hence bit 2, i.e. 0x4 has + * been set in this flag along with bit 1. 
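 *
 * Illustratively (this check is not part of the original header), an
 * ISR built on these definitions would claim an interrupt only when one
 * of the bits in the mask is set in the outbound status register:
 *
 *	if (!(RD_OB_INTR_STATUS(instance) &
 *	    MFI_REPLY_2108_MESSAGE_INTR_MASK))
 *		return (DDI_INTR_UNCLAIMED);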
+ */ +#define MFI_REPLY_2108_MESSAGE_INTR 0x00000001 +#define MFI_REPLY_2108_MESSAGE_INTR_MASK 0x00000005 + +#define MFI_POLL_TIMEOUT_SECS 60 + +#define MFI_ENABLE_INTR(instance) ddi_put32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF), 1) +#define MFI_DISABLE_INTR(instance) \ +{ \ + uint32_t disable = 1; \ + uint32_t mask = ddi_get32((instance)->regmap_handle, \ + (uint32_t *)((uintptr_t)(instance)->regmap + OB_INTR_MASK_OFF));\ + mask &= ~disable; \ + ddi_put32((instance)->regmap_handle, (uint32_t *) \ + (uintptr_t)((instance)->regmap + OB_INTR_MASK_OFF), mask); \ +} + +/* By default, the firmware programs for 8 Kbytes of memory */ +#define DEFAULT_MFI_MEM_SZ 8192 +#define MINIMUM_MFI_MEM_SZ 4096 + +/* DCMD Message Frame MAILBOX0-11 */ +#define DCMD_MBOX_SZ 12 + + +struct drsas_register_set { + uint32_t reserved_0[4]; + + uint32_t inbound_msg_0; + uint32_t inbound_msg_1; + uint32_t outbound_msg_0; + uint32_t outbound_msg_1; + + uint32_t inbound_doorbell; + uint32_t inbound_intr_status; + uint32_t inbound_intr_mask; + + uint32_t outbound_doorbell; + uint32_t outbound_intr_status; + uint32_t outbound_intr_mask; + + uint32_t reserved_1[2]; + + uint32_t inbound_queue_port; + uint32_t outbound_queue_port; + + uint32_t reserved_2[22]; + + uint32_t outbound_doorbell_clear; + + uint32_t reserved_3[3]; + + uint32_t outbound_scratch_pad; + + uint32_t reserved_4[3]; + + uint32_t inbound_low_queue_port; + + uint32_t inbound_high_queue_port; + + uint32_t reserved_5; + uint32_t index_registers[820]; +}; + +struct drsas_sge32 { + uint32_t phys_addr; + uint32_t length; +}; + +struct drsas_sge64 { + uint64_t phys_addr; + uint32_t length; +}; + +union drsas_sgl { + struct drsas_sge32 sge32[1]; + struct drsas_sge64 sge64[1]; +}; + +struct drsas_header { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xferlen; +}; + +union drsas_sgl_frame { + struct drsas_sge32 sge32[8]; + struct drsas_sge64 sge64[5]; +}; + +struct drsas_init_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t data_xfer_len; + + uint32_t queue_info_new_phys_addr_lo; + uint32_t queue_info_new_phys_addr_hi; + uint32_t queue_info_old_phys_addr_lo; + uint32_t queue_info_old_phys_addr_hi; + + uint32_t reserved_4[6]; +}; + +struct drsas_init_queue_info { + uint32_t init_flags; + uint32_t reply_queue_entries; + + uint32_t reply_queue_start_phys_addr_lo; + uint32_t reply_queue_start_phys_addr_hi; + uint32_t producer_index_phys_addr_lo; + uint32_t producer_index_phys_addr_hi; + uint32_t consumer_index_phys_addr_lo; + uint32_t consumer_index_phys_addr_hi; +}; + +struct drsas_io_frame { + uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t access_byte; + uint8_t reserved_0; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t lba_count; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint32_t start_lba_lo; + uint32_t start_lba_hi; + + union drsas_sgl sgl; +}; + +struct drsas_pthru_frame { + 
uint8_t cmd; + uint8_t sense_len; + uint8_t cmd_status; + uint8_t scsi_status; + + uint8_t target_id; + uint8_t lun; + uint8_t cdb_len; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + uint32_t data_xfer_len; + + uint32_t sense_buf_phys_addr_lo; + uint32_t sense_buf_phys_addr_hi; + + uint8_t cdb[16]; + union drsas_sgl sgl; +}; + +struct drsas_dcmd_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + uint8_t reserved_1[4]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + uint32_t opcode; + + union { + uint8_t b[DCMD_MBOX_SZ]; + uint16_t s[6]; + uint32_t w[3]; + } mbox; + + union drsas_sgl sgl; +}; + +struct drsas_abort_frame { + uint8_t cmd; + uint8_t reserved_0; + uint8_t cmd_status; + + uint8_t reserved_1; + uint32_t reserved_2; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t reserved_3; + uint32_t reserved_4; + + uint32_t abort_context; + uint32_t pad_1; + + uint32_t abort_mfi_phys_addr_lo; + uint32_t abort_mfi_phys_addr_hi; + + uint32_t reserved_5[6]; +}; + +struct drsas_smp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t reserved_2[3]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint64_t sas_addr; + + union drsas_sgl sgl[2]; +}; + +struct drsas_stp_frame { + uint8_t cmd; + uint8_t reserved_1; + uint8_t cmd_status; + uint8_t connection_status; + + uint8_t target_id; + uint8_t reserved_2[2]; + uint8_t sge_count; + + uint32_t context; + uint8_t req_id; + uint8_t msgvector; + uint16_t pad_0; + + uint16_t flags; + uint16_t timeout; + + uint32_t data_xfer_len; + + uint16_t fis[10]; + uint32_t stp_flags; + union drsas_sgl sgl; +}; + +union drsas_frame { + struct drsas_header hdr; + struct drsas_init_frame init; + struct drsas_io_frame io; + struct drsas_pthru_frame pthru; + struct drsas_dcmd_frame dcmd; + struct drsas_abort_frame abort; + struct drsas_smp_frame smp; + struct drsas_stp_frame stp; + + uint8_t raw_bytes[64]; +}; + +typedef struct drsas_pd_address { + uint16_t device_id; + uint16_t encl_id; + + union { + struct { + uint8_t encl_index; + uint8_t slot_number; + } pd_address; + struct { + uint8_t encl_position; + uint8_t encl_connector_index; + } encl_address; + }address; + + uint8_t scsi_dev_type; + + union { + uint8_t port_bitmap; + uint8_t port_numbers; + } connected; + + uint64_t sas_addr[2]; +} drsas_pd_address_t; + +union drsas_evt_class_locale { + struct { + uint16_t locale; + uint8_t reserved; + int8_t class; + } members; + + uint32_t word; +}; + +struct drsas_evt_log_info { + uint32_t newest_seq_num; + uint32_t oldest_seq_num; + uint32_t clear_seq_num; + uint32_t shutdown_seq_num; + uint32_t boot_seq_num; +}; + +struct drsas_progress { + uint16_t progress; + uint16_t elapsed_seconds; +}; + +struct drsas_evtarg_ld { + uint16_t target_id; + uint8_t ld_index; + uint8_t reserved; +}; + +struct drsas_evtarg_pd { + uint16_t device_id; + uint8_t encl_index; + uint8_t slot_number; +}; + +struct drsas_evt_detail { + uint32_t seq_num; + uint32_t time_stamp; + uint32_t code; + union drsas_evt_class_locale cl; + uint8_t arg_type; + uint8_t reserved1[15]; + + union { + struct { + struct drsas_evtarg_pd pd; + uint8_t 
cdb_length; + uint8_t sense_length; + uint8_t reserved[2]; + uint8_t cdb[16]; + uint8_t sense[64]; + } cdbSense; + + struct drsas_evtarg_ld ld; + + struct { + struct drsas_evtarg_ld ld; + uint64_t count; + } ld_count; + + struct { + uint64_t lba; + struct drsas_evtarg_ld ld; + } ld_lba; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prevOwner; + uint32_t newOwner; + } ld_owner; + + struct { + uint64_t ld_lba; + uint64_t pd_lba; + struct drsas_evtarg_ld ld; + struct drsas_evtarg_pd pd; + } ld_lba_pd_lba; + + struct { + struct drsas_evtarg_ld ld; + struct drsas_progress prog; + } ld_prog; + + struct { + struct drsas_evtarg_ld ld; + uint32_t prev_state; + uint32_t new_state; + } ld_state; + + struct { + uint64_t strip; + struct drsas_evtarg_ld ld; + } ld_strip; + + struct drsas_evtarg_pd pd; + + struct { + struct drsas_evtarg_pd pd; + uint32_t err; + } pd_err; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + } pd_lba; + + struct { + uint64_t lba; + struct drsas_evtarg_pd pd; + struct drsas_evtarg_ld ld; + } pd_lba_ld; + + struct { + struct drsas_evtarg_pd pd; + struct drsas_progress prog; + } pd_prog; + + struct { + struct drsas_evtarg_pd pd; + uint32_t prevState; + uint32_t newState; + } pd_state; + + struct { + uint16_t vendorId; + uint16_t deviceId; + uint16_t subVendorId; + uint16_t subDeviceId; + } pci; + + uint32_t rate; + char str[96]; + + struct { + uint32_t rtc; + uint32_t elapsedSeconds; + } time; + + struct { + uint32_t ecar; + uint32_t elog; + char str[64]; + } ecc; + + drsas_pd_address_t pd_addr; + + uint8_t b[96]; + uint16_t s[48]; + uint32_t w[24]; + uint64_t d[12]; + } args; + + char description[128]; + +}; + +/* only 63 are usable by the application */ +#define MAX_LOGICAL_DRIVES 64 +/* only 255 physical devices may be used */ +#define MAX_PHYSICAL_DEVICES 256 +#define MAX_PD_PER_ENCLOSURE 64 +/* maximum disks per array */ +#define MAX_ROW_SIZE 32 +/* maximum spans per logical drive */ +#define MAX_SPAN_DEPTH 8 +/* maximum number of arrays a hot spare may be dedicated to */ +#define MAX_ARRAYS_DEDICATED 16 +/* maximum number of arrays which may exist */ +#define MAX_ARRAYS 128 +/* maximum number of foreign configs that may be managed at once */ +#define MAX_FOREIGN_CONFIGS 8 +/* maximum spares (global and dedicated combined) */ +#define MAX_SPARES_FOR_THE_CONTROLLER MAX_PHYSICAL_DEVICES +/* maximum possible Target IDs (i.e.
0 to 63) */ +#define MAX_TARGET_ID 63 +/* maximum number of supported enclosures */ +#define MAX_ENCLOSURES 32 +/* maximum number of PHYs per controller */ +#define MAX_PHYS_PER_CONTROLLER 16 +/* maximum number of LDs per array (due to DDF limitations) */ +#define MAX_LDS_PER_ARRAY 16 + +/* + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + * + * Logical Drive commands + * + * ----------------------------------------------------------------------------- + * ----------------------------------------------------------------------------- + */ +#define DR_DCMD_LD 0x03000000 /* Logical Device (LD) opcodes */ + +/* + * Input: dcmd.opcode - DR_DCMD_LD_GET_LIST + * dcmd.mbox - reserved + * dcmd.sge IN - ptr to returned DR_LD_LIST structure + * Desc: Return the logical drive list structure + * Status: No error + */ + +/* + * defines the logical drive reference structure + */ +typedef union _DR_LD_REF { /* LD reference structure */ + struct { + uint8_t targetId; /* LD target id (0 to MAX_TARGET_ID) */ + uint8_t reserved; /* reserved to stay in line with DR_PD_REF */ + uint16_t seqNum; /* Sequence Number */ + } ld_ref; + uint32_t ref; /* shorthand reference to full 32-bits */ +} DR_LD_REF; /* 4 bytes */ + +/* + * defines the logical drive list structure + */ +typedef struct _DR_LD_LIST { + uint32_t ldCount; /* number of LDs */ + uint32_t reserved; /* pad to 8-byte boundary */ + struct { + DR_LD_REF ref; /* LD reference */ + uint8_t state; /* current LD state (DR_LD_STATE) */ + uint8_t reserved[3]; /* pad to 8-byte boundary */ + uint64_t size; /* LD size */ + } ldList[MAX_LOGICAL_DRIVES]; +} DR_LD_LIST; + +struct drsas_drv_ver { + uint8_t signature[12]; + uint8_t os_name[16]; + uint8_t os_ver[12]; + uint8_t drv_name[20]; + uint8_t drv_ver[32]; + uint8_t drv_rel_date[20]; +}; + +#define PCI_TYPE0_ADDRESSES 6 +#define PCI_TYPE1_ADDRESSES 2 +#define PCI_TYPE2_ADDRESSES 5 + +struct drsas_pci_common_header { + uint16_t vendorID; /* (ro) */ + uint16_t deviceID; /* (ro) */ + uint16_t command; /* Device control */ + uint16_t status; + uint8_t revisionID; /* (ro) */ + uint8_t progIf; /* (ro) */ + uint8_t subClass; /* (ro) */ + uint8_t baseClass; /* (ro) */ + uint8_t cacheLineSize; /* (ro+) */ + uint8_t latencyTimer; /* (ro+) */ + uint8_t headerType; /* (ro) */ + uint8_t bist; /* Built in self test */ + + union { + struct { + uint32_t baseAddresses[PCI_TYPE0_ADDRESSES]; + uint32_t cis; + uint16_t subVendorID; + uint16_t subSystemID; + uint32_t romBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t reserved2; + uint8_t interruptLine; + uint8_t interruptPin; /* (ro) */ + uint8_t minimumGrant; /* (ro) */ + uint8_t maximumLatency; /* (ro) */ + } type_0; + + struct { + uint32_t baseAddresses[PCI_TYPE1_ADDRESSES]; + uint8_t primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + uint8_t ioBase; + uint8_t ioLimit; + uint16_t secondaryStatus; + uint16_t memoryBase; + uint16_t memoryLimit; + uint16_t prefetchBase; + uint16_t prefetchLimit; + uint32_t prefetchBaseUpper32; + uint32_t prefetchLimitUpper32; + uint16_t ioBaseUpper16; + uint16_t ioLimitUpper16; + uint8_t capabilitiesPtr; + uint8_t reserved1[3]; + uint32_t romBaseAddress; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_1; + + struct { + uint32_t socketRegistersBaseAddress; + uint8_t capabilitiesPtr; + uint8_t reserved; + uint16_t secondaryStatus; + uint8_t
primaryBus; + uint8_t secondaryBus; + uint8_t subordinateBus; + uint8_t secondaryLatency; + struct { + uint32_t base; + uint32_t limit; + } range[PCI_TYPE2_ADDRESSES-1]; + uint8_t interruptLine; + uint8_t interruptPin; + uint16_t bridgeControl; + } type_2; + } header; +}; + +struct drsas_pci_link_capability { + union { + struct { + uint32_t linkSpeed :4; + uint32_t linkWidth :6; + uint32_t aspmSupport :2; + uint32_t losExitLatency :3; + uint32_t l1ExitLatency :3; + uint32_t rsvdp :6; + uint32_t portNumber :8; + } bits; + + uint32_t asUlong; + } cap; + +}; + +struct drsas_pci_link_status_capability { + union { + struct { + uint16_t linkSpeed :4; + uint16_t negotiatedLinkWidth :6; + uint16_t linkTrainingError :1; + uint16_t linkTraning :1; + uint16_t slotClockConfig :1; + uint16_t rsvdZ :3; + } bits; + + uint16_t asUshort; + } stat_cap; + + uint16_t reserved; + +}; + +struct drsas_pci_capabilities { + struct drsas_pci_link_capability linkCapability; + struct drsas_pci_link_status_capability linkStatusCapability; +}; + +struct drsas_pci_information +{ + uint32_t busNumber; + uint8_t deviceNumber; + uint8_t functionNumber; + uint8_t interruptVector; + uint8_t reserved; + struct drsas_pci_common_header pciHeaderInfo; + struct drsas_pci_capabilities capability; + uint8_t reserved2[32]; +}; + +struct drsas_ioctl { + uint16_t version; + uint16_t controller_id; + uint8_t signature[8]; + uint32_t reserved_1; + uint32_t control_code; + uint32_t reserved_2[2]; + uint8_t frame[64]; + union drsas_sgl_frame sgl_frame; + uint8_t sense_buff[DRSAS_MAX_SENSE_LENGTH]; + uint8_t data[1]; +}; + +struct drsas_aen { + uint16_t host_no; + uint16_t cmd_status; + uint32_t seq_num; + uint32_t class_locale_word; +}; +#pragma pack() + +#ifndef DDI_VENDOR_LSI +#define DDI_VENDOR_LSI "LSI" +#endif /* DDI_VENDOR_LSI */ + +static int drsas_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int drsas_attach(dev_info_t *, ddi_attach_cmd_t); +static int drsas_reset(dev_info_t *, ddi_reset_cmd_t); +static int drsas_detach(dev_info_t *, ddi_detach_cmd_t); +static int drsas_open(dev_t *, int, int, cred_t *); +static int drsas_close(dev_t, int, int, cred_t *); +static int drsas_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +static int drsas_tran_tgt_init(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static struct scsi_pkt *drsas_tran_init_pkt(struct scsi_address *, register + struct scsi_pkt *, struct buf *, int, int, int, int, + int (*)(), caddr_t); +static int drsas_tran_start(struct scsi_address *, + register struct scsi_pkt *); +static int drsas_tran_abort(struct scsi_address *, struct scsi_pkt *); +static int drsas_tran_reset(struct scsi_address *, int); +static int drsas_tran_getcap(struct scsi_address *, char *, int); +static int drsas_tran_setcap(struct scsi_address *, char *, int, int); +static void drsas_tran_destroy_pkt(struct scsi_address *, + struct scsi_pkt *); +static void drsas_tran_dmafree(struct scsi_address *, struct scsi_pkt *); +static void drsas_tran_sync_pkt(struct scsi_address *, struct scsi_pkt *); +static uint_t drsas_isr(); +static uint_t drsas_softintr(); + +static int init_mfi(struct drsas_instance *); +static int drsas_free_dma_obj(struct drsas_instance *, dma_obj_t); +static int drsas_alloc_dma_obj(struct drsas_instance *, dma_obj_t *, + uchar_t); +static struct drsas_cmd *get_mfi_pkt(struct drsas_instance *); +static void return_mfi_pkt(struct drsas_instance *, + struct drsas_cmd *); + +static void free_space_for_mfi(struct drsas_instance *); +static 
void free_additional_dma_buffer(struct drsas_instance *); +static int alloc_additional_dma_buffer(struct drsas_instance *); +static int read_fw_status_reg_ppc(struct drsas_instance *); +static void issue_cmd_ppc(struct drsas_cmd *, struct drsas_instance *); +static int issue_cmd_in_poll_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static int issue_cmd_in_sync_mode_ppc(struct drsas_instance *, + struct drsas_cmd *); +static void enable_intr_ppc(struct drsas_instance *); +static void disable_intr_ppc(struct drsas_instance *); +static int intr_ack_ppc(struct drsas_instance *); +static int mfi_state_transition_to_ready(struct drsas_instance *); +static void destroy_mfi_frame_pool(struct drsas_instance *); +static int create_mfi_frame_pool(struct drsas_instance *); +static int drsas_dma_alloc(struct drsas_instance *, struct scsi_pkt *, + struct buf *, int, int (*)()); +static int drsas_dma_move(struct drsas_instance *, + struct scsi_pkt *, struct buf *); +static void flush_cache(struct drsas_instance *instance); +static void display_scsi_inquiry(caddr_t); +static int start_mfi_aen(struct drsas_instance *instance); +static int handle_drv_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_ioctl(struct drsas_instance *instance, + struct drsas_ioctl *ioctl, int mode); +static int handle_mfi_aen(struct drsas_instance *instance, + struct drsas_aen *aen); +static void fill_up_drv_ver(struct drsas_drv_ver *dv); +static struct drsas_cmd *build_cmd(struct drsas_instance *instance, + struct scsi_address *ap, struct scsi_pkt *pkt, + uchar_t *cmd_done); +static int register_mfi_aen(struct drsas_instance *instance, + uint32_t seq_num, uint32_t class_locale_word); +static int issue_mfi_pthru(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_dcmd(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_smp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int issue_mfi_stp(struct drsas_instance *instance, struct + drsas_ioctl *ioctl, struct drsas_cmd *cmd, int mode); +static int abort_aen_cmd(struct drsas_instance *instance, + struct drsas_cmd *cmd_to_abort); + +static int drsas_common_check(struct drsas_instance *instance, + struct drsas_cmd *cmd); +static void drsas_fm_init(struct drsas_instance *instance); +static void drsas_fm_fini(struct drsas_instance *instance); +static int drsas_fm_error_cb(dev_info_t *, ddi_fm_error_t *, + const void *); +static void drsas_fm_ereport(struct drsas_instance *instance, + char *detail); +static int drsas_check_dma_handle(ddi_dma_handle_t handle); +static int drsas_check_acc_handle(ddi_acc_handle_t handle); + +static void drsas_rem_intrs(struct drsas_instance *instance); +static int drsas_add_intrs(struct drsas_instance *instance, int intr_type); + +static void drsas_tran_tgt_free(dev_info_t *, dev_info_t *, + scsi_hba_tran_t *, struct scsi_device *); +static int drsas_tran_bus_config(dev_info_t *, uint_t, + ddi_bus_config_op_t, void *, dev_info_t **); +static int drsas_parse_devname(char *, int *, int *); +static int drsas_config_all_devices(struct drsas_instance *); +static int drsas_config_scsi_device(struct drsas_instance *, + struct scsi_device *, dev_info_t **); +static int drsas_config_ld(struct drsas_instance *, uint16_t, + uint8_t, dev_info_t **); +static dev_info_t *drsas_find_child(struct drsas_instance *, uint16_t, + 
uint8_t); +static int drsas_name_node(dev_info_t *, char *, int); +static void drsas_issue_evt_taskq(struct drsas_eventinfo *); +static int drsas_service_evt(struct drsas_instance *, int, int, int, + uint64_t); +static int drsas_mode_sense_build(struct scsi_pkt *); + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_H_ */ diff --git a/usr/src/uts/common/io/dr_sas/dr_sas_list.h b/usr/src/uts/common/io/dr_sas/dr_sas_list.h new file mode 100644 index 0000000000..4154a77796 --- /dev/null +++ b/usr/src/uts/common/io/dr_sas/dr_sas_list.h @@ -0,0 +1,212 @@ +/* + * dr_sas_list.h: header for dr_sas + * + * Solaris MegaRAID driver for SAS2.0 controllers + * Copyright (c) 2008-2009, LSI Logic Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _DR_SAS_LIST_H_ +#define _DR_SAS_LIST_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct mlist_head { + struct mlist_head *next, *prev; +}; + +typedef struct mlist_head mlist_t; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct mlist_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} + + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
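 *
 * External consumers use the public wrappers defined below instead. An
 * illustrative (hypothetical) pattern, with the linkage embedded in a
 * private structure:
 *
 *	struct node { mlist_t n_link; int n_val; };
 *	mlist_t pending = LIST_HEAD_INIT(pending);
 *	mlist_t *pos, *nxt;
 *
 *	mlist_add_tail(&np->n_link, &pending);		(FIFO insert)
 *	mlist_for_each_safe(pos, nxt, &pending) {
 *		struct node *np = mlist_entry(pos, struct node, n_link);
 *		mlist_del_init(&np->n_link);
 *	}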
+ */ +static void __list_add(struct mlist_head *new, + struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + + +/* + * mlist_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static void mlist_add(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head, head->next); +} + + +/* + * mlist_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static void mlist_add_tail(struct mlist_head *new, struct mlist_head *head) +{ + __list_add(new, head->prev, head); +} + + + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static void __list_del(struct mlist_head *prev, + struct mlist_head *next) +{ + next->prev = prev; + prev->next = next; +} + + +/* + * mlist_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static void mlist_del_init(struct mlist_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + + +/* + * mlist_empty - tests whether a list is empty + * @head: the list to test. + */ +static int mlist_empty(struct mlist_head *head) +{ + return (head->next == head); +} + + +/* + * mlist_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static void mlist_splice(struct mlist_head *list, struct mlist_head *head) +{ + struct mlist_head *first = list->next; + + if (first != list) { + struct mlist_head *last = list->prev; + struct mlist_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; + } +} + + +/* + * mlist_entry - get the struct for this entry + * @ptr: the &struct mlist_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define mlist_entry(ptr, type, member) \ + ((type *)((size_t)(ptr) - offsetof(type, member))) + + +/* + * mlist_for_each - iterate over a list + * @pos: the &struct mlist_head to use as a loop counter. + * @head: the head for your list. + */ +#define mlist_for_each(pos, head) \ + for (pos = (head)->next, prefetch(pos->next); pos != (head); \ + pos = pos->next, prefetch(pos->next)) + + +/* + * mlist_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct mlist_head to use as a loop counter. + * @n: another &struct mlist_head to use as temporary storage + * @head: the head for your list. + */ +#define mlist_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +#ifdef __cplusplus +} +#endif + +#endif /* _DR_SAS_LIST_H_ */ diff --git a/usr/src/uts/common/io/eventfd.c b/usr/src/uts/common/io/eventfd.c new file mode 100644 index 0000000000..6683a9ca8e --- /dev/null +++ b/usr/src/uts/common/io/eventfd.c @@ -0,0 +1,414 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ + +/* + * Support for the eventfd facility, a Linux-borne facility for user-generated + * file descriptor-based events. + */ + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/eventfd.h> +#include <sys/conf.h> +#include <sys/vmem.h> +#include <sys/sysmacros.h> +#include <sys/filio.h> +#include <sys/stat.h> +#include <sys/file.h> + +struct eventfd_state; +typedef struct eventfd_state eventfd_state_t; + +struct eventfd_state { + kmutex_t efd_lock; /* lock protecting state */ + boolean_t efd_semaphore; /* boolean: sema. semantics */ + kcondvar_t efd_cv; /* condvar */ + pollhead_t efd_pollhd; /* poll head */ + uint64_t efd_value; /* value */ + eventfd_state_t *efd_next; /* next state on global list */ +}; + +/* + * Internal global variables. + */ +static kmutex_t eventfd_lock; /* lock protecting state */ +static dev_info_t *eventfd_devi; /* device info */ +static vmem_t *eventfd_minor; /* minor number arena */ +static void *eventfd_softstate; /* softstate pointer */ +static eventfd_state_t *eventfd_state; /* global list of state */ + +/*ARGSUSED*/ +static int +eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + eventfd_state_t *state; + major_t major = getemajor(*devp); + minor_t minor = getminor(*devp); + + if (minor != EVENTFDMNRN_EVENTFD) + return (ENXIO); + + mutex_enter(&eventfd_lock); + + minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1, + VM_BESTFIT | VM_SLEEP); + + if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) { + vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); + mutex_exit(&eventfd_lock); + return (ENOMEM); + } + + state = ddi_get_soft_state(eventfd_softstate, minor); + *devp = makedevice(major, minor); + + state->efd_next = eventfd_state; + eventfd_state = state; + + mutex_exit(&eventfd_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +eventfd_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + eventfd_state_t *state; + minor_t minor = getminor(dev); + uint64_t val, oval; + int err; + + if (uio->uio_resid < sizeof (val)) + return (EINVAL); + + state = ddi_get_soft_state(eventfd_softstate, minor); + + mutex_enter(&state->efd_lock); + + while (state->efd_value == 0) { + if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + mutex_exit(&state->efd_lock); + return (EAGAIN); + } + + if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { + mutex_exit(&state->efd_lock); + return (EINTR); + } + } + + /* + * We have a non-zero value and we own the lock; our behavior now + * depends on whether or not EFD_SEMAPHORE was set when the eventfd + * was created.
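 *
 * For example (illustrative numbers, not part of the original comment):
 * with efd_value at 3, a plain read(2) returns 3 and leaves the count
 * at 0, while a read(2) on an EFD_SEMAPHORE eventfd returns 1 and
 * leaves the count at 2.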
+ */ + val = oval = state->efd_value; + + if (state->efd_semaphore) { + state->efd_value--; + val = 1; + } else { + state->efd_value = 0; + } + + err = uiomove(&val, sizeof (val), UIO_READ, uio); + + mutex_exit(&state->efd_lock); + + if (oval == EVENTFD_VALMAX) { + cv_broadcast(&state->efd_cv); + pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT); + } + + return (err); +} + +/*ARGSUSED*/ +static int +eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) +{ + eventfd_state_t *state; + minor_t minor = getminor(dev); + uint64_t val, oval; + int err; + + if (uio->uio_resid < sizeof (val)) + return (EINVAL); + + if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) + return (err); + + if (val > EVENTFD_VALMAX) + return (EINVAL); + + state = ddi_get_soft_state(eventfd_softstate, minor); + + mutex_enter(&state->efd_lock); + + while (val > EVENTFD_VALMAX - state->efd_value) { + if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + mutex_exit(&state->efd_lock); + return (EAGAIN); + } + + if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { + mutex_exit(&state->efd_lock); + return (EINTR); + } + } + + /* + * We now know that we can add the value without overflowing. + */ + state->efd_value = (oval = state->efd_value) + val; + + mutex_exit(&state->efd_lock); + + if (oval == 0) { + cv_broadcast(&state->efd_cv); + pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN); + } + + return (0); +} + +/*ARGSUSED*/ +static int +eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + eventfd_state_t *state; + minor_t minor = getminor(dev); + short revents = 0; + + state = ddi_get_soft_state(eventfd_softstate, minor); + + mutex_enter(&state->efd_lock); + + if (state->efd_value > 0) + revents |= POLLRDNORM | POLLIN; + + if (state->efd_value < EVENTFD_VALMAX) + revents |= POLLWRNORM | POLLOUT; + + if (!(*reventsp = revents & events) && !anyyet) + *phpp = &state->efd_pollhd; + + mutex_exit(&state->efd_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + eventfd_state_t *state; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(eventfd_softstate, minor); + + switch (cmd) { + case EVENTFDIOC_SEMAPHORE: { + mutex_enter(&state->efd_lock); + state->efd_semaphore ^= 1; + mutex_exit(&state->efd_lock); + + return (0); + } + + default: + break; + } + + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ + eventfd_state_t *state, **sp; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(eventfd_softstate, minor); + + if (state->efd_pollhd.ph_list != NULL) { + pollwakeup(&state->efd_pollhd, POLLERR); + pollhead_clean(&state->efd_pollhd); + } + + mutex_enter(&eventfd_lock); + + /* + * Remove our state from our global list. 
+ */ + for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next)) + VERIFY(*sp != NULL); + + *sp = (*sp)->efd_next; + + ddi_soft_state_free(eventfd_softstate, minor); + vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); + + mutex_exit(&eventfd_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + mutex_enter(&eventfd_lock); + + if (ddi_soft_state_init(&eventfd_softstate, + sizeof (eventfd_state_t), 0) != 0) { + cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state"); + mutex_exit(&eventfd_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "eventfd", S_IFCHR, + EVENTFDMNRN_EVENTFD, DDI_PSEUDO, NULL) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node"); + ddi_soft_state_fini(&eventfd_softstate); + mutex_exit(&eventfd_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + eventfd_devi = devi; + + eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE, + UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + mutex_exit(&eventfd_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_SUSPEND: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&eventfd_lock); + vmem_destroy(eventfd_minor); + + ddi_remove_minor_node(eventfd_devi, NULL); + eventfd_devi = NULL; + + ddi_soft_state_fini(&eventfd_softstate); + mutex_exit(&eventfd_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)eventfd_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} + +static struct cb_ops eventfd_cb_ops = { + eventfd_open, /* open */ + eventfd_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + eventfd_read, /* read */ + eventfd_write, /* write */ + eventfd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + eventfd_poll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops eventfd_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + eventfd_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + eventfd_attach, /* attach */ + eventfd_detach, /* detach */ + nodev, /* reset */ + &eventfd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "eventfd support", /* name of module */ + &eventfd_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/io/eventfd.conf b/usr/src/uts/common/io/eventfd.conf new file mode 100644 index 0000000000..f9c6dc11b2 --- /dev/null +++ b/usr/src/uts/common/io/eventfd.conf @@ -0,0 +1,16 @@ +# +# This file and its 
contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2014 Joyent, Inc. All rights reserved.
+#
+
+name="eventfd" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c b/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c
index c8af6ae527..2930fe578c 100644
--- a/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c
+++ b/usr/src/uts/common/io/fibre-channel/fca/oce/oce_rx.c
@@ -532,8 +532,7 @@ oce_drain_rq_cq(void *arg)
 		if (dev->function_mode & FLEX10_MODE) {
 			if (cqe->u0.s.vlan_tag_present &&
 			    cqe->u0.s.qnq) {
-				oce_rx_insert_tag(mp,
-				    cqe->u0.s.vlan_tag);
+				oce_rx_insert_tag(mp, cqe->u0.s.vlan_tag);
 			}
 		} else if (cqe->u0.s.vlan_tag_present) {
 			oce_rx_insert_tag(mp, cqe->u0.s.vlan_tag);
diff --git a/usr/src/uts/common/io/fibre-channel/impl/fctl.c b/usr/src/uts/common/io/fibre-channel/impl/fctl.c
index 634de6c6dd..87105e779d 100644
--- a/usr/src/uts/common/io/fibre-channel/impl/fctl.c
+++ b/usr/src/uts/common/io/fibre-channel/impl/fctl.c
@@ -24,6 +24,7 @@
  */
 /*
  * Copyright 2012 Garrett D'Amore <garrett@damore.org>. All rights reserved.
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
  */
 /*
  * Fibre channel Transport Library (fctl)
@@ -5500,6 +5501,11 @@ fc_ulp_get_adapter_paths(char *pathList, int count)
 		maxPorts ++;
 	}
 
+	if (maxPorts == 0) {
+		mutex_exit(&fctl_port_lock);
+		return (0);
+	}
+
 	/* Now allocate a buffer to store all the pointers for comparisons */
 	portList = kmem_zalloc(sizeof (fc_local_port_t *) * maxPorts, KM_SLEEP);
diff --git a/usr/src/uts/common/io/gsqueue/gsqueue.c b/usr/src/uts/common/io/gsqueue/gsqueue.c
new file mode 100644
index 0000000000..b484b16142
--- /dev/null
+++ b/usr/src/uts/common/io/gsqueue/gsqueue.c
@@ -0,0 +1,612 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Serialization queues are a technique used in illumos to provide what's
+ * commonly known as a 'vertical' perimeter. The idea (described a bit in
+ * uts/common/inet/squeue.c) is to provide a means to make sure that message
+ * blocks (mblk_t) are processed in a specific order. Subsystems like ip and
+ * vnd consume these with different policies -- ip on a conn_t basis, vnd on
+ * a per-device basis -- and use this to ensure that only one packet is being
+ * processed at a given time.
+ *
+ * Serialization queues were originally used by ip. As part of that
+ * implementation, many of the details of ip were baked into it. That includes
+ * things like conn_t, ip receive attributes, and the notion of sets. While an
+ * individual serialization queue, or gsqueue_t, is a useful level of
+ * abstraction, it isn't the basis on which most consumers want to manage
+ * them. Instead, we have the notion of a set of serialization queues.
These sets are
+ * DR (CPU Dynamic Reconfiguration) aware, and allow consumers to have a
+ * gsqueue_t per CPU to fanout on without managing them all itself. In the
+ * original implementation, this existed, but it was heavily tied into the
+ * infrastructure of IP and its notion of polling on the underlying MAC
+ * devices.
+ *
+ * The result of that past is a new interface to serialization queues and a
+ * similar, but slightly different, abstraction for sets of these
+ * (gsqueue_set_t). When designing this, there are two different approaches
+ * that one could consider. The first is that the system has one gsqueue_set_t
+ * that the entire world shares, whether IP or some other consumer. The other
+ * is that every consumer has its own set.
+ *
+ * The trade-off between these two approaches comes down to their pathological
+ * failure modes. There is no guarantee that any two consumers here are
+ * equivalent. In fact, they very likely have very different latency profiles.
+ * If they are being processed in the same queue, that can lead to very odd
+ * behaviors. More generally, if we have a series of processing functions from
+ * one consumer which are generally short, and another which are generally
+ * long, that'll cause undue latency that's harder to observe. If we instead
+ * take the approach that each consumer should have its own set that it fans
+ * out over, then we won't end up with the problem that a given serialization
+ * queue will have multiple latency profiles; instead we'll see CPU contention
+ * for the bound gsqueue_t worker thread. Keep in mind, though, that only the
+ * gsqueue_t worker thread is bound; it is in fact possible for a queue to be
+ * processed by other threads on other CPUs.
+ *
+ * We've opted to go down the second path, so each consumer has its own
+ * independent set of serialization queues that it is bound over.
+ *
+ * Structure Hierarchies
+ * ---------------------
+ *
+ * At the top level, we have a single list of gsqueue_set_t. The gsqueue_set_t
+ * encapsulates all the per-CPU gsqueue_t that exist in the form of
+ * gsqueue_cpu_t. The gsqueue_cpu_t has been designed such that it could
+ * accommodate more than one gsqueue_t, but today there is a one-to-one
+ * mapping.
+ *
+ * We maintain two different lists of gsqueue_cpu_t, the active and defunct
+ * sets. The active set is maintained in the array `gs_cpus`. There are NCPU
+ * entries available in `gs_cpus` with the total number of currently active
+ * cpus described in `gs_ncpus`. The ordering of `gs_cpus` is unimportant.
+ * When there is no longer a need for a given binding (see the following
+ * section for more explanation on when this is the case) then we move the
+ * entry to the `gs_defunct` list which is just a singly linked list of
+ * gsqueue_cpu_t.
+ *
+ * In addition, each gsqueue_set_t can have a series of callbacks registered
+ * with it. These are described in the following section. Graphically, a given
+ * gsqueue_set_t looks roughly like the following:
+ *
+ *  +---------------+
+ *  | gsqueue_set_t |
+ *  +---------------+
+ *    |    |    |
+ *    |    |    * . . . gs_cpus
+ *    |    |    |
+ *    |    |    |      +-------------------------------------------------+
+ *    |    |    +----->| gsqueue_cpu_t || gsqueue_cpu_t || gsqueue_cpu_t |...
+ *    |    |           +-------------------------------------------------+
+ *    |    |
+ *    |    * . . . gs_defunct
+ *    |    |
+ *    |    |    +---------------+   +---------------+   +---------------+
+ *    |    +--->| gsqueue_cpu_t |-->| gsqueue_cpu_t |-->| gsqueue_cpu_t |...
+ * | +---------------+ +---------------+ +---------------+ + * * . . . gs_cbs + * | + * | +--------------+ +--------------+ +--------------+ + * +--->| gsqueue_cb_t |-->| gsqueue_cb_t |->| gsqueue_cb_t |... + * +--------------+ +--------------+ +--------------+ + * + * CPU DR, gsqueue_t, and gsqueue_t + * -------------------------------- + * + * Recall, that every serialization queue (gsqueue_t or squeue_t) has a worker + * thread that may end up doing work. As part of supporting fanout, we have one + * gsqueue_t per CPU, and its worker thread is bound to that CPU. Because of + * this binding, we need to deal with CPU DR changes. + * + * The gsqueue driver maintains a single CPU DR callback that is used for the + * entire sub-system. We break down CPU DR events into three groups. Offline + * events, online events, and events we can ignore. When the first group occurs, + * we need to go through every gsqueue_t, find the gsqueue_cpu_t that + * corresponds to that processor id, and unbind all of its gsqueue_t's. It's + * rather important that we only unbind the gsqueue_t's and not actually destroy + * them. When this happens, they could very easily have data queued inside of + * them and it's unreasonable to just throw out everything in them at this + * point. The data remains intact and service continues uinterrupted. + * + * When we receive an online event, we do the opposite. We try to find a + * gsqueue_cpu_t that previously was bound to this CPU (by leaving its gqc_cpuid + * field intact) in the defunct list. If we find one, we remove it from the + * defunct list and add it to the active list as well as binding the gsqueue_t + * to the CPU in question. If we don't find one, then we create a new one. + * + * To deal with these kinds of situations, we allow a consumer to register + * callbacks for the gsqueue_t that they are interested in. These callbacks will + * fire whenever we are handling a topology change. The design of the callbacks + * is not that the user can take any administrative action during them, but + * rather set something for them to do asynchronously. It is illegal to make any + * calls into the gsqueue system while you are in a callback. + * + * Locking + * ------- + * + * The lock ordering here is fairly straightforward. Due to our use of CPU + * binding and the CPU DR callbacks, we have an additional lock to consider + * cpu_lock. Because of that, the following are the rules for locking: + * + * + * o If performing binding operations, you must grab cpu_lock. cpu_lock is + * also at the top of the order. + * + * o cpu_lock > gsqueue_lock > gsqueue_t`gs_lock > squeue_t`sq_lock + * If you need to take multiple locks, you must take the greatest + * (left-most) one first. 
+ */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/stream.h> +#include <sys/modctl.h> +#include <sys/cpuvar.h> +#include <sys/list.h> +#include <sys/sysmacros.h> + +#include <sys/gsqueue.h> +#include <sys/squeue_impl.h> + +typedef struct gsqueue_cb { + struct gsqueue_cb *gcb_next; + gsqueue_cb_f gcb_func; + void *gcb_arg; +} gsqueue_cb_t; + +typedef struct gsqueue_cpu { + struct gsqueue_cpu *gqc_next; + squeue_t *gqc_head; + processorid_t gqc_cpuid; +} gsqueue_cpu_t; + +struct gsqueue_set { + list_node_t gs_next; + uint_t gs_wwait; + pri_t gs_wpri; + kmutex_t gs_lock; + int gs_ncpus; + gsqueue_cpu_t **gs_cpus; + gsqueue_cpu_t *gs_defunct; + gsqueue_cb_t *gs_cbs; +}; + +static kmutex_t gsqueue_lock; +static list_t gsqueue_list; +static kmem_cache_t *gsqueue_cb_cache; +static kmem_cache_t *gsqueue_cpu_cache; +static kmem_cache_t *gsqueue_set_cache; + +static gsqueue_cpu_t * +gsqueue_cpu_create(uint_t wwait, pri_t wpri, processorid_t cpuid) +{ + gsqueue_cpu_t *scp; + + scp = kmem_cache_alloc(gsqueue_cpu_cache, KM_SLEEP); + + scp->gqc_next = NULL; + scp->gqc_cpuid = cpuid; + scp->gqc_head = squeue_create(wwait, wpri, B_FALSE); + scp->gqc_head->sq_state = SQS_DEFAULT; + squeue_bind(scp->gqc_head, cpuid); + + return (scp); +} + +static void +gsqueue_cpu_destroy(gsqueue_cpu_t *scp) +{ + squeue_destroy(scp->gqc_head); + kmem_cache_free(gsqueue_cpu_cache, scp); +} + +gsqueue_set_t * +gsqueue_set_create(uint_t wwait, pri_t wpri) +{ + int i; + gsqueue_set_t *gssp; + + gssp = kmem_cache_alloc(gsqueue_set_cache, KM_SLEEP); + gssp->gs_wwait = wwait; + gssp->gs_wpri = wpri; + gssp->gs_ncpus = 0; + + /* + * We're grabbing CPU lock. Once we let go of it we have to ensure all + * set up of the gsqueue_set_t is complete, as it'll be in there for the + * various CPU DR bits. + */ + mutex_enter(&cpu_lock); + + for (i = 0; i < NCPU; i++) { + gsqueue_cpu_t *scp; + cpu_t *cp = cpu_get(i); + if (cp != NULL && CPU_ACTIVE(cp) && + cp->cpu_flags & CPU_EXISTS) { + scp = gsqueue_cpu_create(wwait, wpri, cp->cpu_id); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + } + } + + /* Finally we can add it to our global list and be done */ + mutex_enter(&gsqueue_lock); + list_insert_tail(&gsqueue_list, gssp); + mutex_exit(&gsqueue_lock); + mutex_exit(&cpu_lock); + + return (gssp); +} + +void +gsqueue_set_destroy(gsqueue_set_t *gssp) +{ + int i; + gsqueue_cpu_t *scp; + + /* + * Go through and unbind all of the squeues while cpu_lock is held and + * move them to the defunct list. Once that's done, we don't need to do + * anything else with cpu_lock. 
+ */
+	mutex_enter(&cpu_lock);
+	mutex_enter(&gsqueue_lock);
+	list_remove(&gsqueue_list, gssp);
+	mutex_exit(&gsqueue_lock);
+
+	mutex_enter(&gssp->gs_lock);
+
+	for (i = 0; i < gssp->gs_ncpus; i++) {
+		scp = gssp->gs_cpus[i];
+		squeue_unbind(scp->gqc_head);
+		scp->gqc_next = gssp->gs_defunct;
+		gssp->gs_defunct = scp;
+		gssp->gs_cpus[i] = NULL;
+	}
+	gssp->gs_ncpus = 0;
+
+	mutex_exit(&gssp->gs_lock);
+	mutex_exit(&cpu_lock);
+
+	while (gssp->gs_defunct != NULL) {
+		gsqueue_cpu_t *scp;
+
+		scp = gssp->gs_defunct;
+		gssp->gs_defunct = scp->gqc_next;
+		gsqueue_cpu_destroy(scp);
+	}
+
+	while (gssp->gs_cbs != NULL) {
+		gsqueue_cb_t *cbp;
+
+		cbp = gssp->gs_cbs;
+		gssp->gs_cbs = cbp->gcb_next;
+		kmem_cache_free(gsqueue_cb_cache, cbp);
+	}
+
+	ASSERT(gssp->gs_ncpus == 0);
+	ASSERT(gssp->gs_defunct == NULL);
+	ASSERT(gssp->gs_cbs == NULL);
+	kmem_cache_free(gsqueue_set_cache, gssp);
+}
+
+gsqueue_t *
+gsqueue_set_get(gsqueue_set_t *gssp, uint_t index)
+{
+	squeue_t *sqp;
+	gsqueue_cpu_t *scp;
+
+	mutex_enter(&gssp->gs_lock);
+	scp = gssp->gs_cpus[index % gssp->gs_ncpus];
+	sqp = scp->gqc_head;
+	mutex_exit(&gssp->gs_lock);
+	return ((gsqueue_t *)sqp);
+}
+
+uintptr_t
+gsqueue_set_cb_add(gsqueue_set_t *gssp, gsqueue_cb_f cb, void *arg)
+{
+	gsqueue_cb_t *cbp;
+
+	cbp = kmem_cache_alloc(gsqueue_cb_cache, KM_SLEEP);
+	cbp->gcb_func = cb;
+	cbp->gcb_arg = arg;
+
+	mutex_enter(&gssp->gs_lock);
+	cbp->gcb_next = gssp->gs_cbs;
+	gssp->gs_cbs = cbp;
+	mutex_exit(&gssp->gs_lock);
+	return ((uintptr_t)cbp);
+}
+
+int
+gsqueue_set_cb_remove(gsqueue_set_t *gssp, uintptr_t id)
+{
+	gsqueue_cb_t *cbp, *prev;
+
+	mutex_enter(&gssp->gs_lock);
+	cbp = gssp->gs_cbs;
+	prev = NULL;
+	while (cbp != NULL) {
+		if ((uintptr_t)cbp != id) {
+			prev = cbp;
+			cbp = cbp->gcb_next;
+			continue;
+		}
+
+		if (prev == NULL) {
+			gssp->gs_cbs = cbp->gcb_next;
+		} else {
+			prev->gcb_next = cbp->gcb_next;
+		}
+
+		mutex_exit(&gssp->gs_lock);
+		kmem_cache_free(gsqueue_cb_cache, cbp);
+		return (0);
+	}
+	mutex_exit(&gssp->gs_lock);
+	return (-1);
+}
+
+void
+gsqueue_enter_one(gsqueue_t *gsp, mblk_t *mp, gsqueue_proc_f func, void *arg,
+    int flags, uint8_t tag)
+{
+	squeue_t *sqp = (squeue_t *)gsp;
+
+	ASSERT(mp->b_next == NULL);
+	ASSERT(mp->b_prev == NULL);
+	mp->b_queue = (queue_t *)func;
+	mp->b_prev = arg;
+	sqp->sq_enter(sqp, mp, mp, 1, NULL, flags, tag);
+}
+
+static void
+gsqueue_notify(gsqueue_set_t *gssp, squeue_t *sqp, boolean_t online)
+{
+	gsqueue_cb_t *cbp;
+
+	ASSERT(MUTEX_HELD(&gssp->gs_lock));
+	cbp = gssp->gs_cbs;
+	while (cbp != NULL) {
+		cbp->gcb_func(gssp, (gsqueue_t *)sqp, cbp->gcb_arg, online);
+		cbp = cbp->gcb_next;
+	}
+}
+
+/*
+ * When we online a processor we need to go through and either bind a defunct
+ * squeue or create a new one. We'll try to reuse a gsqueue_cpu_t from the
+ * defunct list that used to be on that processor. If no such gsqueue_cpu_t
+ * exists, then we'll create a new one. We'd rather avoid taking over an
+ * existing defunct one that used to be on another CPU, as it's not
+ * unreasonable to believe that its CPU will come back. More CPUs are offlined
+ * and onlined by the administrator or by creating cpu sets than actually get
+ * offlined by FMA.
+ */ +static void +gsqueue_handle_online(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + gsqueue_cpu_t *scp; + + mutex_enter(&gssp->gs_lock); + scp = gssp->gs_defunct; + while (scp != NULL) { + if (scp->gqc_cpuid == id) + break; + scp = scp->gqc_next; + } + + if (scp == NULL) { + scp = gsqueue_cpu_create(gssp->gs_wwait, + gssp->gs_wpri, id); + } else { + squeue_bind(scp->gqc_head, id); + } + ASSERT(gssp->gs_ncpus < NCPU); + gssp->gs_cpus[gssp->gs_ncpus] = scp; + gssp->gs_ncpus++; + gsqueue_notify(gssp, scp->gqc_head, B_TRUE); + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +static void +gsqueue_handle_offline(processorid_t id) +{ + gsqueue_set_t *gssp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + mutex_enter(&gsqueue_lock); + for (gssp = list_head(&gsqueue_list); gssp != NULL; + gssp = list_next(&gsqueue_list, gssp)) { + int i; + gsqueue_cpu_t *scp = NULL; + + mutex_enter(&gssp->gs_lock); + for (i = 0; i < gssp->gs_ncpus; i++) { + if (gssp->gs_cpus[i]->gqc_cpuid == id) { + scp = gssp->gs_cpus[i]; + break; + } + } + + if (scp != NULL) { + squeue_unbind(scp->gqc_head); + scp->gqc_next = gssp->gs_defunct; + gssp->gs_defunct = scp; + gssp->gs_cpus[i] = gssp->gs_cpus[gssp->gs_ncpus-1]; + gssp->gs_ncpus--; + gsqueue_notify(gssp, scp->gqc_head, B_FALSE); + } + mutex_exit(&gssp->gs_lock); + } + mutex_exit(&gsqueue_lock); +} + +/* ARGSUSED */ +static int +gsqueue_cpu_setup(cpu_setup_t what, int id, void *unused) +{ + cpu_t *cp; + + ASSERT(MUTEX_HELD(&cpu_lock)); + cp = cpu_get(id); + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + case CPU_CPUPART_IN: + if (cp != NULL && CPU_ACTIVE(cp) && cp->cpu_flags & CPU_EXISTS) + gsqueue_handle_online(cp->cpu_id); + break; + case CPU_UNCONFIG: + case CPU_OFF: + case CPU_CPUPART_OUT: + gsqueue_handle_offline(cp->cpu_id); + break; + default: + break; + } + + return (0); +} + + +/* ARGSUSED */ +static int +gsqueue_set_cache_construct(void *buf, void *arg, int kmflags) +{ + gsqueue_set_t *gssp = buf; + + gssp->gs_cpus = kmem_alloc(sizeof (gsqueue_cpu_t *) * NCPU, kmflags); + if (gssp->gs_cpus == NULL) + return (-1); + + mutex_init(&gssp->gs_lock, NULL, MUTEX_DRIVER, NULL); + gssp->gs_ncpus = 0; + gssp->gs_defunct = NULL; + gssp->gs_cbs = NULL; + + return (0); +} + +static void +gsqueue_set_cache_destruct(void *buf, void *arg) +{ + gsqueue_set_t *gssp = buf; + + kmem_free(gssp->gs_cpus, sizeof (gsqueue_cpu_t *) * NCPU); + gssp->gs_cpus = NULL; + mutex_destroy(&gssp->gs_lock); +} + +static void +gsqueue_ddiinit(void) +{ + list_create(&gsqueue_list, sizeof (gsqueue_set_t), + offsetof(gsqueue_set_t, gs_next)); + mutex_init(&gsqueue_lock, NULL, MUTEX_DRIVER, NULL); + + gsqueue_cb_cache = kmem_cache_create("gsqueue_cb_cache", + sizeof (gsqueue_cb_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + gsqueue_cpu_cache = kmem_cache_create("gsqueue_cpu_cache", + sizeof (gsqueue_cpu_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + gsqueue_set_cache = kmem_cache_create("squeue_set_cache", + sizeof (gsqueue_set_t), + 0, gsqueue_set_cache_construct, gsqueue_set_cache_destruct, + NULL, NULL, NULL, 0); + + + mutex_enter(&cpu_lock); + register_cpu_setup_func(gsqueue_cpu_setup, NULL); + mutex_exit(&cpu_lock); +} + +static int +gsqueue_ddifini(void) +{ + mutex_enter(&gsqueue_lock); + if (list_is_empty(&gsqueue_list) == 0) { + mutex_exit(&gsqueue_lock); + return (EBUSY); + } + list_destroy(&gsqueue_list); 
+	mutex_exit(&gsqueue_lock);
+
+	mutex_enter(&cpu_lock);
+	unregister_cpu_setup_func(gsqueue_cpu_setup, NULL);
+	mutex_exit(&cpu_lock);
+
+	kmem_cache_destroy(gsqueue_set_cache);
+	kmem_cache_destroy(gsqueue_cpu_cache);
+	kmem_cache_destroy(gsqueue_cb_cache);
+
+	mutex_destroy(&gsqueue_lock);
+
+	return (0);
+}
+
+static struct modlmisc gsqueue_modmisc = {
+	&mod_miscops,
+	"gsqueue"
+};
+
+static struct modlinkage gsqueue_modlinkage = {
+	MODREV_1,
+	&gsqueue_modmisc,
+	NULL
+};
+
+int
+_init(void)
+{
+	int ret;
+
+	gsqueue_ddiinit();
+	if ((ret = mod_install(&gsqueue_modlinkage)) != 0) {
+		VERIFY(gsqueue_ddifini() == 0);
+		return (ret);
+	}
+
+	return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&gsqueue_modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	int ret;
+
+	if ((ret = gsqueue_ddifini()) != 0)
+		return (ret);
+
+	if ((ret = mod_remove(&gsqueue_modlinkage)) != 0)
+		return (ret);
+
+	return (0);
+}
diff --git a/usr/src/uts/common/io/inotify.c b/usr/src/uts/common/io/inotify.c
new file mode 100644
index 0000000000..c5c7b379e3
--- /dev/null
+++ b/usr/src/uts/common/io/inotify.c
@@ -0,0 +1,1508 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * Support for the inotify facility, a Linux-borne facility for asynchronous
+ * notification of certain events on specified files or directories. Our
+ * implementation broadly leverages the file event monitoring facility, and
+ * would actually be quite straightforward were it not for a very serious
+ * blunder in the inotify interface: in addition to allowing for one to be
+ * notified on events on a particular file or directory, inotify also allows
+ * for one to be notified on certain events on files _within_ a watched
+ * directory -- even though those events have absolutely nothing to do with
+ * the directory itself. This leads to all sorts of madness because file
+ * operations are (of course) not undertaken on paths but rather on open
+ * files -- and the relationships between open files and the paths that
+ * resolve to those files are neither static nor isomorphic. We implement
+ * this concept by having _child watches_ when directories are watched with
+ * events in IN_CHILD_EVENTS. We add child watches when a watch on a directory
+ * is first added, and we modify those child watches dynamically as files are
+ * created, deleted, moved into or moved out of the specified directory. This
+ * mechanism works well, absent hard links. Hard links, unfortunately, break
+ * this rather badly, and the user is warned that watches on directories that
+ * have multiple directory entries referring to the same file may behave
+ * unexpectedly.
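+ *
+ * For orientation, the Linux interface that this facility emulates is
+ * consumed roughly as follows (illustrative only; on illumos these calls are
+ * typically reached via emulation layers above this driver rather than made
+ * against it directly):
+ *
+ *	int fd = inotify_init();
+ *	int wd = inotify_add_watch(fd, "/some/dir", IN_CREATE | IN_DELETE);
+ *	nbytes = read(fd, buf, sizeof (buf));	(yields struct inotify_event)
+ *	(void) inotify_rm_watch(fd, wd);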
+ */ + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/inotify.h> +#include <sys/fem.h> +#include <sys/conf.h> +#include <sys/stat.h> +#include <sys/vfs_opreg.h> +#include <sys/vmem.h> +#include <sys/avl.h> +#include <sys/sysmacros.h> +#include <sys/cyclic.h> +#include <sys/filio.h> + +struct inotify_state; +struct inotify_kevent; + +typedef struct inotify_watch inotify_watch_t; +typedef struct inotify_state inotify_state_t; +typedef struct inotify_kevent inotify_kevent_t; + +struct inotify_watch { + kmutex_t inw_lock; /* lock protecting ref count */ + int inw_refcnt; /* reference count */ + uint8_t inw_zombie:1; /* boolean: is zombie */ + uint8_t inw_fired:1; /* boolean: fired one-shot */ + uint8_t inw_active:1; /* boolean: watch is active */ + uint8_t inw_orphaned:1; /* boolean: orphaned */ + kcondvar_t inw_cv; /* condvar for zombifier */ + uint32_t inw_mask; /* mask of watch */ + int32_t inw_wd; /* watch descriptor */ + vnode_t *inw_vp; /* underlying vnode */ + inotify_watch_t *inw_parent; /* parent, if a child */ + avl_node_t inw_byvp; /* watches by vnode */ + avl_node_t inw_bywd; /* watches by descriptor */ + avl_tree_t inw_children; /* children, if a parent */ + char *inw_name; /* name, if a child */ + list_node_t inw_orphan; /* orphan list */ + cred_t *inw_cred; /* cred, if orphaned */ + inotify_state_t *inw_state; /* corresponding state */ +}; + +struct inotify_kevent { + inotify_kevent_t *ine_next; /* next event in queue */ + struct inotify_event ine_event; /* event (variable size) */ +}; + +#define INOTIFY_EVENT_LENGTH(ev) \ + (sizeof (inotify_kevent_t) + (ev)->ine_event.len) + +struct inotify_state { + kmutex_t ins_lock; /* lock protecting state */ + avl_tree_t ins_byvp; /* watches by vnode */ + avl_tree_t ins_bywd; /* watches by descriptor */ + vmem_t *ins_wds; /* watch identifier arena */ + int ins_maxwatches; /* maximum number of watches */ + int ins_maxevents; /* maximum number of events */ + int ins_nevents; /* current # of events */ + int32_t ins_size; /* total size of events */ + inotify_kevent_t *ins_head; /* head of event queue */ + inotify_kevent_t *ins_tail; /* tail of event queue */ + pollhead_t ins_pollhd; /* poll head */ + kcondvar_t ins_cv; /* condvar for reading */ + list_t ins_orphans; /* orphan list */ + cyclic_id_t ins_cleaner; /* cyclic for cleaning */ + inotify_watch_t *ins_zombies; /* zombie watch list */ + cred_t *ins_cred; /* creator's credentials */ + inotify_state_t *ins_next; /* next state on global list */ +}; + +/* + * Tunables (exported read-only in lx-branded zones via /proc). + */ +int inotify_maxwatches = 8192; /* max watches per instance */ +int inotify_maxevents = 16384; /* max events */ +int inotify_maxinstances = 128; /* max instances per user */ + +/* + * Internal global variables. 
+ */
+static kmutex_t		inotify_lock;		/* lock protecting state */
+static dev_info_t	*inotify_devi;		/* device info */
+static fem_t		*inotify_femp;		/* FEM pointer */
+static vmem_t		*inotify_minor;		/* minor number arena */
+static void		*inotify_softstate;	/* softstate pointer */
+static inotify_state_t	*inotify_state;		/* global list of state */
+
+static void inotify_watch_event(inotify_watch_t *, uint64_t, char *);
+static void inotify_watch_insert(inotify_watch_t *, vnode_t *, char *);
+static void inotify_watch_delete(inotify_watch_t *, uint32_t);
+static void inotify_watch_remove(inotify_state_t *state,
+    inotify_watch_t *watch);
+
+static int
+inotify_fop_close(femarg_t *vf, int flag, int count, offset_t offset,
+    cred_t *cr, caller_context_t *ct)
+{
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval;
+
+	if ((rval = vnext_close(vf, flag, count, offset, cr, ct)) == 0) {
+		inotify_watch_event(watch, flag & FWRITE ?
+		    IN_CLOSE_WRITE : IN_CLOSE_NOWRITE, NULL);
+	}
+
+	return (rval);
+}
+
+static int
+inotify_fop_create(femarg_t *vf, char *name, vattr_t *vap, vcexcl_t excl,
+    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
+    vsecattr_t *vsecp)
+{
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval;
+
+	if ((rval = vnext_create(vf, name, vap, excl, mode,
+	    vpp, cr, flag, ct, vsecp)) == 0) {
+		inotify_watch_insert(watch, *vpp, name);
+		inotify_watch_event(watch, IN_CREATE, name);
+	}
+
+	return (rval);
+}
+
+static int
+inotify_fop_link(femarg_t *vf, vnode_t *svp, char *tnm, cred_t *cr,
+    caller_context_t *ct, int flags)
+{
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval;
+
+	if ((rval = vnext_link(vf, svp, tnm, cr, ct, flags)) == 0) {
+		inotify_watch_insert(watch, svp, tnm);
+		inotify_watch_event(watch, IN_CREATE, tnm);
+	}
+
+	return (rval);
+}
+
+static int
+inotify_fop_mkdir(femarg_t *vf, char *name, vattr_t *vap, vnode_t **vpp,
+    cred_t *cr, caller_context_t *ct, int flags, vsecattr_t *vsecp)
+{
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval;
+
+	if ((rval = vnext_mkdir(vf, name, vap, vpp, cr,
+	    ct, flags, vsecp)) == 0) {
+		inotify_watch_insert(watch, *vpp, name);
+		inotify_watch_event(watch, IN_CREATE | IN_ISDIR, name);
+	}
+
+	return (rval);
+}
+
+static int
+inotify_fop_open(femarg_t *vf, int mode, cred_t *cr, caller_context_t *ct)
+{
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval;
+
+	if ((rval = vnext_open(vf, mode, cr, ct)) == 0)
+		inotify_watch_event(watch, IN_OPEN, NULL);
+
+	return (rval);
+}
+
+static int
+inotify_fop_read(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr,
+    caller_context_t *ct)
+{
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval = vnext_read(vf, uiop, ioflag, cr, ct);
+
+	inotify_watch_event(watch, IN_ACCESS, NULL);
+
+	return (rval);
+}
+
+static int
+inotify_fop_readdir(femarg_t *vf, uio_t *uiop, cred_t *cr, int *eofp,
+    caller_context_t *ct, int flags)
+{
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval = vnext_readdir(vf, uiop, cr, eofp, ct, flags);
+
+	inotify_watch_event(watch, IN_ACCESS | IN_ISDIR, NULL);
+
+	return (rval);
+}
+
+int
+inotify_fop_remove(femarg_t *vf, char *nm, cred_t *cr, caller_context_t *ct,
+    int flags)
+{
+	inotify_watch_t *watch = vf->fa_fnode->fn_available;
+	int rval;
+
+	if ((rval = vnext_remove(vf, nm, cr, ct, flags)) == 0)
+		inotify_watch_event(watch, IN_DELETE, nm);
+
+	return (rval);
+}
+
+int
+inotify_fop_rmdir(femarg_t *vf, char *nm, vnode_t *cdir, cred_t *cr,
+    caller_context_t
*ct, int flags) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_rmdir(vf, nm, cdir, cr, ct, flags)) == 0) + inotify_watch_event(watch, IN_DELETE | IN_ISDIR, nm); + + return (rval); +} + +static int +inotify_fop_setattr(femarg_t *vf, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval; + + if ((rval = vnext_setattr(vf, vap, flags, cr, ct)) == 0) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + return (rval); +} + +static int +inotify_fop_write(femarg_t *vf, struct uio *uiop, int ioflag, struct cred *cr, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + int rval = vnext_write(vf, uiop, ioflag, cr, ct); + inotify_watch_event(watch, IN_MODIFY, NULL); + + return (rval); +} + +static int +inotify_fop_vnevent(femarg_t *vf, vnevent_t vnevent, vnode_t *dvp, char *name, + caller_context_t *ct) +{ + inotify_watch_t *watch = vf->fa_fnode->fn_available; + + switch (vnevent) { + case VE_RENAME_SRC: + inotify_watch_event(watch, IN_MOVE_SELF, NULL); + inotify_watch_delete(watch, IN_MOVE_SELF); + break; + case VE_REMOVE: + /* + * Linux will apparently fire an IN_ATTRIB event when the link + * count changes (including when it drops to 0 on a remove). + * This is merely somewhat odd; what is amazing is that this + * IN_ATTRIB event is not visible on an inotify watch on the + * parent directory. (IN_ATTRIB events are normally sent to + * watches on the parent directory). While it's hard to + * believe that this constitutes desired semantics, ltp + * unfortunately tests this case (if implicitly); in the name + * of bug-for-bug compatibility, we fire IN_ATTRIB iff we are + * explicitly watching the file that has been removed. + */ + if (watch->inw_parent == NULL) + inotify_watch_event(watch, IN_ATTRIB, NULL); + + /*FALLTHROUGH*/ + case VE_RENAME_DEST: + inotify_watch_event(watch, IN_DELETE_SELF, NULL); + inotify_watch_delete(watch, IN_DELETE_SELF); + break; + case VE_RMDIR: + /* + * It seems that IN_ISDIR should really be OR'd in here, but + * Linux doesn't seem to do that in this case; for the sake of + * bug-for-bug compatibility, we don't do it either. 
+	 */
+		inotify_watch_event(watch, IN_DELETE_SELF, NULL);
+		inotify_watch_delete(watch, IN_DELETE_SELF);
+		break;
+	case VE_CREATE:
+		inotify_watch_event(watch, IN_MODIFY | IN_ATTRIB, NULL);
+		break;
+	case VE_LINK:
+		inotify_watch_event(watch, IN_ATTRIB, NULL);
+		break;
+	case VE_RENAME_SRC_DIR:
+		inotify_watch_event(watch, IN_MOVED_FROM, name);
+		break;
+	case VE_RENAME_DEST_DIR:
+		if (name == NULL)
+			name = dvp->v_path;
+
+		inotify_watch_insert(watch, dvp, name);
+		inotify_watch_event(watch, IN_MOVED_TO, name);
+		break;
+	case VE_SUPPORT:
+	case VE_MOUNTEDOVER:
+	case VE_TRUNCATE:
+		break;
+	}
+
+	return (vnext_vnevent(vf, vnevent, dvp, name, ct));
+}
+
+const fs_operation_def_t inotify_vnodesrc_template[] = {
+	VOPNAME_CLOSE,		{ .femop_close = inotify_fop_close },
+	VOPNAME_CREATE,		{ .femop_create = inotify_fop_create },
+	VOPNAME_LINK,		{ .femop_link = inotify_fop_link },
+	VOPNAME_MKDIR,		{ .femop_mkdir = inotify_fop_mkdir },
+	VOPNAME_OPEN,		{ .femop_open = inotify_fop_open },
+	VOPNAME_READ,		{ .femop_read = inotify_fop_read },
+	VOPNAME_READDIR,	{ .femop_readdir = inotify_fop_readdir },
+	VOPNAME_REMOVE,		{ .femop_remove = inotify_fop_remove },
+	VOPNAME_RMDIR,		{ .femop_rmdir = inotify_fop_rmdir },
+	VOPNAME_SETATTR,	{ .femop_setattr = inotify_fop_setattr },
+	VOPNAME_WRITE,		{ .femop_write = inotify_fop_write },
+	VOPNAME_VNEVENT,	{ .femop_vnevent = inotify_fop_vnevent },
+	NULL, NULL
+};
+
+static int
+inotify_watch_cmpwd(inotify_watch_t *lhs, inotify_watch_t *rhs)
+{
+	if (lhs->inw_wd < rhs->inw_wd)
+		return (-1);
+
+	if (lhs->inw_wd > rhs->inw_wd)
+		return (1);
+
+	return (0);
+}
+
+static int
+inotify_watch_cmpvp(inotify_watch_t *lhs, inotify_watch_t *rhs)
+{
+	uintptr_t lvp = (uintptr_t)lhs->inw_vp, rvp = (uintptr_t)rhs->inw_vp;
+
+	if (lvp < rvp)
+		return (-1);
+
+	if (lvp > rvp)
+		return (1);
+
+	return (0);
+}
+
+static void
+inotify_watch_hold(inotify_watch_t *watch)
+{
+	mutex_enter(&watch->inw_lock);
+	VERIFY(watch->inw_refcnt > 0);
+	watch->inw_refcnt++;
+	mutex_exit(&watch->inw_lock);
+}
+
+static void
+inotify_watch_release(inotify_watch_t *watch)
+{
+	mutex_enter(&watch->inw_lock);
+	VERIFY(watch->inw_refcnt > 1);
+
+	if (--watch->inw_refcnt == 1 && watch->inw_zombie) {
+		/*
+		 * We're down to our last reference; kick anyone that might be
+		 * waiting.
+		 */
+		cv_signal(&watch->inw_cv);
+	}
+
+	mutex_exit(&watch->inw_lock);
+}
+
+static void
+inotify_watch_event(inotify_watch_t *watch, uint64_t mask, char *name)
+{
+	inotify_kevent_t *event, *tail;
+	inotify_state_t *state = watch->inw_state;
+	uint32_t wd = watch->inw_wd, cookie = 0, len;
+	int align = sizeof (uintptr_t) - 1;
+	boolean_t removal = mask & IN_REMOVAL ? B_TRUE : B_FALSE;
+	inotify_watch_t *source = watch;
+
+	if (!(mask &= watch->inw_mask) || mask == IN_ISDIR)
+		return;
+
+	if (watch->inw_parent != NULL) {
+		/*
+		 * This is an event on the child; if this isn't a valid child
+		 * event, return. Otherwise, we move our watch to be our
+		 * parent (which we know is around because we have a hold on
+		 * it) and continue.
+		 */
+		if (!(mask & IN_CHILD_EVENTS))
+			return;
+
+		name = watch->inw_name;
+		watch = watch->inw_parent;
+	}
+
+	if (!removal) {
+		mutex_enter(&state->ins_lock);
+
+		if (watch->inw_zombie ||
+		    watch->inw_fired || !watch->inw_active) {
+			mutex_exit(&state->ins_lock);
+			return;
+		}
+	} else {
+		if (!watch->inw_active)
+			return;
+
+		VERIFY(MUTEX_HELD(&state->ins_lock));
+	}
+
+	/*
+	 * If this is an operation on a directory and it's a child event
+	 * (even if it's not on a child), we specify IN_ISDIR.
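+	 * (For example, an event such as IN_ATTRIB on a watched
+	 * subdirectory is delivered with IN_ISDIR OR'd into its mask.)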
+	 */
+	if (source->inw_vp->v_type == VDIR && (mask & IN_CHILD_EVENTS))
+		mask |= IN_ISDIR;
+
+	if (mask & (IN_MOVED_FROM | IN_MOVED_TO))
+		cookie = (uint32_t)curthread->t_did;
+
+	if (state->ins_nevents >= state->ins_maxevents) {
+		/*
+		 * We're at our maximum number of events -- turn our event
+		 * into an IN_Q_OVERFLOW event, which will be coalesced if
+		 * it's already the tail event. (The overflow event carries
+		 * no name, so we drop any name that we were given.)
+		 */
+		mask = IN_Q_OVERFLOW;
+		wd = (uint32_t)-1;
+		cookie = 0;
+		name = NULL;
+	}
+
+	if (name != NULL) {
+		if ((len = strlen(name) + 1) & align)
+			len += (align + 1) - (len & align);
+	} else {
+		len = 0;
+	}
+
+	if ((tail = state->ins_tail) != NULL && tail->ine_event.wd == wd &&
+	    tail->ine_event.mask == mask && tail->ine_event.cookie == cookie &&
+	    ((tail->ine_event.len == 0 && len == 0) ||
+	    (name != NULL && tail->ine_event.len != 0 &&
+	    strcmp(tail->ine_event.name, name) == 0))) {
+		/*
+		 * This is an implicitly coalesced event; we're done.
+		 */
+		if (!removal)
+			mutex_exit(&state->ins_lock);
+		return;
+	}
+
+	event = kmem_zalloc(sizeof (inotify_kevent_t) + len, KM_SLEEP);
+	event->ine_event.wd = wd;
+	event->ine_event.mask = (uint32_t)mask;
+	event->ine_event.cookie = cookie;
+	event->ine_event.len = len;
+
+	if (name != NULL)
+		strcpy(event->ine_event.name, name);
+
+	if (tail != NULL) {
+		tail->ine_next = event;
+	} else {
+		VERIFY(state->ins_head == NULL);
+		state->ins_head = event;
+		cv_broadcast(&state->ins_cv);
+	}
+
+	state->ins_tail = event;
+	state->ins_nevents++;
+	state->ins_size += sizeof (event->ine_event) + len;
+
+	if (removal)
+		return;
+
+	if ((watch->inw_mask & IN_ONESHOT) && !watch->inw_fired) {
+		/*
+		 * If this is a one-shot, we need to remove the watch. (Note
+		 * that this will recurse back into inotify_watch_event() to
+		 * fire the IN_IGNORED event -- but with "removal" set.)
+		 */
+		watch->inw_fired = 1;
+		inotify_watch_remove(state, watch);
+	}
+
+	mutex_exit(&state->ins_lock);
+	pollwakeup(&state->ins_pollhd, POLLRDNORM | POLLIN);
+}
+
+/*
+ * Destroy a watch. By the time we're in here, the watch must have exactly
+ * one reference.
+ */
+static void
+inotify_watch_destroy(inotify_watch_t *watch)
+{
+	VERIFY(MUTEX_HELD(&watch->inw_lock));
+
+	if (watch->inw_name != NULL)
+		kmem_free(watch->inw_name, strlen(watch->inw_name) + 1);
+
+	kmem_free(watch, sizeof (inotify_watch_t));
+}
+
+/*
+ * Zombify a watch. By the time we come in here, it must be true that the
+ * watch has already been fem_uninstall()'d -- the only reference should be
+ * in the state's data structure. If we can get away with freeing it, we'll
+ * do that -- but if the reference count is greater than one due to an active
+ * vnode operation, we'll put this watch on the zombie list on the state
+ * structure.
+ */
+static void
+inotify_watch_zombify(inotify_watch_t *watch)
+{
+	inotify_state_t *state = watch->inw_state;
+
+	VERIFY(MUTEX_HELD(&state->ins_lock));
+	VERIFY(!watch->inw_zombie);
+
+	watch->inw_zombie = 1;
+
+	if (watch->inw_parent != NULL) {
+		inotify_watch_release(watch->inw_parent);
+	} else {
+		avl_remove(&state->ins_byvp, watch);
+		avl_remove(&state->ins_bywd, watch);
+		vmem_free(state->ins_wds, (void *)(uintptr_t)watch->inw_wd, 1);
+		watch->inw_wd = -1;
+	}
+
+	mutex_enter(&watch->inw_lock);
+
+	if (watch->inw_refcnt == 1) {
+		/*
+		 * There are no operations in flight and there is no way
+		 * for anyone to discover this watch -- we can destroy it.
+		 */
+		inotify_watch_destroy(watch);
+	} else {
+		/*
+		 * There are operations in flight; we will need to enqueue
+		 * this for later destruction.
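+		 * (Note that inw_parent is repurposed below as the linkage
+		 * for the state's zombie list; the watch is no longer
+		 * discoverable, so the field has no other remaining use.)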
+ */ + watch->inw_parent = state->ins_zombies; + state->ins_zombies = watch; + mutex_exit(&watch->inw_lock); + } +} + +static inotify_watch_t * +inotify_watch_add(inotify_state_t *state, inotify_watch_t *parent, + const char *name, vnode_t *vp, uint32_t mask) +{ + inotify_watch_t *watch; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + + watch = kmem_zalloc(sizeof (inotify_watch_t), KM_SLEEP); + + watch->inw_vp = vp; + watch->inw_mask = mask; + watch->inw_state = state; + watch->inw_refcnt = 1; + + if (parent == NULL) { + watch->inw_wd = (int)(uintptr_t)vmem_alloc(state->ins_wds, + 1, VM_BESTFIT | VM_SLEEP); + avl_add(&state->ins_byvp, watch); + avl_add(&state->ins_bywd, watch); + + avl_create(&watch->inw_children, + (int(*)(const void *, const void *))inotify_watch_cmpvp, + sizeof (inotify_watch_t), + offsetof(inotify_watch_t, inw_byvp)); + } else { + VERIFY(name != NULL); + inotify_watch_hold(parent); + watch->inw_mask &= IN_CHILD_EVENTS; + watch->inw_parent = parent; + watch->inw_name = kmem_alloc(strlen(name) + 1, KM_SLEEP); + strcpy(watch->inw_name, name); + + avl_add(&parent->inw_children, watch); + } + + /* + * Add our monitor to the vnode. We must not have the watch lock held + * when we do this, as it will immediately hold our watch. + */ + err = fem_install(vp, inotify_femp, watch, OPARGUNIQ, + (void (*)(void *))inotify_watch_hold, + (void (*)(void *))inotify_watch_release); + + VERIFY(err == 0); + + return (watch); +} + +/* + * Remove a (non-child) watch. This is called from either synchronous context + * via inotify_rm_watch() or monitor context via either a vnevent or a + * one-shot. + */ +static void +inotify_watch_remove(inotify_state_t *state, inotify_watch_t *watch) +{ + inotify_watch_t *child; + int err; + + VERIFY(MUTEX_HELD(&state->ins_lock)); + VERIFY(watch->inw_parent == NULL); + + err = fem_uninstall(watch->inw_vp, inotify_femp, watch); + VERIFY(err == 0); + + /* + * If we have children, we're going to remove them all and set them + * all to be zombies. + */ + while ((child = avl_first(&watch->inw_children)) != NULL) { + VERIFY(child->inw_parent == watch); + avl_remove(&watch->inw_children, child); + + err = fem_uninstall(child->inw_vp, inotify_femp, child); + VERIFY(err == 0); + + /* + * If this child watch has been orphaned, remove it from the + * state's list of orphans. + */ + if (child->inw_orphaned) { + list_remove(&state->ins_orphans, child); + crfree(child->inw_cred); + } + + VN_RELE(child->inw_vp); + + /* + * We're down (or should be down) to a single reference to + * this child watch; it's safe to zombify it. + */ + inotify_watch_zombify(child); + } + + inotify_watch_event(watch, IN_IGNORED | IN_REMOVAL, NULL); + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- we know that the only reference + * can come from operations in flight. + */ + inotify_watch_zombify(watch); +} + +/* + * Delete a watch. Should only be called from VOP context. 
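+ * (In practice that means it is driven by the vnevent hooks above:
+ * inotify_fop_vnevent() passes in the event -- IN_MOVE_SELF or
+ * IN_DELETE_SELF -- that prompted the deletion.)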
+ */ +static void +inotify_watch_delete(inotify_watch_t *watch, uint32_t event) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = watch->inw_vp }, *parent; + int err; + + if (event != IN_DELETE_SELF && !(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie) { + mutex_exit(&state->ins_lock); + return; + } + + if ((parent = watch->inw_parent) == NULL) { + if (event == IN_DELETE_SELF) { + /* + * If we're here because we're being deleted and we + * are not a child watch, we need to delete the entire + * watch, children and all. + */ + inotify_watch_remove(state, watch); + } + + mutex_exit(&state->ins_lock); + return; + } else { + if (event == IN_DELETE_SELF && + !(parent->inw_mask & IN_EXCL_UNLINK)) { + /* + * This is a child watch for a file that is being + * removed and IN_EXCL_UNLINK has not been specified; + * indicate that it is orphaned and add it to the list + * of orphans. (This list will be checked by the + * cleaning cyclic to determine when the watch has + * become the only hold on the vnode, at which point + * the watch can be zombified.) Note that we check + * if the watch is orphaned before we orphan it: hard + * links make it possible for VE_REMOVE to be called + * multiple times on the same vnode. (!) + */ + if (!watch->inw_orphaned) { + watch->inw_orphaned = 1; + watch->inw_cred = CRED(); + crhold(watch->inw_cred); + list_insert_head(&state->ins_orphans, watch); + } + + mutex_exit(&state->ins_lock); + return; + } + + if (watch->inw_orphaned) { + /* + * If we're here, a file was orphaned and then later + * moved -- which almost certainly means that hard + * links are on the scene. We choose the orphan over + * the move because we don't want to spuriously + * drop events if we can avoid it. + */ + crfree(watch->inw_cred); + list_remove(&state->ins_orphans, watch); + } + } + + if (avl_find(&parent->inw_children, &cmp, NULL) == NULL) { + /* + * This watch has already been deleted from the parent. + */ + mutex_exit(&state->ins_lock); + return; + } + + avl_remove(&parent->inw_children, watch); + err = fem_uninstall(watch->inw_vp, inotify_femp, watch); + VERIFY(err == 0); + + VN_RELE(watch->inw_vp); + + /* + * It's now safe to zombify the watch -- which won't actually delete + * it as we know that the reference count is greater than 1. + */ + inotify_watch_zombify(watch); + mutex_exit(&state->ins_lock); +} + +/* + * Insert a new child watch. Should only be called from VOP context when + * a child is created in a watched directory. + */ +static void +inotify_watch_insert(inotify_watch_t *watch, vnode_t *vp, char *name) +{ + inotify_state_t *state = watch->inw_state; + inotify_watch_t cmp = { .inw_vp = vp }; + + if (!(watch->inw_mask & IN_CHILD_EVENTS)) + return; + + mutex_enter(&state->ins_lock); + + if (watch->inw_zombie || watch->inw_parent != NULL || vp == NULL) { + mutex_exit(&state->ins_lock); + return; + } + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + return; + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, watch, name, vp, watch->inw_mask); + VERIFY(watch != NULL); + + mutex_exit(&state->ins_lock); +} + + +static int +inotify_add_watch(inotify_state_t *state, vnode_t *vp, uint32_t mask, + int32_t *wdp) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + uint32_t set; + + set = (mask & (IN_ALL_EVENTS | IN_MODIFIERS)) | IN_UNMASKABLE; + + /* + * Lookup our vnode to determine if we already have a watch on it. 
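+	 * (The by-vnode AVL tree is keyed on the vnode pointer itself, so a
+	 * stack-local inotify_watch_t with only inw_vp initialized -- the
+	 * "cmp" above -- serves as the lookup key.)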
+ */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * We don't have this watch; allocate a new one, provided that + * we have fewer than our limit. + */ + if (avl_numnodes(&state->ins_bywd) >= state->ins_maxwatches) { + mutex_exit(&state->ins_lock); + return (ENOSPC); + } + + VN_HOLD(vp); + watch = inotify_watch_add(state, NULL, NULL, vp, set); + *wdp = watch->inw_wd; + mutex_exit(&state->ins_lock); + + return (0); + } + + VERIFY(!watch->inw_zombie); + + if (!(mask & IN_MASK_ADD)) { + /* + * Note that if we're resetting our event mask and we're + * transitioning from an event mask that includes child events + * to one that doesn't, there will be potentially some stale + * child watches. This is basically fine: they won't fire, + * and they will correctly be removed when the watch is + * removed. + */ + watch->inw_mask = 0; + } + + watch->inw_mask |= set; + + *wdp = watch->inw_wd; + + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_add_child(inotify_state_t *state, vnode_t *vp, char *name) +{ + inotify_watch_t *watch, cmp = { .inw_vp = vp }; + vnode_t *cvp; + int err; + + /* + * Verify that the specified child doesn't have a directory component + * within it. + */ + if (strchr(name, '/') != NULL) + return (EINVAL); + + /* + * Lookup the underlying file. Note that this will succeed even if + * we don't have permissions to actually read the file. + */ + if ((err = lookupnameat(name, + UIO_SYSSPACE, NO_FOLLOW, NULL, &cvp, vp)) != 0) { + return (err); + } + + /* + * Use our vnode to find our watch, and then add our child watch to it. + */ + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_byvp, &cmp, NULL)) == NULL) { + /* + * This is unexpected -- it means that we don't have the + * watch that we thought we had. + */ + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (ENXIO); + } + + /* + * Now lookup the child vnode in the watch; we'll only add it if it + * isn't already there. + */ + cmp.inw_vp = cvp; + + if (avl_find(&watch->inw_children, &cmp, NULL) != NULL) { + mutex_exit(&state->ins_lock); + VN_RELE(cvp); + return (0); + } + + watch = inotify_watch_add(state, watch, name, cvp, watch->inw_mask); + VERIFY(watch != NULL); + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_rm_watch(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + inotify_watch_remove(state, watch); + mutex_exit(&state->ins_lock); + + return (0); +} + +static int +inotify_activate(inotify_state_t *state, int32_t wd) +{ + inotify_watch_t *watch, cmp = { .inw_wd = wd }; + + mutex_enter(&state->ins_lock); + + if ((watch = avl_find(&state->ins_bywd, &cmp, NULL)) == NULL) { + mutex_exit(&state->ins_lock); + return (EINVAL); + } + + watch->inw_active = 1; + + mutex_exit(&state->ins_lock); + + return (0); +} + +/* + * Called periodically as a cyclic to process the orphans and zombies. 
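+ * (The cyclic is created in inotify_open() at CY_LOW_LEVEL with a
+ * cyt_interval of NANOSEC, i.e. this runs roughly once per second.)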
+ */
+static void
+inotify_clean(void *arg)
+{
+	inotify_state_t *state = arg;
+	inotify_watch_t *watch, *parent, *next, **prev;
+	cred_t *savecred;
+	int err;
+
+	mutex_enter(&state->ins_lock);
+
+	for (watch = list_head(&state->ins_orphans);
+	    watch != NULL; watch = next) {
+		next = list_next(&state->ins_orphans, watch);
+
+		VERIFY(!watch->inw_zombie);
+		VERIFY((parent = watch->inw_parent) != NULL);
+
+		if (watch->inw_vp->v_count > 1)
+			continue;
+
+		avl_remove(&parent->inw_children, watch);
+		err = fem_uninstall(watch->inw_vp, inotify_femp, watch);
+		VERIFY(err == 0);
+
+		list_remove(&state->ins_orphans, watch);
+
+		/*
+		 * For purposes of releasing the vnode, we need to switch our
+		 * cred to be the cred of the orphaning thread (which we held
+		 * at the time this watch was orphaned).
+		 */
+		savecred = curthread->t_cred;
+		curthread->t_cred = watch->inw_cred;
+		VN_RELE(watch->inw_vp);
+		crfree(watch->inw_cred);
+		curthread->t_cred = savecred;
+
+		inotify_watch_zombify(watch);
+	}
+
+	prev = &state->ins_zombies;
+
+	while ((watch = *prev) != NULL) {
+		mutex_enter(&watch->inw_lock);
+
+		if (watch->inw_refcnt == 1) {
+			*prev = watch->inw_parent;
+			inotify_watch_destroy(watch);
+			continue;
+		}
+
+		prev = &watch->inw_parent;
+		mutex_exit(&watch->inw_lock);
+	}
+
+	mutex_exit(&state->ins_lock);
+}
+
+/*ARGSUSED*/
+static int
+inotify_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
+{
+	inotify_state_t *state;
+	major_t major = getemajor(*devp);
+	minor_t minor = getminor(*devp);
+	int instances = 0;
+	cyc_handler_t hdlr;
+	cyc_time_t when;
+	char c[64];
+
+	if (minor != INOTIFYMNRN_INOTIFY)
+		return (ENXIO);
+
+	mutex_enter(&inotify_lock);
+
+	for (state = inotify_state; state != NULL; state = state->ins_next) {
+		if (state->ins_cred == cred_p)
+			instances++;
+	}
+
+	if (instances >= inotify_maxinstances) {
+		mutex_exit(&inotify_lock);
+		return (EMFILE);
+	}
+
+	minor = (minor_t)(uintptr_t)vmem_alloc(inotify_minor, 1,
+	    VM_BESTFIT | VM_SLEEP);
+
+	if (ddi_soft_state_zalloc(inotify_softstate, minor) != DDI_SUCCESS) {
+		vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1);
+		mutex_exit(&inotify_lock);
+		return (ENOMEM);
+	}
+
+	state = ddi_get_soft_state(inotify_softstate, minor);
+	*devp = makedevice(major, minor);
+
+	crhold(cred_p);
+	state->ins_cred = cred_p;
+	state->ins_next = inotify_state;
+	inotify_state = state;
+
+	(void) snprintf(c, sizeof (c), "inotify_watchid_%d", minor);
+	state->ins_wds = vmem_create(c, (void *)1, UINT32_MAX, 1,
+	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
+
+	avl_create(&state->ins_bywd,
+	    (int(*)(const void *, const void *))inotify_watch_cmpwd,
+	    sizeof (inotify_watch_t),
+	    offsetof(inotify_watch_t, inw_bywd));
+
+	avl_create(&state->ins_byvp,
+	    (int(*)(const void *, const void *))inotify_watch_cmpvp,
+	    sizeof (inotify_watch_t),
+	    offsetof(inotify_watch_t, inw_byvp));
+
+	list_create(&state->ins_orphans, sizeof (inotify_watch_t),
+	    offsetof(inotify_watch_t, inw_orphan));
+
+	state->ins_maxwatches = inotify_maxwatches;
+	state->ins_maxevents = inotify_maxevents;
+
+	mutex_exit(&inotify_lock);
+
+	mutex_enter(&cpu_lock);
+
+	hdlr.cyh_func = inotify_clean;
+	hdlr.cyh_level = CY_LOW_LEVEL;
+	hdlr.cyh_arg = state;
+
+	when.cyt_when = 0;
+	when.cyt_interval = NANOSEC;
+
+	state->ins_cleaner = cyclic_add(&hdlr, &when);
+	mutex_exit(&cpu_lock);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_read(dev_t dev, uio_t *uio, cred_t *cr)
+{
+	inotify_state_t *state;
+	inotify_kevent_t *event;
+	minor_t minor = getminor(dev);
+	int err = 0, nevents = 0;
+	size_t len;
+
+	state = ddi_get_soft_state(inotify_softstate, minor);
+
+	mutex_enter(&state->ins_lock);
+
+	while (state->ins_head == NULL) {
+		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
+			mutex_exit(&state->ins_lock);
+			return (EAGAIN);
+		}
+
+		if (!cv_wait_sig_swap(&state->ins_cv, &state->ins_lock)) {
+			mutex_exit(&state->ins_lock);
+			return (EINTR);
+		}
+	}
+
+	/*
+	 * We have events and we have our lock; return as many as we can.
+	 */
+	while ((event = state->ins_head) != NULL) {
+		len = sizeof (event->ine_event) + event->ine_event.len;
+
+		if (uio->uio_resid < len) {
+			if (nevents == 0)
+				err = EINVAL;
+			break;
+		}
+
+		nevents++;
+
+		if ((err = uiomove(&event->ine_event, len, UIO_READ, uio)) != 0)
+			break;
+
+		VERIFY(state->ins_nevents > 0);
+		state->ins_nevents--;
+
+		VERIFY(state->ins_size > 0);
+		state->ins_size -= len;
+
+		if ((state->ins_head = event->ine_next) == NULL) {
+			VERIFY(event == state->ins_tail);
+			VERIFY(state->ins_nevents == 0);
+			state->ins_tail = NULL;
+		}
+
+		kmem_free(event, INOTIFY_EVENT_LENGTH(event));
+	}
+
+	mutex_exit(&state->ins_lock);
+
+	return (err);
+}
+
+/*ARGSUSED*/
+static int
+inotify_poll(dev_t dev, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp)
+{
+	inotify_state_t *state;
+	minor_t minor = getminor(dev);
+
+	state = ddi_get_soft_state(inotify_softstate, minor);
+
+	mutex_enter(&state->ins_lock);
+
+	if (state->ins_head != NULL) {
+		*reventsp = events & (POLLRDNORM | POLLIN);
+	} else {
+		*reventsp = 0;
+
+		if (!anyyet)
+			*phpp = &state->ins_pollhd;
+	}
+
+	mutex_exit(&state->ins_lock);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+static int
+inotify_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
+{
+	inotify_state_t *state;
+	minor_t minor = getminor(dev);
+	file_t *fp;
+	int rval;
+
+	state = ddi_get_soft_state(inotify_softstate, minor);
+
+	switch (cmd) {
+	case INOTIFYIOC_ADD_WATCH: {
+		inotify_addwatch_t addwatch;
+
+		if (copyin((void *)arg, &addwatch, sizeof (addwatch)) != 0)
+			return (EFAULT);
+
+		if ((fp = getf(addwatch.inaw_fd)) == NULL)
+			return (EBADF);
+
+		rval = inotify_add_watch(state, fp->f_vnode,
+		    addwatch.inaw_mask, rv);
+
+		releasef(addwatch.inaw_fd);
+		return (rval);
+	}
+
+	case INOTIFYIOC_ADD_CHILD: {
+		inotify_addchild_t addchild;
+		char name[MAXPATHLEN];
+
+		if (copyin((void *)arg, &addchild, sizeof (addchild)) != 0)
+			return (EFAULT);
+
+		if (copyinstr(addchild.inac_name, name, MAXPATHLEN, NULL) != 0)
+			return (EFAULT);
+
+		if ((fp = getf(addchild.inac_fd)) == NULL)
+			return (EBADF);
+
+		rval = inotify_add_child(state, fp->f_vnode, name);
+
+		releasef(addchild.inac_fd);
+		return (rval);
+	}
+
+	case INOTIFYIOC_RM_WATCH:
+		return (inotify_rm_watch(state, arg));
+
+	case INOTIFYIOC_ACTIVATE:
+		return (inotify_activate(state, arg));
+
+	case FIONREAD: {
+		int32_t size;
+
+		mutex_enter(&state->ins_lock);
+		size = state->ins_size;
+		mutex_exit(&state->ins_lock);
+
+		if (copyout(&size, (void *)arg, sizeof (size)) != 0)
+			return (EFAULT);
+
+		return (0);
+	}
+
+	default:
+		break;
+	}
+
+	return (ENOTTY);
+}
+
+/*ARGSUSED*/
+static int
+inotify_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
+{
+	inotify_state_t *state, **sp;
+	inotify_watch_t *watch, *zombies;
+	inotify_kevent_t *event;
+	minor_t minor = getminor(dev);
+
+	state = ddi_get_soft_state(inotify_softstate, minor);
+
+	if (state->ins_pollhd.ph_list != NULL) {
+		pollwakeup(&state->ins_pollhd, POLLERR);
+		pollhead_clean(&state->ins_pollhd);
+	}
+
+	mutex_enter(&state->ins_lock);
+
+	/*
+	 * First, destroy all of our
watches. + */ + while ((watch = avl_first(&state->ins_bywd)) != NULL) + inotify_watch_remove(state, watch); + + /* + * And now destroy our event queue. + */ + while ((event = state->ins_head) != NULL) { + state->ins_head = event->ine_next; + kmem_free(event, INOTIFY_EVENT_LENGTH(event)); + } + + zombies = state->ins_zombies; + state->ins_zombies = NULL; + mutex_exit(&state->ins_lock); + + /* + * Now that our state lock is dropped, we can synchronously wait on + * any zombies. + */ + while ((watch = zombies) != NULL) { + zombies = zombies->inw_parent; + + mutex_enter(&watch->inw_lock); + + while (watch->inw_refcnt > 1) + cv_wait(&watch->inw_cv, &watch->inw_lock); + + inotify_watch_destroy(watch); + } + + mutex_enter(&cpu_lock); + cyclic_remove(state->ins_cleaner); + mutex_exit(&cpu_lock); + + mutex_enter(&inotify_lock); + + /* + * Remove our state from our global list, and release our hold on + * the cred. + */ + for (sp = &inotify_state; *sp != state; sp = &((*sp)->ins_next)) + VERIFY(*sp != NULL); + + *sp = (*sp)->ins_next; + crfree(state->ins_cred); + + ddi_soft_state_free(inotify_softstate, minor); + vmem_free(inotify_minor, (void *)(uintptr_t)minor, 1); + + mutex_exit(&inotify_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +inotify_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + mutex_enter(&inotify_lock); + + if (ddi_soft_state_init(&inotify_softstate, + sizeof (inotify_state_t), 0) != 0) { + cmn_err(CE_NOTE, "/dev/inotify failed to create soft state"); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "inotify", S_IFCHR, + INOTIFYMNRN_INOTIFY, DDI_PSEUDO, NULL) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create minor node"); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + if (fem_create("inotify_fem", + inotify_vnodesrc_template, &inotify_femp) != 0) { + cmn_err(CE_NOTE, "/dev/inotify couldn't create FEM state"); + ddi_remove_minor_node(devi, NULL); + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + inotify_devi = devi; + + inotify_minor = vmem_create("inotify_minor", (void *)INOTIFYMNRN_CLONE, + UINT32_MAX - INOTIFYMNRN_CLONE, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_SUSPEND: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&inotify_lock); + fem_free(inotify_femp); + vmem_destroy(inotify_minor); + + ddi_remove_minor_node(inotify_devi, NULL); + inotify_devi = NULL; + + ddi_soft_state_fini(&inotify_softstate); + mutex_exit(&inotify_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +inotify_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)inotify_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} + +static struct cb_ops inotify_cb_ops = { + inotify_open, /* open */ + inotify_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + inotify_read, /* read */ + nodev, /* write */ + inotify_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* 
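inotify_open() arms a once-per-second CY_LOW_LEVEL cyclic for the cleaner and inotify_close() above removes it; the cyclic framework requires cpu_lock around both operations. The same pattern in isolation, with my_tick() and my_state_t as hypothetical stand-ins:

#include <sys/cyclic.h>

static void
my_tick(void *arg)
{
	my_state_t *msp = arg;	/* fires once a second at low level */

	/* ... periodic housekeeping ... */
}

static cyclic_id_t
my_cyclic_start(my_state_t *msp)
{
	cyc_handler_t hdlr = { my_tick, msp, CY_LOW_LEVEL };
	cyc_time_t when = { 0, NANOSEC };	/* start now, 1 Hz */
	cyclic_id_t id;

	mutex_enter(&cpu_lock);			/* required by cyclic_add() */
	id = cyclic_add(&hdlr, &when);
	mutex_exit(&cpu_lock);

	return (id);	/* teardown: cpu_lock + cyclic_remove(id) */
}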
segmap */ + inotify_poll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops inotify_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + inotify_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + inotify_attach, /* attach */ + inotify_detach, /* detach */ + nodev, /* reset */ + &inotify_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "inotify support", /* name of module */ + &inotify_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/io/inotify.conf b/usr/src/uts/common/io/inotify.conf new file mode 100644 index 0000000000..ce9da6180f --- /dev/null +++ b/usr/src/uts/common/io/inotify.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014 Joyent, Inc. All rights reserved. +# + +name="inotify" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/ixgbe/ixgbe_main.c b/usr/src/uts/common/io/ixgbe/ixgbe_main.c index 848e3470c7..117b7da16a 100644 --- a/usr/src/uts/common/io/ixgbe/ixgbe_main.c +++ b/usr/src/uts/common/io/ixgbe/ixgbe_main.c @@ -1792,6 +1792,7 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, void *arg1, void *arg2) { ixgbe_t *ixgbe = (ixgbe_t *)arg1; + int prev = ixgbe->intr_cnt; switch (cbaction) { /* IRM callback */ @@ -1805,7 +1806,8 @@ ixgbe_cbfunc(dev_info_t *dip, ddi_cb_action_t cbaction, void *cbarg, if (ixgbe_intr_adjust(ixgbe, cbaction, count) != DDI_SUCCESS) { ixgbe_error(ixgbe, - "IRM CB: Failed to adjust interrupts"); + "IRM CB: Failed to adjust interrupts [%d %d %d]", + cbaction, count, prev); goto cb_fail; } break; diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c index 49ca6f0475..25da45be39 100644 --- a/usr/src/uts/common/io/ksocket/ksocket.c +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -22,6 +22,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #include <sys/file.h> @@ -820,7 +821,7 @@ ksocket_spoll(ksocket_t ks, int timo, short events, short *revents, if (error != 0 || *revents != 0) break; - if (pcp->pc_flag & T_POLLWAKE) + if (pcp->pc_flag & PC_POLLWAKE) continue; if (timo == -1) { @@ -931,3 +932,15 @@ ksocket_rele(ksocket_t ks) cv_signal(&so->so_closing_cv); } } + +int +ksocket_krecv_set(ksocket_t ks, ksocket_krecv_f cb, void *arg) +{ + return (so_krecv_set(KSTOSO(ks), (so_krecv_f)cb, arg)); +} + +void +ksocket_krecv_unblock(ksocket_t ks) +{ + return (so_krecv_unblock(KSTOSO(ks))); +} diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h index ac5251540f..516a68d358 100644 --- a/usr/src/uts/common/io/ksocket/ksocket_impl.h +++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h @@ -22,11 +22,17 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ #ifndef _INET_KSOCKET_KSOCKET_IMPL_H #define _INET_KSOCKET_KSOCKET_IMPL_H +/* + * Note that if this relationship ever changes, the logic in ksocket_krecv_set + * must be updated and we must maintain local state about this on whatever the + * new ksocket object is. + */ #define KSTOSO(ks) ((struct sonode *)(ks)) #define SOTOKS(so) ((ksocket_t)(uintptr_t)(so)) diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 98ec8b4366..99bdbda0cd 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -3101,6 +3101,9 @@ mac_prop_check_size(mac_prop_id_t id, uint_t valsize, boolean_t is_range) case MAC_PROP_WL_MLME: minsize = sizeof (wl_mlme_t); break; + case MAC_PROP_VN_PROMISC_FILTERED: + minsize = sizeof (boolean_t); + break; } return (valsize >= minsize); diff --git a/usr/src/uts/common/io/mac/mac_client.c b/usr/src/uts/common/io/mac/mac_client.c index f4dd8aab25..d719899901 100644 --- a/usr/src/uts/common/io/mac/mac_client.c +++ b/usr/src/uts/common/io/mac/mac_client.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* @@ -3256,7 +3256,8 @@ mac_promisc_add(mac_client_handle_t mch, mac_client_promisc_type_t type, } if ((mcip->mci_state_flags & MCIS_IS_VNIC) && - type == MAC_CLIENT_PROMISC_ALL) { + type == MAC_CLIENT_PROMISC_ALL && + (mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED)) { /* * The function is being invoked by the upper MAC client * of a VNIC. The VNIC should only see the traffic @@ -4135,16 +4136,15 @@ mac_info_get(const char *name, mac_info_t *minfop) /* * To get the capabilities that MAC layer cares about, such as rings, factory * mac address, vnic or not, it should directly invoke this function. If the - * link is part of a bridge, then the only "capability" it has is the inability - * to do zero copy. + * link is part of a bridge, then the link is unable to do zero copy. 
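ksocket_krecv_set() is the piece that lets a kernel consumer bypass uiomove-based reception entirely: once registered, inbound data is handed straight to the callback. A sketch of a consumer, assuming the ksocket_krecv_f callback mirrors so_krecv_f and receives the socket, the message chain, its size, flags, and the registered argument (mysvc_t and mysvc_enqueue() are hypothetical):

#include <sys/ksocket.h>
#include <sys/stream.h>

static boolean_t
mysvc_krecv(ksocket_t ks, mblk_t *mp, size_t sz, int flags, void *arg)
{
	mysvc_t *svc = arg;

	mysvc_enqueue(svc, mp);	/* we now own the chain; no copyout */
	return (B_TRUE);
}

	...
	if (ksocket_krecv_set(svc->svc_sock, mysvc_krecv, svc) != 0)
		return (EIO);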
*/ boolean_t i_mac_capab_get(mac_handle_t mh, mac_capab_t cap, void *cap_data) { mac_impl_t *mip = (mac_impl_t *)mh; - if (mip->mi_bridge_link != NULL) - return (cap == MAC_CAPAB_NO_ZCOPY); + if (mip->mi_bridge_link != NULL && cap == MAC_CAPAB_NO_ZCOPY) + return (B_TRUE); else if (mip->mi_callbacks->mc_callbacks & MC_GETCAPAB) return (mip->mi_getcapab(mip->mi_driver, cap, cap_data)); else @@ -4323,7 +4323,13 @@ mac_addr_len(mac_handle_t mh) boolean_t mac_is_vnic(mac_handle_t mh) { - return (((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC); + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_VNIC) != 0); +} + +boolean_t +mac_is_overlay(mac_handle_t mh) +{ + return ((((mac_impl_t *)mh)->mi_state_flags & MIS_IS_OVERLAY) != 0); } mac_handle_t @@ -5537,3 +5543,23 @@ mac_client_set_rings(mac_client_handle_t mch, int rxrings, int txrings) mrp->mrp_ntxrings = txrings; } } + +boolean_t +mac_get_promisc_filtered(mac_client_handle_t mch) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + return (mcip->mci_protect_flags & MPT_FLAG_PROMISC_FILTERED); +} + +void +mac_set_promisc_filtered(mac_client_handle_t mch, boolean_t enable) +{ + mac_client_impl_t *mcip = (mac_client_impl_t *)mch; + + ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip)); + if (enable) + mcip->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED; + else + mcip->mci_protect_flags &= ~MPT_FLAG_PROMISC_FILTERED; +} diff --git a/usr/src/uts/common/io/mac/mac_datapath_setup.c b/usr/src/uts/common/io/mac/mac_datapath_setup.c index 5ca673ea6e..b0f184b161 100644 --- a/usr/src/uts/common/io/mac/mac_datapath_setup.c +++ b/usr/src/uts/common/io/mac/mac_datapath_setup.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -603,6 +604,7 @@ mac_srs_cpu_setup(cpu_setup_t what, int id, void *arg) * * TODO: Cleanup and tighten some of the assumptions. */ +boolean_t mac_check_overlay = B_TRUE; boolean_t mac_use_bw_heuristic = B_TRUE; static int mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) @@ -610,6 +612,7 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) uint64_t cpu_speed, bw = 0; int srings = 0; boolean_t bw_enabled = B_FALSE; + mac_client_impl_t *mcip = flent->fe_mcip; ASSERT(!(flent->fe_type & FLOW_USER)); if (flent->fe_resource_props.mrp_mask & MRP_MAXBW && @@ -637,7 +640,16 @@ mac_compute_soft_ring_count(flow_entry_t *flent, int rx_srs_cnt, int maxcpus) */ if (mac_soft_ring_enable) srings = srings * 2; + } else if (mac_check_overlay == B_TRUE && + (mcip->mci_state_flags & MCIS_IS_VNIC) != 0) { + /* Is this a VNIC on an overlay? 
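mac_set_promisc_filtered() asserts that the MAC perimeter is held, so a caller outside mac itself must enter the perimeter first. A sketch, where mh and mch are handles the caller already owns:

	mac_perim_handle_t mph;

	mac_perim_enter_by_mh(mh, &mph);
	if (mac_get_promisc_filtered(mch))
		mac_set_promisc_filtered(mch, B_FALSE);
	mac_perim_exit(mph);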
*/ + mac_handle_t mh = (mac_handle_t)mcip->mci_mip; + if (mac_is_overlay(mh) == B_TRUE) { + srings = mac_rx_soft_ring_10gig_count; + } } + + } else { /* * Soft ring computation using CPU speed and specified diff --git a/usr/src/uts/common/io/mac/mac_protect.c b/usr/src/uts/common/io/mac/mac_protect.c index cd7fcb9a5d..e341f3d562 100644 --- a/usr/src/uts/common/io/mac/mac_protect.c +++ b/usr/src/uts/common/io/mac/mac_protect.c @@ -2297,6 +2297,9 @@ mac_protect_init(mac_client_impl_t *mcip) sizeof (dhcpv6_cid_t), offsetof(dhcpv6_cid_t, dc_node)); avl_create(&mcip->mci_v6_dyn_ip, compare_dhcpv6_ip, sizeof (dhcpv6_addr_t), offsetof(dhcpv6_addr_t, da_node)); + + if (mcip->mci_state_flags & MCIS_IS_VNIC) + mcip->mci_protect_flags |= MPT_FLAG_PROMISC_FILTERED; } void diff --git a/usr/src/uts/common/io/mac/mac_provider.c b/usr/src/uts/common/io/mac/mac_provider.c index 57d1996d84..a17795be22 100644 --- a/usr/src/uts/common/io/mac/mac_provider.c +++ b/usr/src/uts/common/io/mac/mac_provider.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #include <sys/types.h> @@ -350,6 +351,9 @@ mac_register(mac_register_t *mregp, mac_handle_t *mhp) if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL)) mip->mi_state_flags |= MIS_IS_AGGR; + if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL)) + mip->mi_state_flags |= MIS_IS_OVERLAY; + mac_addr_factory_init(mip); /* diff --git a/usr/src/uts/common/io/mac/mac_sched.c b/usr/src/uts/common/io/mac/mac_sched.c index eb179b07c7..6fcdb468c2 100644 --- a/usr/src/uts/common/io/mac/mac_sched.c +++ b/usr/src/uts/common/io/mac/mac_sched.c @@ -1370,7 +1370,7 @@ int mac_srs_worker_wakeup_ticks = 0; * said, the constant is left as a static variable to allow it to be * dynamically tuned in the field if and as needed. */ -static uintptr_t mac_rx_srs_stack_needed = 10240; +static uintptr_t mac_rx_srs_stack_needed = 13312; static uint_t mac_rx_srs_stack_toodeep; #ifndef STACK_GROWTH_DOWN diff --git a/usr/src/uts/common/io/mac/mac_stat.c b/usr/src/uts/common/io/mac/mac_stat.c index 31972f94d8..c1a5c9c069 100644 --- a/usr/src/uts/common/io/mac/mac_stat.c +++ b/usr/src/uts/common/io/mac/mac_stat.c @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2013 Joyent, Inc. All rights reserved. */ /* @@ -390,8 +391,8 @@ i_mac_stat_create(void *handle, const char *modname, const char *statname, kstat_t *ksp; kstat_named_t *knp; - ksp = kstat_create(modname, 0, statname, "net", - KSTAT_TYPE_NAMED, count, 0); + ksp = kstat_create_zone(modname, 0, statname, "net", + KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return (NULL); @@ -948,9 +949,9 @@ mac_driver_stat_create(mac_impl_t *mip) major_t major = getmajor(mip->mi_phy_dev); count = MAC_MOD_NKSTAT + MAC_NKSTAT + mip->mi_type->mt_statcount; - ksp = kstat_create((const char *)ddi_major_to_name(major), + ksp = kstat_create_zone((const char *)ddi_major_to_name(major), getminor(mip->mi_phy_dev) - 1, MAC_KSTAT_NAME, - MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0); + MAC_KSTAT_CLASS, KSTAT_TYPE_NAMED, count, 0, getzoneid()); if (ksp == NULL) return; diff --git a/usr/src/uts/common/io/mr_sas/mr_sas.conf b/usr/src/uts/common/io/mr_sas/mr_sas.conf index cfda434e23..6c585c6a42 100644 --- a/usr/src/uts/common/io/mr_sas/mr_sas.conf +++ b/usr/src/uts/common/io/mr_sas/mr_sas.conf @@ -13,3 +13,11 @@ # Fast-Path specific flag. Default is "yes". 
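The kstat_create_zone() conversion above publishes these statistics in the caller's zone rather than unconditionally in the global zone, so a non-global zone can observe its own links. The zone-scoped creation pattern, reduced to a single named counter:

	kstat_t *ksp;
	kstat_named_t *knp;

	ksp = kstat_create_zone("mymod", 0, "mystats", "net",
	    KSTAT_TYPE_NAMED, 1, 0, getzoneid());
	if (ksp != NULL) {
		knp = KSTAT_NAMED_PTR(ksp);
		kstat_named_init(&knp[0], "rbytes", KSTAT_DATA_UINT64);
		kstat_install(ksp);
	}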
# mrsas-enable-fp="yes"; +flow_control="dmult" queue="qsort" tape="sctp"; + +# MSI specific flag. To enable MSI modify the flag value to "yes" +mrsas-enable-msi="yes"; + +# Fast-Path specific flag. To enable Fast-Path modify the flag value to "yes" +mrsas-enable-fp="yes"; + diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE new file mode 100644 index 0000000000..187088ff34 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2014, Thales UK Limited + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip new file mode 100644 index 0000000000..cde8b65b37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +NFAST CRYPTO ACCELERATOR DRIVER diff --git a/usr/src/uts/common/io/nfp/autoversion.h b/usr/src/uts/common/io/nfp/autoversion.h new file mode 100644 index 0000000000..b9021942b2 --- /dev/null +++ b/usr/src/uts/common/io/nfp/autoversion.h @@ -0,0 +1,21 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +/* AUTOGENERATED - DO NOT EDIT */ +#ifndef AUTOVERSION_H +#define AUTOVERSION_H + +#define VERSION_RELEASEMAJOR 2 +#define VERSION_RELEASEMINOR 26 +#define VERSION_RELEASEPATCH 40 +#define VERSION_NO "2.26.40cam999" +#define VERSION_COMPNAME "nfdrv" + +#endif diff --git a/usr/src/uts/common/io/nfp/drvlist.c b/usr/src/uts/common/io/nfp/drvlist.c new file mode 100644 index 0000000000..a04b1fd5b0 --- /dev/null +++ b/usr/src/uts/common/io/nfp/drvlist.c @@ -0,0 +1,19 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#include "nfp_common.h" +#include "nfp_cmd.h" + +const nfpcmd_dev *nfp_drvlist[] = { + &i21285_cmddev, + &i21555_cmddev, + NULL +}; + diff --git a/usr/src/uts/common/io/nfp/hostif.c b/usr/src/uts/common/io/nfp/hostif.c new file mode 100644 index 0000000000..684be703ea --- /dev/null +++ b/usr/src/uts/common/io/nfp/hostif.c @@ -0,0 +1,1192 @@ +/* + +hostif.c: nFast PCI driver for Solaris 2.5, 2.6, 2.7 and 2.8 + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All 
rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +??/??/2001 jsh added support for solaris 2.8 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv +12/02/2002 jsh added high level interrupt support + +*/ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> + +#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" + +#include "nfp.h" + +/* mapped memory attributes, no-swap endianess (done in higher level) */ +static struct ddi_device_acc_attr nosw_attr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC +}; + +/* dma attributes */ +static ddi_dma_attr_t dma_attrs = { + DMA_ATTR_V0, /* version number */ + (uint64_t)0x0, /* low address */ + (uint64_t)0xffffffff, /* high address */ + (uint64_t)0xffffff, /* DMA counter max */ + (uint64_t)0x1, /* alignment */ + 0x0c, /* burst sizes */ + 0x1, /* minimum transfer size */ + (uint64_t)0x3ffffff, /* maximum transfer size */ + (uint64_t)0x7fff, /* maximum segment size */ + 1, /* no scatter/gather lists */ + 1, /* granularity */ + 0 /* DMA flags */ +}; + +/* + * Debug message control + * Debug Levels: + * 0 = no messages + * 1 = Errors + * 2 = Subroutine calls & control flow + * 3 = I/O Data (verbose!) + * Can be set with adb or in the /etc/system file with + * "set nfp:nfp_debug=<value>" + */ + +int nfp_debug= 1; + +static void *state_head; /* opaque handle top of state structs */ + +static int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp); +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp); +static int nfp_release_dev( dev_info_t *dip ); + +static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp); +static int nfp_strategy(struct buf *bp); + +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp); +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp); + +static void nfp_wrtimeout (void *pdev); +static void nfp_rdtimeout (void *pdev); + +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result); +static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); + +static void nfp_read_complete_final(nfp_dev *pdev, int ok); +static void nfp_write_complete_final(nfp_dev *pdev, int ok); + +/* nfp file ops --------------------------------------------------- */ + +static struct cb_ops nfp_cb_ops = { + nfp_open, + nfp_close, + nodev, /* no nfp_strategy */ + nodev, /* no print routine */ + nodev, /* no dump routine */ + nfp_read, + nfp_write, + nfp_ioctl, + nodev, /* no devmap routine */ + nodev, /* no mmap routine */ + nodev, /* no segmap routine */ + nfp_chpoll, + ddi_prop_op, + 0, /* not a STREAMS driver, no cb_str routine */ + D_NEW | D_MP | EXTRA_CB_FLAGS, /* must be safe for multi-thread/multi-processor */ + CB_REV, + nodev, /* aread */ + nodev /* awrite */ +}; + +static struct dev_ops nfp_ops = { + DEVO_REV, /* DEVO_REV indicated by manual */ + 0, /* 
device reference count */ + nfp_getinfo, + nulldev, /* identify */ + nulldev, /* probe */ + nfp_attach, + nfp_detach, + nodev, /* device reset routine */ + &nfp_cb_ops, + (struct bus_ops *)0, /* bus operations */ +}; + +extern struct mod_ops mod_driverops; +static struct modldrv modldrv = { + &mod_driverops, + NFP_DRVNAME, + &nfp_ops, +}; + +static struct modlinkage modlinkage = { + MODREV_1, /* MODREV_1 indicated by manual */ + (void *)&modldrv, + NULL, /* termination of list of linkage structures */ +}; + +/* interface resource allocation */ + +int nfp_alloc_pci_push( nfp_dev *pdev ) { + /* allocate resources needed for PCI Push, + * if not already allocated. + * return True if successful + */ + nfp_err ret; + uint_t cookie_count; + size_t real_length; + + if(!pdev->read_buf) { + /* allocate read buffer */ + pdev->read_buf = kmem_zalloc( NFP_READBUF_SIZE, KM_NOSLEEP ); + } + if(!pdev->read_buf) { + nfp_log( NFP_DBG1, "nfp_attach: kmem_zalloc read buffer failed"); + pdev->read_buf = NULL; + return 0; + } + + if(!pdev->rd_dma_ok) { + /* allocate dma handle for read buffer */ + ret = ddi_dma_alloc_handle( pdev->dip, + &dma_attrs, + DDI_DMA_DONTWAIT, + NULL, + &pdev->read_dma_handle ); + if( ret != DDI_SUCCESS ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_alloc_handle failed (%d)", + ret ); + return 0; + } + + /* Allocate the memory for dma transfers */ + ret = ddi_dma_mem_alloc(pdev->read_dma_handle, NFP_READBUF_SIZE, &nosw_attr, + DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL, + (caddr_t*)&pdev->read_buf, &real_length, &pdev->acchandle); + if (ret != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_alloc_pci_push: ddi_dma_mem_alloc failed (%d)", ret); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + + ret = ddi_dma_addr_bind_handle( pdev->read_dma_handle, + NULL, /* kernel address space */ + (caddr_t)pdev->read_buf, real_length, + DDI_DMA_READ | DDI_DMA_CONSISTENT, /* dma flags */ + DDI_DMA_DONTWAIT, NULL, + &pdev->read_dma_cookie, &cookie_count ); + if( ret != DDI_DMA_MAPPED ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: ddi_dma_addr_bind_handle failed (%d)", + ret); + ddi_dma_mem_free(&pdev->acchandle); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + if( cookie_count > 1 ) { + nfp_log( NFP_DBG1, + "nfp_alloc_pci_push: error:" + " ddi_dma_addr_bind_handle wants %d transfers", + cookie_count); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + return 0; + } + pdev->rd_dma_ok = 1; + } + return pdev->rd_dma_ok; +} + +void nfp_free_pci_push( nfp_dev *pdev ) { + /* free resources allocated to PCI Push */ + if( pdev->rd_dma_ok ) { + (void) ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL); + ddi_dma_mem_free(&pdev->acchandle); + (void) ddi_dma_unbind_handle( pdev->read_dma_handle ); + ddi_dma_free_handle( &pdev->read_dma_handle ); + pdev->rd_dma_ok = 0; + } + if( pdev->read_buf ) { + kmem_free( pdev->read_buf, NFP_READBUF_SIZE ); + pdev->read_buf = NULL; + } +} + +/* include definition of nfp_set_ifvers() */ +#define nfp_ifvers NFDEV_IF_PCI_PUSH +#include "nfp_ifvers.c" +#undef nfp_ifvers + +/*--------------------*/ +/* nfp_isr */ +/*--------------------*/ + +static u_int nfp_isr( char *pdev_in ) { + /* LINTED: alignment */ + nfp_dev *pdev= (nfp_dev *)pdev_in; + nfp_err ne; + int handled; + + nfp_log( NFP_DBG3, "nfp_isr: entered"); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_isr: cannot find dev"); + return DDI_INTR_UNCLAIMED; + } + + /* The isr needs 
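nfp_alloc_pci_push() above follows the canonical three-step DDI DMA sequence (allocate a handle, allocate suitable memory, bind it), with each failure path unwinding exactly the steps already taken and nfp_free_pci_push() unwinding all of them in reverse. Condensed into one helper, reusing the dma_attrs and nosw_attr declared earlier; a sketch, not a drop-in replacement:

static int
push_buf_setup(dev_info_t *dip, ddi_dma_handle_t *hp, ddi_acc_handle_t *ahp,
    caddr_t *kvap, ddi_dma_cookie_t *ckp)
{
	size_t real;
	uint_t nck;

	if (ddi_dma_alloc_handle(dip, &dma_attrs, DDI_DMA_DONTWAIT, NULL,
	    hp) != DDI_SUCCESS)
		return (0);

	if (ddi_dma_mem_alloc(*hp, NFP_READBUF_SIZE, &nosw_attr,
	    DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL, kvap, &real,
	    ahp) != DDI_SUCCESS) {
		ddi_dma_free_handle(hp);
		return (0);
	}

	if (ddi_dma_addr_bind_handle(*hp, NULL, *kvap, real,
	    DDI_DMA_READ | DDI_DMA_CONSISTENT, DDI_DMA_DONTWAIT, NULL,
	    ckp, &nck) != DDI_DMA_MAPPED) {
		ddi_dma_mem_free(ahp);
		ddi_dma_free_handle(hp);
		return (0);
	}

	if (nck > 1) {		/* device needs one contiguous window */
		(void) ddi_dma_unbind_handle(*hp);
		ddi_dma_mem_free(ahp);
		ddi_dma_free_handle(hp);
		return (0);
	}

	return (1);
}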
to be mutex'ed - an SMP can call us while we're still
+   * running!
+   */
+  mutex_enter(&pdev->low_mutex);
+  ne= pdev->cmddev->isr( pdev->common.cmdctx, &handled );
+  mutex_exit(&pdev->low_mutex);
+
+  if( !ne && handled )
+    return DDI_INTR_CLAIMED;
+  if (ne)
+    nfp_log( NFP_DBG1, "nfp_isr: failed");
+  else
+    nfp_log( NFP_DBG3, "nfp_isr: unclaimed");
+  return DDI_INTR_UNCLAIMED;
+}
+
+static u_int nfp_soft_isr( char *pdev_in ) {
+  /* LINTED: alignment */
+  nfp_dev *pdev= (nfp_dev *)pdev_in;
+  int rd, wr;
+
+  nfp_log( NFP_DBG3, "nfp_soft_isr: entered");
+
+  if( !pdev ) {
+    nfp_log( NFP_DBG1, "nfp_soft_isr: cannot find dev");
+    return DDI_INTR_UNCLAIMED;
+  }
+  rd= wr= 0;
+
+  mutex_enter(&pdev->high_mutex);
+  if(pdev->high_read) {
+    pdev->high_read= 0;
+    rd= 1;
+  }
+  if(pdev->high_write) {
+    pdev->high_write= 0;
+    wr= 1;
+  }
+  mutex_exit(&pdev->high_mutex);
+
+  if(rd) {
+    nfp_log( NFP_DBG3, "nfp_soft_isr: read done");
+    nfp_read_complete_final(pdev, pdev->rd_ok);
+  }
+  if(wr) {
+    nfp_log( NFP_DBG3, "nfp_soft_isr: write done");
+    nfp_write_complete_final(pdev, pdev->wr_ok);
+  }
+  if( rd || wr )
+    return DDI_INTR_CLAIMED;
+
+  nfp_log( NFP_DBG2, "nfp_soft_isr: unclaimed");
+  return DDI_INTR_UNCLAIMED;
+}
+
+
+/*-------------------------*/
+/* nfp_read */
+/*-------------------------*/
+
+void nfp_read_complete(nfp_dev *pdev, int ok) {
+  nfp_log( NFP_DBG2,"nfp_read_complete: entering");
+
+  if(pdev->high_intr) {
+    nfp_log(NFP_DBG2, "nfp_read_complete: high_intr");
+    mutex_enter(&pdev->high_mutex);
+    nfp_log(NFP_DBG3, "nfp_read_complete: high_mutex entered");
+    if(pdev->high_read)
+      nfp_log(NFP_DBG1, "nfp_read_complete: high_read already set!");
+    pdev->high_read= 1;
+    pdev->rd_ok= ok;
+    nfp_log(NFP_DBG3, "nfp_read_complete: exiting high_mutex");
+    mutex_exit(&pdev->high_mutex);
+    ddi_trigger_softintr(pdev->soft_int_id);
+  } else
+    nfp_read_complete_final( pdev, ok );
+  nfp_log( NFP_DBG2,"nfp_read_complete: exiting");
+}
+
+static void nfp_read_complete_final(nfp_dev *pdev, int ok) {
+  nfp_log( NFP_DBG2,"nfp_read_complete_final: entering");
+  if(pdev->rdtimeout)
+    (void) untimeout(pdev->rdtimeout);
+  if(!pdev->rd_outstanding) {
+    nfp_log( NFP_DBG1,"nfp_read_complete_final: !pdev->rd_outstanding");
+  }
+  nfp_log( NFP_DBG2,"nfp_read_complete_final: pdev->rd_outstanding=0, ok %d", ok);
+  mutex_enter(&pdev->isr_mutex);
+  pdev->rd_outstanding= 0;
+  pdev->rd_ready= 1;
+  pdev->rd_ok= ok;
+  cv_broadcast(&pdev->rd_cv);
+  mutex_exit(&pdev->isr_mutex);
+  pollwakeup (&pdev->pollhead, POLLRDNORM);
+  nfp_log( NFP_DBG2,"nfp_read_complete_final: exiting");
+}
+
+static void nfp_rdtimeout( void *pdev_in )
+{
+  nfp_dev *pdev= (nfp_dev *)pdev_in;
+
+  nfp_log( NFP_DBG1, "nfp_rdtimeout: read timed out");
+
+  if (!pdev) {
+    nfp_log( NFP_DBG1, "nfp_rdtimeout: NULL pdev." );
+    return;
+  }
+  pdev->rdtimeout= 0;
+  nfp_read_complete_final(pdev, 0);
+}
+
+/* ARGSUSED */
+static int nfp_read(dev_t dev, struct uio *uiop, cred_t *credp) {
+  int ret;
+  nfp_log( NFP_DBG2, "nfp_read: entered" );
+  if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) {
+    nfp_log( NFP_DBG1, "nfp_read: unable to get nfp_dev");
+    return (ENODEV);
+  }
+  nfp_log( NFP_DBG2, "nfp_read: about to physio."
);
+  ret = physio(nfp_strategy, (struct buf *)0, dev, B_READ, minphys, uiop );
+  if(ret)
+    nfp_log( NFP_DBG1, "nfp_read: physio returned %x.", ret );
+  return ret;
+}
+
+/*-------------------------*/
+/* nfp_write */
+/*-------------------------*/
+
+void nfp_write_complete( nfp_dev *pdev, int ok) {
+  nfp_log( NFP_DBG2,"nfp_write_complete: entering");
+
+  if(pdev->high_intr) {
+    mutex_enter(&pdev->high_mutex);
+    if(pdev->high_write)
+      nfp_log(NFP_DBG1, "nfp_write_complete: high_write already set!");
+    pdev->high_write= 1;
+    pdev->wr_ok= ok;
+    mutex_exit(&pdev->high_mutex);
+    ddi_trigger_softintr(pdev->soft_int_id);
+  } else
+    nfp_write_complete_final( pdev, ok );
+  nfp_log( NFP_DBG2,"nfp_write_complete: exiting");
+}
+
+static void nfp_write_complete_final( nfp_dev *pdev, int ok) {
+  struct buf *local_wr_bp;
+  nfp_log( NFP_DBG2,"nfp_write_complete_final: entering");
+  if(pdev->wrtimeout)
+    (void) untimeout(pdev->wrtimeout);
+
+  if (!pdev->wr_bp) {
+    nfp_log( NFP_DBG2, "nfp_write_complete_final: write: wr_bp == NULL." );
+    return;
+  }
+
+  bp_mapout(pdev->wr_bp);
+  pdev->wr_bp->b_resid = ok ? 0 : pdev->wr_bp->b_bcount;
+  /* Make sure we set wr_ready before calling biodone to avoid a race */
+  pdev->wr_ready = 1;
+  bioerror(pdev->wr_bp, ok ? 0 : ENXIO);
+  local_wr_bp = pdev->wr_bp;
+  pdev->wr_bp = 0;
+  biodone(local_wr_bp);
+  nfp_log( NFP_DBG2, "nfp_write_complete_final: done");
+  pollwakeup (&pdev->pollhead, POLLWRNORM);
+
+  nfp_log( NFP_DBG2, "nfp_write_complete_final: leaving");
+}
+
+static void nfp_wrtimeout( void *pdev_in )
+{
+  nfp_dev *pdev= (nfp_dev *)pdev_in;
+
+  nfp_log( NFP_DBG1, "nfp_wrtimeout: write timed out");
+
+  if (!pdev) {
+    nfp_log( NFP_DBG1, "nfp_wrtimeout: NULL pdev." );
+    return;
+  }
+  pdev->wrtimeout= 0;
+  nfp_write_complete_final(pdev, 0);
+}
+
+/* ARGSUSED */
+static int nfp_write(dev_t dev, struct uio *uiop, cred_t *credp) {
+  int ret;
+  nfp_log( NFP_DBG2, "nfp_write: entered." );
+  if (ddi_get_soft_state(state_head, getminor(dev)) == NULL) {
+    nfp_log( NFP_DBG1, "nfp_write: unable to get nfp_dev.");
+    return (ENODEV);
+  }
+  nfp_log( NFP_DBG2, "nfp_write: about to physio." );
+  ret = physio(nfp_strategy, (struct buf *)0, dev, B_WRITE, minphys, uiop );
+  if(ret)
+    nfp_log( NFP_DBG1, "nfp_write: physio returned %x.", ret );
+  return ret;
+}
+
+/*-------------------------*/
+/* nfp_strategy */
+/*-------------------------*/
+
+#define NFP_STRAT_ERR(thebp,err,txt) \
+  nfp_log( NFP_DBG1, "nfp_strategy: " txt ".\n"); \
+  (thebp)->b_resid = (thebp)->b_bcount; \
+  bioerror ((thebp), err); \
+  biodone ((thebp));
+
+static int nfp_strategy(struct buf *bp) {
+  register struct nfp_dev *pdev;
+  nfp_err ne;
+
+  nfp_log( NFP_DBG2, "nfp_strategy: entered."
); + if (!(pdev = ddi_get_soft_state(state_head, getminor(bp->b_edev)))) { + NFP_STRAT_ERR (bp, ENXIO, "unable to get nfp_dev"); + return (0); + } + + if (bp->b_flags & B_READ) { + int count; + /* read */ + if (!pdev->rd_ready) { + NFP_STRAT_ERR (bp,ENXIO,"read called when not ready"); + return (0); + } + pdev->rd_ready=0; + pdev->rd_pending = 0; + if( !pdev->rd_ok) { + NFP_STRAT_ERR (bp,ENXIO,"read failed"); + return (0); + } + /* copy data from module */ + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + nfp_log( NFP_DBG3, "nfp_strategy: copying kernel read buffer"); + if( ddi_dma_sync(pdev->read_dma_handle,0,0,DDI_DMA_SYNC_FORKERNEL) != DDI_SUCCESS ) + { + NFP_STRAT_ERR(bp,ENXIO,"ddi_dma_sync(read_dma_handle) failed"); + return (0); + } + /* LINTED: alignment */ + count= *(unsigned int *)(pdev->read_buf+4); + count= FROM_LE32_MEM(&count); + nfp_log( NFP_DBG3, "nfp_strategy: read count %d", count); + if(count<0 || count>bp->b_bcount) { + NFP_STRAT_ERR(bp,ENXIO,"bad read byte count from device"); + nfp_log( NFP_DBG1, "nfp_strategy: bad read byte count (%d) from device", count); + return (0); + } + bp_mapin (bp); + bcopy( pdev->read_buf + 8, bp->b_un.b_addr, count ); + bp_mapout (bp); + } else { + bp_mapin (bp); + ne= pdev->cmddev->read_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx, &count ); + bp_mapout (bp); + if( ne != NFP_SUCCESS) { + NFP_STRAT_ERR (bp,nfp_oserr(ne),"read_block failed"); + return (0); + } + } + bioerror(bp, 0); + bp->b_resid = 0; + biodone (bp); + } else { + /* write */ + if (!pdev->wr_ready) { + NFP_STRAT_ERR (bp,ENXIO,"write called when not ready"); + return (0); + } + if (pdev->wr_bp) { + NFP_STRAT_ERR (bp,ENXIO,"wr_bp != NULL"); + return (0); + } + pdev->wrtimeout= timeout(nfp_wrtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + pdev->wr_bp = bp; + pdev->wr_ready = 0; + bp_mapin (bp); + ne= pdev->cmddev->write_block( bp->b_un.b_addr, bp->b_bcount, pdev->common.cmdctx); + if( ne != NFP_SUCCESS ) { + bp_mapout (bp); + (void) untimeout(pdev->wrtimeout); + pdev->wr_bp = 0; + pdev->wr_ready = 1; + NFP_STRAT_ERR (bp,nfp_oserr(ne),"write failed"); + return (0); + } + } + nfp_log( NFP_DBG2, "nfp_strategy: leaving"); + + return (0); +} + + +/*--------------------*/ +/* poll / select */ +/*--------------------*/ + +static int nfp_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) { + nfp_dev *pdev; + short revents; + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_chpoll: unable to get nfp_dev"); + *reventsp=0; + return (0); + } + nfp_log( NFP_DBG2, "nfp_chpoll: entered %x", events); + + revents=0; + if (events&POLLWRNORM) { + if (pdev->wr_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: write ready"); + revents|=POLLWRNORM; + } + } + + if (events&POLLRDNORM) { + if (pdev->rd_ready) { + nfp_log( NFP_DBG2, "nfp_chpoll: read ready"); + revents|=POLLRDNORM; + } + } + + if (!revents && !anyyet) { + *phpp=&pdev->pollhead; + } + *reventsp=revents; + + nfp_log( NFP_DBG2, "nfp_chpoll: leaving"); + return (0); +} + + +/*--------------------*/ +/* ioctl */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_ioctl(dev_t dev, int cmd, ioctlptr_t arg, int mode, cred_t *credp, int *rvalp) { + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_ioctl: entered." 
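The PCI-push read path above trusts a small device-written header in front of the payload: the driver reads a little-endian byte count at offset 4 and copies data from offset 8 (bytes 0-3 are not consulted here). The same decode as a standalone helper, for illustration:

static int
pushbuf_payload(const unsigned char *buf, const unsigned char **payload)
{
	int count;

	/* little-endian count at bytes 4-7; payload begins at byte 8 */
	count = buf[4] | (buf[5] << 8) | (buf[6] << 16) | (buf[7] << 24);
	*payload = buf + 8;
	return (count);	/* caller must range-check, as nfp_strategy() does */
}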
); + + if (!(pdev = ddi_get_soft_state(state_head, getminor(dev)))) { + nfp_log( NFP_DBG1, "nfp_ioctl: unable to get nfp dev."); + return (ENXIO); + } + + switch (cmd) { + case NFDEV_IOCTL_ENQUIRY: + { + long *outp; + int outlen; + nfdev_enquiry_str enq_data; + + enq_data.busno = (unsigned int)-1; + enq_data.slotno = (unsigned char)-1; + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &outlen) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "ddi_getlongprop('reg') ok." ); + if( outlen > 0 ) { + enq_data.busno = ((*outp)>>16) & 0xff; + enq_data.slotno = ((*outp)>>11) & 0x1f; + nfp_log( NFP_DBG2, "busno %d, slotno %d.", + enq_data.busno, enq_data.slotno ); + } + } else + nfp_log( NFP_DBG1, "ddi_getlongprop('reg') failed." ); + + if( ddi_copyout( (char *)&enq_data, (void *)arg, sizeof(enq_data), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." ); + return EFAULT; + } + } + break; + + case NFDEV_IOCTL_ENSUREREADING: + { + unsigned int addr, len; + nfp_err ret; + if( ddi_copyin( (void *)arg, (char *)&len, sizeof(unsigned int), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + /* signal a read to the module */ + nfp_log( NFP_DBG2, "nfp_ioctl: signalling read request to module, len = %x.", len ); + if (len>8192) { + nfp_log( NFP_DBG1, "nfp_ioctl: len >8192 = %x.", len ); + return EINVAL; + } + if (pdev->rd_outstanding==1) { + nfp_log( NFP_DBG1, "nfp_ioctl: not about to call read with read outstanding."); + return EIO; + } + + addr= 0; + if(pdev->ifvers >= NFDEV_IF_PCI_PUSH) { + if( len > NFP_READBUF_SIZE ) { + nfp_log( NFP_DBG1, "nfp_ioctl: len > NFP_READBUF_SIZE = %x.", len ); + return EINVAL; + } + addr= pdev->read_dma_cookie.dmac_address; + } + + pdev->rd_outstanding = 1; + nfp_log( NFP_DBG2,"nfp_ioctl: pdev->rd_outstanding=1"); + + /* setup timeout timer */ + pdev->rdtimeout= timeout(nfp_rdtimeout, (caddr_t)pdev, NFP_TIMEOUT_SEC * drv_usectohz(1000000)); + + nfp_log( NFP_DBG2, "nfp_ioctl: read request"); + ret = pdev->cmddev->ensure_reading(addr, len, pdev->common.cmdctx); + if ( ret != NFP_SUCCESS ) { + (void) untimeout(pdev->rdtimeout); + pdev->rdtimeout = 0; + pdev->rd_outstanding = 0; + nfp_log( NFP_DBG1, "nfp_ioctl : cmddev->ensure_reading failed "); + return nfp_oserr( ret ); + } + } + break; + + case NFDEV_IOCTL_PCI_IFVERS: + { + int vers; + + nfp_log( NFP_DBG2, "nfp_ioctl: NFDEV_IOCTL_PCI_IFVERS"); + + if( ddi_copyin( (void *)arg, (char *)&vers, sizeof(vers), mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyin() failed." ); + return (EFAULT); + } + + if( pdev->rd_outstanding ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d as read outstanding", vers); + return EIO; + } + + nfp_set_ifvers(pdev, vers); + if( pdev->ifvers != vers ) { + nfp_log( NFP_DBG1, "nfp_ioctl: can't set ifvers %d", vers); + return EIO; + } + } + break; + + case NFDEV_IOCTL_STATS: + { + if( ddi_copyout( (char *)&(pdev->common.stats), + (void *)arg, + sizeof(nfdev_stats_str), + mode ) != 0 ) { + nfp_log( NFP_DBG1, "ddi_copyout() failed." ); + return EFAULT; + } + } + break; + + default: + nfp_log( NFP_DBG1, "nfp_ioctl: unknown ioctl." ); + return EINVAL; + } + + return 0; +} + +/*-------------------------*/ +/* nfp_open */ +/*-------------------------*/ + +/* ARGSUSED */ +int nfp_open(dev_t *dev, int openflags, int otyp, cred_t *credp) +{ + nfp_err ret; + register struct nfp_dev *pdev; + + nfp_log( NFP_DBG2, "entered nfp_open." 
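The NFDEV_IOCTL_ENSUREREADING handler above shows that a read is a two-step affair from user level: first post a request telling the device the maximum reply size, then collect it with read(2), or wait in poll(2) for the POLLRDNORM that nfp_chpoll() above raises when the reply lands. A sketch with error handling elided and fd hypothetical:

	unsigned int len = 8192;	/* the driver rejects anything larger */
	char buf[8192];
	ssize_t n;

	if (ioctl(fd, NFDEV_IOCTL_ENSUREREADING, &len) == 0)
		n = read(fd, buf, sizeof (buf));	/* completes or times out */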
); + + pdev = (nfp_dev *)ddi_get_soft_state(state_head, getminor(*dev)); + + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_open: unable to get nfp dev."); + return (ENODEV); + } + + if( otyp != OTYP_CHR ) { + nfp_log( NFP_DBG1, "nfp_open: not opened as character device"); + return (EINVAL); + } + + mutex_enter(&pdev->busy_mutex); + + if (pdev->busy) { + mutex_exit(&pdev->busy_mutex); + nfp_log( NFP_DBG1, "nfp_open: device busy"); + return EBUSY; + } + pdev->busy= 1; + mutex_exit(&pdev->busy_mutex); + + /* use oldest possible interface until told otherwise */ + pdev->ifvers= NFDEV_IF_STANDARD; + nfp_log( NFP_DBG3, "nfp_open: setting ifvers %d", pdev->ifvers); + pdev->rd_ready= 0; /* drop any old data */ + + ret = pdev->cmddev->open(pdev->common.cmdctx); + if( ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, "nfp_open : cmddev->open failed "); + return nfp_oserr( ret ); + } + + nfp_log( NFP_DBG2, "nfp_open: done"); + + return 0; +} + +/*--------------------*/ +/* nfp_close */ +/*--------------------*/ + +/* ARGSUSED */ +static int nfp_close(dev_t dev, int openflags, int otyp, cred_t *credp) { + nfp_dev *pdev; + nfp_err ret; + + nfp_log( NFP_DBG2, "nfp_close: entered"); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor(dev)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + mutex_enter(&pdev->isr_mutex); + if(pdev->rd_outstanding) { + int lbolt, err; + nfp_get_lbolt(&lbolt, err); + if(!err) + (void) cv_timedwait(&pdev->rd_cv, &pdev->isr_mutex, lbolt + (NFP_TIMEOUT_SEC * drv_usectohz(1000000)) ); + } + mutex_exit(&pdev->isr_mutex); + ret = pdev->cmddev->close(pdev->common.cmdctx); + if (ret != NFP_SUCCESS ) { + nfp_log( NFP_DBG1, " nfp_close : cmddev->close failed"); + return nfp_oserr( ret ); + } + + mutex_enter(&pdev->busy_mutex); + pdev->busy= 0; + mutex_exit(&pdev->busy_mutex); + + return 0; +} + +/**************************************************************************** + + nfp driver config + + ****************************************************************************/ + +/*-------------------------*/ +/* nfp_getinfo */ +/*-------------------------*/ + +/* ARGSUSED */ +static int nfp_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { + int error; + nfp_dev *pdev; + + nfp_log( NFP_DBG2, "nfp_getinfo: entered" ); + + pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, getminor((dev_t)arg)); + if( !pdev ) { + nfp_log( NFP_DBG1, "nfp_close: cannot find dev."); + return ENODEV; + } + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if (pdev == NULL) { + *result = NULL; + error = DDI_FAILURE; + } else { + /* + * don't need to use a MUTEX even though we are + * accessing our instance structure; dev->dip + * never changes. + */ + *result = pdev->dip; + error = DDI_SUCCESS; + } + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)getminor((dev_t)arg); + error = DDI_SUCCESS; + break; + default: + *result = NULL; + error = DDI_FAILURE; + } + + nfp_log( NFP_DBG2, "nfp_getinfo: leaving." 
);
+  return (error);
+}
+
+/*-------------------------*/
+/* nfp_release */
+/*-------------------------*/
+
+static int nfp_release_dev( dev_info_t *dip ) {
+  nfp_dev *pdev;
+  int instance, i;
+  nfp_err ret;
+
+  nfp_log( NFP_DBG2, "nfp_release_dev: entering" );
+
+  instance = ddi_get_instance(dip);
+  pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance);
+  if (pdev) {
+    nfp_log( NFP_DBG3, "nfp_release_dev: removing device" );
+
+    nfp_free_pci_push(pdev);
+
+    if( pdev->cmddev ) {
+      nfp_log( NFP_DBG3, "nfp_release_dev: destroying cmd dev" );
+      ret = pdev->cmddev->destroy(pdev->common.cmdctx);
+      if (ret != NFP_SUCCESS) {
+        nfp_log( NFP_DBG1, " nfp_release_dev : cmddev->destroy failed ");
+        return nfp_oserr( ret );
+      }
+    }
+
+    if(pdev->high_iblock_cookie) {
+      nfp_log( NFP_DBG3, "nfp_release_dev: removing high and soft irq" );
+      ddi_remove_softintr(pdev->soft_int_id);
+      ddi_remove_intr(pdev->dip, 0, pdev->high_iblock_cookie);
+      mutex_destroy( &pdev->busy_mutex );
+      cv_destroy( &pdev->rd_cv );
+      mutex_destroy( &pdev->isr_mutex );
+      mutex_destroy( &pdev->high_mutex );
+    } else if(pdev->iblock_cookie) {
+      nfp_log( NFP_DBG3, "nfp_release_dev: removing irq" );
+      ddi_remove_intr(pdev->dip, 0, pdev->iblock_cookie);
+      mutex_destroy( &pdev->busy_mutex );
+      cv_destroy( &pdev->rd_cv );
+      mutex_destroy( &pdev->isr_mutex );
+    }
+    if(pdev->low_iblock_cookie) {
+      ddi_remove_intr(pdev->dip, 0, pdev->low_iblock_cookie);
+      mutex_destroy( &pdev->low_mutex);
+    }
+
+    for(i=0;i<6;i++) {
+      if( pdev->common.extra[i] ) {
+        nfp_log( NFP_DBG3, "nfp_release_dev: unmapping BAR %d", i );
+        ddi_regs_map_free ((ddi_acc_handle_t *)&pdev->common.extra[i]);
+      }
+    }
+
+    ddi_remove_minor_node(dip, NULL);
+
+    if (pdev->conf_handle)
+      pci_config_teardown( &pdev->conf_handle );
+
+    ddi_soft_state_free(state_head, instance);
+  }
+  nfp_log( NFP_DBG2, "nfp_release: finished" );
+
+  return DDI_SUCCESS;
+}
+
+
+/*-------------------------*/
+/* nfp_attach */
+/*-------------------------*/
+
+static int nfp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) {
+  int instance;
+  nfp_dev *pdev = NULL;
+  int intres;
+  uint16_t device, vendor, sub_device, sub_vendor;
+  long *outp;
+  nfpcmd_dev const *cmddev;
+  int index, i;
+  nfp_err ret;
+
+  nfp_log( NFP_DBG2, "nfp_attach: entered." );
+
+  if (cmd != DDI_ATTACH) {
+    nfp_log( NFP_DBG1, "nfp_attach: bad command." );
+    goto bailout;
+  }
+
+  instance = ddi_get_instance(dip);
+
+  if (ddi_soft_state_zalloc(state_head, instance) != 0) {
+    nfp_log( NFP_DBG1, "nfp_attach: ddi_soft_state_zalloc() failed." );
+    goto bailout;
+  }
+
+  pdev = (struct nfp_dev *)ddi_get_soft_state(state_head, instance);
+  if( !pdev ) {
+    nfp_log( NFP_DBG1, "nfp_attach: cannot find dev.");
+    goto bailout;
+  }
+  pdev->dip = dip;
+
+  /* map in pci config registers */
+  if (pci_config_setup(dip, &pdev->conf_handle)) {
+    nfp_log( NFP_DBG1, "nfp_attach: pci_config_setup() failed." );
+    goto bailout;
+  }
+
+  /* find out what we have got */
+  vendor= PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_VENID );
+  device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_DEVID );
+  sub_vendor = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBVENID );
+  sub_device = PCI_CONFIG_GET16( pdev->conf_handle, PCI_CONF_SUBSYSID );
+
+  index= 0;
+  while( (cmddev = nfp_drvlist[index++]) != NULL ) {
+    if( cmddev->vendorid == vendor &&
+        cmddev->deviceid == device &&
+        cmddev->sub_vendorid == sub_vendor &&
+        cmddev->sub_deviceid == sub_device )
+      break;
+  }
+  if( !cmddev ) {
+    nfp_log( NFP_DBG1, "nfp_attach: unknown device."
); + goto bailout; + } + + /* map BARs */ + for( i=0; i<6; i++ ) { + if( cmddev->bar_sizes[i] ) { + off_t size; + if( ddi_dev_regsize(dip, i+1, &size) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_dev_regsize() failed for BAR %d", i ); + goto bailout; + } + if( size < (cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK) ) { + nfp_log( NFP_DBG1, "nfp_attach: BAR %d too small %x (%x)", i, size, (cmddev->bar_sizes[i] & ~0xF) ); + goto bailout; + } + if (ddi_regs_map_setup(dip, i+1, (caddr_t *)&pdev->common.bar[i], + 0, cmddev->bar_sizes[i] & ~NFP_MEMBAR_MASK, &nosw_attr, (ddi_acc_handle_t *)&pdev->common.extra[i] )) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_regs_map_setup() failed for BAR %d", i ); + goto bailout; + } + nfp_log( NFP_DBG3, "nfp_attach: BAR[%d] mapped to %x (%x)", i, pdev->common.bar[i], size ); + } + } + + pdev->read_buf = NULL; + pdev->rd_dma_ok = 0; + + /* attach to minor node */ + if (ddi_create_minor_node(dip, "nfp", S_IFCHR, instance, (char *)cmddev->name, 0) == DDI_FAILURE) { + ddi_remove_minor_node(dip, NULL); + nfp_log( NFP_DBG1, "nfp_attach: ddi_create_minor_node() failed." ); + goto bailout; + } + + pdev->wr_ready = 1; + pdev->rd_ready = 0; + pdev->rd_pending = 0; + pdev->rd_outstanding = 0; + pdev->busy=0; + pdev->cmddev= cmddev; + + ret = pdev->cmddev->create(&pdev->common); + if( ret != NFP_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: failed to create command device"); + goto bailout; + } + pdev->common.dev= pdev; + + if (ddi_intr_hilevel(dip, 0) != 0){ + nfp_log( NFP_DBG2, "nfp_attach: high-level interrupt"); + if( ddi_get_iblock_cookie(dip, 0, &pdev->high_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(high) failed." ); + goto bailout; + } + if( ddi_get_iblock_cookie(dip, 0, &pdev->low_iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(low) failed." ); + goto bailout; + } + mutex_init(&pdev->high_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->high_iblock_cookie); + mutex_init(&pdev->low_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->low_iblock_cookie); + if (ddi_add_intr(dip, 0, NULL, + NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr(high) failed." ); + goto bailout; + } + if( ddi_get_soft_iblock_cookie(dip, DDI_SOFTINT_HIGH, + &pdev->iblock_cookie) ) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie(soft) failed." ); + goto bailout; + } + mutex_init(&pdev->isr_mutex, NULL, MUTEX_DRIVER, + (void *)pdev->iblock_cookie); + if (ddi_add_softintr(dip, DDI_SOFTINT_HIGH, &pdev->soft_int_id, + &pdev->iblock_cookie, NULL, + nfp_soft_isr, (caddr_t)pdev) != DDI_SUCCESS) + goto bailout; + pdev->high_intr= 1; + } else { + nfp_log( NFP_DBG2, "nfp_attach: low-level interrupt"); + + if (ddi_get_iblock_cookie (dip, 0, &pdev->iblock_cookie)) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_get_iblock_cookie() failed." ); + goto bailout; + } + + mutex_init(&pdev->isr_mutex, "nfp isr mutex", MUTEX_DRIVER, (void *)pdev->iblock_cookie); + + if (ddi_add_intr(dip, 0, NULL, + (ddi_idevice_cookie_t *)NULL, nfp_isr, + (caddr_t)pdev) != DDI_SUCCESS) { + nfp_log( NFP_DBG1, "nfp_attach: ddi_add_intr() failed." ); + goto bailout; + } + } + mutex_init(&pdev->busy_mutex, "nfp busy mutex", MUTEX_DRIVER, NULL ); + cv_init(&pdev->rd_cv, "nfp read condvar", CV_DRIVER, NULL ); + + /* get our bus and slot num */ + if (ddi_getlongprop (DDI_DEV_T_NONE, + pdev->dip, 0, "reg", + (caddr_t)&outp, &intres) != DDI_PROP_NOT_FOUND) { + nfp_log( NFP_DBG2, "nfp_attach: ddi_getlongprop('reg') ok." 
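The bus/slot extraction above (and in the NFDEV_IOCTL_ENQUIRY handler earlier) unpacks the first address cell of the PCI "reg" property; per the PCI bus binding, the high cell packs bus, device and function fields. As a small helper:

static void
pci_reg_decode(uint32_t phys_hi, int *bus, int *dev, int *func)
{
	*bus = (phys_hi >> 16) & 0xff;	/* bits 23:16 */
	*dev = (phys_hi >> 11) & 0x1f;	/* bits 15:11 */
	*func = (phys_hi >> 8) & 0x7;	/* bits 10:8 */
}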
); + if( intres > 0 ) { + nfp_log( NFP_DBG1, "nfp_attach: found PCI nfast bus %x slot %x.", + ((*outp)>>16) & 0xff, ((*outp)>>11) & 0x1f ); + } + } + + nfp_log( NFP_DBG2, "nfp_attach: attach succeeded." ); + return DDI_SUCCESS; + +bailout: + (void) nfp_release_dev( dip ); + + return DDI_FAILURE; +} + +/*-------------------------*/ +/* nfp_detach */ +/*-------------------------*/ + +/* + * When our driver is unloaded, nfp_detach cleans up and frees the resources + * we allocated in nfp_attach. + */ +static int nfp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + (void) nfp_release_dev(dip); + + return (DDI_SUCCESS); +} + +/*-------------------------*/ +/* _init */ +/*-------------------------*/ + +int _init(void) { + register int error; + + nfp_log( NFP_DBG2, "_init: entered" ); + + if ((error = ddi_soft_state_init(&state_head, sizeof (struct nfp_dev), 1)) != 0) { + nfp_log( NFP_DBG1, "_init: soft_state_init() failed" ); + return (error); + } + + if ((error = mod_install(&modlinkage)) != 0) { + nfp_log( NFP_DBG1, "_init: mod_install() failed" ); + ddi_soft_state_fini(&state_head); + } + + nfp_log( NFP_DBG2, "_init: leaving" ); + return (error); +} + +/*-------------------------*/ +/* _info */ +/*-------------------------*/ + +int _info(struct modinfo *modinfop) { + nfp_log( NFP_DBG2, "_info: entered" ); + + return (mod_info(&modlinkage, modinfop)); +} + +/*-------------------------*/ +/* _fini */ +/*-------------------------*/ + +int _fini(void) { + int status; + + nfp_log( NFP_DBG2, "_fini: entered" ); + + if ((status = mod_remove(&modlinkage)) != 0) { + nfp_log( NFP_DBG2, "_fini: mod_remove() failed." ); + return (status); + } + + ddi_soft_state_fini(&state_head); + + nfp_log( NFP_DBG2, "_fini: leaving" ); + + return (status); +} + diff --git a/usr/src/uts/common/io/nfp/i21285.c b/usr/src/uts/common/io/nfp/i21285.c new file mode 100644 index 0000000000..f51a09188d --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.c @@ -0,0 +1,310 @@ +/* + +i21285.c: nCipher PCI HSM intel/digital 21285 command driver + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +09/10/2001 jsh Original + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include "nfp_osif.h" +#include "i21285.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* create ------------------------------------------------------- */ + +static nfp_err i21285_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + nfp_log( NFP_DBG2, "i21285_create: enable doorbell"); + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, DOORBELL_ENABLE | POSTLIST_ENABLE); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21285_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21285_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, 
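The TO_LE32_IO()/FROM_LE32_IO() macros used throughout these command drivers are defined in headers outside this hunk; whatever their exact form, they must store and load 32-bit values in little-endian byte order regardless of host endianness (a byte swap on SPARC, an identity on x86). A portable sketch of the store side:

static void
to_le32(unsigned int *dst, unsigned int v)
{
	unsigned char *p = (unsigned char *)dst;

	p[0] = v & 0xff;		/* least-significant byte first */
	p[1] = (v >> 8) & 0xff;
	p[2] = (v >> 16) & 0xff;
	p[3] = (v >> 24) & 0xff;
}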
DOORBELL_DISABLE | POSTLIST_DISABLE ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_INTERRUPT_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_open( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21285_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21285_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + unsigned int doorbell; + unsigned int tmp32; + + nfp_log( NFP_DBG3, "i21285_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21285_isr: NULL pdev"); + return NFP_ENODEV; + } + + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + TO_LE32_IO( &tmp32, NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log(NFP_DBG2, "i21285_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 
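i21285_isr() above follows the standard doorbell contract: read the pending bits, acknowledge exactly the bits being serviced by writing them back (write-one-to-clear), dispatch, and re-read until the register drains, treating all-ones as a vanished device. The skeleton, with read_doorbell(), ack_doorbell() and service() as hypothetical stand-ins for the nfp_inl()/nfp_outl() sequences:

static void
doorbell_drain(nfp_cdev *pdev, int *handled)
{
	unsigned int bits;

	for (;;) {
		bits = read_doorbell(pdev) & 0xffff;
		if (bits == 0 || bits == 0xffff)
			break;			/* drained, or device gone */
		*handled = 1;
		ack_doorbell(pdev, bits);	/* write-1-to-clear */
		service(pdev, bits);		/* read/write completions */
	}
}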
1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + nfp_log( NFP_DBG1, "i21285_isr: unexpected interrupt %x", doorbell ); + TO_LE32_IO( &tmp32, 0xffff & doorbell ); + nfp_outl( pdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + } + doorbell= nfp_inl( pdev, IOBAR, I21285_OFFSET_DOORBELL); + doorbell= FROM_LE32_IO(&doorbell) & 0xffff; + } + return 0; +} + +/* write ------------------------------------------------------- */ + +static nfp_err i21285_write( const char *block, int len, void *ctx ) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21285_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_write: NULL pdev"); + return NFP_ENODEV; + } + + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ MEMBAR ]= %x\n", cdev->bar[ MEMBAR ]); + nfp_log(NFP_DBG2, "i21285_write: pdev->bar[ IOBAR ]= %x\n", cdev->bar[ IOBAR ]); + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_write: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21285_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_write: length not written"); + return NFP_EIO; + } + + TO_LE32_IO( &tmp32, NFAST_INT_HOST_WRITE_REQUEST); + + nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); + + nfp_log( NFP_DBG2, "i21285_write: done"); + return NFP_SUCCESS; +} + +/* read ------------------------------------------------------- */ + +static nfp_err i21285_read( char *block, int len, void *ctx, int *rcount) { + nfp_cdev *cdev; + nfp_err ne; + int count; + + nfp_log( NFP_DBG2, "i21285_read: entered, len %d", len); + *rcount= 0; + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_read: NULL pdev"); + return NFP_ENODEV; + } + + if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_read: null BAR[%d]", MEMBAR ); + return NFP_ENOMEM; + } + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4); + if(ne) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_from_dev failed."); + return ne; + } + count= FROM_LE32_MEM(&count); + if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21285_read: bad byte count (%d) from device", count); + return NFP_EIO; + } + ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count); + if( ne ) { + nfp_log( NFP_DBG1, "i21285_read: nfp_copy_to_user_from_dev failed."); + return ne; + } + nfp_log( NFP_DBG2, "i21285_read: done"); + *rcount= count; + return NFP_SUCCESS; +} + +/* chupdate ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21285_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21285_chupdate: NYI"); + return NFP_SUCCESS; +} + +/* ensure reading -------------------------------------------------- */ + +static nfp_err i21285_ensure_reading( unsigned int addr, int len, void *ctx ) { + 
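i21285_write() above (and i21285_ensure_reading() just below) implement the same four-step handshake against the shared-memory job window: copy the payload, write the control word and length, read the length back to confirm the window accepted it, then ring the doorbell. Condensed, with copy_to_window()/copy_from_window()/ring_doorbell() as hypothetical stand-ins for the nfp_copy_*()/nfp_outl() calls and all little-endian conversion elided:

static nfp_err
job_submit(nfp_cdev *dev, const char *data, int len)
{
	unsigned int hdr[2], echo;

	copy_to_window(dev, NFPCI_JOBS_WR_DATA, data, len);	/* payload */
	hdr[0] = NFPCI_JOB_CONTROL;
	hdr[1] = (unsigned int)len;
	copy_to_window(dev, NFPCI_JOBS_WR_CONTROL, hdr, 8);	/* control */
	copy_from_window(dev, NFPCI_JOBS_WR_LENGTH, &echo, 4);
	if (echo != (unsigned int)len)
		return (NFP_EIO);				/* not accepted */
	ring_doorbell(dev, NFAST_INT_HOST_WRITE_REQUEST);	/* notify */
	return (NFP_SUCCESS);
}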
nfp_cdev *cdev; + unsigned int hdr[2]; + unsigned int tmp32; + nfp_err ne; +
+ nfp_log( NFP_DBG2, "i21285_ensure_reading: entered"); +
+ if(addr) { + nfp_log( NFP_DBG2, "i21285_ensure_reading: bad addr"); + return NFP_EINVAL; + } +
+ cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } +
+ if(!cdev->bar[ MEMBAR ]) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: null BAR[%d]", MEMBAR ); + return NFP_ENXIO; + }
+ nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+ nfp_log( NFP_DBG3, "i21285_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
+ TO_LE32_MEM( &hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM( &hdr[1], len);
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, 8);
+ if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_to_dev failed"); + return ne; + }
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4);
+ if (ne) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: nfp_copy_from_dev failed"); + return ne; + }
+ TO_LE32_MEM( &tmp32, len ); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21285_ensure_reading: len not written"); + return NFP_EIO; + }
+ TO_LE32_IO( &tmp32, NFAST_INT_HOST_READ_REQUEST );
+ nfp_outl( cdev, IOBAR, I21285_OFFSET_DOORBELL, tmp32 ); +
+ return NFP_SUCCESS; +} +
+/* command device structure ------------------------------------- */ + +
+const nfpcmd_dev i21285_cmddev = { + "nCipher Gen 1 PCI",
+ PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_DEC_21285,
+ PCI_VENDOR_ID_NCIPHER, PCI_DEVICE_ID_NFAST_GEN1,
+ { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE, 0, 0, 0 },
+ NFP_CMD_FLG_NEED_IOBUF,
+ i21285_create, + i21285_destroy, + i21285_open, + i21285_close,
+ i21285_isr, + i21285_write, + i21285_read, + i21285_chupdate,
+ i21285_ensure_reading, + 0, /* no debug */ +}; +
diff --git a/usr/src/uts/common/io/nfp/i21285.h b/usr/src/uts/common/io/nfp/i21285.h new file mode 100644 index 0000000000..4ea1d853ec --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21285.h @@ -0,0 +1,43 @@
+/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ +
+#ifndef NFP_I21285_H +#define NFP_I21285_H +
+#ifndef PCI_VENDOR_ID_DEC +#define PCI_VENDOR_ID_DEC 0x1011 +#endif
+#ifndef PCI_DEVICE_ID_DEC_21285 +#define PCI_DEVICE_ID_DEC_21285 0x1065 +#endif
+#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif +
+#ifndef PCI_DEVICE_ID_NFAST_GEN1 +#define PCI_DEVICE_ID_NFAST_GEN1 0x0100 +#endif +
+#define I21285_OFFSET_DOORBELL 0x60 +#define I21285_OFFSET_INTERRUPT_MASK 0x34 +
+#define DOORBELL_ENABLE 0x0 +#define DOORBELL_DISABLE 0x4 +
+#define POSTLIST_ENABLE 0x0 +#define POSTLIST_DISABLE 0x8 +
+#define IOBAR 1 +#define MEMBAR 2 +
+#define IOSIZE 0x80 +#define MEMSIZE 0x100000 +
+#endif
diff --git a/usr/src/uts/common/io/nfp/i21555.c b/usr/src/uts/common/io/nfp/i21555.c new file mode 100644 index 0000000000..82024dc800 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.c @@ -0,0 +1,423 @@
+/* + +i21555.c: nCipher PCI HSM intel 21555 command driver +
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved +
+Copyright (c) 2008-2013 Thales e-Security All rights reserved +
+Copyright (c) 2014 Thales UK All rights reserved +
+history +
+09/10/2001 jsh Original +
+*/ +
+#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_hostif.h" +#include
"nfp_osif.h" +#include "i21555.h" +#include "nfp_cmd.h" +#include "nfpci.h" + +/* started ------------------------------------------------------ + * + * Check that device is ready to talk, by checking that + * the i21555 has master enabled on its secondary interface + */ + +static nfp_err i21555_started( nfp_cdev *pdev ) { + unsigned int tmp32; +#ifdef CONFIGSPACE_DEBUG + unsigned int reg32[64]; + int i; +#endif + nfp_err ne; + + nfp_log( NFP_DBG2, "i21555_started: entered"); + +#ifdef CONFIGSPACE_DEBUG + /* Suck up all the registers */ + for (i=0; i < 64; i++) { + ne = nfp_config_inl( pdev, i*4, ®32[i] ); + } + + for (i=0; i < 16; i++) { + int j = i * 4; + nfp_log( NFP_DBG3, "i21555 config reg %2x: %08x %08x %08x %08x", j*4, + reg32[j], reg32[j+1], reg32[j+2], reg32[j+3]); + } +#endif + + ne = nfp_config_inl( pdev, I21555_CFG_SEC_CMD_STATUS, &tmp32 ); + if (ne) { + /* succeed if PCI config reads are not implemented */ + if (ne == NFP_EUNKNOWN) + return NFP_SUCCESS; + nfp_log( NFP_DBG1, "i21555_started: nfp_config_inl failed"); + return ne; + } + + tmp32= FROM_LE32_IO(&tmp32) & 0xffff; + + if ( tmp32 & CFG_CMD_MASTER ) { + nfp_log( NFP_DBG3, "i21555_started: Yes %x", tmp32); + return NFP_SUCCESS; + } else { + nfp_log( NFP_DBG1, "i21555_started: device not started yet %x", tmp32); + return NFP_ESTARTING; + } +} + +/* create ------------------------------------------------------- */ + +static nfp_err i21555_create( nfp_cdev *pdev ) { + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_create: entered"); + pdev->cmdctx= pdev; /* set our context to just be a pointer to our nfp_cdev */ + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_create: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + nfp_log( NFP_DBG2, "i21555_create: enable doorbell"); + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_ENABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + return NFP_SUCCESS; +} + +/* stop ------------------------------------------------------- */ + +static nfp_err i21555_destroy( void * ctx ) { + nfp_cdev *pdev; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_destroy: entered"); + + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_destroy: NULL pdev"); + return NFP_ENODEV; + } + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_destroy: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + TO_LE32_IO( &tmp32, I21555_DOORBELL_PRI_DISABLE ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET_MASK, tmp32 ); + nfp_outl( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK, tmp32 ); + + return NFP_SUCCESS; +} + +/* open ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_open( void * ctx ) { + + nfp_log( NFP_DBG2, "i21555_open: entered"); + + return NFP_SUCCESS; +} + +/* close ------------------------------------------------------- */ + +/* ARGSUSED */ +static nfp_err i21555_close( void * ctx ) { + nfp_log( NFP_DBG2, "i21555_close: entered"); + + return NFP_SUCCESS; +} + +/* isr ------------------------------------------------------- */ + +static nfp_err i21555_isr( void *ctx, int *handled ) { + nfp_cdev *pdev; + nfp_err ne; + unsigned short doorbell; + unsigned short tmp16; + + nfp_log( NFP_DBG3, "i21555_isr: entered"); + + *handled= 0; + pdev= (nfp_cdev *)ctx; + if(!pdev) { + nfp_log( NFP_DBG1, "i21555_isr: NULL pdev"); + return NFP_ENODEV; + } + + pdev->stats.isr++; + + if(!pdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_isr: 
null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + /* This interrupt may not be from our module, so check that it actually is + * us before handling it. + */ + ne = i21555_started( pdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_isr: i21555_started failed"); + } + return ne; + } + + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + while( doorbell && doorbell != 0xffff) { + *handled= 1; + /* service interrupts */ + if( doorbell & (NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + pdev->stats.isr_write++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: write done interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + + nfp_write_complete(pdev->dev, doorbell & NFAST_INT_DEVICE_WRITE_OK ? 1 : 0 ); + } + + if( doorbell & (NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED)) { + pdev->stats.isr_read++; + TO_LE16_IO(&tmp16,NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + + nfp_log( NFP_DBG2, "i21555_isr: read ack interrupt, ok = %d.", doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0 ); + nfp_read_complete( pdev->dev, doorbell & NFAST_INT_DEVICE_READ_OK ? 1 : 0); + } + + if( doorbell & ~(NFAST_INT_DEVICE_READ_OK | NFAST_INT_DEVICE_READ_FAILED | + NFAST_INT_DEVICE_WRITE_OK | NFAST_INT_DEVICE_WRITE_FAILED)) { + TO_LE16_IO(&tmp16,doorbell); + nfp_outw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_CLEAR, tmp16 ); + nfp_log( NFP_DBG1, "i21555_isr: unexpected interrupt %x", doorbell ); + } + doorbell= nfp_inw( pdev, IOBAR, I21555_OFFSET_DOORBELL_PRI_SET); + doorbell= FROM_LE16_IO(&doorbell); + } + nfp_log( NFP_DBG3, "i21555_isr: exiting"); + return 0; +} + +/* write ------------------------------------------------------- */ + +static nfp_err i21555_write( const char *block, int len, void *ctx) { + nfp_cdev *cdev; + unsigned int hdr[2]; + nfp_err ne; + unsigned short tmp16; + unsigned int tmp32; + + nfp_log( NFP_DBG2, "i21555_write: entered"); + + cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_write: NULL cdev"); + return NFP_ENODEV; + } + + cdev->stats.write_fail++; + + if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_write: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } + + ne = i21555_started( cdev ); + if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_write: i21555_started failed"); + } + return ne; + } + + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]); + nfp_log( NFP_DBG3, "i21555_write: cdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]); + nfp_log( NFP_DBG3, "i21555_write: block len %d", len ); + ne= nfp_copy_from_user_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_DATA, block, len); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_user_to_dev failed"); + return ne; + } + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_WR_CONTROL, (const char *)hdr, 8); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_to_dev failed"); + return ne; + } + + ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_WR_LENGTH, (char *)hdr, 4); + if (ne) { + nfp_log( NFP_DBG1, "i21555_write: nfp_copy_from_dev failed"); + return ne; + } + + TO_LE32_MEM(&tmp32, len); + if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_write: length not written"); + return 
NFP_EIO; + }
+ TO_LE16_IO(&tmp16, NFAST_INT_HOST_WRITE_REQUEST >> 16);
+ nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); +
+ cdev->stats.write_fail--; + cdev->stats.write_block++; + cdev->stats.write_byte += len; +
+ nfp_log( NFP_DBG2, "i21555_write: done"); + return NFP_SUCCESS; +} +
+/* read ------------------------------------------------------- */ +
+static nfp_err i21555_read( char *block, int len, void *ctx, int *rcount) {
+ nfp_cdev *cdev; + nfp_err ne; + int count; +
+ nfp_log( NFP_DBG2, "i21555_read: entered"); + *rcount= 0; +
+ cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_read: NULL pdev"); + return NFP_ENODEV; + } +
+ cdev->stats.read_fail++; +
+ if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_read: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } +
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)&count, 4);
+ if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_from_dev failed."); + return ne; + }
+ count= FROM_LE32_MEM(&count);
+ if(count<0 || count>len) { + nfp_log( NFP_DBG1, "i21555_read: bad byte count (%d) from device", count); + return NFP_EIO; + }
+ ne= nfp_copy_to_user_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_DATA, block, count);
+ if (ne) { + nfp_log( NFP_DBG1, "i21555_read: nfp_copy_to_user_from_dev failed."); + return ne; + }
+ nfp_log( NFP_DBG2, "i21555_read: done"); + *rcount= count;
+ cdev->stats.read_fail--; + cdev->stats.read_block++; + cdev->stats.read_byte += count;
+ return NFP_SUCCESS; +} +
+/* chupdate ------------------------------------------------------- */ +
+/* ARGSUSED */ +static nfp_err i21555_chupdate( char *data, int len, void *ctx ) { + nfp_log( NFP_DBG1, "i21555_chupdate: NYI"); + return NFP_SUCCESS; +} +
+/* ensure reading -------------------------------------------------- */ +
+static nfp_err i21555_ensure_reading( unsigned int addr, int len, void *ctx ) {
+ nfp_cdev *cdev; + unsigned int hdr[3]; + unsigned short tmp16; + unsigned int tmp32; + nfp_err ne; + int hdr_len; +
+ nfp_log( NFP_DBG2, "i21555_ensure_reading: entered"); +
+ cdev= (nfp_cdev *)ctx; + if(!cdev) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: NULL pdev"); + return NFP_ENODEV; + } +
+ cdev->stats.ensure_fail++; +
+ if(!cdev->bar[ IOBAR ]) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: null BAR[%d]", IOBAR ); + return NFP_ENOMEM; + } +
+ ne = i21555_started( cdev );
+ if (ne) { + if (ne != NFP_ESTARTING) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: i21555_started failed"); + } + return ne; + } +
+ nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ MEMBAR ]= %x", cdev->bar[ MEMBAR ]);
+ nfp_log( NFP_DBG3, "i21555_ensure_reading: pdev->bar[ IOBAR ]= %x", cdev->bar[ IOBAR ]);
+ if(addr) { + nfp_log( NFP_DBG3, "i21555_ensure_reading: new format, addr %x", addr);
+ TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL_PCI_PUSH); + TO_LE32_MEM(&hdr[1], len); + TO_LE32_MEM(&hdr[2], addr); + hdr_len= 12;
+ } else { + TO_LE32_MEM(&hdr[0], NFPCI_JOB_CONTROL); + TO_LE32_MEM(&hdr[1], len); + hdr_len= 8; + }
+ ne= nfp_copy_to_dev( cdev, MEMBAR, NFPCI_JOBS_RD_CONTROL, (const char *)hdr, hdr_len);
+ if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_to_dev failed"); + return ne; + } +
+ ne= nfp_copy_from_dev( cdev, MEMBAR, NFPCI_JOBS_RD_LENGTH, (char *)hdr, 4);
+ if (ne) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: nfp_copy_from_dev failed"); + return ne; + } +
+ TO_LE32_MEM(&tmp32, len); +
+ if ( hdr[0] != tmp32 ) { + nfp_log( NFP_DBG1, "i21555_ensure_reading: len not written"); + return NFP_EIO; + }
+ TO_LE16_IO( &tmp16,
NFAST_INT_HOST_READ_REQUEST >> 16); + nfp_outw( cdev, IOBAR, I21555_OFFSET_DOORBELL_SEC_SET, tmp16); + + cdev->stats.ensure_fail--; + cdev->stats.ensure++; + + return NFP_SUCCESS; +} + +/* command device structure ------------------------------------- */ + +const nfpcmd_dev i21555_cmddev = { + "nCipher Gen 2 PCI", + PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_21555, + PCI_VENDOR_ID_NCIPHER, PCI_SUBSYSTEM_ID_NFAST_REV1, + { 0, IOSIZE | PCI_BASE_ADDRESS_SPACE_IO, NFPCI_RAM_MINSIZE_JOBS, 0, 0, 0 }, + NFP_CMD_FLG_NEED_IOBUF, + i21555_create, + i21555_destroy, + i21555_open, + i21555_close, + i21555_isr, + i21555_write, + i21555_read, + i21555_chupdate, + i21555_ensure_reading, + i21555_debug, +}; diff --git a/usr/src/uts/common/io/nfp/i21555.h b/usr/src/uts/common/io/nfp/i21555.h new file mode 100644 index 0000000000..d8f3965938 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555.h @@ -0,0 +1,51 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef I21555_H +#define I21555_H + +#ifndef PCI_VENDOR_ID_INTEL +#define PCI_VENDOR_ID_INTEL 0x8086 +#endif + +#ifndef PCI_DEVICE_ID_INTEL_21555 +#define PCI_DEVICE_ID_INTEL_21555 0xb555 +#endif + +#ifndef PCI_VENDOR_ID_NCIPHER +#define PCI_VENDOR_ID_NCIPHER 0x0100 +#endif + +#ifndef PCI_SUBSYSTEM_ID_NFAST_REV1 +#define PCI_SUBSYSTEM_ID_NFAST_REV1 0x0100 +#endif + +#define I21555_OFFSET_DOORBELL_PRI_SET 0x9C +#define I21555_OFFSET_DOORBELL_SEC_SET 0x9E +#define I21555_OFFSET_DOORBELL_PRI_CLEAR 0x98 + +#define I21555_OFFSET_DOORBELL_PRI_SET_MASK 0xA4 +#define I21555_OFFSET_DOORBELL_PRI_CLEAR_MASK 0xA0 + +#define I21555_DOORBELL_PRI_ENABLE 0x0000 +#define I21555_DOORBELL_PRI_DISABLE 0xFFFF + +#define I21555_CFG_SEC_CMD_STATUS 0x44 + +#define CFG_CMD_MASTER 0x0004 + +#define IOBAR 1 +#define MEMBAR 2 + +#define IOSIZE 0x100 + +extern nfp_err i21555_debug( int cmd, void *ctx ); + +#endif diff --git a/usr/src/uts/common/io/nfp/i21555d.c b/usr/src/uts/common/io/nfp/i21555d.c new file mode 100644 index 0000000000..183ace8275 --- /dev/null +++ b/usr/src/uts/common/io/nfp/i21555d.c @@ -0,0 +1,28 @@ +/* + +i21555d.c: nCipher PCI HSM intel 21555 debug ioctl + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + + +history + +15/05/2002 jsh Original, does nothing + +*/ + +#include "nfp_common.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "i21555.h" + +/* ARGSUSED */ +nfp_err i21555_debug( int cmd, void *ctx) { + nfp_log( NFP_DBG1, "i21555_debug: entered"); + + return NFP_EUNKNOWN; +} diff --git a/usr/src/uts/common/io/nfp/nfdev-common.h b/usr/src/uts/common/io/nfp/nfdev-common.h new file mode 100644 index 0000000000..8a97bf2c63 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-common.h @@ -0,0 +1,141 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ +/** \file nfdev-common.h + * + * \brief nFast device driver (not generic SCSI) ioctl struct definition file + * include NFDEV-$(system) for ioctl number definitions + * + * 1998.07.13 jsh Started + * + * + */ + +#ifndef NFDEV_COMMON_H +#define NFDEV_COMMON_H + +/** + * Result of the ENQUIRY ioctl. 
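+ * Reports the PCI bus and slot number of the attached device.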
+ */ +typedef struct nfdev_enquiry_str {
+ unsigned int busno; /**< Which bus is the PCI device on. */
+ unsigned char slotno; /**< Which slot is the PCI device in. */
+ unsigned char reserved[3]; /**< for consistent struct alignment */
+} nfdev_enquiry_str; +
+/** + * Result of the STATS ioctl. + */ +typedef struct nfdev_stats_str {
+ unsigned long isr; /**< Count interrupts. */
+ unsigned long isr_read; /**< Count read interrupts. */
+ unsigned long isr_write; /**< Count write interrupts. */
+ unsigned long write_fail; /**< Count write failures. */
+ unsigned long write_block; /**< Count blocks written. */
+ unsigned long write_byte; /**< Count bytes written. */
+ unsigned long read_fail; /**< Count read failures. */
+ unsigned long read_block; /**< Count blocks read. */
+ unsigned long read_byte; /**< Count bytes read. */
+ unsigned long ensure_fail; /**< Count read request failures. */
+ unsigned long ensure; /**< Count read requests. */
+} nfdev_stats_str; +
+/** + * Input to the CONTROL ioctl. + */ +typedef struct nfdev_control_str {
+ unsigned control; /**< Control flags. */
+} nfdev_control_str; +
+/** Control bit indicating host supports MOI control */
+#define NFDEV_CONTROL_HOST_MOI 0x0001 +
+/** Index of control bits indicating desired mode + *
+ * Desired mode follows the M_ModuleMode enumeration. + */
+#define NFDEV_CONTROL_MODE_SHIFT 1 +
+/** Detect a backwards-compatible control value + *
+ * Returns true if the requested control value "makes no difference", i.e.
+ * the failure of an attempt to set it is therefore uninteresting. + */
+#define NFDEV_CONTROL_HARMLESS(c) ((c) <= 1) +
+/** + * Result of the STATUS ioctl. + */ +typedef struct nfdev_status_str {
+ unsigned status; /**< Status flags. */
+ char error[8]; /**< Error string. */
+} nfdev_status_str; +
+/** Monitor firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_MONITOR_MOI 0x0001 +
+/** Application firmware supports MOI control and error reporting */
+#define NFDEV_STATUS_APPLICATION_MOI 0x0002 +
+/** Application firmware running and supports error reporting */
+#define NFDEV_STATUS_APPLICATION_RUNNING 0x0004 +
+/** HSM failed + *
+ * Consult error[] for additional information. + */
+#define NFDEV_STATUS_FAILED 0x0008 +
+/** Standard PCI interface. */ +#define NFDEV_IF_STANDARD 0x01 +
+/** PCI interface with results pushed from device + * via DMA. + */
+#define NFDEV_IF_PCI_PUSH 0x02 +
+/* platform independent base ioctl numbers */ +
+/** Enquiry ioctl.
+ * \return nfdev_enquiry_str describing the attached device. */
+#define NFDEV_IOCTL_NUM_ENQUIRY 0x01
+/** Channel Update ioctl.
+ * \deprecated */
+#define NFDEV_IOCTL_NUM_CHUPDATE 0x02
+/** Ensure Reading ioctl.
+ * Signal a read request to the device.
+ * \param (unsigned int) Length of data to be read.
+ */
+#define NFDEV_IOCTL_NUM_ENSUREREADING 0x03
+/** Device Count ioctl.
+ * Not implemented on all platforms.
+ * \return (int) the number of attached devices. */
+#define NFDEV_IOCTL_NUM_DEVCOUNT 0x04
+/** Internal Debug ioctl.
+ * Not implemented in release drivers. */
+#define NFDEV_IOCTL_NUM_DEBUG 0x05
+/** PCI Interface Version ioctl.
+ * \param (int) Maximum PCI interface version
+ * supported by the user of the device. */
+#define NFDEV_IOCTL_NUM_PCI_IFVERS 0x06
+/** Statistics ioctl.
+ * \return nfdev_stats_str describing the attached device.
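+ *
+ * Illustrative userland usage only (fd is assumed to be an open
+ * descriptor for the device node; the composed ioctl number
+ * NFDEV_IOCTL_STATS comes from the platform header, e.g.
+ * nfdev-solaris.h):
+ *
+ *	nfdev_stats_str stats;
+ *	if (ioctl(fd, NFDEV_IOCTL_STATS, &stats) == 0)
+ *		printf("%lu interrupts\n", stats.isr);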
*/ +#define NFDEV_IOCTL_NUM_STATS 0x07 + +/** Module control ioctl + * \param (nfdev_control_str) Value to write to HSM control register + */ +#define NFDEV_IOCTL_NUM_CONTROL 0x08 + +/** Module state ioctl + * \return (nfdev_status_str) Values read from HSM status/error registers + */ +#define NFDEV_IOCTL_NUM_STATUS 0x09 + +#endif diff --git a/usr/src/uts/common/io/nfp/nfdev-solaris.h b/usr/src/uts/common/io/nfp/nfdev-solaris.h new file mode 100644 index 0000000000..923b902e46 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfdev-solaris.h @@ -0,0 +1,37 @@ +/* + +nfdev-solaris.h: nFast solaris specific device ioctl interface. + +(C) Copyright nCipher Corporation Ltd 1998-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +14/07/1998 jsh Original + +*/ + +#ifndef NFDEV_SOLARIS_H +#define NFDEV_SOLARIS_H + +#include "nfdev-common.h" + +#define NFDEV_IOCTL_TYPE ('n'<<8) + +#define NFDEV_IOCTL_ENQUIRY ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENQUIRY ) +#define NFDEV_IOCTL_ENSUREREADING ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_ENSUREREADING ) +#define NFDEV_IOCTL_DEVCOUNT ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEVCOUNT ) +#define NFDEV_IOCTL_DEBUG ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_DEBUG ) +#define NFDEV_IOCTL_PCI_IFVERS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_PCI_IFVERS ) +#define NFDEV_IOCTL_STATS ( NFDEV_IOCTL_TYPE | \ + NFDEV_IOCTL_NUM_STATS ) + +#endif /* NFDEV_SOLARIS_H */ diff --git a/usr/src/uts/common/io/nfp/nfp.h b/usr/src/uts/common/io/nfp/nfp.h new file mode 100644 index 0000000000..9704f04fbc --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp.h @@ -0,0 +1,113 @@ +/* + +nfp.h: nFast PCI driver for Solaris 2.5, 2.6 and 2.7 + +(C) Copyright nCipher Corporation Ltd 2001-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +06/05/1998 jsh Original solaris 2.6 +21/05/1999 jsh added support for solaris 2.5 +10/06/1999 jsh added support for solaris 2.7 (32 and 64 bit) +16/10/2001 jsh moved from nfast to new structure in nfdrv + +*/ + +#ifndef NFP_H +#define NFP_H + +#ifndef _KERNEL +#error Hello? 
this is a driver, please compile with -D_KERNEL +#endif +
+#if ( CH_KERNELVER < 260 ) +typedef int ioctlptr_t; +typedef unsigned short uint16_t;
+#define DDI_GET32 ddi_getl +#define DDI_PUT32 ddi_putl +#define DDI_GET16 ddi_getw +#define DDI_PUT16 ddi_putw
+#define DDI_REP_GET8 ddi_rep_getb +#define DDI_REP_PUT8 ddi_rep_putb +#define DDI_REP_GET32 ddi_rep_getl +#define DDI_REP_PUT32 ddi_rep_putl
+#define PCI_CONFIG_GET16 pci_config_getw
+#else /* ( CH_KERNELVER >= 260 ) */ +typedef intptr_t ioctlptr_t;
+#define DDI_GET32 ddi_get32 +#define DDI_PUT32 ddi_put32 +#define DDI_GET16 ddi_get16 +#define DDI_PUT16 ddi_put16
+#define DDI_REP_GET8 ddi_rep_get8 +#define DDI_REP_PUT8 ddi_rep_put8 +#define DDI_REP_GET32 ddi_rep_get32 +#define DDI_REP_PUT32 ddi_rep_put32
+#define PCI_CONFIG_GET16 pci_config_get16 +#endif +
+#if ( CH_KERNELVER < 270 ) +typedef int nfp_timeout_t;
+#define EXTRA_CB_FLAGS 0
+#define VSXPRINTF(s, n, format, ap) vsprintf (s, format, ap)
+#else /* ( CH_KERNELVER >= 270 ) */ +typedef timeout_id_t nfp_timeout_t;
+#define EXTRA_CB_FLAGS D_64BIT
+#define VSXPRINTF(s, n, format, ap) vsnprintf(s, n, format, ap)
+#endif +
+typedef struct nfp_dev { + int rd_ok; + int wr_ok; +
+ int ifvers; +
+ /* for PCI push read interface */
+ unsigned char *read_buf;
+ ddi_dma_handle_t read_dma_handle;
+ ddi_dma_cookie_t read_dma_cookie; +
+ ddi_acc_handle_t acchandle; +
+ int rd_dma_ok; +
+ nfp_timeout_t wrtimeout; + nfp_timeout_t rdtimeout; +
+ struct buf *wr_bp; + int wr_ready; + int rd_ready; + int rd_pending; + int rd_outstanding; + kcondvar_t rd_cv; +
+ struct pollhead pollhead; + dev_info_t *dip; +
+ ddi_iblock_cookie_t high_iblock_cookie; /* for mutex */
+ ddi_iblock_cookie_t low_iblock_cookie; /* for mutex */
+ kmutex_t high_mutex; + kmutex_t low_mutex; + int high_intr;
+ ddi_softintr_t soft_int_id; + int high_read; + int high_write; +
+ ddi_iblock_cookie_t iblock_cookie; /* for mutex */
+ kmutex_t isr_mutex; +
+ kmutex_t busy_mutex; + int busy; +
+ ddi_acc_handle_t conf_handle; +
+ nfp_cdev common; + const nfpcmd_dev *cmddev; +} nfp_dev; +
+extern struct nfp_dev *nfp_dev_list[]; +
+#endif /* NFP_H */
diff --git a/usr/src/uts/common/io/nfp/nfp_cmd.h b/usr/src/uts/common/io/nfp/nfp_cmd.h new file mode 100644 index 0000000000..db8af0b2f9 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_cmd.h @@ -0,0 +1,68 @@
+/* + +nfp_cmd.h: nCipher PCI HSM command driver declarations +
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved +
+Copyright (c) 2008-2013 Thales e-Security All rights reserved +
+Copyright (c) 2014 Thales UK All rights reserved +
+history +
+10/10/2001 jsh Original +
+*/ +
+#ifndef NFPCMD_H +#define NFPCMD_H +
+#include "nfp_hostif.h" +#include "nfp_error.h" +
+/* read and write called with userspace buffer */ +
+typedef struct nfpcmd_dev { + const char *name;
+ unsigned short vendorid, deviceid, + sub_vendorid, sub_deviceid;
+ unsigned int bar_sizes[6]; /* includes IO bit */
+ unsigned int flags;
+ nfp_err (*create)(struct nfp_cdev *pdev);
+ nfp_err (*destroy)(void * ctx);
+ nfp_err (*open)(void * ctx);
+ nfp_err (*close)(void * ctx);
+ nfp_err (*isr)(void *ctx, int *handled);
+ nfp_err (*write_block)( const char *ublock, int len, void *ctx );
+ nfp_err (*read_block)( char *ublock, int len, void *ctx, int *rcount);
+ nfp_err (*channel_update)( char *data, int len, void *ctx);
+ nfp_err (*ensure_reading)( unsigned int addr, int len, void *ctx );
+ nfp_err (*debug)( int cmd, void *ctx);
+} nfpcmd_dev; +
+#define NFP_CMD_FLG_NEED_IOBUF 0x1 +
+/* list of all supported
drivers ---------------------------------------- */ + +extern const nfpcmd_dev *nfp_drvlist[]; + +extern const nfpcmd_dev i21285_cmddev; +extern const nfpcmd_dev i21555_cmddev; +extern const nfpcmd_dev bcm5820_cmddev; + +#ifndef PCI_BASE_ADDRESS_SPACE_IO +#define PCI_BASE_ADDRESS_SPACE_IO 0x1 +#endif + +#define NFP_MAXDEV 16 + + +#define NFP_MEMBAR_MASK ~0xf +#define NFP_IOBAR_MASK ~0x3 +/* + This masks off the bottom bits of the PCI_CSR_BAR which signify that the + BAR is an IO BAR rather than a MEM BAR +*/ + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_common.h b/usr/src/uts/common/io/nfp/nfp_common.h new file mode 100644 index 0000000000..d1d2100fea --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_common.h @@ -0,0 +1,68 @@ +/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +*/ + +#ifndef NFP_COMMON_H +#define NFP_COMMON_H + +#include <sys/types.h> +#include <sys/conf.h> + +typedef uint32_t UINT32; +typedef uint8_t BYTE; + +#define DEFINE_NFPCI_PACKED_STRUCTS +#include "nfpci.h" +#include "nfdev-solaris.h" + +typedef int oserr_t; + +#if CH_BIGENDIAN + +/* Big Endian Sparc */ + +#define SWP32(x) \ +( (((unsigned int)(x)>>24)&0xff) | (((unsigned int)(x)>>8)&0xff00) | (((unsigned int)(x)<<8)&0xff0000) | (((unsigned int)(x)<<24)&0xff000000) ) + +#define SWP16(x) ( (((x)>>8)&0xff) | (((x)<<8)&0xff00) ) + +#define FROM_LE32_IO(x) SWP32(*x) +#define TO_LE32_IO(x,y) *x=SWP32(y) + +#define FROM_LE32_MEM(x) SWP32(*x) +#define TO_LE32_MEM(x,y) *x=SWP32(y) + +#define FROM_LE16_IO(x) SWP16(*x) +#define TO_LE16_IO(x,y) *x=SWP16(y) + +#else + +/* Little Endian x86 */ + +#define FROM_LE32_IO(x) (*x) +#define TO_LE32_IO(x,y) (*x=y) + +#define FROM_LE32_MEM(x) (*x) +#define TO_LE32_MEM(x,y) (*x=y) + +#define FROM_LE16_IO(x) (*x) +#define TO_LE16_IO(x,y) (*x=y) + +#endif /* !CH_BIGENDIAN */ + +#include <sys/types.h> + +#if CH_KERNELVER == 260 +#define nfp_get_lbolt( lbolt, err ) err= drv_getparm( LBOLT, lbolt ) +#else +#define nfp_get_lbolt( lbolt, err ) { *lbolt= ddi_get_lbolt(); err= 0; } +#endif + +#endif + diff --git a/usr/src/uts/common/io/nfp/nfp_error.h b/usr/src/uts/common/io/nfp/nfp_error.h new file mode 100644 index 0000000000..d64cb78fd4 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_error.h @@ -0,0 +1,48 @@ +/* + +nfp_error.h: nCipher PCI HSM error handling + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved + +Copyright (c) 2008-2013 Thales e-Security All rights reserved + +Copyright (c) 2014 Thales UK All rights reserved + +history + +05/12/2001 jsh Original + +*/ + +#ifndef NFP_ERROR_H +#define NFP_ERROR_H + +#include "nfp_common.h" + +#define NFP_SUCCESS 0x0 +#define NFP_EFAULT 0x1 +#define NFP_ENOMEM 0x2 +#define NFP_EINVAL 0x3 +#define NFP_EIO 0x4 +#define NFP_ENXIO 0x5 +#define NFP_ENODEV 0x6 +#define NFP_EINTR 0x7 +#define NFP_ESTARTING 0x8 +#define NFP_EAGAIN 0x9 +#define NFP_EUNKNOWN 0x100 + +typedef int nfp_err; + +extern oserr_t nfp_oserr( nfp_err nerr ); +extern nfp_err nfp_error( oserr_t oerr ); + +#define nfr( x) \ + return nfp_error((x)) + +#define nfer(x, fn, msg) \ + { oserr_t err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return nfp_error(err); } } + +#define er(x, fn, msg ) \ +{ nfp_err err=(x); if(err) { nfp_log( NFP_DBG1, #fn ": " msg); return err; } } + +#endif diff --git a/usr/src/uts/common/io/nfp/nfp_hostif.h b/usr/src/uts/common/io/nfp/nfp_hostif.h new file mode 100644 index 
0000000000..3e7d8187e5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_hostif.h @@ -0,0 +1,54 @@
+/* + +nfp_hostif.h: nCipher PCI HSM host interface declarations +
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved +
+Copyright (c) 2008-2013 Thales e-Security All rights reserved +
+Copyright (c) 2014 Thales UK All rights reserved +
+history +
+10/10/2001 jsh Original +
+*/ +
+#ifndef NFP_HOSTIF_H +#define NFP_HOSTIF_H +
+#include "nfdev-common.h" +
+struct nfp_dev; +
+/* common device structure */ +
+typedef struct nfp_cdev { + unsigned char *bar[6]; + void *extra[6]; +
+ int busno; + int slotno; +
+ void *cmdctx; +
+ char *iobuf; +
+ struct nfp_dev* dev; +
+ struct nfdev_stats_str stats; +
+} nfp_cdev; +
+/* callbacks from command drivers -------------------------------------- */ +
+void nfp_read_complete( struct nfp_dev *pdev, int ok);
+void nfp_write_complete( struct nfp_dev *pdev, int ok); +
+#define NFP_READ_MAX (8 * 1024)
+#define NFP_READBUF_SIZE (NFP_READ_MAX + 8)
+#define NFP_TIMEOUT_SEC 10 +
+#define NFP_DRVNAME "nCipher nFast PCI driver" +
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfp_ifvers.c b/usr/src/uts/common/io/nfp/nfp_ifvers.c new file mode 100644 index 0000000000..807b4f24c5 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_ifvers.c @@ -0,0 +1,51 @@
+/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved +
+Copyright (c) 2008-2013 Thales e-Security All rights reserved +
+Copyright (c) 2014 Thales UK All rights reserved +
+*/ +
+/* + * nfp_ifvers.c - common pci interface versioning + *
+ * uses: + *
+ * int pdev->ifvers + * device interface version + *
+ * int nfp_ifvers + * interface version limit + *
+ * int nfp_alloc_pci_push( nfp_dev *pdev )
+ * allocates resources needed for PCI Push,
+ * if not already allocated, and returns True if successful + *
+ * void nfp_free_pci_push( nfp_dev *pdev )
+ * frees any resources allocated to PCI Push + */ +
+void nfp_set_ifvers( nfp_dev *pdev, int vers ) {
+ if( nfp_ifvers != 0 && vers > nfp_ifvers ) {
+ nfp_log( NFP_DBG2, + "nfp_set_ifvers: can't set ifvers %d" + " as nfp_ifvers wants max ifvers %d", + vers, nfp_ifvers);
+ return; + }
+ if( vers >= NFDEV_IF_PCI_PUSH ) {
+ if(!nfp_alloc_pci_push(pdev)) {
+ nfp_log( NFP_DBG1, + "nfp_set_ifvers: can't set ifvers %d" + " as resources not available", + vers);
+ return; + }
+ } else { + nfp_free_pci_push(pdev); + }
+ pdev->ifvers= vers;
+ nfp_log( NFP_DBG3, "nfp_set_ifvers: setting ifvers %d", vers);
+}
diff --git a/usr/src/uts/common/io/nfp/nfp_osif.h b/usr/src/uts/common/io/nfp/nfp_osif.h new file mode 100644 index 0000000000..17ffe469ce --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfp_osif.h @@ -0,0 +1,105 @@
+/* + +nfp_osif.h: nCipher PCI HSM OS interface declarations +
+(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved +
+Copyright (c) 2008-2013 Thales e-Security All rights reserved +
+Copyright (c) 2014 Thales UK All rights reserved +
+history +
+10/10/2001 jsh Original +
+*/ +
+#ifndef NFP_OSIF_H +#define NFP_OSIF_H +
+#include "nfp_hostif.h" +#include "nfp_error.h" +
+/* general typedefs ----------------------------------------------- */ +
+typedef volatile unsigned int reg32;
+typedef volatile unsigned short reg16;
+typedef volatile unsigned char reg8; +
+/* semaphores, mutexes and events --------------------------------- */ +
+#if 0 +extern nfp_err nfp_sema_init( nfp_sema *sema, int initial);
+extern void nfp_sema_destroy( nfp_sema *sema );
+extern void nfp_sema_post( nfp_sema *sema );
+extern void nfp_sema_wait(
nfp_sema *sema );
+extern int nfp_sema_wait_sig( nfp_sema *sema ); +
+extern nfp_err nfp_mutex_init( nfp_mutex *mutex );
+extern void nfp_mutex_destroy( nfp_mutex *mutex );
+extern void nfp_mutex_enter( nfp_mutex *mutex );
+extern void nfp_mutex_exit( nfp_mutex *mutex ); +
+extern nfp_err nfp_event_init( nfp_event *event );
+extern void nfp_event_destroy( nfp_event *event );
+extern void nfp_event_set( nfp_event *event );
+extern void nfp_event_clear( nfp_event *event );
+extern void nfp_event_wait( nfp_event *event );
+extern void nfp_event_wait_sig( nfp_event *event ); +
+#endif +
+/* timeouts ------------------------------------------------------ */ +
+extern void nfp_sleep( int ms ); +
+/* memory handling ----------------------------------------------- */ +
+#define KMALLOC_DMA 0 +#define KMALLOC_CACHED 1 +
+extern void *nfp_kmalloc( int size, int flags );
+extern void *nfp_krealloc( void *ptr, int size, int flags );
+extern void nfp_kfree( void * ); +
+/* config space access ------------------------------------------------ */ +
+/* return Little Endian 32 bit config register */
+extern nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ); +
+/* io space access ------------------------------------------------ */ +
+extern unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset );
+extern unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset );
+extern void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data );
+extern void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ); +
+/* user and device memory space access ---------------------------- */ +
+/* NB these 2 functions are not guaranteed to be re-entrant for a given device */
+extern nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len);
+extern nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len); +
+extern nfp_err nfp_copy_from_user( char *kbuf, const char *ubuf, int len );
+extern nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len ); +
+extern nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len );
+extern nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len); +
+/* debug ------------------------------------------------------------ */ +
+#define NFP_DBG1 1 +#define NFP_DBGE NFP_DBG1 +#define NFP_DBG2 2 +#define NFP_DBG3 3 +#define NFP_DBG4 4 +
+#ifdef STRANGE_VARARGS +extern void nfp_log();
+#else +extern void nfp_log( int severity, const char *format, ...); +#endif +
+extern int nfp_debug; +
+#endif
diff --git a/usr/src/uts/common/io/nfp/nfpci.h b/usr/src/uts/common/io/nfp/nfpci.h new file mode 100644 index 0000000000..793f5995e6 --- /dev/null +++ b/usr/src/uts/common/io/nfp/nfpci.h @@ -0,0 +1,171 @@
+/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved +
+Copyright (c) 2008-2013 Thales e-Security All rights reserved +
+Copyright (c) 2014 Thales UK All rights reserved +
+*/ +
+/* +* +* NFPCI.H - nFast PCI interface definition file +* +* +*
+* 1998.06.09 IH Started +*
+* The interface presented by nFast PCI devices consists of: +*
+* A region of shared RAM used for data transfer & control information
+* A doorbell interrupt register, so both sides can give each other interrupts
+* A number of DMA channels for transferring data +*/ +
+#ifndef NFPCI_H +#define NFPCI_H +
+/* Sizes of some regions */
+#define NFPCI_RAM_MINSIZE 0x00100000
+/* This is the minimum size of shared RAM.
In future it may be possible to + negotiate larger sizes of shared RAM or auto-detect how big it is */
+#define NFPCI_RAM_MINSIZE_JOBS 0x00020000 /* standard jobs only */
+#define NFPCI_RAM_MINSIZE_KERN 0x00040000 /* standard and kernel jobs */ +
+/* Offsets within shared memory space. + The following main regions are:
+ jobs input area + jobs output area + kernel jobs input area + kernel output area +*/ +
+#define NFPCI_OFFSET_JOBS 0x00000000
+#define NFPCI_OFFSET_JOBS_WR 0x00000000
+#define NFPCI_OFFSET_JOBS_RD 0x00010000
+#define NFPCI_OFFSET_KERN 0x00020000
+#define NFPCI_OFFSET_KERN_WR 0x00020000
+#define NFPCI_OFFSET_KERN_RD 0x00030000 +
+/* Interrupts, defined by bit position in doorbell register */ +
+/* Interrupts from device to host */
+#define NFAST_INT_DEVICE_WRITE_OK 0x00000001
+#define NFAST_INT_DEVICE_WRITE_FAILED 0x00000002
+#define NFAST_INT_DEVICE_READ_OK 0x00000004
+#define NFAST_INT_DEVICE_READ_FAILED 0x00000008
+#define NFAST_INT_DEVICE_KERN_WRITE_OK 0x00000010
+#define NFAST_INT_DEVICE_KERN_WRITE_FAILED 0x00000020
+#define NFAST_INT_DEVICE_KERN_READ_OK 0x00000040
+#define NFAST_INT_DEVICE_KERN_READ_FAILED 0x00000080 +
+/* Interrupts from host to device */
+#define NFAST_INT_HOST_WRITE_REQUEST 0x00010000
+#define NFAST_INT_HOST_READ_REQUEST 0x00020000
+#define NFAST_INT_HOST_DEBUG 0x00040000
+#define NFAST_INT_HOST_KERN_WRITE_REQUEST 0x00080000
+#define NFAST_INT_HOST_KERN_READ_REQUEST 0x00100000 +
+/* Ordinary job submission ------------------------ */ +
+/* The NFPCI_OFFSET_JOBS_WR and NFPCI_OFFSET_JOBS_RD regions are defined
+ by the following (byte) address offsets... */ +
+#define NFPCI_OFFSET_CONTROL 0x0
+#define NFPCI_OFFSET_LENGTH 0x4
+#define NFPCI_OFFSET_DATA 0x8
+#define NFPCI_OFFSET_PUSH_ADDR 0x8 +
+#define NFPCI_JOBS_WR_CONTROL (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_CONTROL)
+#define NFPCI_JOBS_WR_LENGTH (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_LENGTH)
+#define NFPCI_JOBS_WR_DATA (NFPCI_OFFSET_JOBS_WR + NFPCI_OFFSET_DATA)
+#define NFPCI_MAX_JOBS_WR_LEN (0x0000FFF8) +
+#define NFPCI_JOBS_RD_CONTROL (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_CONTROL)
+#define NFPCI_JOBS_RD_LENGTH (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_LENGTH)
+#define NFPCI_JOBS_RD_DATA (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_DATA)
+/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */
+#define NFPCI_JOBS_RD_PUSH_ADDR (NFPCI_OFFSET_JOBS_RD + NFPCI_OFFSET_PUSH_ADDR)
+#define NFPCI_MAX_JOBS_RD_LEN (0x000FFF8) +
+/* Kernel interface job submission ---------------- */ +
+#define NFPCI_KERN_WR_CONTROL (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_CONTROL)
+#define NFPCI_KERN_WR_LENGTH (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_LENGTH)
+#define NFPCI_KERN_WR_DATA (NFPCI_OFFSET_KERN_WR + NFPCI_OFFSET_DATA)
+#define NFPCI_MAX_KERN_WR_LEN (0x0000FFF8) +
+#define NFPCI_KERN_RD_CONTROL (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_CONTROL)
+#define NFPCI_KERN_RD_LENGTH (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_LENGTH)
+#define NFPCI_KERN_RD_DATA (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_DATA)
+/* address in PCI space of host buffer for NFPCI_JOB_CONTROL_PCI_PUSH */
+#define NFPCI_KERN_RD_ADDR (NFPCI_OFFSET_KERN_RD + NFPCI_OFFSET_PUSH_ADDR)
+#define NFPCI_MAX_KERN_RD_LEN (0x000FFF8) +
+#ifdef DEFINE_NFPCI_PACKED_STRUCTS +typedef struct +{
+ UINT32 controlword;
+ UINT32 length; /* length of data to follow */
+ union { + BYTE data[1]; + UINT32 addr; + } uu; +} + NFPCI_JOBS_BLOCK; +#endif + +
+#define NFPCI_JOB_CONTROL 0x00000001
+#define NFPCI_JOB_CONTROL_PCI_PUSH 0x00000002
+/* + The 'Control' word is analogous to the SCSI read/write
address; + 1 = standard push/pull IO + 2 = push/push IO +
+ To submit a block of job data, the host:
+ - sets the (32-bit, little-endian) word at NFPCI_JOBS_WR_CONTROL to NFPCI_JOB_CONTROL
+ - sets the word at NFPCI_JOBS_WR_LENGTH to the length of the data
+ - copies the data to NFPCI_JOBS_WR_DATA
+ - sets interrupt NFAST_INT_HOST_WRITE_REQUEST in the doorbell register
+ - awaits the NFAST_INT_DEVICE_WRITE_OK (or _FAILED) interrupts back +
+ To read a block of jobs back, the host:
+ - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL
+ - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data
+ - sets interrupt NFAST_INT_HOST_READ_REQUEST
+ - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt
+ - reads the data from NFPCI_JOBS_RD_DATA; the module will set the word at
+ NFPCI_JOBS_RD_LENGTH to its actual length. +
+ Optionally the host can request the PCI read data to be pushed to host PCI mapped ram:
+ - allocates a contiguous PCI addressable buffer for a NFPCI_JOBS_BLOCK of max
+ size NFPCI_MAX_JOBS_RD_LEN (or NFPCI_MAX_KERN_RD_LEN) + 8
+ - sets the word at NFPCI_JOBS_RD_CONTROL to NFPCI_JOB_CONTROL_PCI_PUSH
+ - sets the word at NFPCI_JOBS_RD_LENGTH to the max length for returned data
+ - sets the word at NFPCI_JOBS_RD_PUSH_ADDR to be the host PCI address of
+ the buffer
+ - sets interrupt NFAST_INT_HOST_READ_REQUEST
+ - awaits the NFAST_INT_DEVICE_READ_OK (or _FAILED) interrupt
+ - reads the data from NFPCI_OFFSET_DATA in the buffer. The
+ module will set NFPCI_OFFSET_LENGTH to the actual length. +*/ +
+#define NFPCI_SCRATCH_CONTROL 0 +
+#define NFPCI_SCRATCH_CONTROL_HOST_MOI (1<<0)
+#define NFPCI_SCRATCH_CONTROL_MODE_SHIFT 1
+#define NFPCI_SCRATCH_CONTROL_MODE_MASK (3<<NFPCI_SCRATCH_CONTROL_MODE_SHIFT) +
+#define NFPCI_SCRATCH_STATUS 1 +
+#define NFPCI_SCRATCH_STATUS_MONITOR_MOI (1<<0)
+#define NFPCI_SCRATCH_STATUS_APPLICATION_MOI (1<<1)
+#define NFPCI_SCRATCH_STATUS_APPLICATION_RUNNING (1<<2)
+#define NFPCI_SCRATCH_STATUS_ERROR (1<<3) +
+#define NFPCI_SCRATCH_ERROR_LO 2
+#define NFPCI_SCRATCH_ERROR_HI 3 +
+#endif
diff --git a/usr/src/uts/common/io/nfp/osif.c b/usr/src/uts/common/io/nfp/osif.c new file mode 100644 index 0000000000..fba62f9a37 --- /dev/null +++ b/usr/src/uts/common/io/nfp/osif.c @@ -0,0 +1,184 @@
+/* + +(C) Copyright nCipher Corporation Ltd 2002-2008 All rights reserved +
+Copyright (c) 2008-2013 Thales e-Security All rights reserved +
+Copyright (c) 2014 Thales UK All rights reserved +
+*/ +
+#include <sys/types.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/map.h> +#include <sys/debug.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/open.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/pci.h> +
+#include "nfp_common.h" +#include "nfp_hostif.h" +#include "nfp_error.h" +#include "nfp_osif.h" +#include "nfp_cmd.h" +#include "nfp.h" +#include "autoversion.h" +
+/* config space access ---------------------------------- */ +
+nfp_err nfp_config_inl( nfp_cdev *pdev, int offset, unsigned int *res ) {
+ unsigned int tmp32;
+ if ( !pdev || !pdev->dev || !pdev->dev->conf_handle ) + return NFP_ENODEV; +
+/* pci_config_get32() does byte swapping, so put back to LE */
+ tmp32 = pci_config_get32( pdev->dev->conf_handle, offset );
+ TO_LE32_IO(res, tmp32); +
+ return NFP_SUCCESS; +} +
+/* user space memory access ---------------------------------- */ +
+nfp_err
nfp_copy_from_user( char *kbuf, const char *ubuf, int len) { + bcopy(ubuf, kbuf, len); + return 0; +} +
+nfp_err nfp_copy_to_user( char *ubuf, const char *kbuf, int len) { + bcopy(kbuf, ubuf, len); + return 0; +} +
+nfp_err nfp_copy_from_user_to_dev( nfp_cdev *cdev, int bar, int offset, const char *ubuf, int len) {
+ /* dirty hack on Solaris: as we are called from strategy, we are, in fact, copying from kernel mem */
+ return nfp_copy_to_dev( cdev, bar, offset, ubuf, len ); +} +
+nfp_err nfp_copy_to_user_from_dev( nfp_cdev *cdev, int bar, int offset, char *ubuf, int len) {
+ /* dirty hack on Solaris: as we are called from strategy, we are, in fact, copying to kernel mem */
+ return nfp_copy_from_dev( cdev, bar, offset, ubuf, len ); +} +
+nfp_err nfp_copy_from_dev( nfp_cdev *cdev, int bar, int offset, char *kbuf, int len) {
+ if( len & 0x3 || offset & 0x3 )
+ DDI_REP_GET8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR);
+ else + /* LINTED: alignment */
+ DDI_REP_GET32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR);
+ return NFP_SUCCESS; +} +
+nfp_err nfp_copy_to_dev( nfp_cdev *cdev, int bar, int offset, const char *kbuf, int len) {
+ if( len & 0x3 || offset & 0x3 )
+ DDI_REP_PUT8( cdev->extra[bar], (unsigned char *)kbuf, cdev->bar[bar] + offset, len, DDI_DEV_AUTOINCR );
+ else + /* LINTED: alignment */
+ DDI_REP_PUT32( cdev->extra[bar], (unsigned int *)kbuf, (unsigned int *)(cdev->bar[bar] + offset), len / 4, DDI_DEV_AUTOINCR );
+ return NFP_SUCCESS; +} +
+/* pci io space access --------------------------------------- */ +
+unsigned int nfp_inl( nfp_cdev *pdev, int bar, int offset ) {
+ nfp_log( NFP_DBG3, "nfp_inl: addr %x", (uintptr_t) pdev->bar[bar] + offset);
+ /* LINTED: alignment */
+ return DDI_GET32( pdev->extra[bar], (uint32_t *)(pdev->bar[bar] + offset) ); +} +
+unsigned short nfp_inw( nfp_cdev *pdev, int bar, int offset ) {
+ nfp_log( NFP_DBG3, "nfp_inw: addr %x", (uintptr_t) pdev->bar[bar] + offset);
+ /* LINTED: alignment */
+ return DDI_GET16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset) ); +} +
+void nfp_outl( nfp_cdev *pdev, int bar, int offset, unsigned int data ) {
+ nfp_log( NFP_DBG3, "nfp_outl: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data);
+ /* LINTED: alignment */
+ DDI_PUT32( pdev->extra[bar], (uint32_t *)(pdev->bar[ bar ] + offset), data ); +} +
+void nfp_outw( nfp_cdev *pdev, int bar, int offset, unsigned short data ) {
+ nfp_log( NFP_DBG3, "nfp_outw: addr %x, data %x", (uintptr_t) pdev->bar[bar] + offset, data);
+ /* LINTED: alignment */
+ DDI_PUT16( pdev->extra[bar], (unsigned short *)(pdev->bar[ bar ] + offset), data ); +} +
+/* logging ---------------------------------------------------- */ +
+void nfp_log( int level, const char *fmt, ...) +{
+ auto char buf[256]; + va_list ap; +
+ switch (level) {
+ case NFP_DBG4: if (nfp_debug < 4) break; + /*FALLTHROUGH*/
+ case NFP_DBG3: if (nfp_debug < 3) break; + /*FALLTHROUGH*/
+ case NFP_DBG2: if (nfp_debug < 2) break; + /*FALLTHROUGH*/
+ case NFP_DBG1: if (nfp_debug < 1) break; + /*FALLTHROUGH*/
+ default:
+ va_start(ap, fmt);
+ (void) vsnprintf(buf, 256, fmt, ap);
+ va_end(ap);
+ cmn_err(CE_CONT, "!"
VERSION_COMPNAME " " VERSION_NO ": %s\n", buf); + break; + } +} +
+struct errstr { + int oserr; + nfp_err nferr; +}; + +
+static struct errstr errtab[] = {
+ { EFAULT, NFP_EFAULT },
+ { ENOMEM, NFP_ENOMEM },
+ { EINVAL, NFP_EINVAL },
+ { EIO, NFP_EIO },
+ { ENXIO, NFP_ENXIO },
+ { ENODEV, NFP_ENODEV },
+ { EINVAL, NFP_EUNKNOWN },
+ { 0, 0 } +}; +
+nfp_err nfp_error( int oserr ) +{
+ struct errstr *perr;
+ if(!oserr) + return 0;
+ perr= errtab;
+ while(perr->nferr) { + if(perr->oserr == oserr) + return perr->nferr; + perr++; + }
+ return NFP_EUNKNOWN; +} +
+int nfp_oserr( nfp_err nferr ) +{
+ struct errstr *perr;
+ if(nferr == NFP_SUCCESS) + return 0;
+ perr= errtab;
+ while(perr->nferr) { + if(perr->nferr == nferr) + return perr->oserr; + perr++; + }
+ return EIO; +}
diff --git a/usr/src/uts/common/io/overlay/overlay.c b/usr/src/uts/common/io/overlay/overlay.c new file mode 100644 index 0000000000..78bc418548 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.c @@ -0,0 +1,2172 @@
+/* + * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL. + *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL. + */ +
+/* + * Copyright 2015 Joyent, Inc. + */ +
+/* + * Overlay Devices + *
+ * Overlay devices provide a means for creating overlay networks, a means of
+ * multiplexing multiple logical, isolated, and discrete layer two and layer
+ * three networks on top of one physical network. + *
+ * In general, these overlay devices encapsulate the logic to answer two
+ * different questions: + *
+ * 	1) How should I transform a packet to put it on the wire?
+ * 	2) Where should I send a transformed packet? + *
+ * Each overlay device is presented to the user as a GLDv3 device. While the
+ * link itself cannot have an IP interface created on top of it, it allows for
+ * additional GLDv3 devices, such as a VNIC, to be created on top of it which
+ * can be plumbed up with IP interfaces. + * +
+ * --------------------
+ * General Architecture
+ * -------------------- + *
+ * The logical overlay device that a user sees in dladm(1M) is a combination of
+ * two different components that work together. The first component is this
+ * kernel module, which is responsible for answering question one -- how should
+ * I transform a packet to put it on the wire. + *
+ * The second component is what we call the virtual ARP daemon, or varpd. It is
+ * a userland component that is responsible for answering the second question --
+ * Where should I send a transformed packet. Instances of the kernel overlay
+ * GLDv3 device ask varpd the question of where a packet should go. + *
+ * The split was done for a few reasons. Importantly, we wanted to keep the act
+ * of generating encapsulated packets in the kernel so as to ensure that the
+ * general data path was fast and also kept simple. On the flip side, while the
+ * question of where something should go may be simple, it may often be
+ * complicated and need to interface with several different external or
+ * distributed systems. In those cases, it's simpler to allow for the full
+ * flexibility of userland to be brought to bear to solve that problem and in
+ * general, the path isn't very common.
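+ *
+ * As a purely illustrative sketch (the authoritative contracts live in
+ * uts/common/sys/overlay_plugin.h and
+ * lib/varpd/libvarpd/libvarpd_provider.h), the two questions roughly
+ * reduce to:
+ *
+ *	encap(packet) -> encapsulated packet	(kernel encapsulation plugin)
+ *	lookup(MAC address) -> (IP[, port])	(varpd lookup plugin)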
+ * + * The following is what makes up the logical overlay device that a user would
+ * create with dladm(1M). + *
+ *     Kernel                                     Userland
+ * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
+ * . +--------+   +--------+   +--------+        .                  .
+ * . | VNIC 0 |   | VNIC 1 |   | VNIC 2 |        .                  .
+ * . +--------+   +--------+   +--------+        .                  .
+ * .     |            |            |             .                  .
+ * .     |            |            |             .                  .
+ * .     +------------+-----------+              .                  .
+ * .                  |                          . /dev/overlay     .
+ * .           +--------------+                  .    .             .    +------------+
+ * .           |              |                  .    .             .    |            |
+ * .           |   Overlay    |======*=================|   Virtual  |
+ * .           | GLDv3 Device |========================| ARP Daemon |
+ * .           |              |                  .                  .    |            |
+ * .           +--------------+                  .                  .    +------------+
+ * .                  |                          .                  .          |
+ * .                  |                          .                  .          |
+ * .         +----------------+                  .                  .     +--------+
+ * .         |    Overlay     |                  .                  .     | varpd  |
+ * .         | Encapsulation  |                  .                  .     | Lookup |
+ * .         |     Plugin     |                  .                  .     | Plugin |
+ * .         +----------------+                  .                  .     +--------+
+ * . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . + * +
+ * This image shows the two different components and where they live.
+ * Importantly, it also shows that both the kernel overlay device and the
+ * userland varpd both support plugins. The plugins actually implement the
+ * things that users care about and the APIs have been designed to try to
+ * minimize the number of things that a module writer needs to worry about. + *
+ * IDENTIFIERS + *
+ * Every overlay device is defined by a unique identifier which is the overlay
+ * identifier. Its purpose is similar to that of a VLAN identifier: it's a
+ * unique number that is used to differentiate between different entries on the
+ * wire. + *
+ * ENCAPSULATION + *
+ * An overlay encapsulation plugin is a kernel miscellaneous module whose
+ * purpose is to contain knowledge about how to transform packets to put them
+ * onto the wire and to take them off. An example of an encapsulation plugin is
+ * vxlan. It's also how support for things like nvgre or geneve would be brought
+ * into the system. + *
+ * Each encapsulation plugin defines a series of operation vectors and
+ * properties. For the full details on everything they should provide, please
+ * read uts/common/sys/overlay_plugin.h. The encapsulation plugin is responsible
+ * for telling the system what information is required to send a packet. For
+ * example, vxlan is defined to send everything over a UDP packet and therefore
+ * requires a port and an IP address, while nvgre on the other hand is its own
+ * IP type and therefore just requires an IP address. In addition, it also
+ * provides information about the kind of socket that should be created. This is
+ * used by the kernel multiplexor; more on that in the Kernel Components
+ * section. + *
+ * LOOKUPS + *
+ * The kernel communicates requests for lookups over the character device
+ * /dev/overlay. varpd is responsible for listening for requests on that device
+ * and answering them. The character device is specific to the target path and
+ * varpd. + *
+ * Much as the kernel overlay module handles the bulk of the scaffolding but
+ * leaves the important work to the encapsulation plugin, varpd provides a
+ * similar role and leaves the full brunt of lookups to a userland dynamic
+ * shared object which implements the logic of lookups. + *
+ * Each lookup plugin defines a series of operation vectors and properties. For
+ * the full details on everything that they should provide, please read
+ * lib/varpd/libvarpd/libvarpd_provider.h.
Essentially, they are given a MAC
+ * address and asked to give an address on the physical network that the packet
+ * should be sent to. In addition, they handle questions related to how to handle
+ * things like broadcast and multicast traffic, etc. + *
+ * ----------
+ * Properties
+ * ---------- + *
+ * A device from a dladm perspective has a unique set of properties that are
+ * combined from three different sources: + *
+ * 	1) Generic properties that every overlay device has
+ * 	2) Properties that are specific to the encapsulation plugin
+ * 	3) Properties that are specific to the lookup plugin + *
+ * All of these are exposed in a single set of properties in dladm. Note that
+ * these are not necessarily traditional link properties. However, if something
+ * is both a traditional GLDv3 link property, say the MTU of a device, and a
+ * specific property here, then the driver ensures that all existing GLDv3
+ * specific means of manipulating it are used and wraps up its private property
+ * interfaces to ensure that works. + *
+ * Properties in the second and third category are prefixed with the name of
+ * their module. For example, the vxlan encapsulation module has a property
+ * called the 'listen_ip'. This property would show up in dladm as
+ * 'vxlan/listen_ip'. This allows different plugins to both use similar names
+ * for similar properties and to also have independent name spaces so that
+ * overlapping names do not conflict with anything else. + *
+ * While the kernel combines both sets one and two into a single coherent view,
+ * it does not do anything with respect to the properties that are owned by the
+ * lookup plugin -- those are owned wholly by varpd. Instead, libdladm is in
+ * charge of bridging these two worlds into one magical experience for the user.
+ * It carries the burden of knowing about both overlay specific and varpd
+ * specific properties. Importantly, we want to maintain this distinction. We
+ * don't want to treat the kernel as an arbitrary key/value store for varpd and
+ * we want the kernel to own its own data and not have to ask userland for
+ * information that it owns. + *
+ * Every property in the system has the following attributes: + *
+ * 	o A name
+ * 	o A type
+ * 	o A size
+ * 	o Permissions
+ * 	o Default value
+ * 	o Valid value ranges
+ * 	o A value + *
+ * Everything except for the value is obtained by callers through the propinfo
+ * callbacks and a property has a maximum size of OVERLAY_PROP_SIZEMAX,
+ * currently 256 bytes. + *
+ * The following are the supported types of properties: + *
+ * 	OVERLAY_PROP_T_INT + *
+ * 		A signed integer, its length is 8 bytes, corresponding to an
+ * 		int64_t. + *
+ * 	OVERLAY_PROP_T_UINT + *
+ * 		An unsigned integer, its length is 8 bytes, corresponding to a
+ * 		uint64_t. + *
+ * 	OVERLAY_PROP_T_IP + *
+ * 		A struct in6_addr, it has a fixed size. + *
+ * 	OVERLAY_PROP_T_STRING + *
+ * 		A null-terminated character string encoded in either ASCII or
+ * 		UTF-8. Note that the size of the string includes the null
+ * 		terminator. + *
+ * The next thing that we apply to a property is its permission. The permissions
+ * are put together by the bitwise or of the following flags and values. + *
+ * 	OVERLAY_PROP_PERM_REQ + *
+ * 		This indicates a required property. A property that is required
+ * 		must be set by a consumer before the device can be created. If a
+ * 		required property has a default value, this constraint is
+ * 		loosened because the default value defines the value.
+ *
+ * -----------------------------
+ * Destinations and Plugin Types
+ * -----------------------------
+ *
+ * Both encapsulation and lookup plugins define the kinds of destinations
+ * that they know how to support. There are currently three different pieces
+ * of information that can be used to address a destination, all of which are
+ * summarized in the type overlay_point_t. Any combination of these is
+ * supported:
+ *
+ *   OVERLAY_PLUGIN_D_ETHERNET
+ *
+ *	An Ethernet MAC address is required.
+ *
+ *   OVERLAY_PLUGIN_D_IP
+ *
+ *	An IP address is required. All IP addresses used by the overlay
+ *	system are transmitted as IPv6 addresses. IPv4 addresses can be
+ *	represented by using IPv4-mapped IPv6 addresses.
+ *
+ *   OVERLAY_PLUGIN_D_PORT
+ *
+ *	A TCP/UDP port is required.
+ *
+ * A kernel encapsulation plugin declares which of these it requires; it's a
+ * static set. On the other hand, a userland lookup plugin can be built to
+ * support all of these or any combination thereof. It gets passed the
+ * required destination type, based on the kernel encapsulation method, and
+ * then it makes the determination as to whether or not it supports it. For
+ * example, the direct plugin can support either an IP alone or both an IP
+ * and a port; it simply doesn't display the direct/dest_port property in the
+ * cases where a port is not required.
+ *
+ * The userland lookup plugins have two different modes of operation which
+ * determine how they interact with the broader system and how lookups are
+ * performed. These types are:
+ *
+ *   OVERLAY_TARGET_POINT
+ *
+ *	A point to point plugin has a single static definition for where
+ *	to send all traffic. Every packet in the system always gets sent
+ *	to the exact same destination, which is programmed into the
+ *	kernel when the general device is activated.
+ *
+ *   OVERLAY_TARGET_DYNAMIC
+ *
+ *	A dynamic plugin does not have a single static definition.
+ *	Instead, for each destination, the kernel makes an asynchronous
+ *	request to varpd to determine where the packet should be routed,
+ *	and if a specific destination is found, then that destination is
+ *	cached in the overlay device's target cache.
+ *
+ * This distinction, while important for the general overlay device's
+ * operation, is not important to the encapsulation plugins. They don't need
+ * to know about any of these pieces. It's just a concern for varpd, the
+ * userland plugin, and the general overlay scaffolding.
+ *
+ * When an overlay device is set to OVERLAY_TARGET_POINT, it does not
+ * maintain a target cache and instead just keeps track of the destination,
+ * always sending encapsulated packets to that address. When the target type
+ * is OVERLAY_TARGET_DYNAMIC, the kernel maintains a cache of all such
+ * destinations. These destinations are kept around in an instance of a
+ * reference hash that is specific to the given overlay device. Entries in
+ * the cache can be invalidated and replaced by varpd and its lookup plugins.
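+ *
+ * To make that concrete: following the description above, a vxlan-style
+ * plugin would declare that it needs both an IP address and a port, while an
+ * nvgre-style plugin would declare only an IP address. As a sketch (the
+ * field name here mirrors the destination type member pictured in the
+ * structure diagram in the next section, and is an assumption):
+ *
+ *	vxlan:	ovp_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;
+ *	nvgre:	ovp_dest = OVERLAY_PLUGIN_D_IP;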
+ *
+ * ----------------------------------
+ * Kernel Components and Architecture
+ * ----------------------------------
+ *
+ * There are multiple pieces inside the kernel that work together. The
+ * central one is the overlay_dev_t structure, which is the logical GLDv3
+ * device; it has references to things like an instance of an encapsulation
+ * plugin, a pointer to a mux, and a target cache. It can roughly be
+ * summarized in the following image:
+ *
+ *   +------------------+
+ *   | global           |
+ *   | overlay list     |
+ *   | overlay_dev_list |
+ *   +------------------+
+ *       |
+ *       |  +-----------------------+        +---------------+
+ *       +->| GLDv3 Device          |------->| GLDv3 Device  | -> ...
+ *          | overlay_dev_t         |        | overlay_dev_t |
+ *          |                       |        +---------------+
+ *          |                       |
+ *          | mac_handle_t     -----+---> GLDv3 handle to MAC
+ *          | datalink_id_t    -----+---> Datalink ID used by DLS
+ *          | overlay_dev_flag_t ---+---> Device state
+ *          | uint_t           -----+---> Current device MTU
+ *          | uint_t           -----+---> In-progress RX operations
+ *          | uint_t           -----+---> In-progress TX operations
+ *          | char[]           -----+---> FMA degraded message
+ *          | void *           -----+---> plugin private data
+ *          | overlay_target_t * ---+-------------------------+
+ *          | overlay_plugin_t * ---+----------+              |
+ *          +-----------------------+          |              |
+ *                     ^                       |              |
+ *   +--------------------+                    |              |
+ *   | Kernel Socket      |                    |              |
+ *   | Multiplexor        |                    |              |
+ *   | overlay_mux_t      |                    |              |
+ *   |                    |                    |              |
+ *   | avl_tree_t        -+--+                 |              |
+ *   | uint_t            -+--> socket family   |              |
+ *   | uint_t            -+--> socket type     |              |
+ *   | uint_t            -+--> socket protocol |              |
+ *   | ksocket_t         -+--> I/O socket      |              |
+ *   | struct sockaddr * -+--> ksocket address |              |
+ *   | overlay_plugin_t --+-------+            |              |
+ *   +--------------------+       |            |              |
+ *                                |            |              |
+ *   +-------------------------+  |            |              |
+ *   | Encap Plugin            |<-+------------+              |
+ *   | overlay_plugin_t        |                              |
+ *   |                         |                              |
+ *   | char * -----------------+--> plugin name               |
+ *   | overlay_plugin_ops_t * -+--> plugin downcalls          |
+ *   | char ** (props) --------+--> property list             |
+ *   | uint_t -----------------+--> id length                 |
+ *   | overlay_plugin_flags_t -+--> plugin flags              |
+ *   | overlay_plugin_dest_t --+--> destination type          v
+ *   +-------------------------+    +-------------------------+
+ *                                  | Target Cache            |
+ *                                  | overlay_target_t        |
+ *                                  |                         |
+ *             cache mode <---------+- overlay_target_mode_t  |
+ *              dest type <---------+- overlay_plugin_dest_t  |
+ *            cache flags <---------+- overlay_target_flag_t  |
+ *               varpd id <---------+- uint64_t               |
+ * outstanding varpd reqs. <--------+- uint_t                 |
+ *   OVERLAY_TARGET_POINT state <---+- overlay_target_point_t |
+ * OVERLAY_TARGET_DYNAMIC state <-+-+- overlay_target_dyn_t   |
+ *                                | +-------------------------+
+ *                 +--------------+
+ *                 |
+ *                 v
+ *   +-------------------------------+     +------------------------+
+ *   | Target Entry                  |---->| Target Entry           |--> ...
+ *   | overlay_target_entry_t        |     | overlay_target_entry_t |
+ *   |                               |     +------------------------+
+ *   |                               |
+ *   | overlay_target_entry_flags_t -+--> Entry flags
+ *   | uint8_t[ETHERADDRL] ----------+--> Target MAC address
+ *   | overlay_target_point_t -------+--> Target underlay address
+ *   | mblk_t * ---------------------+--> outstanding mblk head
+ *   | mblk_t * ---------------------+--> outstanding mblk tail
+ *   | size_t -----------------------+--> outstanding mblk size
+ *   +-------------------------------+
+ *
+ * The primary entries that we care about are the overlay_dev_t structures,
+ * which correspond to each overlay device that is created with dladm(1M).
+ * Globally, these devices are maintained in a simple list_t which is
+ * protected with a lock. They include important information such as the
+ * mac_handle_t and a datalink_id_t which are used to interact with the
+ * broader MAC and DLS ecosystem. We also maintain additional information
+ * such as the current state, outstanding operations, the MTU, and,
+ * importantly, the plugin's private data: the state of the encapsulation
+ * plugin instance that gets created as part of creating an overlay device.
+ * The overlay_dev_t also includes information with respect to FMA. For more
+ * information, see the FMA section.
+ *
+ * Each overlay_dev_t has a pointer to a plugin, a mux, and a target. The
+ * plugin is the encapsulation plugin. This allows the device to make
+ * downcalls into it for things like getting and setting properties.
+ * Otherwise, the plugin itself is a fairly straightforward entity. Plugins
+ * are maintained in a list (not pictured above). The plugins themselves
+ * mostly maintain things like the static list of properties, what kind of
+ * destination they require, and the operations vector. A given module may
+ * contain more if necessary.
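+ *
+ * In code, a downcall from a device into its encapsulation plugin is simply
+ * an indirect call through the ops vector, passing along the plugin private
+ * data that ovpo_init() handed back. This exact pattern shows up throughout
+ * the ioctl paths later in this file, e.g.:
+ *
+ *	ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+ *	    pname, oip->oip_value, &oip->oip_size);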
+ *
+ * The next piece of the puzzle is the mux, or multiplexor. The mux itself
+ * maintains a ksocket, and it is through the mux that we send and receive
+ * message blocks. The mux represents a socket type and address, as well as a
+ * plugin. Multiple overlay_dev_t devices may then share the same mux. For
+ * example, consider the case where you have different instances of vxlan all
+ * on the same underlay network. These would all logically share the same IP
+ * address and port that packets are sent and received on; what differs is
+ * the decapsulation ID.
+ *
+ * Each mux maintains a ksocket_t, which is similar to a socket(3SOCKET).
+ * Unlike a normal socket, though, we enable a direct callback on the
+ * ksocket: rather than sitting there, getting a callback in some other
+ * context, and kicking that back out to a taskq, whenever a message block
+ * chain is received it comes directly into the callback function
+ * overlay_mux_recv().
+ *
+ * The mux is given encapsulated packets (via overlay_m_tx, the GLDv3 tx
+ * function) to transmit. It receives encapsulated packets, decapsulates them
+ * to determine the overlay identifier, looks up the given device that
+ * matches that identifier, and then causes the broader MAC world to receive
+ * the packet with a call to mac_rx().
+ *
+ * Today, we don't do too much that's special with the ksocket; however, as
+ * hardware is gaining understanding of these encapsulation protocols, we'll
+ * probably want to think of better ways to get those capabilities passed
+ * down and potentially better ways to program receive filters so packets get
+ * directly to us. Though, that's all fantasy future land.
+ *
+ * The next part of the puzzle is the target cache. The purpose of the target
+ * cache is to cache where we should send a packet on the underlay network,
+ * given its MAC address. The target cache operates in two modes, depending
+ * on whether the lookup module was declared to be OVERLAY_TARGET_POINT or
+ * OVERLAY_TARGET_DYNAMIC.
+ *
+ * In the case where the target cache has been programmed to be
+ * OVERLAY_TARGET_POINT, we only maintain a single overlay_target_point_t
+ * which holds the destination to which we send everything, no matter the
+ * destination MAC address.
+ *
+ * On the other hand, when we have an instance of OVERLAY_TARGET_DYNAMIC,
+ * things are more interesting and, as a result, more complicated. We
+ * primarily store overlay_target_entry_t's, which are kept in both an avl
+ * tree and a refhash_t. The primary lookup path uses the refhash_t; the avl
+ * tree is only used for a few of the target ioctls that dump data, so that
+ * we can get a consistent iteration order for things like
+ * dladm show-overlay -t. The key that we use for the reference hashtable is
+ * based on the MAC address in the cache, and currently we just do a simple
+ * CRC32 to transform it into a hash.
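+ *
+ * The hashing step is small enough to sketch. Assuming a crc32() helper (the
+ * function below is purely illustrative; the kernel's actual CRC32 routine
+ * and the refhash_t interfaces live elsewhere), the cache key for an entry
+ * is derived from its six-byte MAC address:
+ *
+ *	static uint_t
+ *	example_mac_hash(const uint8_t mac[ETHERADDRL])
+ *	{
+ *		return (crc32(0, mac, ETHERADDRL));
+ *	}
+ *
+ * The refhash_t then buckets entries by this value, while the avl tree keeps
+ * them in a stable order for iteration.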
+ *
+ * Each entry maintains a set of flags to indicate the current status of the
+ * request. The flags may indicate one of three states: that the current
+ * cache entry is valid, that the current cache entry has been directed to
+ * drop all output, and that the current cache entry is invalid and may be
+ * being looked up. In the case where it's valid, we just take the
+ * destination address and run with it.
+ *
+ * If it's invalid and a lookup has not been made, then we start the process
+ * that prepares a query that will make its way up to varpd. The cache entry
+ * maintains a message block chain of outstanding message blocks and a size.
+ * These lists are populated only when we don't know the answer as to where
+ * these packets should be sent. The size entry is used to cap the amount of
+ * outstanding data that we don't know the answer to. If we exceed a cap on
+ * the amount of outstanding data (currently 1 MB), then we'll drop any
+ * additional packets. Once we get an answer indicating a valid destination,
+ * we transmit any outstanding data to that place. The full story on how we
+ * look that up is discussed in the section on the Target Cache Life Cycle.
+ *
+ * ------------------------
+ * FMA and Degraded Devices
+ * ------------------------
+ *
+ * Every kernel overlay device keeps track of its FMA state. Today in FMA we
+ * cannot represent partitions between resources, nor can we represent that a
+ * given minor node of a pseudo device has failed -- if we degrade the
+ * overlay device, then the entire dev_info_t is degraded. However, we still
+ * want to be able to indicate to administrators that things may go wrong.
+ *
+ * To this end, we've added a notion of a degraded state to every overlay
+ * device. This state is primarily dictated by userland, and it can happen
+ * for various reasons. Generally it's because a userland lookup plugin has
+ * been partitioned, or because something has gone wrong such that there is
+ * no longer any userland lookup module for a device; in those cases we'll
+ * mark it degraded.
+ *
+ * As long as any of our minor instances is degraded, we'll fire off the FMA
+ * event to note that. Once the last degraded instance is no longer degraded,
+ * we'll end up telling FMA that we're all clean.
+ *
+ * To help administrators get a better sense of which of the various minor
+ * devices has gone wrong, we store the odd_fmamsg[] character array. This
+ * character array can be fetched by doing a dladm show-overlay -f.
+ *
+ * When an overlay device is degraded, we update the link status of the
+ * device to ensure that dladm and other tooling that checks the link status
+ * will note that it is down due to this error.
+ *
+ * -----------------------
+ * Target Cache Life Cycle
+ * -----------------------
+ *
+ * This section only applies when we have a lookup plugin of
+ * OVERLAY_TARGET_DYNAMIC. None of this applies to those of type
+ * OVERLAY_TARGET_POINT.
+ *
+ * While we touched on the target cache in the general architecture section,
+ * it's worth going into more detail as to how this actually works, with some
+ * examples and state machines. Recall that a target cache entry basically
+ * has the following state transition diagram:
+ *
+ *        Initial state
+ *     . . . . . . first access                . . . varpd lookup enqueued
+ *     .                                       .
+ *     .                                       .
+ *   +-------+                 +----------+    .
+ *   |  No   |------*--------->| Invalid  |----*----+
+ *   | Entry |                 | Entry    |         |
+ *   +-------+                 +----------+         |
+ *     varpd   ^               ^   varpd            |
+ *  invalidate |               |   drop             |
+ *   . . . . . *               * . .                v
+ *   +-------+ |               |               +---------+
+ *   | Entry |-+->-------------+      +---<----| Entry   |
+ *   | Valid |<---------*---------<---+        | Pending |->-+  varpd
+ *   +-------+          .                      +---------+ * . . drop, but
+ *                      .  varpd                 ^     |       other queued
+ *                      .  success               |     |       entries
+ *                                               +-----+
+ *
+ * When the table is first created, it is empty. As we attempt to look up
+ * entries and find there is no entry at all, we'll create a new table entry.
+ * At that point the entry is technically in an invalid state; that means
+ * that we have no valid data from varpd. In that case, we'll go ahead and
+ * queue the packet into the entry's pending chain and queue a varpd lookup,
+ * setting the OVERLAY_ENTRY_F_PENDING flag in the process.
+ *
+ * If additional mblk_t's come in for this entry, we end up appending them to
+ * the tail of the chain, if and only if we don't exceed the threshold for
+ * the amount of space they can take up. An entry remains pending until we
+ * get a varpd reply. If varpd replies with a valid result, we move to the
+ * valid entry state, remove the OVERLAY_ENTRY_F_PENDING flag, and set one of
+ * OVERLAY_ENTRY_F_VALID or OVERLAY_ENTRY_F_DROP as appropriate.
+ *
+ * Once an entry is valid, it stays valid until userland tells us to
+ * invalidate it or replace it with an ioctl, OVERLAY_TARG_CACHE_REMOVE and
+ * OVERLAY_TARG_CACHE_SET respectively.
+ *
+ * If the lookup fails with a call to drop the packet, then the next state is
+ * determined by the state of the queue. If the set of outstanding entries is
+ * empty, then we just transition back to the invalid state. If instead the
+ * set of outstanding entries is not empty, then we'll queue another lookup
+ * and stay in the same state, repeating this until the number of requests is
+ * drained.
+ *
+ * The following images describe the flow of a given lookup and where the
+ * overlay_target_entry_t is at any given time.
+ *
+ *   +-------------------+
+ *   | Invalid Entry     |       An entry starts off as an invalid entry
+ *   | de:ad:be:ef:00:00 |       and only exists in the target cache.
+ *   +-------------------+
+ *
+ * ~~~~
+ *
+ *   +---------------------+
+ *   | Global list_t       |     An mblk_t comes in for an entry. We
+ *   | overlay_target_list |     append it to the overlay_target_list.
+ *   +---------------------+
+ *              |
+ *              v
+ *   +-------------------+       +-------------------+
+ *   | Pending Entry     |------>| Pending Entry     |--->...
+ *   | 42:5e:1a:10:d6:2d |       | de:ad:be:ef:00:00 |
+ *   +-------------------+       +-------------------+
+ *
+ * ~~~~
+ *
+ *   +--------------------------+
+ *   | /dev/overlay minor state |    Userland said that it would look up an
+ *   | overlay_target_hdl_t     |    entry for us. We remove it from the
+ *   +--------------------------+    global list and add it to the handle's
+ *              |                    outstanding list.
+ *              |
+ *              v
+ *   +-------------------+       +-------------------+
+ *   | Pending Entry     |------>| Pending Entry     |
+ *   | 90:b8:d0:79:02:dd |       | de:ad:be:ef:00:00 |
+ *   +-------------------+       +-------------------+
+ *
+ * ~~~~
+ *
+ *   +-------------------+
+ *   | Valid Entry       |       varpd returned an answer with
+ *   | de:ad:be:ef:00:00 |       OVERLAY_TARG_RESPOND and the target cache
+ *   | 10.169.23.42:4789 |       entry is now populated with a
+ *   +-------------------+       destination and marked as valid.
+ *
+ *
+ * The lookup mechanism is performed via a series of operations on the
+ * character pseudo-device /dev/overlay. The only thing that uses this device
+ * is the userland daemon varpd. /dev/overlay is a cloneable device; each
+ * open of it grants a new minor number which maintains its own state. We
+ * maintain this state so that if an outstanding lookup was queued to
+ * something that crashed or closed its handle without responding, we can
+ * know about it and handle it appropriately.
+ *
+ * When a lookup is first created, it's added to our global list of
+ * outstanding lookups. To service requests, userland is required to perform
+ * an ioctl to ask for a request. We will block it in the kernel for a set
+ * amount of time waiting for a request. When we give a request to a given
+ * minor instance of the device, we remove it from the global list and append
+ * the request to the device's list of outstanding entries, for the reasons
+ * we discussed above. When a lookup comes in, we give userland a smaller
+ * amount of information specific to that packet, the overlay_targ_lookup_t.
+ * It includes a request id to identify it, and then the overlay id, the
+ * varpd id, the header and packet size, the source and destination mac
+ * addresses, the SAP, and any potential VLAN header.
+ *
+ * At that point, the request stays in that outstanding list until one of two
+ * ioctls is issued: OVERLAY_TARG_RESPOND or OVERLAY_TARG_DROP. During this
+ * time, userland may also perform other operations. For example, it may use
+ * OVERLAY_TARG_PKT to get a copy of this packet so it can perform more
+ * in-depth analysis of what to do beyond what we gave it initially. This is
+ * useful for providing proxy ARP and the like. Finally, there are two other
+ * ioctls that varpd can use: OVERLAY_TARG_INJECT, which injects the
+ * non-jumbo frame packet up into that mac device, and OVERLAY_TARG_RESEND,
+ * which causes us to encapsulate and send out the packet they've given us.
+ *
+ * Finally, several ioctls are provided through the target cache to allow for
+ * interrogation and management of the cache. They allow for individual
+ * entries to be retrieved, set, or the entire table flushed. For the full
+ * set of ioctls here and what they do, take a look at
+ * uts/common/sys/overlay_target.h.
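+ *
+ * From varpd's perspective, the service loop over /dev/overlay is
+ * conceptually quite small. The sketch below is illustrative only: the ioctl
+ * used to fetch the next queued request (called OVERLAY_TARG_LOOKUP here)
+ * and the response structures are assumptions, while OVERLAY_TARG_RESPOND
+ * and OVERLAY_TARG_DROP are the ioctls described above; the authoritative
+ * definitions live in uts/common/sys/overlay_target.h:
+ *
+ *	int fd = open("/dev/overlay", O_RDWR);	(each open gets a minor)
+ *	overlay_targ_lookup_t otl;
+ *
+ *	for (;;) {
+ *		if (ioctl(fd, OVERLAY_TARG_LOOKUP, &otl) != 0)
+ *			continue;	(timed out waiting for a request)
+ *		if (plugin_lookup(&otl, &resp) == 0)
+ *			(void) ioctl(fd, OVERLAY_TARG_RESPOND, &resp);
+ *		else
+ *			(void) ioctl(fd, OVERLAY_TARG_DROP, &drop);
+ *	}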
+ *
+ * ------------------
+ * Sample Packet Flow
+ * ------------------
+ *
+ * There are a lot of pieces here; hopefully an example of how this all fits
+ * together will help clarify and elucidate what's going on. We're going to
+ * first track an outgoing packet, e.g. one that is sent from an IP interface
+ * on a VNIC on top of an overlay device, and then we'll look at what it
+ * means to respond to that.
+ *
+ *
+ *   +----------------+      +--------------+      +------------------+
+ *   | IP/DLS send    |----->| MAC sends it |----->| mblk_t reaches   |
+ *   | packet to MAC  |      | to the GLDv3 |      | overlay GLDv3 tx |
+ *   +----------------+      | VNIC device  |      | overlay_m_tx()   |
+ *                           +--------------+      +------------------+
+ *                                                          |
+ *                     . lookup        . cache              |
+ *                     . drop          . miss               v
+ *   +---------+       .  +--------+   .   +------------------+
+ *   | freemsg |<------*--| varpd  |<--*---| Lookup each mblk |
+ *   | mblk_t  |          | lookup |       | in the target    |
+ *   +---------+          | queued |       | cache            |
+ *        ^               +--------+       +------------------+
+ *        | on send           |                    |  cache
+ *        | error             |                    |  lookup
+ *        . . *               * . . lookup         * . . hit
+ *        |                   |     success        v
+ *        |                   |          +------------------+
+ *   +-----------------+      +--------->| call plugin      |
+ *   | Send out        |                 | ovpo_encap() to  |
+ *   | overlay_mux_t's |<----------------| get encap mblk_t |
+ *   | ksocket         |                 +------------------+
+ *   +-----------------+
+ *
+ * The receive end point looks a little different, and looks more like:
+ *
+ *   +------------------+     +----------------+     +-----------+
+ *   | mblk_t comes off |---->| enter netstack |---->| delivered |---+
+ *   | the physical     |     | IP stack       |     | to        |   * . . direct
+ *   | device           |     +----------------+     | ksocket   |   |     callback
+ *   +------------------+                            +-----------+   |
+ *                                                                   |
+ *                                  . overlay id                     |
+ *                                  . not found                      v
+ *   +-----------+     .     +-----------------+      +--------------------+
+ *   | freemsg   |<----*-----| call plugin     |<-----| overlay_mux_recv() |
+ *   | mblk_t    |           | ovpo_decap() to |      +--------------------+
+ *   +-----------+           | decap mblk_t    |
+ *                           +-----------------+
+ *                                   |
+ *                                   * . . overlay id
+ *                                   v     found
+ *                             +--------+      +-----------------+
+ *                             | adjust |----->| call mac_rx     |
+ *                             | mblk_t |      | on original     |
+ *                             +--------+      | decapped packet |
+ *                                             +-----------------+
+ *
+ * ------------------
+ * Netstack Awareness
+ * ------------------
+ *
+ * In the above image we note that the packet enters a netstack. Today the
+ * only netstack that it can enter is the global zone's, as the overlay
+ * driver itself is not netstack aware. What this really means is that varpd
+ * cannot run in a non-global zone and an overlay device cannot belong to a
+ * non-global zone. Non-global zones can still have a VNIC assigned to them
+ * that's been created over the overlay device, the same way they would if it
+ * had been created over an etherstub or a physical device.
+ *
+ * The majority of the work to make the driver netstack aware is
+ * straightforward; the biggest piece is to create a netstack module that
+ * allows us to hook into netstack (and thus zone) creation and destruction.
+ * From there, we need to amend the target cache lookup routines that we
+ * discussed earlier to not have a global outstanding list and a global list
+ * of handles, but rather one per netstack.
+ *
+ * For the mux, we'll need to open the ksocket in the context of the zone; we
+ * can likely do this with a properly composed credential, but we'll need to
+ * do some more work on that path. Finally, we'll want to make sure the dld
+ * ioctls are aware of the zoneid of the caller, use it appropriately, and
+ * store it in the overlay_dev_t.
+ *
+ * -----------
+ * GLDv3 Notes
+ * -----------
+ *
+ * The overlay driver implements a GLDv3 device. Some parts of GLDv3 are more
+ * relevant to us than others. For example, GLDv3 is used to toggle the
+ * device being put into and out of promiscuous mode and to program MAC
+ * addresses for unicast and multicast hardware filters. Today, an overlay
+ * device doesn't have a notion of promiscuous mode, nor does it have a
+ * notion of unicast and multicast addresses programmed into the device.
+ * Instead, for the purposes of the hardware filter, we don't do anything and
+ * just always accept new addresses being added and removed.
+ *
+ * If the GLDv3 start function has not been called, then we will not use this
+ * device for I/O purposes. Any calls to transmit or receive should be
+ * dropped, though the GLDv3 guarantees us that transmit will not be called
+ * without calling start. Similarly, once stop is called, no packets can be
+ * dealt with.
+ *
+ * Today we don't support the stat interfaces, though there's no good reason
+ * that we shouldn't assemble some of the stats based on what we have in the
+ * future.
+ *
+ * When it comes to link properties, many of the traditional link properties
+ * do not apply, and many others MAC handles for us. For example, we don't
+ * need to implement anything in overlay_m_getprop() to deal with returning
+ * the MTU, as MAC never calls into us for that. As such, there isn't much of
+ * anything to support in terms of properties.
+ *
+ * Today, we don't support any notion of hardware capabilities. However, if
+ * future NIC hardware or other changes to the system make it sensible for us
+ * to emulate logical groups, then we should do that. Even so, we still
+ * implement a capab function so that we can identify ourselves as an overlay
+ * device to the broader MAC framework. This is done mostly so that a device
+ * created on top of us can have fanout rings, as we don't try to lie about a
+ * speed for our device.
+ *
+ * The other question is what should be done for a device's MTU and margin.
+ * We set our minimum supported MTU to 576, the minimum value that an IP
+ * network may be set to, which mimics what an etherstub does. On the flip
+ * side, we have our upper bound set to 8900. This value comes from the fact
+ * that a lot of jumbo networks use 9000 as their maximum; as such, we want
+ * to reserve 100 bytes for encapsulation overhead, which isn't exactly the
+ * most accurate number, but it'll be good enough for now. Because of that,
+ * our default MTU off of these devices is 1400, as the default MTU for
+ * everything else is usually 1500, or whatever the underlying device is at;
+ * reserving 100 bytes this way is a bit simpler than asking the netstack
+ * what MTUs all of the IP interfaces are at. It also calls into question how
+ * PMTU and PMTU discovery should work here. The challenge, especially for
+ * OVERLAY_TARGET_DYNAMIC, is that the MTU to any of the destinations may
+ * vary, and it's not clear that, if you have a single bad entry, the overall
+ * MTU should be lowered. Instead, we should figure out a better way of
+ * determining these kinds of PMTU errors and appropriately alerting the
+ * administrator via FMA.
+ *
+ * Regarding margin, we allow a margin of up to VLAN_TAGSZ depending on
+ * whether or not the underlying encapsulation device supports VLAN tags. If
+ * it does, then we'll set the margin to allow for it; otherwise, we will
+ * not.
+ */ + +#include <sys/conf.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/modctl.h> +#include <sys/policy.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/ddifm.h> + +#include <sys/dls.h> +#include <sys/dld_ioc.h> +#include <sys/mac_provider.h> +#include <sys/mac_client_priv.h> +#include <sys/mac_ether.h> +#include <sys/vlan.h> + +#include <sys/overlay_impl.h> + +dev_info_t *overlay_dip; +static kmutex_t overlay_dev_lock; +static list_t overlay_dev_list; +static uint8_t overlay_macaddr[ETHERADDRL] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + +typedef enum overlay_dev_prop { + OVERLAY_DEV_P_MTU = 0, + OVERLAY_DEV_P_VNETID, + OVERLAY_DEV_P_ENCAP, + OVERLAY_DEV_P_VARPDID +} overlay_dev_prop_t; + +#define OVERLAY_DEV_NPROPS 4 +static const char *overlay_dev_props[] = { + "mtu", + "vnetid", + "encap", + "varpd/id" +}; + +#define OVERLAY_MTU_MIN 576 +#define OVERLAY_MTU_DEF 1400 +#define OVERLAY_MTU_MAX 8900 + +overlay_dev_t * +overlay_hold_by_dlid(datalink_id_t id) +{ + overlay_dev_t *o; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (id == o->odd_linkid) { + mutex_enter(&o->odd_lock); + o->odd_ref++; + mutex_exit(&o->odd_lock); + mutex_exit(&overlay_dev_lock); + return (o); + } + } + + mutex_exit(&overlay_dev_lock); + return (NULL); +} + +void +overlay_hold_rele(overlay_dev_t *odd) +{ + mutex_enter(&odd->odd_lock); + ASSERT(odd->odd_ref > 0); + odd->odd_ref--; + mutex_exit(&odd->odd_lock); +} + +void +overlay_io_start(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) + odd->odd_rxcount++; + if (flag & OVERLAY_F_IN_TX) + odd->odd_txcount++; + odd->odd_flags |= flag; +} + +void +overlay_io_done(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + boolean_t signal = B_FALSE; + + ASSERT(flag == OVERLAY_F_IN_RX || flag == OVERLAY_F_IN_TX); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + if (flag & OVERLAY_F_IN_RX) { + ASSERT(odd->odd_rxcount > 0); + odd->odd_rxcount--; + if (odd->odd_rxcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_RX; + } + } + if (flag & OVERLAY_F_IN_TX) { + ASSERT(odd->odd_txcount > 0); + odd->odd_txcount--; + if (odd->odd_txcount == 0) { + signal = B_TRUE; + odd->odd_flags &= ~OVERLAY_F_IN_TX; + } + } + + if (signal == B_TRUE) + cv_broadcast(&odd->odd_iowait); +} + +static void +overlay_io_wait(overlay_dev_t *odd, overlay_dev_flag_t flag) +{ + ASSERT((flag & ~OVERLAY_F_IOMASK) == 0); + ASSERT(MUTEX_HELD(&odd->odd_lock)); + + while (odd->odd_flags & flag) { + cv_wait(&odd->odd_iowait, &odd->odd_lock); + } +} + +void +overlay_dev_iter(overlay_dev_iter_f func, void *arg) +{ + overlay_dev_t *odd; + + mutex_enter(&overlay_dev_lock); + for (odd = list_head(&overlay_dev_list); odd != NULL; + odd = list_next(&overlay_dev_list, odd)) { + if (func(odd, arg) != 0) { + mutex_exit(&overlay_dev_lock); + return; + } + } + mutex_exit(&overlay_dev_lock); +} + +/* ARGSUSED */ +static int +overlay_m_stat(void *arg, uint_t stat, uint64_t *val) +{ + return (ENOTSUP); +} + +static int +overlay_m_start(void *arg) +{ + overlay_dev_t *odd = arg; + overlay_mux_t *mux; + int ret, domain, family, prot; + struct sockaddr_storage storage; + socklen_t slen; + + 
mutex_enter(&odd->odd_lock);
+	if ((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0) {
+		mutex_exit(&odd->odd_lock);
+		return (EAGAIN);
+	}
+	mutex_exit(&odd->odd_lock);
+
+	/*
+	 * Ask the plugin what kind of socket we need and then go get a mux
+	 * (potentially shared with other devices) that matches it.
+	 */
+	ret = odd->odd_plugin->ovp_ops->ovpo_socket(odd->odd_pvoid, &domain,
+	    &family, &prot, (struct sockaddr *)&storage, &slen);
+	if (ret != 0)
+		return (ret);
+
+	mux = overlay_mux_open(odd->odd_plugin, domain, family, prot,
+	    (struct sockaddr *)&storage, slen, &ret);
+	if (mux == NULL)
+		return (ret);
+
+	overlay_mux_add_dev(mux, odd);
+	odd->odd_mux = mux;
+	mutex_enter(&odd->odd_lock);
+	ASSERT(!(odd->odd_flags & OVERLAY_F_IN_MUX));
+	odd->odd_flags |= OVERLAY_F_IN_MUX;
+	mutex_exit(&odd->odd_lock);
+
+	return (0);
+}
+
+static void
+overlay_m_stop(void *arg)
+{
+	overlay_dev_t *odd = arg;
+
+	/*
+	 * The MAC Perimeter is held here, so we don't have to worry about
+	 * synchronizing this with respect to metadata operations.
+	 */
+	mutex_enter(&odd->odd_lock);
+	VERIFY(odd->odd_flags & OVERLAY_F_IN_MUX);
+	VERIFY(!(odd->odd_flags & OVERLAY_F_MDDROP));
+	odd->odd_flags |= OVERLAY_F_MDDROP;
+	overlay_io_wait(odd, OVERLAY_F_IOMASK);
+	mutex_exit(&odd->odd_lock);
+
+	overlay_mux_remove_dev(odd->odd_mux, odd);
+	overlay_mux_close(odd->odd_mux);
+	odd->odd_mux = NULL;
+
+	mutex_enter(&odd->odd_lock);
+	odd->odd_flags &= ~OVERLAY_F_IN_MUX;
+	odd->odd_flags &= ~OVERLAY_F_MDDROP;
+	VERIFY((odd->odd_flags & OVERLAY_F_STOPMASK) == 0);
+	mutex_exit(&odd->odd_lock);
+}
+
+/*
+ * For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_promisc(void *arg, boolean_t on)
+{
+	return (0);
+}
+
+/*
+ * For more info on this, see the big theory statement.
+ */
+/* ARGSUSED */
+static int
+overlay_m_multicast(void *arg, boolean_t add, const uint8_t *addrp)
+{
+	return (0);
+}
+
+/*
+ * For more info on this, see the big theory statement.
+ */ +/* ARGSUSED */ +static int +overlay_m_unicast(void *arg, const uint8_t *macaddr) +{ + return (0); +} + +mblk_t * +overlay_m_tx(void *arg, mblk_t *mp_chain) +{ + overlay_dev_t *odd = arg; + mblk_t *mp, *ep; + int ret; + ovep_encap_info_t einfo; + struct msghdr hdr; + + mutex_enter(&odd->odd_lock); + if ((odd->odd_flags & OVERLAY_F_MDDROP) || + !(odd->odd_flags & OVERLAY_F_IN_MUX)) { + mutex_exit(&odd->odd_lock); + freemsgchain(mp_chain); + return (NULL); + } + overlay_io_start(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + + bzero(&hdr, sizeof (struct msghdr)); + + bzero(&einfo, sizeof (ovep_encap_info_t)); + einfo.ovdi_id = odd->odd_vid; + mp = mp_chain; + while (mp != NULL) { + socklen_t slen; + struct sockaddr_storage storage; + + mp_chain = mp->b_next; + mp->b_next = NULL; + ep = NULL; + + ret = overlay_target_lookup(odd, mp, + (struct sockaddr *)&storage, &slen); + if (ret != OVERLAY_TARGET_OK) { + if (ret == OVERLAY_TARGET_DROP) + freemsg(mp); + mp = mp_chain; + continue; + } + + hdr.msg_name = &storage; + hdr.msg_namelen = slen; + + ret = odd->odd_plugin->ovp_ops->ovpo_encap(odd->odd_mh, mp, + &einfo, &ep); + if (ret != 0 || ep == NULL) { + freemsg(mp); + goto out; + } + + ep->b_cont = mp; + ret = overlay_mux_tx(odd->odd_mux, &hdr, ep); + if (ret != 0) + goto out; + + mp = mp_chain; + } + +out: + mutex_enter(&odd->odd_lock); + overlay_io_done(odd, OVERLAY_F_IN_TX); + mutex_exit(&odd->odd_lock); + return (mp_chain); +} + +/* ARGSUSED */ +static void +overlay_m_ioctl(void *arg, queue_t *q, mblk_t *mp) +{ + miocnak(q, mp, 0, ENOTSUP); +} + +/* ARGSUSED */ +static boolean_t +overlay_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) +{ + /* + * Tell MAC we're an overlay. + */ + if (cap == MAC_CAPAB_OVERLAY) + return (B_TRUE); + return (B_FALSE); +} + +/* ARGSUSED */ +static int +overlay_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + uint32_t mtu, old; + int err; + overlay_dev_t *odd = arg; + + if (pr_num != MAC_PROP_MTU) + return (ENOTSUP); + + bcopy(pr_val, &mtu, sizeof (mtu)); + if (mtu < OVERLAY_MTU_MIN || mtu > OVERLAY_MTU_MAX) + return (EINVAL); + + mutex_enter(&odd->odd_lock); + old = odd->odd_mtu; + odd->odd_mtu = mtu; + err = mac_maxsdu_update(odd->odd_mh, mtu); + if (err != 0) + odd->odd_mtu = old; + mutex_exit(&odd->odd_lock); + + return (err); +} + +/* ARGSUSED */ +static int +overlay_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + return (ENOTSUP); +} + +/* ARGSUSED */ +static void +overlay_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + if (pr_num != MAC_PROP_MTU) + return; + + mac_prop_info_set_default_uint32(prh, OVERLAY_MTU_DEF); + mac_prop_info_set_range_uint32(prh, OVERLAY_MTU_MIN, OVERLAY_MTU_MAX); +} + +static mac_callbacks_t overlay_m_callbacks = { + .mc_callbacks = (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_GETPROP | + MC_PROPINFO), + .mc_getstat = overlay_m_stat, + .mc_start = overlay_m_start, + .mc_stop = overlay_m_stop, + .mc_setpromisc = overlay_m_promisc, + .mc_multicst = overlay_m_multicast, + .mc_unicst = overlay_m_unicast, + .mc_tx = overlay_m_tx, + .mc_ioctl = overlay_m_ioctl, + .mc_getcapab = overlay_m_getcapab, + .mc_getprop = overlay_m_getprop, + .mc_setprop = overlay_m_setprop, + .mc_propinfo = overlay_m_propinfo +}; + +static boolean_t +overlay_valid_name(const char *name, size_t buflen) +{ + size_t actlen; + int err, i; + + for (i = 0; i < buflen; i++) { + if 
(name[i] == '\0')
+			break;
+	}
+
+	if (i == 0 || i == buflen)
+		return (B_FALSE);
+	actlen = i;
+	if (strchr(name, '/') != NULL)
+		return (B_FALSE);
+	if (u8_validate((char *)name, actlen, NULL,
+	    U8_VALIDATE_ENTIRE, &err) < 0)
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_create(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+	int err;
+	uint64_t maxid;
+	overlay_dev_t *odd, *o;
+	mac_register_t *mac;
+	overlay_ioc_create_t *oicp = karg;
+
+	if (overlay_valid_name(oicp->oic_encap, MAXLINKNAMELEN) == B_FALSE)
+		return (EINVAL);
+
+	odd = kmem_zalloc(sizeof (overlay_dev_t), KM_SLEEP);
+	odd->odd_linkid = oicp->oic_linkid;
+	odd->odd_plugin = overlay_plugin_lookup(oicp->oic_encap);
+	if (odd->odd_plugin == NULL) {
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (ENOENT);
+	}
+	err = odd->odd_plugin->ovp_ops->ovpo_init((overlay_handle_t)odd,
+	    &odd->odd_pvoid);
+	if (err != 0) {
+		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+		overlay_plugin_rele(odd->odd_plugin);
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (EINVAL);
+	}
+
+	/*
+	 * Make sure that our virtual network id is valid for the given plugin
+	 * that we're working with.
+	 */
+	ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+	maxid = UINT64_MAX;
+	if (odd->odd_plugin->ovp_id_size != 8)
+		maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) - 1ULL;
+	if (oicp->oic_vnetid > maxid) {
+		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+		overlay_plugin_rele(odd->odd_plugin);
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (EINVAL);
+	}
+	odd->odd_vid = oicp->oic_vnetid;
+
+	mac = mac_alloc(MAC_VERSION);
+	if (mac == NULL) {
+		odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+		overlay_plugin_rele(odd->odd_plugin);
+		kmem_free(odd, sizeof (overlay_dev_t));
+		return (EINVAL);
+	}
+
+	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+	mac->m_driver = odd;
+	mac->m_dip = overlay_dip;
+	mac->m_dst_addr = NULL;
+	mac->m_callbacks = &overlay_m_callbacks;
+	mac->m_pdata = NULL;
+	mac->m_pdata_size = 0;
+
+	mac->m_priv_props = NULL;
+
+	/* Let mac handle this itself. */
+	mac->m_instance = (uint_t)-1;
+
+	/*
+	 * There is no real source address that should be used here, but
+	 * saying that we're not Ethernet is going to cause its own problems.
+	 * At the end of the day, this is fine.
+	 */
+	mac->m_src_addr = overlay_macaddr;
+
+	/*
+	 * Start with the default MTU as the max SDU. If the MTU is changed,
+	 * the SDU will be changed to reflect that.
+	 */
+	mac->m_min_sdu = 1;
+	mac->m_max_sdu = OVERLAY_MTU_DEF;
+	mac->m_multicast_sdu = 0;
+
+	/*
+	 * The underlying device doesn't matter, instead this comes from the
+	 * encapsulation protocol and whether or not they allow VLAN tags.
+	 */
+	if (odd->odd_plugin->ovp_flags & OVEP_F_VLAN_TAG) {
+		mac->m_margin = VLAN_TAGSZ;
+	} else {
+		mac->m_margin = 0;
+	}
+
+	/*
+	 * Today, we have no MAC virtualization; it may make sense in the
+	 * future to go ahead and emulate some subset of this, but it doesn't
+	 * today.
+ */ + mac->m_v12n = MAC_VIRT_NONE; + + mutex_enter(&overlay_dev_lock); + for (o = list_head(&overlay_dev_list); o != NULL; + o = list_next(&overlay_dev_list, o)) { + if (o->odd_linkid == oicp->oic_linkid) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + + if (o->odd_vid == oicp->oic_vnetid && + o->odd_plugin == odd->odd_plugin) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (EEXIST); + } + } + + err = mac_register(mac, &odd->odd_mh); + mac_free(mac); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + err = dls_devnet_create(odd->odd_mh, odd->odd_linkid, + crgetzoneid(cred)); + if (err != 0) { + mutex_exit(&overlay_dev_lock); + (void) mac_unregister(odd->odd_mh); + odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid); + overlay_plugin_rele(odd->odd_plugin); + kmem_free(odd, sizeof (overlay_dev_t)); + return (err); + } + + mutex_init(&odd->odd_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&odd->odd_iowait, NULL, CV_DRIVER, NULL); + odd->odd_ref = 0; + odd->odd_flags = 0; + list_insert_tail(&overlay_dev_list, odd); + mutex_exit(&overlay_dev_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_activate(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp) +{ + int i, ret; + overlay_dev_t *odd; + mac_perim_handle_t mph; + overlay_ioc_activate_t *oiap = karg; + overlay_ioc_propinfo_t *infop; + overlay_ioc_prop_t *oip; + overlay_prop_handle_t phdl; + + odd = overlay_hold_by_dlid(oiap->oia_linkid); + if (odd == NULL) + return (ENOENT); + + infop = kmem_alloc(sizeof (overlay_ioc_propinfo_t), KM_SLEEP); + oip = kmem_alloc(sizeof (overlay_ioc_prop_t), KM_SLEEP); + phdl = (overlay_prop_handle_t)infop; + + mac_perim_enter_by_mh(odd->odd_mh, &mph); + mutex_enter(&odd->odd_lock); + if (odd->odd_flags & OVERLAY_F_ACTIVATED) { + mutex_exit(&odd->odd_lock); + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EEXIST); + } + mutex_exit(&odd->odd_lock); + + for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) { + const char *pname = odd->odd_plugin->ovp_props[i]; + bzero(infop, sizeof (overlay_ioc_propinfo_t)); + overlay_prop_init(phdl); + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo(pname, phdl); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + + if ((infop->oipi_prot & OVERLAY_PROP_PERM_REQ) == 0) + continue; + bzero(oip, sizeof (overlay_ioc_prop_t)); + oip->oip_size = sizeof (oip->oip_value); + ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid, + pname, oip->oip_value, &oip->oip_size); + if (ret != 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (ret); + } + if (oip->oip_size == 0) { + mac_perim_exit(mph); + overlay_hold_rele(odd); + kmem_free(infop, sizeof (overlay_ioc_propinfo_t)); + kmem_free(oip, sizeof (overlay_ioc_prop_t)); + return (EINVAL); + } + } + + mutex_enter(&odd->odd_lock); + 
if ((odd->odd_flags & OVERLAY_F_VARPD) == 0) {
+		mutex_exit(&odd->odd_lock);
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+		kmem_free(oip, sizeof (overlay_ioc_prop_t));
+		return (ENXIO);
+	}
+
+	ASSERT((odd->odd_flags & OVERLAY_F_ACTIVATED) == 0);
+	odd->odd_flags |= OVERLAY_F_ACTIVATED;
+	mutex_exit(&odd->odd_lock);
+
+	mac_perim_exit(mph);
+	overlay_hold_rele(odd);
+	kmem_free(infop, sizeof (overlay_ioc_propinfo_t));
+	kmem_free(oip, sizeof (overlay_ioc_prop_t));
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_delete(void *karg, intptr_t arg, int mode, cred_t *cred, int *rvalp)
+{
+	overlay_ioc_delete_t *oidp = karg;
+	overlay_dev_t *odd;
+	datalink_id_t tid;
+	int ret;
+
+	odd = overlay_hold_by_dlid(oidp->oid_linkid);
+	if (odd == NULL) {
+		return (ENOENT);
+	}
+
+	mutex_enter(&odd->odd_lock);
+	/* If we're not the only hold, we're busy */
+	if (odd->odd_ref != 1) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (EBUSY);
+	}
+
+	if (odd->odd_flags & OVERLAY_F_IN_MUX) {
+		mutex_exit(&odd->odd_lock);
+		overlay_hold_rele(odd);
+		return (EBUSY);
+	}
+	mutex_exit(&odd->odd_lock);
+
+	/*
+	 * To remove this, we need to first remove it from dls and then
+	 * remove it from mac. The act of removing it from mac will check if
+	 * there are devices on top of this, e.g. VNICs. If there are, then
+	 * that will fail and we'll have to go through and recreate the dls
+	 * entry. Only after mac_disable has succeeded will we go through and
+	 * actually free everything and drop the dev lock.
+	 */
+	ret = dls_devnet_destroy(odd->odd_mh, &tid, B_TRUE);
+	if (ret != 0) {
+		overlay_hold_rele(odd);
+		return (ret);
+	}
+
+	ASSERT(oidp->oid_linkid == tid);
+	ret = mac_disable(odd->odd_mh);
+	if (ret != 0) {
+		(void) dls_devnet_create(odd->odd_mh, odd->odd_linkid,
+		    crgetzoneid(cred));
+		overlay_hold_rele(odd);
+		return (ret);
+	}
+
+	overlay_target_quiesce(odd->odd_target);
+
+	mutex_enter(&overlay_dev_lock);
+	list_remove(&overlay_dev_list, odd);
+	mutex_exit(&overlay_dev_lock);
+
+	cv_destroy(&odd->odd_iowait);
+	mutex_destroy(&odd->odd_lock);
+	overlay_target_free(odd);
+	odd->odd_plugin->ovp_ops->ovpo_fini(odd->odd_pvoid);
+	overlay_plugin_rele(odd->odd_plugin);
+	kmem_free(odd, sizeof (overlay_dev_t));
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_nprops(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	overlay_dev_t *odd;
+	overlay_ioc_nprops_t *on = karg;
+
+	odd = overlay_hold_by_dlid(on->oipn_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+	on->oipn_nprops = odd->odd_plugin->ovp_nprops + OVERLAY_DEV_NPROPS;
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+static int
+overlay_propinfo_plugin_cb(overlay_plugin_t *opp, void *arg)
+{
+	overlay_prop_handle_t phdl = arg;
+	overlay_prop_set_range_str(phdl, opp->ovp_name);
+	return (0);
+}
+
+static int
+overlay_i_name_to_propid(overlay_dev_t *odd, const char *name, uint_t *id)
+{
+	int i;
+
+	for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+		if (strcmp(overlay_dev_props[i], name) == 0) {
+			*id = i;
+			return (0);
+		}
+	}
+
+	for (i = 0; i < odd->odd_plugin->ovp_nprops; i++) {
+		if (strcmp(odd->odd_plugin->ovp_props[i], name) == 0) {
+			*id = i + OVERLAY_DEV_NPROPS;
+			return (0);
+		}
+	}
+
+	return (ENOENT);
+}
+
+static void
+overlay_i_propinfo_mtu(overlay_dev_t *odd, overlay_prop_handle_t phdl)
+{
+	uint32_t def;
+	mac_propval_range_t range;
+	uint_t perm;
+
+	ASSERT(MAC_PERIM_HELD(odd->odd_mh));
+
+	bzero(&range, sizeof (mac_propval_range_t));
+	range.mpr_count = 1;
+	if
(mac_prop_info(odd->odd_mh, MAC_PROP_MTU, "mtu", &def, + sizeof (def), &range, &perm) != 0) + return; + + if (perm == MAC_PROP_PERM_READ) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + else if (perm == MAC_PROP_PERM_WRITE) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_WRITE); + else if (perm == MAC_PROP_PERM_RW) + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_default(phdl, &def, sizeof (def)); + overlay_prop_set_range_uint32(phdl, range.mpr_range_uint32[0].mpur_min, + range.mpr_range_uint32[0].mpur_max); +} + +/* ARGSUSED */ +static int +overlay_i_propinfo(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + overlay_dev_t *odd; + int ret; + mac_perim_handle_t mph; + uint_t propid = UINT_MAX; + overlay_ioc_propinfo_t *oip = karg; + overlay_prop_handle_t phdl = (overlay_prop_handle_t)oip; + + odd = overlay_hold_by_dlid(oip->oipi_linkid); + if (odd == NULL) + return (ENOENT); + + overlay_prop_init(phdl); + mac_perim_enter_by_mh(odd->odd_mh, &mph); + + /* + * If the id is -1, then the property that we're looking for is named in + * oipi_name and we should fill in its id. Otherwise, we've been given + * an id and we need to turn that into a name for our plugin's sake. The + * id is our own fabrication for property discovery. + */ + if (oip->oipi_id == -1) { + /* + * Determine if it's a known generic property or it belongs to a + * module by checking against the list of known names. + */ + oip->oipi_name[OVERLAY_PROP_NAMELEN-1] = '\0'; + if ((ret = overlay_i_name_to_propid(odd, oip->oipi_name, + &propid)) != 0) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } + oip->oipi_id = propid; + if (propid >= OVERLAY_DEV_NPROPS) { + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + oip->oipi_name, phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + + } + } else if (oip->oipi_id >= OVERLAY_DEV_NPROPS) { + uint_t id = oip->oipi_id - OVERLAY_DEV_NPROPS; + + if (id >= odd->odd_plugin->ovp_nprops) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } + ret = odd->odd_plugin->ovp_ops->ovpo_propinfo( + odd->odd_plugin->ovp_props[id], phdl); + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ret); + } else if (oip->oipi_id < -1) { + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (EINVAL); + } else { + ASSERT(oip->oipi_id < OVERLAY_DEV_NPROPS); + ASSERT(oip->oipi_id >= 0); + propid = oip->oipi_id; + (void) strlcpy(oip->oipi_name, overlay_dev_props[propid], + sizeof (oip->oipi_name)); + } + + switch (propid) { + case OVERLAY_DEV_P_MTU: + overlay_i_propinfo_mtu(odd, phdl); + break; + case OVERLAY_DEV_P_VNETID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + case OVERLAY_DEV_P_ENCAP: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_STRING); + overlay_prop_set_nodefault(phdl); + overlay_plugin_walk(overlay_propinfo_plugin_cb, phdl); + break; + case OVERLAY_DEV_P_VARPDID: + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_READ); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + overlay_prop_set_nodefault(phdl); + break; + default: + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (ENOENT); + } + + overlay_hold_rele(odd); + mac_perim_exit(mph); + return (0); +} + +/* ARGSUSED */ +static int +overlay_i_getprop(void *karg, intptr_t arg, int mode, cred_t *cred, + int *rvalp) +{ + 
int ret;
+	overlay_dev_t *odd;
+	mac_perim_handle_t mph;
+	overlay_ioc_prop_t *oip = karg;
+	uint_t propid, mtu;
+
+	odd = overlay_hold_by_dlid(oip->oip_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	oip->oip_size = OVERLAY_PROP_SIZEMAX;
+	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+	if (oip->oip_id == -1) {
+		int i;
+
+		/*
+		 * Walk the generic property names first; if the name matches
+		 * none of them, hand the request to the plugin.
+		 */
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_getprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, &oip->oip_size);
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+		}
+
+		propid = i;
+	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (EINVAL);
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_getprop(odd->odd_pvoid,
+		    odd->odd_plugin->ovp_props[id], oip->oip_value,
+		    &oip->oip_size);
+		overlay_hold_rele(odd);
+		mac_perim_exit(mph);
+		return (ret);
+	} else if (oip->oip_id < -1) {
+		overlay_hold_rele(odd);
+		mac_perim_exit(mph);
+		return (EINVAL);
+	} else {
+		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oip_id >= 0);
+		propid = oip->oip_id;
+	}
+
+	ret = 0;
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		/*
+		 * The MTU is always set and retrieved through MAC, to allow
+		 * for MAC to do whatever it wants, as really that property
+		 * belongs to MAC. This is important for things where VNICs
+		 * have a hold on the MTU.
+		 */
+		mac_sdu_get(odd->odd_mh, NULL, &mtu);
+		bcopy(&mtu, oip->oip_value, sizeof (uint_t));
+		oip->oip_size = sizeof (uint_t);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		/*
+		 * While it's read-only while inside of a mux, we're not in a
+		 * context that can guarantee that. Therefore we always grab
+		 * the overlay_dev_t's odd_lock.
+		 */
+		mutex_enter(&odd->odd_lock);
+		bcopy(&odd->odd_vid, oip->oip_value, sizeof (uint64_t));
+		mutex_exit(&odd->odd_lock);
+		oip->oip_size = sizeof (uint64_t);
+		break;
+	case OVERLAY_DEV_P_ENCAP:
+		oip->oip_size = strlcpy((char *)oip->oip_value,
+		    odd->odd_plugin->ovp_name, oip->oip_size);
+		break;
+	case OVERLAY_DEV_P_VARPDID:
+		mutex_enter(&odd->odd_lock);
+		if (odd->odd_flags & OVERLAY_F_VARPD) {
+			const uint64_t val = odd->odd_target->ott_id;
+			bcopy(&val, oip->oip_value, sizeof (uint64_t));
+			oip->oip_size = sizeof (uint64_t);
+		} else {
+			oip->oip_size = 0;
+		}
+		mutex_exit(&odd->odd_lock);
+		break;
+	default:
+		ret = ENOENT;
+	}
+
+	overlay_hold_rele(odd);
+	mac_perim_exit(mph);
+	return (ret);
+}
+
+static void
+overlay_setprop_vnetid(overlay_dev_t *odd, uint64_t vnetid)
+{
+	mutex_enter(&odd->odd_lock);
+
+	/* Simple case, not active */
+	if (!(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+		odd->odd_vid = vnetid;
+		mutex_exit(&odd->odd_lock);
+		return;
+	}
+
+	/*
+	 * In the hard case, we need to set the drop flag, quiesce I/O and
+	 * then we can go ahead and do everything.
+	 */
+	odd->odd_flags |= OVERLAY_F_MDDROP;
+	overlay_io_wait(odd, OVERLAY_F_IOMASK);
+	mutex_exit(&odd->odd_lock);
+
+	overlay_mux_remove_dev(odd->odd_mux, odd);
+	mutex_enter(&odd->odd_lock);
+	odd->odd_vid = vnetid;
+	mutex_exit(&odd->odd_lock);
+	overlay_mux_add_dev(odd->odd_mux, odd);
+
+	/*
+	 * Now that the device is back in the mux under the new id, clear the
+	 * drop flag so that I/O can flow again.
+	 */
+	mutex_enter(&odd->odd_lock);
+	ASSERT(odd->odd_flags & OVERLAY_F_IN_MUX);
+	odd->odd_flags &= ~OVERLAY_F_MDDROP;
+	mutex_exit(&odd->odd_lock);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_setprop(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	int ret;
+	overlay_dev_t *odd;
+	overlay_ioc_prop_t *oip = karg;
+	uint_t propid = UINT_MAX;
+	mac_perim_handle_t mph;
+	uint64_t maxid, *vidp;
+
+	if (oip->oip_size > OVERLAY_PROP_SIZEMAX)
+		return (EINVAL);
+
+	odd = overlay_hold_by_dlid(oip->oip_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	oip->oip_name[OVERLAY_PROP_NAMELEN-1] = '\0';
+	mac_perim_enter_by_mh(odd->odd_mh, &mph);
+	mutex_enter(&odd->odd_lock);
+	if (odd->odd_flags & OVERLAY_F_ACTIVATED) {
+		mutex_exit(&odd->odd_lock);
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (ENOTSUP);
+	}
+	mutex_exit(&odd->odd_lock);
+	if (oip->oip_id == -1) {
+		int i;
+
+		for (i = 0; i < OVERLAY_DEV_NPROPS; i++) {
+			if (strcmp(overlay_dev_props[i], oip->oip_name) == 0)
+				break;
+		}
+
+		if (i == OVERLAY_DEV_NPROPS) {
+			ret = odd->odd_plugin->ovp_ops->ovpo_setprop(
+			    odd->odd_pvoid, oip->oip_name,
+			    oip->oip_value, oip->oip_size);
+			overlay_hold_rele(odd);
+			mac_perim_exit(mph);
+			return (ret);
+		}
+
+		propid = i;
+	} else if (oip->oip_id >= OVERLAY_DEV_NPROPS) {
+		uint_t id = oip->oip_id - OVERLAY_DEV_NPROPS;
+
+		if (id >= odd->odd_plugin->ovp_nprops) {
+			mac_perim_exit(mph);
+			overlay_hold_rele(odd);
+			return (EINVAL);
+		}
+		ret = odd->odd_plugin->ovp_ops->ovpo_setprop(odd->odd_pvoid,
+		    odd->odd_plugin->ovp_props[id], oip->oip_value,
+		    oip->oip_size);
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (ret);
+	} else if (oip->oip_id < -1) {
+		mac_perim_exit(mph);
+		overlay_hold_rele(odd);
+		return (EINVAL);
+	} else {
+		ASSERT(oip->oip_id < OVERLAY_DEV_NPROPS);
+		ASSERT(oip->oip_id >= 0);
+		propid = oip->oip_id;
+	}
+
+	ret = 0;
+	switch (propid) {
+	case OVERLAY_DEV_P_MTU:
+		ret = mac_set_prop(odd->odd_mh, MAC_PROP_MTU, "mtu",
+		    oip->oip_value, oip->oip_size);
+		break;
+	case OVERLAY_DEV_P_VNETID:
+		if (oip->oip_size != sizeof (uint64_t)) {
+			ret = EINVAL;
+			break;
+		}
+		vidp = (uint64_t *)oip->oip_value;
+		ASSERT(odd->odd_plugin->ovp_id_size <= 8);
+		maxid = UINT64_MAX;
+		if (odd->odd_plugin->ovp_id_size != 8)
+			maxid = (1ULL << (odd->odd_plugin->ovp_id_size * 8)) -
+			    1ULL;
+		if (*vidp > maxid) {
+			ret = EINVAL;
+			break;
+		}
+		overlay_setprop_vnetid(odd, *vidp);
+		break;
+	case OVERLAY_DEV_P_ENCAP:
+	case OVERLAY_DEV_P_VARPDID:
+		ret = EPERM;
+		break;
+	default:
+		ret = ENOENT;
+	}
+
+	mac_perim_exit(mph);
+	overlay_hold_rele(odd);
+	return (ret);
+}
+
+/* ARGSUSED */
+static int
+overlay_i_status(void *karg, intptr_t arg, int mode, cred_t *cred,
+    int *rvalp)
+{
+	overlay_dev_t *odd;
+	overlay_ioc_status_t *os = karg;
+
+	odd = overlay_hold_by_dlid(os->ois_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	if ((odd->odd_flags & OVERLAY_F_DEGRADED) != 0) {
+		os->ois_status = OVERLAY_I_DEGRADED;
+		if (odd->odd_fmamsg != NULL) {
+			(void) strlcpy(os->ois_message, odd->odd_fmamsg,
+			    OVERLAY_STATUS_BUFLEN);
+		} else {
+			os->ois_message[0] = '\0';
+		}
+
+	} else {
+		os->ois_status = OVERLAY_I_OK;
+		os->ois_message[0] = '\0';
+	}
+	mutex_exit(&odd->odd_lock);
+
overlay_hold_rele(odd); + + return (0); +} + +static dld_ioc_info_t overlay_ioc_list[] = { + { OVERLAY_IOC_CREATE, DLDCOPYIN, sizeof (overlay_ioc_create_t), + overlay_i_create, secpolicy_dl_config }, + { OVERLAY_IOC_ACTIVATE, DLDCOPYIN, sizeof (overlay_ioc_activate_t), + overlay_i_activate, secpolicy_dl_config }, + { OVERLAY_IOC_DELETE, DLDCOPYIN, sizeof (overlay_ioc_delete_t), + overlay_i_delete, secpolicy_dl_config }, + { OVERLAY_IOC_PROPINFO, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_propinfo_t), overlay_i_propinfo, + secpolicy_dl_config }, + { OVERLAY_IOC_GETPROP, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_prop_t), overlay_i_getprop, + secpolicy_dl_config }, + { OVERLAY_IOC_SETPROP, DLDCOPYIN, + sizeof (overlay_ioc_prop_t), overlay_i_setprop, + secpolicy_dl_config }, + { OVERLAY_IOC_NPROPS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_nprops_t), overlay_i_nprops, + secpolicy_dl_config }, + { OVERLAY_IOC_STATUS, DLDCOPYIN | DLDCOPYOUT, + sizeof (overlay_ioc_status_t), overlay_i_status, + NULL } +}; + +static int +overlay_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int fmcap = DDI_FM_EREPORT_CAPABLE; + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (overlay_dip != NULL || ddi_get_instance(dip) != 0) + return (DDI_FAILURE); + + ddi_fm_init(dip, &fmcap, NULL); + + if (ddi_create_minor_node(dip, OVERLAY_CTL, S_IFCHR, + ddi_get_instance(dip), DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + if (dld_ioc_register(OVERLAY_IOC, overlay_ioc_list, + DLDIOCCNT(overlay_ioc_list)) != 0) { + ddi_remove_minor_node(dip, OVERLAY_CTL); + return (DDI_FAILURE); + } + + overlay_dip = dip; + return (DDI_SUCCESS); +} + +/* ARGSUSED */ +static int +overlay_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *resp = (void *)overlay_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *resp = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + + return (error); +} + +static int +overlay_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&overlay_dev_lock); + if (!list_is_empty(&overlay_dev_list) || overlay_target_busy()) { + mutex_exit(&overlay_dev_lock); + return (EBUSY); + } + mutex_exit(&overlay_dev_lock); + + + dld_ioc_unregister(OVERLAY_IOC); + ddi_remove_minor_node(dip, OVERLAY_CTL); + ddi_fm_fini(dip); + overlay_dip = NULL; + return (DDI_SUCCESS); +} + +static struct cb_ops overlay_cbops = { + overlay_target_open, /* cb_open */ + overlay_target_close, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + overlay_target_ioctl, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* cb_stream */ + D_MP, /* cb_flag */ + CB_REV, /* cb_rev */ + nodev, /* cb_aread */ + nodev, /* cb_awrite */ +}; + +static struct dev_ops overlay_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + overlay_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + overlay_attach, /* devo_attach */ + overlay_detach, /* devo_detach */ + nulldev, /* devo_reset */ + &overlay_cbops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + NULL, /* devo_power */ + ddi_quiesce_not_supported /* devo_quiesce */ +}; + +static struct modldrv overlay_modldrv = { + &mod_driverops, + "Overlay 
Network Driver", + &overlay_dev_ops +}; + +static struct modlinkage overlay_linkage = { + MODREV_1, + &overlay_modldrv +}; + +static int +overlay_init(void) +{ + mutex_init(&overlay_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&overlay_dev_list, sizeof (overlay_dev_t), + offsetof(overlay_dev_t, odd_link)); + overlay_mux_init(); + overlay_plugin_init(); + overlay_target_init(); + + return (DDI_SUCCESS); +} + +static void +overlay_fini(void) +{ + overlay_target_fini(); + overlay_plugin_fini(); + overlay_mux_fini(); + mutex_destroy(&overlay_dev_lock); + list_destroy(&overlay_dev_list); +} + +int +_init(void) +{ + int err; + + if ((err = overlay_init()) != DDI_SUCCESS) + return (err); + + mac_init_ops(NULL, "overlay"); + err = mod_install(&overlay_linkage); + if (err != DDI_SUCCESS) { + overlay_fini(); + return (err); + } + + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&overlay_linkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + err = mod_remove(&overlay_linkage); + if (err != 0) + return (err); + + overlay_fini(); + return (0); +} diff --git a/usr/src/uts/common/io/overlay/overlay.conf b/usr/src/uts/common/io/overlay/overlay.conf new file mode 100644 index 0000000000..4b62fafd94 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015, Joyent, Inc. +# + +name="overlay" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/overlay/overlay.mapfile b/usr/src/uts/common/io/overlay/overlay.mapfile new file mode 100644 index 0000000000..800d72dc2b --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay.mapfile @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. +# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # Encapsualation Plugin interfaces + overlay_plugin_alloc; + overlay_plugin_free; + overlay_plugin_register; + overlay_plugin_unregister; + local: + *; +}; diff --git a/usr/src/uts/common/io/overlay/overlay_fm.c b/usr/src/uts/common/io/overlay/overlay_fm.c new file mode 100644 index 0000000000..f26356ceaf --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_fm.c @@ -0,0 +1,84 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Overlay device FMA operations. + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/ddifm.h> +#include <sys/overlay_impl.h> + +kmutex_t overlay_fm_lock; +uint_t overlay_fm_count; + +void +overlay_fm_init(void) +{ + overlay_fm_count = 0; + mutex_init(&overlay_fm_lock, NULL, MUTEX_DRIVER, NULL); +} + +void +overlay_fm_fini(void) +{ + VERIFY(overlay_fm_count == 0); + mutex_destroy(&overlay_fm_lock); +} + +void +overlay_fm_degrade(overlay_dev_t *odd, const char *msg) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + + if (msg != NULL) + (void) strlcpy(odd->odd_fmamsg, msg, OVERLAY_STATUS_BUFLEN); + + if (odd->odd_flags & OVERLAY_F_DEGRADED) + goto out; + + odd->odd_flags |= OVERLAY_F_DEGRADED; + overlay_fm_count++; + if (overlay_fm_count == 1) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_DEGRADED); + mac_link_update(odd->odd_mh, LINK_STATE_DOWN); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} + +void +overlay_fm_restore(overlay_dev_t *odd) +{ + mutex_enter(&overlay_fm_lock); + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_DEGRADED)) + goto out; + + odd->odd_fmamsg[0] = '\0'; + odd->odd_flags &= ~OVERLAY_F_DEGRADED; + overlay_fm_count--; + if (overlay_fm_count == 0) { + ddi_fm_service_impact(overlay_dip, DDI_SERVICE_RESTORED); + mac_link_update(odd->odd_mh, LINK_STATE_UP); + } +out: + mutex_exit(&odd->odd_lock); + mutex_exit(&overlay_fm_lock); +} diff --git a/usr/src/uts/common/io/overlay/overlay_mux.c b/usr/src/uts/common/io/overlay/overlay_mux.c new file mode 100644 index 0000000000..9f70e8c83e --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_mux.c @@ -0,0 +1,354 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * Overlay device ksocket multiplexer. 
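+ *
+ * A mux wraps a single kernel socket (ksocket_t) that is shared by all
+ * overlay devices bound to the same plugin and underlay address. Inbound
+ * packets are demultiplexed to the owning device by looking up the
+ * decapsulated virtual network id in the per-mux AVL tree of devices.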
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ksynch.h>
+#include <sys/ksocket.h>
+#include <sys/avl.h>
+#include <sys/list.h>
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/tihdr.h>
+
+#include <sys/overlay_impl.h>
+
+#include <sys/sdt.h>
+
+#define	OVERLAY_FREEMSG(mp, reason) \
+	DTRACE_PROBE2(overlay__fremsg, mblk_t *, mp, char *, reason)
+
+static list_t overlay_mux_list;
+static kmutex_t overlay_mux_lock;
+
+void
+overlay_mux_init(void)
+{
+	list_create(&overlay_mux_list, sizeof (overlay_mux_t),
+	    offsetof(overlay_mux_t, omux_lnode));
+	mutex_init(&overlay_mux_lock, NULL, MUTEX_DRIVER, NULL);
+}
+
+void
+overlay_mux_fini(void)
+{
+	mutex_destroy(&overlay_mux_lock);
+	list_destroy(&overlay_mux_list);
+}
+
+static int
+overlay_mux_comparator(const void *a, const void *b)
+{
+	const overlay_dev_t *odl, *odr;
+	odl = a;
+	odr = b;
+	if (odl->odd_vid > odr->odd_vid)
+		return (1);
+	else if (odl->odd_vid < odr->odd_vid)
+		return (-1);
+	else
+		return (0);
+}
+
+/*
+ * This is the central receive data path. We need to decode the packet, if we
+ * can, and then deliver it to the appropriate overlay.
+ */
+/* ARGSUSED */
+static boolean_t
+overlay_mux_recv(ksocket_t ks, mblk_t *mpchain, size_t msgsize, int oob,
+    void *arg)
+{
+	mblk_t *mp, *nmp, *fmp;
+	overlay_mux_t *mux = arg;
+
+	/*
+	 * We may have received a chain of messages. Each message in the
+	 * chain will likely have a T_unitdata_ind attached to it as an
+	 * M_PROTO. If we aren't getting that, we should probably drop the
+	 * message for the moment.
+	 */
+	for (mp = mpchain; mp != NULL; mp = nmp) {
+		struct T_unitdata_ind *tudi;
+		ovep_encap_info_t infop;
+		overlay_dev_t od, *odd;
+		int ret;
+
+		nmp = mp->b_next;
+		mp->b_next = NULL;
+
+		if (DB_TYPE(mp) != M_PROTO) {
+			OVERLAY_FREEMSG(mp, "first one isn't M_PROTO");
+			freemsg(mp);
+			continue;
+		}
+
+		if (mp->b_cont == NULL) {
+			OVERLAY_FREEMSG(mp, "missing a b_cont");
+			freemsg(mp);
+			continue;
+		}
+
+		tudi = (struct T_unitdata_ind *)mp->b_rptr;
+		if (tudi->PRIM_type != T_UNITDATA_IND) {
+			OVERLAY_FREEMSG(mp, "Not a T_unitdata_ind *");
+			freemsg(mp);
+			continue;
+		}
+
+		/*
+		 * In the future, we'll care about the source information
+		 * for purposes of telling varpd about oob invalidation. But
+		 * for now, just drop that block.
+		 */
+		fmp = mp;
+		mp = fmp->b_cont;
+		fmp->b_cont = NULL;
+		freemsg(fmp);
+
+		/*
+		 * Decap and deliver.
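+		 * The plugin's decap routine only parses its encapsulation
+		 * header and fills in ovep_encap_info_t; stripping those
+		 * ovdi_hdr_size bytes, which may span multiple mblks, is our
+		 * job before the inner Ethernet frame goes up to mac_rx().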
+		 */
+		bzero(&infop, sizeof (ovep_encap_info_t));
+		ret = mux->omux_plugin->ovp_ops->ovpo_decap(NULL, mp, &infop);
+		if (ret != 0) {
+			OVERLAY_FREEMSG(mp, "decap failed");
+			freemsg(mp);
+			continue;
+		}
+		if (MBLKL(mp) > infop.ovdi_hdr_size) {
+			mp->b_rptr += infop.ovdi_hdr_size;
+		} else {
+			while (infop.ovdi_hdr_size != 0) {
+				size_t rem, blkl;
+
+				if (mp == NULL)
+					break;
+
+				blkl = MBLKL(mp);
+				rem = MIN(infop.ovdi_hdr_size, blkl);
+				infop.ovdi_hdr_size -= rem;
+				mp->b_rptr += rem;
+				if (rem == blkl) {
+					fmp = mp;
+					mp = fmp->b_cont;
+					fmp->b_cont = NULL;
+					OVERLAY_FREEMSG(mp,
+					    "freed a fmp block");
+					freemsg(fmp);
+				}
+			}
+			if (mp == NULL) {
+				OVERLAY_FREEMSG(mp, "freed it all...");
+				continue;
+			}
+		}
+
+		od.odd_vid = infop.ovdi_id;
+		mutex_enter(&mux->omux_lock);
+		odd = avl_find(&mux->omux_devices, &od, NULL);
+		if (odd == NULL) {
+			mutex_exit(&mux->omux_lock);
+			OVERLAY_FREEMSG(mp, "no matching vid");
+			freemsg(mp);
+			continue;
+		}
+		mutex_enter(&odd->odd_lock);
+		if ((odd->odd_flags & OVERLAY_F_MDDROP) ||
+		    !(odd->odd_flags & OVERLAY_F_IN_MUX)) {
+			mutex_exit(&odd->odd_lock);
+			mutex_exit(&mux->omux_lock);
+			OVERLAY_FREEMSG(mp, "dev dropped");
+			freemsg(mp);
+			continue;
+		}
+		overlay_io_start(odd, OVERLAY_F_IN_RX);
+		mutex_exit(&odd->odd_lock);
+		mutex_exit(&mux->omux_lock);
+
+		mac_rx(odd->odd_mh, NULL, mp);
+
+		mutex_enter(&odd->odd_lock);
+		overlay_io_done(odd, OVERLAY_F_IN_RX);
+		mutex_exit(&odd->odd_lock);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * Register a given device with a socket backend. If no such device socket
+ * exists, create a new one.
+ */
+overlay_mux_t *
+overlay_mux_open(overlay_plugin_t *opp, int domain, int family, int protocol,
+    struct sockaddr *addr, socklen_t len, int *errp)
+{
+	int err;
+	overlay_mux_t *mux;
+	ksocket_t ksock;
+
+	if (errp == NULL)
+		errp = &err;
+
+	mutex_enter(&overlay_mux_lock);
+	for (mux = list_head(&overlay_mux_list); mux != NULL;
+	    mux = list_next(&overlay_mux_list, mux)) {
+		if (domain == mux->omux_domain &&
+		    family == mux->omux_family &&
+		    protocol == mux->omux_protocol &&
+		    len == mux->omux_alen &&
+		    bcmp(addr, mux->omux_addr, len) == 0) {
+
+			if (opp != mux->omux_plugin) {
+				*errp = EEXIST;
+				mutex_exit(&overlay_mux_lock);
+				return (NULL);
+			}
+
+			mutex_enter(&mux->omux_lock);
+			mux->omux_count++;
+			mutex_exit(&mux->omux_lock);
+			mutex_exit(&overlay_mux_lock);
+			*errp = 0;
+			return (mux);
+		}
+	}
+
+	/*
+	 * Today we aren't zone-aware and only exist in the global zone. When
+	 * we allow for things to exist in the non-global zone, we'll want to
+	 * use a credential that's actually specific to the zone.
+	 */
+	*errp = ksocket_socket(&ksock, domain, family, protocol, KSOCKET_SLEEP,
+	    kcred);
+	if (*errp != 0) {
+		mutex_exit(&overlay_mux_lock);
+		return (NULL);
+	}
+
+	*errp = ksocket_bind(ksock, addr, len, kcred);
+	if (*errp != 0) {
+		mutex_exit(&overlay_mux_lock);
+		ksocket_close(ksock, kcred);
+		return (NULL);
+	}
+
+	/*
+	 * Ask our lower layer to optionally toggle anything they need on this
+	 * socket. Because a socket is owned by a single type of plugin, we
+	 * can then ask it to perform any additional socket setup it'd like
+	 * to do.
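+	 * For example, a UDP-based encapsulation might want larger socket
+	 * buffers than the default; a plugin with no such needs simply
+	 * leaves ovpo_sockopt unimplemented.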
+	 */
+	if (opp->ovp_ops->ovpo_sockopt != NULL &&
+	    (*errp = opp->ovp_ops->ovpo_sockopt(ksock)) != 0) {
+		mutex_exit(&overlay_mux_lock);
+		ksocket_close(ksock, kcred);
+		return (NULL);
+	}
+
+	mux = kmem_alloc(sizeof (overlay_mux_t), KM_SLEEP);
+	list_link_init(&mux->omux_lnode);
+	mux->omux_ksock = ksock;
+	mux->omux_plugin = opp;
+	mux->omux_domain = domain;
+	mux->omux_family = family;
+	mux->omux_protocol = protocol;
+	mux->omux_addr = kmem_alloc(len, KM_SLEEP);
+	bcopy(addr, mux->omux_addr, len);
+	mux->omux_alen = len;
+	mux->omux_count = 1;
+	avl_create(&mux->omux_devices, overlay_mux_comparator,
+	    sizeof (overlay_dev_t), offsetof(overlay_dev_t, odd_muxnode));
+	mutex_init(&mux->omux_lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* Once this is called, we need to expect to rx data */
+	*errp = ksocket_krecv_set(ksock, overlay_mux_recv, mux);
+	if (*errp != 0) {
+		mutex_exit(&overlay_mux_lock);
+		ksocket_close(ksock, kcred);
+		mutex_destroy(&mux->omux_lock);
+		avl_destroy(&mux->omux_devices);
+		kmem_free(mux->omux_addr, len);
+		kmem_free(mux, sizeof (overlay_mux_t));
+		return (NULL);
+	}
+
+	list_insert_tail(&overlay_mux_list, mux);
+	mutex_exit(&overlay_mux_lock);
+
+	*errp = 0;
+	return (mux);
+}
+
+void
+overlay_mux_close(overlay_mux_t *mux)
+{
+	mutex_enter(&overlay_mux_lock);
+	mutex_enter(&mux->omux_lock);
+	mux->omux_count--;
+	if (mux->omux_count != 0) {
+		mutex_exit(&mux->omux_lock);
+		mutex_exit(&overlay_mux_lock);
+		return;
+	}
+	list_remove(&overlay_mux_list, mux);
+	mutex_exit(&mux->omux_lock);
+	mutex_exit(&overlay_mux_lock);
+
+	ksocket_close(mux->omux_ksock, kcred);
+	avl_destroy(&mux->omux_devices);
+	kmem_free(mux->omux_addr, mux->omux_alen);
+	kmem_free(mux, sizeof (overlay_mux_t));
+}
+
+void
+overlay_mux_add_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+	mutex_enter(&mux->omux_lock);
+	avl_add(&mux->omux_devices, odd);
+	mutex_exit(&mux->omux_lock);
+}
+
+void
+overlay_mux_remove_dev(overlay_mux_t *mux, overlay_dev_t *odd)
+{
+	mutex_enter(&mux->omux_lock);
+	avl_remove(&mux->omux_devices, odd);
+	mutex_exit(&mux->omux_lock);
+}
+
+int
+overlay_mux_tx(overlay_mux_t *mux, struct msghdr *hdr, mblk_t *mp)
+{
+	int ret;
+
+	/*
+	 * It'd be nice to be able to use MSG_MBLK_QUICKRELE; unfortunately,
+	 * that isn't actually supported by UDP at this time.
+	 */
+	ret = ksocket_sendmblk(mux->omux_ksock, hdr, 0, &mp, kcred);
+	if (ret != 0)
+		freemsg(mp);
+
+	return (ret);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_plugin.c b/usr/src/uts/common/io/overlay/overlay_plugin.c
new file mode 100644
index 0000000000..348ddb92a2
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_plugin.c
@@ -0,0 +1,281 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */ + +/* + * Overlay device encapsulation plugin management + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/sysmacros.h> +#include <sys/modctl.h> + +#include <sys/overlay_impl.h> + +static kmem_cache_t *overlay_plugin_cache; +static kmutex_t overlay_plugin_lock; +static list_t overlay_plugin_list; + +#define OVERLAY_MODDIR "overlay" + +/* ARGSUSED */ +static int +overlay_plugin_cache_constructor(void *buf, void *arg, int kmflags) +{ + overlay_plugin_t *opp = buf; + + mutex_init(&opp->ovp_mutex, NULL, MUTEX_DRIVER, NULL); + list_link_init(&opp->ovp_link); + + return (0); +} + +/* ARGSUSED */ +static void +overlay_plugin_cache_destructor(void *buf, void *arg) +{ + overlay_plugin_t *opp = buf; + ASSERT(list_link_active(&opp->ovp_link) == 0); + mutex_destroy(&opp->ovp_mutex); +} + +void +overlay_plugin_init(void) +{ + mutex_init(&overlay_plugin_lock, NULL, MUTEX_DRIVER, 0); + + /* + * In the future we may want to have a reaper to unload unused modules + * to help the kernel be able to reclaim memory. + */ + overlay_plugin_cache = kmem_cache_create("overlay_plugin_cache", + sizeof (overlay_plugin_t), 0, overlay_plugin_cache_constructor, + overlay_plugin_cache_destructor, NULL, NULL, NULL, 0); + list_create(&overlay_plugin_list, sizeof (overlay_plugin_t), + offsetof(overlay_plugin_t, ovp_link)); +} + +void +overlay_plugin_fini(void) +{ + mutex_enter(&overlay_plugin_lock); + VERIFY(list_is_empty(&overlay_plugin_list)); + mutex_exit(&overlay_plugin_lock); + + list_destroy(&overlay_plugin_list); + kmem_cache_destroy(overlay_plugin_cache); + mutex_destroy(&overlay_plugin_lock); +} + +overlay_plugin_register_t * +overlay_plugin_alloc(uint_t version) +{ + overlay_plugin_register_t *ovrp; + /* Version 1 is the only one that exists */ + if (version != OVEP_VERSION_ONE) + return (NULL); + + ovrp = kmem_zalloc(sizeof (overlay_plugin_register_t), KM_SLEEP); + ovrp->ovep_version = version; + return (ovrp); +} + +void +overlay_plugin_free(overlay_plugin_register_t *ovrp) +{ + kmem_free(ovrp, sizeof (overlay_plugin_register_t)); +} + +int +overlay_plugin_register(overlay_plugin_register_t *ovrp) +{ + overlay_plugin_t *opp, *ipp; + + /* Sanity check parameters of the registration */ + if (ovrp->ovep_version != OVEP_VERSION_ONE) + return (EINVAL); + + if (ovrp->ovep_name == NULL || ovrp->ovep_ops == NULL) + return (EINVAL); + + if ((ovrp->ovep_flags & ~(OVEP_F_VLAN_TAG)) != 0) + return (EINVAL); + + if (ovrp->ovep_id_size < 1) + return (EINVAL); + + /* Don't support anything that has an id size larger than 8 bytes */ + if (ovrp->ovep_id_size > 8) + return (ENOTSUP); + + if (ovrp->ovep_dest == OVERLAY_PLUGIN_D_INVALID) + return (EINVAL); + + if ((ovrp->ovep_dest & ~OVERLAY_PLUGIN_D_MASK) != 0) + return (EINVAL); + + if (ovrp->ovep_ops->ovpo_callbacks != 0) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_init == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_fini == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_encap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_decap == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_socket == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_getprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_setprop == NULL) + return (EINVAL); + if (ovrp->ovep_ops->ovpo_propinfo == NULL) + return (EINVAL); + + + opp = kmem_cache_alloc(overlay_plugin_cache, KM_SLEEP); + 
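	/*
+	 * Snapshot the registration into the long-lived plugin structure.
+	 * Note that the name, ops vector, and property table are shared by
+	 * reference, not copied, so they must remain valid for as long as
+	 * the plugin stays registered. A plugin module would typically do
+	 * something like the following from its _init() routine (a sketch;
+	 * error handling elided and the vxlan names are illustrative):
+	 *
+	 *	ovrp = overlay_plugin_alloc(OVEP_VERSION_ONE);
+	 *	ovrp->ovep_name = "vxlan";
+	 *	ovrp->ovep_ops = &vxlan_o_ops;
+	 *	ovrp->ovep_id_size = 3;
+	 *	ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;
+	 *	(void) overlay_plugin_register(ovrp);
+	 *	overlay_plugin_free(ovrp);
+	 */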
+	opp->ovp_active = 0;
+	opp->ovp_name = ovrp->ovep_name;
+	opp->ovp_ops = ovrp->ovep_ops;
+	opp->ovp_props = ovrp->ovep_props;
+	opp->ovp_id_size = ovrp->ovep_id_size;
+	opp->ovp_flags = ovrp->ovep_flags;
+	opp->ovp_dest = ovrp->ovep_dest;
+
+	opp->ovp_nprops = 0;
+	if (ovrp->ovep_props != NULL) {
+		while (ovrp->ovep_props[opp->ovp_nprops] != NULL) {
+			if (strlen(ovrp->ovep_props[opp->ovp_nprops]) >=
+			    OVERLAY_PROP_NAMELEN) {
+				kmem_cache_free(overlay_plugin_cache, opp);
+				return (EINVAL);
+			}
+			opp->ovp_nprops++;
+		}
+	}
+
+	mutex_enter(&overlay_plugin_lock);
+	for (ipp = list_head(&overlay_plugin_list); ipp != NULL;
+	    ipp = list_next(&overlay_plugin_list, ipp)) {
+		if (strcmp(ipp->ovp_name, opp->ovp_name) == 0) {
+			mutex_exit(&overlay_plugin_lock);
+			kmem_cache_free(overlay_plugin_cache, opp);
+			return (EEXIST);
+		}
+	}
+	list_insert_tail(&overlay_plugin_list, opp);
+	mutex_exit(&overlay_plugin_lock);
+
+	return (0);
+}
+
+int
+overlay_plugin_unregister(const char *name)
+{
+	overlay_plugin_t *opp;
+
+	mutex_enter(&overlay_plugin_lock);
+	for (opp = list_head(&overlay_plugin_list); opp != NULL;
+	    opp = list_next(&overlay_plugin_list, opp)) {
+		if (strcmp(opp->ovp_name, name) == 0)
+			break;
+	}
+
+	if (opp == NULL) {
+		mutex_exit(&overlay_plugin_lock);
+		return (ENOENT);
+	}
+
+	mutex_enter(&opp->ovp_mutex);
+	if (opp->ovp_active > 0) {
+		mutex_exit(&opp->ovp_mutex);
+		mutex_exit(&overlay_plugin_lock);
+		return (EBUSY);
+	}
+	mutex_exit(&opp->ovp_mutex);
+
+	list_remove(&overlay_plugin_list, opp);
+	mutex_exit(&overlay_plugin_lock);
+
+	kmem_cache_free(overlay_plugin_cache, opp);
+	return (0);
+}
+
+overlay_plugin_t *
+overlay_plugin_lookup(const char *name)
+{
+	overlay_plugin_t *opp;
+	boolean_t trymodload = B_FALSE;
+
+	for (;;) {
+		mutex_enter(&overlay_plugin_lock);
+		for (opp = list_head(&overlay_plugin_list); opp != NULL;
+		    opp = list_next(&overlay_plugin_list, opp)) {
+			if (strcmp(name, opp->ovp_name) == 0) {
+				mutex_enter(&opp->ovp_mutex);
+				opp->ovp_active++;
+				mutex_exit(&opp->ovp_mutex);
+				mutex_exit(&overlay_plugin_lock);
+				return (opp);
+			}
+		}
+		mutex_exit(&overlay_plugin_lock);
+
+		if (trymodload == B_TRUE)
+			return (NULL);
+
+		/*
+		 * If we didn't find it, it may still exist, but just not have
+		 * been a loaded module. In that case, we'll do one attempt to
+		 * load it.
+		 */
+		if (modload(OVERLAY_MODDIR, (char *)name) == -1)
+			return (NULL);
+		trymodload = B_TRUE;
+	}
+}
+
+void
+overlay_plugin_rele(overlay_plugin_t *opp)
+{
+	mutex_enter(&opp->ovp_mutex);
+	ASSERT(opp->ovp_active > 0);
+	opp->ovp_active--;
+	mutex_exit(&opp->ovp_mutex);
+}
+
+void
+overlay_plugin_walk(overlay_plugin_walk_f func, void *arg)
+{
+	overlay_plugin_t *opp;
+	mutex_enter(&overlay_plugin_lock);
+	for (opp = list_head(&overlay_plugin_list); opp != NULL;
+	    opp = list_next(&overlay_plugin_list, opp)) {
+		if (func(opp, arg) != 0) {
+			mutex_exit(&overlay_plugin_lock);
+			return;
+		}
+	}
+	mutex_exit(&overlay_plugin_lock);
+}
diff --git a/usr/src/uts/common/io/overlay/overlay_prop.c b/usr/src/uts/common/io/overlay/overlay_prop.c
new file mode 100644
index 0000000000..ba1ea2a629
--- /dev/null
+++ b/usr/src/uts/common/io/overlay/overlay_prop.c
@@ -0,0 +1,122 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. + */ + +/* + * Routines for manipulating property information structures. + * + * For more information, see the big theory statement in + * uts/common/io/overlay/overlay.c + */ + +#include <sys/overlay_impl.h> + +void +overlay_prop_init(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + infop->oipi_posssize = sizeof (mac_propval_range_t); + bzero(rangep, sizeof (mac_propval_range_t)); +} + +void +overlay_prop_set_name(overlay_prop_handle_t phdl, const char *name) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + (void) strlcpy(infop->oipi_name, name, OVERLAY_PROP_NAMELEN); +} + +void +overlay_prop_set_prot(overlay_prop_handle_t phdl, overlay_prop_prot_t prot) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_prot = prot; +} + +void +overlay_prop_set_type(overlay_prop_handle_t phdl, overlay_prop_type_t type) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_type = type; +} + +int +overlay_prop_set_default(overlay_prop_handle_t phdl, void *def, ssize_t len) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + + if (len > OVERLAY_PROP_SIZEMAX) + return (E2BIG); + + if (len < 0) + return (EOVERFLOW); + + bcopy(def, infop->oipi_default, len); + infop->oipi_defsize = (uint32_t)len; + + return (0); +} + +void +overlay_prop_set_nodefault(overlay_prop_handle_t phdl) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + infop->oipi_default[0] = '\0'; + infop->oipi_defsize = 0; +} + +void +overlay_prop_set_range_uint32(overlay_prop_handle_t phdl, uint32_t min, + uint32_t max) +{ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_UINT32) + return; + + if (infop->oipi_posssize + sizeof (mac_propval_uint32_range_t) > + sizeof (infop->oipi_poss)) + return; + + infop->oipi_posssize += sizeof (mac_propval_uint32_range_t); + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_UINT32; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_min = min; + rangep->u.mpr_uint32[rangep->mpr_count-1].mpur_max = max; +} + +void +overlay_prop_set_range_str(overlay_prop_handle_t phdl, const char *str) +{ + size_t len = strlen(str) + 1; /* Account for a null terminator */ + overlay_ioc_propinfo_t *infop = (overlay_ioc_propinfo_t *)phdl; + mac_propval_range_t *rangep = (mac_propval_range_t *)infop->oipi_poss; + mac_propval_str_range_t *pstr = &rangep->u.mpr_str; + + if (rangep->mpr_count != 0 && rangep->mpr_type != MAC_PROPVAL_STR) + return; + + if (infop->oipi_posssize + len > sizeof (infop->oipi_poss)) + return; + + rangep->mpr_count++; + rangep->mpr_type = MAC_PROPVAL_STR; + strlcpy((char *)&pstr->mpur_data[pstr->mpur_nextbyte], str, + sizeof (infop->oipi_poss) - infop->oipi_posssize); + pstr->mpur_nextbyte += len; + infop->oipi_posssize += len; +} diff --git a/usr/src/uts/common/io/overlay/overlay_target.c b/usr/src/uts/common/io/overlay/overlay_target.c new file mode 100644 index 0000000000..c2da20c883 --- /dev/null +++ b/usr/src/uts/common/io/overlay/overlay_target.c @@ -0,0 +1,1662 @@ +/* + * This file 
and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+/*
+ * Overlay device target cache management
+ *
+ * For more information, see the big theory statement in
+ * uts/common/io/overlay/overlay.c
+ */
+
+#include <sys/types.h>
+#include <sys/ethernet.h>
+#include <sys/kmem.h>
+#include <sys/policy.h>
+#include <sys/sysmacros.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/mac_provider.h>
+#include <sys/mac_client.h>
+#include <sys/mac_client_priv.h>
+#include <sys/vlan.h>
+#include <sys/crc32.h>
+#include <sys/cred.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+
+#include <sys/overlay_impl.h>
+#include <sys/sdt.h>
+
+/*
+ * This is a total straw man, but at least it's a prime number. Here we're
+ * going to have to go through and do a lot of evaluation and understanding
+ * as to how these target caches should grow and shrink, as well as how to
+ * handle memory pressure and eviction. This just gives us a starting point
+ * that'll be 'good enough', until it's not.
+ */
+#define	OVERLAY_HSIZE	823
+
+/*
+ * We use this data structure to keep track of what requests have been
+ * actively allocated to a given instance so we know what to put back on the
+ * pending list.
+ */
+typedef struct overlay_target_hdl {
+	minor_t oth_minor;		/* RO */
+	zoneid_t oth_zoneid;		/* RO */
+	int oth_oflags;			/* RO */
+	list_node_t oth_link;		/* overlay_target_lock */
+	kmutex_t oth_lock;
+	list_t oth_outstanding;		/* oth_lock */
+} overlay_target_hdl_t;
+
+typedef int (*overlay_target_copyin_f)(const void *, void **, size_t *, int);
+typedef int (*overlay_target_ioctl_f)(overlay_target_hdl_t *, void *);
+typedef int (*overlay_target_copyout_f)(void *, void *, size_t, int);
+
+typedef struct overlay_target_ioctl {
+	int oti_cmd;			/* ioctl id */
+	boolean_t oti_write;		/* ioctl requires FWRITE */
+	boolean_t oti_ncopyout;		/* copyout data? */
+	overlay_target_copyin_f oti_copyin;	/* copyin func */
+	overlay_target_ioctl_f oti_func;	/* function to call */
+	overlay_target_copyout_f oti_copyout;	/* copyout func */
+	size_t oti_size;		/* size of user level structure */
+} overlay_target_ioctl_t;
+
+static kmem_cache_t *overlay_target_cache;
+static kmem_cache_t *overlay_entry_cache;
+static id_space_t *overlay_thdl_idspace;
+static void *overlay_thdl_state;
+
+/*
+ * When we support overlay devices in the NGZ, then all of these need to
+ * become zone aware, by plugging into the netstack engine and becoming
+ * per-netstack data.
+ */
+static list_t overlay_thdl_list;
+static kmutex_t overlay_target_lock;
+static kcondvar_t overlay_target_condvar;
+static list_t overlay_target_list;
+static boolean_t overlay_target_excl;
+
+/*
+ * Outstanding data per hash table entry.
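+ * While an entry is waiting on varpd to answer a lookup, packets destined
+ * to that MAC address are queued on the entry; once more than this many
+ * bytes are queued, further packets are dropped until the entry is
+ * resolved.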
+ */ +static int overlay_ent_size = 128 * 1024; + +/* ARGSUSED */ +static int +overlay_target_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_t *ott = buf; + + mutex_init(&ott->ott_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&ott->ott_cond, NULL, CV_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_target_cache_destructor(void *buf, void *arg) +{ + overlay_target_t *ott = buf; + + cv_destroy(&ott->ott_cond); + mutex_destroy(&ott->ott_lock); +} + +/* ARGSUSED */ +static int +overlay_entry_cache_constructor(void *buf, void *arg, int kmflgs) +{ + overlay_target_entry_t *ote = buf; + + bzero(ote, sizeof (overlay_target_entry_t)); + mutex_init(&ote->ote_lock, NULL, MUTEX_DRIVER, NULL); + return (0); +} + +/* ARGSUSED */ +static void +overlay_entry_cache_destructor(void *buf, void *arg) +{ + overlay_target_entry_t *ote = buf; + + mutex_destroy(&ote->ote_lock); +} + +static uint64_t +overlay_mac_hash(const void *v) +{ + uint32_t crc; + CRC32(crc, v, ETHERADDRL, -1U, crc32_table); + return (crc); +} + +static int +overlay_mac_cmp(const void *a, const void *b) +{ + return (bcmp(a, b, ETHERADDRL)); +} + +/* ARGSUSED */ +static void +overlay_target_entry_dtor(void *arg) +{ + overlay_target_entry_t *ote = arg; + + ote->ote_flags = 0; + bzero(ote->ote_addr, ETHERADDRL); + ote->ote_ott = NULL; + ote->ote_odd = NULL; + freemsgchain(ote->ote_chead); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = 0; + kmem_cache_free(overlay_entry_cache, ote); +} + +static int +overlay_mac_avl(const void *a, const void *b) +{ + int i; + const overlay_target_entry_t *l, *r; + l = a; + r = b; + + for (i = 0; i < ETHERADDRL; i++) { + if (l->ote_addr[i] > r->ote_addr[i]) + return (1); + else if (l->ote_addr[i] < r->ote_addr[i]) + return (-1); + } + + return (0); +} + +void +overlay_target_init(void) +{ + int ret; + ret = ddi_soft_state_init(&overlay_thdl_state, + sizeof (overlay_target_hdl_t), 1); + VERIFY(ret == 0); + overlay_target_cache = kmem_cache_create("overlay_target", + sizeof (overlay_target_t), 0, overlay_target_cache_constructor, + overlay_target_cache_destructor, NULL, NULL, NULL, 0); + overlay_entry_cache = kmem_cache_create("overlay_entry", + sizeof (overlay_target_entry_t), 0, overlay_entry_cache_constructor, + overlay_entry_cache_destructor, NULL, NULL, NULL, 0); + mutex_init(&overlay_target_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&overlay_target_condvar, NULL, CV_DRIVER, NULL); + list_create(&overlay_target_list, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + list_create(&overlay_thdl_list, sizeof (overlay_target_hdl_t), + offsetof(overlay_target_hdl_t, oth_link)); + overlay_thdl_idspace = id_space_create("overlay_target_minors", + 1, INT32_MAX); +} + +void +overlay_target_fini(void) +{ + id_space_destroy(overlay_thdl_idspace); + list_destroy(&overlay_thdl_list); + list_destroy(&overlay_target_list); + cv_destroy(&overlay_target_condvar); + mutex_destroy(&overlay_target_lock); + kmem_cache_destroy(overlay_entry_cache); + kmem_cache_destroy(overlay_target_cache); + ddi_soft_state_fini(&overlay_thdl_state); +} + +void +overlay_target_free(overlay_dev_t *odd) +{ + if (odd->odd_target == NULL) + return; + + if (odd->odd_target->ott_mode == OVERLAY_TARGET_DYNAMIC) { + refhash_t *rp = odd->odd_target->ott_u.ott_dyn.ott_dhash; + avl_tree_t *ap = &odd->odd_target->ott_u.ott_dyn.ott_tree; + overlay_target_entry_t *ote; + + /* + * Our AVL tree and hashtable contain the same elements, + * therefore we 
should just remove each entry from the tree, but only free entries
+	 * as they are removed from the hash table (which happens through
+	 * the refhash dtor).
+	 */
+	while ((ote = avl_first(ap)) != NULL)
+		avl_remove(ap, ote);
+
+	avl_destroy(ap);
+	for (ote = refhash_first(rp); ote != NULL;
+	    ote = refhash_next(rp, ote)) {
+		refhash_remove(rp, ote);
+	}
+	refhash_destroy(rp);
+	}
+
+	ASSERT(odd->odd_target->ott_ocount == 0);
+	kmem_cache_free(overlay_target_cache, odd->odd_target);
+}
+
+int
+overlay_target_busy()
+{
+	int ret;
+
+	mutex_enter(&overlay_target_lock);
+	ret = !list_is_empty(&overlay_thdl_list);
+	mutex_exit(&overlay_target_lock);
+
+	return (ret);
+}
+
+static void
+overlay_target_queue(overlay_target_entry_t *entry)
+{
+	mutex_enter(&overlay_target_lock);
+	mutex_enter(&entry->ote_ott->ott_lock);
+	if (entry->ote_ott->ott_flags & OVERLAY_T_TEARDOWN) {
+		mutex_exit(&entry->ote_ott->ott_lock);
+		mutex_exit(&overlay_target_lock);
+		return;
+	}
+	entry->ote_ott->ott_ocount++;
+	mutex_exit(&entry->ote_ott->ott_lock);
+	list_insert_tail(&overlay_target_list, entry);
+	cv_signal(&overlay_target_condvar);
+	mutex_exit(&overlay_target_lock);
+}
+
+void
+overlay_target_quiesce(overlay_target_t *ott)
+{
+	if (ott == NULL)
+		return;
+	mutex_enter(&ott->ott_lock);
+	ott->ott_flags |= OVERLAY_T_TEARDOWN;
+	while (ott->ott_ocount != 0)
+		cv_wait(&ott->ott_cond, &ott->ott_lock);
+	mutex_exit(&ott->ott_lock);
+}
+
+/*
+ * This function assumes that the destination mode is OVERLAY_PLUGIN_D_IP |
+ * OVERLAY_PLUGIN_D_PORT. As we don't have an implementation of anything
+ * else, say NVGRE, at this time, we drop all packets whose target doesn't
+ * match this.
+ */
+int
+overlay_target_lookup(overlay_dev_t *odd, mblk_t *mp, struct sockaddr *sock,
+    socklen_t *slenp)
+{
+	int ret;
+	struct sockaddr_in6 *v6;
+	overlay_target_t *ott;
+	mac_header_info_t mhi;
+	overlay_target_entry_t *entry;
+
+	ASSERT(odd->odd_target != NULL);
+
+	/*
+	 * At this point, the overlay device is in a mux which means that
+	 * it's been activated. At this point, parts of the target, such as
+	 * the mode and the destination, are now read-only and we don't have
+	 * to worry about synchronization for them.
+	 */
+	ott = odd->odd_target;
+	if (ott->ott_dest != (OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT))
+		return (OVERLAY_TARGET_DROP);
+
+	v6 = (struct sockaddr_in6 *)sock;
+	bzero(v6, sizeof (struct sockaddr_in6));
+	v6->sin6_family = AF_INET6;
+
+	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+		mutex_enter(&ott->ott_lock);
+		bcopy(&ott->ott_u.ott_point.otp_ip, &v6->sin6_addr,
+		    sizeof (struct in6_addr));
+		v6->sin6_port = htons(ott->ott_u.ott_point.otp_port);
+		mutex_exit(&ott->ott_lock);
+		*slenp = sizeof (struct sockaddr_in6);
+
+		return (OVERLAY_TARGET_OK);
+	}
+
+	ASSERT(ott->ott_mode == OVERLAY_TARGET_DYNAMIC);
+
+	/*
+	 * Note we only want the MAC address here, therefore we won't bother
+	 * using mac_vlan_header_info(). If any caller needs the vlan info at
+	 * this point, this should change to a call to mac_vlan_header_info().
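+	 * All the dynamic lookup below consumes is mhi_daddr, which is the
+	 * key for both the refhash and the AVL tree of target entries.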
+ */ + if (mac_header_info(odd->odd_mh, mp, &mhi) != 0) + return (OVERLAY_TARGET_DROP); + mutex_enter(&ott->ott_lock); + entry = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + mhi.mhi_daddr); + if (entry == NULL) { + entry = kmem_cache_alloc(overlay_entry_cache, + KM_NOSLEEP | KM_NORMALPRI); + if (entry == NULL) { + mutex_exit(&ott->ott_lock); + return (OVERLAY_TARGET_DROP); + } + bcopy(mhi.mhi_daddr, entry->ote_addr, ETHERADDRL); + entry->ote_chead = entry->ote_ctail = mp; + entry->ote_mbsize = msgsize(mp); + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + entry->ote_ott = ott; + entry->ote_odd = odd; + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, entry); + avl_add(&ott->ott_u.ott_dyn.ott_tree, entry); + mutex_exit(&ott->ott_lock); + overlay_target_queue(entry); + return (OVERLAY_TARGET_ASYNC); + } + refhash_hold(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + mutex_enter(&entry->ote_lock); + if (entry->ote_flags & OVERLAY_ENTRY_F_DROP) { + ret = OVERLAY_TARGET_DROP; + } else if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + bcopy(&entry->ote_dest.otp_ip, &v6->sin6_addr, + sizeof (struct in6_addr)); + v6->sin6_port = htons(entry->ote_dest.otp_port); + *slenp = sizeof (struct sockaddr_in6); + ret = OVERLAY_TARGET_OK; + } else { + size_t mlen = msgsize(mp); + + if (mlen + entry->ote_mbsize > overlay_ent_size) { + ret = OVERLAY_TARGET_DROP; + } else { + if (entry->ote_ctail != NULL) { + ASSERT(entry->ote_ctail->b_next == + NULL); + entry->ote_ctail->b_next = mp; + entry->ote_ctail = mp; + } else { + entry->ote_chead = mp; + entry->ote_ctail = mp; + } + entry->ote_mbsize += mlen; + if ((entry->ote_flags & + OVERLAY_ENTRY_F_PENDING) == 0) { + entry->ote_flags |= + OVERLAY_ENTRY_F_PENDING; + overlay_target_queue(entry); + } + ret = OVERLAY_TARGET_ASYNC; + } + } + mutex_exit(&entry->ote_lock); + + mutex_enter(&ott->ott_lock); + refhash_rele(ott->ott_u.ott_dyn.ott_dhash, entry); + mutex_exit(&ott->ott_lock); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_info(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_targ_info_t *oti = arg; + + odd = overlay_hold_by_dlid(oti->oti_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + oti->oti_flags = 0; + oti->oti_needs = odd->odd_plugin->ovp_dest; + if (odd->odd_flags & OVERLAY_F_DEGRADED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_DEGRADED; + if (odd->odd_flags & OVERLAY_F_ACTIVATED) + oti->oti_flags |= OVERLAY_TARG_INFO_F_ACTIVE; + oti->oti_vnetid = odd->odd_vid; + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_associate(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_associate_t *ota = arg; + + odd = overlay_hold_by_dlid(ota->ota_linkid); + if (odd == NULL) + return (ENOENT); + + if (ota->ota_id == 0) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode != OVERLAY_TARGET_POINT && + ota->ota_mode != OVERLAY_TARGET_DYNAMIC) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_provides != odd->odd_plugin->ovp_dest) { + overlay_hold_rele(odd); + return (EINVAL); + } + + if (ota->ota_mode == OVERLAY_TARGET_POINT) { + if (ota->ota_provides & OVERLAY_PLUGIN_D_IP) { + if (IN6_IS_ADDR_UNSPECIFIED(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4COMPAT(&ota->ota_point.otp_ip) || + IN6_IS_ADDR_V4MAPPED_ANY(&ota->ota_point.otp_ip)) { + overlay_hold_rele(odd); + return (EINVAL); + } + } + + if (ota->ota_provides & 
OVERLAY_PLUGIN_D_PORT) {
+			if (ota->ota_point.otp_port == 0) {
+				overlay_hold_rele(odd);
+				return (EINVAL);
+			}
+		}
+	}
+
+	ott = kmem_cache_alloc(overlay_target_cache, KM_SLEEP);
+	ott->ott_flags = 0;
+	ott->ott_ocount = 0;
+	ott->ott_mode = ota->ota_mode;
+	ott->ott_dest = ota->ota_provides;
+	ott->ott_id = ota->ota_id;
+
+	if (ott->ott_mode == OVERLAY_TARGET_POINT) {
+		bcopy(&ota->ota_point, &ott->ott_u.ott_point,
+		    sizeof (overlay_target_point_t));
+	} else {
+		ott->ott_u.ott_dyn.ott_dhash = refhash_create(OVERLAY_HSIZE,
+		    overlay_mac_hash, overlay_mac_cmp,
+		    overlay_target_entry_dtor, sizeof (overlay_target_entry_t),
+		    offsetof(overlay_target_entry_t, ote_reflink),
+		    offsetof(overlay_target_entry_t, ote_addr), KM_SLEEP);
+		avl_create(&ott->ott_u.ott_dyn.ott_tree, overlay_mac_avl,
+		    sizeof (overlay_target_entry_t),
+		    offsetof(overlay_target_entry_t, ote_avllink));
+	}
+	mutex_enter(&odd->odd_lock);
+	if (odd->odd_flags & OVERLAY_F_VARPD) {
+		mutex_exit(&odd->odd_lock);
+		kmem_cache_free(overlay_target_cache, ott);
+		overlay_hold_rele(odd);
+		return (EEXIST);
+	}
+
+	odd->odd_flags |= OVERLAY_F_VARPD;
+	odd->odd_target = ott;
+	mutex_exit(&odd->odd_lock);
+
+	/*
+	 * Now that we've successfully integrated ourselves here, we should
+	 * note that the link state is now up, and transition it away from
+	 * UNKNOWN.
+	 */
+	mac_link_update(odd->odd_mh, LINK_STATE_UP);
+
+	overlay_hold_rele(odd);
+
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_degrade(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_targ_degrade_t *otd = arg;
+
+	odd = overlay_hold_by_dlid(otd->otd_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	overlay_fm_degrade(odd, otd->otd_buf);
+	overlay_hold_rele(odd);
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_restore(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_targ_id_t *otid = arg;
+
+	odd = overlay_hold_by_dlid(otid->otid_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	overlay_fm_restore(odd);
+	overlay_hold_rele(odd);
+	return (0);
+}
+
+/* ARGSUSED */
+static int
+overlay_target_disassociate(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_dev_t *odd;
+	overlay_targ_id_t *otid = arg;
+
+	odd = overlay_hold_by_dlid(otid->otid_linkid);
+	if (odd == NULL)
+		return (ENOENT);
+
+	mutex_enter(&odd->odd_lock);
+	odd->odd_flags &= ~OVERLAY_F_VARPD;
+	mutex_exit(&odd->odd_lock);
+
+	/*
+	 * Without a varpd instance, we should consider ourselves link down.
+	 */
+	mac_link_update(odd->odd_mh, LINK_STATE_DOWN);
+
+	overlay_hold_rele(odd);
+	return (0);
+}
+
+static int
+overlay_target_lookup_request(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_lookup_t *otl = arg;
+	overlay_target_entry_t *entry;
+	clock_t ret, timeout;
+	mac_header_info_t mhi;
+
+	timeout = ddi_get_lbolt() + drv_usectohz(MICROSEC);
+again:
+	mutex_enter(&overlay_target_lock);
+	while (list_is_empty(&overlay_target_list)) {
+		ret = cv_timedwait(&overlay_target_condvar,
+		    &overlay_target_lock, timeout);
+		if (ret == -1) {
+			mutex_exit(&overlay_target_lock);
+			return (ETIME);
+		}
+	}
+	entry = list_remove_head(&overlay_target_list);
+	mutex_exit(&overlay_target_lock);
+	mutex_enter(&entry->ote_lock);
+	if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) {
+		ASSERT(entry->ote_chead == NULL);
+		mutex_exit(&entry->ote_lock);
+		goto again;
+	}
+	ASSERT(entry->ote_chead != NULL);
+
+	/*
+	 * If we have a bogon that doesn't have a valid mac header, drop it
+	 * and try again.
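+	 * Only the head message is dropped; if more packets remain queued
+	 * on the entry, we put it back on the request queue so that they
+	 * get another chance at resolution.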
+ */ + if (mac_vlan_header_info(entry->ote_odd->odd_mh, entry->ote_chead, + &mhi) != 0) { + boolean_t queue = B_FALSE; + mblk_t *mp = entry->ote_chead; + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + if (entry->ote_chead != NULL) + queue = B_TRUE; + mutex_exit(&entry->ote_lock); + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + goto again; + } + + otl->otl_dlid = entry->ote_odd->odd_linkid; + otl->otl_reqid = (uintptr_t)entry; + otl->otl_varpdid = entry->ote_ott->ott_id; + otl->otl_vnetid = entry->ote_odd->odd_vid; + + otl->otl_hdrsize = mhi.mhi_hdrsize; + otl->otl_pktsize = msgsize(entry->ote_chead) - otl->otl_hdrsize; + bcopy(mhi.mhi_daddr, otl->otl_dstaddr, ETHERADDRL); + bcopy(mhi.mhi_saddr, otl->otl_srcaddr, ETHERADDRL); + otl->otl_dsttype = mhi.mhi_dsttype; + otl->otl_sap = mhi.mhi_bindsap; + otl->otl_vlan = VLAN_ID(mhi.mhi_tci); + mutex_exit(&entry->ote_lock); + + mutex_enter(&thdl->oth_lock); + list_insert_tail(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + return (0); +} + +static int +overlay_target_lookup_respond(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + bcopy(&otr->otr_answer, &entry->ote_dest, + sizeof (overlay_target_point_t)); + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + entry->ote_flags |= OVERLAY_ENTRY_F_VALID; + mp = entry->ote_chead; + entry->ote_chead = NULL; + entry->ote_ctail = NULL; + entry->ote_mbsize = 0; + entry->ote_vtime = gethrtime(); + mutex_exit(&entry->ote_lock); + + /* + * For now do an in-situ drain. 
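+	 * That is, we transmit the queued chain directly from the ioctl
+	 * path; anything overlay_m_tx() cannot send comes back to us and is
+	 * freed. A worker thread may be preferable down the road.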
+ */ + mp = overlay_m_tx(entry->ote_odd, mp); + freemsgchain(mp); + + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +static int +overlay_target_lookup_drop(overlay_target_hdl_t *thdl, void *arg) +{ + const overlay_targ_resp_t *otr = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + boolean_t queue = B_FALSE; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == otr->otr_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + list_remove(&thdl->oth_outstanding, entry); + mutex_exit(&thdl->oth_lock); + + mutex_enter(&entry->ote_lock); + + /* Safeguard against a confused varpd */ + if (entry->ote_flags & OVERLAY_ENTRY_F_VALID) { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + DTRACE_PROBE1(overlay__target__valid__drop, + overlay_target_entry_t *, entry); + mutex_exit(&entry->ote_lock); + goto done; + } + + mp = entry->ote_chead; + if (mp != NULL) { + entry->ote_chead = mp->b_next; + mp->b_next = NULL; + if (entry->ote_ctail == mp) + entry->ote_ctail = entry->ote_chead; + entry->ote_mbsize -= msgsize(mp); + } + if (entry->ote_chead != NULL) { + queue = B_TRUE; + entry->ote_flags |= OVERLAY_ENTRY_F_PENDING; + } else { + entry->ote_flags &= ~OVERLAY_ENTRY_F_PENDING; + } + mutex_exit(&entry->ote_lock); + + if (queue == B_TRUE) + overlay_target_queue(entry); + freemsg(mp); + +done: + mutex_enter(&entry->ote_ott->ott_lock); + entry->ote_ott->ott_ocount--; + cv_signal(&entry->ote_ott->ott_cond); + mutex_exit(&entry->ote_ott->ott_lock); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_pkt_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_pkt_t *pkt; + overlay_targ_pkt32_t *pkt32; + + pkt = kmem_alloc(sizeof (overlay_targ_pkt_t), KM_SLEEP); + *outp = pkt; + *bsize = sizeof (overlay_targ_pkt_t); + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + uintptr_t addr; + + if (ddi_copyin(ubuf, pkt, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + pkt32 = (overlay_targ_pkt32_t *)pkt; + addr = pkt32->otp_buf; + pkt->otp_buf = (void *)addr; + } else { + if (ddi_copyin(ubuf, pkt, *bsize, flags & FKIOCTL) != 0) { + kmem_free(pkt, *bsize); + return (EFAULT); + } + } + return (0); +} + +static int +overlay_target_pkt_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + if (ddi_model_convert_from(flags & FMODELS) == DDI_MODEL_ILP32) { + overlay_targ_pkt_t *pkt = buf; + overlay_targ_pkt32_t *pkt32 = buf; + uintptr_t addr = (uintptr_t)pkt->otp_buf; + pkt32->otp_buf = (caddr32_t)addr; + if (ddi_copyout(buf, ubuf, sizeof (overlay_targ_pkt32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + } else { + if (ddi_copyout(buf, ubuf, bufsize, flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +static int +overlay_target_packet(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_targ_pkt_t *pkt = arg; + overlay_target_entry_t *entry; + mblk_t *mp; + size_t mlen; + size_t boff; + + mutex_enter(&thdl->oth_lock); + for (entry = list_head(&thdl->oth_outstanding); entry != NULL; + entry = list_next(&thdl->oth_outstanding, entry)) { + if ((uintptr_t)entry == pkt->otp_reqid) + break; + } + + if (entry == NULL) { + mutex_exit(&thdl->oth_lock); + return (EINVAL); + } + 
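	/*
+	 * Hand-over-hand: take the entry lock before dropping the handle
+	 * lock so the entry cannot be torn down out from under us.
+	 */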
+	mutex_enter(&entry->ote_lock);
+	mutex_exit(&thdl->oth_lock);
+	mp = entry->ote_chead;
+	/* Protect against a rogue varpd */
+	if (mp == NULL) {
+		mutex_exit(&entry->ote_lock);
+		return (EINVAL);
+	}
+	mlen = MIN(msgsize(mp), pkt->otp_size);
+	pkt->otp_size = mlen;
+	boff = 0;
+	while (mlen > 0) {
+		size_t wlen = MIN(MBLKL(mp), mlen);
+		if (ddi_copyout(mp->b_rptr,
+		    (void *)((uintptr_t)pkt->otp_buf + boff),
+		    wlen, 0) != 0) {
+			mutex_exit(&entry->ote_lock);
+			return (EFAULT);
+		}
+		mlen -= wlen;
+		boff += wlen;
+		mp = mp->b_cont;
+	}
+	mutex_exit(&entry->ote_lock);
+	return (0);
+}
+
+static int
+overlay_target_inject(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_pkt_t *pkt = arg;
+	overlay_target_entry_t *entry;
+	overlay_dev_t *odd;
+	mblk_t *mp;
+
+	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+		return (EINVAL);
+
+	mp = allocb(pkt->otp_size, 0);
+	if (mp == NULL)
+		return (ENOMEM);
+
+	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+		freeb(mp);
+		return (EFAULT);
+	}
+	mp->b_wptr += pkt->otp_size;
+
+	if (pkt->otp_linkid != UINT64_MAX) {
+		odd = overlay_hold_by_dlid(pkt->otp_linkid);
+		if (odd == NULL) {
+			freeb(mp);
+			return (ENOENT);
+		}
+	} else {
+		mutex_enter(&thdl->oth_lock);
+		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+		    entry = list_next(&thdl->oth_outstanding, entry)) {
+			if ((uintptr_t)entry == pkt->otp_reqid)
+				break;
+		}
+
+		if (entry == NULL) {
+			mutex_exit(&thdl->oth_lock);
+			freeb(mp);
+			return (ENOENT);
+		}
+		odd = entry->ote_odd;
+		mutex_exit(&thdl->oth_lock);
+	}
+
+	mutex_enter(&odd->odd_lock);
+	overlay_io_start(odd, OVERLAY_F_IN_RX);
+	mutex_exit(&odd->odd_lock);
+
+	mac_rx(odd->odd_mh, NULL, mp);
+
+	mutex_enter(&odd->odd_lock);
+	overlay_io_done(odd, OVERLAY_F_IN_RX);
+	mutex_exit(&odd->odd_lock);
+
+	return (0);
+}
+
+static int
+overlay_target_resend(overlay_target_hdl_t *thdl, void *arg)
+{
+	overlay_targ_pkt_t *pkt = arg;
+	overlay_target_entry_t *entry;
+	overlay_dev_t *odd;
+	mblk_t *mp;
+
+	if (pkt->otp_size > ETHERMAX + VLAN_TAGSZ)
+		return (EINVAL);
+
+	mp = allocb(pkt->otp_size, 0);
+	if (mp == NULL)
+		return (ENOMEM);
+
+	if (ddi_copyin(pkt->otp_buf, mp->b_rptr, pkt->otp_size, 0) != 0) {
+		freeb(mp);
+		return (EFAULT);
+	}
+	mp->b_wptr += pkt->otp_size;
+
+	if (pkt->otp_linkid != UINT64_MAX) {
+		odd = overlay_hold_by_dlid(pkt->otp_linkid);
+		if (odd == NULL) {
+			freeb(mp);
+			return (ENOENT);
+		}
+	} else {
+		mutex_enter(&thdl->oth_lock);
+		for (entry = list_head(&thdl->oth_outstanding); entry != NULL;
+		    entry = list_next(&thdl->oth_outstanding, entry)) {
+			if ((uintptr_t)entry == pkt->otp_reqid)
+				break;
+		}
+
+		if (entry == NULL) {
+			mutex_exit(&thdl->oth_lock);
+			freeb(mp);
+			return (ENOENT);
+		}
+		odd = entry->ote_odd;
+		mutex_exit(&thdl->oth_lock);
+	}
+
+	mp = overlay_m_tx(odd, mp);
+	freemsgchain(mp);
+
+	return (0);
+}
+
+typedef struct overlay_targ_list_int {
+	boolean_t	otli_count;
+	uint32_t	otli_cur;
+	uint32_t	otli_nents;
+	uint32_t	otli_ents[];
+} overlay_targ_list_int_t;
+
+static int
+overlay_target_list_copyin(const void *ubuf, void **outp, size_t *bsize,
+    int flags)
+{
+	overlay_targ_list_t n;
+	overlay_targ_list_int_t *otl;
+
+	if (ddi_copyin(ubuf, &n, sizeof (overlay_targ_list_t),
+	    flags & FKIOCTL) != 0)
+		return (EFAULT);
+
+	/*
+	 * Make sure that the number of entries the user claims cannot
+	 * overflow the size calculation below.
+	 */
+	if (n.otl_nents >= INT32_MAX / sizeof (uint32_t))
+		return (EINVAL);
+	*bsize = sizeof (overlay_targ_list_int_t) +
+	    sizeof (uint32_t) * n.otl_nents;
+	otl = kmem_zalloc(*bsize, KM_SLEEP);
+	otl->otli_cur = 0;
+	otl->otli_nents = n.otl_nents;
+	if
(otl->otli_nents != 0) { + otl->otli_count = B_FALSE; + if (ddi_copyin((void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + otl->otli_ents, n.otl_nents * sizeof (uint32_t), + flags & FKIOCTL) != 0) { + kmem_free(otl, *bsize); + return (EFAULT); + } + } else { + otl->otli_count = B_TRUE; + } + + *outp = otl; + return (0); +} + +static int +overlay_target_ioctl_list_cb(overlay_dev_t *odd, void *arg) +{ + overlay_targ_list_int_t *otl = arg; + + if (otl->otli_cur < otl->otli_nents) + otl->otli_ents[otl->otli_cur] = odd->odd_linkid; + otl->otli_cur++; + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_ioctl_list(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_iter(overlay_target_ioctl_list_cb, arg); + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_list_copyout(void *ubuf, void *buf, size_t bufsize, int flags) +{ + overlay_targ_list_int_t *otl = buf; + + if (ddi_copyout(&otl->otli_cur, ubuf, sizeof (uint32_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (otl->otli_count == B_FALSE) { + if (ddi_copyout(otl->otli_ents, + (void *)((uintptr_t)ubuf + + offsetof(overlay_targ_list_t, otl_ents)), + sizeof (uint32_t) * otl->otli_nents, + flags & FKIOCTL) != 0) + return (EFAULT); + } + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_get(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_POINT && + ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + otc->otc_entry.otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } else { + overlay_target_entry_t *ote; + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + if ((ote->ote_flags & + OVERLAY_ENTRY_F_VALID_MASK) != 0) { + if (ote->ote_flags & OVERLAY_ENTRY_F_DROP) { + otc->otc_entry.otce_flags = + OVERLAY_TARGET_CACHE_DROP; + } else { + otc->otc_entry.otce_flags = 0; + bcopy(&ote->ote_dest, + &otc->otc_entry.otce_dest, + sizeof (overlay_target_point_t)); + } + ret = 0; + } else { + ret = ENOENT; + } + mutex_exit(&ote->ote_lock); + } else { + ret = ENOENT; + } + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_set(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + mblk_t *mp = NULL; + + if (otc->otc_entry.otce_flags & ~OVERLAY_TARGET_CACHE_DROP) + return (EINVAL); + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + 
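	/*
+	 * Hand-over-hand: acquire the target lock before dropping odd_lock
+	 * so the target cannot change underneath us.
+	 */
+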
mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote == NULL) { + ote = kmem_cache_alloc(overlay_entry_cache, KM_SLEEP); + bcopy(otc->otc_entry.otce_mac, ote->ote_addr, ETHERADDRL); + ote->ote_chead = ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_ott = ott; + ote->ote_odd = odd; + mutex_enter(&ote->ote_lock); + refhash_insert(ott->ott_u.ott_dyn.ott_dhash, ote); + avl_add(&ott->ott_u.ott_dyn.ott_tree, ote); + } else { + mutex_enter(&ote->ote_lock); + } + + if (otc->otc_entry.otce_flags & OVERLAY_TARGET_CACHE_DROP) { + ote->ote_flags |= OVERLAY_ENTRY_F_DROP; + } else { + ote->ote_flags |= OVERLAY_ENTRY_F_VALID; + bcopy(&otc->otc_entry.otce_dest, &ote->ote_dest, + sizeof (overlay_target_point_t)); + mp = ote->ote_chead; + ote->ote_chead = NULL; + ote->ote_ctail = NULL; + ote->ote_mbsize = 0; + ote->ote_vtime = gethrtime(); + } + + mutex_exit(&ote->ote_lock); + mutex_exit(&ott->ott_lock); + + if (mp != NULL) { + mp = overlay_m_tx(ote->ote_odd, mp); + freemsgchain(mp); + } + + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_remove(overlay_target_hdl_t *thdl, void *arg) +{ + int ret = 0; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + ote = refhash_lookup(ott->ott_u.ott_dyn.ott_dhash, + otc->otc_entry.otce_mac); + if (ote != NULL) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + ret = 0; + } else { + ret = ENOENT; + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (ret); +} + +/* ARGSUSED */ +static int +overlay_target_cache_flush(overlay_target_hdl_t *thdl, void *arg) +{ + avl_tree_t *avl; + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t *ote; + overlay_targ_cache_t *otc = arg; + + odd = overlay_hold_by_dlid(otc->otc_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + avl = &ott->ott_u.ott_dyn.ott_tree; + + for (ote = avl_first(avl); ote != NULL; ote = AVL_NEXT(avl, ote)) { + mutex_enter(&ote->ote_lock); + ote->ote_flags &= ~OVERLAY_ENTRY_F_VALID_MASK; + mutex_exit(&ote->ote_lock); + } + + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +static int +overlay_target_cache_iter_copyin(const void *ubuf, void **outp, size_t *bsize, + int flags) +{ + overlay_targ_cache_iter_t base, *iter; + + if (ddi_copyin(ubuf, &base, sizeof (overlay_targ_cache_iter_t), + flags & FKIOCTL) != 0) + return (EFAULT); + + if (base.otci_count > OVERLAY_TARGET_ITER_MAX) + return (E2BIG); + + if
(base.otci_count == 0) + return (EINVAL); + + *bsize = sizeof (overlay_targ_cache_iter_t) + + base.otci_count * sizeof (overlay_targ_cache_entry_t); + iter = kmem_alloc(*bsize, KM_SLEEP); + bcopy(&base, iter, sizeof (overlay_targ_cache_iter_t)); + *outp = iter; + + return (0); +} + +typedef struct overlay_targ_cache_marker { + uint8_t otcm_mac[ETHERADDRL]; + uint16_t otcm_done; +} overlay_targ_cache_marker_t; + +/* ARGSUSED */ +static int +overlay_target_cache_iter(overlay_target_hdl_t *thdl, void *arg) +{ + overlay_dev_t *odd; + overlay_target_t *ott; + overlay_target_entry_t lookup, *ent; + overlay_targ_cache_marker_t *mark; + avl_index_t where; + avl_tree_t *avl; + uint16_t written = 0; + + overlay_targ_cache_iter_t *iter = arg; + mark = (void *)&iter->otci_marker; + + if (mark->otcm_done != 0) { + iter->otci_count = 0; + return (0); + } + + odd = overlay_hold_by_dlid(iter->otci_linkid); + if (odd == NULL) + return (ENOENT); + + mutex_enter(&odd->odd_lock); + if (!(odd->odd_flags & OVERLAY_F_VARPD)) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENXIO); + } + ott = odd->odd_target; + if (ott->ott_mode != OVERLAY_TARGET_DYNAMIC && + ott->ott_mode != OVERLAY_TARGET_POINT) { + mutex_exit(&odd->odd_lock); + overlay_hold_rele(odd); + return (ENOTSUP); + } + + /* + * Holding this lock across the entire iteration probably isn't very + * good. We should perhaps add an r/w lock for the avl tree. But we'll + * wait until we know it's necessary before we do more. + */ + mutex_enter(&ott->ott_lock); + mutex_exit(&odd->odd_lock); + + if (ott->ott_mode == OVERLAY_TARGET_POINT) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[0]; + bzero(out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + bcopy(&ott->ott_u.ott_point, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mark->otcm_done = 1; + goto done; + } + + avl = &ott->ott_u.ott_dyn.ott_tree; + bcopy(mark->otcm_mac, lookup.ote_addr, ETHERADDRL); + ent = avl_find(avl, &lookup, &where); + + /* + * A NULL ent means that the entry does not exist, so we want to start + * with the closest node in the tree. This means that we implicitly rely + * on the tree's order: the first node will be the mac 00:00:00:00:00:00 + * and the last will be ff:ff:ff:ff:ff:ff.
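The marker scheme here is worth dwelling on: rather than holding any lock between ioctls, the iterator persists only the next MAC address in a caller-visible marker and re-finds its position on each call with avl_find()/avl_nearest(). A minimal, self-contained sketch of that resume step, using illumos's AVL interfaces but with invented type and function names (an illustration, not the driver's code; the tree is assumed to be created with a comparator over ce_mac):

#include <sys/avl.h>
#include <sys/systm.h>

typedef struct cache_ent {
	uint8_t		ce_mac[6];
	avl_node_t	ce_link;
} cache_ent_t;

static uint_t
iter_fill(avl_tree_t *avl, uint8_t *mark, cache_ent_t *out, uint_t nout)
{
	cache_ent_t lookup, *ent;
	avl_index_t where;
	uint_t n = 0;

	bcopy(mark, lookup.ce_mac, sizeof (lookup.ce_mac));
	ent = avl_find(avl, &lookup, &where);
	if (ent == NULL)	/* marker not present: resume at successor */
		ent = avl_nearest(avl, where, AVL_AFTER);

	for (; ent != NULL && n < nout; ent = AVL_NEXT(avl, ent))
		out[n++] = *ent;

	if (ent != NULL)	/* remember where the next call picks up */
		bcopy(ent->ce_mac, mark, sizeof (ent->ce_mac));
	return (n);
}

Because the marker is just a key, entries inserted or removed between calls are handled gracefully: the iterator simply resumes at the nearest successor.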
+ */ + if (ent == NULL) { + ent = avl_nearest(avl, where, AVL_AFTER); + if (ent == NULL) { + mark->otcm_done = 1; + goto done; + } + } + + for (; ent != NULL && written < iter->otci_count; + ent = AVL_NEXT(avl, ent)) { + overlay_targ_cache_entry_t *out = &iter->otci_ents[written]; + mutex_enter(&ent->ote_lock); + if ((ent->ote_flags & OVERLAY_ENTRY_F_VALID_MASK) == 0) { + mutex_exit(&ent->ote_lock); + continue; + } + bcopy(ent->ote_addr, out->otce_mac, ETHERADDRL); + out->otce_flags = 0; + if (ent->ote_flags & OVERLAY_ENTRY_F_DROP) + out->otce_flags |= OVERLAY_TARGET_CACHE_DROP; + if (ent->ote_flags & OVERLAY_ENTRY_F_VALID) + bcopy(&ent->ote_dest, &out->otce_dest, + sizeof (overlay_target_point_t)); + written++; + mutex_exit(&ent->ote_lock); + } + + if (ent != NULL) { + bcopy(ent->ote_addr, mark->otcm_mac, ETHERADDRL); + } else { + mark->otcm_done = 1; + } + +done: + iter->otci_count = written; + mutex_exit(&ott->ott_lock); + overlay_hold_rele(odd); + + return (0); +} + +/* ARGSUSED */ +static int +overlay_target_cache_iter_copyout(void *ubuf, void *buf, size_t bufsize, + int flags) +{ + size_t outsize; + const overlay_targ_cache_iter_t *iter = buf; + + outsize = sizeof (overlay_targ_cache_iter_t) + + iter->otci_count * sizeof (overlay_targ_cache_entry_t); + + if (ddi_copyout(buf, ubuf, outsize, flags & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static overlay_target_ioctl_t overlay_target_ioctab[] = { + { OVERLAY_TARG_INFO, B_TRUE, B_TRUE, + NULL, overlay_target_info, + NULL, sizeof (overlay_targ_info_t) }, + { OVERLAY_TARG_ASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_associate, + NULL, sizeof (overlay_targ_associate_t) }, + { OVERLAY_TARG_DISASSOCIATE, B_TRUE, B_FALSE, + NULL, overlay_target_disassociate, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_DEGRADE, B_TRUE, B_FALSE, + NULL, overlay_target_degrade, + NULL, sizeof (overlay_targ_degrade_t) }, + { OVERLAY_TARG_RESTORE, B_TRUE, B_FALSE, + NULL, overlay_target_restore, + NULL, sizeof (overlay_targ_id_t) }, + { OVERLAY_TARG_LOOKUP, B_FALSE, B_TRUE, + NULL, overlay_target_lookup_request, + NULL, sizeof (overlay_targ_lookup_t) }, + { OVERLAY_TARG_RESPOND, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_respond, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_DROP, B_TRUE, B_FALSE, + NULL, overlay_target_lookup_drop, + NULL, sizeof (overlay_targ_resp_t) }, + { OVERLAY_TARG_PKT, B_TRUE, B_TRUE, + overlay_target_pkt_copyin, + overlay_target_packet, + overlay_target_pkt_copyout, + sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_INJECT, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_inject, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_RESEND, B_TRUE, B_FALSE, + overlay_target_pkt_copyin, + overlay_target_resend, + NULL, sizeof (overlay_targ_pkt_t) }, + { OVERLAY_TARG_LIST, B_FALSE, B_TRUE, + overlay_target_list_copyin, + overlay_target_ioctl_list, + overlay_target_list_copyout, + sizeof (overlay_targ_list_t) }, + { OVERLAY_TARG_CACHE_GET, B_FALSE, B_TRUE, + NULL, overlay_target_cache_get, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_SET, B_TRUE, B_TRUE, + NULL, overlay_target_cache_set, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_REMOVE, B_TRUE, B_TRUE, + NULL, overlay_target_cache_remove, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_FLUSH, B_TRUE, B_TRUE, + NULL, overlay_target_cache_flush, + NULL, sizeof (overlay_targ_cache_t) }, + { OVERLAY_TARG_CACHE_ITER, B_FALSE, B_TRUE, + overlay_target_cache_iter_copyin, + 
overlay_target_cache_iter, + overlay_target_cache_iter_copyout, + sizeof (overlay_targ_cache_iter_t) }, + { 0 } +}; + +int +overlay_target_open(dev_t *devp, int flags, int otype, cred_t *credp) +{ + minor_t mid; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if (getminor(*devp) != 0) + return (ENXIO); + + if (otype & OTYP_BLK) + return (EINVAL); + + if (flags & ~(FREAD | FWRITE | FEXCL)) + return (EINVAL); + + if ((flags & FWRITE) && + !(flags & FEXCL)) + return (EINVAL); + + if (!(flags & FREAD) && !(flags & FWRITE)) + return (EINVAL); + + if (crgetzoneid(credp) != GLOBAL_ZONEID) + return (EPERM); + + mid = id_alloc(overlay_thdl_idspace); + if (ddi_soft_state_zalloc(overlay_thdl_state, mid) != 0) { + id_free(overlay_thdl_idspace, mid); + return (ENXIO); + } + + thdl = ddi_get_soft_state(overlay_thdl_state, mid); + VERIFY(thdl != NULL); + thdl->oth_minor = mid; + thdl->oth_zoneid = crgetzoneid(credp); + thdl->oth_oflags = flags; + mutex_init(&thdl->oth_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&thdl->oth_outstanding, sizeof (overlay_target_entry_t), + offsetof(overlay_target_entry_t, ote_qlink)); + *devp = makedevice(getmajor(*devp), mid); + + mutex_enter(&overlay_target_lock); + if ((flags & FEXCL) && overlay_target_excl == B_TRUE) { + mutex_exit(&overlay_target_lock); + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + return (EEXIST); + } else if ((flags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_FALSE); + overlay_target_excl = B_TRUE; + } + list_insert_tail(&overlay_thdl_list, thdl); + mutex_exit(&overlay_target_lock); + + return (0); +} + +/* ARGSUSED */ +int +overlay_target_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + overlay_target_ioctl_t *ioc; + overlay_target_hdl_t *thdl; + + if (secpolicy_dl_config(credp) != 0) + return (EPERM); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, + getminor(dev))) == NULL) + return (ENXIO); + + for (ioc = &overlay_target_ioctab[0]; ioc->oti_cmd != 0; ioc++) { + int ret; + caddr_t buf; + size_t bufsize; + + if (ioc->oti_cmd != cmd) + continue; + + if (ioc->oti_write == B_TRUE && !(mode & FWRITE)) + return (EBADF); + + if (ioc->oti_copyin == NULL) { + bufsize = ioc->oti_size; + buf = kmem_alloc(bufsize, KM_SLEEP); + if (ddi_copyin((void *)(uintptr_t)arg, buf, bufsize, + mode & FKIOCTL) != 0) { + kmem_free(buf, bufsize); + return (EFAULT); + } + } else { + if ((ret = ioc->oti_copyin((void *)(uintptr_t)arg, + (void **)&buf, &bufsize, mode)) != 0) + return (ret); + } + + ret = ioc->oti_func(thdl, buf); + if (ret == 0 && ioc->oti_size != 0 && + ioc->oti_ncopyout == B_TRUE) { + if (ioc->oti_copyout == NULL) { + if (ddi_copyout(buf, (void *)(uintptr_t)arg, + bufsize, mode & FKIOCTL) != 0) + ret = EFAULT; + } else { + ret = ioc->oti_copyout((void *)(uintptr_t)arg, + buf, bufsize, mode); + } + } + + kmem_free(buf, bufsize); + return (ret); + } + + return (ENOTTY); +} + +/* ARGSUSED */ +int +overlay_target_close(dev_t dev, int flags, int otype, cred_t *credp) +{ + overlay_target_hdl_t *thdl; + overlay_target_entry_t *entry; + minor_t mid = getminor(dev); + + if ((thdl = ddi_get_soft_state(overlay_thdl_state, mid)) == NULL) + return (ENXIO); + + mutex_enter(&overlay_target_lock); + list_remove(&overlay_thdl_list, thdl); + mutex_enter(&thdl->oth_lock); + while ((entry = list_remove_head(&thdl->oth_outstanding)) != NULL) + 
list_insert_tail(&overlay_target_list, entry); + cv_signal(&overlay_target_condvar); + mutex_exit(&thdl->oth_lock); + if ((thdl->oth_oflags & FEXCL) != 0) { + VERIFY(overlay_target_excl == B_TRUE); + overlay_target_excl = B_FALSE; + } + mutex_exit(&overlay_target_lock); + + list_destroy(&thdl->oth_outstanding); + mutex_destroy(&thdl->oth_lock); + mid = thdl->oth_minor; + ddi_soft_state_free(overlay_thdl_state, mid); + id_free(overlay_thdl_idspace, mid); + + return (0); +} diff --git a/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c new file mode 100644 index 0000000000..8b4e4ecb42 --- /dev/null +++ b/usr/src/uts/common/io/overlay/plugins/overlay_vxlan.c @@ -0,0 +1,372 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +/* + * VXLAN encapsulation module + * + * + * The VXLAN header looks as follows in network byte order: + * + * |0 3| 4 |5 31| + * +----------+---+------------------------+ + * | Reserved | I | Reserved | + * +---------------------------------------+ + * | Virtual Network ID | Reserved | + * +----------------------------+----------+ + * |0 23|24 31| + * + * All reserved values must be 0. The I bit must be 1. We call the top + * word the VXLAN magic field for the time being. The second word is + * definitely not the most friendly way to operate. Specifically, the ID + * is a 24-bit big endian value, but we have to make sure not to use the + * reserved byte. + * + * For us, VXLAN encapsulation is a fairly straightforward implementation. It + * only has two properties, a listen_ip and a listen_port. These determine the + * address and port on which we should listen. While we do not have a default + * address to listen on, we do have a default port, which is the IANA assigned + * port for VXLAN -- 4789. + */ + +#include <sys/overlay_plugin.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/byteorder.h> +#include <sys/vxlan.h> +#include <inet/ip.h> +#include <netinet/in.h> +#include <sys/strsun.h> +#include <netinet/udp.h> + +static const char *vxlan_ident = "vxlan"; +static uint16_t vxlan_defport = IPPORT_VXLAN; + +/* + * Controls whether we enable UDP source port hashing for fanout.
+ */ +boolean_t vxlan_fanout = B_TRUE; + +static const char *vxlan_props[] = { + "vxlan/listen_ip", + "vxlan/listen_port", + NULL +}; + +typedef struct vxlan { + kmutex_t vxl_lock; + overlay_handle_t vxl_oh; + uint16_t vxl_lport; + boolean_t vxl_hladdr; + struct in6_addr vxl_laddr; +} vxlan_t; + +static int +vxlan_o_init(overlay_handle_t oh, void **outp) +{ + vxlan_t *vxl; + + vxl = kmem_alloc(sizeof (vxlan_t), KM_SLEEP); + *outp = vxl; + mutex_init(&vxl->vxl_lock, NULL, MUTEX_DRIVER, NULL); + vxl->vxl_oh = oh; + vxl->vxl_lport = vxlan_defport; + vxl->vxl_hladdr = B_FALSE; + + return (0); +} + +static void +vxlan_o_fini(void *arg) +{ + vxlan_t *vxl = arg; + + mutex_destroy(&vxl->vxl_lock); + kmem_free(arg, sizeof (vxlan_t)); +} + +static int +vxlan_o_socket(void *arg, int *dp, int *fp, int *pp, struct sockaddr *addr, + socklen_t *slenp) +{ + vxlan_t *vxl = arg; + struct sockaddr_in6 *in; + + in = (struct sockaddr_in6 *)addr; + *dp = AF_INET6; + *fp = SOCK_DGRAM; + *pp = 0; + bzero(in, sizeof (struct sockaddr_in6)); + in->sin6_family = AF_INET6; + + /* + * We should consider a more expressive private errno set that + * providers can use. + */ + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + mutex_exit(&vxl->vxl_lock); + return (EINVAL); + } + in->sin6_port = htons(vxl->vxl_lport); + in->sin6_addr = vxl->vxl_laddr; + mutex_exit(&vxl->vxl_lock); + *slenp = sizeof (struct sockaddr_in6); + + return (0); +} + +static int +vxlan_o_sockopt(ksocket_t ksock) +{ + int val, err; + if (vxlan_fanout == B_FALSE) + return (0); + + val = UDP_HASH_VXLAN; + err = ksocket_setsockopt(ksock, IPPROTO_UDP, UDP_SRCPORT_HASH, &val, + sizeof (val), kcred); + return (err); +} + +/* ARGSUSED */ +static int +vxlan_o_encap(void *arg, mblk_t *mp, ovep_encap_info_t *einfop, + mblk_t **outp) +{ + mblk_t *ob; + vxlan_hdr_t *vxh; + + ASSERT(einfop->ovdi_id < (1 << 24)); + + /* + * This allocation could get hot. We may want to have a good way to + * cache and handle this allocation the same way that IP does with + * keeping around a message block per entry, or treating this as an + * immutable message block in the system. freemsg() would then be a + * nop, but we would still do the right thing with respect to the rest + * of the chain.
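Since the encode/decode pair is terse, here is the same 8-byte header handling distilled into a standalone user-level sketch. The constant values are assumptions read off the header diagram above and RFC 7348: the I bit is bit 3 of the first byte (0x08000000 as a 32-bit word) and the 24-bit VNI sits above one reserved byte, hence the shift of 8:

#include <stdint.h>
#include <arpa/inet.h>

#define	VXLAN_F_VDI	0x08000000U	/* "I" bit: VNI is valid */
#define	VXLAN_ID_SHIFT	8		/* VNI sits above the reserved byte */

typedef struct vxlan_hdr {
	uint32_t vxlan_flags;
	uint32_t vxlan_id;
} vxlan_hdr_t;

static void
vxlan_encode(vxlan_hdr_t *vxh, uint32_t vni)
{
	/* caller guarantees vni fits in 24 bits */
	vxh->vxlan_flags = htonl(VXLAN_F_VDI);
	vxh->vxlan_id = htonl(vni << VXLAN_ID_SHIFT);
}

static int
vxlan_decode(const vxlan_hdr_t *vxh, uint32_t *vnip)
{
	if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0)
		return (-1);	/* I bit clear: not a valid VXLAN frame */
	*vnip = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT;
	return (0);
}

Two 32-bit loads and stores are all that is needed; the driver's vxlan_o_encap()/vxlan_o_decap() below follow the same shape, operating on an mblk instead of a bare struct.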
+ */ + ob = allocb(VXLAN_HDR_LEN, 0); + if (ob == NULL) + return (ENOMEM); + + vxh = (vxlan_hdr_t *)ob->b_rptr; + vxh->vxlan_flags = ntohl(VXLAN_F_VDI); + vxh->vxlan_id = htonl((uint32_t)einfop->ovdi_id << VXLAN_ID_SHIFT); + ob->b_wptr += VXLAN_HDR_LEN; + *outp = ob; + + return (0); +} + +/* ARGSUSED */ +static int +vxlan_o_decap(void *arg, mblk_t *mp, ovep_encap_info_t *dinfop) +{ + vxlan_hdr_t *vxh; + + if (MBLKL(mp) < sizeof (vxlan_hdr_t)) + return (EINVAL); + vxh = (vxlan_hdr_t *)mp->b_rptr; + if ((ntohl(vxh->vxlan_flags) & VXLAN_F_VDI) == 0) + return (EINVAL); + + dinfop->ovdi_id = ntohl(vxh->vxlan_id) >> VXLAN_ID_SHIFT; + dinfop->ovdi_hdr_size = VXLAN_HDR_LEN; + + return (0); +} + +static int +vxlan_o_getprop(void *arg, const char *pr_name, void *buf, uint32_t *bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + if (*bufsize < sizeof (struct in6_addr)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + if (vxl->vxl_hladdr == B_FALSE) { + *bufsize = 0; + } else { + bcopy(&vxl->vxl_laddr, buf, sizeof (struct in6_addr)); + *bufsize = sizeof (struct in6_addr); + } + mutex_exit(&vxl->vxl_lock); + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + uint64_t val; + if (*bufsize < sizeof (uint64_t)) + return (EOVERFLOW); + + mutex_enter(&vxl->vxl_lock); + val = vxl->vxl_lport; + bcopy(&val, buf, sizeof (uint64_t)); + *bufsize = sizeof (uint64_t); + mutex_exit(&vxl->vxl_lock); + return (0); + } + + return (EINVAL); +} + +static int +vxlan_o_setprop(void *arg, const char *pr_name, const void *buf, + uint32_t bufsize) +{ + vxlan_t *vxl = arg; + + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + const struct in6_addr *ipv6 = buf; + if (bufsize != sizeof (struct in6_addr)) + return (EINVAL); + + if (IN6_IS_ADDR_V4COMPAT(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_MULTICAST(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_6TO4(ipv6)) + return (EINVAL); + + if (IN6_IS_ADDR_V4MAPPED(ipv6)) { + ipaddr_t v4; + IN6_V4MAPPED_TO_IPADDR(ipv6, v4); + if (IN_MULTICAST(v4)) + return (EINVAL); + } + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_hladdr = B_TRUE; + bcopy(ipv6, &vxl->vxl_laddr, sizeof (struct in6_addr)); + mutex_exit(&vxl->vxl_lock); + + return (0); + } + + /* vxlan/listen_port */ + if (strcmp(pr_name, vxlan_props[1]) == 0) { + const uint64_t *valp = buf; + if (bufsize != 8) + return (EINVAL); + + if (*valp == 0 || *valp > UINT16_MAX) + return (EINVAL); + + mutex_enter(&vxl->vxl_lock); + vxl->vxl_lport = *valp; + mutex_exit(&vxl->vxl_lock); + return (0); + } + return (EINVAL); +} + +static int +vxlan_o_propinfo(const char *pr_name, overlay_prop_handle_t phdl) +{ + /* vxlan/listen_ip */ + if (strcmp(pr_name, vxlan_props[0]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[0]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_IP); + overlay_prop_set_nodefault(phdl); + return (0); + } + + if (strcmp(pr_name, vxlan_props[1]) == 0) { + overlay_prop_set_name(phdl, vxlan_props[1]); + overlay_prop_set_prot(phdl, OVERLAY_PROP_PERM_RRW); + overlay_prop_set_type(phdl, OVERLAY_PROP_T_UINT); + (void) overlay_prop_set_default(phdl, &vxlan_defport, + sizeof (vxlan_defport)); + overlay_prop_set_range_uint32(phdl, 1, UINT16_MAX); + return (0); + } + + return (EINVAL); +} + +static struct overlay_plugin_ops vxlan_o_ops = { + 0, + vxlan_o_init, + vxlan_o_fini, + vxlan_o_encap, + vxlan_o_decap, + vxlan_o_socket, + vxlan_o_sockopt, + vxlan_o_getprop, 
+ vxlan_o_setprop, + vxlan_o_propinfo +}; + +static struct modlmisc vxlan_modlmisc = { + &mod_miscops, + "VXLAN encap plugin" +}; + +static struct modlinkage vxlan_modlinkage = { + MODREV_1, + &vxlan_modlmisc +}; + +int +_init(void) +{ + int err; + overlay_plugin_register_t *ovrp; + + ovrp = overlay_plugin_alloc(OVEP_VERSION); + if (ovrp == NULL) + return (ENOTSUP); + ovrp->ovep_name = vxlan_ident; + ovrp->ovep_ops = &vxlan_o_ops; + ovrp->ovep_id_size = VXLAN_ID_LEN; + ovrp->ovep_flags = OVEP_F_VLAN_TAG; + ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT; + ovrp->ovep_props = vxlan_props; + + if ((err = overlay_plugin_register(ovrp)) == 0) { + if ((err = mod_install(&vxlan_modlinkage)) != 0) { + (void) overlay_plugin_unregister(vxlan_ident); + } + } + + overlay_plugin_free(ovrp); + return (err); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vxlan_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int err; + + if ((err = overlay_plugin_unregister(vxlan_ident)) != 0) + return (err); + + return (mod_remove(&vxlan_modlinkage)); +} diff --git a/usr/src/uts/common/io/pseudo.conf b/usr/src/uts/common/io/pseudo.conf index 42248e93d6..08affec609 100644 --- a/usr/src/uts/common/io/pseudo.conf +++ b/usr/src/uts/common/io/pseudo.conf @@ -22,8 +22,7 @@ # # Copyright 2003 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# -# ident "%Z%%M% %I% %E% SMI" +# Copyright 2014 Joyent, Inc. All rights reserved. # # This file is private to the pseudonex driver. It should not be edited. # @@ -38,3 +37,9 @@ name="pseudo" class="root" instance=0; # /pseudo; it has as its children the zone console pseudo nodes. # name="zconsnex" parent="/pseudo" instance=1 valid-children="zcons"; + +# +# zfdnex is an alias for pseudo; this node is instantiated as a child of +# /pseudo; it has as its children the zone fd pseudo nodes. 
+# +name="zfdnex" parent="/pseudo" instance=2 valid-children="zfd"; diff --git a/usr/src/uts/common/io/pseudonex.c b/usr/src/uts/common/io/pseudonex.c index f83b0abf39..0ae06f88cc 100644 --- a/usr/src/uts/common/io/pseudonex.c +++ b/usr/src/uts/common/io/pseudonex.c @@ -83,6 +83,8 @@ static int pseudonex_detach(dev_info_t *, ddi_detach_cmd_t); static int pseudonex_open(dev_t *, int, int, cred_t *); static int pseudonex_close(dev_t, int, int, cred_t *); static int pseudonex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static int pseudonex_fm_init(dev_info_t *, dev_info_t *, int, + ddi_iblock_cookie_t *); static int pseudonex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *, void *); @@ -90,6 +92,8 @@ static void *pseudonex_state; typedef struct pseudonex_state { dev_info_t *pnx_devi; + int pnx_fmcap; + ddi_iblock_cookie_t pnx_fm_ibc; } pseudonex_state_t; static struct bus_ops pseudonex_bus_ops = { @@ -116,7 +120,7 @@ static struct bus_ops pseudonex_bus_ops = { NULL, /* bus_intr_ctl */ NULL, /* bus_config */ NULL, /* bus_unconfig */ - NULL, /* bus_fm_init */ + pseudonex_fm_init, /* bus_fm_init */ NULL, /* bus_fm_fini */ NULL, /* bus_fm_access_enter */ NULL, /* bus_fm_access_exit */ @@ -228,6 +232,9 @@ pseudonex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) pnx_state = ddi_get_soft_state(pseudonex_state, instance); pnx_state->pnx_devi = devi; + pnx_state->pnx_fmcap = DDI_FM_EREPORT_CAPABLE; + ddi_fm_init(devi, &pnx_state->pnx_fmcap, &pnx_state->pnx_fm_ibc); + if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance, DDI_NT_NEXUS, 0) != DDI_SUCCESS) { ddi_remove_minor_node(devi, NULL); @@ -247,6 +254,10 @@ pseudonex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd) if (cmd == DDI_SUSPEND) return (DDI_SUCCESS); + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + ddi_fm_fini(devi); ddi_remove_minor_node(devi, NULL); ddi_soft_state_free(pseudonex_state, instance); return (DDI_SUCCESS); @@ -375,6 +386,19 @@ pseudonex_auto_assign(dev_info_t *child) } static int +pseudonex_fm_init(dev_info_t *dip, dev_info_t *tdip, int cap, + ddi_iblock_cookie_t *ibc) +{ + pseudonex_state_t *pnx_state; + + pnx_state = ddi_get_soft_state(pseudonex_state, ddi_get_instance(dip)); + ASSERT(pnx_state != NULL); + ASSERT(ibc != NULL); + *ibc = pnx_state->pnx_fm_ibc; + return (pnx_state->pnx_fmcap & cap); +} + +static int pseudonex_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop, void *arg, void *result) { diff --git a/usr/src/uts/common/io/ptm.c b/usr/src/uts/common/io/ptm.c index 400e9ffd10..07ffddc123 100644 --- a/usr/src/uts/common/io/ptm.c +++ b/usr/src/uts/common/io/ptm.c @@ -447,6 +447,18 @@ ptmclose(queue_t *rqp, int flag, cred_t *credp) return (0); } +static boolean_t +ptmptsopencb(ptmptsopencb_arg_t arg) +{ + struct pt_ttys *ptmp = (struct pt_ttys *)arg; + boolean_t rval; + + PT_ENTER_READ(ptmp); + rval = (ptmp->pt_nullmsg != NULL); + PT_EXIT_READ(ptmp); + return (rval); +} + /* * The wput procedure will only handle ioctl and flush messages. 
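The PTMPTSOPENCB ioctl handled in the hunk below exists so that a kernel consumer can ask the master side of a pty whether the subsidiary has been opened, and keep asking via the returned callback without re-issuing the ioctl. A sketch of such a consumer, assuming an LDI handle onto the ptm device and the standard I_STR path for STREAMS ioctls (the surrounding function and handle are invented for illustration):

/*
 * Hypothetical kernel consumer of PTMPTSOPENCB; "lh" is an LDI handle
 * onto the ptm master. Only kernel callers pass the kcred check in
 * ptmwput(), and the payload must travel as an I_STR message since the
 * handler rejects transparent ioctls.
 */
static boolean_t
pts_is_open(ldi_handle_t lh)
{
	ptmptsopencb_t ppocb;
	struct strioctl istr;
	int rval;

	istr.ic_cmd = PTMPTSOPENCB;
	istr.ic_timout = 0;
	istr.ic_len = sizeof (ppocb);
	istr.ic_dp = (char *)&ppocb;
	if (ldi_ioctl(lh, I_STR, (intptr_t)&istr, FKIOCTL, kcred,
	    &rval) != 0)
		return (B_FALSE);

	/* the callback may be invoked repeatedly after the one ioctl */
	return (ppocb.ppocb_func(ppocb.ppocb_arg));
}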
*/ @@ -574,6 +586,41 @@ ptmwput(queue_t *qp, mblk_t *mp) miocack(qp, mp, 0, 0); break; } + case PTMPTSOPENCB: + { + mblk_t *dp; /* ioctl reply data */ + ptmptsopencb_t *ppocb; + + /* only allow the kernel to invoke this ioctl */ + if (iocp->ioc_cr != kcred) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* we don't support transparent ioctls */ + ASSERT(iocp->ioc_count != TRANSPARENT); + if (iocp->ioc_count == TRANSPARENT) { + miocnak(qp, mp, 0, EINVAL); + break; + } + + /* allocate a response message */ + dp = allocb(sizeof (ptmptsopencb_t), BPRI_MED); + if (dp == NULL) { + miocnak(qp, mp, 0, EAGAIN); + break; + } + + /* initialize the ioctl results */ + ppocb = (ptmptsopencb_t *)dp->b_rptr; + ppocb->ppocb_func = ptmptsopencb; + ppocb->ppocb_arg = (ptmptsopencb_arg_t)ptmp; + + /* send the reply data */ + mioc2ack(mp, dp, sizeof (ptmptsopencb_t), 0); + qreply(qp, mp); + break; + } } break; diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c index f094c9a510..db9197ca36 100644 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c +++ b/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. * Copyright 2014 OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2014, Tegile Systems Inc. All rights reserved. */ @@ -72,6 +72,7 @@ #include <sys/file.h> #include <sys/policy.h> #include <sys/model.h> +#include <sys/refhash.h> #include <sys/sysevent.h> #include <sys/sysevent/eventdefs.h> #include <sys/sysevent/dr.h> @@ -98,7 +99,6 @@ #include <sys/scsi/adapters/mpt_sas/mptsas_var.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include <sys/scsi/adapters/mpt_sas/mptsas_smhba.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> #include <sys/raidioctl.h> #include <sys/fs/dv_node.h> /* devfs_clean */ diff --git a/usr/src/uts/common/io/scsi/targets/sd.c b/usr/src/uts/common/io/scsi/targets/sd.c index ae1e7e0fc3..dc5dc22e37 100644 --- a/usr/src/uts/common/io/scsi/targets/sd.c +++ b/usr/src/uts/common/io/scsi/targets/sd.c @@ -3503,9 +3503,13 @@ sd_set_mmc_caps(sd_ssc_t *ssc) * according to the successful response to the page * 0x2A mode sense request. */ - scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, - "sd_set_mmc_caps: Mode Sense returned " - "invalid block descriptor length\n"); + /* + * The following warning occurs due to the KVM CD-ROM + * mishandling the multi-media commands. Ignore it. + * scsi_log(SD_DEVINFO(un), sd_label, CE_WARN, + * "sd_set_mmc_caps: Mode Sense returned " + * "invalid block descriptor length\n"); + */ kmem_free(buf, BUFLEN_MODE_CDROM_CAP); return; } @@ -4450,18 +4454,77 @@ sd_sdconf_id_match(struct sd_lun *un, char *id, int idlen) { struct scsi_inquiry *sd_inq; int rval = SD_SUCCESS; + char *p; + int chk_vidlen = 0, chk_pidlen = 0; + int has_tail = 0; + static const int VSZ = sizeof (sd_inq->inq_vid); + static const int PSZ = sizeof (sd_inq->inq_pid); ASSERT(un != NULL); sd_inq = un->un_sd->sd_inq; ASSERT(id != NULL); /* - * We use the inq_vid as a pointer to a buffer containing the - * vid and pid and use the entire vid/pid length of the table - * entry for the comparison. This works because the inq_pid - * data member follows inq_vid in the scsi_inquiry structure. 
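The hunk continues below with the replacement comment and logic. As an aid to reading it, the trailing-space rule it implements can be distilled into a user-space helper like this (hypothetical and simplified to a single INQUIRY field; not the driver's code):

#include <stddef.h>
#include <strings.h>

/*
 * Simplified rendering of the sd_sdconf_id_match() rule: trim the user
 * ID's trailing spaces, compare case-insensitively up to the shortened
 * length, and if the user ID had a blank tail, require that the INQUIRY
 * bytes past that length be spaces or NULs.
 */
static int
id_field_match(const char *inq, size_t inqlen, const char *id, size_t idlen)
{
	size_t chk = (idlen < inqlen) ? idlen : inqlen;
	size_t i;

	while (chk > 0 && id[chk - 1] == ' ')	/* trim trailing spaces */
		chk--;
	if (chk == 0)				/* all spaces: keep full length */
		chk = (idlen < inqlen) ? idlen : inqlen;

	if (strncasecmp(inq, id, chk) != 0)
		return (0);

	if (idlen > chk) {			/* user ID had a blank tail */
		for (i = chk; i < inqlen; i++) {
			if (inq[i] != ' ' && inq[i] != '\0')
				return (0);
		}
	}
	return (1);
}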
+ * We would like to use the inq_vid as a pointer to a buffer + * containing the vid and pid and use the entire vid/pid length of + * the table entry for the comparison. However, this does not work + * because, while the inq_pid data member follows inq_vid in the + * scsi_inquiry structure, we do not control the contents of this + * buffer, and some broken devices violate SPC 4.3.1 and return + * fields with null bytes in them. + */ + chk_vidlen = MIN(VSZ, idlen); + p = id + chk_vidlen - 1; + while (*p == ' ' && chk_vidlen > 0) { + --p; + --chk_vidlen; + } + + /* + * If it's all spaces, check the whole thing. */ - if (strncasecmp(sd_inq->inq_vid, id, idlen) != 0) { + if (chk_vidlen == 0) + chk_vidlen = MIN(VSZ, idlen); + + if (idlen > VSZ) { + chk_pidlen = idlen - VSZ; + p = id + idlen - 1; + while (*p == ' ' && chk_pidlen > 0) { + --p; + --chk_pidlen; + } + if (chk_pidlen == 0) + chk_pidlen = MIN(PSZ, idlen - VSZ); + } + + /* + * There's one more thing we need to do here. If the user specified + * an ID with trailing spaces, we need to make sure the inquiry + * vid/pid has only spaces or NULs after the check length; otherwise, it + * can't match. + */ + if (idlen > chk_vidlen && chk_vidlen < VSZ) { + for (p = sd_inq->inq_vid + chk_vidlen; + p < sd_inq->inq_vid + VSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + if (idlen > chk_pidlen + VSZ && chk_pidlen < PSZ) { + for (p = sd_inq->inq_pid + chk_pidlen; + p < sd_inq->inq_pid + PSZ; ++p) { + if (*p != ' ' && *p != '\0') { + ++has_tail; + break; + } + } + } + + if (has_tail || strncasecmp(sd_inq->inq_vid, id, chk_vidlen) != 0 || + (idlen > VSZ && + strncasecmp(sd_inq->inq_pid, id + VSZ, chk_pidlen) != 0)) { /* * The user id string is compared to the inquiry vid/pid * using a case insensitive comparison and ignoring diff --git a/usr/src/uts/common/io/timerfd.c b/usr/src/uts/common/io/timerfd.c new file mode 100644 index 0000000000..2f59c0f3e2 --- /dev/null +++ b/usr/src/uts/common/io/timerfd.c @@ -0,0 +1,548 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ + +/* + * Support for the timerfd facility, a Linux-borne facility that allows + * POSIX.1b timers to be created and manipulated via a file descriptor + * interface. + */ + +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/timerfd.h> +#include <sys/conf.h> +#include <sys/vmem.h> +#include <sys/sysmacros.h> +#include <sys/filio.h> +#include <sys/stat.h> +#include <sys/file.h> +#include <sys/timer.h> + +struct timerfd_state; +typedef struct timerfd_state timerfd_state_t; + +struct timerfd_state { + kmutex_t tfd_lock; /* lock protecting state */ + kcondvar_t tfd_cv; /* condvar */ + pollhead_t tfd_pollhd; /* poll head */ + uint64_t tfd_fired; /* # of times fired */ + itimer_t tfd_itimer; /* underlying itimer */ + timerfd_state_t *tfd_next; /* next state on global list */ +}; + +/* + * Internal global variables. 
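Before the driver internals that follow, it may help to see how a consumer drives this device end to end. A hedged user-level sketch: the TIMERFDIOC_* commands and TIMERFD_MONOTONIC are taken from this patch, but the exact field widths of timerfd_settime_t are an assumption here, hence the casts:

#include <sys/types.h>
#include <sys/timerfd.h>
#include <fcntl.h>
#include <stropts.h>
#include <unistd.h>

/*
 * Roughly what a timerfd_create()/timerfd_settime() wrapper reduces to;
 * error handling elided for brevity.
 */
int
main(void)
{
	itimerspec_t when;
	timerfd_settime_t st;
	uint64_t fired;
	int fd = open("/dev/timerfd", O_RDWR);

	(void) ioctl(fd, TIMERFDIOC_CREATE, TIMERFD_MONOTONIC);

	when.it_value.tv_sec = 1;	/* first expiry after one second */
	when.it_value.tv_nsec = 0;
	when.it_interval.tv_sec = 1;	/* then every second */
	when.it_interval.tv_nsec = 0;
	st.tfd_settime_flags = 0;	/* relative, not TFD_TIMER_ABSTIME */
	st.tfd_settime_value = (uintptr_t)&when;
	st.tfd_settime_ovalue = (uintptr_t)NULL;
	(void) ioctl(fd, TIMERFDIOC_SETTIME, &st);

	/* blocks, then reports expirations since the last read */
	(void) read(fd, &fired, sizeof (fired));
	(void) close(fd);
	return (0);
}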
+ */ +static kmutex_t timerfd_lock; /* lock protecting state */ +static dev_info_t *timerfd_devi; /* device info */ +static vmem_t *timerfd_minor; /* minor number arena */ +static void *timerfd_softstate; /* softstate pointer */ +static timerfd_state_t *timerfd_state; /* global list of state */ + +static itimer_t * +timerfd_itimer_lock(timerfd_state_t *state) +{ + itimer_t *it = &state->tfd_itimer; + + mutex_enter(&state->tfd_lock); + + while (it->it_lock & ITLK_LOCKED) { + it->it_blockers++; + cv_wait(&it->it_cv, &state->tfd_lock); + it->it_blockers--; + } + + it->it_lock |= ITLK_LOCKED; + + mutex_exit(&state->tfd_lock); + + return (it); +} + +static void +timerfd_itimer_unlock(timerfd_state_t *state, itimer_t *it) +{ + VERIFY(it == &state->tfd_itimer); + VERIFY(it->it_lock & ITLK_LOCKED); + + mutex_enter(&state->tfd_lock); + + it->it_lock &= ~ITLK_LOCKED; + + if (it->it_blockers) + cv_signal(&it->it_cv); + + mutex_exit(&state->tfd_lock); +} + +static void +timerfd_fire(itimer_t *it) +{ + timerfd_state_t *state = it->it_frontend; + uint64_t oval; + + mutex_enter(&state->tfd_lock); + oval = state->tfd_fired++; + mutex_exit(&state->tfd_lock); + + if (oval == 0) { + cv_broadcast(&state->tfd_cv); + pollwakeup(&state->tfd_pollhd, POLLRDNORM | POLLIN); + } +} + +/*ARGSUSED*/ +static int +timerfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ + timerfd_state_t *state; + major_t major = getemajor(*devp); + minor_t minor = getminor(*devp); + + if (minor != TIMERFDMNRN_TIMERFD) + return (ENXIO); + + mutex_enter(&timerfd_lock); + + minor = (minor_t)(uintptr_t)vmem_alloc(timerfd_minor, 1, + VM_BESTFIT | VM_SLEEP); + + if (ddi_soft_state_zalloc(timerfd_softstate, minor) != DDI_SUCCESS) { + vmem_free(timerfd_minor, (void *)(uintptr_t)minor, 1); + mutex_exit(&timerfd_lock); + return (ENOMEM); + } + + state = ddi_get_soft_state(timerfd_softstate, minor); + *devp = makedevice(major, minor); + + state->tfd_next = timerfd_state; + timerfd_state = state; + + mutex_exit(&timerfd_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +timerfd_read(dev_t dev, uio_t *uio, cred_t *cr) +{ + timerfd_state_t *state; + minor_t minor = getminor(dev); + uint64_t val; + int err; + + if (uio->uio_resid < sizeof (val)) + return (EINVAL); + + state = ddi_get_soft_state(timerfd_softstate, minor); + + mutex_enter(&state->tfd_lock); + + while (state->tfd_fired == 0) { + if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { + mutex_exit(&state->tfd_lock); + return (EAGAIN); + } + + if (!cv_wait_sig_swap(&state->tfd_cv, &state->tfd_lock)) { + mutex_exit(&state->tfd_lock); + return (EINTR); + } + } + + /* + * Our tfd_fired is non-zero; slurp its value and then clear it.
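A structural note on timerfd_itimer_lock()/timerfd_itimer_unlock() above: they build a small sleep lock out of a flag, a mutex, and a condition variable, so the embedded itimer can stay held across calls into the clock backend while tfd_lock itself is dropped. The same protocol in isolation, with invented names (a sketch of the pattern, not the driver's types):

/*
 * Distilled flag-plus-condvar lock: the mutex only guards the flag and
 * waiter count, so it need not be held while the "lock" is owned.
 */
typedef struct flock {
	kmutex_t	fl_mutex;
	kcondvar_t	fl_cv;
	uint_t		fl_flags;	/* FL_LOCKED */
	uint_t		fl_waiters;
} flock_t;

#define	FL_LOCKED	0x1

static void
flock_enter(flock_t *fl)
{
	mutex_enter(&fl->fl_mutex);
	while (fl->fl_flags & FL_LOCKED) {
		fl->fl_waiters++;
		cv_wait(&fl->fl_cv, &fl->fl_mutex);
		fl->fl_waiters--;
	}
	fl->fl_flags |= FL_LOCKED;
	mutex_exit(&fl->fl_mutex);
}

static void
flock_exit(flock_t *fl)
{
	mutex_enter(&fl->fl_mutex);
	fl->fl_flags &= ~FL_LOCKED;
	if (fl->fl_waiters)
		cv_signal(&fl->fl_cv);
	mutex_exit(&fl->fl_mutex);
}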
+ */ + val = state->tfd_fired; + state->tfd_fired = 0; + mutex_exit(&state->tfd_lock); + + err = uiomove(&val, sizeof (val), UIO_READ, uio); + + return (err); +} + +/*ARGSUSED*/ +static int +timerfd_poll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + timerfd_state_t *state; + minor_t minor = getminor(dev); + short revents = 0; + + state = ddi_get_soft_state(timerfd_softstate, minor); + + mutex_enter(&state->tfd_lock); + + if (state->tfd_fired > 0) + revents |= POLLRDNORM | POLLIN; + + if (!(*reventsp = revents & events) && !anyyet) + *phpp = &state->tfd_pollhd; + + mutex_exit(&state->tfd_lock); + + return (0); +} + +static int +timerfd_copyin(uintptr_t addr, itimerspec_t *dest) +{ + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyin((void *)addr, dest, sizeof (itimerspec_t)) != 0) + return (EFAULT); + } else { + itimerspec32_t dest32; + + if (copyin((void *)addr, &dest32, sizeof (itimerspec32_t)) != 0) + return (EFAULT); + + ITIMERSPEC32_TO_ITIMERSPEC(dest, &dest32); + } + + if (itimerspecfix(&dest->it_value) || + (itimerspecfix(&dest->it_interval) && + timerspecisset(&dest->it_value))) { + return (EINVAL); + } + + return (0); +} + +static int +timerfd_copyout(itimerspec_t *src, uintptr_t addr) +{ + if (get_udatamodel() == DATAMODEL_NATIVE) { + if (copyout(src, (void *)addr, sizeof (itimerspec_t)) != 0) + return (EFAULT); + } else { + itimerspec32_t src32; + + if (ITIMERSPEC_OVERFLOW(src)) + return (EOVERFLOW); + + ITIMERSPEC_TO_ITIMERSPEC32(&src32, src); + + if (copyout(&src32, (void *)addr, sizeof (itimerspec32_t)) != 0) + return (EFAULT); + } + + return (0); +} + +/*ARGSUSED*/ +static int +timerfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + itimerspec_t when, oval; + timerfd_state_t *state; + minor_t minor = getminor(dev); + int err; + itimer_t *it; + + state = ddi_get_soft_state(timerfd_softstate, minor); + + switch (cmd) { + case TIMERFDIOC_CREATE: { + if (arg == TIMERFD_MONOTONIC) + arg = CLOCK_MONOTONIC; + + it = timerfd_itimer_lock(state); + + if (it->it_backend != NULL) { + timerfd_itimer_unlock(state, it); + return (EEXIST); + } + + if ((it->it_backend = clock_get_backend(arg)) == NULL) { + timerfd_itimer_unlock(state, it); + return (EINVAL); + } + + /* + * We need to provide a proc structure only for purposes + * of locking CLOCK_REALTIME-based timers -- it is safe to + * provide p0 here. 
+ */ + it->it_proc = &p0; + + err = it->it_backend->clk_timer_create(it, timerfd_fire); + + if (err != 0) { + it->it_backend = NULL; + timerfd_itimer_unlock(state, it); + return (err); + } + + it->it_frontend = state; + timerfd_itimer_unlock(state, it); + + return (0); + } + + case TIMERFDIOC_GETTIME: { + it = timerfd_itimer_lock(state); + + if (it->it_backend == NULL) { + timerfd_itimer_unlock(state, it); + return (ENODEV); + } + + err = it->it_backend->clk_timer_gettime(it, &when); + timerfd_itimer_unlock(state, it); + + if (err != 0) + return (err); + + if ((err = timerfd_copyout(&when, arg)) != 0) + return (err); + + return (0); + } + + case TIMERFDIOC_SETTIME: { + timerfd_settime_t st; + + if (copyin((void *)arg, &st, sizeof (st)) != 0) + return (EFAULT); + + if ((err = timerfd_copyin(st.tfd_settime_value, &when)) != 0) + return (err); + + it = timerfd_itimer_lock(state); + + if (it->it_backend == NULL) { + timerfd_itimer_unlock(state, it); + return (ENODEV); + } + + if (st.tfd_settime_ovalue != NULL) { + err = it->it_backend->clk_timer_gettime(it, &oval); + + if (err != 0) { + timerfd_itimer_unlock(state, it); + return (err); + } + } + + err = it->it_backend->clk_timer_settime(it, + st.tfd_settime_flags & TFD_TIMER_ABSTIME ? + TIMER_ABSTIME : TIMER_RELTIME, &when); + timerfd_itimer_unlock(state, it); + + if (err != 0 || st.tfd_settime_ovalue == NULL) + return (err); + + if ((err = timerfd_copyout(&oval, st.tfd_settime_ovalue)) != 0) + return (err); + + return (0); + } + + default: + break; + } + + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +timerfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ + timerfd_state_t *state, **sp; + itimer_t *it; + minor_t minor = getminor(dev); + + state = ddi_get_soft_state(timerfd_softstate, minor); + + if (state->tfd_pollhd.ph_list != NULL) { + pollwakeup(&state->tfd_pollhd, POLLERR); + pollhead_clean(&state->tfd_pollhd); + } + + /* + * No one can get to this timer; we don't need to lock it -- we can + * just call on the backend to delete it. + */ + it = &state->tfd_itimer; + + if (it->it_backend != NULL) + it->it_backend->clk_timer_delete(it); + + mutex_enter(&timerfd_lock); + + /* + * Remove our state from our global list. 
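The removal loop just below walks pointer-to-pointer slots rather than keeping a separate trailing pointer. In isolation the idiom looks like this (generic node type and invented names; ASSERT is the usual sys/debug.h macro):

/*
 * Unlink via indirect pointer: sp always points at the slot (the head
 * pointer or some node's next field) that names the current node, so
 * deletion needs no special case for the head of the list.
 */
typedef struct node {
	struct node *n_next;
} node_t;

static void
node_unlink(node_t **headp, node_t *target)
{
	node_t **sp;

	for (sp = headp; *sp != target; sp = &(*sp)->n_next)
		ASSERT(*sp != NULL);	/* target must be on the list */
	*sp = target->n_next;
}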
+ */ + for (sp = &timerfd_state; *sp != state; sp = &((*sp)->tfd_next)) + VERIFY(*sp != NULL); + + *sp = (*sp)->tfd_next; + + ddi_soft_state_free(timerfd_softstate, minor); + vmem_free(timerfd_minor, (void *)(uintptr_t)minor, 1); + + mutex_exit(&timerfd_lock); + + return (0); +} + +/*ARGSUSED*/ +static int +timerfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) +{ + mutex_enter(&timerfd_lock); + + if (ddi_soft_state_init(&timerfd_softstate, + sizeof (timerfd_state_t), 0) != 0) { + cmn_err(CE_NOTE, "/dev/timerfd failed to create soft state"); + mutex_exit(&timerfd_lock); + return (DDI_FAILURE); + } + + if (ddi_create_minor_node(devi, "timerfd", S_IFCHR, + TIMERFDMNRN_TIMERFD, DDI_PSEUDO, NULL) == DDI_FAILURE) { + cmn_err(CE_NOTE, "/dev/timerfd couldn't create minor node"); + ddi_soft_state_fini(&timerfd_softstate); + mutex_exit(&timerfd_lock); + return (DDI_FAILURE); + } + + ddi_report_dev(devi); + timerfd_devi = devi; + + timerfd_minor = vmem_create("timerfd_minor", (void *)TIMERFDMNRN_CLONE, + UINT32_MAX - TIMERFDMNRN_CLONE, 1, NULL, NULL, NULL, 0, + VM_SLEEP | VMC_IDENTIFIER); + + mutex_exit(&timerfd_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +timerfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + switch (cmd) { + case DDI_DETACH: + break; + + case DDI_SUSPEND: + return (DDI_SUCCESS); + + default: + return (DDI_FAILURE); + } + + mutex_enter(&timerfd_lock); + vmem_destroy(timerfd_minor); + + ddi_remove_minor_node(timerfd_devi, NULL); + timerfd_devi = NULL; + + ddi_soft_state_fini(&timerfd_softstate); + mutex_exit(&timerfd_lock); + + return (DDI_SUCCESS); +} + +/*ARGSUSED*/ +static int +timerfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + int error; + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)timerfd_devi; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + } + return (error); +} + +static struct cb_ops timerfd_cb_ops = { + timerfd_open, /* open */ + timerfd_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + timerfd_read, /* read */ + nodev, /* write */ + timerfd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + timerfd_poll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + 0, /* streamtab */ + D_NEW | D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops timerfd_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + timerfd_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + timerfd_attach, /* attach */ + timerfd_detach, /* detach */ + nodev, /* reset */ + &timerfd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed, /* quiesce */ +}; + +static struct modldrv modldrv = { + &mod_driverops, /* module type (this is a pseudo driver) */ + "timerfd support", /* name of module */ + &timerfd_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + (void *)&modldrv, + NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/usr/src/uts/common/io/timerfd.conf b/usr/src/uts/common/io/timerfd.conf new file mode 100644 index 0000000000..c6ad86d051 --- /dev/null +++ b/usr/src/uts/common/io/timerfd.conf @@ -0,0 +1,16 @@ +# +# This file and its contents 
are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015 Joyent, Inc. All rights reserved. +# + +name="timerfd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/udmf/dm9601reg.h b/usr/src/uts/common/io/udmf/dm9601reg.h new file mode 100644 index 0000000000..a36f2b0fc8 --- /dev/null +++ b/usr/src/uts/common/io/udmf/dm9601reg.h @@ -0,0 +1,348 @@ +/* + * %W% %E% + * Macro definitions for Davicom DM9601 USB to fast ethernet controller + * based on Davicom DM9601E data sheet + * This file is public domain. Coded by M.Murayama (KHF04453@nifty.com) + */ + +#ifndef __DM9601_H__ +#define __DM9601_H__ + +/* + * offset of registers + */ +#define NCR 0x00U /* network control register */ +#define NSR 0x01U /* network status register */ +#define TCR 0x02U /* tx control register */ +#define TSR1 0x03U /* tx status register 1 */ +#define TSR2 0x04U /* tx status register 2 */ +#define RCR 0x05U /* rx control register */ +#define RSR 0x06U /* rx status register */ +#define ROCR 0x07U /* rx overflow counter register */ +#define BPTR 0x08U /* back pressure threshold register */ +#define FCTR 0x09U /* flow control threshold register */ +#define FCR 0x0aU /* rx/tx flow control register */ +#define EPCR 0x0bU /* eeprom & phy control register */ +#define EPAR 0x0cU /* eeprom & phy address register */ +#define EPDR 0x0dU /* eeprom & phy data register (2byte) */ +#define WCR 0x0fU /* wake up control register */ +#define PAR 0x10U /* physical address register (6byte) */ +#define MAR 0x16U /* multicast address register (8byte) */ +#define GPCR 0x1eU /* general purpose control register */ +#define GPR 0x1fU /* general purpose register */ +#define VID 0x28U /* vendor ID (2byte) */ +#define PID 0x2aU /* product ID (2byte) */ +#define CHIPR 0x2cU /* chip revision */ +#define USBDA 0xf0U /* usb device address register */ +#define RXC 0xf1U /* received packet counter register */ +#define TUSC 0xf2U /* tx packet counter/usb status register */ +#define USBC 0xf4U /* usb control register */ + +/* + * register definitions + */ +/* network control register */ +#define NCR_EXT_PHY 0x80U /* 1: select external phy */ +#define NCR_WAKEEN 0x40U /* 1: wake up event enable */ +#define NCR_FCOL 0x10U /* force collision mode for test */ +#define NCR_FDX 0x08U /* 1: full duplex mode (for external phy) */ +#define NCR_LBK 0x06U +#define NCR_LBK_SHIFT 1 +#define NCR_LBK_NORMAL (0U << NCR_LBK_SHIFT) +#define NCR_LBK_MAC (1U << NCR_LBK_SHIFT) +#define NCR_LBK_PHY_D (2U << NCR_LBK_SHIFT) +#define NCR_LBK_PHY_A (3U << NCR_LBK_SHIFT) +#define NCR_RST 0x01U /* 1: reset, auto clear */ + +#define NCR_BITS \ + "\020" \ + "\010EXT_PHY" \ + "\007WAKEEN" \ + "\005FCOL" \ + "\004FDX" \ + "\001RST" + +/* network status register */ +#define NSR_SPEED 0x80U /* 1:10M 0:100M */ +#define NSR_LINKST 0x40U /* 1:ok 0:fail */ +#define NSR_WAKEST 0x20U /* 1:enabled */ +#define NSR_TXFULL 0x10U /* 1:tx fifo full */ +#define NSR_TX2END 0x08U /* tx packet2 complete status */ +#define NSR_TX1END 0x04U /* tx packet1 complete status */ +#define NSR_RXOV 0x02U /* rx fifo overflow */ +#define NSR_RXRDY 0x01U /* rx packet ready */ + +#define NSR_BITS \ + "\020" \ + "\010SPEED_10" \ + "\007LINKST_UP" \ + "\006WAKEST"
\ + "\005TXFULL" \ + "\004TX2END" \ + "\003TX1END" \ + "\002RXOV" \ + "\001RXRDY" + +/* tx control register */ +#define TCR_TJDIS 0x40U /* tx jitter control */ +#define TCR_EXCEDM 0x20U /* excessive collision mode */ +#define TCR_PAD_DIS2 0x10U /* PAD appends disable for pkt2 */ +#define TCR_CRC_DIS2 0x08U /* CRC appends disable for pkt2 */ +#define TCR_PAD_DIS1 0x04U /* PAD appends disable for pkt1 */ +#define TCR_CRC_DIS1 0x02U /* CRC appends disable for pkt1 */ + +#define TCR_BITS \ + "\020" \ + "\007TJDIS" \ + "\006EXCEDM" \ + "\005PAD_DIS2" \ + "\004CRC_DIS2" \ + "\003PAD_DIS1" \ + "\002CRC_DIS1" + +/* tx status register (ro) */ +#define TSR_TJTO 0x80U /* tx jabber time out */ +#define TSR_LC 0x40U /* loss of carrier */ +#define TSR_NC 0x20U /* no carrier */ +#define TSR_LATEC 0x10U /* late collision */ +#define TSR_COL 0x08U /* collision */ +#define TSR_EL 0x04U /* excessive collision */ + +#define TSR_BITS \ + "\020" \ + "\010TJTO" \ + "\007LC" \ + "\006NC" \ + "\005LATEC" \ + "\004COL" \ + "\003EL" + +/* rx control register */ +#define RCR_WTDIS 0x40U /* watch dog timer disable */ +#define RCR_DIS_LONG 0x20U /* discard longer packets than 1522 */ +#define RCR_DIS_CRC 0x10U /* discard crc error packets */ +#define RCR_ALL 0x08U /* pass all multicast */ +#define RCR_RUNT 0x04U /* pass runt packets */ +#define RCR_PRMSC 0x02U /* promiscuous mode */ +#define RCR_RXEN 0x01U /* rx enable */ + +#define RCR_BITS \ + "\020" \ + "\007WTDIS" \ + "\006DIS_LONG" \ + "\005DIS_CRC" \ + "\004ALL" \ + "\003RUNT" \ + "\002PRMSC" \ + "\001RXEN" + +/* rx status register */ +#define RSR_RF 0x80U /* runt frame */ +#define RSR_MF 0x40U /* multicast frame */ +#define RSR_LCS 0x20U /* late collision seen */ +#define RSR_RWTO 0x10U /* receive watchdog timeout */ +#define RSR_PLE 0x08U /* physical layer error */ +#define RSR_AE 0x04U /* alignment error */ +#define RSR_CE 0x02U /* crc error */ +#define RSR_FOE 0x01U /* fifo overflow error */ + +#define RSR_BITS \ + "\020" \ + "\010RF" \ + "\007MF" \ + "\006LCS" \ + "\005RWTO" \ + "\004PLE" \ + "\003AE" \ + "\002CE" \ + "\001FOE" + +/* receive overflow counter register */ +#define ROCR_RXFU 0x80U /* receive overflow counter overflow */ +#define ROCR_ROC 0x7fU /* receive overflow counter */ + +#define ROCR_BITS \ + "\020" \ + "\010RXFU" + +/* back pressure threshold register */ +#define BPTR_BPHW 0xf0U /* high water overflow threshold */ +#define BPTR_BPHW_SHIFT 4 +#define BPTR_BPHW_UNIT 1024U +#define BPTR_BPHW_DEFAULT (3 << BPTR_BPHW_SHIFT) /* 3k */ +#define BPTR_JPT 0x0fU /* jam pattern time */ +#define BPTR_JPT_SHIFT 0 +#define BPTR_JPT_5us (0U << BPTR_JPT_SHIFT) +#define BPTR_JPT_10us (1U << BPTR_JPT_SHIFT) +#define BPTR_JPT_15us (2U << BPTR_JPT_SHIFT) +#define BPTR_JPT_25us (3U << BPTR_JPT_SHIFT) +#define BPTR_JPT_50us (4U << BPTR_JPT_SHIFT) +#define BPTR_JPT_100us (5U << BPTR_JPT_SHIFT) +#define BPTR_JPT_150us (6U << BPTR_JPT_SHIFT) +#define BPTR_JPT_200us (7U << BPTR_JPT_SHIFT) +#define BPTR_JPT_250us (8U << BPTR_JPT_SHIFT) +#define BPTR_JPT_300us (9U << BPTR_JPT_SHIFT) +#define BPTR_JPT_350us (10U << BPTR_JPT_SHIFT) +#define BPTR_JPT_400us (11U << BPTR_JPT_SHIFT) +#define BPTR_JPT_450us (12U << BPTR_JPT_SHIFT) +#define BPTR_JPT_500us (13U << BPTR_JPT_SHIFT) +#define BPTR_JPT_550us (14U << BPTR_JPT_SHIFT) +#define BPTR_JPT_600us (15U << BPTR_JPT_SHIFT) + +/* flow control threshold register */ +#define FCTR_HWOT 0xf0U /* rx fifo high water overflow threshold */ +#define FCTR_HWOT_SHIFT 4 +#define FCTR_HWOT_UNIT 1024U +#define FCTR_LWOT 0x0fU /* rx
fifo low water overflow threshold */ +#define FCTR_LWOT_SHIFT 0 +#define FCTR_LWOT_UNIT 1024U + +/* rx/tx flow control register */ +#define FCR_TXPO 0x80U /* tx pause packet */ +#define FCR_TXPF 0x40U /* tx pause packet */ +#define FCR_TXPEN 0x20U /* tx pause packet */ +#define FCR_BKPA 0x10U /* back pressure mode */ +#define FCR_BKPM 0x08U /* back pressure mode */ +#define FCR_BKPS 0x04U /* rx pause packet current status (r/c) */ +#define FCR_RXPCS 0x02U /* rx pause packet current status (ro) */ +#define FCR_FLCE 0x01U /* flow control enable */ + +#define FCR_BITS \ + "\020" \ + "\010TXPO" \ + "\007TXPF" \ + "\006TXPEN" \ + "\005BKPA" \ + "\004BKPM" \ + "\003BKPS" \ + "\002RXPCS" \ + "\001FLCE" + +/* EEPROM & PHY control register (0x0b) */ +#define EPCR_REEP 0x20U /* reload eeprom */ +#define EPCR_WEP 0x10U /* write eeprom enable */ +#define EPCR_EPOS 0x08U /* select device, 0:eeprom, 1:phy */ +#define EPCR_ERPRR 0x04U /* read command */ +#define EPCR_ERPRW 0x02U /* write command */ +#define EPCR_ERRE 0x01U /* eeprom/phy access in progress (ro) */ + +#define EPCR_BITS \ + "\020" \ + "\006REEP" \ + "\005WEP" \ + "\004EPOS" \ + "\003ERPRR" \ + "\002ERPRW" \ + "\001ERRE" + +/* EEPROM & PHY access register (0x0c) */ +#define EPAR_PHYADR 0xc0U /* phy address, internal phy(1) or external */ +#define EPAR_PHYADR_SHIFT 6 +#define EPAR_EROA 0x3fU /* eeprom word addr or phy register addr */ +#define EPAR_EROA_SHIFT 0 + +/* EEPROM & PHY data register (0x0d(low)-0x0e(hi)) */ + +/* wake up control register (0x0f) */ +#define WCR_LINKEN 0x20U /* enable link status event */ +#define WCR_SAMPLEEN 0x10U /* enable sample frame event */ +#define WCR_MAGICEN 0x08U /* enable magic pkt event */ +#define WCR_LINKST 0x04U /* link status change occur ro */ +#define WCR_SAMPLEST 0x02U /* sample frame rx occur ro */ +#define WCR_MAGICST 0x01U /* magic pkt rx occur ro */ + +#define WCR_BITS \ + "\020" \ + "\006LINKEN" \ + "\005SAMPLEEN" \ + "\004MAGICEN" \ + "\003LINKST" \ + "\002SAMPLEST" \ + "\001MAGICST" + +/* physical address register (0x10-0x15) */ +/* multicast address register (0x16-0x1c) */ +/* general purpose control register (0x1e) */ +#define GPCR_GEPCTRL 0x7f +#define GPCR_OUT(n) (1U << (n)) + +#define GPCR_BITS \ + "\020" \ + "\006OUT5" \ + "\005OUT4" \ + "\004OUT3" \ + "\003OUT2" \ + "\002OUT1" \ + "\001OUT0" + +/* general purpose register (0x1f) */ +#define GPR_GEPIO5 0x20U +#define GPR_GEPIO4 0x10U +#define GPR_GEPIO3 0x08U +#define GPR_GEPIO2 0x04U +#define GPR_GEPIO1 0x02U +#define GPR_GEPIO0 0x01U + +#define GPR_BITS \ + "\020" \ + "\006GEPIO5" \ + "\005GEPIO4" \ + "\004GEPIO3" \ + "\003GEPIO2" \ + "\002GEPIO1" \ + "\001GEPIO0" + +/* vendor id register (0x28-0x29) */ +/* product id register (0x2a-0x2b) */ +/* chip revision register (0x2c) */ + +/* usb device address register (0xf0) */ +#define USBDA_USBFA 0x3fU /* usb device address */ +#define USBDA_USBFA_SHIFT 0 + +/* receive packet counter register (0xf1) */ + +/* transmit packet counter/usb status register (0xf2) */ +#define TUSR_RXFAULT 0x80U /* indicate rx has unexpected condition */ +#define TUSR_SUSFLAG 0x40U /* indicate device has suspended condition */ +#define TUSR_EP1RDY 0x20U /* ready for read from ep1 pipe */ +#define TUSR_SRAM 0x18U /* sram size 0:32K, 1:48K, 2:16K, 3:64K */ +#define TUSR_SRAM_SHIFT 3 +#define TUSR_SRAM_32K (0U << TUSR_SRAM_SHIFT) +#define TUSR_SRAM_48K (1U << TUSR_SRAM_SHIFT) +#define TUSR_SRAM_16K (2U << TUSR_SRAM_SHIFT) +#define TUSR_SRAM_64K (3U << TUSR_SRAM_SHIFT) +#define TUSR_TXC2 0x04U /* two or more packets
in tx buffer */ +#define TUSR_TXC1 0x02U /* one packet in tx buffer */ +#define TUSR_TXC0 0x01U /* no packet in tx buffer */ + +#define TUSR_BITS \ + "\020" \ + "\010RXFAULT" \ + "\007SUSFLAG" \ + "\006EP1RDY" \ + "\003TXC2" \ + "\002TXC1" \ + "\001TXC0" + +/* usb control register (0xf4) */ +#define USBC_EP3ACK 0x20U /* ep3 will always return 8 byte data if NAK=0 */ +#define USBC_EP3NACK 0x10U /* ep3 will always return NAK */ +#define USBC_MEMTST 0x01U + +/* bulk message format */ +#define TX_HEADER_SIZE 2 +#define RX_HEADER_SIZE 3 + +/* interrupt msg format */ +struct intr_msg { + uint8_t im_nsr; + uint8_t im_tsr1; + uint8_t im_tsr2; + uint8_t im_rsr; + uint8_t im_rocr; + uint8_t im_rxc; + uint8_t im_txc; + uint8_t im_gpr; +}; +#endif /* __DM9601_H__ */ diff --git a/usr/src/uts/common/io/udmf/udmf_usbgem.c b/usr/src/uts/common/io/udmf/udmf_usbgem.c new file mode 100644 index 0000000000..0637de054b --- /dev/null +++ b/usr/src/uts/common/io/udmf/udmf_usbgem.c @@ -0,0 +1,1036 @@ +/* + * udmfE_usbgem.c : Davicom DM9601E USB to Fast Ethernet Driver for Solaris + * + * Copyright (c) 2009-2012 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE.
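One connection between the two new udmf files is worth spelling out: the dm9601 posts the 8-byte status block declared as struct intr_msg in dm9601reg.h on its interrupt-in endpoint, and the driver's udmf_interrupt() consumes it. A sketch of pulling link state out of such a block using the definitions above (the report callback is invented for illustration):

#include <sys/types.h>
#include "dm9601reg.h"

/*
 * Sketch: decode link status from the dm9601's interrupt status block.
 * "report_link" is a hypothetical callback, not part of the driver.
 */
static void
dm9601_intr_decode(const struct intr_msg *im,
    void (*report_link)(boolean_t up, boolean_t is_100m))
{
	boolean_t up = (im->im_nsr & NSR_LINKST) != 0;	/* 1: link ok */
	boolean_t is_100m = (im->im_nsr & NSR_SPEED) == 0; /* 1 means 10M */

	report_link(up, is_100m);
}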
+ */ + +#pragma ident "%W% %E%" + +/* + * Changelog: + */ + +/* + * TODO + */ +/* ======================================================= */ + +/* + * Solaris system header files and macros + */ + +/* minimum kernel headers for drivers */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/byteorder.h> + +/* ethernet stuff */ +#include <sys/ethernet.h> + +/* interface card depend stuff */ +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/usb/usba.h> +#include "usbgem.h" + +/* hardware stuff */ +#include "usbgem_mii.h" +#include "dm9601reg.h" + +char ident[] = "dm9601 usbnic driver v" VERSION; + +/* + * Useful macros + */ +#define CHECK_AND_JUMP(err, label) if (err != USB_SUCCESS) goto label +#define LE16P(p) ((((uint8_t *)(p))[1] << 8) | ((uint8_t *)(p))[0]) + +/* + * Debugging + */ +#ifdef DEBUG_LEVEL +static int udmf_debug = DEBUG_LEVEL; +#define DPRINTF(n, args) if (udmf_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Our configration for dm9601 + */ +/* timeouts */ +#define ONESEC (drv_usectohz(1*1000000)) + +/* + * Local device definitions + */ +struct udmf_dev { + /* + * Misc HW information + */ + uint8_t rcr; + uint8_t last_nsr; + uint8_t mac_addr[ETHERADDRL]; +}; + +/* + * private functions + */ + +/* mii operations */ +static uint16_t udmf_mii_read(struct usbgem_dev *, uint_t, int *errp); +static void udmf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp); + +/* nic operations */ +static int udmf_reset_chip(struct usbgem_dev *); +static int udmf_init_chip(struct usbgem_dev *); +static int udmf_start_chip(struct usbgem_dev *); +static int udmf_stop_chip(struct usbgem_dev *); +static int udmf_set_media(struct usbgem_dev *); +static int udmf_set_rx_filter(struct usbgem_dev *); +static int udmf_get_stats(struct usbgem_dev *); +static void udmf_interrupt(struct usbgem_dev *, mblk_t *); + +/* packet operations */ +static mblk_t *udmf_tx_make_packet(struct usbgem_dev *, mblk_t *); +static mblk_t *udmf_rx_make_packet(struct usbgem_dev *, mblk_t *); + +/* =============================================================== */ +/* + * I/O functions + */ +/* =============================================================== */ +#define OUT(dp, ix, len, buf, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ 1, \ + /* wValue */ 0, \ + /* wIndex */ (ix), \ + /* wLength */ (len), \ + /* value */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +#define OUTB(dp, ix, val, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ 3, \ + /* wValue */ (val), \ + /* wIndex */ (ix), \ + /* wLength */ 0, \ + /* value */ NULL, \ + /* size */ 0)) != USB_SUCCESS) goto label + +#define IN(dp, ix, len, buf, errp, label) \ + if ((*(errp) = usbgem_ctrl_in((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ 0, \ + /* wValue */ 0, \ + /* wIndex */ (ix), \ + /* wLength */ (len), \ + /* valuep */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +/* =============================================================== */ +/* + * Hardware manupilation + */ +/* 
=============================================================== */ +static void +udmf_enable_phy(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + + /* de-assert reset signal to phy */ + OUTB(dp, GPCR, GPCR_OUT(0), &err, usberr); + OUTB(dp, GPR, 0, &err, usberr); +usberr: + ; +} + +static int +udmf_reset_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + OUTB(dp, NCR, NCR_LBK_NORMAL | NCR_RST, &err, usberr); + drv_usecwait(100); +usberr: + return (err); +} + +/* + * Setup dm9601 + */ +static int +udmf_init_chip(struct usbgem_dev *dp) +{ + int i; + uint32_t val; + int err = USB_SUCCESS; + uint16_t reg; + uint8_t buf[2]; + struct udmf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + OUTB(dp, NCR, NCR_LBK_NORMAL, &err, usberr); + + /* tx control regiser: enable padding and crc generation */ + OUTB(dp, TCR, 0, &err, usberr); + + /* rx control register: will be set later by udmf_set_rx_filer() */ + lp->rcr = RCR_RUNT; + + /* back pressure threshold: */ + OUTB(dp, BPTR, (2 << BPTR_BPHW_SHIFT) | BPTR_JPT_200us, + &err, usberr); + + /* flow control threshold: same as default */ + OUTB(dp, FCTR, (3 << FCTR_HWOT_SHIFT) | (8 << FCTR_LWOT_SHIFT), + &err, usberr); + + /* usb control register */ + OUTB(dp, USBC, USBC_EP3ACK | 0x06, &err, usberr); + + /* flow control: will be set later by udmf_set_media() */ + + /* wake up control register: */ + OUTB(dp, WCR, 0, &err, usberr); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); + return (err); +} + +static int +udmf_start_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + struct udmf_dev *lp = dp->private; + + /* enable Rx */ + lp->rcr |= RCR_RXEN; + OUTB(dp, RCR, lp->rcr, &err, usberr); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); + return (err); +} + +static int +udmf_stop_chip(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + struct udmf_dev *lp = dp->private; + + /* disable rx */ + lp->rcr &= ~RCR_RXEN; + OUTB(dp, RCR, lp->rcr, &err, usberr); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? 
"success" : "error")); + return (err); +} + +static int +udmf_get_stats(struct usbgem_dev *dp) +{ + /* EMPTY */ + return (USB_SUCCESS); +} + +static uint_t +udmf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr) +{ + return (usbgem_ether_crc_le(addr) & 0x3f); +} + +static int +udmf_set_rx_filter(struct usbgem_dev *dp) +{ + int i; + uint8_t rcr; + uint8_t mode; + uint8_t mhash[8]; + uint8_t *mac; + uint_t h; + int err = USB_SUCCESS; + struct udmf_dev *lp = dp->private; + static uint8_t invalid_mac[ETHERADDRL] = {0, 0, 0, 0, 0, 0}; + + DPRINTF(2, (CE_CONT, "!%s: %s: called, rxmode:%x", + dp->name, __func__, dp->rxmode)); + + if (lp->rcr & RCR_RXEN) { + /* set promiscuous mode before changing rx filter mode */ + OUTB(dp, RCR, lp->rcr | RCR_PRMSC, &err, usberr); + } + + lp->rcr &= ~(RCR_ALL | RCR_PRMSC); + mode = 0; + bzero(mhash, sizeof (mhash)); + mac = dp->cur_addr.ether_addr_octet; + + if ((dp->rxmode & RXMODE_ENABLE) == 0) { + mac = invalid_mac; + } else if (dp->rxmode & RXMODE_PROMISC) { + /* promiscious mode implies all multicast and all physical */ + mode |= RCR_PRMSC; + } else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 32) { + /* accept all multicast packets */ + mode |= RCR_ALL; + } else if (dp->mc_count > 0) { + /* + * make hash table to select interresting + * multicast address only. + */ + for (i = 0; i < dp->mc_count; i++) { + /* hash table is 64 = 2^6 bit width */ + h = dp->mc_list[i].hash; + mhash[h / 8] |= 1 << (h % 8); + } + } + + /* set node address */ + if (bcmp(mac, lp->mac_addr, ETHERADDRL) != 0) { + OUT(dp, PAR, ETHERADDRL, dp->cur_addr.ether_addr_octet, + &err, usberr); + bcopy(mac, lp->mac_addr, ETHERADDRL); + } + + /* set multicast hash table */ + OUT(dp, MAR, sizeof (mhash), &mhash[0], &err, usberr); + + /* update rcr */ + lp->rcr |= mode; + OUTB(dp, RCR, lp->rcr, &err, usberr); + +#if DEBUG_LEVEL > 1 + /* verify rcr */ + IN(dp, RCR, 1, &rcr, &err, usberr); + cmn_err(CE_CONT, "!%s: %s: rcr:%b returned", + dp->name, __func__, rcr, RCR_BITS); +#endif +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? "success" : "error")); + return (err); +} + +static int +udmf_set_media(struct usbgem_dev *dp) +{ + int err = USB_SUCCESS; + uint8_t fcr; + struct udmf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* setup flow control */ + fcr = 0; + if (dp->full_duplex) { + /* select flow control */ + switch (dp->flow_control) { + case FLOW_CONTROL_RX_PAUSE: + fcr |= FCR_FLCE; + break; + + case FLOW_CONTROL_TX_PAUSE: + fcr |= FCR_TXPEN; + break; + + case FLOW_CONTROL_SYMMETRIC: + fcr |= FCR_FLCE | FCR_TXPEN; + break; + } + } + + /* update flow control register */ + OUTB(dp, FCR, fcr, &err, usberr); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + err, err == USB_SUCCESS ? 
"success" : "error")); + return (err); +} + +/* + * send/receive packet check + */ +static mblk_t * +udmf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + int n; + size_t pkt_size; + mblk_t *new; + mblk_t *tp; + uint8_t *bp; + uint8_t *last_pos; + uint_t align_mask; + + pkt_size = msgdsize(mp); + align_mask = 63; + + /* + * re-allocate the mp + */ + + /* minimum ethernet packet size of ETHERMIN */ + pkt_size = max(pkt_size, ETHERMIN); + +#if 0 /* CONFIG_ADD_TX_DELIMITOR_ALWAYS */ + pkt_size += TX_HEADER_SIZE; +#endif + if (((pkt_size + TX_HEADER_SIZE) & align_mask) == 0) { + /* padding is required in usb communication */ + pkt_size += TX_HEADER_SIZE; + } + + if ((new = allocb(TX_HEADER_SIZE + pkt_size, 0)) == NULL) { + return (NULL); + } + new->b_wptr = new->b_rptr + TX_HEADER_SIZE + pkt_size; + + /* add a header */ + bp = new->b_rptr; + bp[0] = (uint8_t)pkt_size; + bp[1] = (uint8_t)(pkt_size >> 8); + bp += TX_HEADER_SIZE; + + /* copy contents of the buffer */ + for (tp = mp; tp; tp = tp->b_cont) { + n = tp->b_wptr - tp->b_rptr; + bcopy(tp->b_rptr, bp, n); + bp += n; + } + + /* clear the rest including the next zero length header */ + last_pos = new->b_wptr; + while (bp < last_pos) { + *bp++ = 0; + } + + return (new); +} + +static void +udmf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n) +{ + int i; + + for (i = 0; i < n; i += 8, bp += 8) { + cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x", + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]); + } +} + +static mblk_t * +udmf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + int len; + uint8_t rx_stat; + + len = mp->b_wptr - mp->b_rptr; + + if (len <= RX_HEADER_SIZE) { + /* + * the usb bulk-in frame doesn't include a valid + * ethernet packet. + */ + return (NULL); + } + + /* remove rx header */ + rx_stat = mp->b_rptr[0]; + if (rx_stat & (RSR_RF | RSR_LCS | RSR_RWTO | + RSR_PLE | RSR_AE | RSR_CE | RSR_FOE)) { + if (rx_stat & RSR_RF) { + dp->stats.runt++; + } + if (rx_stat & RSR_LCS) { + /* late collision */ + dp->stats.rcv_internal_err++; + } + if (rx_stat & RSR_RWTO) { + /* rx timeout */ + dp->stats.rcv_internal_err++; + } + if (rx_stat & RSR_PLE) { + /* physical layer error */ + dp->stats.rcv_internal_err++; + } + if (rx_stat & RSR_AE) { + /* alignment error */ + dp->stats.frame++; + } + if (rx_stat & RSR_CE) { + /* crc error */ + dp->stats.crc++; + } + if (rx_stat & RSR_FOE) { + /* fifo overflow error */ + dp->stats.overflow++; + } + dp->stats.errrcv++; + } + len = LE16P(&mp->b_rptr[1]); + if (len >= ETHERFCSL) { + len -= ETHERFCSL; + } + mp->b_rptr += RX_HEADER_SIZE; + mp->b_wptr = mp->b_rptr + len; + + return (mp); +} + +/* + * MII Interfaces + */ +static uint16_t +udmf_ep_read(struct usbgem_dev *dp, uint_t which, uint_t addr, int *errp) +{ + int i; + uint8_t epcr; + uint16_t val; + + DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d", + dp->name, __func__, addr)); + + OUTB(dp, EPAR, addr, errp, usberr); + OUTB(dp, EPCR, which | EPCR_ERPRR, errp, usberr); + + for (i = 0; i < 100; i++) { + IN(dp, EPCR, sizeof (epcr), &epcr, errp, usberr); + if ((epcr & EPCR_ERRE) == 0) { + /* done */ + IN(dp, EPDR, sizeof (val), &val, errp, usberr); + val = LE_16(val); + goto done; + } + drv_usecwait(10); + } + /* timeout */ + cmn_err(CE_WARN, "!%s: %s: timeout", dp->name, __func__); + val = 0; +done: + OUTB(dp, EPCR, 0, errp, usberr); + return (val); + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + *errp, *errp == USB_SUCCESS ? 
"success" : "error")); + return (0); +} + +static void +udmf_ep_write(struct usbgem_dev *dp, uint_t which, uint_t addr, + uint16_t val, int *errp) +{ + int i; + uint8_t epcr; + + DPRINTF(5, (CE_CONT, "!%s: %s called", dp->name, __func__)); + + val = LE_16(val); + OUT(dp, EPDR, sizeof (val), &val, errp, usberr); + + OUTB(dp, EPAR, addr, errp, usberr); + + OUTB(dp, EPCR, which | EPCR_WEP | EPCR_ERPRW, errp, usberr); + + for (i = 0; i < 100; i++) { + IN(dp, EPCR, 1, &epcr, errp, usberr); + if ((epcr & EPCR_ERRE) == 0) { + /* done */ + goto done; + } + drv_usecwait(10); + } + /* timeout */ + cmn_err(CE_WARN, "!%s: %s: timeout", dp->name, __func__); +done: + OUTB(dp, EPCR, 0, errp, usberr); + return; + +usberr: + DPRINTF(2, (CE_CONT, "!%s: %s: end err:%d(%s)", + dp->name, __func__, + *errp, *errp == USB_SUCCESS ? "success" : "error")); +} + +static uint16_t +udmf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + uint16_t val; + + val = udmf_ep_read(dp, EPCR_EPOS, + (dp->mii_phy_addr << EPAR_PHYADR_SHIFT) | index, errp); + + return (val); +} + +static void +udmf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp) +{ + udmf_ep_write(dp, EPCR_EPOS, + (dp->mii_phy_addr << EPAR_PHYADR_SHIFT) | index, val, errp); +} + +static void +udmf_interrupt(struct usbgem_dev *dp, mblk_t *mp) +{ + struct intr_msg *imp; + struct udmf_dev *lp = dp->private; + + imp = (struct intr_msg *)&mp->b_rptr[0]; + + DPRINTF(4, (CE_CONT, + "!%s: %s: size:%d, nsr:%b tsr1:%b tsr2:%b" + " rsr:%b rocr:%b rxc:%02x txc:%b gpr:%b", + dp->name, __func__, mp->b_wptr - mp->b_rptr, + imp->im_nsr, NSR_BITS, + imp->im_tsr1, TSR_BITS, + imp->im_tsr2, TSR_BITS, + imp->im_rsr, RSR_BITS, + imp->im_rocr, ROCR_BITS, + imp->im_rxc, + imp->im_txc, TUSR_BITS, + imp->im_gpr, GPR_BITS)); + + if ((lp->last_nsr ^ imp->im_nsr) & NSR_LINKST) { + usbgem_mii_update_link(dp); + } + + lp->last_nsr = imp->im_nsr; +} + +/* ======================================================== */ +/* + * OS depend (device driver DKI) routine + */ +/* ======================================================== */ +static uint16_t +udmf_eeprom_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + uint16_t val; + + val = udmf_ep_read(dp, 0, index, errp); + + return (val); +} + +#ifdef DEBUG_LEVEL +static void +udmf_eeprom_dump(struct usbgem_dev *dp, int size) +{ + int i; + int err; + uint16_t w0, w1, w2, w3; + + cmn_err(CE_CONT, "!%s: eeprom dump:", dp->name); + + err = USB_SUCCESS; + + for (i = 0; i < size; i += 4) { + w0 = udmf_eeprom_read(dp, i + 0, &err); + w1 = udmf_eeprom_read(dp, i + 1, &err); + w2 = udmf_eeprom_read(dp, i + 2, &err); + w3 = udmf_eeprom_read(dp, i + 3, &err); + cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x", + i, w0, w1, w2, w3); + } +usberr: + ; +} +#endif + +static int +udmf_attach_chip(struct usbgem_dev *dp) +{ + int i; + uint_t val; + uint8_t *m; + int err; + struct udmf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s enter", dp->name, __func__)); + + /* + * get mac address from EEPROM + */ + m = dp->dev_addr.ether_addr_octet; + for (i = 0; i < ETHERADDRL; i += 2) { + val = udmf_eeprom_read(dp, i/2, &err); + m[i + 0] = (uint8_t)val; + m[i + 1] = (uint8_t)(val >> 8); + } + + /* invalidate a private cache for mac addr */ + bzero(lp->mac_addr, sizeof (lp->mac_addr)); +#ifdef CONFIG_VLAN + dp->misc_flag = USBGEM_VLAN; +#endif +#if DEBUG_LEVEL > 0 + udmf_eeprom_dump(dp, /* 0x3f + 1 */ 128); +#endif +{ + static uint8_t bcst[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + DPRINTF(0, (CE_CONT, "!%s: %s: hash 
of bcast:%x", + dp->name, __func__, usbgem_ether_crc_be(bcst))); +} + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "%s: %s: usb error detected (%d)", + dp->name, __func__, err); + return (USB_FAILURE); +} + +static int +udmf_mii_probe(struct usbgem_dev *dp) +{ + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + udmf_enable_phy(dp); + return (usbgem_mii_probe_default(dp)); +} + +static int +udmf_mii_init(struct usbgem_dev *dp) +{ + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + udmf_enable_phy(dp); + return (USB_SUCCESS); +} + +static int +udmfattach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i; + ddi_iblock_cookie_t c; + int ret; + int revid; + int unit; + int len; + const char *drv_name; + struct usbgem_dev *dp; + void *base; + struct usbgem_conf *ugcp; + struct udmf_dev *lp; + + unit = ddi_get_instance(dip); + drv_name = ddi_driver_name(dip); + + DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d", + drv_name, unit, __func__, cmd)); + + if (cmd == DDI_ATTACH) { + /* + * construct usbgem configration + */ + ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP); + + /* name */ + /* + * softmac requires that ppa is the instance number + * of the device, otherwise it hangs in seaching the device. + */ + sprintf(ugcp->usbgc_name, "%s%d", drv_name, unit); + ugcp->usbgc_ppa = unit; + + ugcp->usbgc_ifnum = 0; + ugcp->usbgc_alt = 0; + + ugcp->usbgc_tx_list_max = 64; + + ugcp->usbgc_rx_header_len = RX_HEADER_SIZE; + ugcp->usbgc_rx_list_max = 64; + + /* time out parameters */ + ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT; + ugcp->usbgc_tx_timeout_interval = USBGEM_TX_TIMEOUT_INTERVAL; +#if 1 + /* flow control */ + ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE; +#else + /* + * XXX - flow control caused link down frequently under + * heavy traffic + */ + ugcp->usbgc_flow_control = FLOW_CONTROL_NONE; +#endif + /* MII timeout parameters */ + ugcp->usbgc_mii_link_watch_interval = + USBGEM_LINK_WATCH_INTERVAL; + ugcp->usbgc_mii_an_watch_interval = + USBGEM_LINK_WATCH_INTERVAL/5; + ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */ + ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT; /* 5 sec */ + ugcp->usbgc_mii_an_wait = (25*ONESEC)/10; + ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT; + + ugcp->usbgc_mii_an_delay = ONESEC/10; + ugcp->usbgc_mii_linkdown_action = MII_ACTION_RSA; + ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET; + ugcp->usbgc_mii_dont_reset = B_FALSE; + ugcp->usbgc_mii_hw_link_detection = B_TRUE; + + /* I/O methods */ + + /* mac operation */ + ugcp->usbgc_attach_chip = &udmf_attach_chip; + ugcp->usbgc_reset_chip = &udmf_reset_chip; + ugcp->usbgc_init_chip = &udmf_init_chip; + ugcp->usbgc_start_chip = &udmf_start_chip; + ugcp->usbgc_stop_chip = &udmf_stop_chip; + ugcp->usbgc_multicast_hash = &udmf_mcast_hash; + + ugcp->usbgc_set_rx_filter = &udmf_set_rx_filter; + ugcp->usbgc_set_media = &udmf_set_media; + ugcp->usbgc_get_stats = &udmf_get_stats; + ugcp->usbgc_interrupt = &udmf_interrupt; + + /* packet operation */ + ugcp->usbgc_tx_make_packet = &udmf_tx_make_packet; + ugcp->usbgc_rx_make_packet = &udmf_rx_make_packet; + + /* mii operations */ + ugcp->usbgc_mii_probe = &udmf_mii_probe; + ugcp->usbgc_mii_init = &udmf_mii_init; + ugcp->usbgc_mii_config = &usbgem_mii_config_default; + ugcp->usbgc_mii_read = &udmf_mii_read; + ugcp->usbgc_mii_write = &udmf_mii_write; + ugcp->usbgc_mii_addr_min = 1; + + /* mtu */ + ugcp->usbgc_min_mtu = ETHERMTU; + ugcp->usbgc_max_mtu = ETHERMTU; + ugcp->usbgc_default_mtu = ETHERMTU; + + lp = 
kmem_zalloc(sizeof (struct udmf_dev), KM_SLEEP);
+		lp->last_nsr = 0;	/* redundant after kmem_zalloc(), but explicit */
+
+		ddi_set_driver_private(dip, NULL);
+
+		dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct udmf_dev));
+
+		kmem_free(ugcp, sizeof (*ugcp));
+
+		if (dp != NULL) {
+			return (DDI_SUCCESS);
+		}
+
+err_free_mem:
+		kmem_free(lp, sizeof (struct udmf_dev));
+err_close_pipe:
+err:
+		return (DDI_FAILURE);
+	}
+
+	if (cmd == DDI_RESUME) {
+		return (usbgem_resume(dip));
+	}
+
+	return (DDI_FAILURE);
+}
+
+static int
+udmfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	int ret;
+
+	if (cmd == DDI_DETACH) {
+		ret = usbgem_do_detach(dip);
+		if (ret != DDI_SUCCESS) {
+			return (DDI_FAILURE);
+		}
+		return (DDI_SUCCESS);
+	}
+	if (cmd == DDI_SUSPEND) {
+		return (usbgem_suspend(dip));
+	}
+	return (DDI_FAILURE);
+}
+
+/* ======================================================== */
+/*
+ * OS depend (loadable streams driver) routine
+ */
+/* ======================================================== */
+#ifdef USBGEM_CONFIG_GLDv3
+USBGEM_STREAM_OPS(udmf_ops, udmfattach, udmfdetach);
+#else
+static struct module_info udmfminfo = {
+	0,			/* mi_idnum */
+	"udmf",			/* mi_idname */
+	0,			/* mi_minpsz */
+	ETHERMTU,		/* mi_maxpsz */
+	ETHERMTU*128,		/* mi_hiwat */
+	1,			/* mi_lowat */
+};
+
+static struct qinit udmfrinit = {
+	(int (*)()) NULL,	/* qi_putp */
+	usbgem_rsrv,		/* qi_srvp */
+	usbgem_open,		/* qi_qopen */
+	usbgem_close,		/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&udmfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+static struct qinit udmfwinit = {
+	usbgem_wput,		/* qi_putp */
+	usbgem_wsrv,		/* qi_srvp */
+	(int (*)()) NULL,	/* qi_qopen */
+	(int (*)()) NULL,	/* qi_qclose */
+	(int (*)()) NULL,	/* qi_qadmin */
+	&udmfminfo,		/* qi_minfo */
+	NULL			/* qi_mstat */
+};
+
+static struct streamtab udmf_info = {
+	&udmfrinit,	/* st_rdinit */
+	&udmfwinit,	/* st_wrinit */
+	NULL,		/* st_muxrinit */
+	NULL		/* st_muxwrinit */
+};
+
+static struct cb_ops cb_udmf_ops = {
+	nulldev,	/* cb_open */
+	nulldev,	/* cb_close */
+	nodev,		/* cb_strategy */
+	nodev,		/* cb_print */
+	nodev,		/* cb_dump */
+	nodev,		/* cb_read */
+	nodev,		/* cb_write */
+	nodev,		/* cb_ioctl */
+	nodev,		/* cb_devmap */
+	nodev,		/* cb_mmap */
+	nodev,		/* cb_segmap */
+	nochpoll,	/* cb_chpoll */
+	ddi_prop_op,	/* cb_prop_op */
+	&udmf_info,	/* cb_stream */
+	D_NEW|D_MP	/* cb_flag */
+};
+
+static struct dev_ops udmf_ops = {
+	DEVO_REV,	/* devo_rev */
+	0,		/* devo_refcnt */
+	usbgem_getinfo,	/* devo_getinfo */
+	nulldev,	/* devo_identify */
+	nulldev,	/* devo_probe */
+	udmfattach,	/* devo_attach */
+	udmfdetach,	/* devo_detach */
+	nodev,		/* devo_reset */
+	&cb_udmf_ops,	/* devo_cb_ops */
+	NULL,		/* devo_bus_ops */
+	usbgem_power,	/* devo_power */
+#if DEVO_REV >= 4
+	usbgem_quiesce,	/* devo_quiesce */
+#endif
+};
+#endif
+
+static struct modldrv modldrv = {
+	&mod_driverops,	/* Type of module.
This one is a driver */ + ident, + &udmf_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +/* ======================================================== */ +/* + * _init : done + */ +/* ======================================================== */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!udmf: _init: called")); + + status = usbgem_mod_init(&udmf_ops, "udmf"); + if (status != DDI_SUCCESS) { + return (status); + } + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) { + usbgem_mod_fini(&udmf_ops); + } + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!udmf: _fini: called")); + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) { + usbgem_mod_fini(&udmf_ops); + } + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/upf/adm8511reg.h b/usr/src/uts/common/io/upf/adm8511reg.h new file mode 100644 index 0000000000..68a2207bb5 --- /dev/null +++ b/usr/src/uts/common/io/upf/adm8511reg.h @@ -0,0 +1,205 @@ +/* + * @(#)adm8511reg.h 1.1 09/06/20 + * Register dehinitsions of ADMtek ADM8511 Fast Ethernet to USB controller. + * Codeded by Masayuki Murayama(KHF04453@nifty.ne.jp) + * This file is public domain. + */ + +#define EC0 0x00 /* B */ +#define EC1 0x01 /* B */ +#define EC2 0x02 /* B */ +#define MA 0x08 /* 8byte array */ +#define EID 0x10 /* B */ +#define PAUSETIMER 0x18 /* B pause timer */ +#define RPNBFC 0x1a /* B */ +#define ORFBFC 0x1b /* B */ +#define EP1C 0x1c /* B */ +#define RXFC 0x1d /* B */ +#define BIST 0x1e /* B */ +#define EEOFFSET 0x20 /* B */ +#define EEDATA 0x21 /* W */ +#define EECTRL 0x23 /* B */ +#define PHYA 0x25 /* B */ +#define PHYD 0x26 /* W */ +#define PHYAC 0x28 /* B */ +#define USBSTAT 0x2a /* B */ +#define ETHTXSTAT 0x2b /* W */ +#define ETHRXSTAT 0x2d /* B */ +#define LOSTCNT 0x2e /* W */ +#define WF0MASK 0x30 /* 16byte array */ +#define WF0OFFSET 0x40 /* W */ +#define WF0CRC 0x41 /* W */ +#define WF1MASK 0x48 /* 16byte array */ +#define WF1OFFSET 0x58 /* W */ +#define WF1CRC 0x59 /* W */ +#define WF2MASK 0x60 /* 16byte array */ +#define WF2OFFSET 0x70 /* W */ +#define WF2CRC 0x71 /* W */ +#define WCTRL 0x78 /* B */ +#define WSTAT 0x7a /* B */ +#define IPHYC 0x7b /* B */ +#define GPIO54 0x7c /* B */ +#define GPIO10 0x7e /* B */ +#define GPIO32 0x7f /* B */ +#define TEST 0x80 /* B */ +#define TM 0x81 /* B */ +#define RPN 0x82 /* B */ + +/* Ethernet control register 0: offset 0 */ +#define EC0_TXE 0x80U +#define EC0_RXE 0x40U +#define EC0_RXFCE 0x20U +#define EC0_WOE 0x10U +#define EC0_RXSA 0x08U +#define EC0_SBO 0x04U +#define EC0_RXMA 0x02U +#define EC0_RXCS 0x01U + +#define EC0_BITS \ + "\020" \ + "\010TXE" \ + "\007RXE" \ + "\006RXFCE" \ + "\005WOE" \ + "\004RXSA" \ + "\003SBO" \ + "\002RXMA" \ + "\001RXCS" + +/* Ethernet control register 1: offset 1 */ +#define EC1_FD 0x20U +#define EC1_100M 0x10U /* 0:10Mbps 1:100Mbps */ +#define EC1_RM 0x08U /* reset mac */ + +#define EC1_BITS \ + "\020" \ + "\006FD" \ + "\005100M" \ + "\004RM" + +/* Ethernet control register 2: offset 2 */ +#define EC2_MEPL 0x80U /* 8515: MTU 0:1528, 1:1638 */ +#define EC2_RPNC 0x40U +#define EC2_LEEPRS 0x20U +#define EC2_EEPRW 0x10U +#define EC2_LB 0x08U +#define EC2_PROM 0x04U +#define EC2_RXBP 0x02U +#define EC2_EP3RC 0x01U + +#define EC2_BITS \ + "\020" \ + "\010MEPS" \ + "\007RPNC" \ + "\006LEEPRS" \ + "\005EEPRW" \ + "\004LB" \ + "\003PROM" \ + 
"\002RXBP" \ + "\001EP3RC" + +/* Recieve Packet number based Flow Control register: offset 0x1a */ +#define RPNBFC_PN 0x7eU /* */ +#define RPNBFC_PN_SHIFT 1 +#define RPNBFC_FCP 0x01U /* enable rx flow control */ + +/* Occupied Recieve FIFO based Flow Control register: offset 0x1b */ +#define ORFBFC_RXS 0x7eU /* */ +#define ORFBFC_RXS_SHIFT 1 +#define ORFBFC_RXS_UNIT 1024U +#define ORFBFC_FCRXS 0x01U /* enable rx flow control */ + +/* EP1 control register: offset 0x1c */ +#define EP1C_EP1S0E 0x80U /* send 0 enable */ +#define EP1C_ITMA 0x60U /* internal test mode A */ +#define EP1C_ITMB 0x1fU /* internal test mode B */ + +#define EP1C_BITS \ + "\020" \ + "\010EP1S0E" + +/* Rx FIFO Control register: offset 0x1d */ +#define RXFC_EXT_SRAM 0x02 /* enable external 32k sram */ +#define RXFC_RX32PKT 0x01 /* max 32 packet */ + +/* EEPROM offset register: offset 0x20 */ +#define EEOFFSET_MASK 0x3f /* eeprom offset address in word */ + +/* EEPROM access control register: offset 0x23 */ +#define EECTRL_DONE 0x04 +#define EECTRL_RD 0x02 +#define EECTRL_WR 0x01 + +#define EECTRL_BITS \ + "\020" \ + "\003DONE" \ + "\002RD" \ + "\001WR" + +/* PHY control register: offset 28 */ +#define PHYAC_DO 0x80U /* Done */ +#define PHYAC_RDPHY 0x40U /* read phy */ +#define PHYAC_WRPHY 0x20U /* write phy */ +#define PHYAC_PHYRA 0x1fU /* PHY register address */ + +#define PHYCTRL_BITS \ + "\020" \ + "\010DO" \ + "\007RDPHY" \ + "\006WRPHY" + +/* Internal PHY control register: offset 7b */ +#define IPHYC_EPHY 0x02 +#define IPHYC_PHYR 0x01 + +#define IPHYC_BITS \ + "\020" \ + "\002EPHY" \ + "\001PHYR" + +/* GPIO45 register: offset 7c */ +#define GPIO54_5OE 0x20 +#define GPIO54_5O 0x10 +#define GPIO54_5I 0x08 +#define GPIO54_4OE 0x04 +#define GPIO54_4O 0x02 +#define GPIO54_4I 0x01 + +/* GPIO01 register: offset 7e */ +#define GPIO10_1OE 0x20 +#define GPIO10_1O 0x10 +#define GPIO10_1I 0x08 +#define GPIO10_0OE 0x04 +#define GPIO10_0O 0x02 +#define GPIO10_0I 0x01 + +/* GPIO23 register: offset 7f */ +#define GPIO32_3OE 0x20 +#define GPIO32_3O 0x10 +#define GPIO32_3I 0x08 +#define GPIO32_2OE 0x04 +#define GPIO32_2O 0x02 +#define GPIO32_2I 0x01 + +/* rx status at the end of received packets */ +/* byte 0 and 1 is packet length in little endian */ +/* byte 2 is receive status */ +#define RSR_DRIBBLE 0x10 +#define RSR_CRC 0x08 +#define RSR_RUNT 0x04 +#define RSR_LONG 0x02 +#define RSR_MULTI 0x01 + +#define RSR_ERRORS \ + (RSR_DRIBBLE | RSR_CRC | RSR_RUNT | RSR_LONG | RSR_MULTI) + +#define RSR_BITS \ + "\020" \ + "\005DRIBBLE" \ + "\004CRC" \ + "\003RUNT" \ + "\002LONG" \ + "\001MULTI" +/* byte 3 is reserved */ + +/* TEST register: offset 80 */ diff --git a/usr/src/uts/common/io/upf/upf_usbgem.c b/usr/src/uts/common/io/upf/upf_usbgem.c new file mode 100644 index 0000000000..5614803158 --- /dev/null +++ b/usr/src/uts/common/io/upf/upf_usbgem.c @@ -0,0 +1,1213 @@ +/* + * upf_usbgem.c : ADMtek an986/adm8511/adm8513/adm8515 USB to + * Fast Ethernet Driver for Solaris + */ + +/* + * Copyright (c) 2004-2011 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#pragma ident "%W% %E%" + +/* + * Changelog: + */ + +/* + * TODO + */ +/* ======================================================= */ + +/* + * Solaris system header files and macros + */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/byteorder.h> + +/* ethernet stuff */ +#include <sys/ethernet.h> + +/* interface card depend stuff */ +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/usb/usba.h> +#include "usbgem.h" + +/* hardware stuff */ +#include "usbgem_mii.h" +#include "adm8511reg.h" + +char ident[] = "pegasus usbnic driver v" VERSION; + +/* + * Useful macros + */ +#define CHECK_AND_JUMP(val, label) \ + if ((val) != USB_SUCCESS) { goto label; } + +/* + * Debugging + */ +#ifdef DEBUG_LEVEL +static int upf_debug = DEBUG_LEVEL; +#define DPRINTF(n, args) if (upf_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Our configration for ADMtek Pegasus/PegasusII + */ +/* timeouts */ +#define ONESEC (drv_usectohz(1*1000000)) + +/* + * Local device definitions + */ +struct upf_dev { + /* + * Misc HW information + */ + uint8_t ec[3]; + uint8_t mac_addr[ETHERADDRL]; + int chip_type; +#define CHIP_AN986 1 /* avoid 0 */ +#define CHIP_ADM8511 2 /* including adm8515 */ +#define CHIP_ADM8513 3 + boolean_t phy_init_done; + uint8_t last_link_state; + + uint16_t vid; /* vendor id */ + uint16_t pid; /* product id */ +}; + +/* + * private functions + */ + +/* mii operations */ +static uint16_t upf_mii_read(struct usbgem_dev *, uint_t, int *errp); +static void upf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp); + +/* nic operations */ +static int upf_attach_chip(struct usbgem_dev *); +static int upf_reset_chip(struct usbgem_dev *); +static int upf_init_chip(struct usbgem_dev *); +static int upf_start_chip(struct usbgem_dev *); +static int upf_stop_chip(struct usbgem_dev *); +static int upf_set_media(struct usbgem_dev *); +static int upf_set_rx_filter(struct usbgem_dev *); +static int upf_get_stats(struct usbgem_dev *); + +/* packet operations */ +static mblk_t *upf_tx_make_packet(struct usbgem_dev *, mblk_t *); +static mblk_t *upf_rx_make_packet(struct usbgem_dev *, mblk_t *); 
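+
+/*
+ * Editor's note (illustrative, not part of the original driver): every
+ * register access in this driver goes through a USB vendor-specific
+ * control transfer, wrapped by the OUTB()/INB()-style macros defined
+ * just below, which jump to a caller-supplied error label on failure.
+ * The sketch that follows shows the INB() access pattern written out
+ * as a plain function, assuming only the usbgem_ctrl_in_val() call
+ * signature those macros already use; the helper name
+ * upf_get_reg_byte() is hypothetical, and the block is kept under
+ * "#ifdef notdef" so it is never compiled.
+ */
+#ifdef notdef
+static int
+upf_get_reg_byte(struct usbgem_dev *dp, uint_t reg, uint8_t *valp)
+{
+	/* read one byte from register offset <reg> via vendor request 0xf0 */
+	return (usbgem_ctrl_in_val(dp,
+	    /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST
+	    | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV,
+	    /* bRequest */ UPF_REQ_GET_REGISTER,
+	    /* wValue */ 0,
+	    /* wIndex */ reg,
+	    /* wLength */ 1,
+	    /* valuep */ valp));
+}
+#endif /* notdef */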
+ +/* interrupt handler */ +static void upf_interrupt(struct usbgem_dev *, mblk_t *); + +/* =============================================================== */ +/* + * I/O functions + */ +/* =============================================================== */ +#define UPF_REQ_GET_REGISTER 0xf0 +#define UPF_REQ_SET_REGISTER 0xf1 +#define OUTB(dp, p, v, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_SET_REGISTER, \ + /* wValue */ (v), \ + /* wIndex */ (p), \ + /* wLength */ 1, \ + /* buf */ NULL, \ + /* size */ 0)) != USB_SUCCESS) goto label; + +#define OUTW(dp, p, v, errp, label) \ + if ((*(errp) = usbgem_ctrl_out_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_SET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ 2, \ + /* value */ (v))) != USB_SUCCESS) goto label + +#define OUTS(dp, p, buf, len, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_SET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ (len), \ + /* buf */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +#define INB(dp, p, vp, errp, label) \ + if ((*(errp) = usbgem_ctrl_in_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_GET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ 1, \ + /* valuep */ (vp))) != USB_SUCCESS) goto label + +#define INW(dp, p, vp, errp, label) \ + if ((*(errp) = usbgem_ctrl_in_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_GET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ 2, \ + /* valuep */ (vp))) != USB_SUCCESS) goto label + +#define INS(dp, p, buf, len, errp, label) \ + if ((*(errp) = usbgem_ctrl_in((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ UPF_REQ_GET_REGISTER, \ + /* wValue */ 0, \ + /* wIndex */ (p), \ + /* wLength */ (len), \ + /* buf */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +/* =============================================================== */ +/* + * Hardware manupilation + */ +/* =============================================================== */ +static int +upf_reset_chip(struct usbgem_dev *dp) +{ + int i; + uint8_t val; + int err; + struct upf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + bzero(lp->mac_addr, sizeof (lp->mac_addr)); + + lp->ec[1] = 0; + OUTB(dp, EC1, EC1_RM, &err, usberr); + + for (i = 0; i < 1000; i++) { + INB(dp, EC1, &val, &err, usberr); + if ((val & EC1_RM) == 0) { + lp->ec[1] = val; + return (USB_SUCCESS); + } + drv_usecwait(10); + } + + /* time out */ + cmn_err(CE_WARN, "!%s: failed to reset: timeout", dp->name); + return (USB_FAILURE); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +/* + * Setup an986/adm8511/adm8513/adm8515 + */ +static int +upf_init_chip(struct usbgem_dev *dp) +{ + uint64_t zero64 = 0; + int err = USB_SUCCESS; + struct upf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* 
ethernet control register 0 */
+	lp->ec[0] |= EC0_RXSA | EC0_RXCS;
+	OUTB(dp, EC0, lp->ec[0], &err, usberr);
+
+	/* ethernet control register 1: will be set later in set_rx_filter() */
+
+	/* ethernet control register 2: will be set later in set_rx_filter() */
+	INB(dp, EC2, &lp->ec[2], &err, usberr);
+	lp->ec[2] |= EC2_RXBP | EC2_EP3RC;
+#ifdef CONFIG_VLAN
+	if (dp->misc_flag & USBGEM_VLAN) {
+		lp->ec[2] |= EC2_MEPL;
+	}
+#endif
+	OUTB(dp, EC2, lp->ec[2], &err, usberr);
+
+	/* Multicast address hash: clear */
+	OUTS(dp, MA, &zero64, 8, &err, usberr);
+
+	/* Ethernet ID: will be set later in upf_set_rx_filter() */
+
+	/* PAUSE timer */
+	OUTB(dp, PAUSETIMER, 0x1f, &err, usberr);
+
+	/* receive packet number based pause control: set in upf_set_media() */
+
+	/* occupied receive FIFO based pause control: set in upf_set_media() */
+
+	/* EP1 control: default */
+
+	/* Rx FIFO control */
+	if (lp->chip_type != CHIP_AN986) {
+		/* use 24K internal sram, 16pkts in fifo */
+		OUTB(dp, RXFC, 0, &err, usberr);
+	}
+
+	/* BIST control: do nothing */
+	err = upf_set_media(dp);
+	CHECK_AND_JUMP(err, usberr);
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: end (success)", dp->name, __func__));
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_NOTE, "!%s: %s: usberr(%d) detected",
+	    dp->name, __func__, err);
+	return (err);
+}
+
+static int
+upf_start_chip(struct usbgem_dev *dp)
+{
+	int err = USB_SUCCESS;
+	struct upf_dev *lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* enable RX and TX */
+	lp->ec[0] |= EC0_TXE | EC0_RXE;
+	OUTB(dp, EC0, lp->ec[0], &err, usberr);
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_WARN, "!%s: %s: usberr(%d) detected",
+	    dp->name, __func__, err);
+	return (err);
+}
+
+static int
+upf_stop_chip(struct usbgem_dev *dp)
+{
+	int err;
+	struct upf_dev *lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* disable RX and TX */
+	lp->ec[0] &= ~(EC0_TXE | EC0_RXE);
+	OUTB(dp, EC0, lp->ec[0], &err, usberr);
+
+	return (USB_SUCCESS);
+
+usberr:
+	cmn_err(CE_WARN, "!%s: %s: usberr(%d) detected",
+	    dp->name, __func__, err);
+	return (err);
+}
+
+static int
+upf_get_stats(struct usbgem_dev *dp)
+{
+	/* do nothing */
+	return (USB_SUCCESS);
+}
+
+static uint_t
+upf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr)
+{
+	/* hash table is 64 = 2^6 bit width */
+	return (usbgem_ether_crc_le(addr) & 0x3f);
+}
+
+static int
+upf_set_rx_filter(struct usbgem_dev *dp)
+{
+	int i;
+	int err;
+#ifdef DEBUG_LEVEL
+	uint8_t reg0;
+	uint8_t reg1;
+	uint8_t reg2;
+#endif
+	struct upf_dev *lp = dp->private;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called, rxmode:%b",
+	    dp->name, __func__, dp->rxmode, RXMODE_BITS));
+
+	/* reset rx mode */
+	lp->ec[0] &= ~EC0_RXMA;
+	lp->ec[2] &= ~EC2_PROM;
+
+	if (dp->rxmode & RXMODE_PROMISC) {
+		/* promiscuous mode implies all multicast and all physical */
+		lp->ec[0] |= EC0_RXMA;
+		lp->ec[2] |= EC2_PROM;
+	} else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 0) {
+		/* XXX - multicast hash table didn't work */
+		/* accept all multicast packets */
+		lp->ec[0] |= EC0_RXMA;
+	}
+
+	if (bcmp(dp->cur_addr.ether_addr_octet,
+	    lp->mac_addr, ETHERADDRL) != 0) {
+
+		/* need to update mac address */
+		bcopy(dp->cur_addr.ether_addr_octet,
+		    lp->mac_addr, ETHERADDRL);
+		OUTS(dp, EID,
+		    lp->mac_addr, ETHERADDRL, &err, usberr);
+	}
+
+	/* update rx mode */
+	OUTS(dp, EC0, lp->ec, 3, &err, usberr);
+
+#if DEBUG_LEVEL > 0
+	INB(dp, EC0, &reg0, &err, usberr);
+	INB(dp, EC1, &reg1, &err, usberr);
+	INB(dp, EC2, &reg2, &err, usberr);
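+	/*
+	 * Editor's note: lp->ec[] is a software shadow of EC0..EC2; the
+	 * three DEBUG-only reads above fetch the registers back from the
+	 * device so the log below can confirm that the 3-byte OUTS()
+	 * burst write actually latched.
+	 */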
+ + cmn_err(CE_CONT, "!%s: %s: returned, ec:%b %b %b", + dp->name, __func__, + reg0, EC0_BITS, reg1, EC1_BITS, reg2, EC2_BITS); +#endif + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (err); +} + +static int +upf_set_media(struct usbgem_dev *dp) +{ + int err; + struct upf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + lp->ec[1] &= ~(EC1_FD | EC1_100M); + + /* select duplex */ + if (dp->full_duplex) { + lp->ec[1] |= EC1_FD; + } + + /* select speed */ + if (dp->speed == USBGEM_SPD_100) { + lp->ec[1] |= EC1_100M; + } + + /* rx flow control */ + switch (dp->flow_control) { + case FLOW_CONTROL_SYMMETRIC: + case FLOW_CONTROL_RX_PAUSE: + lp->ec[0] |= EC0_RXFCE; + break; + + default: + lp->ec[0] &= ~EC0_RXFCE; + break; + } + + /* tx flow control */ + switch (dp->flow_control) { + case FLOW_CONTROL_SYMMETRIC: + case FLOW_CONTROL_TX_PAUSE: + if (lp->chip_type != CHIP_AN986) { + /* pegasus II has internal 24k fifo */ + OUTB(dp, ORFBFC, + (12 << ORFBFC_RXS_SHIFT) | ORFBFC_FCRXS, + &err, usberr); + + /* 16 packts can be stored in rx fifo */ + OUTB(dp, RPNBFC_PN, + (8 << RPNBFC_PN_SHIFT) | RPNBFC_FCP, + &err, usberr); + } else { + /* an986 has external 32k fifo */ + OUTB(dp, ORFBFC, + (16 << ORFBFC_RXS_SHIFT) | ORFBFC_FCRXS, + &err, usberr); + + /* AN986 fails to link up when RPNBFC is enabled */ + OUTB(dp, RPNBFC, 0, &err, usberr); + } + break; + + default: + OUTB(dp, ORFBFC, 0, &err, usberr); + OUTB(dp, RPNBFC, 0, &err, usberr); + break; + } + + /* update ether control registers */ + OUTS(dp, EC0, lp->ec, 2, &err, usberr); + DPRINTF(0, (CE_CONT, "!%s: %s: returned, ec0:%b, ec1:%b", + dp->name, __func__, lp->ec[0], EC0_BITS, lp->ec[1], EC1_BITS)); + + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "%s: %s: failed to write ec1", dp->name, __func__); + return (err); +} + +/* + * send/receive packet check + */ +static mblk_t * +upf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + size_t len; + mblk_t *new; + mblk_t *tp; + uint8_t *bp; + uint8_t *last_pos; + int msglen; + + DPRINTF(3, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + len = msgdsize(mp); + if (len < ETHERMIN) { + len = ETHERMIN; + } + + /* allocate msg block */ + msglen = len + sizeof (uint16_t); + + /* avoid usb controller bug */ + if ((msglen & 0x3f) == 0) { + /* add a header for additional 0-length usb message */ + msglen += sizeof (uint16_t); + } + + if ((new = allocb(msglen, 0)) == NULL) { + return (NULL); + } + + /* copy contents of the buffer */ + new->b_wptr = new->b_rptr + msglen; + bp = new->b_rptr; + + /* the nic requires a two byte header of the packet size */ + bp[0] = (uint8_t)len; + bp[1] = (uint8_t)(len >> 8); + bp += sizeof (uint16_t); + + /* copy the payload */ + for (tp = mp; tp; tp = tp->b_cont) { + len = tp->b_wptr - tp->b_rptr; + if (len > 0) { + bcopy(tp->b_rptr, bp, len); + bp += len; + } + } + + /* clear ethernet pads and additional usb header if we have */ + last_pos = new->b_wptr; + while (bp < last_pos) { + *bp++ = 0; + } + + return (new); +} + +static void +upf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n) +{ + int i; + + for (i = 0; i < n; i += 8, bp += 8) { + cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x", + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]); + } +} + +static mblk_t * +upf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + uint8_t *p; + uint16_t rxhd; + uint_t len; + uint8_t rsr; + struct upf_dev *lp = dp->private; + + ASSERT(mp != NULL); + 
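+	/*
+	 * Editor's note: the rx frame layout handled below is recovered
+	 * from the code itself.  Each bulk-in frame carries a 4-byte
+	 * trailer whose bytes 0-1 are a little-endian packet length (low
+	 * 12 bits valid) and whose byte at offset 3 is taken as the
+	 * receive status (RSR_*).  On the ADM8513 the frame additionally
+	 * begins with a 2-byte little-endian length header, which is
+	 * used instead of the trailer length and then stripped.
+	 */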
+#ifdef DEBUG_LEVEL + len = msgdsize(mp); + DPRINTF(2, (CE_CONT, "!%s: time:%d %s: cont:%p", + dp->name, ddi_get_lbolt(), __func__, len, mp->b_cont)); + + if (upf_debug > 3) { + upf_dump_packet(dp, mp->b_rptr, max(6, len)); + } +#endif + /* get the length of Rx packet */ + p = mp->b_wptr - 4; + rsr = p[3]; + if (lp->chip_type == CHIP_ADM8513) { + /* As Rx packets from ADM8513 have two byte header, remove it */ + p = mp->b_rptr; + len = ((p[1] << 8) | p[0]) & 0x0fff; + mp->b_rptr += 2; + } else { + len = (((p[1] << 8) | p[0]) & 0x0fff) - ETHERFCSL - 4; + } + + DPRINTF(2, (CE_CONT, "!%s: %s: rsr:%b len:%d", + dp->name, __func__, rsr, RSR_BITS, len)); + + /* check if error happen */ + if (rsr & RSR_ERRORS) { + DPRINTF(0, (CE_CONT, "!%s: rsr:%b", dp->name, rsr, RSR_BITS)); + if (rsr & (RSR_CRC | RSR_DRIBBLE)) { + dp->stats.frame++; + } + if (rsr & RSR_LONG) { + dp->stats.frame_too_long++; + } + if (rsr & RSR_RUNT) { + dp->stats.runt++; + } + + dp->stats.errrcv++; + return (NULL); + } +#ifndef CONFIG_VLAN + /* check packet size */ + if (len > ETHERMAX) { + /* too long */ + dp->stats.frame_too_long++; + dp->stats.errrcv++; + return (NULL); + } else if (len < ETHERMIN) { + dp->stats.runt++; + dp->stats.errrcv++; + return (NULL); + } +#endif + /* remove tailing crc and rx status fields */ + mp->b_wptr = mp->b_rptr + len; + ASSERT(mp->b_next == NULL); + return (mp); +} + +/* + * Device depend interrupt handler + */ +static void +upf_interrupt(struct usbgem_dev *dp, mblk_t *mp) +{ + uint8_t *bp; + struct upf_dev *lp = dp->private; + + bp = mp->b_rptr; + + DPRINTF(2, (CE_CONT, + "!%s: %s: size:%d, %02x %02x %02x %02x %02x %02x %02x %02x", + dp->name, __func__, mp->b_wptr - mp->b_rptr, + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7])); + + if ((lp->last_link_state ^ bp[5]) & 1) { + DPRINTF(1, (CE_CONT, "!%s:%s link status changed:", + dp->name, __func__)); + usbgem_mii_update_link(dp); + } + + lp->last_link_state = bp[5] & 1; +} + +/* + * MII Interfaces + */ +static uint16_t +upf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + uint8_t phyctrl; + uint16_t val; + int i; + + DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d", + dp->name, __func__, index)); + ASSERT(index >= 0 && index < 32); + + *errp = USB_SUCCESS; + + /* set PHYADDR */ + OUTB(dp, PHYA, dp->mii_phy_addr, errp, usberr); + + /* Initiate MII read transaction */ + OUTB(dp, PHYAC, index | PHYAC_RDPHY, errp, usberr); + + for (i = 0; i < 100; i++) { + INB(dp, PHYAC, &phyctrl, errp, usberr); + if (phyctrl & PHYAC_DO) { + /* done */ + INW(dp, PHYD, &val, errp, usberr); + DPRINTF(4, (CE_CONT, "!%s: %s: return %04x", + dp->name, __func__, val)); + return (val); + } + drv_usecwait(10); + } + /* timeout */ + cmn_err(CE_WARN, "!%s: %s: timeout detected", dp->name, __func__); + *errp = USB_FAILURE; + return (0); + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); + return (0); +} + +static void +upf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp) +{ + int i; + uint8_t phyctrl; + + DPRINTF(4, (CE_CONT, "!%s: %s called index:%d val:0x%04x", + dp->name, __func__, index, val)); + ASSERT(index >= 0 && index < 32); + + *errp = USB_SUCCESS; + + OUTW(dp, PHYD, val, errp, usberr); + OUTB(dp, PHYA, dp->mii_phy_addr, errp, usberr); + OUTB(dp, PHYAC, index | PHYAC_WRPHY, errp, usberr); + + for (i = 0; i < 100; i++) { + INB(dp, PHYAC, &phyctrl, errp, usberr); + if (phyctrl & PHYAC_DO) { + /* done */ + return; + } + drv_usecwait(10); + } + + /* time out */ + cmn_err(CE_WARN, "!%s: %s: 
timeout detected", dp->name, __func__); + *errp = USB_FAILURE; + return; + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); +} + + +static int +upf_enable_phy(struct usbgem_dev *dp) +{ + uint8_t val; + int err; + struct upf_dev *lp = dp->private; + + /* + * first, try to enable internal phy + */ + INB(dp, IPHYC, &val, &err, usberr); + val = (val | IPHYC_EPHY) & ~IPHYC_PHYR; + OUTB(dp, IPHYC, val, &err, usberr); + + INB(dp, IPHYC, &val, &err, usberr); + DPRINTF(0, (CE_CONT, "!%s: %s: IPHYC: %b", + dp->name, __func__, val, IPHYC_BITS)); + if (val) { + /* reset internal phy */ + OUTB(dp, IPHYC, val | IPHYC_PHYR, &err, usberr); + OUTB(dp, IPHYC, val, &err, usberr); + delay(drv_usectohz(10000)); + + /* identify the chip generation */ + OUTB(dp, 0x83, 0xa5, &err, usberr); + INB(dp, 0x83, &val, &err, usberr); + if (val == 0xa5) { + lp->chip_type = CHIP_ADM8513; + } else { + /* adm8511 or adm8515 */ + lp->chip_type = CHIP_ADM8511; + } + dp->ugc.usbgc_mii_hw_link_detection = B_TRUE; + } else { + /* + * It should be AN986 which doesn't have an internal PHY. + * We need to setup gpio ports in AN986, which are + * connected to external PHY control pins. + */ + lp->chip_type = CHIP_AN986; + + /* reset external phy */ + /* output port#0 L, port#1 L */ + OUTB(dp, GPIO10, GPIO10_0O | GPIO10_0OE, &err, usberr); + + /* output port#0 H, port#1 L */ + OUTB(dp, GPIO10, + GPIO10_0O | GPIO10_0OE | GPIO10_1OE, &err, usberr); + + /* hw link detection doesn't work correctly */ + dp->ugc.usbgc_mii_hw_link_detection = B_FALSE; + } + + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +static int +upf_mii_probe(struct usbgem_dev *dp) +{ + int err; + uint16_t val; + struct upf_dev *lp = dp->private; + + if (!lp->phy_init_done) { + upf_enable_phy(dp); + lp->phy_init_done = B_TRUE; + } + + return (usbgem_mii_probe_default(dp)); +} + +static int +upf_mii_init(struct usbgem_dev *dp) +{ + uint16_t val; + int err = USB_SUCCESS; + struct upf_dev *lp = dp->private; + + if (!lp->phy_init_done) { + upf_enable_phy(dp); + } + lp->phy_init_done = B_FALSE; + + if (lp->chip_type == CHIP_AN986 && + (lp->vid == 0x0db7 /* elecom */ || + lp->vid == 0x066b /* linksys */ || + lp->vid == 0x077b /* linksys */ || + lp->vid == 0x2001 /* dlink */)) { + /* special treatment for Linksys products */ + val = upf_mii_read(dp, 0x1b, &err) | 0x4; + upf_mii_write(dp, 0x1b, val, &err); + } + return (err); +} + +/* ======================================================== */ +/* + * OS depend (device driver DKI) routine + */ +/* ======================================================== */ +static uint16_t +upf_read_eeprom(struct usbgem_dev *dp, int index, int *errp) +{ + int i; + uint8_t eectrl; + uint16_t data; + + *errp = USB_SUCCESS; + + OUTB(dp, EECTRL, 0, errp, usberr); + + OUTB(dp, EEOFFSET, index, errp, usberr); + OUTB(dp, EECTRL, EECTRL_RD, errp, usberr); + + for (i = 0; i < 100; i++) { + INB(dp, EECTRL, &eectrl, errp, usberr); + if (eectrl & EECTRL_DONE) { + INW(dp, EEDATA, &data, errp, usberr); + return (data); + } + drv_usecwait(10); + } + + /* time out */ + *errp = USB_FAILURE; + return (0); + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); + return (0); +} + +static void +upf_eeprom_dump(struct usbgem_dev *dp, int size) +{ + int i; + int err; + + cmn_err(CE_CONT, "!%s: %s dump:", dp->name, __func__); + + for (i = 0; i < size; i += 4) { + cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 
0x%04x 0x%04x", + i*2, + upf_read_eeprom(dp, i + 0, &err), + upf_read_eeprom(dp, i + 1, &err), + upf_read_eeprom(dp, i + 2, &err), + upf_read_eeprom(dp, i + 3, &err)); + } +} + +static int +upf_attach_chip(struct usbgem_dev *dp) +{ + int i; + int err; + uint16_t val; + uint8_t *mac; + struct upf_dev *lp = dp->private; + + /* + * Read mac address from EEPROM + */ + mac = dp->dev_addr.ether_addr_octet; + for (i = 0; i < 3; i++) { + val = upf_read_eeprom(dp, i, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + mac[i*2+0] = (uint8_t)val; + mac[i*2+1] = (uint8_t)(val >> 8); + } + + DPRINTF(0, (CE_CONT, + "%s: %s: mac: %02x:%02x:%02x:%02x:%02x:%02x", + dp->name, __func__, + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5])); + + dp->misc_flag = 0; +#ifdef CONFIG_VLAN + dp->misc_flag |= USBGEM_VLAN; +#endif +#if DEBUG_LEVEL > 3 + upf_eeprom_dump(dp, 0x80); +#endif + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "!%s: %s: usb error detected", dp->name, __func__); + return (USB_FAILURE); +} + +static int +upfattach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i; + ddi_iblock_cookie_t c; + int ret; + int unit; + uint32_t tcr; + int len; + const char *drv_name; + struct usbgem_dev *dp; + void *base; + struct usbgem_conf *ugcp; + struct upf_dev *lp; + + unit = ddi_get_instance(dip); + drv_name = ddi_driver_name(dip); + + DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d", + drv_name, unit, __func__, cmd)); + + if (cmd == DDI_ATTACH) { + /* + * construct usbgem configration + */ + ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP); + + /* name */ + sprintf(ugcp->usbgc_name, "%s%d", drv_name, unit); + ugcp->usbgc_ppa = unit; + + ugcp->usbgc_ifnum = 0; + ugcp->usbgc_alt = 0; + + ugcp->usbgc_tx_list_max = 16; + + ugcp->usbgc_rx_header_len = 4; + ugcp->usbgc_rx_list_max = 64; + + /* time out parameters */ + ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT; + ugcp->usbgc_tx_timeout_interval = USBGEM_TX_TIMEOUT_INTERVAL; + + /* flow control */ + ugcp->usbgc_flow_control = FLOW_CONTROL_NONE; + ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE; + + /* MII timeout parameters */ + ugcp->usbgc_mii_link_watch_interval = ONESEC; + ugcp->usbgc_mii_an_watch_interval = ONESEC/5; + ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */ + ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT; /* 5 sec */ + ugcp->usbgc_mii_an_wait = MII_AN_TIMEOUT/2; + ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT; + ugcp->usbgc_mii_an_delay = ONESEC/10; + + ugcp->usbgc_mii_linkdown_action = MII_ACTION_RESET; + ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET; + ugcp->usbgc_mii_dont_reset = B_FALSE; + + /* I/O methods */ + + /* mac operation */ + ugcp->usbgc_attach_chip = &upf_attach_chip; + ugcp->usbgc_reset_chip = &upf_reset_chip; + ugcp->usbgc_init_chip = &upf_init_chip; + ugcp->usbgc_start_chip = &upf_start_chip; + ugcp->usbgc_stop_chip = &upf_stop_chip; + ugcp->usbgc_multicast_hash = &upf_mcast_hash; + + ugcp->usbgc_set_rx_filter = &upf_set_rx_filter; + ugcp->usbgc_set_media = &upf_set_media; + ugcp->usbgc_get_stats = &upf_get_stats; + ugcp->usbgc_interrupt = &upf_interrupt; + + /* packet operation */ + ugcp->usbgc_tx_make_packet = &upf_tx_make_packet; + ugcp->usbgc_rx_make_packet = &upf_rx_make_packet; + + /* mii operations */ + ugcp->usbgc_mii_probe = &upf_mii_probe; + ugcp->usbgc_mii_init = &upf_mii_init; + ugcp->usbgc_mii_config = &usbgem_mii_config_default; + ugcp->usbgc_mii_read = &upf_mii_read; + ugcp->usbgc_mii_write = &upf_mii_write; + + /* mtu */ + ugcp->usbgc_min_mtu = ETHERMTU; + ugcp->usbgc_max_mtu = 
ETHERMTU; + ugcp->usbgc_default_mtu = ETHERMTU; + + lp = kmem_zalloc(sizeof (struct upf_dev), KM_SLEEP); + + lp->vid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "usb-vendor-id", -1); + lp->pid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, + DDI_PROP_DONTPASS, "usb-product-id", -1); + + dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct upf_dev)); + + kmem_free(ugcp, sizeof (*ugcp)); + + if (dp != NULL) { + return (DDI_SUCCESS); + } + +err_free_mem: + kmem_free(lp, sizeof (struct upf_dev)); +err_close_pipe: +err: + return (DDI_FAILURE); + } + if (cmd == DDI_RESUME) { + dp = USBGEM_GET_DEV(dip); + lp = dp->private; + lp->phy_init_done = B_FALSE; + + return (usbgem_resume(dip)); + } + return (DDI_FAILURE); +} + +static int +upfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int ret; + + if (cmd == DDI_DETACH) { + ret = usbgem_do_detach(dip); + if (ret != DDI_SUCCESS) { + return (DDI_FAILURE); + } + return (DDI_SUCCESS); + } + if (cmd == DDI_SUSPEND) { + return (usbgem_suspend(dip)); + } + return (DDI_FAILURE); +} + +/* ======================================================== */ +/* + * OS depend (loadable streams driver) routine + */ +/* ======================================================== */ +#ifdef USBGEM_CONFIG_GLDv3 +USBGEM_STREAM_OPS(upf_ops, upfattach, upfdetach); +#else +static struct module_info upfminfo = { + 0, /* mi_idnum */ + "upf", /* mi_idname */ + 0, /* mi_minpsz */ + ETHERMTU, /* mi_maxpsz */ + 32*1024, /* mi_hiwat */ + 1, /* mi_lowat */ +}; + +static struct qinit upfrinit = { + (int (*)()) NULL, /* qi_putp */ + usbgem_rsrv, /* qi_srvp */ + usbgem_open, /* qi_qopen */ + usbgem_close, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &upfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct qinit upfwinit = { + usbgem_wput, /* qi_putp */ + usbgem_wsrv, /* qi_srvp */ + (int (*)()) NULL, /* qi_qopen */ + (int (*)()) NULL, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &upfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct streamtab upf_info = { + &upfrinit, /* st_rdinit */ + &upfwinit, /* st_wrinit */ + NULL, /* st_muxrinit */ + NULL /* st_muxwrinit */ +}; + +static struct cb_ops cb_upf_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + &upf_info, /* cb_stream */ + D_MP /* cb_flag */ +}; + +static struct dev_ops upf_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + usbgem_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + upfattach, /* devo_attach */ + upfdetach, /* devo_detach */ + nodev, /* devo_reset */ + &cb_upf_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + usbgem_power, /* devo_power */ +#if DEVO_REV >= 4 + usbgem_quiesce, /* devo_quiesce */ +#endif + +}; +#endif +static struct modldrv modldrv = { + &mod_driverops, /* Type of module. 
This one is a driver */ + ident, + &upf_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +/* ======================================================== */ +/* + * _init : done + */ +/* ======================================================== */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!upf: _init: called")); + + status = usbgem_mod_init(&upf_ops, "upf"); + if (status != DDI_SUCCESS) { + return (status); + } + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) { + usbgem_mod_fini(&upf_ops); + } + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!upf: _fini: called")); + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) { + usbgem_mod_fini(&upf_ops); + } + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/urf/rtl8150reg.h b/usr/src/uts/common/io/urf/rtl8150reg.h new file mode 100644 index 0000000000..7cba53356e --- /dev/null +++ b/usr/src/uts/common/io/urf/rtl8150reg.h @@ -0,0 +1,218 @@ +/* + * @(#)rtl8150reg.h 1.1 04/09/16 + * Macro definitions for Realtek 8150 USB to fast ethernet controller + * based on Realtek RTL8150 data sheet + * This file is public domain. Coded by M.Murayama (KHF04453@nifty.com) + */ + +/* + * Register offset + */ +#define IDR 0x0120 /* Base of ID registers */ +#define MAR 0x0126 /* Base of multicast registers */ +#define CR 0x012e /* Command register */ +#define TCR 0x012f /* Transmit Configuration register */ +#define RCR 0x0130 /* Receive Configuration register */ +#define TSR 0x0132 /* Transmit Status register */ +#define RSR 0x0133 /* Receive Status register */ +#define CON0 0x0135 /* Configuration register 0 */ +#define CON1 0x0136 /* Configuration register 1 */ +#define MSR 0x0137 /* Media Status register */ +#define PHYADD 0x0138 /* PHY address register */ +#define PHYDAT 0x0139 /* PHY data register */ +#define PHYCNT 0x013b /* PHY control register */ +#define GPPC 0x013d /* General purpose pin control */ +#define WAKECNT 0x013e /* Wake up event control */ +#define BMCR 0x0140 /* Basic Mode Control register */ +#define BMSR 0x0142 /* Basic Mode Status register */ +#define ANAR 0x0144 /* Auto Negotiation Advertisement register */ +#define ANLP 0x0146 /* Auto Negotiation Link Partner register */ +#define ANER 0x0148 /* Auto Negotiation Expansion register */ +#define NWAYT 0x014a /* Nway test register */ +#define CSCR 0x014c /* CS configuration register */ +#define CRC0 0x014e /* Power management register for wakeup frame0 */ +#define CRC1 0x0150 /* Power management register for wakeup frame1 */ +#define CRC2 0x0152 /* Power management register for wakeup frame2 */ +#define CRC3 0x0154 /* Power management register for wakeup frame3 */ +#define CRC4 0x0156 /* Power management register for wakeup frame4 */ +#define BYTEMASK0 0x0158 /* Power management wakeup frame0 bytemask */ +#define BYTEMASK1 0x0160 /* Power management wakeup frame1 bytemask */ +#define BYTEMASK2 0x0168 /* Power management wakeup frame2 bytemask */ +#define BYTEMASK3 0x0170 /* Power management wakeup frame3 bytemask */ +#define BYTEMASK4 0x0178 /* Power management wakeup frame4 bytemask */ +#define PHY1 0x0180 /* PHY parameter 1 */ +#define PHY2 0x0184 /* PHY parameter 2 */ +#define TW1 0x0186 /* Twister parameter 1 */ + +/* + * Bit field definitions + */ +/* CR : Command register (uint8_t) */ +#define CR_WEPROM 0x20 /* EEPROM write enable */ 
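/*
 * Editorial note (assumption, following the BSD-style "%b" convention
 * used with cmn_err(9F)): in the *_BITS strings below, the first octal
 * character gives the radix for printing the raw value (\020 == 16,
 * i.e. hex), and each following <bit-position><name> pair -- the
 * position is 1-based from the LSB and encoded as an octal character --
 * names one bit.  For example, for a CR value of CR_RE | CR_TE (0x0c),
 *
 *	cmn_err(CE_CONT, "!cr:%b", cr, CR_BITS);
 *
 * would print "cr:c<RE,TE>".
 */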
+#define CR_SOFT_RST 0x10 /* Reset */ +#define CR_RE 0x08 /* Ethernet receive enable */ +#define CR_TE 0x04 /* Ethernet transmit enable */ +#define CR_EP3CLREN 0x02 /* clear performance counter after EP3 */ +#define CR_AUTOLOAD 0x01 /* autoload contents of 93c46 */ + +#define CR_BITS "\020\006WEPROM\005SOFT_RST\004RE\003TE\002EP3CLREN\001AUTOLOAD" + +/* TCR: Transmit Configuration register */ +#define TCR_TXRR 0xc0 /* Tx retry count */ +#define TCR_TXRR_SHIFT 6 +#define TCR_IFG 0x18 /* Interframe Gap */ +#define TCR_IFG_SHIFT 3 +#define TCR_IFG_802_3 (3 << TCR_IFG_SHIFT) /* 802.3 standard */ +#define TCR_NOCRC 0x01 /* Inhibit Appending CRC */ + +#define TCR_BITS "\020\001NOCRC" + +/* Receive Configuration register */ +#define RCR_TAIL 0x0080 /* Rx header forward to host in CRC field */ +#define RCR_AER 0x0040 /* Accept Error packet */ +#define RCR_AR 0x0020 /* Accept runt */ +#define RCR_AM 0x0010 /* Accept multicast */ +#define RCR_AB 0x0008 /* Accept broadcast */ +#define RCR_AD 0x0004 /* Accept physical match */ +#define RCR_AAM 0x0002 /* Accept all Multicast */ +#define RCR_AAP 0x0001 /* Accept all physical */ + +#define RCR_ACCEPT_MODE \ + (RCR_AER | RCR_AR | RCR_AM | RCR_AB | RCR_AD | RCR_AAM | RCR_AAP) + +#define RCR_BITS \ + "\020\010TAIL\007AER\006AR\005AM\004AB\003AD\002AAM\001AAP" + +/* Transmit Status register */ + +#define TSR_ECOL 0x20 /* excessive collision indication */ +#define TSR_LCOL 0x10 /* late collision indication */ +#define TSR_LOSS_CRS 0x08 /* loss of carrier indication */ +#define TSR_JBR 0x04 /* jabber time out indication */ +#define TSR_BUF_EMPTY 0x02 /* Tx buffer is empty */ +#define TSR_BUF_FULL 0x01 /* Tx buffer is full */ + +#define TSR_BITS \ + "\020" \ + "\006ECOL" \ + "\005LCOL" \ + "\004LOSS_CRS" \ + "\003JBR" \ + "\002BUF_EMPTY" \ + "\001BUF_FULL" + +/* Receive status register in Rx packet field */ +#define RSR_WEVENT 0x80 /* Wakeup event indication */ +#define RSR_RX_BUF_FULL 0x40 /* Receive buffer full indication */ +#define RSR_LKCHG 0x20 /* Link change indication */ +#define RSR_RUNT 0x10 /* short packet indication */ +#define RSR_LONG 0x08 /* Long packet indication */ +#define RSR_CRC 0x04 /* CRC error indication */ +#define RSR_FAE 0x02 /* Frame alignment error */ +#define RSR_ROK 0x01 /* Receive OK indication */ + +#define RSR_ERRS (RSR_RUNT | RSR_LONG | RSR_CRC | RSR_FAE) +#define RSR_BITS \ + "\020" \ + "\010WEVENT" \ + "\007RX_BUF_FULL" \ + "\006LKCHG" \ + "\005RUNT" \ + "\004LONG" \ + "\003CRC" \ + "\002FAE" \ + "\001ROK" + +/* Config 0 */ + +#define CON0_SUSLED 0x80 +#define CON0_PARM_EN 0x40 /* parameter enable */ +#define CON0_LDPS 0x08 +#define CON0_MSEL 0x04 /* media select 1:MII, 0:auto */ +#define CON0_LEDS 0x03 /* LED pattern */ + +/* Config 1 */ +#define CON0_BWF 0x40 /* Broadcast wakeup function 1:on 0:off */ +#define CON0_MWF 0x20 /* Multicast wakeup function 1:on 0:off */ +#define CON0_UWF 0x10 /* Unicast wakeup function 1:on 0:off */ +#define CON0_LONGWF1 0x02 /* */ +#define CON0_LONGWF0 0x01 /* */ + + +/* MSR : Media Status register */ +#define MSR_TXFCE 0x80 /* Tx Flow control enable */ +#define MSR_RXFCE 0x40 /* Rx Flow control enable */ +#define MSR_DUPLEX 0x10 /* full duplex */ +#define MSR_SPEED_100 0x08 /* 100Mbps mode */ +#define MSR_LINK 0x04 /* link status */ +#define MSR_TXPF 0x02 /* 8150 sends pause packet */ +#define MSR_RXPF 0x01 /* 8150 is in backoff state */ + +#define MSR_BITS \ + "\020" \ + "\010TXFCE" \ + "\007RXFCE" \ + "\005DUPLEX" \ + "\004SPEED_100" \ + "\003LINK" \ + "\002TXPF" \ + "\001RXPF" + +/* MII PHY 
Address */ +#define PHYADD_MASK 0x1f + +/* MII PHY Data */ +#define PHYCNT_OWN 0x40 /* 8150 owns:1 not owns:0 */ +#define PHYCNT_RWCR 0x20 /* write:1 read:0 */ +#define PHYCNT_PHYOFF 0x1f + +/* BMCR (almost the same as the MII_CONTROL register) */ +#define BMCR_RESET 0x8000 /* PHY reset */ +#define BMCR_Spd_Set 0x2000 /* 100Mbps */ +#define BMCR_ANE 0x1000 /* auto negotiation enable */ +#define BMCR_RSA 0x0200 /* restart auto negotiation */ +#define BMCR_duplex 0x0100 /* full duplex */ + +/* Basic mode status register */ +/* Auto-negotiation Advertisement register */ +/* Auto-negotiation Link Partner Ability register */ +/* Auto-negotiation Expansion register */ + +/* Nway test register */ +#define NWAYT_NWLPBK 0x0080 +#define NWAYT_ENNWLE 0x0008 +#define NWAYT_FLAGABD 0x0004 +#define NWAYT_FLAGPDF 0x0002 +#define NWAYT_FLAGLSC 0x0001 + +/* CS configuration register */ +#define CS_TESTFUN 0x8000 /* */ +#define CS_LD 0x0200 /* */ +#define CS_HEARTBEAT 0x0100 /* */ +#define CS_JBEN 0x0080 /* */ +#define CS_F_LINK100 0x0040 /* */ +#define CS_F_CONNECT 0x0020 /* */ +#define CS_CON_STATUS 0x0008 /* */ +#define CS_CON_STATUS_EN 0x0004 /* */ +#define CS_PASS_SCR 0x0001 /* bypass scramble function */ + +/* + * header format of rx packet + */ +#define RXHD_MULT 0x8000 /* multicast packet */ +#define RXHD_PHYS 0x4000 /* physical match packet */ +#define RXHD_RUNT 0x2000 /* too short */ +#define RXHD_VALID 0x1000 /* packet is ok */ +#define RXHD_BYTECNT 0x0fff /* rx byte count */ + +#define RXHD_BITS \ + "\020" \ + "\020MULT" \ + "\017PHYS" \ + "\016RUNT" \ + "\015VALID" +/* + * Offset to EEPROM contents + */ +#define URF_EEPROM_BASE 0x1200 +#define EPROM_EthernetID 0x0002 diff --git a/usr/src/uts/common/io/urf/urf_usbgem.c b/usr/src/uts/common/io/urf/urf_usbgem.c new file mode 100644 index 0000000000..f61c8e3502 --- /dev/null +++ b/usr/src/uts/common/io/urf/urf_usbgem.c @@ -0,0 +1,1039 @@ +/* + * urf_usbgem.c : Realtek RTL8150 USB to Fast Ethernet Driver for Solaris + * + * Copyright (c) 2003-2012 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#pragma ident "%W% %E%" + +/* + * Changelog: + */ + +/* + * TODO + */ +/* ======================================================= */ + +/* + * Solaris system header files and macros + */ + +/* minimum kernel headers for drivers */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/byteorder.h> + +/* ethernet stuff */ +#include <sys/ethernet.h> + +/* interface card depend stuff */ +#include <sys/stropts.h> +#include <sys/stream.h> +#include <sys/strlog.h> +#include <sys/usb/usba.h> +#include "usbgem.h" +#include "usbgem_mii.h" +#include "rtl8150reg.h" + +char ident[] = "rtl8150 usbnic driver v" VERSION; + +/* + * Useful macros + */ +#define ROUNDUP2(x, y) (((x)+(y)-1) & ~((y)-1)) +#define CHECK_AND_JUMP(err, label) if (err != USB_SUCCESS) goto label + +/* + * Debugging + */ +#ifdef DEBUG_LEVEL +static int urf_debug = DEBUG_LEVEL; +#define DPRINTF(n, args) if (urf_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Our configration for rtl8150 + */ +/* timeouts */ +#define ONESEC (drv_usectohz(1*1000000)) + +/* + * Local device definitions + */ +struct chip_info { + int flags; + char *name; + int type; +}; + +#define CHIPTABLESIZE (sizeof (chiptbl_8150) / sizeof (struct chip_info)) + +struct urf_dev { + /* + * Misc HW information + */ + struct chip_info *chip; + uint8_t cr; + uint8_t tsr; + uint16_t rcr; + uint8_t txok_cnt; +}; + +/* + * private functions + */ + +/* mii operations */ +static uint16_t urf_mii_read(struct usbgem_dev *, uint_t, int *errp); +static void urf_mii_write(struct usbgem_dev *, uint_t, uint16_t, int *errp); + +/* nic operations */ +static int urf_attach_chip(struct usbgem_dev *); +static int urf_reset_chip(struct usbgem_dev *); +static int urf_init_chip(struct usbgem_dev *); +static int urf_start_chip(struct usbgem_dev *); +static int urf_stop_chip(struct usbgem_dev *); +static int urf_set_media(struct usbgem_dev *); +static int urf_set_rx_filter(struct usbgem_dev *); +static int urf_get_stats(struct usbgem_dev *); + +/* packet operations */ +static mblk_t *urf_tx_make_packet(struct usbgem_dev *, mblk_t *); +static mblk_t *urf_rx_make_packet(struct usbgem_dev *, mblk_t *); + +/* =============================================================== */ +/* + * I/O functions + */ +/* =============================================================== */ +#define OUTB(dp, p, v, errp, label) \ + if ((*(errp) = usbgem_ctrl_out_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ 1, \ + /* value */ (v))) != USB_SUCCESS) goto label + +#define OUTW(dp, p, v, errp, label) \ + if ((*(errp) = usbgem_ctrl_out_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | 
USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ 2, \ + /* value */ (v))) != USB_SUCCESS) goto label + +#define OUTS(dp, p, buf, len, errp, label) \ + if ((*(errp) = usbgem_ctrl_out((dp), \ + /* bmRequestType */ USB_DEV_REQ_HOST_TO_DEV \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ (len), \ + /* value */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label + +#define IN(dp, p, vp, errp, label) \ + if ((*(errp) = usbgem_ctrl_in_val((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ sizeof ((*vp)), \ + /* valuep */ (vp))) != USB_SUCCESS) goto label + +#define INS(dp, p, buf, len, errp, label) \ + if ((*(errp) = usbgem_ctrl_in((dp), \ + /* bmRequestType */ USB_DEV_REQ_DEV_TO_HOST \ + | USB_DEV_REQ_TYPE_VENDOR | USB_DEV_REQ_RCPT_DEV, \ + /* bRequest */ USB_REQ_SET_ADDRESS, \ + /* wValue */ (p), \ + /* wIndex */ 0, \ + /* wLength */ (len), \ + /* valuep */ (buf), \ + /* size */ (len))) != USB_SUCCESS) goto label
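/*
 * Editorial note: USB_REQ_SET_ADDRESS in the macros above is not used as
 * the standard SET_ADDRESS request.  Because bmRequestType carries
 * USB_DEV_REQ_TYPE_VENDOR, the request code (0x05) is interpreted by the
 * RTL8150 as its vendor-specific register read/write command, with the
 * register offset passed in wValue; the symbolic name appears to be
 * borrowed purely for its numeric value.
 */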
+ +/* =============================================================== */ +/* + * variables + */ +/* =============================================================== */ +static int urf_ppa = 0; + +/* =============================================================== */ +/* + * Hardware manipulation + */ +/* =============================================================== */ +static int +urf_reset_chip(struct usbgem_dev *dp) +{ + int i; + int err; + uint8_t reg; + struct urf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + lp->cr = 0; + OUTB(dp, CR, lp->cr | CR_SOFT_RST, &err, usberr); + + for (i = 0; i < 100; i++) { + IN(dp, CR, &reg, &err, usberr); + if ((reg & CR_SOFT_RST) == 0) { + return (USB_SUCCESS); + } + } + /* time out */ + cmn_err(CE_WARN, "%s: failed to reset: timeout", dp->name); + return (USB_FAILURE); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +/* + * Setup rtl8150 + */ +static int +urf_init_chip(struct usbgem_dev *dp) +{ + int i; + uint32_t val; + int err; + struct urf_dev *lp = dp->private; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* ID registers: set later by urf_set_rx_filter */ + + /* Multicast registers: set later by urf_set_rx_filter */ + + /* Command register : Enable Tx and Rx before writing TCR and RCR */ + lp->cr |= CR_RE | CR_TE; + OUTB(dp, CR, lp->cr, &err, usberr); + + /* Transmit configuration register : */ + OUTB(dp, TCR, TCR_IFG_802_3, &err, usberr); + + /* Receive configuration register : disable rx filter */ + lp->rcr = RCR_TAIL | RCR_AER | RCR_AR; + OUTW(dp, RCR, lp->rcr, &err, usberr); +#ifdef notdef + /* Media status register */ + err = urf_set_media(dp); + CHECK_AND_JUMP(err, usberr); +#endif + /* Configuration register 0: no need to change */ + + DPRINTF(2, (CE_CONT, "!%s: %s: end (success)", dp->name, __func__)); + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +static int +urf_start_chip(struct usbgem_dev *dp) +{ + struct urf_dev *lp = dp->private; + + /* do nothing */ + return (USB_SUCCESS); +} + +static int +urf_stop_chip(struct usbgem_dev *dp) +{ + return (urf_reset_chip(dp)); +} + +static int +urf_get_stats(struct usbgem_dev *dp) +{ + /* do nothing */ + return (USB_SUCCESS); +} + +static uint_t +urf_mcast_hash(struct usbgem_dev *dp, const uint8_t *addr) +{ + return (usbgem_ether_crc_be(addr)); +} + +static int +urf_set_rx_filter(struct usbgem_dev *dp) +{ + int i; + uint16_t mode; + uint8_t mhash[8]; + int err; + int16_t rcr; + struct urf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called, rxmode:%x", + dp->name, __func__, dp->rxmode)); + + if (lp->rcr & (RCR_AB | RCR_AD | RCR_AAM | RCR_AAP | RCR_AM)) { +#ifdef notdef + /* disable rx filter before changing it. */ + lp->rcr &= ~(RCR_AB | RCR_AD | RCR_AAM | RCR_AAP | RCR_AM); + OUTW(dp, RCR, lp->rcr, &err, usberr); +#else + /* receive all packets while we change rx filter */ + OUTW(dp, RCR, lp->rcr | RCR_AAM | RCR_AAP, &err, usberr); +#endif + } + + mode = RCR_AB /* accept broadcast */ + | RCR_AD; /* accept physical match */ + bzero(mhash, sizeof (mhash)); + + if (dp->rxmode & RXMODE_PROMISC) { + /* promiscuous mode implies all multicast and all physical */ + mode |= RCR_AAM | RCR_AAP; + } else if ((dp->rxmode & RXMODE_ALLMULTI) || dp->mc_count > 64/2) { + /* accept all multicast packets */ + mode |= RCR_AAM; + } else if (dp->mc_count > 0) { + /* + * make hash table to select interesting + * multicast addresses only. + */ + mode |= RCR_AM; + for (i = 0; i < dp->mc_count; i++) { + uint_t h; + /* hash table is 64 = 2^6 bit width */ + h = dp->mc_list[i].hash >> (32 - 6); + mhash[h / 8] |= 1 << (h % 8); + } + } + lp->rcr |= mode; + + /* set mac address */ + OUTS(dp, IDR, dp->cur_addr.ether_addr_octet, ETHERADDRL, &err, usberr); + + /* set multicast hash table */ + if (mode & RCR_AM) { + /* need to set up multicast hash table */ + OUTS(dp, MAR, mhash, sizeof (mhash), &err, usberr); + } + + OUTW(dp, RCR, lp->rcr, &err, usberr); + +#if DEBUG_LEVEL > 2 + IN(dp, RCR, &rcr, &err, usberr); + cmn_err(CE_CONT, "!%s: %s: rcr:%b returned", + dp->name, __func__, rcr, RCR_BITS); +#endif + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +}
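/*
 * Illustrative sketch (editorial, not part of the committed driver): how
 * urf_set_rx_filter() above folds one multicast address into the 64-bit
 * hash table that is written to the MAR registers.  The top six bits of
 * the big-endian ether CRC (as returned by usbgem_ether_crc_be()) select
 * one of the 64 bits; types come from <sys/types.h>.
 */
static void
urf_example_hash_bit(uint8_t mhash[8], uint32_t crc_be)
{
	uint_t h = crc_be >> (32 - 6);	/* 6-bit bit index: 0 .. 63 */

	mhash[h / 8] |= 1 << (h % 8);	/* set bit (h % 8) of byte (h / 8) */
}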
+ +static int +urf_set_media(struct usbgem_dev *dp) +{ + uint8_t new; + uint8_t old; + int err; + struct urf_dev *lp = dp->private; + + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* select duplex: do nothing */ + + /* select speed: do nothing */ + + /* flow control */ + IN(dp, MSR, &old, &err, usberr); + + + /* setup flow control */ + new = old & ~(MSR_TXFCE | MSR_RXFCE); + switch (dp->flow_control) { + case FLOW_CONTROL_SYMMETRIC: + new |= MSR_TXFCE | MSR_RXFCE; + break; + + case FLOW_CONTROL_TX_PAUSE: + new |= MSR_TXFCE; + break; + + case FLOW_CONTROL_RX_PAUSE: + new |= MSR_RXFCE; + break; + + case FLOW_CONTROL_NONE: + default: + break; + } + + if (new != old) { + OUTB(dp, MSR, new, &err, usberr); + } + DPRINTF(2, (CE_CONT, "!%s: %s: returned", dp->name, __func__)); + return (USB_SUCCESS); + +usberr: + cmn_err(CE_NOTE, "!%s: %s: usberr detected", dp->name, __func__); + return (USB_FAILURE); +} + +/* + * send/receive packet check + */ +static mblk_t * +urf_tx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + size_t len; + mblk_t *new; + mblk_t *tp; + uint8_t *bp; + uint8_t *last_pos; + + len = msgdsize(mp); + + if (len < ETHERMIN || mp->b_cont != NULL || (len & 0x3f) == 0) { + /* + * re-allocate mp + */ + len = max(len, ETHERMIN); + + if ((len & 0x3f) == 0) { + /* workaround for buggy USB hba */ + len++; + } + + if ((new = allocb(len, 0)) == NULL) { + return (NULL); + } + + /* copy contents of the buffer */ + new->b_wptr = new->b_rptr + len; + bp = new->b_rptr; + for (tp = mp; tp; tp = tp->b_cont) { + len = tp->b_wptr - tp->b_rptr; + bcopy(tp->b_rptr, bp, len); + bp += len; + } + + last_pos = new->b_wptr; + while (bp < last_pos) { + *bp++ = 0; + } + + mp = new; + } + + return (mp); +} + +static void +urf_dump_packet(struct usbgem_dev *dp, uint8_t *bp, int n) +{ + int i; + + for (i = 0; i < n; i += 8, bp += 8) { + cmn_err(CE_CONT, "%02x %02x %02x %02x %02x %02x %02x %02x", + bp[0], bp[1], bp[2], bp[3], bp[4], bp[5], bp[6], bp[7]); + } +} + +static mblk_t * +urf_rx_make_packet(struct usbgem_dev *dp, mblk_t *mp) +{ + uint8_t *p; + uint16_t rxhd; + uint_t len; + + ASSERT(mp != NULL); + len = msgdsize(mp); +#ifdef DEBUG_LEVEL + DPRINTF(2, (CE_CONT, "!%s: time:%d %s: len:%d cont:%p", + dp->name, ddi_get_lbolt(), __func__, len, mp->b_cont)); + + if (urf_debug > 2) { + urf_dump_packet(dp, mp->b_rptr, max(6, len)); + } +#endif + if (len < ETHERMIN + ETHERFCSL) { + /* Too short */ + dp->stats.runt++; + dp->stats.errrcv++; + return (NULL); + } + + /* get Rx header which is placed at tail of the packet. */ + p = mp->b_wptr - 4; + rxhd = (p[1] << 8) | p[0]; + len = rxhd & RXHD_BYTECNT; + + DPRINTF(2, (CE_CONT, "!%s: %s: rsr:%b len:%d", + dp->name, __func__, rxhd, RXHD_BITS, len)); + + /* check if an error happened */ + if ((rxhd & (RXHD_VALID)) == 0) { + DPRINTF(-1, (CE_CONT, "!%s: %s: rxhd:%b", + dp->name, __func__, rxhd, RXHD_BITS)); + if (rxhd & RXHD_RUNT) { + dp->stats.runt++; + } + + dp->stats.errrcv++; + return (NULL); + } +#ifdef notdef + /* check packet size */ + if (len > ETHERMAX + ETHERFCSL) { + /* too long */ + dp->stats.frame_too_long++; + dp->stats.errrcv++; + return (NULL); + } else if (len < ETHERMIN + ETHERFCSL) { + dp->stats.runt++; + dp->stats.errrcv++; + return (NULL); + } +#endif + /* remove trailing crc field */ + mp->b_wptr -= ETHERFCSL; + return (mp); +}
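/*
 * Worked example (editorial): with RCR_TAIL set, the 8150 overlays its rx
 * status word on the CRC field, which is why urf_rx_make_packet() above
 * reads a 16-bit little-endian word starting four bytes before b_wptr.
 * For a valid minimum-size frame the tail might hold 40 10 xx xx, so
 * p[0] = 0x40 and p[1] = 0x10 give rxhd = 0x1040: RXHD_VALID is set and
 * RXHD_BYTECNT yields 0x040 (64 bytes including FCS).
 */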
+ +/* + * MII Interfaces + */ +static uint16_t +urf_mii_read(struct usbgem_dev *dp, uint_t index, int *errp) +{ + int reg; + uint16_t val; + + DPRINTF(4, (CE_CONT, "!%s: %s: called, ix:%d", + dp->name, __func__, index)); + + *errp = USB_SUCCESS; + + switch (index) { + case MII_CONTROL: + reg = BMCR; + break; + + case MII_STATUS: + reg = BMSR; + break; + + case MII_AN_ADVERT: + reg = ANAR; + break; + + case MII_AN_LPABLE: + reg = ANLP; + break; + + case MII_AN_EXPANSION: + reg = ANER; + break; + + default: + return (0); + } + + IN(dp, reg, &val, errp, usberr); + + if (index == MII_STATUS) { + uint8_t msr; + /* + * Fix MII status register as it doesn't have LINKUP and + * MFPRMBLSUPR bits. + */ + IN(dp, MSR, &msr, errp, usberr); + + val |= (MII_STATUS_MFPRMBLSUPR | MII_STATUS_LINKUP); + if ((msr & MSR_LINK) == 0) { + val &= ~MII_STATUS_LINKUP; + } + } + + return (val); + +usberr: + cmn_err(CE_CONT, + "!%s: %s: usberr(%d) detected", dp->name, __func__, *errp); + + return (0); +} + +static void +urf_mii_write(struct usbgem_dev *dp, uint_t index, uint16_t val, int *errp) +{ + int reg; + + DPRINTF(5, (CE_CONT, "!%s: %s called", dp->name, __func__)); + + *errp = USB_SUCCESS; + + switch (index) { + case MII_CONTROL: + reg = BMCR; + break; + + case MII_STATUS: + reg = BMSR; + break; + + case MII_AN_ADVERT: + reg = ANAR; + break; + + case MII_AN_LPABLE: + reg = ANLP; + break; + + case MII_AN_EXPANSION: + reg = ANER; + break; + + default: + return; + } + + OUTW(dp, reg, val, errp, usberr); +usberr: + ; +} + +/* ======================================================== */ +/* + * OS depend (device driver DKI) routine + */ +/* ======================================================== */ +static void +urf_eeprom_dump(struct usbgem_dev *dp, int size) +{ + int i; + int err; + uint16_t w0, w1, w2, w3; + + cmn_err(CE_CONT, "!%s: eeprom dump:", dp->name); + for (i = URF_EEPROM_BASE; i < size + URF_EEPROM_BASE; i += 8) { + IN(dp, i + 0, &w0, &err, usberr); + IN(dp, i + 2, &w1, &err, usberr); + IN(dp, i + 4, &w2, &err, usberr); + IN(dp, i + 6, &w3, &err, usberr); + cmn_err(CE_CONT, "!0x%02x: 0x%04x 0x%04x 0x%04x 0x%04x", + i - URF_EEPROM_BASE, w0, w1, w2, w3); + } +usberr: + ; +}
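/*
 * Editorial sketch of the EEPROM update in urf_attach_chip() below,
 * assuming little-endian word access: the device accepts only 16-bit
 * EEPROM writes, so the byte at offset 9 (the PAUSE setting) is rewritten
 * by reading byte 8 and writing one word back at offset 8, e.g. with
 * hypothetical values:
 *
 *	word = (new_byte9 << 8) | old_byte8;
 *	(0x04 << 8) | 0x84 == 0x0484	-- byte 8 keeps 0x84,
 *					   byte 9 becomes 0x04
 */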
+ +static int +urf_attach_chip(struct usbgem_dev *dp) +{ + int i; + uint8_t old; + uint_t new; + uint8_t reg; + int err; + struct urf_dev *lp = dp->private; + + /* + * setup flow control bit in eeprom + */ + IN(dp, URF_EEPROM_BASE + 9, &old, &err, usberr); + + DPRINTF(0, (CE_CONT, "!%s: eeprom offset 9: %02x", dp->name, old)); + + if (dp->ugc.usbgc_flow_control != FLOW_CONTROL_NONE) { + /* enable PAUSE bit */ + new = old | 0x04; + } else { + /* clear PAUSE bit */ + new = old & ~0x04; + } + if (new != old) { + /* make eeprom writable */ + OUTB(dp, CR, lp->cr | CR_WEPROM, &err, usberr); + + /* eeprom allows only word access for writing */ + IN(dp, URF_EEPROM_BASE + 8, &reg, &err, usberr); + new = (new << 8) | reg; + + OUTW(dp, URF_EEPROM_BASE + 8, new, &err, usberr); + + /* make eeprom non-writable */ + OUTB(dp, CR, lp->cr, &err, usberr); + } + + /* + * load EEPROM contents into nic + */ + OUTB(dp, CR, lp->cr | CR_AUTOLOAD, &err, usberr); + CHECK_AND_JUMP(err, usberr); + + for (i = 0; i < 100; i++) { + IN(dp, CR, &reg, &err, usberr); + if ((reg & CR_AUTOLOAD) == 0) { + goto autoload_done; + } + } + /* timeout */ + cmn_err(CE_WARN, "%s: %s: failed to autoload: timeout", + dp->name, __func__); + goto usberr; + +autoload_done: + /* + * mac address in EEPROM has been loaded into the ID registers. + */ + INS(dp, IDR, dp->dev_addr.ether_addr_octet, ETHERADDRL, &err, usberr); + + /* no need to scan phy */ + dp->mii_phy_addr = -1; + +#if DEBUG_LEVEL > 2 + urf_eeprom_dump(dp, 0x80); +#endif + +#ifdef CONFIG_VLAN + dp->misc_flag = USBGEM_VLAN; +#endif + return (USB_SUCCESS); + +usberr: + cmn_err(CE_WARN, "%s: urf_attach_chip: usb error detected", dp->name); + return (USB_FAILURE); +} + +static int +urfattach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int i; + ddi_iblock_cookie_t c; + int ret; + int unit; + struct chip_info *p; + const char *drv_name; + struct usbgem_dev *dp; + void *base; + struct usbgem_conf *ugcp; + struct urf_dev *lp; + + unit = ddi_get_instance(dip); + drv_name = ddi_driver_name(dip); + + DPRINTF(3, (CE_CONT, "!%s%d: %s: called, cmd:%d", + drv_name, unit, __func__, cmd)); + + if (cmd == DDI_ATTACH) { + /* + * Check if the chip is supported. + */ + + /* + * Check that the chip really is a realtek rtl8150 + */ + + /* + * construct usbgem configuration + */ + ugcp = kmem_zalloc(sizeof (*ugcp), KM_SLEEP); + + /* name */ + sprintf(ugcp->usbgc_name, + "%s%d(ppa=%d)", drv_name, unit, urf_ppa); +#ifdef USBGEM_CONFIG_GLDv3 + ugcp->usbgc_ppa = urf_ppa; +#else + ugcp->usbgc_ppa = unit; +#endif + ugcp->usbgc_ifnum = 0; + ugcp->usbgc_alt = 0; + + ugcp->usbgc_tx_list_max = 16; + + /* the rx status partially replaces FCS */ + ugcp->usbgc_rx_header_len = 0; + ugcp->usbgc_rx_list_max = 64; + + /* time out parameters */ + ugcp->usbgc_tx_timeout = USBGEM_TX_TIMEOUT; + ugcp->usbgc_tx_timeout_interval = ONESEC; + + /* flow control */ + ugcp->usbgc_flow_control = FLOW_CONTROL_RX_PAUSE; + + /* MII timeout parameters */ + ugcp->usbgc_mii_link_watch_interval = ONESEC; + ugcp->usbgc_mii_an_watch_interval = ONESEC/5; + ugcp->usbgc_mii_reset_timeout = MII_RESET_TIMEOUT; /* 1 sec */ + ugcp->usbgc_mii_an_timeout = MII_AN_TIMEOUT; /* 5 sec */ + ugcp->usbgc_mii_an_wait = (25*ONESEC)/10; + ugcp->usbgc_mii_linkdown_timeout = MII_LINKDOWN_TIMEOUT; + + ugcp->usbgc_mii_an_delay = ONESEC/10; + ugcp->usbgc_mii_linkdown_action = MII_ACTION_RSA; + ugcp->usbgc_mii_linkdown_timeout_action = MII_ACTION_RESET; + ugcp->usbgc_mii_dont_reset = B_FALSE; + + /* I/O methods */ + + /* mac operation */ + ugcp->usbgc_attach_chip = &urf_attach_chip; + ugcp->usbgc_reset_chip = &urf_reset_chip; + ugcp->usbgc_init_chip = &urf_init_chip; + ugcp->usbgc_start_chip = &urf_start_chip; + ugcp->usbgc_stop_chip = &urf_stop_chip; + ugcp->usbgc_multicast_hash = &urf_mcast_hash; + + ugcp->usbgc_set_rx_filter = &urf_set_rx_filter; + ugcp->usbgc_set_media = &urf_set_media; + ugcp->usbgc_get_stats = &urf_get_stats; +#ifdef notdef + ugcp->usbgc_interrupt = &urf_interrupt; +#else + ugcp->usbgc_interrupt = NULL; +#endif + /* packet operation */ + ugcp->usbgc_tx_make_packet = &urf_tx_make_packet; + ugcp->usbgc_rx_make_packet = &urf_rx_make_packet; + + /* mii operations */ + ugcp->usbgc_mii_probe = &usbgem_mii_probe_default; + ugcp->usbgc_mii_init = &usbgem_mii_init_default; + ugcp->usbgc_mii_config = &usbgem_mii_config_default; + ugcp->usbgc_mii_read = &urf_mii_read; + ugcp->usbgc_mii_write = &urf_mii_write; + + /* mtu */ + ugcp->usbgc_min_mtu = ETHERMTU; + ugcp->usbgc_max_mtu = ETHERMTU; + ugcp->usbgc_default_mtu = ETHERMTU; + + lp = kmem_zalloc(sizeof (struct urf_dev), KM_SLEEP); + lp->chip = NULL; /* chip identification table is not implemented yet */ + + ddi_set_driver_private(dip, NULL); + + dp = usbgem_do_attach(dip, ugcp, lp, sizeof (struct urf_dev)); + + kmem_free(ugcp, sizeof (*ugcp)); + + if (dp != NULL) { + urf_ppa++; + return (DDI_SUCCESS); + } + +err_free_mem: + kmem_free(lp, sizeof 
(struct urf_dev)); +err_close_pipe: +err: + return (DDI_FAILURE); + } + if (cmd == DDI_RESUME) { + return (usbgem_resume(dip)); + } + return (DDI_FAILURE); +} + +static int +urfdetach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + int ret; + + if (cmd == DDI_DETACH) { + ret = usbgem_do_detach(dip); + if (ret != DDI_SUCCESS) { + return (DDI_FAILURE); + } + urf_ppa--; + return (DDI_SUCCESS); + } + if (cmd == DDI_SUSPEND) { + return (usbgem_suspend(dip)); + } + return (DDI_FAILURE); +} + +/* ======================================================== */ +/* + * OS depend (loadable streams driver) routine + */ +/* ======================================================== */ +#ifdef USBGEM_CONFIG_GLDv3 +USBGEM_STREAM_OPS(urf_ops, urfattach, urfdetach); +#else +static struct module_info urfminfo = { + 0, /* mi_idnum */ + "urf", /* mi_idname */ + 0, /* mi_minpsz */ + ETHERMTU, /* mi_maxpsz */ + ETHERMTU*128, /* mi_hiwat */ + 1, /* mi_lowat */ +}; + +static struct qinit urfrinit = { + (int (*)()) NULL, /* qi_putp */ + usbgem_rsrv, /* qi_srvp */ + usbgem_open, /* qi_qopen */ + usbgem_close, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &urfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct qinit urfwinit = { + usbgem_wput, /* qi_putp */ + usbgem_wsrv, /* qi_srvp */ + (int (*)()) NULL, /* qi_qopen */ + (int (*)()) NULL, /* qi_qclose */ + (int (*)()) NULL, /* qi_qadmin */ + &urfminfo, /* qi_minfo */ + NULL /* qi_mstat */ +}; + +static struct streamtab urf_info = { + &urfrinit, /* st_rdinit */ + &urfwinit, /* st_wrinit */ + NULL, /* st_muxrinit */ + NULL /* st_muxwrinit */ +}; + +static struct cb_ops cb_urf_ops = { + nulldev, /* cb_open */ + nulldev, /* cb_close */ + nodev, /* cb_strategy */ + nodev, /* cb_print */ + nodev, /* cb_dump */ + nodev, /* cb_read */ + nodev, /* cb_write */ + nodev, /* cb_ioctl */ + nodev, /* cb_devmap */ + nodev, /* cb_mmap */ + nodev, /* cb_segmap */ + nochpoll, /* cb_chpoll */ + ddi_prop_op, /* cb_prop_op */ + &urf_info, /* cb_stream */ + D_NEW|D_MP /* cb_flag */ +}; + +static struct dev_ops urf_ops = { + DEVO_REV, /* devo_rev */ + 0, /* devo_refcnt */ + usbgem_getinfo, /* devo_getinfo */ + nulldev, /* devo_identify */ + nulldev, /* devo_probe */ + urfattach, /* devo_attach */ + urfdetach, /* devo_detach */ + nodev, /* devo_reset */ + &cb_urf_ops, /* devo_cb_ops */ + NULL, /* devo_bus_ops */ + usbgem_power, /* devo_power */ +#if DEVO_REV >= 4 + usbgem_quiesce, /* devo_quiesce */ +#endif + +}; +#endif + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module. 
This one is a driver */ + ident, + &urf_ops, /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modldrv, NULL +}; + +/* ======================================================== */ +/* + * _init : done + */ +/* ======================================================== */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!urf: _init: called")); + + status = usbgem_mod_init(&urf_ops, "urf"); + if (status != DDI_SUCCESS) { + return (status); + } + status = mod_install(&modlinkage); + if (status != DDI_SUCCESS) { + usbgem_mod_fini(&urf_ops); + } + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!urf: _fini: called")); + status = mod_remove(&modlinkage); + if (status == DDI_SUCCESS) { + usbgem_mod_fini(&urf_ops); + } + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/usb/usba/usba_ugen.c b/usr/src/uts/common/io/usb/usba/usba_ugen.c index cb20c24270..5852e40799 100644 --- a/usr/src/uts/common/io/usb/usba/usba_ugen.c +++ b/usr/src/uts/common/io/usb/usba/usba_ugen.c @@ -23,6 +23,10 @@ */ /* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +/* * UGEN: USB Generic Driver support code * * This code provides entry points called by the ugen driver or other @@ -1082,7 +1086,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, ((epp->ep_state & UGEN_EP_STATE_INTR_IN_POLLING_ON) == 0)) { *reventsp |= POLLIN; - } else if (!anyyet) { + } + + if ((!*reventsp && !anyyet) || + (events & POLLET)) { *phpp = &epp->ep_pollhead; epp->ep_state |= UGEN_EP_STATE_INTR_IN_POLL_PENDING; @@ -1101,7 +1108,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, ((epp->ep_state & UGEN_EP_STATE_ISOC_IN_POLLING_ON) == 0)) { *reventsp |= POLLIN; - } else if (!anyyet) { + } + + if ((!*reventsp && !anyyet) || + (events & POLLET)) { *phpp = &epp->ep_pollhead; epp->ep_state |= UGEN_EP_STATE_ISOC_IN_POLL_PENDING; @@ -1115,9 +1125,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, break; case UGEN_MINOR_DEV_STAT_NODE: - if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) { + if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) *reventsp |= POLLIN; - } else if (!anyyet) { + + if ((!*reventsp && !anyyet) || (events & POLLET)) { *phpp = &ugenp->ug_ds.dev_pollhead; ugenp->ug_ds.dev_stat |= UGEN_DEV_STATUS_POLL_PENDING; @@ -1131,9 +1142,10 @@ usb_ugen_poll(usb_ugen_hdl_t usb_ugen_hdl, dev_t dev, short events, break; } } else { - if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) { + if (ugenp->ug_ds.dev_stat & UGEN_DEV_STATUS_CHANGED) *reventsp |= POLLHUP|POLLIN; - } else if (!anyyet) { + + if ((!*reventsp && !anyyet) || (events & POLLET)) { *phpp = &ugenp->ug_ds.dev_pollhead; ugenp->ug_ds.dev_stat |= UGEN_DEV_STATUS_POLL_PENDING; diff --git a/usr/src/uts/common/io/usbgem/usbgem.c b/usr/src/uts/common/io/usbgem/usbgem.c new file mode 100644 index 0000000000..a42f7119ef --- /dev/null +++ b/usr/src/uts/common/io/usbgem/usbgem.c @@ -0,0 +1,6389 @@ +/* + * usbgem.c: General USB to Fast Ethernet mac driver framework + * + * Copyright (c) 2002-2012 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#pragma ident "@(#)usbgem.c 1.6 12/02/09" + +/* + * Change log + */ + +/* + * TODO: + * implement DELAYED_START + */ + +/* + * System Header files. + */ +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/vtrace.h> +#include <sys/ethernet.h> +#include <sys/modctl.h> +#include <sys/errno.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#ifndef USBGEM_CONFIG_GLDv3 +#include <sys/dlpi.h> +#include <sys/strsubr.h> +#endif +#include <sys/stream.h> /* required for MBLK* */ +#include <sys/strsun.h> /* required for mionack() */ +#include <sys/byteorder.h> + +#include <sys/usb/usba.h> +#ifdef USBGEM_CONFIG_GLDv3 +#include <inet/common.h> +#include <inet/led.h> +#include <inet/mi.h> +#include <inet/nd.h> +#endif + +/* supplement definitions */ +extern const char *usb_str_cr(usb_cr_t); + +#ifndef USBGEM_CONFIG_GLDv3 +#pragma weak gld_linkstate +#endif +#include <sys/note.h> + +#include "usbgem_mii.h" +#include "usbgem.h" + +#ifdef MODULE +char ident[] = "usb general ethernet mac driver v" VERSION; +#else +extern char ident[]; +#endif + +/* Debugging support */ +#ifdef USBGEM_DEBUG_LEVEL +static int usbgem_debug = USBGEM_DEBUG_LEVEL; +#define DPRINTF(n, args) if (usbgem_debug > (n)) cmn_err args +#else +#define DPRINTF(n, args) +#endif + +/* + * Useful macros and typedefs + */ +#define ROUNDUP(x, a) (((x) + (a) - 1) & ~((a) - 1)) +#define DEFAULT_PIPE(dp) ((dp)->reg_data->dev_default_ph) +#define VTAG_SIZE 4 +#define BOOLEAN(x) ((x) != 0) +/* + * configuration parameters + */ +#define USBDRV_MAJOR_VER 2 +#define USBDRV_MINOR_VER 0 + +#define ETHERHEADERL (sizeof (struct ether_header)) +#define MAXPKTLEN(dp) ((dp)->mtu + ETHERHEADERL) +#define MAXPKTBUF(dp) ((dp)->mtu + ETHERHEADERL + ETHERFCSL) + +#define WATCH_INTERVAL_FAST drv_usectohz(100*1000) + +#define STOP_GRACEFUL B_TRUE + +/* + * Private functions + */ +static int usbgem_open_pipes(struct usbgem_dev *dp); +static int usbgem_close_pipes(struct usbgem_dev *dp); +static void usbgem_intr_cb(usb_pipe_handle_t, usb_intr_req_t *); +static void usbgem_bulkin_cb(usb_pipe_handle_t, usb_bulk_req_t *); +static void usbgem_bulkout_cb(usb_pipe_handle_t, usb_bulk_req_t *); + +static int usbgem_mii_start(struct usbgem_dev *); +static void usbgem_mii_stop(struct usbgem_dev *); + +/* local 
buffer management */ +static int usbgem_init_rx_buf(struct usbgem_dev *); + +/* internal mac interfaces */ +static void usbgem_tx_timeout(struct usbgem_dev *); +static void usbgem_mii_link_watcher(struct usbgem_dev *); +static int usbgem_mac_init(struct usbgem_dev *); +static int usbgem_mac_start(struct usbgem_dev *); +static int usbgem_mac_stop(struct usbgem_dev *, int, boolean_t); +static void usbgem_mac_ioctl(struct usbgem_dev *, queue_t *, mblk_t *); + +int usbgem_speed_value[] = {10, 100, 1000}; + +static int usbgem_ctrl_retry = 5; + +/* usb event support */ +static int usbgem_disconnect_cb(dev_info_t *dip); +static int usbgem_reconnect_cb(dev_info_t *dip); +int usbgem_suspend(dev_info_t *dip); +int usbgem_resume(dev_info_t *dip); + +static uint8_t usbgem_bcastaddr[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +#ifdef MODULE +extern struct mod_ops mod_miscops; + +static struct modlmisc modlmisc = { + &mod_miscops, + "usbgem v" VERSION, +}; + +static struct modlinkage modlinkage = { + MODREV_1, &modlmisc, NULL +}; + +/* + * _init : done + */ +int +_init(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!usbgem: _init: called")); + status = mod_install(&modlinkage); + + return (status); +} + +/* + * _fini : done + */ +int +_fini(void) +{ + int status; + + DPRINTF(2, (CE_CONT, "!usbgem: _fini: called")); + status = mod_remove(&modlinkage); + return (status); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} +#endif /* MODULE */ + +/* ============================================================== */ +/* + * Ether CRC calculation utilities + */ +/* ============================================================== */ +/* + * Ether CRC calculation according to 21143 data sheet + */ +#define CRC32_POLY_LE 0xedb88320 +uint32_t +usbgem_ether_crc_le(const uint8_t *addr) +{ + int idx; + int bit; + uint_t data; + uint32_t crc = 0xffffffff; + + crc = 0xffffffff; + for (idx = 0; idx < ETHERADDRL; idx++) { + for (data = *addr++, bit = 0; bit < 8; bit++, data >>= 1) { + crc = (crc >> 1) ^ + (((crc ^ data) & 1) ? CRC32_POLY_LE : 0); + } + } + return (crc); +} + +#define CRC32_POLY_BE 0x04c11db7 +uint32_t +usbgem_ether_crc_be(const uint8_t *addr) +{ + int idx; + int bit; + uint_t data; + uint32_t crc; + + crc = 0xffffffff; + for (idx = 0; idx < ETHERADDRL; idx++) { + for (data = *addr++, bit = 0; bit < 8; bit++, data >>= 1) { + crc = (crc << 1) ^ + ((((crc >> 31) ^ data) & 1) ? CRC32_POLY_BE : 0); + } + } + return (crc); +} + +int +usbgem_prop_get_int(struct usbgem_dev *dp, char *prop_template, int def_val) +{ + char propname[32]; + + (void) sprintf(propname, prop_template, dp->name); + + return (ddi_prop_get_int(DDI_DEV_T_ANY, dp->dip, + DDI_PROP_DONTPASS, propname, def_val)); +} + +static int +usbgem_population(uint32_t x) +{ + int i; + int cnt; + + cnt = 0; + for (i = 0; i < 32; i++) { + if (x & (1 << i)) { + cnt++; + } + } + return (cnt); +} + +static clock_t +usbgem_timestamp_nz() +{ + clock_t now; + now = ddi_get_lbolt(); + return (now ? 
now : (clock_t)1); +} + +#ifdef USBGEM_DEBUG_LEVEL +#ifdef USBGEM_DEBUG_VLAN +#ifdef notdef +#include <netinet/in.h> +#endif +static void +usbgem_dump_packet(struct usbgem_dev *dp, char *title, mblk_t *mp, + boolean_t check_cksum) +{ + char msg[180]; + uint8_t buf[18+20+20]; + uint8_t *p; + size_t offset; + uint_t ethertype; + uint_t proto; + uint_t ipproto = 0; + uint_t iplen; + uint_t iphlen; + uint_t tcplen; + uint_t udplen; + uint_t cksum; + int rest; + int len; + char *bp; + mblk_t *tp; + extern uint_t ip_cksum(mblk_t *, int, uint32_t); + + msg[0] = 0; + bp = msg; + + rest = sizeof (buf); + offset = 0; + for (tp = mp; tp; tp = tp->b_cont) { + len = tp->b_wptr - tp->b_rptr; + len = min(rest, len); + bcopy(tp->b_rptr, &buf[offset], len); + rest -= len; + offset += len; + if (rest == 0) { + break; + } + } + + offset = 0; + p = &buf[offset]; + + /* ethernet address */ + sprintf(bp, + "ether: %02x:%02x:%02x:%02x:%02x:%02x" + " -> %02x:%02x:%02x:%02x:%02x:%02x", + p[6], p[7], p[8], p[9], p[10], p[11], + p[0], p[1], p[2], p[3], p[4], p[5]); + bp = &msg[strlen(msg)]; + + /* vlan tag and ethertype */ + ethertype = GET_ETHERTYPE(p); + if (ethertype == VTAG_TPID) { + sprintf(bp, " vtag:0x%04x", GET_NET16(&p[14])); + bp = &msg[strlen(msg)]; + + offset += VTAG_SIZE; + p = &buf[offset]; + ethertype = GET_ETHERTYPE(p); + } + sprintf(bp, " type:%04x", ethertype); + bp = &msg[strlen(msg)]; + + /* ethernet packet length */ + sprintf(bp, " mblklen:%d", msgdsize(mp)); + bp = &msg[strlen(msg)]; + if (mp->b_cont) { + sprintf(bp, "("); + bp = &msg[strlen(msg)]; + for (tp = mp; tp; tp = tp->b_cont) { + if (tp == mp) { + sprintf(bp, "%d", tp->b_wptr - tp->b_rptr); + } else { + sprintf(bp, "+%d", tp->b_wptr - tp->b_rptr); + } + bp = &msg[strlen(msg)]; + } + sprintf(bp, ")"); + bp = &msg[strlen(msg)]; + } + + if (ethertype != ETHERTYPE_IP) { + goto x; + } + + /* ip address */ + offset += sizeof (struct ether_header); + p = &buf[offset]; + ipproto = p[9]; + iplen = GET_NET16(&p[2]); + sprintf(bp, ", ip: %d.%d.%d.%d -> %d.%d.%d.%d proto:%d iplen:%d", + p[12], p[13], p[14], p[15], + p[16], p[17], p[18], p[19], + ipproto, iplen); + bp = (void *)&msg[strlen(msg)]; + + iphlen = (p[0] & 0xf) * 4; + + /* cksum for pseudo header */ + cksum = *(uint16_t *)&p[12]; + cksum += *(uint16_t *)&p[14]; + cksum += *(uint16_t *)&p[16]; + cksum += *(uint16_t *)&p[18]; + cksum += BE_16(ipproto); + + /* tcp or udp protocol header */ + offset += iphlen; + p = &buf[offset]; + if (ipproto == IPPROTO_TCP) { + tcplen = iplen - iphlen; + sprintf(bp, ", tcp: len:%d cksum:%x", + tcplen, GET_NET16(&p[16])); + bp = (void *)&msg[strlen(msg)]; + + if (check_cksum) { + cksum += BE_16(tcplen); + cksum = (uint16_t)ip_cksum(mp, offset, cksum); + sprintf(bp, " (%s)", + (cksum == 0 || cksum == 0xffff) ? "ok" : "ng"); + bp = (void *)&msg[strlen(msg)]; + } + } else if (ipproto == IPPROTO_UDP) { + udplen = GET_NET16(&p[4]); + sprintf(bp, ", udp: len:%d cksum:%x", + udplen, GET_NET16(&p[6])); + bp = (void *)&msg[strlen(msg)]; + + if (GET_NET16(&p[6]) && check_cksum) { + cksum += *(uint16_t *)&p[4]; + cksum = (uint16_t)ip_cksum(mp, offset, cksum); + sprintf(bp, " (%s)", + (cksum == 0 || cksum == 0xffff) ? 
"ok" : "ng"); + bp = (void *)&msg[strlen(msg)]; + } + } +x: + cmn_err(CE_CONT, "!%s: %s: %s", dp->name, title, msg); +} +#endif /* USBGEM_DEBUG_VLAN */ +#endif /* USBGEM_DEBUG_LEVEL */ + +#ifdef GEM_GCC_RUNTIME +/* + * gcc3 runtime routines + */ +#pragma weak memcmp +int +memcmp(const void *s1, const void *s2, size_t n) +{ + int i; + int ret; + + ret = 0; + for (i = 0; i < n; i++) { + ret = (int)((uint8_t *)s1)[i] - (int)((uint8_t *)s2)[i]; + if (ret) { + return (ret); + } + } + return (0); +} + +#pragma weak memset +void * +memset(void *s, int c, size_t n) +{ + if ((c & 0xff) == 0) { + bzero(s, n); + } else { + while (n--) { + ((uint8_t *)s)[n] = c; + } + } + return (s); +} + +#pragma weak _memcpy = memcpy +#pragma weak memcpy +void * +memcpy(void *s1, const void *s2, size_t n) +{ + bcopy(s2, s1, n); + return (s1); +} +#endif /* GEM_GCC_RUNTIME */ +/* ============================================================== */ +/* + * hardware operations + */ +/* ============================================================== */ +static int +usbgem_hal_reset_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_reset_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_init_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_init_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_attach_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_attach_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_set_rx_filter(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_set_rx_filter)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_set_media(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_set_media)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_start_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_start_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_stop_chip(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_stop_chip)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + +static int +usbgem_hal_get_stats(struct usbgem_dev *dp) +{ + int err; + + sema_p(&dp->hal_op_lock); + err = (*dp->ugc.usbgc_get_stats)(dp); + sema_v(&dp->hal_op_lock); + return (err); +} + + +/* ============================================================== */ +/* + * USB pipe management + */ +/* ============================================================== */ +static boolean_t +usbgem_rx_start_unit(struct usbgem_dev *dp, usb_bulk_req_t *req) +{ + mblk_t *mp; + int err; + usb_flags_t flags; + + ASSERT(req); + + mp = allocb(dp->rx_buf_len, BPRI_MED); + if (mp == NULL) { + cmn_err(CE_WARN, "!%s: %s: failed to allocate mblk", + dp->name, __func__); + goto err; + } + + req->bulk_len = dp->rx_buf_len; + req->bulk_data = mp; + req->bulk_client_private = (usb_opaque_t)dp; + req->bulk_timeout = 0; + req->bulk_attributes = USB_ATTRS_SHORT_XFER_OK; + req->bulk_cb = usbgem_bulkin_cb; + req->bulk_exc_cb = usbgem_bulkin_cb; + req->bulk_completion_reason = 0; + req->bulk_cb_flags = 0; + + flags = 0; + err = usb_pipe_bulk_xfer(dp->bulkin_pipe, req, flags); + + if (err != USB_SUCCESS) { + cmn_err(CE_WARN, "%s: failed to bulk_xfer for rx, err:%d", + dp->name, err); + + 
/* free req and mp */ + usb_free_bulk_req(req); + goto err; + } + return (B_TRUE); +err: + return (B_FALSE); +} + +/* ============================================================== */ +/* + * Rx/Tx buffer management + */ +/* ============================================================== */ +static int +usbgem_init_rx_buf(struct usbgem_dev *dp) +{ + int i; + usb_bulk_req_t *req; + + ASSERT(dp->mac_state == MAC_STATE_ONLINE); + + for (i = 0; i < dp->ugc.usbgc_rx_list_max; i++) { + req = usb_alloc_bulk_req(dp->dip, 0, USB_FLAGS_SLEEP); + if (req == NULL) { + cmn_err(CE_WARN, + "!%s: %s: failed to allocate bulkreq for rx", + dp->name, __func__); + return (USB_FAILURE); + } + if (!usbgem_rx_start_unit(dp, req)) { + return (USB_FAILURE); + } + mutex_enter(&dp->rxlock); + dp->rx_busy_cnt++; + mutex_exit(&dp->rxlock); + } + return (USB_SUCCESS); +} + +/* ============================================================== */ +/* + * memory resource management + */ +/* ============================================================== */ +static int +usbgem_free_memory(struct usbgem_dev *dp) +{ + usb_bulk_req_t *req; + + /* free all tx request structures */ + while ((req = dp->tx_free_list) != NULL) { + dp->tx_free_list = + (usb_bulk_req_t *)req->bulk_client_private; + req->bulk_data = NULL; + usb_free_bulk_req(req); + } + return (USB_SUCCESS); +} + +static int +usbgem_alloc_memory(struct usbgem_dev *dp) +{ + int i; + usb_bulk_req_t *req; + + /* allocate tx requests */ + dp->tx_free_list = NULL; + for (i = 0; i < dp->ugc.usbgc_tx_list_max; i++) { + req = usb_alloc_bulk_req(dp->dip, 0, USB_FLAGS_SLEEP); + if (req == NULL) { + cmn_err(CE_WARN, + "%s:%s failed to allocate tx requests", + dp->name, __func__); + + /* free partially allocated tx requests */ + (void) usbgem_free_memory(dp); + return (USB_FAILURE); + } + + /* add the new one allocated into tx free list */ + req->bulk_client_private = (usb_opaque_t)dp->tx_free_list; + dp->tx_free_list = req; + } + + return (USB_SUCCESS); +} + +/* ========================================================== */ +/* + * Start transmission. + * Returns NULL when the packet has been consumed; + * otherwise the untransmitted mblk is returned. + */ +/* ========================================================== */ + +#ifdef TXTIMEOUT_TEST +static int usbgem_send_cnt = 0; +#endif + +/* + * usbgem_send is used only to send data packets onto the ethernet line. + */ +static mblk_t * +usbgem_send_common(struct usbgem_dev *dp, mblk_t *mp, uint32_t flags) +{ + int err; + mblk_t *new; + usb_bulk_req_t *req; + int mcast; + int bcast; + int len; + boolean_t intr; + usb_flags_t usb_flags = 0; +#ifdef USBGEM_DEBUG_LEVEL + usb_pipe_state_t p_state; +#endif + DPRINTF(2, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + intr = (flags & 1) != 0; + len = msgdsize(mp); + bcast = 0; + mcast = 0; + if (mp->b_rptr[0] & 1) { + if (bcmp(mp->b_rptr, &usbgem_bcastaddr, ETHERADDRL) == 0) { + bcast = 1; + } else { + mcast = 1; + } + } + new = (*dp->ugc.usbgc_tx_make_packet)(dp, mp); + if (new == NULL) { + /* + * no memory resource. we don't stop downstream, + * we just discard the packet. 
+ */ + DPRINTF(0, (CE_CONT, "!%s: %s: no memory", + dp->name, __func__)); + freemsg(mp); + + mutex_enter(&dp->txlock); + dp->stats.noxmtbuf++; + dp->stats.errxmt++; + mutex_exit(&dp->txlock); + + return (NULL); + } + + ASSERT(new->b_cont == NULL); + + mutex_enter(&dp->txlock); + if (dp->tx_free_list == NULL) { + /* + * no tx free slot + */ + ASSERT(dp->tx_busy_cnt == dp->ugc.usbgc_tx_list_max); + mutex_exit(&dp->txlock); + + DPRINTF(4, (CE_CONT, "!%s: %s: no free slot", + dp->name, __func__)); + if (new && new != mp) { + /* free reallocated message */ + freemsg(new); + } + return (mp); + } + req = dp->tx_free_list; + dp->tx_free_list = (usb_bulk_req_t *)req->bulk_client_private; + dp->tx_busy_cnt++; + + if (dp->tx_free_list == NULL) { + intr = B_TRUE; + } + if (intr) { + dp->tx_intr_pended++; + } + DB_TCI(new) = intr; +#ifdef USBGEM_DEBUG_LEVEL + new->b_datap->db_cksum32 = dp->tx_seq_num; + dp->tx_seq_num++; +#endif + dp->stats.obytes += len; + dp->stats.opackets++; + if (bcast | mcast) { + dp->stats.obcast += bcast; + dp->stats.omcast += mcast; + } + mutex_exit(&dp->txlock); + + DPRINTF(2, (CE_CONT, "!%s: %s: sending", dp->name, __func__)); + + req->bulk_len = (long)new->b_wptr - (long)new->b_rptr; + req->bulk_data = new; + req->bulk_client_private = (usb_opaque_t)dp; + req->bulk_timeout = dp->bulkout_timeout; /* in second */ + req->bulk_attributes = 0; + req->bulk_cb = usbgem_bulkout_cb; + req->bulk_exc_cb = usbgem_bulkout_cb; + req->bulk_completion_reason = 0; + req->bulk_cb_flags = 0; + + if (intr) { + usb_flags = USB_FLAGS_SLEEP; + } + if ((err = usb_pipe_bulk_xfer(dp->bulkout_pipe, req, usb_flags)) + != USB_SUCCESS) { + + /* failed to transfer the packet, discard it. */ + freemsg(new); + req->bulk_data = NULL; + + /* recycle the request block */ + mutex_enter(&dp->txlock); + dp->tx_busy_cnt--; + req->bulk_client_private = (usb_opaque_t)dp->tx_free_list; + dp->tx_free_list = req; + mutex_exit(&dp->txlock); + + cmn_err(CE_NOTE, + "%s: %s: usb_pipe_bulk_xfer: failed: err:%d", + dp->name, __func__, err); + + /* we use another flag to indicate error state. 
*/ + if (dp->fatal_error == (clock_t)0) { + dp->fatal_error = usbgem_timestamp_nz(); + } + } else { + /* record the start time */ + dp->tx_start_time = ddi_get_lbolt(); + } + + if (err == USB_SUCCESS && (usb_flags & USB_FLAGS_SLEEP)) { + usbgem_bulkout_cb(dp->bulkout_pipe, req); + } + + if (new != mp) { + freemsg(mp); + } + return (NULL); +} + +int +usbgem_restart_nic(struct usbgem_dev *dp) +{ + int ret; + int flags = 0; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + ASSERT(dp->mac_state != MAC_STATE_DISCONNECTED); + + /* + * ensure the nic is stopped + */ + if (dp->mac_state == MAC_STATE_ONLINE) { + (void) usbgem_mac_stop(dp, MAC_STATE_STOPPED, STOP_GRACEFUL); + } + + /* now that the nic has become quiescent, reset the chip */ + if (usbgem_hal_reset_chip(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, "%s: %s: failed to reset chip", + dp->name, __func__); + goto err; + } + + /* + * restore the nic state step by step + */ + if (dp->nic_state < NIC_STATE_INITIALIZED) { + goto done; + } + + if (usbgem_mac_init(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, "%s: %s: failed to initialize chip", + dp->name, __func__); + goto err; + } + + /* setup mac address and enable rx filter */ + sema_p(&dp->rxfilter_lock); + dp->rxmode |= RXMODE_ENABLE; + ret = usbgem_hal_set_rx_filter(dp); + sema_v(&dp->rxfilter_lock); + if (ret != USB_SUCCESS) { + goto err; + } + + /* + * update the link state asynchronously + */ + cv_signal(&dp->link_watcher_wait_cv); + + /* + * XXX - a panic happened because of linkdown. + * We must check mii_state here, because the link can be down just + * before the restart event happens. If the link is down now, + * gem_mac_start() will be called from gem_mii_link_check() when + * the link comes up later. + */ + if (dp->mii_state == MII_STATE_LINKUP) { + if (usbgem_hal_set_media(dp) != USB_SUCCESS) { + goto err; + } + if (dp->nic_state < NIC_STATE_ONLINE) { + goto done; + } + + (void) usbgem_mac_start(dp); + + } +done: + return (USB_SUCCESS); +err: +#ifdef GEM_CONFIG_FMA + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); +#endif + return (USB_FAILURE); +}
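/*
 * Editorial note on usbgem_tx_timeout() below: the watchdog takes
 * dev_state_lock as RW_READER for the cheap periodic check and, only when
 * a restart looks necessary, drops the lock and re-enters as RW_WRITER
 * before re-evaluating the whole condition.  The re-check is what makes
 * this drop-and-reacquire upgrade safe, since the device state may change
 * in the window where no lock is held.  (rw_tryupgrade(9F) exists, but it
 * may fail and would still require a retry path.)
 */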
+			    now - dp->tx_start_time : 0);
+
+			(void) usbgem_restart_nic(dp);
+		}
+
+		rw_exit(&dp->dev_state_lock);
+	}
+}
+
+static int
+usbgem_tx_watcher_start(struct usbgem_dev *dp)
+{
+	int err;
+	kthread_t *wdth;
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* prepare the first invocation of usbgem_tx_timeout() */
+	dp->tx_watcher_stop = 0;
+	dp->tx_watcher_interval = drv_usectohz(1000*1000);
+
+	wdth = thread_create(NULL, 0, usbgem_tx_timeout, dp, 0, &p0,
+	    TS_RUN, minclsyspri);
+	if (wdth == NULL) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to create a tx_watcher thread",
+		    dp->name, __func__);
+		return (USB_FAILURE);
+	}
+	dp->tx_watcher_did = wdth->t_did;
+
+	return (USB_SUCCESS);
+}
+
+static void
+usbgem_tx_watcher_stop(struct usbgem_dev *dp)
+{
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+	if (dp->tx_watcher_did) {
+		/* ensure the watcher thread has stopped */
+		dp->tx_watcher_stop = 1;
+		cv_signal(&dp->tx_watcher_cv);
+		thread_join(dp->tx_watcher_did);
+		dp->tx_watcher_did = NULL;
+	}
+}
+
+/* ================================================================== */
+/*
+ * Callback handlers
+ */
+/* ================================================================== */
+static void
+usbgem_bulkin_cb(usb_pipe_handle_t pipe, usb_bulk_req_t *req)
+{
+	mblk_t *newmp;
+	mblk_t *mp;
+	mblk_t *tp;
+	int len = 0;
+	int pkts = 0;
+	int bcast = 0;
+	int mcast = 0;
+	boolean_t busy;
+	struct usbgem_dev *dp;
+
+	dp = (struct usbgem_dev *)req->bulk_client_private;
+	mp = req->bulk_data;
+	req->bulk_data = NULL;
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: mp:%p, cr:%s(%d)",
+	    dp->name, __func__, mp,
+	    usb_str_cr(req->bulk_completion_reason),
+	    req->bulk_completion_reason));
+
+	/*
+	 * we cannot acquire dev_state_lock because this routine
+	 * may be executed during usbgem_mac_stop(), which would
+	 * deadlock; a simple membar operation is enough to read
+	 * the state correctly.
+	 */
+	membar_consumer();
+
+	if (req->bulk_completion_reason == USB_CR_OK &&
+	    dp->nic_state == NIC_STATE_ONLINE) {
+		newmp = (*dp->ugc.usbgc_rx_make_packet)(dp, mp);
+
+		if (newmp != mp) {
+			/* the message was reallocated; free the old one */
+			freemsg(mp);
+		}
+
+		/* the message may include one or more ethernet packets */
+		for (tp = newmp; tp; tp = tp->b_next) {
+			len += tp->b_wptr - tp->b_rptr;
+			pkts++;
+			if (tp->b_rptr[0] & 1) {
+				if (bcmp(tp->b_rptr, &usbgem_bcastaddr,
+				    ETHERADDRL) == 0) {
+					bcast++;
+				} else {
+					mcast++;
+				}
+			}
+		}
+
+		/* send up if it is a valid packet */
+#ifdef USBGEM_CONFIG_GLDv3
+		mac_rx(dp->mh, NULL, newmp);
+#else
+		while (newmp) {
+			tp = newmp;
+			newmp = newmp->b_next;
+			tp->b_next = NULL;
+			gld_recv(dp->macinfo, tp);
+		}
+#endif
+	} else {
+		freemsg(mp);
+		len = 0;
+	}
+
+	mutex_enter(&dp->rxlock);
+	/* update rx_active */
+	if (dp->rx_active) {
+		dp->rx_active = dp->mac_state == MAC_STATE_ONLINE;
+	}
+
+	dp->stats.rbytes += len;
+	dp->stats.rpackets += pkts;
+	if (bcast | mcast) {
+		dp->stats.rbcast += bcast;
+		dp->stats.rmcast += mcast;
+	}
+	mutex_exit(&dp->rxlock);
+
+	if (dp->rx_active) {
+		/* prepare to receive the next packets */
+		if (usbgem_rx_start_unit(dp, req)) {
+			/* we succeeded */
+			goto done;
+		}
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to fill next rx packet",
+		    dp->name, __func__);
+		/*
+		 * we use another flag to indicate the error state;
+		 * if we acquired dev_state_lock as RW_WRITER here,
+		 * usbgem_mac_stop() might hang.
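The receive callback above classifies each delivered packet by its destination address: the low-order bit of the first octet marks a group address, and the all-ones address among those is broadcast. The same test in a self-contained form, with memcmp standing in for the kernel bcmp:

    #include <string.h>

    #define ETHERADDRL      6

    static const unsigned char bcastaddr[ETHERADDRL] = {
            0xff, 0xff, 0xff, 0xff, 0xff, 0xff
    };

    /* count one received packet whose destination address is dst */
    static void
    classify(const unsigned char *dst, int *bcastp, int *mcastp)
    {
            if (dst[0] & 1) {       /* group (multicast/broadcast) bit */
                    if (memcmp(dst, bcastaddr, ETHERADDRL) == 0)
                            (*bcastp)++;
                    else
                            (*mcastp)++;
            }
    }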
+ */ + if (dp->fatal_error == (clock_t)0) { + dp->fatal_error = usbgem_timestamp_nz(); + } + } else { + /* no need to prepare the next packets */ + usb_free_bulk_req(req); + } + + mutex_enter(&dp->rxlock); + dp->rx_active = B_FALSE; + dp->rx_busy_cnt--; + if (dp->rx_busy_cnt == 0) { + /* wake up someone waits for me */ + cv_broadcast(&dp->rx_drain_cv); + } + mutex_exit(&dp->rxlock); +done: + ; +} + +static void +usbgem_bulkout_cb(usb_pipe_handle_t pipe, usb_bulk_req_t *req) +{ + boolean_t intr; + boolean_t tx_sched; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)req->bulk_client_private; + tx_sched = B_FALSE; + + DPRINTF(2, (CE_CONT, + "!%s: %s: cr:%s(%d) cb_flags:0x%x head:%d tail:%d", + dp->name, __func__, + usb_str_cr(req->bulk_completion_reason), + req->bulk_completion_reason, + req->bulk_cb_flags, + dp->tx_busy_cnt)); + + /* we have finished to transfer the packet into tx fifo */ + intr = DB_TCI(req->bulk_data); + freemsg(req->bulk_data); + + if (req->bulk_completion_reason != USB_CR_OK && + dp->fatal_error == (clock_t)0) { + dp->fatal_error = usbgem_timestamp_nz(); + } + + mutex_enter(&dp->txlock); + + if (intr) { + ASSERT(dp->tx_intr_pended > 0); + /* find the last interrupt we have scheduled */ + if (--(dp->tx_intr_pended) == 0) { + tx_sched = B_TRUE; + } + } + + ASSERT(dp->tx_busy_cnt > 0); + req->bulk_client_private = (usb_opaque_t)dp->tx_free_list; + dp->tx_free_list = req; + dp->tx_busy_cnt--; + +#ifdef CONFIG_TX_LIMITER + if (tx_sched) { + dp->tx_max_packets = + min(dp->tx_max_packets + 1, dp->ugc.usbgc_tx_list_max); + } +#endif + if (dp->mac_state != MAC_STATE_ONLINE && dp->tx_busy_cnt == 0) { + cv_broadcast(&dp->tx_drain_cv); + } + + mutex_exit(&dp->txlock); + + if (tx_sched) { +#ifdef USBGEM_CONFIG_GLDv3 + mac_tx_update(dp->mh); +#else + gld_sched(dp->macinfo); +#endif + } +} + +static void +usbgem_intr_cb(usb_pipe_handle_t ph, usb_intr_req_t *req) +{ + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)req->intr_client_private; + dp->stats.intr++; + + if (req->intr_completion_reason == USB_CR_OK) { + (*dp->ugc.usbgc_interrupt)(dp, req->intr_data); + } + + /* free the request and data */ + usb_free_intr_req(req); +} + +/* ======================================================================== */ +/* + * MII support routines + */ +/* ======================================================================== */ +static void +usbgem_choose_forcedmode(struct usbgem_dev *dp) +{ + /* choose media mode */ + if (dp->anadv_1000fdx || dp->anadv_1000hdx) { + dp->speed = USBGEM_SPD_1000; + dp->full_duplex = dp->anadv_1000fdx; + } else if (dp->anadv_100fdx || dp->anadv_100t4) { + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_TRUE; + } else if (dp->anadv_100hdx) { + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_FALSE; + } else { + dp->speed = USBGEM_SPD_10; + dp->full_duplex = dp->anadv_10fdx; + } +} + +static uint16_t +usbgem_mii_read(struct usbgem_dev *dp, uint_t reg, int *errp) +{ + uint16_t val; + + sema_p(&dp->hal_op_lock); + val = (*dp->ugc.usbgc_mii_read)(dp, reg, errp); + sema_v(&dp->hal_op_lock); + + return (val); +} + +static void +usbgem_mii_write(struct usbgem_dev *dp, uint_t reg, uint16_t val, int *errp) +{ + sema_p(&dp->hal_op_lock); + (*dp->ugc.usbgc_mii_write)(dp, reg, val, errp); + sema_v(&dp->hal_op_lock); +} + +static int +usbgem_mii_probe(struct usbgem_dev *dp) +{ + int err; + + err = (*dp->ugc.usbgc_mii_probe)(dp); + return (err); +} + +static int +usbgem_mii_init(struct usbgem_dev *dp) +{ + int err; + + err = (*dp->ugc.usbgc_mii_init)(dp); + return 
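The usbgem_mii_read()/usbgem_mii_write() wrappers above serialize every HAL register operation through the hal_op_lock semaphore, so concurrent MII users never interleave their USB control transfers. The same discipline sketched with a POSIX binary semaphore; hal_mii_read() is a hypothetical accessor:

    #include <semaphore.h>
    #include <stdint.h>

    static sem_t hal_op_lock;   /* sem_init(&hal_op_lock, 0, 1) at attach */

    extern uint16_t hal_mii_read(unsigned int reg, int *errp);  /* hypothetical */

    static uint16_t
    mii_read_serialized(unsigned int reg, int *errp)
    {
            uint16_t val;

            sem_wait(&hal_op_lock);         /* sema_p() */
            val = hal_mii_read(reg, errp);
            sem_post(&hal_op_lock);         /* sema_v() */
            return (val);
    }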
(err); +} + +#define fc_cap_decode(x) \ + ((((x) & MII_ABILITY_PAUSE) != 0 ? 1 : 0) | \ + (((x) & MII_ABILITY_ASM_DIR) != 0 ? 2 : 0)) + +int +usbgem_mii_config_default(struct usbgem_dev *dp, int *errp) +{ + uint16_t mii_stat; + uint16_t val; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* + * Configure bits in advertisement register + */ + mii_stat = dp->mii_status; + + DPRINTF(1, (CE_CONT, "!%s: %s: MII_STATUS reg:%b", + dp->name, __func__, mii_stat, MII_STATUS_BITS)); + + if ((mii_stat & MII_STATUS_ABILITY_TECH) == 0) { + /* it's funny */ + cmn_err(CE_WARN, "!%s: wrong ability bits: mii_status:%b", + dp->name, mii_stat, MII_STATUS_BITS); + return (USB_FAILURE); + } + + /* Do not change the rest of ability bits in advert reg */ + val = usbgem_mii_read(dp, MII_AN_ADVERT, errp) & ~MII_ABILITY_ALL; + if (*errp != USB_SUCCESS) { + goto usberr; + } + + DPRINTF(0, (CE_CONT, + "!%s: %s: 100T4:%d 100F:%d 100H:%d 10F:%d 10H:%d", + dp->name, __func__, + dp->anadv_100t4, dp->anadv_100fdx, dp->anadv_100hdx, + dp->anadv_10fdx, dp->anadv_10hdx)); + + /* set technology bits */ + if (dp->anadv_100t4) { + val |= MII_ABILITY_100BASE_T4; + } + if (dp->anadv_100fdx) { + val |= MII_ABILITY_100BASE_TX_FD; + } + if (dp->anadv_100hdx) { + val |= MII_ABILITY_100BASE_TX; + } + if (dp->anadv_10fdx) { + val |= MII_ABILITY_10BASE_T_FD; + } + if (dp->anadv_10hdx) { + val |= MII_ABILITY_10BASE_T; + } + + /* set flow control capabilities */ + if (dp->anadv_pause) { + val |= MII_ABILITY_PAUSE; + } + if (dp->anadv_asmpause) { + val |= MII_ABILITY_ASM_DIR; + } + + DPRINTF(0, (CE_CONT, + "!%s: %s: setting MII_AN_ADVERT reg:%b, pause:%d, asmpause:%d", + dp->name, __func__, val, MII_ABILITY_BITS, + dp->anadv_pause, dp->anadv_asmpause)); + + usbgem_mii_write(dp, MII_AN_ADVERT, val, errp); + if (*errp != USB_SUCCESS) { + goto usberr; + } + + if (dp->mii_status & MII_STATUS_XSTATUS) { + /* + * 1000Base-T GMII support + */ + if (!dp->anadv_autoneg) { + /* enable manual configuration */ + val = MII_1000TC_CFG_EN; + if (dp->anadv_1000t_ms == 2) { + val |= MII_1000TC_CFG_VAL; + } + } else { + val = 0; + if (dp->anadv_1000fdx) { + val |= MII_1000TC_ADV_FULL; + } + if (dp->anadv_1000hdx) { + val |= MII_1000TC_ADV_HALF; + } + switch (dp->anadv_1000t_ms) { + case 1: + /* slave */ + val |= MII_1000TC_CFG_EN; + break; + + case 2: + /* master */ + val |= MII_1000TC_CFG_EN | MII_1000TC_CFG_VAL; + break; + + default: + /* auto: do nothing */ + break; + } + } + DPRINTF(0, (CE_CONT, + "!%s: %s: setting MII_1000TC reg:%b", + dp->name, __func__, val, MII_1000TC_BITS)); + + usbgem_mii_write(dp, MII_1000TC, val, errp); + if (*errp != USB_SUCCESS) { + goto usberr; + } + } + return (USB_SUCCESS); + +usberr: + return (*errp); +} + +static char *usbgem_fc_type[] = { + "without", + "with symmetric", + "with tx", + "with rx", +}; + +#ifdef USBGEM_CONFIG_GLDv3 +#define USBGEM_LINKUP(dp) mac_link_update((dp)->mh, LINK_STATE_UP) +#define USBGEM_LINKDOWN(dp) mac_link_update((dp)->mh, LINK_STATE_DOWN) +#else +#define USBGEM_LINKUP(dp) \ + if (gld_linkstate) { \ + gld_linkstate((dp)->macinfo, GLD_LINKSTATE_UP); \ + } +#define USBGEM_LINKDOWN(dp) \ + if (gld_linkstate) { \ + gld_linkstate((dp)->macinfo, GLD_LINKSTATE_DOWN); \ + } +#endif + +static uint8_t usbgem_fc_result[4 /* my cap */][4 /* lp cap */] = { +/* none symm tx rx/symm */ +/* none */ + {FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE}, +/* sym */ + {FLOW_CONTROL_NONE, + FLOW_CONTROL_SYMMETRIC, + FLOW_CONTROL_NONE, + FLOW_CONTROL_SYMMETRIC}, 
+/* tx */ + {FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE, + FLOW_CONTROL_NONE, + FLOW_CONTROL_TX_PAUSE}, +/* rx/symm */ + {FLOW_CONTROL_NONE, + FLOW_CONTROL_SYMMETRIC, + FLOW_CONTROL_RX_PAUSE, + FLOW_CONTROL_SYMMETRIC}, +}; + +static boolean_t +usbgem_mii_link_check(struct usbgem_dev *dp, int *oldstatep, int *newstatep) +{ + boolean_t tx_sched = B_FALSE; + uint16_t status; + uint16_t advert; + uint16_t lpable; + uint16_t exp; + uint16_t ctl1000; + uint16_t stat1000; + uint16_t val; + clock_t now; + clock_t diff; + int linkdown_action; + boolean_t fix_phy = B_FALSE; + int err; + uint_t rwlock; + + DPRINTF(4, (CE_CONT, "!%s: %s: time:%d state:%d", + dp->name, __func__, ddi_get_lbolt(), dp->mii_state)); + + if (dp->mii_state != MII_STATE_LINKUP) { + rwlock = RW_WRITER; + } else { + rwlock = RW_READER; + } +again: + rw_enter(&dp->dev_state_lock, rwlock); + + /* save old mii state */ + *oldstatep = dp->mii_state; + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + /* stop periodic execution of the link watcher */ + dp->mii_interval = 0; + tx_sched = B_FALSE; + goto next; + } + + now = ddi_get_lbolt(); + diff = now - dp->mii_last_check; + dp->mii_last_check = now; + + /* + * For NWAM, don't show linkdown state right + * when the device is attached. + */ + if (dp->linkup_delay > 0) { + if (dp->linkup_delay > diff) { + dp->linkup_delay -= diff; + } else { + /* link up timeout */ + dp->linkup_delay = -1; + } + } + +next_nowait: + switch (dp->mii_state) { + case MII_STATE_UNKNOWN: + goto reset_phy; + + case MII_STATE_RESETTING: + dp->mii_timer -= diff; + if (dp->mii_timer > 0) { + /* don't read phy registers in resetting */ + dp->mii_interval = WATCH_INTERVAL_FAST; + goto next; + } + + val = usbgem_mii_read(dp, MII_CONTROL, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + if (val & MII_CONTROL_RESET) { + cmn_err(CE_NOTE, + "!%s: time:%ld resetting phy not complete." 
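fc_cap_decode() packs the PAUSE and ASM_DIR advertisement bits into a two-bit index, and usbgem_fc_result[][] above resolves the local and partner indices into a flow-control mode (cf. the pause resolution rules of IEEE 802.3 Annex 28B). A worked example using the same table:

    #include <stdio.h>

    #define PAUSE   1   /* fc_cap_decode() bit for MII_ABILITY_PAUSE */
    #define ASM_DIR 2   /* fc_cap_decode() bit for MII_ABILITY_ASM_DIR */

    static const char *fc_name[] = {
            "none", "symmetric", "tx-pause", "rx-pause",
    };

    /* same shape as usbgem_fc_result: [local caps][partner caps] */
    static const int fc_result[4][4] = {
            { 0, 0, 0, 0 },         /* local: none */
            { 0, 1, 0, 1 },         /* local: symmetric */
            { 0, 0, 0, 2 },         /* local: tx */
            { 0, 1, 3, 1 },         /* local: rx/symmetric */
    };

    int
    main(void)
    {
            int local = PAUSE | ASM_DIR;    /* we advertise both pause bits */
            int partner = ASM_DIR;          /* partner advertises ASM_DIR only */

            /* prints "resolved: rx-pause" */
            printf("resolved: %s\n", fc_name[fc_result[local][partner]]);
            return (0);
    }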
+ " mii_control:0x%b", + dp->name, ddi_get_lbolt(), + val, MII_CONTROL_BITS); + } + + /* ensure neither isolated nor pwrdown nor auto-nego mode */ + usbgem_mii_write(dp, MII_CONTROL, 0, &err); + if (err != USB_SUCCESS) { + goto usberr; + } +#if USBGEM_DEBUG_LEVEL > 10 + val = usbgem_mii_read(dp, MII_CONTROL, &err); + cmn_err(CE_CONT, "!%s: readback control %b", + dp->name, val, MII_CONTROL_BITS); +#endif + /* As resetting PHY has completed, configure PHY registers */ + if ((*dp->ugc.usbgc_mii_config)(dp, &err) != USB_SUCCESS) { + /* we failed to configure PHY */ + goto usberr; + } + + /* prepare for forced mode */ + usbgem_choose_forcedmode(dp); + + dp->mii_lpable = 0; + dp->mii_advert = 0; + dp->mii_exp = 0; + dp->mii_ctl1000 = 0; + dp->mii_stat1000 = 0; + + dp->flow_control = FLOW_CONTROL_NONE; + + if (!dp->anadv_autoneg) { + /* skip auto-negotiation phase */ + dp->mii_state = MII_STATE_MEDIA_SETUP; + dp->mii_timer = dp->ugc.usbgc_mii_linkdown_timeout; + goto next_nowait; + } + + /* issue an auto-negotiation command */ + goto autonego; + + case MII_STATE_AUTONEGOTIATING: + /* + * Autonegotiation in progress + */ + dp->mii_timer -= diff; + if (dp->mii_timer - + (dp->ugc.usbgc_mii_an_timeout - dp->ugc.usbgc_mii_an_wait) + > 0) { + /* wait for minimum time (2.3 - 2.5 sec) */ + dp->mii_interval = WATCH_INTERVAL_FAST; + goto next; + } + + /* read PHY status */ + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + DPRINTF(4, (CE_CONT, + "!%s: %s: called: mii_state:%d MII_STATUS reg:%b", + dp->name, __func__, dp->mii_state, + status, MII_STATUS_BITS)); + + if (status & MII_STATUS_REMFAULT) { + /* + * The link parnert told me something wrong happend. + * What do we do ? + */ + cmn_err(CE_CONT, + "!%s: auto-negotiation failed: remote fault", + dp->name); + goto autonego; + } + + if ((status & MII_STATUS_ANDONE) == 0) { + if (dp->mii_timer <= 0) { + /* + * Auto-negotiation has been timed out, + * Reset PHY and try again. + */ + if (!dp->mii_supress_msg) { + cmn_err(CE_WARN, + "!%s: auto-negotiation failed:" + " timeout", + dp->name); + dp->mii_supress_msg = B_TRUE; + } + goto autonego; + } + /* + * Auto-negotiation is in progress. Wait for a while. + */ + dp->mii_interval = dp->ugc.usbgc_mii_an_watch_interval; + goto next; + } + + /* + * Auto-negotiation has been completed. Let's go to AN_DONE. + */ + dp->mii_state = MII_STATE_AN_DONE; + dp->mii_supress_msg = B_FALSE; + DPRINTF(0, (CE_CONT, + "!%s: auto-negotiation completed, MII_STATUS:%b", + dp->name, status, MII_STATUS_BITS)); + + if (dp->ugc.usbgc_mii_an_delay > 0) { + dp->mii_timer = dp->ugc.usbgc_mii_an_delay; + dp->mii_interval = drv_usectohz(20*1000); + goto next; + } + + dp->mii_timer = 0; + diff = 0; + goto next_nowait; + + case MII_STATE_AN_DONE: + /* + * Auto-negotiation has done. Now we can set up media. + */ + dp->mii_timer -= diff; + if (dp->mii_timer > 0) { + /* wait for a while */ + dp->mii_interval = WATCH_INTERVAL_FAST; + goto next; + } + + /* + * Setup speed and duplex mode according with + * the result of auto negotiation. + */ + + /* + * Read registers required to determin current + * duplex mode and media speed. 
+ */ + if (dp->ugc.usbgc_mii_an_delay > 0) { + /* the 'status' variable is not initialized yet */ + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + advert = usbgem_mii_read(dp, MII_AN_ADVERT, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + lpable = usbgem_mii_read(dp, MII_AN_LPABLE, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + exp = usbgem_mii_read(dp, MII_AN_EXPANSION, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + if (exp == 0xffff) { + /* some phys don't have exp register */ + exp = 0; + } + + ctl1000 = 0; + stat1000 = 0; + if (dp->mii_status & MII_STATUS_XSTATUS) { + ctl1000 = usbgem_mii_read(dp, MII_1000TC, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + stat1000 = usbgem_mii_read(dp, MII_1000TS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + dp->mii_lpable = lpable; + dp->mii_advert = advert; + dp->mii_exp = exp; + dp->mii_ctl1000 = ctl1000; + dp->mii_stat1000 = stat1000; + + cmn_err(CE_CONT, + "!%s: auto-negotiation done: " + "status:%b, advert:%b, lpable:%b, exp:%b", + dp->name, + status, MII_STATUS_BITS, + advert, MII_ABILITY_BITS, + lpable, MII_ABILITY_BITS, + exp, MII_AN_EXP_BITS); + + DPRINTF(0, (CE_CONT, "!%s: MII_STATUS:%b", + dp->name, status, MII_STATUS_BITS)); + + if (dp->mii_status & MII_STATUS_XSTATUS) { + cmn_err(CE_CONT, + "! MII_1000TC reg:%b, MII_1000TS reg:%b", + ctl1000, MII_1000TC_BITS, + stat1000, MII_1000TS_BITS); + } + + if (usbgem_population(lpable) <= 1 && + (exp & MII_AN_EXP_LPCANAN) == 0) { + if ((advert & MII_ABILITY_TECH) != lpable) { + cmn_err(CE_WARN, + "!%s: but the link partner doesn't seem" + " to have auto-negotiation capability." + " please check the link configuration.", + dp->name); + } + /* + * it should be a result of pararell detection, + * which cannot detect duplex mode. + */ + if ((advert & lpable) == 0 && + lpable & MII_ABILITY_10BASE_T) { + /* no common technology, try 10M half mode */ + lpable |= advert & MII_ABILITY_10BASE_T; + fix_phy = B_TRUE; + } + } else if (lpable == 0) { + cmn_err(CE_WARN, "!%s: wrong lpable.", dp->name); + goto reset_phy; + } + /* + * configure current link mode according to AN priority. + */ + val = advert & lpable; + if ((ctl1000 & MII_1000TC_ADV_FULL) && + (stat1000 & MII_1000TS_LP_FULL)) { + /* 1000BaseT & full duplex */ + dp->speed = USBGEM_SPD_1000; + dp->full_duplex = B_TRUE; + } else if ((ctl1000 & MII_1000TC_ADV_HALF) && + (stat1000 & MII_1000TS_LP_HALF)) { + /* 1000BaseT & half duplex */ + dp->speed = USBGEM_SPD_1000; + dp->full_duplex = B_FALSE; + } else if ((val & MII_ABILITY_100BASE_TX_FD)) { + /* 100BaseTx & fullduplex */ + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_TRUE; + } else if ((val & MII_ABILITY_100BASE_T4)) { + /* 100BaseTx & fullduplex */ + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_TRUE; + } else if ((val & MII_ABILITY_100BASE_TX)) { + /* 100BaseTx & half duplex */ + dp->speed = USBGEM_SPD_100; + dp->full_duplex = B_FALSE; + } else if ((val & MII_ABILITY_10BASE_T_FD)) { + /* 10BaseT & full duplex */ + dp->speed = USBGEM_SPD_10; + dp->full_duplex = B_TRUE; + } else if ((val & MII_ABILITY_10BASE_T)) { + /* 10BaseT & half duplex */ + dp->speed = USBGEM_SPD_10; + dp->full_duplex = B_FALSE; + } else { + /* + * the link partner doesn't seem to have + * auto-negotiation capability and our PHY + * could not report current mode correctly. + * We guess current mode by mii_control register. 
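The chain of tests above picks the highest-priority common ability between the local advertisement and the link partner's abilities. A compact worked example over the 100/10 Mb technology bits, mirroring the same priority order; the constants are the standard ANAR bit values, shown here for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define AB_10H      0x0020  /* 10BASE-T half */
    #define AB_10F      0x0040  /* 10BASE-T full */
    #define AB_100H     0x0080  /* 100BASE-TX half */
    #define AB_100F     0x0100  /* 100BASE-TX full */
    #define AB_100T4    0x0200  /* 100BASE-T4 */

    int
    main(void)
    {
            uint16_t advert = AB_100F | AB_100H | AB_10F | AB_10H;
            uint16_t lpable = AB_100H | AB_10F | AB_10H;
            uint16_t val = advert & lpable;     /* common abilities */
            int speed, fdx;

            /* same priority order as the driver: faster first, then full */
            if (val & AB_100F) {
                    speed = 100; fdx = 1;
            } else if (val & AB_100T4) {
                    speed = 100; fdx = 1;
            } else if (val & AB_100H) {
                    speed = 100; fdx = 0;
            } else if (val & AB_10F) {
                    speed = 10; fdx = 1;
            } else {
                    speed = 10; fdx = 0;
            }
            /* prints "100 Mbps half duplex" */
            printf("%d Mbps %s duplex\n", speed, fdx ? "full" : "half");
            return (0);
    }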
+ */ + val = usbgem_mii_read(dp, MII_CONTROL, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + + /* select 100m half or 10m half */ + dp->speed = (val & MII_CONTROL_100MB) ? + USBGEM_SPD_100 : USBGEM_SPD_10; + dp->full_duplex = B_FALSE; + fix_phy = B_TRUE; + + cmn_err(CE_NOTE, + "!%s: auto-negotiation done but " + "common ability not found.\n" + "PHY state: control:%b advert:%b lpable:%b\n" + "guessing %d Mbps %s duplex mode", + dp->name, + val, MII_CONTROL_BITS, + advert, MII_ABILITY_BITS, + lpable, MII_ABILITY_BITS, + usbgem_speed_value[dp->speed], + dp->full_duplex ? "full" : "half"); + } + + if (dp->full_duplex) { + dp->flow_control = + usbgem_fc_result[fc_cap_decode(advert)] + [fc_cap_decode(lpable)]; + } else { + dp->flow_control = FLOW_CONTROL_NONE; + } + dp->mii_state = MII_STATE_MEDIA_SETUP; + dp->mii_timer = dp->ugc.usbgc_mii_linkdown_timeout; + goto next_nowait; + + case MII_STATE_MEDIA_SETUP: + DPRINTF(2, (CE_CONT, "!%s: setup midia mode", dp->name)); + + /* assume the link state is down */ + dp->mii_state = MII_STATE_LINKDOWN; + dp->mii_supress_msg = B_FALSE; + + /* use short interval */ + dp->mii_interval = WATCH_INTERVAL_FAST; + + if ((!dp->anadv_autoneg) || + dp->ugc.usbgc_mii_an_oneshot || fix_phy) { + + /* + * write the result of auto negotiation back. + */ + val = usbgem_mii_read(dp, MII_CONTROL, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + val &= ~(MII_CONTROL_SPEED | MII_CONTROL_FDUPLEX | + MII_CONTROL_ANE | MII_CONTROL_RSAN); + + if (dp->full_duplex) { + val |= MII_CONTROL_FDUPLEX; + } + + switch (dp->speed) { + case USBGEM_SPD_1000: + val |= MII_CONTROL_1000MB; + break; + + case USBGEM_SPD_100: + val |= MII_CONTROL_100MB; + break; + + default: + cmn_err(CE_WARN, "%s: unknown speed:%d", + dp->name, dp->speed); + /* FALLTHROUGH */ + + case USBGEM_SPD_10: + /* for USBGEM_SPD_10, do nothing */ + break; + } + + if (dp->mii_status & MII_STATUS_XSTATUS) { + usbgem_mii_write(dp, + MII_1000TC, MII_1000TC_CFG_EN, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + usbgem_mii_write(dp, MII_CONTROL, val, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + /* + * XXX -- nic state should be one of + * NIC_STATE_DISCONNECTED + * NIC_STATE_STOPPED + * NIC_STATE_INITIALIZED + * NIC_STATE_ONLINE + */ + if (dp->nic_state >= NIC_STATE_INITIALIZED) { + /* notify the result of autonegotiation to mac */ + if (usbgem_hal_set_media(dp) != USB_SUCCESS) { + goto usberr; + } + } + goto next_nowait; + + case MII_STATE_LINKDOWN: + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + if (status & MII_STATUS_LINKUP) { + /* + * Link is going up + */ + dp->mii_state = MII_STATE_LINKUP; + dp->mii_supress_msg = B_FALSE; + + DPRINTF(0, (CE_CONT, + "!%s: link up detected: status:%b", + dp->name, status, MII_STATUS_BITS)); + + /* + * MII_CONTROL_100MB and MII_CONTROL_FDUPLEX are + * ignored when MII_CONTROL_ANE is set. + */ + cmn_err(CE_CONT, + "!%s: Link up: %d Mbps %s duplex %s flow control", + dp->name, + usbgem_speed_value[dp->speed], + dp->full_duplex ? 
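When auto-negotiation is disabled (or the negotiated result must be written back), MII_STATE_MEDIA_SETUP above composes a forced-mode value for MII_CONTROL: clear the auto-negotiation bits, then set the speed and duplex selects. A sketch of that composition using the standard BMCR bit layout; the constant values are shown for illustration, not taken from the driver's headers:

    #include <stdint.h>

    #define BMCR_FDUPLEX    0x0100  /* full duplex */
    #define BMCR_RSAN       0x0200  /* restart auto-negotiation */
    #define BMCR_ANE        0x1000  /* auto-negotiation enable */
    #define BMCR_SPEED100   0x2000  /* speed select, bit 13 */
    #define BMCR_SPEED1000  0x0040  /* speed select, bit 6 */

    static uint16_t
    forced_mode(uint16_t bmcr, int speed_mbps, int full_duplex)
    {
            bmcr &= ~(BMCR_SPEED100 | BMCR_SPEED1000 | BMCR_FDUPLEX |
                BMCR_ANE | BMCR_RSAN);
            if (full_duplex)
                    bmcr |= BMCR_FDUPLEX;
            if (speed_mbps == 1000)
                    bmcr |= BMCR_SPEED1000;
            else if (speed_mbps == 100)
                    bmcr |= BMCR_SPEED100;
            /* 10 Mbps: both speed-select bits stay clear */
            return (bmcr);
    }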
"full" : "half", + usbgem_fc_type[dp->flow_control]); + + dp->mii_interval = + dp->ugc.usbgc_mii_link_watch_interval; + + if (dp->ugc.usbgc_mii_hw_link_detection && + dp->nic_state == NIC_STATE_ONLINE) { + dp->mii_interval = 0; + } + + if (dp->nic_state == NIC_STATE_ONLINE) { + if (dp->mac_state == MAC_STATE_INITIALIZED) { + (void) usbgem_mac_start(dp); + } + tx_sched = B_TRUE; + } + + goto next; + } + + dp->mii_supress_msg = B_TRUE; + if (dp->anadv_autoneg) { + dp->mii_timer -= diff; + if (dp->mii_timer <= 0) { + /* + * the link down timer expired. + * need to restart auto-negotiation. + */ + linkdown_action = + dp->ugc.usbgc_mii_linkdown_timeout_action; + goto restart_autonego; + } + } + /* don't change mii_state */ + goto next; + + case MII_STATE_LINKUP: + if (rwlock == RW_READER) { + /* first pass, read mii status */ + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + if ((status & MII_STATUS_LINKUP) == 0) { + /* + * Link is going down + */ + cmn_err(CE_NOTE, + "!%s: link down detected: status:%b", + dp->name, status, MII_STATUS_BITS); + /* + * Acquire exclusive lock to change mii_state + */ + if (rwlock == RW_READER) { + rwlock = RW_WRITER; + rw_exit(&dp->dev_state_lock); + goto again; + } + + dp->mii_state = MII_STATE_LINKDOWN; + dp->mii_timer = dp->ugc.usbgc_mii_linkdown_timeout; + + /* + * As we may change the state of the device, + * let us acquire exclusive lock for the state. + */ + if (dp->nic_state == NIC_STATE_ONLINE && + dp->mac_state == MAC_STATE_ONLINE && + dp->ugc.usbgc_mii_stop_mac_on_linkdown) { + (void) usbgem_restart_nic(dp); + /* drain tx */ + tx_sched = B_TRUE; + } + + if (dp->anadv_autoneg) { + /* need to restart auto-negotiation */ + linkdown_action = + dp->ugc.usbgc_mii_linkdown_action; + goto restart_autonego; + } + /* + * don't use hw link down detection until the link + * status become stable for a while. + */ + dp->mii_interval = + dp->ugc.usbgc_mii_link_watch_interval; + + goto next; + } + + /* + * still link up, no need to change mii_state + */ + if (dp->ugc.usbgc_mii_hw_link_detection && + dp->nic_state == NIC_STATE_ONLINE) { + /* + * no need to check link status periodicly + * if nic can generate interrupts when link go down. + */ + dp->mii_interval = 0; + } + goto next; + } + /* NOTREACHED */ + cmn_err(CE_PANIC, "!%s: %s: not reached", dp->name, __func__); + + /* + * Actions for new state. 
+ */ +restart_autonego: + switch (linkdown_action) { + case MII_ACTION_RESET: + if (!dp->mii_supress_msg) { + cmn_err(CE_CONT, "!%s: resetting PHY", dp->name); + } + dp->mii_supress_msg = B_TRUE; + goto reset_phy; + + case MII_ACTION_NONE: + dp->mii_supress_msg = B_TRUE; + if (dp->ugc.usbgc_mii_an_oneshot) { + goto autonego; + } + /* PHY will restart autonego automatically */ + dp->mii_state = MII_STATE_AUTONEGOTIATING; + dp->mii_timer = dp->ugc.usbgc_mii_an_timeout; + dp->mii_interval = dp->ugc.usbgc_mii_an_watch_interval; + goto next; + + case MII_ACTION_RSA: + if (!dp->mii_supress_msg) { + cmn_err(CE_CONT, "!%s: restarting auto-negotiation", + dp->name); + } + dp->mii_supress_msg = B_TRUE; + goto autonego; + + default: + cmn_err(CE_PANIC, "!%s: unknowm linkdown action: %d", + dp->name, dp->ugc.usbgc_mii_linkdown_action); + dp->mii_supress_msg = B_TRUE; + } + /* NOTREACHED */ + +reset_phy: + if (!dp->mii_supress_msg) { + cmn_err(CE_CONT, "!%s: resetting PHY", dp->name); + } + dp->mii_state = MII_STATE_RESETTING; + dp->mii_timer = dp->ugc.usbgc_mii_reset_timeout; + if (!dp->ugc.usbgc_mii_dont_reset) { + usbgem_mii_write(dp, MII_CONTROL, MII_CONTROL_RESET, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + } + dp->mii_interval = WATCH_INTERVAL_FAST; + goto next; + +autonego: + if (!dp->mii_supress_msg) { + cmn_err(CE_CONT, "!%s: auto-negotiation started", dp->name); + } + dp->mii_state = MII_STATE_AUTONEGOTIATING; + dp->mii_timer = dp->ugc.usbgc_mii_an_timeout; + + /* start/restart autoneg */ + val = usbgem_mii_read(dp, MII_CONTROL, &err) & + ~(MII_CONTROL_ISOLATE | MII_CONTROL_PWRDN | MII_CONTROL_RESET); + if (err != USB_SUCCESS) { + goto usberr; + } + if (val & MII_CONTROL_ANE) { + val |= MII_CONTROL_RSAN; + } + usbgem_mii_write(dp, MII_CONTROL, + val | dp->ugc.usbgc_mii_an_cmd | MII_CONTROL_ANE, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + + dp->mii_interval = dp->ugc.usbgc_mii_an_watch_interval; + goto next; + +usberr: + dp->mii_state = MII_STATE_UNKNOWN; + dp->mii_interval = dp->ugc.usbgc_mii_link_watch_interval; + tx_sched = B_TRUE; + +next: + *newstatep = dp->mii_state; + rw_exit(&dp->dev_state_lock); + return (tx_sched); +} + +static void +usbgem_mii_link_watcher(struct usbgem_dev *dp) +{ + int old_mii_state; + int new_mii_state; + boolean_t tx_sched; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + for (; ; ) { + + mutex_enter(&dp->link_watcher_lock); + if (dp->mii_interval) { + (void) cv_timedwait(&dp->link_watcher_wait_cv, + &dp->link_watcher_lock, + dp->mii_interval + ddi_get_lbolt()); + } else { + cv_wait(&dp->link_watcher_wait_cv, + &dp->link_watcher_lock); + } + mutex_exit(&dp->link_watcher_lock); + + if (dp->link_watcher_stop) { + break; + } + + /* we block callbacks from disconnect/suspend and restart */ + tx_sched = usbgem_mii_link_check(dp, + &old_mii_state, &new_mii_state); + + /* + * gld v2 notifier functions are not able to + * be called with any locks in this layer. 
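usbgem_mii_link_watcher() above sleeps for mii_interval ticks, or indefinitely when hardware link-change notification makes polling unnecessary, and can be woken early through link_watcher_wait_cv. A user-space sketch of that loop with pthread primitives; link_check() is a hypothetical stand-in for usbgem_mii_link_check():

    #include <pthread.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <time.h>

    static pthread_mutex_t lw_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t lw_cv = PTHREAD_COND_INITIALIZER;
    static long lw_interval_ms;     /* 0: wait for an explicit kick */
    static bool lw_stop;

    extern void link_check(void);   /* hypothetical state-machine step */

    static void *
    link_watcher(void *arg)
    {
            (void) arg;
            pthread_mutex_lock(&lw_lock);
            while (!lw_stop) {
                    if (lw_interval_ms > 0) {
                            struct timespec ts;

                            clock_gettime(CLOCK_REALTIME, &ts);
                            ts.tv_sec += lw_interval_ms / 1000;
                            ts.tv_nsec += (lw_interval_ms % 1000) * 1000000L;
                            if (ts.tv_nsec >= 1000000000L) {
                                    ts.tv_sec++;
                                    ts.tv_nsec -= 1000000000L;
                            }
                            (void) pthread_cond_timedwait(&lw_cv, &lw_lock, &ts);
                    } else {
                            (void) pthread_cond_wait(&lw_cv, &lw_lock);
                    }
                    pthread_mutex_unlock(&lw_lock);
                    link_check();
                    pthread_mutex_lock(&lw_lock);
            }
            pthread_mutex_unlock(&lw_lock);
            return (NULL);
    }

    /* the usbgem_mii_update_link() equivalent: wake the watcher early */
    static void
    kick_watcher(void)
    {
            pthread_mutex_lock(&lw_lock);
            pthread_cond_signal(&lw_cv);
            pthread_mutex_unlock(&lw_lock);
    }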
+ */ + if (tx_sched) { + /* kick potentially stopped downstream */ +#ifdef USBGEM_CONFIG_GLDv3 + mac_tx_update(dp->mh); +#else + gld_sched(dp->macinfo); +#endif + } + + if (old_mii_state != new_mii_state) { + /* notify new mii link state */ + if (new_mii_state == MII_STATE_LINKUP) { + dp->linkup_delay = 0; + USBGEM_LINKUP(dp); + } else if (dp->linkup_delay <= 0) { + USBGEM_LINKDOWN(dp); + } + } else if (dp->linkup_delay < 0) { + /* first linkup timeout */ + dp->linkup_delay = 0; + USBGEM_LINKDOWN(dp); + } + } + + thread_exit(); +} + +void +usbgem_mii_update_link(struct usbgem_dev *dp) +{ + cv_signal(&dp->link_watcher_wait_cv); +} + +int +usbgem_mii_probe_default(struct usbgem_dev *dp) +{ + int phy; + uint16_t status; + uint16_t xstatus; + int err; + uint16_t adv; + uint16_t adv_org; + + DPRINTF(3, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* + * Scan PHY + */ + dp->mii_status = 0; + + /* Try default phy first */ + if (dp->mii_phy_addr) { + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + if (status != 0xffff && status != 0x0000) { + goto PHY_found; + } + + if (dp->mii_phy_addr < 0) { + cmn_err(CE_NOTE, + "!%s: failed to probe default internal and/or non-MII PHY", + dp->name); + return (USB_FAILURE); + } + + cmn_err(CE_NOTE, + "!%s: failed to probe default MII PHY at %d", + dp->name, dp->mii_phy_addr); + } + + /* Try all possible address */ + for (phy = dp->ugc.usbgc_mii_addr_min; phy < 32; phy++) { + dp->mii_phy_addr = phy; + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_read(status) failed", + dp->name, __func__)); + goto usberr; + } + + if (status != 0xffff && status != 0x0000) { + usbgem_mii_write(dp, MII_CONTROL, 0, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_write(control) failed", + dp->name, __func__)); + goto usberr; + } + goto PHY_found; + } + } + for (phy = dp->ugc.usbgc_mii_addr_min; phy < 32; phy++) { + dp->mii_phy_addr = phy; + usbgem_mii_write(dp, MII_CONTROL, 0, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_write(control) failed", + dp->name, __func__)); + goto usberr; + } + status = usbgem_mii_read(dp, MII_STATUS, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_read(status) failed", + dp->name, __func__)); + goto usberr; + } + + if (status != 0xffff && status != 0) { + goto PHY_found; + } + } + + cmn_err(CE_NOTE, "!%s: no MII PHY found", dp->name); + return (USB_FAILURE); + +PHY_found: + dp->mii_status = status; + dp->mii_status_ro = ~status; + dp->mii_phy_id = usbgem_mii_read(dp, MII_PHYIDH, &err) << 16; + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_read(PHYIDH) failed", + dp->name, __func__)); + goto usberr; + } + dp->mii_phy_id |= usbgem_mii_read(dp, MII_PHYIDL, &err); + if (err != USB_SUCCESS) { + DPRINTF(0, (CE_CONT, + "!%s: %s: mii_read(PHYIDL) failed", + dp->name, __func__)); + goto usberr; + } + + if (dp->mii_phy_addr < 0) { + cmn_err(CE_CONT, "!%s: using internal/non-MII PHY(0x%08x)", + dp->name, dp->mii_phy_id); + } else { + cmn_err(CE_CONT, "!%s: MII PHY (0x%08x) found at %d", + dp->name, dp->mii_phy_id, dp->mii_phy_addr); + } + + cmn_err(CE_CONT, + "!%s: PHY control:%b, status:%b, advert:%b, lpar:%b, exp:%b", + dp->name, + usbgem_mii_read(dp, MII_CONTROL, &err), MII_CONTROL_BITS, + status, MII_STATUS_BITS, + usbgem_mii_read(dp, MII_AN_ADVERT, &err), MII_ABILITY_BITS, + usbgem_mii_read(dp, MII_AN_LPABLE, &err), MII_ABILITY_BITS, + 
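usbgem_mii_probe_default() above treats a status register that reads neither 0x0000 nor 0xffff as evidence of a live PHY, then assembles the 32-bit identifier from MII_PHYIDH and MII_PHYIDL. The core of that scan in a self-contained form; the register numbers are the standard MII layout and mii_read() is a hypothetical accessor:

    #include <stdint.h>

    #define MII_STATUS  1   /* standard register numbers */
    #define MII_PHYIDH  2
    #define MII_PHYIDL  3

    extern uint16_t mii_read(int phy, int reg);     /* hypothetical */

    /* returns the PHY address, or -1 if no PHY answers */
    static int
    phy_scan(uint32_t *idp)
    {
            int phy;
            uint16_t status;

            for (phy = 0; phy < 32; phy++) {
                    status = mii_read(phy, MII_STATUS);
                    if (status == 0x0000 || status == 0xffff)
                            continue;       /* bus floats or reads as zero */
                    *idp = ((uint32_t)mii_read(phy, MII_PHYIDH) << 16) |
                        mii_read(phy, MII_PHYIDL);
                    return (phy);
            }
            return (-1);
    }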
usbgem_mii_read(dp, MII_AN_EXPANSION, &err), MII_AN_EXP_BITS); + + dp->mii_xstatus = 0; + if (status & MII_STATUS_XSTATUS) { + dp->mii_xstatus = usbgem_mii_read(dp, MII_XSTATUS, &err); + + cmn_err(CE_CONT, "!%s: xstatus:%b", + dp->name, dp->mii_xstatus, MII_XSTATUS_BITS); + } + dp->mii_xstatus_ro = ~dp->mii_xstatus; + + /* check if the phy can advertize pause abilities */ + adv_org = usbgem_mii_read(dp, MII_AN_ADVERT, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + + usbgem_mii_write(dp, MII_AN_ADVERT, + MII_ABILITY_PAUSE | MII_ABILITY_ASM_DIR, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + + adv = usbgem_mii_read(dp, MII_AN_ADVERT, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + + if ((adv & MII_ABILITY_PAUSE) == 0) { + dp->ugc.usbgc_flow_control &= ~1; + } + + if ((adv & MII_ABILITY_ASM_DIR) == 0) { + dp->ugc.usbgc_flow_control &= ~2; + } + + usbgem_mii_write(dp, MII_AN_ADVERT, adv_org, &err); + if (err != USB_SUCCESS) { + goto usberr; + } + return (USB_SUCCESS); + +usberr: + return (USB_FAILURE); +} + +int +usbgem_mii_init_default(struct usbgem_dev *dp) +{ + /* ENPTY */ + return (USB_SUCCESS); +} + +static int +usbgem_mii_start(struct usbgem_dev *dp) +{ + int err; + kthread_t *lwth; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* make a first call of usbgem_mii_link_check() */ + dp->link_watcher_stop = 0; + dp->mii_state = MII_STATE_UNKNOWN; + dp->mii_interval = drv_usectohz(1000*1000); /* 1sec */ + dp->mii_last_check = ddi_get_lbolt(); + dp->linkup_delay = 600 * drv_usectohz(1000*1000); /* 10 minutes */ + + lwth = thread_create(NULL, 0, usbgem_mii_link_watcher, dp, 0, &p0, + TS_RUN, minclsyspri); + if (lwth == NULL) { + cmn_err(CE_WARN, + "!%s: %s: failed to create a link watcher thread", + dp->name, __func__); + return (USB_FAILURE); + } + dp->link_watcher_did = lwth->t_did; + + return (USB_SUCCESS); +} + +static void +usbgem_mii_stop(struct usbgem_dev *dp) +{ + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* Ensure timer routine stopped */ + dp->link_watcher_stop = 1; + cv_signal(&dp->link_watcher_wait_cv); + thread_join(dp->link_watcher_did); +} + +/* ============================================================== */ +/* + * internal mac register operation interface + */ +/* ============================================================== */ +/* + * usbgem_mac_init: cold start + */ +static int +usbgem_mac_init(struct usbgem_dev *dp) +{ + int err; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + /* pretend we succeeded */ + return (USB_SUCCESS); + } + + ASSERT(dp->mac_state == MAC_STATE_STOPPED); + + /* reset fatal error timestamp */ + dp->fatal_error = (clock_t)0; + + /* reset tx side state */ + mutex_enter(&dp->txlock); + dp->tx_busy_cnt = 0; + dp->tx_max_packets = dp->ugc.usbgc_tx_list_max; + mutex_exit(&dp->txlock); + + /* reset rx side state */ + mutex_enter(&dp->rxlock); + dp->rx_busy_cnt = 0; + mutex_exit(&dp->rxlock); + + err = usbgem_hal_init_chip(dp); + if (err == USB_SUCCESS) { + dp->mac_state = MAC_STATE_INITIALIZED; + } + + return (err); +} + +/* + * usbgem_mac_start: warm start + */ +static int +usbgem_mac_start(struct usbgem_dev *dp) +{ + int err; + int i; + usb_flags_t flags = 0; + usb_intr_req_t *req; +#ifdef USBGEM_DEBUG_LEVEL + usb_pipe_state_t p_state; +#endif + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + /* do nothing but don't return failure */ + return 
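The pause-capability probe above writes both pause bits to the advertisement register, reads back which ones the PHY actually latched, trims usbgc_flow_control accordingly, and restores the original advertisement. The same round-trip as a sketch; the ANAR bit values are the standard ones and anar_read()/anar_write() are hypothetical accessors:

    #include <stdint.h>

    #define ANAR_PAUSE      0x0400  /* MII_ABILITY_PAUSE */
    #define ANAR_ASM_DIR    0x0800  /* MII_ABILITY_ASM_DIR */

    extern uint16_t anar_read(void);        /* hypothetical */
    extern void anar_write(uint16_t val);   /* hypothetical */

    /* flow_control bits: 1 = symmetric pause, 2 = asymmetric pause */
    static unsigned int
    probe_pause_caps(unsigned int flow_control)
    {
            uint16_t save = anar_read();
            uint16_t got;

            anar_write(ANAR_PAUSE | ANAR_ASM_DIR);
            got = anar_read();
            if ((got & ANAR_PAUSE) == 0)
                    flow_control &= ~1u;
            if ((got & ANAR_ASM_DIR) == 0)
                    flow_control &= ~2u;
            anar_write(save);       /* restore the old advertisement */
            return (flow_control);
    }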
(USB_SUCCESS);
+	}
+
+	if (dp->mac_state != MAC_STATE_INITIALIZED) {
+		/* don't return failure */
+		DPRINTF(0, (CE_CONT,
+		    "!%s: %s: mac_state(%d) is not MAC_STATE_INITIALIZED",
+		    dp->name, __func__, dp->mac_state));
+		goto x;
+	}
+
+	dp->mac_state = MAC_STATE_ONLINE;
+
+	if (usbgem_hal_start_chip(dp) != USB_SUCCESS) {
+		cmn_err(CE_NOTE,
+		    "!%s: %s: usb error was detected during start_chip",
+		    dp->name, __func__);
+		goto x;
+	}
+
+#ifdef USBGEM_DEBUG_LEVEL
+	usb_pipe_get_state(dp->intr_pipe, &p_state, 0);
+	ASSERT(p_state == USB_PIPE_STATE_IDLE);
+#endif /* USBGEM_DEBUG_LEVEL */
+
+	if (dp->ugc.usbgc_interrupt && dp->intr_pipe) {
+
+		/* make a request for interrupt */
+
+		req = usb_alloc_intr_req(dp->dip, 0, USB_FLAGS_SLEEP);
+		if (req == NULL) {
+			cmn_err(CE_WARN, "!%s: %s: failed to allocate intreq",
+			    dp->name, __func__);
+			goto x;
+		}
+		req->intr_data = NULL;
+		req->intr_client_private = (usb_opaque_t)dp;
+		req->intr_timeout = 0;
+		req->intr_attributes =
+		    USB_ATTRS_SHORT_XFER_OK | USB_ATTRS_AUTOCLEARING;
+		req->intr_len = dp->ep_intr->wMaxPacketSize;
+		req->intr_cb = usbgem_intr_cb;
+		req->intr_exc_cb = usbgem_intr_cb;
+		req->intr_completion_reason = 0;
+		req->intr_cb_flags = 0;
+
+		err = usb_pipe_intr_xfer(dp->intr_pipe, req, flags);
+		if (err != USB_SUCCESS) {
+			cmn_err(CE_WARN,
+			    "%s: err:%d failed to start polling of intr pipe",
+			    dp->name, err);
+			goto x;
+		}
+	}
+
+	/* kick off reception of the first packet */
+	if (usbgem_init_rx_buf(dp) != USB_SUCCESS) {
+		goto err_stop_intr;
+	}
+	dp->rx_active = B_TRUE;
+
+	return (USB_SUCCESS);
+
+err_stop_intr:
+	/* stop the interrupt pipe */
+	DPRINTF(0, (CE_CONT, "!%s: %s: FAILURE", dp->name, __func__));
+	if (dp->ugc.usbgc_interrupt && dp->intr_pipe) {
+		usb_pipe_stop_intr_polling(dp->intr_pipe, USB_FLAGS_SLEEP);
+	}
+x:
+	ASSERT(dp->mac_state == MAC_STATE_ONLINE);
+	/* we use another flag to indicate the error state. */
+	if (dp->fatal_error == (clock_t)0) {
+		dp->fatal_error = usbgem_timestamp_nz();
+	}
+	return (USB_FAILURE);
+}
+
+static int
+usbgem_mac_stop(struct usbgem_dev *dp, int new_state, boolean_t graceful)
+{
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/*
+	 * we must hold the writer lock on dev_state_lock
+	 */
+	ASSERT(new_state == MAC_STATE_STOPPED ||
+	    new_state == MAC_STATE_DISCONNECTED);
+
+	/* stop polling the interrupt pipe */
+	if (dp->ugc.usbgc_interrupt && dp->intr_pipe) {
+		usb_pipe_stop_intr_polling(dp->intr_pipe, USB_FLAGS_SLEEP);
+	}
+
+	if (new_state == MAC_STATE_STOPPED || graceful) {
+		/* stop the nic hardware completely */
+		if (usbgem_hal_stop_chip(dp) != USB_SUCCESS) {
+			(void) usbgem_hal_reset_chip(dp);
+		}
+	}
+
+	/* stop preparing new rx packets and sending new packets */
+	dp->mac_state = new_state;
+
+	/* other processors must see the new mac_state after this point */
+	membar_producer();
+
+	/* cancel all requests we have sent */
+	usb_pipe_reset(dp->dip, dp->bulkin_pipe, USB_FLAGS_SLEEP, NULL, 0);
+	usb_pipe_reset(dp->dip, dp->bulkout_pipe, USB_FLAGS_SLEEP, NULL, 0);
+
+	DPRINTF(0, (CE_CONT,
+	    "!%s: %s: rx_busy_cnt:%d tx_busy_cnt:%d",
+	    dp->name, __func__, dp->rx_busy_cnt, dp->tx_busy_cnt));
+
+	/*
+	 * At this point all rx requests have been cancelled and their
+	 * callback functions have been executed, because we called
+	 * usb_pipe_reset() synchronously.
+	 * So we just make sure that rx_busy_cnt drops to 0.
+ */ + mutex_enter(&dp->rxlock); + while (dp->rx_busy_cnt > 0) { + cv_wait(&dp->rx_drain_cv, &dp->rxlock); + } + mutex_exit(&dp->rxlock); + + DPRINTF(0, (CE_CONT, "!%s: %s: rx_busy_cnt is %d now", + dp->name, __func__, dp->rx_busy_cnt)); + + mutex_enter(&dp->txlock); + while (dp->tx_busy_cnt > 0) { + cv_wait(&dp->tx_drain_cv, &dp->txlock); + } + mutex_exit(&dp->txlock); + + DPRINTF(0, (CE_CONT, "!%s: %s: tx_busy_cnt is %d now", + dp->name, __func__, dp->tx_busy_cnt)); + + return (USB_SUCCESS); +} + +static int +usbgem_add_multicast(struct usbgem_dev *dp, const uint8_t *ep) +{ + int cnt; + int err; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + sema_p(&dp->rxfilter_lock); + if (dp->mc_count_req++ < USBGEM_MAXMC) { + /* append the new address at the end of the mclist */ + cnt = dp->mc_count; + bcopy(ep, dp->mc_list[cnt].addr.ether_addr_octet, + ETHERADDRL); + if (dp->ugc.usbgc_multicast_hash) { + dp->mc_list[cnt].hash = + (*dp->ugc.usbgc_multicast_hash)(dp, ep); + } + dp->mc_count = cnt + 1; + } + + if (dp->mc_count_req != dp->mc_count) { + /* multicast address list overflow */ + dp->rxmode |= RXMODE_MULTI_OVF; + } else { + dp->rxmode &= ~RXMODE_MULTI_OVF; + } + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + /* tell new multicast list to the hardware */ + err = usbgem_hal_set_rx_filter(dp); + } + sema_v(&dp->rxfilter_lock); + + return (err); +} + +static int +usbgem_remove_multicast(struct usbgem_dev *dp, const uint8_t *ep) +{ + size_t len; + int i; + int cnt; + int err; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + sema_p(&dp->rxfilter_lock); + dp->mc_count_req--; + cnt = dp->mc_count; + for (i = 0; i < cnt; i++) { + if (bcmp(ep, &dp->mc_list[i].addr, ETHERADDRL)) { + continue; + } + /* shrink the mclist by copying forward */ + len = (cnt - (i + 1)) * sizeof (*dp->mc_list); + if (len > 0) { + bcopy(&dp->mc_list[i+1], &dp->mc_list[i], len); + } + dp->mc_count--; + break; + } + + if (dp->mc_count_req != dp->mc_count) { + /* multicast address list overflow */ + dp->rxmode |= RXMODE_MULTI_OVF; + } else { + dp->rxmode &= ~RXMODE_MULTI_OVF; + } + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + err = usbgem_hal_set_rx_filter(dp); + } + sema_v(&dp->rxfilter_lock); + + return (err); +} + + +/* ============================================================== */ +/* + * ioctl + */ +/* ============================================================== */ +enum ioc_reply { + IOC_INVAL = -1, /* bad, NAK with EINVAL */ + IOC_DONE, /* OK, reply sent */ + IOC_ACK, /* OK, just send ACK */ + IOC_REPLY, /* OK, just send reply */ + IOC_RESTART_ACK, /* OK, restart & ACK */ + IOC_RESTART_REPLY /* OK, restart & reply */ +}; + + +#ifdef USBGEM_CONFIG_MAC_PROP +static int +usbgem_get_def_val(struct usbgem_dev *dp, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +{ + link_flowctrl_t fl; + int err = 0; + + ASSERT(pr_valsize > 0); + switch (pr_num) { + case MAC_PROP_AUTONEG: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + break; + + case MAC_PROP_FLOWCTRL: + if (pr_valsize < sizeof (link_flowctrl_t)) { + return (EINVAL); + } + switch (dp->ugc.usbgc_flow_control) { + case FLOW_CONTROL_NONE: + fl = LINK_FLOWCTRL_NONE; + break; + case FLOW_CONTROL_SYMMETRIC: + fl = LINK_FLOWCTRL_BI; + break; + case FLOW_CONTROL_TX_PAUSE: + fl = LINK_FLOWCTRL_TX; + break; + case FLOW_CONTROL_RX_PAUSE: + fl = LINK_FLOWCTRL_RX; + break; + } + bcopy(&fl, pr_val, sizeof (fl)); + break; + + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_EN_1000FDX_CAP: + 
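usbgem_remove_multicast() above deletes an entry from the flat mc_list array by copying the tail forward one slot, while mc_count_req keeps tracking what the caller asked for so list overflow can still be detected. The array-compaction step in isolation; memmove stands in for the kernel bcopy:

    #include <string.h>

    #define ETHERADDRL  6
    #define MAXMC       64      /* illustrative bound, like USBGEM_MAXMC */

    struct mcast_addr {
            unsigned char addr[ETHERADDRL];
            unsigned int hash;
    };

    static struct mcast_addr mc_list[MAXMC];
    static int mc_count;

    static void
    mc_remove(const unsigned char *ep)
    {
            int i;

            for (i = 0; i < mc_count; i++) {
                    if (memcmp(ep, mc_list[i].addr, ETHERADDRL) != 0)
                            continue;
                    /* close the gap by copying the tail forward one slot */
                    memmove(&mc_list[i], &mc_list[i + 1],
                        (mc_count - (i + 1)) * sizeof (mc_list[0]));
                    mc_count--;
                    break;
            }
    }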
*(uint8_t *)pr_val = + (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD); + break; + + case MAC_PROP_ADV_1000HDX_CAP: + case MAC_PROP_EN_1000HDX_CAP: + *(uint8_t *)pr_val = + (dp->mii_xstatus & MII_XSTATUS_1000BASET) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX); + break; + + case MAC_PROP_ADV_100T4_CAP: + case MAC_PROP_EN_100T4_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4); + break; + + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_EN_100FDX_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + break; + + case MAC_PROP_ADV_100HDX_CAP: + case MAC_PROP_EN_100HDX_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + break; + + case MAC_PROP_ADV_10FDX_CAP: + case MAC_PROP_EN_10FDX_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + break; + + case MAC_PROP_ADV_10HDX_CAP: + case MAC_PROP_EN_10HDX_CAP: + *(uint8_t *)pr_val = + BOOLEAN(dp->mii_status & MII_STATUS_10); + break; + + default: + err = ENOTSUP; + break; + } + return (err); +} + +#ifdef MAC_VERSION_V1 +static void +usbgem_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, + mac_prop_info_handle_t prh) +{ + struct usbgem_dev *dp = arg; + link_flowctrl_t fl; + + /* + * By default permissions are read/write unless specified + * otherwise by the driver. + */ + + switch (pr_num) { + case MAC_PROP_DUPLEX: + case MAC_PROP_SPEED: + case MAC_PROP_STATUS: + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_ADV_1000HDX_CAP: + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_ADV_100HDX_CAP: + case MAC_PROP_ADV_10FDX_CAP: + case MAC_PROP_ADV_10HDX_CAP: + case MAC_PROP_ADV_100T4_CAP: + case MAC_PROP_EN_100T4_CAP: + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + break; + + case MAC_PROP_EN_1000FDX_CAP: + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET_FD) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN( + dp->mii_xstatus & MII_XSTATUS_1000BASET_FD)); + } else if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX_FD) + == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN( + dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_1000HDX_CAP: + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN( + dp->mii_xstatus & MII_XSTATUS_1000BASET)); + } else if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN( + dp->mii_xstatus & MII_XSTATUS_1000BASEX)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_100FDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_100_BASEX_FD) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_100HDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_100_BASEX) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_10FDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_10_FD) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN(dp->mii_status & MII_STATUS_10_FD)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_EN_10HDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_10) == 0) { + mac_prop_info_set_default_uint8(prh, + 
BOOLEAN(dp->mii_status & MII_STATUS_10)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_AUTONEG: + if ((dp->mii_status_ro & MII_STATUS_CANAUTONEG) == 0) { + mac_prop_info_set_default_uint8(prh, + BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG)); + } else { + mac_prop_info_set_perm(prh, MAC_PROP_PERM_READ); + } + break; + + case MAC_PROP_FLOWCTRL: + switch (dp->ugc.usbgc_flow_control) { + case FLOW_CONTROL_NONE: + fl = LINK_FLOWCTRL_NONE; + break; + case FLOW_CONTROL_SYMMETRIC: + fl = LINK_FLOWCTRL_BI; + break; + case FLOW_CONTROL_TX_PAUSE: + fl = LINK_FLOWCTRL_TX; + break; + case FLOW_CONTROL_RX_PAUSE: + fl = LINK_FLOWCTRL_RX; + break; + } + mac_prop_info_set_default_link_flowctrl(prh, fl); + break; + + case MAC_PROP_MTU: + mac_prop_info_set_range_uint32(prh, + dp->ugc.usbgc_min_mtu, dp->ugc.usbgc_max_mtu); + break; + + case MAC_PROP_PRIVATE: + break; + } +} +#endif + +static int +usbgem_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, const void *pr_val) +{ + struct usbgem_dev *dp = arg; + int err = 0; + boolean_t update = B_FALSE; + link_flowctrl_t flowctrl; + uint32_t cur_mtu, new_mtu; + + rw_enter(&dp->dev_state_lock, RW_WRITER); + switch (pr_num) { + case MAC_PROP_EN_1000FDX_CAP: + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET_FD) == 0 || + (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX_FD) == 0) { + if (dp->anadv_1000fdx != *(uint8_t *)pr_val) { + dp->anadv_1000fdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_1000HDX_CAP: + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET) == 0 || + (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX) == 0) { + if (dp->anadv_1000hdx != *(uint8_t *)pr_val) { + dp->anadv_1000hdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_100FDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_100_BASEX_FD) == 0) { + if (dp->anadv_100fdx != *(uint8_t *)pr_val) { + dp->anadv_100fdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_100HDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_100_BASEX) == 0) { + if (dp->anadv_100hdx != *(uint8_t *)pr_val) { + dp->anadv_100hdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_10FDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_10_FD) == 0) { + if (dp->anadv_10fdx != *(uint8_t *)pr_val) { + dp->anadv_10fdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_EN_10HDX_CAP: + if ((dp->mii_status_ro & MII_STATUS_10_FD) == 0) { + if (dp->anadv_10hdx != *(uint8_t *)pr_val) { + dp->anadv_10hdx = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_AUTONEG: + if ((dp->mii_status_ro & MII_STATUS_CANAUTONEG) == 0) { + if (dp->anadv_autoneg != *(uint8_t *)pr_val) { + dp->anadv_autoneg = *(uint8_t *)pr_val; + update = B_TRUE; + } + } else { + err = ENOTSUP; + } + break; + + case MAC_PROP_FLOWCTRL: + bcopy(pr_val, &flowctrl, sizeof (flowctrl)); + + switch (flowctrl) { + default: + err = EINVAL; + break; + + case LINK_FLOWCTRL_NONE: + if (dp->flow_control != FLOW_CONTROL_NONE) { + dp->flow_control = FLOW_CONTROL_NONE; + update = B_TRUE; + } + break; + + case LINK_FLOWCTRL_RX: + if (dp->flow_control != FLOW_CONTROL_RX_PAUSE) { + dp->flow_control = FLOW_CONTROL_RX_PAUSE; + update = B_TRUE; + } + break; + + case LINK_FLOWCTRL_TX: + if (dp->flow_control 
!= FLOW_CONTROL_TX_PAUSE) { + dp->flow_control = FLOW_CONTROL_TX_PAUSE; + update = B_TRUE; + } + break; + + case LINK_FLOWCTRL_BI: + if (dp->flow_control != FLOW_CONTROL_SYMMETRIC) { + dp->flow_control = FLOW_CONTROL_SYMMETRIC; + update = B_TRUE; + } + break; + } + break; + + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_ADV_1000HDX_CAP: + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_ADV_100HDX_CAP: + case MAC_PROP_ADV_10FDX_CAP: + case MAC_PROP_ADV_10HDX_CAP: + case MAC_PROP_STATUS: + case MAC_PROP_SPEED: + case MAC_PROP_DUPLEX: + err = ENOTSUP; /* read-only prop. Can't set this. */ + break; + + case MAC_PROP_MTU: + bcopy(pr_val, &new_mtu, sizeof (new_mtu)); + if (new_mtu != dp->mtu) { + err = EINVAL; + } + break; + + case MAC_PROP_PRIVATE: + err = ENOTSUP; + break; + + default: + err = ENOTSUP; + break; + } + + if (update) { + /* sync with PHY */ + usbgem_choose_forcedmode(dp); + dp->mii_state = MII_STATE_UNKNOWN; + cv_signal(&dp->link_watcher_wait_cv); + } + rw_exit(&dp->dev_state_lock); + return (err); +} + +static int +#ifdef MAC_VERSION_V1 +usbgem_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_valsize, void *pr_val) +#else +usbgem_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, + uint_t pr_flags, uint_t pr_valsize, void *pr_val, uint_t *perm) +#endif +{ + struct usbgem_dev *dp = arg; + int err = 0; + link_flowctrl_t flowctrl; + uint64_t tmp = 0; + + if (pr_valsize == 0) { + return (EINVAL); + } +#ifndef MAC_VERSION_V1 + *perm = MAC_PROP_PERM_RW; +#endif + bzero(pr_val, pr_valsize); +#ifndef MAC_VERSION_V1 + if ((pr_flags & MAC_PROP_DEFAULT) && (pr_num != MAC_PROP_PRIVATE)) { + return (usbgem_get_def_val(dp, pr_num, pr_valsize, pr_val)); + } +#endif + rw_enter(&dp->dev_state_lock, RW_READER); + switch (pr_num) { + case MAC_PROP_DUPLEX: +#ifndef MAC_VERSION_V1 + *perm = MAC_PROP_PERM_READ; +#endif + if (pr_valsize >= sizeof (link_duplex_t)) { + if (dp->mii_state != MII_STATE_LINKUP) { + *(link_duplex_t *)pr_val = LINK_DUPLEX_UNKNOWN; + } else if (dp->full_duplex) { + *(link_duplex_t *)pr_val = LINK_DUPLEX_FULL; + } else { + *(link_duplex_t *)pr_val = LINK_DUPLEX_HALF; + } + } else { + err = EINVAL; + } + break; + case MAC_PROP_SPEED: +#ifndef MAC_VERSION_V1 + *perm = MAC_PROP_PERM_READ; +#endif + if (pr_valsize >= sizeof (uint64_t)) { + switch (dp->speed) { + case USBGEM_SPD_1000: + tmp = 1000000000; + break; + case USBGEM_SPD_100: + tmp = 100000000; + break; + case USBGEM_SPD_10: + tmp = 10000000; + break; + default: + tmp = 0; + } + bcopy(&tmp, pr_val, sizeof (tmp)); + } else { + err = EINVAL; + } + break; + + case MAC_PROP_AUTONEG: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_CANAUTONEG) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_autoneg; + break; + + case MAC_PROP_FLOWCTRL: + if (pr_valsize >= sizeof (link_flowctrl_t)) { + switch (dp->flow_control) { + case FLOW_CONTROL_NONE: + flowctrl = LINK_FLOWCTRL_NONE; + break; + case FLOW_CONTROL_RX_PAUSE: + flowctrl = LINK_FLOWCTRL_RX; + break; + case FLOW_CONTROL_TX_PAUSE: + flowctrl = LINK_FLOWCTRL_TX; + break; + case FLOW_CONTROL_SYMMETRIC: + flowctrl = LINK_FLOWCTRL_BI; + break; + } + bcopy(&flowctrl, pr_val, sizeof (flowctrl)); + } else { + err = EINVAL; + } + break; + + case MAC_PROP_ADV_1000FDX_CAP: + case MAC_PROP_ADV_1000HDX_CAP: + case MAC_PROP_ADV_100FDX_CAP: + case MAC_PROP_ADV_100HDX_CAP: + case MAC_PROP_ADV_10FDX_CAP: + case MAC_PROP_ADV_10HDX_CAP: + case MAC_PROP_ADV_100T4_CAP: + usbgem_get_def_val(dp, pr_num, pr_valsize, pr_val); 
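The set-property handler above follows one pattern throughout: reject values the PHY cannot support, record a change only when the value actually differs, and kick the link watcher once at the end so auto-negotiation restarts at most once per call. The skeleton of that change-detection pattern, with illustrative names:

    #include <stdbool.h>
    #include <stdint.h>

    static uint8_t anadv_100fdx;    /* advertised 100Mb full-duplex */
    static bool need_renegotiate;

    /* returns 0, or -1 where the driver would return ENOTSUP */
    static int
    set_adv_100fdx(uint8_t val, bool phy_supports_100fdx)
    {
            if (!phy_supports_100fdx)
                    return (-1);
            if (anadv_100fdx != val) {
                    anadv_100fdx = val;
                    /* defer the restart: kick the link watcher once, later */
                    need_renegotiate = true;
            }
            return (0);
    }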
+ break; + + case MAC_PROP_EN_1000FDX_CAP: +#ifndef MAC_VERSION_V1 + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET_FD) && + (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX_FD)) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_1000fdx; + break; + + case MAC_PROP_EN_1000HDX_CAP: +#ifndef MAC_VERSION_V1 + if ((dp->mii_xstatus_ro & MII_XSTATUS_1000BASET) && + (dp->mii_xstatus_ro & MII_XSTATUS_1000BASEX)) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_1000hdx; + break; + + case MAC_PROP_EN_100FDX_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_100_BASEX_FD) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_100fdx; + break; + + case MAC_PROP_EN_100HDX_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_100_BASEX) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_100hdx; + break; + + case MAC_PROP_EN_10FDX_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_10_FD) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_10fdx; + break; + + case MAC_PROP_EN_10HDX_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_10) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_10hdx; + break; + + case MAC_PROP_EN_100T4_CAP: +#ifndef MAC_VERSION_V1 + if (dp->mii_status_ro & MII_STATUS_100_BASE_T4) { + *perm = MAC_PROP_PERM_READ; + } +#endif + *(uint8_t *)pr_val = dp->anadv_100t4; + break; + + case MAC_PROP_PRIVATE: + err = ENOTSUP; + break; + +#ifndef MAC_VERSION_V1 + case MAC_PROP_MTU: { + mac_propval_range_t range; + if (!(pr_flags & MAC_PROP_POSSIBLE)) { + err = ENOTSUP; + break; + } + if (pr_valsize < sizeof (mac_propval_range_t)) { + err = EINVAL; + break; + } + range.mpr_count = 1; + range.mpr_type = MAC_PROPVAL_UINT32; + range.range_uint32[0].mpur_min = ETHERMTU; + range.range_uint32[0].mpur_max = dp->mtu; + bcopy(&range, pr_val, sizeof (range)); + break; + } +#endif + default: + err = ENOTSUP; + break; + } + + rw_exit(&dp->dev_state_lock); + return (err); +} +#endif /* USBGEM_CONFIG_MAC_PROP */ + +#ifdef USBGEM_CONFIG_ND +/* ============================================================== */ +/* + * ND interface + */ +/* ============================================================== */ +enum { + PARAM_AUTONEG_CAP, + PARAM_PAUSE_CAP, + PARAM_ASYM_PAUSE_CAP, + PARAM_1000FDX_CAP, + PARAM_1000HDX_CAP, + PARAM_100T4_CAP, + PARAM_100FDX_CAP, + PARAM_100HDX_CAP, + PARAM_10FDX_CAP, + PARAM_10HDX_CAP, + + PARAM_ADV_AUTONEG_CAP, + PARAM_ADV_PAUSE_CAP, + PARAM_ADV_ASYM_PAUSE_CAP, + PARAM_ADV_1000FDX_CAP, + PARAM_ADV_1000HDX_CAP, + PARAM_ADV_100T4_CAP, + PARAM_ADV_100FDX_CAP, + PARAM_ADV_100HDX_CAP, + PARAM_ADV_10FDX_CAP, + PARAM_ADV_10HDX_CAP, + PARAM_ADV_1000T_MS, + + PARAM_LP_AUTONEG_CAP, + PARAM_LP_PAUSE_CAP, + PARAM_LP_ASYM_PAUSE_CAP, + PARAM_LP_1000FDX_CAP, + PARAM_LP_1000HDX_CAP, + PARAM_LP_100T4_CAP, + PARAM_LP_100FDX_CAP, + PARAM_LP_100HDX_CAP, + PARAM_LP_10FDX_CAP, + PARAM_LP_10HDX_CAP, + + PARAM_LINK_STATUS, + PARAM_LINK_SPEED, + PARAM_LINK_DUPLEX, + + PARAM_LINK_AUTONEG, + PARAM_LINK_RX_PAUSE, + PARAM_LINK_TX_PAUSE, + + PARAM_LOOP_MODE, + PARAM_MSI_CNT, +#ifdef DEBUG_RESUME + PARAM_RESUME_TEST, +#endif + + PARAM_COUNT +}; + +struct usbgem_nd_arg { + struct usbgem_dev *dp; + int item; +}; + +static int +usbgem_param_get(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *credp) +{ + struct usbgem_dev *dp = ((struct usbgem_nd_arg *)(void *)arg)->dp; + int item = ((struct usbgem_nd_arg *)(void 
*)arg)->item; + long val; + + DPRINTF(1, (CE_CONT, "!%s: %s: called, item:%d", + dp->name, __func__, item)); + + switch (item) { + case PARAM_AUTONEG_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + DPRINTF(1, (CE_CONT, "autoneg_cap:%d", val)); + break; + + case PARAM_PAUSE_CAP: + val = dp->ugc.usbgc_flow_control != FLOW_CONTROL_NONE; + break; + + case PARAM_ASYM_PAUSE_CAP: + val = dp->ugc.usbgc_flow_control > FLOW_CONTROL_SYMMETRIC; + break; + + case PARAM_1000FDX_CAP: + val = (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD); + break; + + case PARAM_1000HDX_CAP: + val = (dp->mii_xstatus & MII_XSTATUS_1000BASET) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX); + break; + + case PARAM_100T4_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4); + break; + + case PARAM_100FDX_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + break; + + case PARAM_100HDX_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + break; + + case PARAM_10FDX_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + break; + + case PARAM_10HDX_CAP: + val = BOOLEAN(dp->mii_status & MII_STATUS_10); + break; + + case PARAM_ADV_AUTONEG_CAP: + val = dp->anadv_autoneg; + break; + + case PARAM_ADV_PAUSE_CAP: + val = dp->anadv_pause; + break; + + case PARAM_ADV_ASYM_PAUSE_CAP: + val = dp->anadv_asmpause; + break; + + case PARAM_ADV_1000FDX_CAP: + val = dp->anadv_1000fdx; + break; + + case PARAM_ADV_1000HDX_CAP: + val = dp->anadv_1000hdx; + break; + + case PARAM_ADV_100T4_CAP: + val = dp->anadv_100t4; + break; + + case PARAM_ADV_100FDX_CAP: + val = dp->anadv_100fdx; + break; + + case PARAM_ADV_100HDX_CAP: + val = dp->anadv_100hdx; + break; + + case PARAM_ADV_10FDX_CAP: + val = dp->anadv_10fdx; + break; + + case PARAM_ADV_10HDX_CAP: + val = dp->anadv_10hdx; + break; + + case PARAM_ADV_1000T_MS: + val = dp->anadv_1000t_ms; + break; + + case PARAM_LP_AUTONEG_CAP: + val = BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + break; + + case PARAM_LP_PAUSE_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_PAUSE); + break; + + case PARAM_LP_ASYM_PAUSE_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_ASM_DIR); + break; + + case PARAM_LP_1000FDX_CAP: + val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_FULL); + break; + + case PARAM_LP_1000HDX_CAP: + val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_HALF); + break; + + case PARAM_LP_100T4_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_T4); + break; + + case PARAM_LP_100FDX_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX_FD); + break; + + case PARAM_LP_100HDX_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX); + break; + + case PARAM_LP_10FDX_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T_FD); + break; + + case PARAM_LP_10HDX_CAP: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T); + break; + + case PARAM_LINK_STATUS: + val = (dp->mii_state == MII_STATE_LINKUP); + break; + + case PARAM_LINK_SPEED: + val = usbgem_speed_value[dp->speed]; + break; + + case PARAM_LINK_DUPLEX: + val = 0; + if (dp->mii_state == MII_STATE_LINKUP) { + val = dp->full_duplex ? 
2 : 1; + } + break; + + case PARAM_LINK_AUTONEG: + val = BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + break; + + case PARAM_LINK_RX_PAUSE: + val = (dp->flow_control == FLOW_CONTROL_SYMMETRIC) || + (dp->flow_control == FLOW_CONTROL_RX_PAUSE); + break; + + case PARAM_LINK_TX_PAUSE: + val = (dp->flow_control == FLOW_CONTROL_SYMMETRIC) || + (dp->flow_control == FLOW_CONTROL_TX_PAUSE); + break; + +#ifdef DEBUG_RESUME + case PARAM_RESUME_TEST: + val = 0; + break; +#endif + default: + cmn_err(CE_WARN, "%s: unimplemented ndd control (%d)", + dp->name, item); + break; + } + + (void) mi_mpprintf(mp, "%ld", val); + + return (0); +} + +static int +usbgem_param_set(queue_t *q, + mblk_t *mp, char *value, caddr_t arg, cred_t *credp) +{ + struct usbgem_dev *dp = ((struct usbgem_nd_arg *)(void *)arg)->dp; + int item = ((struct usbgem_nd_arg *)(void *)arg)->item; + long val; + char *end; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + if (ddi_strtol(value, &end, 10, &val)) { + return (EINVAL); + } + if (end == value) { + return (EINVAL); + } + + switch (item) { + case PARAM_ADV_AUTONEG_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_CANAUTONEG) == 0) { + goto err; + } + dp->anadv_autoneg = (int)val; + break; + + case PARAM_ADV_PAUSE_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && dp->ugc.usbgc_flow_control == FLOW_CONTROL_NONE) { + goto err; + } + dp->anadv_pause = (int)val; + break; + + case PARAM_ADV_ASYM_PAUSE_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && + dp->ugc.usbgc_flow_control <= FLOW_CONTROL_SYMMETRIC) { + goto err; + } + dp->anadv_asmpause = (int)val; + break; + + case PARAM_ADV_1000FDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_xstatus & + (MII_XSTATUS_1000BASET_FD | + MII_XSTATUS_1000BASEX_FD)) == 0) { + goto err; + } + dp->anadv_1000fdx = (int)val; + break; + + case PARAM_ADV_1000HDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_xstatus & + (MII_XSTATUS_1000BASET | MII_XSTATUS_1000BASEX)) == 0) { + goto err; + } + dp->anadv_1000hdx = (int)val; + break; + + case PARAM_ADV_100T4_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_100_BASE_T4) == 0) { + goto err; + } + dp->anadv_100t4 = (int)val; + break; + + case PARAM_ADV_100FDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_100_BASEX_FD) == 0) { + goto err; + } + dp->anadv_100fdx = (int)val; + break; + + case PARAM_ADV_100HDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_100_BASEX) == 0) { + goto err; + } + dp->anadv_100hdx = (int)val; + break; + + case PARAM_ADV_10FDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_10_FD) == 0) { + goto err; + } + dp->anadv_10fdx = (int)val; + break; + + case PARAM_ADV_10HDX_CAP: + if (val != 0 && val != 1) { + goto err; + } + if (val && (dp->mii_status & MII_STATUS_10) == 0) { + goto err; + } + dp->anadv_10hdx = (int)val; + break; + + case PARAM_ADV_1000T_MS: + if (val != 0 && val != 1 && val != 2) { + goto err; + } + if (val && (dp->mii_xstatus & + (MII_XSTATUS_1000BASET | MII_XSTATUS_1000BASET_FD)) == 0) { + goto err; + } + dp->anadv_1000t_ms = (int)val; + break; + +#ifdef DEBUG_RESUME + case PARAM_RESUME_TEST: + mutex_exit(&dp->xmitlock); + mutex_exit(&dp->intrlock); + gem_suspend(dp->dip); + gem_resume(dp->dip); + mutex_enter(&dp->intrlock); + mutex_enter(&dp->xmitlock); + 
break; +#endif + } + + /* sync with PHY */ + usbgem_choose_forcedmode(dp); + + dp->mii_state = MII_STATE_UNKNOWN; + if (dp->ugc.usbgc_mii_hw_link_detection) { + /* wake up link watcher possiblely sleeps */ + cv_signal(&dp->link_watcher_wait_cv); + } + + return (0); +err: + return (EINVAL); +} + +static void +usbgem_nd_load(struct usbgem_dev *dp, + char *name, ndgetf_t gf, ndsetf_t sf, int item) +{ + struct usbgem_nd_arg *arg; + + ASSERT(item >= 0); + ASSERT(item < PARAM_COUNT); + + arg = &((struct usbgem_nd_arg *)(void *)dp->nd_arg_p)[item]; + arg->dp = dp; + arg->item = item; + + DPRINTF(2, (CE_CONT, "!%s: %s: name:%s, item:%d", + dp->name, __func__, name, item)); + (void) nd_load(&dp->nd_data_p, name, gf, sf, (caddr_t)arg); +} + +static void +usbgem_nd_setup(struct usbgem_dev *dp) +{ + DPRINTF(1, (CE_CONT, "!%s: %s: called, mii_status:0x%b", + dp->name, __func__, dp->mii_status, MII_STATUS_BITS)); + + ASSERT(dp->nd_arg_p == NULL); + + dp->nd_arg_p = + kmem_zalloc(sizeof (struct usbgem_nd_arg) * PARAM_COUNT, KM_SLEEP); + +#define SETFUNC(x) ((x) ? usbgem_param_set : NULL) + + usbgem_nd_load(dp, "autoneg_cap", + usbgem_param_get, NULL, PARAM_AUTONEG_CAP); + usbgem_nd_load(dp, "pause_cap", + usbgem_param_get, NULL, PARAM_PAUSE_CAP); + usbgem_nd_load(dp, "asym_pause_cap", + usbgem_param_get, NULL, PARAM_ASYM_PAUSE_CAP); + usbgem_nd_load(dp, "1000fdx_cap", + usbgem_param_get, NULL, PARAM_1000FDX_CAP); + usbgem_nd_load(dp, "1000hdx_cap", + usbgem_param_get, NULL, PARAM_1000HDX_CAP); + usbgem_nd_load(dp, "100T4_cap", + usbgem_param_get, NULL, PARAM_100T4_CAP); + usbgem_nd_load(dp, "100fdx_cap", + usbgem_param_get, NULL, PARAM_100FDX_CAP); + usbgem_nd_load(dp, "100hdx_cap", + usbgem_param_get, NULL, PARAM_100HDX_CAP); + usbgem_nd_load(dp, "10fdx_cap", + usbgem_param_get, NULL, PARAM_10FDX_CAP); + usbgem_nd_load(dp, "10hdx_cap", + usbgem_param_get, NULL, PARAM_10HDX_CAP); + + /* Our advertised capabilities */ + usbgem_nd_load(dp, "adv_autoneg_cap", usbgem_param_get, + SETFUNC(dp->mii_status & MII_STATUS_CANAUTONEG), + PARAM_ADV_AUTONEG_CAP); + usbgem_nd_load(dp, "adv_pause_cap", usbgem_param_get, + SETFUNC(dp->ugc.usbgc_flow_control & 1), + PARAM_ADV_PAUSE_CAP); + usbgem_nd_load(dp, "adv_asym_pause_cap", usbgem_param_get, + SETFUNC(dp->ugc.usbgc_flow_control & 2), + PARAM_ADV_ASYM_PAUSE_CAP); + usbgem_nd_load(dp, "adv_1000fdx_cap", usbgem_param_get, + SETFUNC(dp->mii_xstatus & + (MII_XSTATUS_1000BASEX_FD | MII_XSTATUS_1000BASET_FD)), + PARAM_ADV_1000FDX_CAP); + usbgem_nd_load(dp, "adv_1000hdx_cap", usbgem_param_get, + SETFUNC(dp->mii_xstatus & + (MII_XSTATUS_1000BASEX | MII_XSTATUS_1000BASET)), + PARAM_ADV_1000HDX_CAP); + usbgem_nd_load(dp, "adv_100T4_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_100_BASE_T4) && + !dp->mii_advert_ro), + PARAM_ADV_100T4_CAP); + usbgem_nd_load(dp, "adv_100fdx_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_100_BASEX_FD) && + !dp->mii_advert_ro), + PARAM_ADV_100FDX_CAP); + usbgem_nd_load(dp, "adv_100hdx_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_100_BASEX) && + !dp->mii_advert_ro), + PARAM_ADV_100HDX_CAP); + usbgem_nd_load(dp, "adv_10fdx_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_10_FD) && + !dp->mii_advert_ro), + PARAM_ADV_10FDX_CAP); + usbgem_nd_load(dp, "adv_10hdx_cap", usbgem_param_get, + SETFUNC((dp->mii_status & MII_STATUS_10) && + !dp->mii_advert_ro), + PARAM_ADV_10HDX_CAP); + usbgem_nd_load(dp, "adv_1000t_ms", usbgem_param_get, + SETFUNC(dp->mii_xstatus & + (MII_XSTATUS_1000BASET_FD | 
MII_XSTATUS_1000BASET)), + PARAM_ADV_1000T_MS); + + + /* Partner's advertised capabilities */ + usbgem_nd_load(dp, "lp_autoneg_cap", + usbgem_param_get, NULL, PARAM_LP_AUTONEG_CAP); + usbgem_nd_load(dp, "lp_pause_cap", + usbgem_param_get, NULL, PARAM_LP_PAUSE_CAP); + usbgem_nd_load(dp, "lp_asym_pause_cap", + usbgem_param_get, NULL, PARAM_LP_ASYM_PAUSE_CAP); + usbgem_nd_load(dp, "lp_1000fdx_cap", + usbgem_param_get, NULL, PARAM_LP_1000FDX_CAP); + usbgem_nd_load(dp, "lp_1000hdx_cap", + usbgem_param_get, NULL, PARAM_LP_1000HDX_CAP); + usbgem_nd_load(dp, "lp_100T4_cap", + usbgem_param_get, NULL, PARAM_LP_100T4_CAP); + usbgem_nd_load(dp, "lp_100fdx_cap", + usbgem_param_get, NULL, PARAM_LP_100FDX_CAP); + usbgem_nd_load(dp, "lp_100hdx_cap", + usbgem_param_get, NULL, PARAM_LP_100HDX_CAP); + usbgem_nd_load(dp, "lp_10fdx_cap", + usbgem_param_get, NULL, PARAM_LP_10FDX_CAP); + usbgem_nd_load(dp, "lp_10hdx_cap", + usbgem_param_get, NULL, PARAM_LP_10HDX_CAP); + + /* Current operating modes */ + usbgem_nd_load(dp, "link_status", + usbgem_param_get, NULL, PARAM_LINK_STATUS); + usbgem_nd_load(dp, "link_speed", + usbgem_param_get, NULL, PARAM_LINK_SPEED); + usbgem_nd_load(dp, "link_duplex", + usbgem_param_get, NULL, PARAM_LINK_DUPLEX); + usbgem_nd_load(dp, "link_autoneg", + usbgem_param_get, NULL, PARAM_LINK_AUTONEG); + usbgem_nd_load(dp, "link_rx_pause", + usbgem_param_get, NULL, PARAM_LINK_RX_PAUSE); + usbgem_nd_load(dp, "link_tx_pause", + usbgem_param_get, NULL, PARAM_LINK_TX_PAUSE); +#ifdef DEBUG_RESUME + usbgem_nd_load(dp, "resume_test", + usbgem_param_get, usbgem_param_set, PARAM_RESUME_TEST); +#endif +#undef SETFUNC +} + +static +enum ioc_reply +usbgem_nd_ioctl(struct usbgem_dev *dp, + queue_t *wq, mblk_t *mp, struct iocblk *iocp) +{ + boolean_t ok; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + switch (iocp->ioc_cmd) { + case ND_GET: + ok = nd_getset(wq, dp->nd_data_p, mp); + DPRINTF(1, (CE_CONT, + "%s: get %s", dp->name, ok ? "OK" : "FAIL")); + return (ok ? IOC_REPLY : IOC_INVAL); + + case ND_SET: + ok = nd_getset(wq, dp->nd_data_p, mp); + + DPRINTF(1, (CE_CONT, "%s: set %s err %d", + dp->name, ok ? "OK" : "FAIL", iocp->ioc_error)); + + if (!ok) { + return (IOC_INVAL); + } + + if (iocp->ioc_error) { + return (IOC_REPLY); + } + + return (IOC_RESTART_REPLY); + } + + cmn_err(CE_WARN, "%s: invalid cmd 0x%x", dp->name, iocp->ioc_cmd); + + return (IOC_INVAL); +} + +static void +usbgem_nd_cleanup(struct usbgem_dev *dp) +{ + ASSERT(dp->nd_data_p != NULL); + ASSERT(dp->nd_arg_p != NULL); + + nd_free(&dp->nd_data_p); + + kmem_free(dp->nd_arg_p, sizeof (struct usbgem_nd_arg) * PARAM_COUNT); + dp->nd_arg_p = NULL; +} +#endif /* USBGEM_CONFIG_ND */ + +static void +usbgem_mac_ioctl(struct usbgem_dev *dp, queue_t *wq, mblk_t *mp) +{ + struct iocblk *iocp; + enum ioc_reply status; + int cmd; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* + * Validate the command before bothering with the mutex ... + */ + iocp = (void *)mp->b_rptr; + iocp->ioc_error = 0; + cmd = iocp->ioc_cmd; + + DPRINTF(1, (CE_CONT, "%s: %s cmd:0x%x", dp->name, __func__, cmd)); + +#ifdef USBGEM_CONFIG_ND + switch (cmd) { + default: + _NOTE(NOTREACHED) + status = IOC_INVAL; + break; + + case ND_GET: + case ND_SET: + status = usbgem_nd_ioctl(dp, wq, mp, iocp); + break; + } + + /* + * Finally, decide how to reply + */ + switch (status) { + default: + case IOC_INVAL: + /* + * Error, reply with a NAK and EINVAL or the specified error + */ + miocnak(wq, mp, 0, iocp->ioc_error == 0 ? 
+ EINVAL : iocp->ioc_error); + break; + + case IOC_DONE: + /* + * OK, reply already sent + */ + break; + + case IOC_RESTART_ACK: + case IOC_ACK: + /* + * OK, reply with an ACK + */ + miocack(wq, mp, 0, 0); + break; + + case IOC_RESTART_REPLY: + case IOC_REPLY: + /* + * OK, send prepared reply as ACK or NAK + */ + mp->b_datap->db_type = + iocp->ioc_error == 0 ? M_IOCACK : M_IOCNAK; + qreply(wq, mp); + break; + } +#else + miocnak(wq, mp, 0, EINVAL); + return; +#endif /* USBGEM_CONFIG_GLDv3 */ +} + +#ifndef SYS_MAC_H +#define XCVR_UNDEFINED 0 +#define XCVR_NONE 1 +#define XCVR_10 2 +#define XCVR_100T4 3 +#define XCVR_100X 4 +#define XCVR_100T2 5 +#define XCVR_1000X 6 +#define XCVR_1000T 7 +#endif +static int +usbgem_mac_xcvr_inuse(struct usbgem_dev *dp) +{ + int val = XCVR_UNDEFINED; + + if ((dp->mii_status & MII_STATUS_XSTATUS) == 0) { + if (dp->mii_status & MII_STATUS_100_BASE_T4) { + val = XCVR_100T4; + } else if (dp->mii_status & + (MII_STATUS_100_BASEX_FD | + MII_STATUS_100_BASEX)) { + val = XCVR_100X; + } else if (dp->mii_status & + (MII_STATUS_100_BASE_T2_FD | + MII_STATUS_100_BASE_T2)) { + val = XCVR_100T2; + } else if (dp->mii_status & + (MII_STATUS_10_FD | MII_STATUS_10)) { + val = XCVR_10; + } + } else if (dp->mii_xstatus & + (MII_XSTATUS_1000BASET_FD | MII_XSTATUS_1000BASET)) { + val = XCVR_1000T; + } else if (dp->mii_xstatus & + (MII_XSTATUS_1000BASEX_FD | MII_XSTATUS_1000BASEX)) { + val = XCVR_1000X; + } + + return (val); +} + +#ifdef USBGEM_CONFIG_GLDv3 +/* ============================================================== */ +/* + * GLDv3 interface + */ +/* ============================================================== */ +static int usbgem_m_getstat(void *, uint_t, uint64_t *); +static int usbgem_m_start(void *); +static void usbgem_m_stop(void *); +static int usbgem_m_setpromisc(void *, boolean_t); +static int usbgem_m_multicst(void *, boolean_t, const uint8_t *); +static int usbgem_m_unicst(void *, const uint8_t *); +static mblk_t *usbgem_m_tx(void *, mblk_t *); +static void usbgem_m_ioctl(void *, queue_t *, mblk_t *); +#ifdef GEM_CONFIG_MAC_PROP +static int usbgem_m_setprop(void *, const char *, mac_prop_id_t, + uint_t, const void *); +#ifdef MAC_VERSION_V1 +static int usbgem_m_getprop(void *, const char *, mac_prop_id_t, + uint_t, void *); +#else +static int usbgem_m_getprop(void *, const char *, mac_prop_id_t, + uint_t, uint_t, void *, uint_t *); +#endif +#endif + +#ifdef _SYS_MAC_PROVIDER_H +#define GEM_M_CALLBACK_FLAGS (MC_IOCTL) +#else +#define GEM_M_CALLBACK_FLAGS (MC_IOCTL) +#endif + +static mac_callbacks_t gem_m_callbacks = { +#ifdef USBGEM_CONFIG_MAC_PROP +#ifdef MAC_VERSION_V1 + GEM_M_CALLBACK_FLAGS | MC_SETPROP | MC_GETPROP | MC_PROPINFO, +#else + GEM_M_CALLBACK_FLAGS | MC_SETPROP | MC_GETPROP, +#endif +#else + GEM_M_CALLBACK_FLAGS, +#endif + usbgem_m_getstat, + usbgem_m_start, + usbgem_m_stop, + usbgem_m_setpromisc, + usbgem_m_multicst, + usbgem_m_unicst, + usbgem_m_tx, +#ifdef _SYS_MAC_PROVIDER_H +#ifdef MAC_VERSION_V1 + NULL, +#endif +#else + NULL, /* m_resources */ +#endif + usbgem_m_ioctl, + NULL, /* m_getcapab */ +#ifdef USBGEM_CONFIG_MAC_PROP + NULL, + NULL, + usbgem_m_setprop, + usbgem_m_getprop, +#endif +#ifdef MAC_VERSION_V1 + usbgem_m_propinfo, +#endif +}; + +static int +usbgem_m_start(void *arg) +{ + int ret; + int err; + struct usbgem_dev *dp = arg; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + err = EIO; + + rw_enter(&dp->dev_state_lock, RW_WRITER); + dp->nic_state = NIC_STATE_ONLINE; + + if (dp->mac_state == 
MAC_STATE_DISCONNECTED) { + err = 0; + goto x; + } + if (usbgem_mac_init(dp) != USB_SUCCESS) { + goto x; + } + + /* initialize rx filter state */ + sema_p(&dp->rxfilter_lock); + dp->mc_count = 0; + dp->mc_count_req = 0; + + bcopy(dp->dev_addr.ether_addr_octet, + dp->cur_addr.ether_addr_octet, ETHERADDRL); + dp->rxmode |= RXMODE_ENABLE; + + ret = usbgem_hal_set_rx_filter(dp); + sema_v(&dp->rxfilter_lock); + + if (ret != USB_SUCCESS) { + goto x; + } + + if (dp->mii_state == MII_STATE_LINKUP) { + /* setup media mode if the link have been up */ + if (usbgem_hal_set_media(dp) != USB_SUCCESS) { + goto x; + } + if (usbgem_mac_start(dp) != USB_SUCCESS) { + goto x; + } + } + + err = 0; +x: + rw_exit(&dp->dev_state_lock); + return (err); +} + +static void +usbgem_m_stop(void *arg) +{ + struct usbgem_dev *dp = arg; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* stop rx gracefully */ + rw_enter(&dp->dev_state_lock, RW_READER); + sema_p(&dp->rxfilter_lock); + dp->rxmode &= ~RXMODE_ENABLE; + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_hal_set_rx_filter(dp); + } + sema_v(&dp->rxfilter_lock); + rw_exit(&dp->dev_state_lock); + + /* make the nic state inactive */ + rw_enter(&dp->dev_state_lock, RW_WRITER); + dp->nic_state = NIC_STATE_STOPPED; + + /* stop mac completely */ + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_mac_stop(dp, MAC_STATE_STOPPED, STOP_GRACEFUL); + } + rw_exit(&dp->dev_state_lock); +} + +static int +usbgem_m_multicst(void *arg, boolean_t add, const uint8_t *ep) +{ + int err; + int ret; + struct usbgem_dev *dp = arg; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + if (add) { + ret = usbgem_add_multicast(dp, ep); + } else { + ret = usbgem_remove_multicast(dp, ep); + } + rw_exit(&dp->dev_state_lock); + + err = 0; + if (ret != USB_SUCCESS) { +#ifdef GEM_CONFIG_FMA + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); +#endif + err = EIO; + } + + return (err); +} + +static int +usbgem_m_setpromisc(void *arg, boolean_t on) +{ + int err; + struct usbgem_dev *dp = arg; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + + sema_p(&dp->rxfilter_lock); + if (on) { + dp->rxmode |= RXMODE_PROMISC; + } else { + dp->rxmode &= ~RXMODE_PROMISC; + } + + err = 0; + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + if (usbgem_hal_set_rx_filter(dp) != USB_SUCCESS) { + err = EIO; + } + } + sema_v(&dp->rxfilter_lock); + + rw_exit(&dp->dev_state_lock); + +#ifdef GEM_CONFIG_FMA + if (err != 0) { + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); + } +#endif + return (err); +} + +int +usbgem_m_getstat(void *arg, uint_t stat, uint64_t *valp) +{ + int ret; + uint64_t val; + struct usbgem_dev *dp = arg; + struct usbgem_stats *gstp = &dp->stats; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + rw_exit(&dp->dev_state_lock); + return (0); + } + ret = usbgem_hal_get_stats(dp); + rw_exit(&dp->dev_state_lock); + +#ifdef GEM_CONFIG_FMA + if (ret != USB_SUCCESS) { + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); + return (EIO); + } +#endif + + switch (stat) { + case MAC_STAT_IFSPEED: + val = usbgem_speed_value[dp->speed] *1000000ull; + break; + + case MAC_STAT_MULTIRCV: + val = gstp->rmcast; + break; + + case MAC_STAT_BRDCSTRCV: + val = gstp->rbcast; + break; + + case MAC_STAT_MULTIXMT: + val = gstp->omcast; + 
break; + + case MAC_STAT_BRDCSTXMT: + val = gstp->obcast; + break; + + case MAC_STAT_NORCVBUF: + val = gstp->norcvbuf + gstp->missed; + break; + + case MAC_STAT_IERRORS: + val = gstp->errrcv; + break; + + case MAC_STAT_NOXMTBUF: + val = gstp->noxmtbuf; + break; + + case MAC_STAT_OERRORS: + val = gstp->errxmt; + break; + + case MAC_STAT_COLLISIONS: + val = gstp->collisions; + break; + + case MAC_STAT_RBYTES: + val = gstp->rbytes; + break; + + case MAC_STAT_IPACKETS: + val = gstp->rpackets; + break; + + case MAC_STAT_OBYTES: + val = gstp->obytes; + break; + + case MAC_STAT_OPACKETS: + val = gstp->opackets; + break; + + case MAC_STAT_UNDERFLOWS: + val = gstp->underflow; + break; + + case MAC_STAT_OVERFLOWS: + val = gstp->overflow; + break; + + case ETHER_STAT_ALIGN_ERRORS: + val = gstp->frame; + break; + + case ETHER_STAT_FCS_ERRORS: + val = gstp->crc; + break; + + case ETHER_STAT_FIRST_COLLISIONS: + val = gstp->first_coll; + break; + + case ETHER_STAT_MULTI_COLLISIONS: + val = gstp->multi_coll; + break; + + case ETHER_STAT_SQE_ERRORS: + val = gstp->sqe; + break; + + case ETHER_STAT_DEFER_XMTS: + val = gstp->defer; + break; + + case ETHER_STAT_TX_LATE_COLLISIONS: + val = gstp->xmtlatecoll; + break; + + case ETHER_STAT_EX_COLLISIONS: + val = gstp->excoll; + break; + + case ETHER_STAT_MACXMT_ERRORS: + val = gstp->xmit_internal_err; + break; + + case ETHER_STAT_CARRIER_ERRORS: + val = gstp->nocarrier; + break; + + case ETHER_STAT_TOOLONG_ERRORS: + val = gstp->frame_too_long; + break; + + case ETHER_STAT_MACRCV_ERRORS: + val = gstp->rcv_internal_err; + break; + + case ETHER_STAT_XCVR_ADDR: + val = dp->mii_phy_addr; + break; + + case ETHER_STAT_XCVR_ID: + val = dp->mii_phy_id; + break; + + case ETHER_STAT_XCVR_INUSE: + val = usbgem_mac_xcvr_inuse(dp); + break; + + case ETHER_STAT_CAP_1000FDX: + val = (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD); + break; + + case ETHER_STAT_CAP_1000HDX: + val = (dp->mii_xstatus & MII_XSTATUS_1000BASET) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX); + break; + + case ETHER_STAT_CAP_100FDX: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + break; + + case ETHER_STAT_CAP_100HDX: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + break; + + case ETHER_STAT_CAP_10FDX: + val = BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + break; + + case ETHER_STAT_CAP_10HDX: + val = BOOLEAN(dp->mii_status & MII_STATUS_10); + break; + + case ETHER_STAT_CAP_ASMPAUSE: + val = dp->ugc.usbgc_flow_control > FLOW_CONTROL_SYMMETRIC; + break; + + case ETHER_STAT_CAP_PAUSE: + val = dp->ugc.usbgc_flow_control != FLOW_CONTROL_NONE; + break; + + case ETHER_STAT_CAP_AUTONEG: + val = BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + break; + + case ETHER_STAT_ADV_CAP_1000FDX: + val = dp->anadv_1000fdx; + break; + + case ETHER_STAT_ADV_CAP_1000HDX: + val = dp->anadv_1000hdx; + break; + + case ETHER_STAT_ADV_CAP_100FDX: + val = dp->anadv_100fdx; + break; + + case ETHER_STAT_ADV_CAP_100HDX: + val = dp->anadv_100hdx; + break; + + case ETHER_STAT_ADV_CAP_10FDX: + val = dp->anadv_10fdx; + break; + + case ETHER_STAT_ADV_CAP_10HDX: + val = dp->anadv_10hdx; + break; + + case ETHER_STAT_ADV_CAP_ASMPAUSE: + val = dp->anadv_asmpause; + break; + + case ETHER_STAT_ADV_CAP_PAUSE: + val = dp->anadv_pause; + break; + + case ETHER_STAT_ADV_CAP_AUTONEG: + val = dp->anadv_autoneg; + break; + + case ETHER_STAT_LP_CAP_1000FDX: + val = BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_FULL); + break; + + case ETHER_STAT_LP_CAP_1000HDX: + val = 
BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_HALF); + break; + + case ETHER_STAT_LP_CAP_100FDX: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX_FD); + break; + + case ETHER_STAT_LP_CAP_100HDX: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX); + break; + + case ETHER_STAT_LP_CAP_10FDX: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T_FD); + break; + + case ETHER_STAT_LP_CAP_10HDX: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T); + break; + + case ETHER_STAT_LP_CAP_ASMPAUSE: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_ASM_DIR); + break; + + case ETHER_STAT_LP_CAP_PAUSE: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_PAUSE); + break; + + case ETHER_STAT_LP_CAP_AUTONEG: + val = BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + break; + + case ETHER_STAT_LINK_ASMPAUSE: + val = BOOLEAN(dp->flow_control & 2); + break; + + case ETHER_STAT_LINK_PAUSE: + val = BOOLEAN(dp->flow_control & 1); + break; + + case ETHER_STAT_LINK_AUTONEG: + val = dp->anadv_autoneg && + BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + break; + + case ETHER_STAT_LINK_DUPLEX: + val = (dp->mii_state == MII_STATE_LINKUP) ? + (dp->full_duplex ? 2 : 1) : 0; + break; + + case ETHER_STAT_TOOSHORT_ERRORS: + val = gstp->runt; + break; +#ifdef NEVER /* it doesn't make sense */ + case ETHER_STAT_CAP_REMFAULT: + val = B_TRUE; + break; + + case ETHER_STAT_ADV_REMFAULT: + val = dp->anadv_remfault; + break; +#endif + case ETHER_STAT_LP_REMFAULT: + val = BOOLEAN(dp->mii_lpable & MII_AN_ADVERT_REMFAULT); + break; + + case ETHER_STAT_JABBER_ERRORS: + val = gstp->jabber; + break; + + case ETHER_STAT_CAP_100T4: + val = BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4); + break; + + case ETHER_STAT_ADV_CAP_100T4: + val = dp->anadv_100t4; + break; + + case ETHER_STAT_LP_CAP_100T4: + val = BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_T4); + break; + + default: +#if GEM_DEBUG_LEVEL > 2 + cmn_err(CE_WARN, + "%s: unrecognized parameter value = %d", + __func__, stat); +#endif + *valp = 0; + return (ENOTSUP); + } + + *valp = val; + + return (0); +} + +static int +usbgem_m_unicst(void *arg, const uint8_t *mac) +{ + int err; + struct usbgem_dev *dp = arg; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + + sema_p(&dp->rxfilter_lock); + bcopy(mac, dp->cur_addr.ether_addr_octet, ETHERADDRL); + dp->rxmode |= RXMODE_ENABLE; + + err = 0; + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + if (usbgem_hal_set_rx_filter(dp) != USB_SUCCESS) { + err = EIO; + } + } + sema_v(&dp->rxfilter_lock); + rw_exit(&dp->dev_state_lock); + +#ifdef GEM_CONFIG_FMA + if (err != 0) { + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); + } +#endif + return (err); +} + +/* + * usbgem_m_tx is used only for sending data packets into ethernet wire. + */ +static mblk_t * +usbgem_m_tx(void *arg, mblk_t *mp_head) +{ + int limit; + mblk_t *mp; + mblk_t *nmp; + uint32_t flags; + struct usbgem_dev *dp = arg; + + DPRINTF(4, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + mp = mp_head; + flags = 0; + + rw_enter(&dp->dev_state_lock, RW_READER); + + if (dp->mii_state != MII_STATE_LINKUP || + dp->mac_state != MAC_STATE_ONLINE) { + /* some nics hate to send packets during the link is down */ + for (; mp; mp = nmp) { + nmp = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + } + goto x; + } + + ASSERT(dp->nic_state == NIC_STATE_ONLINE); + + limit = dp->tx_max_packets; + for (; limit-- && mp; mp = nmp) { + nmp = mp->b_next; + mp->b_next = NULL; + if (usbgem_send_common(dp, mp, + (limit == 0 && nmp) ? 
1 : 0)) {
+			mp->b_next = nmp;
+			break;
+		}
+	}
+#ifdef CONFIG_TX_LIMITER
+	if (mp == mp_head) {
+		/* no packets were sent; decrease the allocation limit */
+		mutex_enter(&dp->txlock);
+		dp->tx_max_packets = max(dp->tx_max_packets - 1, 1);
+		mutex_exit(&dp->txlock);
+	}
+#endif
+x:
+	rw_exit(&dp->dev_state_lock);
+
+	return (mp);
+}
+
+static void
+usbgem_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
+{
+	struct usbgem_dev *dp = arg;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_READER);
+	usbgem_mac_ioctl(dp, wq, mp);
+	rw_exit(&dp->dev_state_lock);
+}
+
+static void
+usbgem_gld3_init(struct usbgem_dev *dp, mac_register_t *macp)
+{
+	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
+	macp->m_driver = dp;
+	macp->m_dip = dp->dip;
+	macp->m_src_addr = dp->dev_addr.ether_addr_octet;
+	macp->m_callbacks = &gem_m_callbacks;
+	macp->m_min_sdu = 0;
+	macp->m_max_sdu = dp->mtu;
+
+	if (dp->misc_flag & USBGEM_VLAN) {
+		macp->m_margin = VTAG_SIZE;
+	}
+}
+#else
+/* ============================================================== */
+/*
+ * GLDv2 interface
+ */
+/* ============================================================== */
+static int usbgem_gld_reset(gld_mac_info_t *);
+static int usbgem_gld_start(gld_mac_info_t *);
+static int usbgem_gld_stop(gld_mac_info_t *);
+static int usbgem_gld_set_mac_address(gld_mac_info_t *, uint8_t *);
+static int usbgem_gld_set_multicast(gld_mac_info_t *, uint8_t *, int);
+static int usbgem_gld_set_promiscuous(gld_mac_info_t *, int);
+static int usbgem_gld_get_stats(gld_mac_info_t *, struct gld_stats *);
+static int usbgem_gld_send(gld_mac_info_t *, mblk_t *);
+static int usbgem_gld_send_tagged(gld_mac_info_t *, mblk_t *, uint32_t);
+
+static int
+usbgem_gld_reset(gld_mac_info_t *macinfo)
+{
+	int err;
+	struct usbgem_dev *dp;
+
+	err = GLD_SUCCESS;
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+	if (usbgem_mac_init(dp) != USB_SUCCESS) {
+		err = GLD_FAILURE;
+		goto x;
+	}
+
+	dp->nic_state = NIC_STATE_INITIALIZED;
+
+	/* set up the media mode if the link has already come up */
+	if (dp->mii_state == MII_STATE_LINKUP) {
+		if (dp->mac_state != MAC_STATE_DISCONNECTED) {
+			(void) usbgem_hal_set_media(dp);
+		}
+	}
+x:
+	rw_exit(&dp->dev_state_lock);
+	return (err);
+}
+
+static int
+usbgem_gld_start(gld_mac_info_t *macinfo)
+{
+	int err;
+	struct usbgem_dev *dp;
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+
+	dp->nic_state = NIC_STATE_ONLINE;
+
+	if (dp->mii_state == MII_STATE_LINKUP) {
+		if (usbgem_mac_start(dp) != USB_SUCCESS) {
+			/* sema_v(&dp->mii_lock); */
+			err = GLD_FAILURE;
+			goto x;
+		}
+	}
+
+	/*
+	 * XXX - don't call gld_linkstate() here,
+	 * otherwise it causes a recursive mutex entry.
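+	 * (Presumably because GLD invokes this entry point with its own
+	 * per-instance mutex held, so calling back into gld_linkstate()
+	 * from here would try to take that mutex again; the link
+	 * watcher thread reports link state changes instead.)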
+ */ + err = GLD_SUCCESS; +x: + rw_exit(&dp->dev_state_lock); + + return (err); +} + +static int +usbgem_gld_stop(gld_mac_info_t *macinfo) +{ + int err = GLD_SUCCESS; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + /* try to stop rx gracefully */ + rw_enter(&dp->dev_state_lock, RW_READER); + sema_p(&dp->rxfilter_lock); + dp->rxmode &= ~RXMODE_ENABLE; + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_hal_set_rx_filter(dp); + } + sema_v(&dp->rxfilter_lock); + rw_exit(&dp->dev_state_lock); + + /* make the nic state inactive */ + rw_enter(&dp->dev_state_lock, RW_WRITER); + dp->nic_state = NIC_STATE_STOPPED; + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + if (usbgem_mac_stop(dp, MAC_STATE_STOPPED, STOP_GRACEFUL) + != USB_SUCCESS) { + err = GLD_FAILURE; + } + } + rw_exit(&dp->dev_state_lock); + + return (err); +} + +static int +usbgem_gld_set_multicast(gld_mac_info_t *macinfo, uint8_t *ep, int flag) +{ + int err; + int ret; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + rw_enter(&dp->dev_state_lock, RW_READER); + if (flag == GLD_MULTI_ENABLE) { + ret = usbgem_add_multicast(dp, ep); + } else { + ret = usbgem_remove_multicast(dp, ep); + } + rw_exit(&dp->dev_state_lock); + + err = GLD_SUCCESS; + if (ret != USB_SUCCESS) { +#ifdef GEM_CONFIG_FMA + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); +#endif + err = GLD_FAILURE; + } + return (err); +} + +static int +usbgem_gld_set_promiscuous(gld_mac_info_t *macinfo, int flag) +{ + boolean_t need_to_change = B_TRUE; + struct usbgem_dev *dp; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + sema_p(&dp->rxfilter_lock); + if (flag == GLD_MAC_PROMISC_NONE) { + dp->rxmode &= ~(RXMODE_PROMISC | RXMODE_ALLMULTI_REQ); + } else if (flag == GLD_MAC_PROMISC_MULTI) { + dp->rxmode |= RXMODE_ALLMULTI_REQ; + } else if (flag == GLD_MAC_PROMISC_PHYS) { + dp->rxmode |= RXMODE_PROMISC; + } else { + /* mode unchanged */ + need_to_change = B_FALSE; + } + + if (need_to_change) { + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_hal_set_rx_filter(dp); + } + } + sema_v(&dp->rxfilter_lock); + + return (GLD_SUCCESS); +} + +static int +usbgem_gld_set_mac_address(gld_mac_info_t *macinfo, uint8_t *mac) +{ + struct usbgem_dev *dp; + dp = (struct usbgem_dev *)macinfo->gldm_private; + + DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + sema_p(&dp->rxfilter_lock); + bcopy(mac, dp->cur_addr.ether_addr_octet, ETHERADDRL); + dp->rxmode |= RXMODE_ENABLE; + + if (dp->mac_state != MAC_STATE_DISCONNECTED) { + (void) usbgem_hal_set_rx_filter(dp); + } + sema_v(&dp->rxfilter_lock); + + return (GLD_SUCCESS); +} + +static int +usbgem_gld_get_stats(gld_mac_info_t *macinfo, struct gld_stats *gs) +{ + struct usbgem_dev *dp; + struct usbgem_stats *vs; + + dp = (struct usbgem_dev *)macinfo->gldm_private; + + if ((*dp->ugc.usbgc_get_stats)(dp) != USB_SUCCESS) { +#ifdef GEM_CONFIG_FMA + ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED); +#endif + return (USB_FAILURE); + } + + vs = &dp->stats; + + gs->glds_errxmt = vs->errxmt; + gs->glds_errrcv = vs->errrcv; + gs->glds_collisions = vs->collisions; + + gs->glds_excoll = vs->excoll; + gs->glds_defer = vs->defer; + gs->glds_frame = vs->frame; + gs->glds_crc = vs->crc; + + gs->glds_overflow = vs->overflow; /* fifo 
err, underrun, rbufovf */
+	gs->glds_underflow = vs->underflow;
+	gs->glds_short = vs->runt;
+	gs->glds_missed = vs->missed;	/* missed pkts while rbuf ovf */
+	gs->glds_xmtlatecoll = vs->xmtlatecoll;
+	gs->glds_nocarrier = vs->nocarrier;
+	gs->glds_norcvbuf = vs->norcvbuf;	/* OS resource exhaustion */
+	gs->glds_intr = vs->intr;
+
+	/* all before here must be kept in place for v0 compatibility */
+	gs->glds_speed = usbgem_speed_value[dp->speed] * 1000000;
+	gs->glds_media = GLDM_PHYMII;
+	gs->glds_duplex = dp->full_duplex ? GLD_DUPLEX_FULL : GLD_DUPLEX_HALF;
+
+	/* gs->glds_media_specific */
+	gs->glds_dot3_first_coll = vs->first_coll;
+	gs->glds_dot3_multi_coll = vs->multi_coll;
+	gs->glds_dot3_sqe_error = 0;
+	gs->glds_dot3_mac_xmt_error = 0;
+	gs->glds_dot3_mac_rcv_error = 0;
+	gs->glds_dot3_frame_too_long = vs->frame_too_long;
+
+	return (GLD_SUCCESS);
+}
+
+static int
+usbgem_gld_ioctl(gld_mac_info_t *macinfo, queue_t *wq, mblk_t *mp)
+{
+	struct usbgem_dev *dp;
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+	usbgem_mac_ioctl(dp, wq, mp);
+
+	return (GLD_SUCCESS);
+}
+
+/*
+ * usbgem_gld_send is used only for sending data packets onto the
+ * Ethernet wire.
+ */
+static int
+usbgem_gld_send(gld_mac_info_t *macinfo, mblk_t *mp)
+{
+	int ret;
+	uint32_t flags = 0;
+	struct usbgem_dev *dp;
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	/* nic state must be online or suspended */
+	rw_enter(&dp->dev_state_lock, RW_READER);
+
+	ASSERT(dp->nic_state == NIC_STATE_ONLINE);
+	ASSERT(mp->b_next == NULL);
+
+	if (dp->mii_state != MII_STATE_LINKUP) {
+		/* Some nics hate to send packets while the link is down. */
+		/* We discard the untransmitted packets silently. */
+		rw_exit(&dp->dev_state_lock);
+
+		freemsg(mp);
+#ifdef GEM_CONFIG_FMA
+		/* FIXME - should we ignore the error? */
+		ddi_fm_service_impact(dp->dip, DDI_SERVICE_DEGRADED);
+#endif
+		return (GLD_SUCCESS);
+	}
+
+	ret = (usbgem_send_common(dp, mp, flags) == NULL)
+	    ? GLD_SUCCESS : GLD_NORESOURCES;
+	rw_exit(&dp->dev_state_lock);
+
+	return (ret);
+}
+
+/*
+ * usbgem_gld_send_tagged is used only for sending tagged data packets
+ * onto the Ethernet wire.
+ */
+static int
+usbgem_gld_send_tagged(gld_mac_info_t *macinfo, mblk_t *mp, uint32_t vtag)
+{
+	uint32_t flags;
+	struct usbgem_dev *dp;
+
+	dp = (struct usbgem_dev *)macinfo->gldm_private;
+
+	/*
+	 * Some nics hate to send packets while the link is down.
+	 */
+	if (dp->mii_state != MII_STATE_LINKUP) {
+		/* we discard the untransmitted packets silently */
+		freemsg(mp);
+#ifdef GEM_CONFIG_FMA
+		/* FIXME - should we ignore the error? */
+		ddi_fm_service_impact(dp->dip, DDI_SERVICE_UNAFFECTED);
+#endif
+		return (GLD_SUCCESS);
+	}
+#ifdef notyet
+	flags = GLD_VTAG_TCI(vtag) << GEM_SEND_VTAG_SHIFT;
+#endif
+	return ((usbgem_send_common(dp, mp, 0) == NULL) ?
+ GLD_SUCCESS : GLD_NORESOURCES); +} + +static void +usbgem_gld_init(struct usbgem_dev *dp, gld_mac_info_t *macinfo, char *ident) +{ + /* + * configure GLD + */ + macinfo->gldm_devinfo = dp->dip; + macinfo->gldm_private = (caddr_t)dp; + + macinfo->gldm_reset = usbgem_gld_reset; + macinfo->gldm_start = usbgem_gld_start; + macinfo->gldm_stop = usbgem_gld_stop; + macinfo->gldm_set_mac_addr = usbgem_gld_set_mac_address; + macinfo->gldm_send = usbgem_gld_send; + macinfo->gldm_set_promiscuous = usbgem_gld_set_promiscuous; + macinfo->gldm_get_stats = usbgem_gld_get_stats; + macinfo->gldm_ioctl = usbgem_gld_ioctl; + macinfo->gldm_set_multicast = usbgem_gld_set_multicast; + macinfo->gldm_intr = NULL; + macinfo->gldm_mctl = NULL; + + macinfo->gldm_ident = ident; + macinfo->gldm_type = DL_ETHER; + macinfo->gldm_minpkt = 0; + macinfo->gldm_maxpkt = dp->mtu; + macinfo->gldm_addrlen = ETHERADDRL; + macinfo->gldm_saplen = -2; + macinfo->gldm_ppa = ddi_get_instance(dp->dip); +#ifdef GLD_CAP_LINKSTATE + macinfo->gldm_capabilities = GLD_CAP_LINKSTATE; +#endif + macinfo->gldm_vendor_addr = dp->dev_addr.ether_addr_octet; + macinfo->gldm_broadcast_addr = usbgem_bcastaddr; +} +#endif /* USBGEM_CONFIG_GLDv3 */ + + +/* ======================================================================== */ +/* + * .conf interface + */ +/* ======================================================================== */ +void +usbgem_generate_macaddr(struct usbgem_dev *dp, uint8_t *mac) +{ + extern char hw_serial[]; + char *hw_serial_p; + int i; + uint64_t val; + uint64_t key; + + cmn_err(CE_NOTE, + "!%s: using temp ether address," + " do not use this for long time", + dp->name); + + /* prefer a fixed address for DHCP */ + hw_serial_p = &hw_serial[0]; + val = stoi(&hw_serial_p); + + key = 0; + for (i = 0; i < USBGEM_NAME_LEN; i++) { + if (dp->name[i] == 0) { + break; + } + key ^= dp->name[i]; + } + key ^= ddi_get_instance(dp->dip); + val ^= key << 32; + + /* generate a local address */ + mac[0] = 0x02; + mac[1] = (uint8_t)(val >> 32); + mac[2] = (uint8_t)(val >> 24); + mac[3] = (uint8_t)(val >> 16); + mac[4] = (uint8_t)(val >> 8); + mac[5] = (uint8_t)val; +} + +boolean_t +usbgem_get_mac_addr_conf(struct usbgem_dev *dp) +{ + char propname[32]; + char *valstr; + uint8_t mac[ETHERADDRL]; + char *cp; + int c; + int i; + int j; + uint8_t v; + uint8_t d; + uint8_t ored; + + DPRINTF(3, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + /* + * Get ethernet address from .conf file + */ + (void) sprintf(propname, "mac-addr"); + if ((ddi_prop_lookup_string(DDI_DEV_T_ANY, dp->dip, + DDI_PROP_DONTPASS, propname, &valstr)) != DDI_PROP_SUCCESS) { + return (B_FALSE); + } + + if (strlen(valstr) != ETHERADDRL*3-1) { + goto syntax_err; + } + + cp = valstr; + j = 0; + ored = 0; + for (;;) { + v = 0; + for (i = 0; i < 2; i++) { + c = *cp++; + + if (c >= 'a' && c <= 'f') { + d = c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + d = c - 'A' + 10; + } else if (c >= '0' && c <= '9') { + d = c - '0'; + } else { + goto syntax_err; + } + v = (v << 4) | d; + } + + mac[j++] = v; + ored |= v; + if (j == ETHERADDRL) { + /* done */ + break; + } + + c = *cp++; + if (c != ':') { + goto syntax_err; + } + } + + if (ored == 0) { + usbgem_generate_macaddr(dp, mac); + } + for (i = 0; i < ETHERADDRL; i++) { + dp->dev_addr.ether_addr_octet[i] = mac[i]; + } + ddi_prop_free(valstr); + return (B_TRUE); + +syntax_err: + cmn_err(CE_CONT, + "!%s: read mac addr: trying .conf: syntax err %s", + dp->name, valstr); + ddi_prop_free(valstr); + + return (B_FALSE); +} + +static void 
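+/*
+ * A hypothetical driver.conf fragment showing the tunables read by
+ * the function below (property names are taken from the
+ * usbgem_prop_get_int() calls in this function; the values are
+ * examples only):
+ *
+ *	adv_autoneg_cap=0;
+ *	speed=100;
+ *	full-duplex=1;
+ *	mtu=1500;
+ */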
+usbgem_read_conf(struct usbgem_dev *dp)
+{
+	int val;
+
+	DPRINTF(1, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/*
+	 * Get media mode information from the .conf file
+	 */
+	dp->anadv_autoneg = usbgem_prop_get_int(dp, "adv_autoneg_cap", 1) != 0;
+	dp->anadv_1000fdx = usbgem_prop_get_int(dp, "adv_1000fdx_cap", 1) != 0;
+	dp->anadv_1000hdx = usbgem_prop_get_int(dp, "adv_1000hdx_cap", 1) != 0;
+	dp->anadv_100t4 = usbgem_prop_get_int(dp, "adv_100T4_cap", 1) != 0;
+	dp->anadv_100fdx = usbgem_prop_get_int(dp, "adv_100fdx_cap", 1) != 0;
+	dp->anadv_100hdx = usbgem_prop_get_int(dp, "adv_100hdx_cap", 1) != 0;
+	dp->anadv_10fdx = usbgem_prop_get_int(dp, "adv_10fdx_cap", 1) != 0;
+	dp->anadv_10hdx = usbgem_prop_get_int(dp, "adv_10hdx_cap", 1) != 0;
+	dp->anadv_1000t_ms = usbgem_prop_get_int(dp, "adv_1000t_ms", 0);
+
+	if ((ddi_prop_exists(DDI_DEV_T_ANY, dp->dip,
+	    DDI_PROP_DONTPASS, "full-duplex"))) {
+		dp->full_duplex =
+		    usbgem_prop_get_int(dp, "full-duplex", 1) != 0;
+		dp->anadv_autoneg = B_FALSE;
+		if (dp->full_duplex) {
+			dp->anadv_1000hdx = B_FALSE;
+			dp->anadv_100hdx = B_FALSE;
+			dp->anadv_10hdx = B_FALSE;
+		} else {
+			dp->anadv_1000fdx = B_FALSE;
+			dp->anadv_100fdx = B_FALSE;
+			dp->anadv_10fdx = B_FALSE;
+		}
+	}
+
+	if ((val = usbgem_prop_get_int(dp, "speed", 0)) > 0) {
+		dp->anadv_autoneg = B_FALSE;
+		switch (val) {
+		case 1000:
+			dp->speed = USBGEM_SPD_1000;
+			dp->anadv_100t4 = B_FALSE;
+			dp->anadv_100fdx = B_FALSE;
+			dp->anadv_100hdx = B_FALSE;
+			dp->anadv_10fdx = B_FALSE;
+			dp->anadv_10hdx = B_FALSE;
+			break;
+		case 100:
+			dp->speed = USBGEM_SPD_100;
+			dp->anadv_1000fdx = B_FALSE;
+			dp->anadv_1000hdx = B_FALSE;
+			dp->anadv_10fdx = B_FALSE;
+			dp->anadv_10hdx = B_FALSE;
+			break;
+		case 10:
+			dp->speed = USBGEM_SPD_10;
+			dp->anadv_1000fdx = B_FALSE;
+			dp->anadv_1000hdx = B_FALSE;
+			dp->anadv_100t4 = B_FALSE;
+			dp->anadv_100fdx = B_FALSE;
+			dp->anadv_100hdx = B_FALSE;
+			break;
+		default:
+			cmn_err(CE_WARN,
+			    "!%s: property %s: illegal value:%d",
+			    dp->name, "speed", val);
+			dp->anadv_autoneg = B_TRUE;
+			break;
+		}
+	}
+	val = usbgem_prop_get_int(dp,
+	    "adv_pause", dp->ugc.usbgc_flow_control & 1);
+	val |= usbgem_prop_get_int(dp,
+	    "adv_asmpause", BOOLEAN(dp->ugc.usbgc_flow_control & 2)) << 1;
+	if (val > FLOW_CONTROL_RX_PAUSE || val < FLOW_CONTROL_NONE) {
+		cmn_err(CE_WARN,
+		    "!%s: property %s: illegal value:%d",
+		    dp->name, "flow-control", val);
+	} else {
+		val = min(val, dp->ugc.usbgc_flow_control);
+	}
+	dp->anadv_pause = BOOLEAN(val & 1);
+	dp->anadv_asmpause = BOOLEAN(val & 2);
+
+	dp->mtu = usbgem_prop_get_int(dp, "mtu", dp->mtu);
+	dp->txthr = usbgem_prop_get_int(dp, "txthr", dp->txthr);
+	dp->rxthr = usbgem_prop_get_int(dp, "rxthr", dp->rxthr);
+	dp->txmaxdma = usbgem_prop_get_int(dp, "txmaxdma", dp->txmaxdma);
+	dp->rxmaxdma = usbgem_prop_get_int(dp, "rxmaxdma", dp->rxmaxdma);
+#ifdef GEM_CONFIG_POLLING
+	dp->poll_pkt_delay =
+	    usbgem_prop_get_int(dp, "pkt_delay", dp->poll_pkt_delay);
+
+	dp->max_poll_interval[GEM_SPD_10] =
+	    usbgem_prop_get_int(dp, "max_poll_interval_10",
+	    dp->max_poll_interval[GEM_SPD_10]);
+	dp->max_poll_interval[GEM_SPD_100] =
+	    usbgem_prop_get_int(dp, "max_poll_interval_100",
+	    dp->max_poll_interval[GEM_SPD_100]);
+	dp->max_poll_interval[GEM_SPD_1000] =
+	    usbgem_prop_get_int(dp, "max_poll_interval_1000",
+	    dp->max_poll_interval[GEM_SPD_1000]);
+
+	dp->min_poll_interval[GEM_SPD_10] =
+	    usbgem_prop_get_int(dp, "min_poll_interval_10",
+	    dp->min_poll_interval[GEM_SPD_10]);
+	dp->min_poll_interval[GEM_SPD_100] =
+	    usbgem_prop_get_int(dp, "min_poll_interval_100",
+	    dp->min_poll_interval[GEM_SPD_100]);
+	dp->min_poll_interval[GEM_SPD_1000] =
+	    usbgem_prop_get_int(dp, "min_poll_interval_1000",
+	    dp->min_poll_interval[GEM_SPD_1000]);
+#endif
+}
+
+/*
+ * usbgem kstat support
+ */
+#ifndef GEM_CONFIG_GLDv3
+/* kstat items based on the dmfe driver */
+
+struct usbgem_kstat_named {
+	struct kstat_named ks_xcvr_addr;
+	struct kstat_named ks_xcvr_id;
+	struct kstat_named ks_xcvr_inuse;
+	struct kstat_named ks_link_up;
+	struct kstat_named ks_link_duplex;	/* 0:unknown, 1:half, 2:full */
+	struct kstat_named ks_cap_1000fdx;
+	struct kstat_named ks_cap_1000hdx;
+	struct kstat_named ks_cap_100fdx;
+	struct kstat_named ks_cap_100hdx;
+	struct kstat_named ks_cap_10fdx;
+	struct kstat_named ks_cap_10hdx;
+#ifdef NEVER
+	struct kstat_named ks_cap_remfault;
+#endif
+	struct kstat_named ks_cap_autoneg;
+
+	struct kstat_named ks_adv_cap_1000fdx;
+	struct kstat_named ks_adv_cap_1000hdx;
+	struct kstat_named ks_adv_cap_100fdx;
+	struct kstat_named ks_adv_cap_100hdx;
+	struct kstat_named ks_adv_cap_10fdx;
+	struct kstat_named ks_adv_cap_10hdx;
+#ifdef NEVER
+	struct kstat_named ks_adv_cap_remfault;
+#endif
+	struct kstat_named ks_adv_cap_autoneg;
+	struct kstat_named ks_lp_cap_1000fdx;
+	struct kstat_named ks_lp_cap_1000hdx;
+	struct kstat_named ks_lp_cap_100fdx;
+	struct kstat_named ks_lp_cap_100hdx;
+	struct kstat_named ks_lp_cap_10fdx;
+	struct kstat_named ks_lp_cap_10hdx;
+	struct kstat_named ks_lp_cap_remfault;
+	struct kstat_named ks_lp_cap_autoneg;
+};
+
+static int
+usbgem_kstat_update(kstat_t *ksp, int rw)
+{
+	struct usbgem_kstat_named *knp;
+	struct usbgem_dev *dp = (struct usbgem_dev *)ksp->ks_private;
+
+	if (rw != KSTAT_READ) {
+		return (0);
+	}
+
+	knp = (struct usbgem_kstat_named *)ksp->ks_data;
+
+	knp->ks_xcvr_addr.value.ul = dp->mii_phy_addr;
+	knp->ks_xcvr_id.value.ul = dp->mii_phy_id;
+	knp->ks_xcvr_inuse.value.ul = usbgem_mac_xcvr_inuse(dp);
+	knp->ks_link_up.value.ul = dp->mii_state == MII_STATE_LINKUP;
+	knp->ks_link_duplex.value.ul =
+	    (dp->mii_state == MII_STATE_LINKUP) ?
+	    (dp->full_duplex ?
2 : 1) : 0; + + knp->ks_cap_1000fdx.value.ul = + (dp->mii_xstatus & MII_XSTATUS_1000BASET_FD) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX_FD); + knp->ks_cap_1000hdx.value.ul = + (dp->mii_xstatus & MII_XSTATUS_1000BASET) || + (dp->mii_xstatus & MII_XSTATUS_1000BASEX); + knp->ks_cap_100fdx.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + knp->ks_cap_100hdx.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + knp->ks_cap_10fdx.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + knp->ks_cap_10hdx.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_10); +#ifdef NEVER + knp->ks_cap_remfault.value.ul = B_TRUE; +#endif + knp->ks_cap_autoneg.value.ul = + BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + + knp->ks_adv_cap_1000fdx.value.ul = dp->anadv_1000fdx; + knp->ks_adv_cap_1000hdx.value.ul = dp->anadv_1000hdx; + knp->ks_adv_cap_100fdx.value.ul = dp->anadv_100fdx; + knp->ks_adv_cap_100hdx.value.ul = dp->anadv_100hdx; + knp->ks_adv_cap_10fdx.value.ul = dp->anadv_10fdx; + knp->ks_adv_cap_10hdx.value.ul = dp->anadv_10hdx; +#ifdef NEVER + knp->ks_adv_cap_remfault.value.ul = 0; +#endif + knp->ks_adv_cap_autoneg.value.ul = dp->anadv_autoneg; + + knp->ks_lp_cap_1000fdx.value.ul = + BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_FULL); + knp->ks_lp_cap_1000hdx.value.ul = + BOOLEAN(dp->mii_stat1000 & MII_1000TS_LP_HALF); + knp->ks_lp_cap_100fdx.value.ul = + BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX_FD); + knp->ks_lp_cap_100hdx.value.ul = + BOOLEAN(dp->mii_lpable & MII_ABILITY_100BASE_TX); + knp->ks_lp_cap_10fdx.value.ul = + BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T_FD); + knp->ks_lp_cap_10hdx.value.ul = + BOOLEAN(dp->mii_lpable & MII_ABILITY_10BASE_T); + knp->ks_lp_cap_remfault.value.ul = + BOOLEAN(dp->mii_exp & MII_AN_EXP_PARFAULT); + knp->ks_lp_cap_autoneg.value.ul = + BOOLEAN(dp->mii_exp & MII_AN_EXP_LPCANAN); + + return (0); +} + + +static int +usbgem_kstat_init(struct usbgem_dev *dp) +{ + int i; + kstat_t *ksp; + struct usbgem_kstat_named *knp; + + ksp = kstat_create( + (char *)ddi_driver_name(dp->dip), ddi_get_instance(dp->dip), + "mii", "net", KSTAT_TYPE_NAMED, + sizeof (*knp) / sizeof (knp->ks_xcvr_addr), 0); + + if (ksp == NULL) { + cmn_err(CE_WARN, "%s: %s() for mii failed", + dp->name, __func__); + return (USB_FAILURE); + } + + knp = (struct usbgem_kstat_named *)ksp->ks_data; + + kstat_named_init(&knp->ks_xcvr_addr, "xcvr_addr", + KSTAT_DATA_INT32); + kstat_named_init(&knp->ks_xcvr_id, "xcvr_id", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_xcvr_inuse, "xcvr_inuse", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_link_up, "link_up", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_link_duplex, "link_duplex", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_1000fdx, "cap_1000fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_1000hdx, "cap_1000hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_100fdx, "cap_100fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_100hdx, "cap_100hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_10fdx, "cap_10fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_cap_10hdx, "cap_10hdx", + KSTAT_DATA_UINT32); +#ifdef NEVER + kstat_named_init(&knp->ks_cap_remfault, "cap_rem_fault", + KSTAT_DATA_UINT32); +#endif + kstat_named_init(&knp->ks_cap_autoneg, "cap_autoneg", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_1000fdx, "adv_cap_1000fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_1000hdx, "adv_cap_1000hdx", + KSTAT_DATA_UINT32); + 
kstat_named_init(&knp->ks_adv_cap_100fdx, "adv_cap_100fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_100hdx, "adv_cap_100hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_10fdx, "adv_cap_10fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_adv_cap_10hdx, "adv_cap_10hdx", + KSTAT_DATA_UINT32); +#ifdef NEVER + kstat_named_init(&knp->ks_adv_cap_remfault, "adv_rem_fault", + KSTAT_DATA_UINT32); +#endif + kstat_named_init(&knp->ks_adv_cap_autoneg, "adv_cap_autoneg", + KSTAT_DATA_UINT32); + + kstat_named_init(&knp->ks_lp_cap_1000fdx, "lp_cap_1000fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_1000hdx, "lp_cap_1000hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_100fdx, "lp_cap_100fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_100hdx, "lp_cap_100hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_10fdx, "lp_cap_10fdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_10hdx, "lp_cap_10hdx", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_remfault, "lp_cap_rem_fault", + KSTAT_DATA_UINT32); + kstat_named_init(&knp->ks_lp_cap_autoneg, "lp_cap_autoneg", + KSTAT_DATA_UINT32); + + ksp->ks_private = (void *) dp; + ksp->ks_update = usbgem_kstat_update; + dp->ksp = ksp; + + kstat_install(ksp); + + return (USB_SUCCESS); +} +#endif /* GEM_CONFIG_GLDv3 */ +/* ======================================================================== */ +/* + * attach/detatch/usb support + */ +/* ======================================================================== */ +int +usbgem_ctrl_out(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *bp, int size) +{ + mblk_t *data; + usb_ctrl_setup_t setup; + usb_cr_t completion_reason; + usb_cb_flags_t cb_flags; + usb_flags_t flags; + int i; + int ret; + + DPRINTF(4, (CE_CONT, "!%s: %s " + "reqt:0x%02x req:0x%02x val:0x%04x ix:0x%04x len:0x%02x " + "bp:0x%p nic_state:%d", + dp->name, __func__, reqt, req, val, ix, len, bp, dp->nic_state)); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + return (USB_PIPE_ERROR); + } + + data = NULL; + if (size > 0) { + if ((data = allocb(size, 0)) == NULL) { + return (USB_FAILURE); + } + + bcopy(bp, data->b_rptr, size); + data->b_wptr = data->b_rptr + size; + } + + setup.bmRequestType = reqt; + setup.bRequest = req; + setup.wValue = val; + setup.wIndex = ix; + setup.wLength = len; + setup.attrs = 0; /* attributes */ + + for (i = usbgem_ctrl_retry; i > 0; i--) { + completion_reason = 0; + cb_flags = 0; + + ret = usb_pipe_ctrl_xfer_wait(DEFAULT_PIPE(dp), + &setup, &data, &completion_reason, &cb_flags, 0); + + if (ret == USB_SUCCESS) { + break; + } + if (i == 1) { + cmn_err(CE_WARN, + "!%s: %s failed: " + "reqt:0x%x req:0x%x val:0x%x ix:0x%x len:0x%x " + "ret:%d cr:%s(%d), cb_flags:0x%x %s", + dp->name, __func__, reqt, req, val, ix, len, + ret, usb_str_cr(completion_reason), + completion_reason, + cb_flags, + (i > 1) ? "retrying..." 
: "fatal"); + } + } + + if (data != NULL) { + freemsg(data); + } + + return (ret); +} + +int +usbgem_ctrl_in(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *bp, int size) +{ + mblk_t *data; + usb_ctrl_setup_t setup; + usb_cr_t completion_reason; + usb_cb_flags_t cb_flags; + int i; + int ret; + int reclen; + + DPRINTF(4, (CE_CONT, + "!%s: %s:" + " reqt:0x%02x req:0x%02x val:0x%04x ix:0x%04x len:0x%02x" + " bp:x%p mac_state:%d", + dp->name, __func__, reqt, req, val, ix, len, bp, dp->mac_state)); + + if (dp->mac_state == MAC_STATE_DISCONNECTED) { + return (USB_PIPE_ERROR); + } + + data = NULL; + + setup.bmRequestType = reqt; + setup.bRequest = req; + setup.wValue = val; + setup.wIndex = ix; + setup.wLength = len; + setup.attrs = USB_ATTRS_AUTOCLEARING; /* XXX */ + + for (i = usbgem_ctrl_retry; i > 0; i--) { + completion_reason = 0; + cb_flags = 0; + ret = usb_pipe_ctrl_xfer_wait(DEFAULT_PIPE(dp), &setup, &data, + &completion_reason, &cb_flags, 0); + + if (ret == USB_SUCCESS) { + reclen = msgdsize(data); + bcopy(data->b_rptr, bp, min(reclen, size)); + break; + } + if (i == 1) { + cmn_err(CE_WARN, + "!%s: %s failed: " + "reqt:0x%x req:0x%x val:0x%x ix:0x%x len:0x%x " + "ret:%d cr:%s(%d) cb_flags:0x%x %s", + dp->name, __func__, + reqt, req, val, ix, len, + ret, usb_str_cr(completion_reason), + completion_reason, + cb_flags, + (i > 1) ? "retrying..." : "fatal"); + } + } + + if (data) { + freemsg(data); + } + + return (ret); +} + +int +usbgem_ctrl_out_val(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + uint32_t v) +{ + uint8_t buf[4]; + + /* convert to little endian from native byte order */ + switch (len) { + case 4: + buf[3] = v >> 24; + buf[2] = v >> 16; + /* fall thru */ + case 2: + buf[1] = v >> 8; + /* fall thru */ + case 1: + buf[0] = v; + } + + return (usbgem_ctrl_out(dp, reqt, req, val, ix, len, buf, len)); +} + +int +usbgem_ctrl_in_val(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *valp) +{ + uint8_t buf[4]; + uint_t v; + int err; + +#ifdef SANITY + bzero(buf, sizeof (buf)); +#endif + err = usbgem_ctrl_in(dp, reqt, req, val, ix, len, buf, len); + if (err == USB_SUCCESS) { + v = 0; + switch (len) { + case 4: + v |= buf[3] << 24; + v |= buf[2] << 16; + /* FALLTHROUGH */ + case 2: + v |= buf[1] << 8; + /* FALLTHROUGH */ + case 1: + v |= buf[0]; + } + + switch (len) { + case 4: + *(uint32_t *)valp = v; + break; + case 2: + *(uint16_t *)valp = v; + break; + case 1: + *(uint8_t *)valp = v; + break; + } + } + return (err); +} + +/* + * Attach / detach / disconnect / reconnect management + */ +static int +usbgem_open_pipes(struct usbgem_dev *dp) +{ + int i; + int ret; + int ifnum; + int alt; + usb_client_dev_data_t *reg_data; + usb_ep_data_t *ep_tree_node; + + DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__)); + + ifnum = dp->ugc.usbgc_ifnum; + alt = dp->ugc.usbgc_alt; + + ep_tree_node = usb_lookup_ep_data(dp->dip, dp->reg_data, ifnum, alt, + 0, USB_EP_ATTR_BULK, USB_EP_DIR_IN); + if (ep_tree_node == NULL) { + cmn_err(CE_WARN, "!%s: %s: ep_bulkin is NULL", + dp->name, __func__); + goto err; + } + dp->ep_bulkin = &ep_tree_node->ep_descr; + + ep_tree_node = usb_lookup_ep_data(dp->dip, dp->reg_data, ifnum, alt, + 0, USB_EP_ATTR_BULK, USB_EP_DIR_OUT); + if (ep_tree_node == NULL) { + cmn_err(CE_WARN, "!%s: %s: ep_bulkout is NULL", + dp->name, __func__); + goto err; + } + dp->ep_bulkout = &ep_tree_node->ep_descr; + + ep_tree_node = 
usb_lookup_ep_data(dp->dip, dp->reg_data, ifnum, alt,
+	    0, USB_EP_ATTR_INTR, USB_EP_DIR_IN);
+	if (ep_tree_node) {
+		dp->ep_intr = &ep_tree_node->ep_descr;
+	} else {
+		/* don't care */
+		DPRINTF(1, (CE_CONT, "!%s: %s: ep_intr is NULL",
+		    dp->name, __func__));
+		dp->ep_intr = NULL;
+	}
+
+	/* XXX -- no need to open the default pipe */
+
+	/* open bulk out pipe */
+	bzero(&dp->policy_bulkout, sizeof (usb_pipe_policy_t));
+	dp->policy_bulkout.pp_max_async_reqs = 1;
+
+	if ((ret = usb_pipe_open(dp->dip,
+	    dp->ep_bulkout, &dp->policy_bulkout, USB_FLAGS_SLEEP,
+	    &dp->bulkout_pipe)) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: err:%x: failed to open bulk-out pipe",
+		    dp->name, __func__, ret);
+		dp->bulkout_pipe = NULL;
+		goto err;
+	}
+	DPRINTF(1, (CE_CONT, "!%s: %s: bulkout_pipe opened successfully",
+	    dp->name, __func__));
+
+	/* open bulk in pipe */
+	bzero(&dp->policy_bulkin, sizeof (usb_pipe_policy_t));
+	dp->policy_bulkin.pp_max_async_reqs = 1;
+	if ((ret = usb_pipe_open(dp->dip,
+	    dp->ep_bulkin, &dp->policy_bulkin, USB_FLAGS_SLEEP,
+	    &dp->bulkin_pipe)) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: ret:%x failed to open bulk-in pipe",
+		    dp->name, __func__, ret);
+		dp->bulkin_pipe = NULL;
+		goto err;
+	}
+	DPRINTF(1, (CE_CONT, "!%s: %s: bulkin_pipe opened successfully",
+	    dp->name, __func__));
+
+	if (dp->ep_intr) {
+		/* open interrupt pipe */
+		bzero(&dp->policy_interrupt, sizeof (usb_pipe_policy_t));
+		dp->policy_interrupt.pp_max_async_reqs = 1;
+		if ((ret = usb_pipe_open(dp->dip, dp->ep_intr,
+		    &dp->policy_interrupt, USB_FLAGS_SLEEP,
+		    &dp->intr_pipe)) != USB_SUCCESS) {
+			cmn_err(CE_WARN,
+			    "!%s: %s: ret:%x failed to open interrupt pipe",
+			    dp->name, __func__, ret);
+			dp->intr_pipe = NULL;
+			goto err;
+		}
+	}
+	DPRINTF(1, (CE_CONT, "!%s: %s: intr_pipe opened successfully",
+	    dp->name, __func__));
+
+	return (USB_SUCCESS);
+
+err:
+	if (dp->bulkin_pipe) {
+		usb_pipe_close(dp->dip,
+		    dp->bulkin_pipe, USB_FLAGS_SLEEP, NULL, 0);
+		dp->bulkin_pipe = NULL;
+	}
+	if (dp->bulkout_pipe) {
+		usb_pipe_close(dp->dip,
+		    dp->bulkout_pipe, USB_FLAGS_SLEEP, NULL, 0);
+		dp->bulkout_pipe = NULL;
+	}
+	if (dp->intr_pipe) {
+		usb_pipe_close(dp->dip,
+		    dp->intr_pipe, USB_FLAGS_SLEEP, NULL, 0);
+		dp->intr_pipe = NULL;
+	}
+
+	return (USB_FAILURE);
+}
+
+static int
+usbgem_close_pipes(struct usbgem_dev *dp)
+{
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	if (dp->intr_pipe) {
+		usb_pipe_close(dp->dip,
+		    dp->intr_pipe, USB_FLAGS_SLEEP, NULL, 0);
+		dp->intr_pipe = NULL;
+	}
+	DPRINTF(1, (CE_CONT, "!%s: %s: 1", dp->name, __func__));
+
+	ASSERT(dp->bulkin_pipe);
+	usb_pipe_close(dp->dip, dp->bulkin_pipe, USB_FLAGS_SLEEP, NULL, 0);
+	dp->bulkin_pipe = NULL;
+	DPRINTF(1, (CE_CONT, "!%s: %s: 2", dp->name, __func__));
+
+	ASSERT(dp->bulkout_pipe);
+	usb_pipe_close(dp->dip, dp->bulkout_pipe, USB_FLAGS_SLEEP, NULL, 0);
+	dp->bulkout_pipe = NULL;
+	DPRINTF(1, (CE_CONT, "!%s: %s: 3", dp->name, __func__));
+
+	return (USB_SUCCESS);
+}
+
+#define	FREEZE_GRACEFUL		(B_TRUE)
+#define	FREEZE_NO_GRACEFUL	(B_FALSE)
+static int
+usbgem_freeze_device(struct usbgem_dev *dp, boolean_t graceful)
+{
+	DPRINTF(0, (CE_NOTE, "!%s: %s: called", dp->name, __func__));
+
+	/* stop nic activity */
+	(void) usbgem_mac_stop(dp, MAC_STATE_DISCONNECTED, graceful);
+
+	/*
+	 * Here we free all allocated memory resources, because freeing
+	 * usb_bulk_req objects after the usb device has been
+	 * disconnected would panic the system.
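+	 * (The counterpart usbgem_alloc_memory() call in
+	 * usbgem_recover_device() reallocates these buffers once the
+	 * device is reconnected or resumed.)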
+	 */
+	(void) usbgem_free_memory(dp);
+
+	return (USB_SUCCESS);
+}
+
+static int
+usbgem_disconnect_cb(dev_info_t *dip)
+{
+	int ret;
+	struct usbgem_dev *dp;
+
+	dp = USBGEM_GET_DEV(dip);
+
+	cmn_err(CE_NOTE, "!%s: the usb device was disconnected (dp=%p)",
+	    dp->name, dp);
+
+	/* start serialize */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+
+	ret = usbgem_freeze_device(dp, FREEZE_NO_GRACEFUL);
+
+	/* end of serialize */
+	rw_exit(&dp->dev_state_lock);
+
+	return (ret);
+}
+
+static int
+usbgem_recover_device(struct usbgem_dev *dp)
+{
+	int err;
+
+	DPRINTF(0, (CE_NOTE, "!%s: %s: called", dp->name, __func__));
+
+	err = USB_SUCCESS;
+
+	/* reinitialize the usb connection */
+	usbgem_close_pipes(dp);
+	if ((err = usbgem_open_pipes(dp)) != USB_SUCCESS) {
+		goto x;
+	}
+
+	/* initialize nic state */
+	dp->mac_state = MAC_STATE_STOPPED;
+	dp->mii_state = MII_STATE_UNKNOWN;
+
+	/* allocate memory resources again */
+	if ((err = usbgem_alloc_memory(dp)) != USB_SUCCESS) {
+		goto x;
+	}
+
+	/* restart nic and recover state */
+	(void) usbgem_restart_nic(dp);
+
+	usbgem_mii_init(dp);
+
+	/* kick the housekeeping thread, which may have stopped */
+	cv_signal(&dp->link_watcher_wait_cv);
+x:
+	return (err);
+}
+
+static int
+usbgem_reconnect_cb(dev_info_t *dip)
+{
+	int err = USB_SUCCESS;
+	struct usbgem_dev *dp;
+
+	dp = USBGEM_GET_DEV(dip);
+	DPRINTF(0, (CE_CONT, "!%s: dp=%p", ddi_get_name(dip), dp));
+#ifdef notdef
+	/* check device changes after disconnect */
+	if (usb_check_same_device(dp->dip, NULL, USB_LOG_L2, -1,
+	    USB_CHK_BASIC | USB_CHK_CFG, NULL) != USB_SUCCESS) {
+		cmn_err(CE_CONT,
+		    "!%s: no or different device installed", dp->name);
+		return (DDI_SUCCESS);
+	}
+#endif
+	cmn_err(CE_NOTE, "%s: the usb device was reconnected", dp->name);
+
+	/* start serialize */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		err = usbgem_recover_device(dp);
+	}
+
+	/* end of serialize */
+	rw_exit(&dp->dev_state_lock);
+
+	return (err == USB_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
+}
+
+int
+usbgem_suspend(dev_info_t *dip)
+{
+	int err = USB_SUCCESS;
+	struct usbgem_dev *dp;
+
+	dp = USBGEM_GET_DEV(dip);
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+
+	/* start serialize */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		err = usbgem_freeze_device(dp, STOP_GRACEFUL);
+	}
+
+	/* end of serialize */
+	rw_exit(&dp->dev_state_lock);
+
+	return (err == USB_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
+}
+
+int
+usbgem_resume(dev_info_t *dip)
+{
+	int err = USB_SUCCESS;
+	struct usbgem_dev *dp;
+
+	dp = USBGEM_GET_DEV(dip);
+
+	DPRINTF(0, (CE_CONT, "!%s: %s: called", dp->name, __func__));
+#ifdef notdef
+	/* check device changes after disconnect */
+	if (usb_check_same_device(dp->dip, NULL, USB_LOG_L2, -1,
+	    USB_CHK_BASIC | USB_CHK_CFG, NULL) != USB_SUCCESS) {
+		cmn_err(CE_CONT,
+		    "!%s: no or different device installed", dp->name);
+		return (DDI_SUCCESS);
+	}
+#endif
+	/* start serialize */
+	rw_enter(&dp->dev_state_lock, RW_WRITER);
+
+	if (dp->mac_state == MAC_STATE_DISCONNECTED) {
+		err = usbgem_recover_device(dp);
+	}
+
+	/* end of serialize */
+	rw_exit(&dp->dev_state_lock);
+
+	return (err == USB_SUCCESS ? DDI_SUCCESS : DDI_FAILURE);
+}
+
+#define	USBGEM_LOCAL_DATA_SIZE(gc)	\
+	(sizeof (struct usbgem_dev) + USBGEM_MCALLOC)
+
+struct usbgem_dev *
+usbgem_do_attach(dev_info_t *dip,
+    struct usbgem_conf *gc, void *lp, int lmsize)
+{
+	struct usbgem_dev *dp;
+	int i;
+#ifdef USBGEM_CONFIG_GLDv3
+	mac_register_t *macp = NULL;
+#else
+	gld_mac_info_t *macinfo;
+	void *tmp;
+#endif
+	int ret;
+	int unit;
+	int err;
+
+	unit = ddi_get_instance(dip);
+
+	DPRINTF(2, (CE_CONT, "!usbgem%d: %s: called", unit, __func__));
+
+	/*
+	 * Allocate the soft data structure
+	 */
+	dp = kmem_zalloc(USBGEM_LOCAL_DATA_SIZE(gc), KM_SLEEP);
+	if (dp == NULL) {
+		return (NULL);
+	}
+#ifdef USBGEM_CONFIG_GLDv3
+	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
+		cmn_err(CE_WARN, "!gem%d: %s: mac_alloc failed",
+		    unit, __func__);
+		return (NULL);
+	}
+#else
+	macinfo = gld_mac_alloc(dip);
+	dp->macinfo = macinfo;
+#endif
+
+	/* link to private area */
+	dp->private = lp;
+	dp->priv_size = lmsize;
+	dp->mc_list = (struct mcast_addr *)&dp[1];
+
+	dp->dip = dip;
+	bcopy(gc->usbgc_name, dp->name, USBGEM_NAME_LEN);
+
+	/*
+	 * register with usb service
+	 */
+	if (usb_client_attach(dip, USBDRV_VERSION, 0) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "%s: %s: usb_client_attach failed",
+		    dp->name, __func__);
+		goto err_free_private;
+	}
+
+	if (usb_get_dev_data(dip, &dp->reg_data,
+	    USB_PARSE_LVL_ALL, 0) != USB_SUCCESS) {
+		dp->reg_data = NULL;
+		goto err_unregister_client;
+	}
+#ifdef USBGEM_DEBUG_LEVEL
+	usb_print_descr_tree(dp->dip, dp->reg_data);
+#endif
+
+	if (usbgem_open_pipes(dp) != USB_SUCCESS) {
+		/* failed to open pipes */
+		cmn_err(CE_WARN, "!%s: %s: failed to open pipes",
+		    dp->name, __func__);
+		goto err_unregister_client;
+	}
+
+	/*
+	 * Initialize mutexes and condition variables
+	 */
+	mutex_init(&dp->rxlock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&dp->txlock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&dp->rx_drain_cv, NULL, CV_DRIVER, NULL);
+	cv_init(&dp->tx_drain_cv, NULL, CV_DRIVER, NULL);
+	rw_init(&dp->dev_state_lock, NULL, RW_DRIVER, NULL);
+	mutex_init(&dp->link_watcher_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&dp->link_watcher_wait_cv, NULL, CV_DRIVER, NULL);
+	sema_init(&dp->hal_op_lock, 1, NULL, SEMA_DRIVER, NULL);
+	sema_init(&dp->rxfilter_lock, 1, NULL, SEMA_DRIVER, NULL);
+
+	/*
+	 * Initialize configuration
+	 */
+	dp->ugc = *gc;
+
+	dp->mtu = ETHERMTU;
+	dp->rxmode = 0;
+	dp->speed = USBGEM_SPD_10;	/* default is 10Mbps */
+	dp->full_duplex = B_FALSE;	/* default is half */
+	dp->flow_control = FLOW_CONTROL_NONE;
+
+	dp->nic_state = NIC_STATE_STOPPED;
+	dp->mac_state = MAC_STATE_STOPPED;
+	dp->mii_state = MII_STATE_UNKNOWN;
+
+	/* performance tuning parameters */
+	dp->txthr = ETHERMAX;	/* tx fifo threshold */
+	dp->txmaxdma = 16*4;	/* tx max dma burst size */
+	dp->rxthr = 128;	/* rx fifo threshold */
+	dp->rxmaxdma = 16*4;	/* rx max dma burst size */
+
+	/*
+	 * Get media mode information from the .conf file
+	 */
+	usbgem_read_conf(dp);
+
+	/* rx_buf_len depends on the MTU */
+	dp->rx_buf_len = MAXPKTBUF(dp) + dp->ugc.usbgc_rx_header_len;
+
+	/*
+	 * Reset the chip
+	 */
+	if (usbgem_hal_reset_chip(dp) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to reset the usb device",
+		    dp->name, __func__);
+		goto err_destroy_locks;
+	}
+
+	/*
+	 * HW-dependent parameter initialization
+	 */
+	if (usbgem_hal_attach_chip(dp) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to attach the usb device",
+		    dp->name, __func__);
+		goto err_destroy_locks;
+	}
+
+	/*
allocate resources */ + if (usbgem_alloc_memory(dp) != USB_SUCCESS) { + goto err_destroy_locks; + } + + DPRINTF(0, (CE_CONT, + "!%s: %02x:%02x:%02x:%02x:%02x:%02x", + dp->name, + dp->dev_addr.ether_addr_octet[0], + dp->dev_addr.ether_addr_octet[1], + dp->dev_addr.ether_addr_octet[2], + dp->dev_addr.ether_addr_octet[3], + dp->dev_addr.ether_addr_octet[4], + dp->dev_addr.ether_addr_octet[5])); + + /* copy mac address */ + dp->cur_addr = dp->dev_addr; + + /* pre-calculated tx timeout in second for performance */ + dp->bulkout_timeout = + dp->ugc.usbgc_tx_timeout / drv_usectohz(1000*1000); + +#ifdef USBGEM_CONFIG_GLDv3 + usbgem_gld3_init(dp, macp); +#else + usbgem_gld_init(dp, macinfo, ident); +#endif + + /* Probe MII phy (scan phy) */ + dp->mii_lpable = 0; + dp->mii_advert = 0; + dp->mii_exp = 0; + dp->mii_ctl1000 = 0; + dp->mii_stat1000 = 0; + + dp->mii_status_ro = 0; + dp->mii_xstatus_ro = 0; + + if (usbgem_mii_probe(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, "!%s: %s: mii_probe failed", + dp->name, __func__); + goto err_free_memory; + } + + /* mask unsupported abilities */ + dp->anadv_autoneg &= BOOLEAN(dp->mii_status & MII_STATUS_CANAUTONEG); + dp->anadv_1000fdx &= + BOOLEAN(dp->mii_xstatus & + (MII_XSTATUS_1000BASEX_FD | MII_XSTATUS_1000BASET_FD)); + dp->anadv_1000hdx &= + BOOLEAN(dp->mii_xstatus & + (MII_XSTATUS_1000BASEX | MII_XSTATUS_1000BASET)); + dp->anadv_100t4 &= BOOLEAN(dp->mii_status & MII_STATUS_100_BASE_T4); + dp->anadv_100fdx &= BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX_FD); + dp->anadv_100hdx &= BOOLEAN(dp->mii_status & MII_STATUS_100_BASEX); + dp->anadv_10fdx &= BOOLEAN(dp->mii_status & MII_STATUS_10_FD); + dp->anadv_10hdx &= BOOLEAN(dp->mii_status & MII_STATUS_10); + + if (usbgem_mii_init(dp) != USB_SUCCESS) { + cmn_err(CE_WARN, "!%s: %s: mii_init failed", + dp->name, __func__); + goto err_free_memory; + } + + /* + * initialize kstats including mii statistics + */ +#ifdef USBGEM_CONFIG_GLDv3 +#ifdef USBGEM_CONFIG_ND + usbgem_nd_setup(dp); +#endif +#else + if (usbgem_kstat_init(dp) != USB_SUCCESS) { + goto err_free_memory; + } +#endif + + /* + * Add interrupt to system. 
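+	 * (Despite the historical wording of this comment, no interrupt is
+	 * installed here; the step below registers the driver with the MAC
+	 * layer, via mac_register(9F) for GLDv3 or gld_register() for
+	 * GLD v2.)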
+	 */
+#ifdef USBGEM_CONFIG_GLDv3
+	if ((ret = mac_register(macp, &dp->mh)) != 0) {
+		cmn_err(CE_WARN, "!%s: mac_register failed, error:%d",
+		    dp->name, ret);
+		goto err_release_stats;
+	}
+	mac_free(macp);
+	macp = NULL;
+#else
+	/* gld_register corrupts driver_private */
+	tmp = ddi_get_driver_private(dip);
+	if (gld_register(dip,
+	    (char *)ddi_driver_name(dip), macinfo) != DDI_SUCCESS) {
+		cmn_err(CE_WARN, "!%s: %s: gld_register failed",
+		    dp->name, __func__);
+		ddi_set_driver_private(dip, tmp);
+		goto err_release_stats;
+	}
+	/* restore driver private */
+	ddi_set_driver_private(dip, tmp);
+#endif /* USBGEM_CONFIG_GLDv3 */
+	if (usb_register_hotplug_cbs(dip,
+	    usbgem_suspend, usbgem_resume) != USB_SUCCESS) {
+		cmn_err(CE_WARN,
+		    "!%s: %s: failed to register hotplug cbs",
+		    dp->name, __func__);
+		goto err_unregister_gld;
+	}
+
+	/* reset mii and start mii link watcher */
+	if (usbgem_mii_start(dp) != USB_SUCCESS) {
+		goto err_usbgem_mii_stop;
+	}
+
+	/* start tx watchdog watcher */
+	if (usbgem_tx_watcher_start(dp)) {
+		goto err_usbgem_mii_stop;
+	}
+
+	ddi_set_driver_private(dip, (caddr_t)dp);
+
+	DPRINTF(2, (CE_CONT, "!%s: %s: return: success", dp->name, __func__));
+
+	return (dp);
+
+err_usbgem_mii_stop:
+	usbgem_mii_stop(dp);
+
+err_unregister_hotplug:
+	usb_unregister_hotplug_cbs(dip);
+
+err_unregister_gld:
+#ifdef USBGEM_CONFIG_GLDv3
+	mac_unregister(dp->mh);
+#else
+	gld_unregister(macinfo);
+#endif
+
+err_release_stats:
+#ifdef USBGEM_CONFIG_GLDv3
+#ifdef USBGEM_CONFIG_ND
+	/* release NDD resources */
+	usbgem_nd_cleanup(dp);
+#endif
+#else
+	kstat_delete(dp->ksp);
+#endif
+
+err_free_memory:
+	usbgem_free_memory(dp);
+
+err_destroy_locks:
+	cv_destroy(&dp->tx_drain_cv);
+	cv_destroy(&dp->rx_drain_cv);
+	mutex_destroy(&dp->txlock);
+	mutex_destroy(&dp->rxlock);
+	rw_destroy(&dp->dev_state_lock);
+	mutex_destroy(&dp->link_watcher_lock);
+	cv_destroy(&dp->link_watcher_wait_cv);
+	sema_destroy(&dp->hal_op_lock);
+	sema_destroy(&dp->rxfilter_lock);
+
+err_close_pipes:
+	(void) usbgem_close_pipes(dp);
+
+err_unregister_client:
+	usb_client_detach(dp->dip, dp->reg_data);
+
+err_free_private:
+#ifdef USBGEM_CONFIG_GLDv3
+	if (macp) {
+		mac_free(macp);
+	}
+#else
+	gld_mac_free(macinfo);
+#endif
+	kmem_free((caddr_t)dp, USBGEM_LOCAL_DATA_SIZE(gc));
+
+	return (NULL);
+}
+
+int
+usbgem_do_detach(dev_info_t *dip)
+{
+	struct usbgem_dev	*dp;
+
+	dp = USBGEM_GET_DEV(dip);
+
+#ifdef USBGEM_CONFIG_GLDv3
+	/* unregister with gld v3 */
+	if (mac_unregister(dp->mh) != DDI_SUCCESS) {
+		return (DDI_FAILURE);
+	}
+#else
+	/* unregister with gld v2 */
+	if (gld_unregister(dp->macinfo) != DDI_SUCCESS) {
+		return (DDI_FAILURE);
+	}
+#endif
+	/* unregister with hotplug service */
+	usb_unregister_hotplug_cbs(dip);
+
+	/* stop tx watchdog watcher */
+	usbgem_tx_watcher_stop(dp);
+
+	/* stop the link manager */
+	usbgem_mii_stop(dp);
+
+	/* unregister with usb service */
+	(void) usbgem_free_memory(dp);
+	(void) usbgem_close_pipes(dp);
+	usb_client_detach(dp->dip, dp->reg_data);
+	dp->reg_data = NULL;
+
+	/* unregister with kernel statistics */
+#ifdef USBGEM_CONFIG_GLDv3
+#ifdef USBGEM_CONFIG_ND
+	/* release ndd resources */
+	usbgem_nd_cleanup(dp);
+#endif
+#else
+	/* destroy kstat objects */
+	kstat_delete(dp->ksp);
+#endif
+
+	/* release locks and condition variables */
+	mutex_destroy(&dp->txlock);
+	mutex_destroy(&dp->rxlock);
+	cv_destroy(&dp->tx_drain_cv);
+	cv_destroy(&dp->rx_drain_cv);
+	rw_destroy(&dp->dev_state_lock);
+	mutex_destroy(&dp->link_watcher_lock);
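+
+	/*
+	 * Everything that could wait on the remaining CVs and semaphores
+	 * (the tx watcher, the link watcher and any ioctl callers) has been
+	 * stopped above, so it is safe to destroy them now.
+	 */
+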
cv_destroy(&dp->link_watcher_wait_cv); + sema_destroy(&dp->hal_op_lock); + sema_destroy(&dp->rxfilter_lock); + + /* release basic memory resources */ +#ifndef USBGEM_CONFIG_GLDv3 + gld_mac_free(dp->macinfo); +#endif + kmem_free((caddr_t)(dp->private), dp->priv_size); + kmem_free((caddr_t)dp, USBGEM_LOCAL_DATA_SIZE(&dp->ugc)); + + DPRINTF(2, (CE_CONT, "!%s: %s: return: success", + ddi_driver_name(dip), __func__)); + + return (DDI_SUCCESS); +} + +int +usbgem_mod_init(struct dev_ops *dop, char *name) +{ +#ifdef USBGEM_CONFIG_GLDv3 + major_t major; + major = ddi_name_to_major(name); + if (major == DDI_MAJOR_T_NONE) { + return (DDI_FAILURE); + } + mac_init_ops(dop, name); +#endif + return (DDI_SUCCESS); +} + +void +usbgem_mod_fini(struct dev_ops *dop) +{ +#ifdef USBGEM_CONFIG_GLDv3 + mac_fini_ops(dop); +#endif +} + +int +usbgem_quiesce(dev_info_t *dip) +{ + struct usbgem_dev *dp; + + dp = USBGEM_GET_DEV(dip); + + ASSERT(dp != NULL); + + if (dp->mac_state != MAC_STATE_DISCONNECTED && + dp->mac_state != MAC_STATE_STOPPED) { + if (usbgem_hal_stop_chip(dp) != USB_SUCCESS) { + (void) usbgem_hal_reset_chip(dp); + } + } + + /* devo_quiesce() must return DDI_SUCCESS always */ + return (DDI_SUCCESS); +} diff --git a/usr/src/uts/common/io/usbgem/usbgem.h b/usr/src/uts/common/io/usbgem/usbgem.h new file mode 100644 index 0000000000..80b89a260e --- /dev/null +++ b/usr/src/uts/common/io/usbgem/usbgem.h @@ -0,0 +1,428 @@ +/* + * usbgem.h: General USB to Ethernet MAC driver framework + * @(#)usbgem.h 1.4 12/02/09 + * (C) Copyright 2003-2009 Masayuki Murayama KHF04453@nifty.ne.jp + */ + +#ifndef __USBGEM_H__ +#define __USBGEM_H__ + +#pragma ident "@(#)usbgem.h 1.4 12/02/09" + +#ifdef USBGEM_CONFIG_GLDv3 +#include <sys/mac.h> +#ifndef MAC_VERSION +#include <sys/mac_provider.h> +#endif +#include <sys/mac_ether.h> +#else +#include <sys/gld.h> +#endif /* GLDv3 */ + +/* + * Useful macros and typedefs + */ +#define USBGEM_NAME_LEN 32 + +#define USBGEM_TX_TIMEOUT (drv_usectohz(3*1000000)) +#define USBGEM_TX_TIMEOUT_INTERVAL (drv_usectohz(1*1000000)) +#define USBGEM_LINK_WATCH_INTERVAL (drv_usectohz(1*1000000)) + +/* general return code */ +#define USBGEM_SUCCESS 0 +#define USBGEM_FAILURE 1 + +/* return code of usbgem_tx_done */ +#define INTR_RESTART_TX 0x80000000U + +struct usbgem_stats { + uint32_t intr; + + uint32_t crc; + uint32_t errrcv; + uint32_t overflow; + uint32_t frame; + uint32_t missed; + uint32_t runt; + uint32_t frame_too_long; + uint32_t norcvbuf; + uint32_t sqe; + + uint32_t collisions; + uint32_t first_coll; + uint32_t multi_coll; + uint32_t excoll; + uint32_t xmit_internal_err; + uint32_t nocarrier; + uint32_t defer; + uint32_t errxmt; + uint32_t underflow; + uint32_t xmtlatecoll; + uint32_t noxmtbuf; + uint32_t jabber; + + + uint64_t rbytes; + uint64_t obytes; + uint64_t rpackets; + uint64_t opackets; + uint32_t rbcast; + uint32_t obcast; + uint32_t rmcast; + uint32_t omcast; + uint32_t rcv_internal_err; +}; + +struct mcast_addr { + struct ether_addr addr; + uint32_t hash; +}; + +#define USBGEM_MAXMC 64 +#define USBGEM_MCALLOC (sizeof(struct mcast_addr) * USBGEM_MAXMC) + +#define SLOT(dp, n) ((n) % (dp)->ugc.usbgc_tx_list_max) + +/* + * mac soft state + */ +struct usbgem_dev { + dev_info_t *dip; +#ifdef USBGEM_CONFIG_GLDv3 + mac_handle_t mh; +#else + void *macinfo; /* opaque handle for upper layer */ +#endif + char name[USBGEM_NAME_LEN]; + + /* pointer to usb private data */ + usb_client_dev_data_t *reg_data; + + /* usb handles */ + usb_pipe_handle_t default_pipe; + usb_pipe_handle_t bulkin_pipe; + 
usb_pipe_handle_t bulkout_pipe; + usb_pipe_handle_t intr_pipe; + + /* usb endpoints */ + usb_ep_descr_t *ep_default; + usb_ep_descr_t *ep_bulkin; + usb_ep_descr_t *ep_bulkout; + usb_ep_descr_t *ep_intr; + + /* usb policies */ + usb_pipe_policy_t policy_default; + usb_pipe_policy_t policy_bulkin; + usb_pipe_policy_t policy_bulkout; + usb_pipe_policy_t policy_interrupt; + + /* MAC address information */ + struct ether_addr cur_addr; + struct ether_addr dev_addr; + + /* RX state and resource management */ + kmutex_t rxlock; + int rx_busy_cnt; + boolean_t rx_active; + kcondvar_t rx_drain_cv; + + /* RX buffer management */ + int rx_buf_len; + + /* TX state and resource management */ + kmutex_t txlock; + int tx_busy_cnt; + usb_bulk_req_t *tx_free_list; + kcondvar_t tx_drain_cv; + clock_t tx_start_time; + int bulkout_timeout; /* in second */ + int tx_max_packets; + int tx_seq_num; + int tx_intr_pended; + + /* NIC state from OS view */ + int nic_state; +#define NIC_STATE_UNKNOWN 0 +#define NIC_STATE_STOPPED 1 +#define NIC_STATE_INITIALIZED 2 +#define NIC_STATE_ONLINE 3 + + /* MAC state from hardware view */ + int mac_state; +#define MAC_STATE_DISCONNECTED 0 /* it includes suspended state too */ +#define MAC_STATE_STOPPED 1 /* powered up / buf not initialized */ +#define MAC_STATE_INITIALIZED 2 /* initialized */ +#define MAC_STATE_ONLINE 3 /* working correctly */ +#define MAC_STATE_ERROR 4 /* need to restart nic */ + + clock_t fatal_error; + + /* robustness: timer and watchdog */ + uint_t tx_watcher_stop; + kt_did_t tx_watcher_did; + kcondvar_t tx_watcher_cv; + kmutex_t tx_watcher_lock; + clock_t tx_watcher_timeout; + clock_t tx_watcher_interval; + + /* MII mamagement */ + boolean_t anadv_autoneg:1; + boolean_t anadv_1000fdx:1; + boolean_t anadv_1000hdx:1; + boolean_t anadv_100t4:1; + boolean_t anadv_100fdx:1; + boolean_t anadv_100hdx:1; + boolean_t anadv_10fdx:1; + boolean_t anadv_10hdx:1; + boolean_t anadv_1000t_ms:2; + boolean_t anadv_pause:1; + boolean_t anadv_asmpause:1; + boolean_t mii_advert_ro:1; + + boolean_t full_duplex:1; + int speed:3; +#define USBGEM_SPD_10 0 +#define USBGEM_SPD_100 1 +#define USBGEM_SPD_1000 2 +#define USBGEM_SPD_NUM 3 + unsigned int flow_control:2; +#define FLOW_CONTROL_NONE 0 +#define FLOW_CONTROL_SYMMETRIC 1 +#define FLOW_CONTROL_TX_PAUSE 2 +#define FLOW_CONTROL_RX_PAUSE 3 + + boolean_t mii_supress_msg:1; + + uint32_t mii_phy_id; + uint16_t mii_status; + uint16_t mii_advert; + uint16_t mii_lpable; + uint16_t mii_exp; + uint16_t mii_ctl1000; + uint16_t mii_stat1000; + uint16_t mii_xstatus; + int8_t mii_phy_addr; /* must be signed */ + + uint16_t mii_status_ro; + uint16_t mii_xstatus_ro; + + int mii_state; +#define MII_STATE_UNKNOWN 0 +#define MII_STATE_RESETTING 1 +#define MII_STATE_AUTONEGOTIATING 2 +#define MII_STATE_AN_DONE 3 +#define MII_STATE_MEDIA_SETUP 4 +#define MII_STATE_LINKUP 5 +#define MII_STATE_LINKDOWN 6 + + clock_t mii_last_check; /* in tick */ + clock_t mii_timer; /* in tick */ +#define MII_RESET_TIMEOUT drv_usectohz(1000*1000) +#define MII_AN_TIMEOUT drv_usectohz(5000*1000) +#define MII_LINKDOWN_TIMEOUT drv_usectohz(10000*1000) + + clock_t mii_interval; /* in tick */ + clock_t linkup_delay; /* in tick */ + + uint_t link_watcher_stop; + kt_did_t link_watcher_did; + kcondvar_t link_watcher_wait_cv; + kmutex_t link_watcher_lock; + + krwlock_t dev_state_lock; /* mac_state and nic_state */ + ksema_t hal_op_lock; /* serialize hw operations */ + ksema_t drv_op_lock; /* hotplug op lock */ + + /* multcast list */ + ksema_t rxfilter_lock; + int mc_count; + 
int mc_count_req; + struct mcast_addr *mc_list; + int rxmode; +#define RXMODE_PROMISC 0x01 +#define RXMODE_ALLMULTI_REQ 0x02 +#define RXMODE_MULTI_OVF 0x04 +#define RXMODE_ENABLE 0x08 +#define RXMODE_ALLMULTI (RXMODE_ALLMULTI_REQ | RXMODE_MULTI_OVF) +#define RXMODE_BITS \ + "\020" \ + "\004ENABLE" \ + "\003MULTI_OVF" \ + "\002ALLMULTI_REQ" \ + "\001PROMISC" + + /* statistcs */ + struct usbgem_stats stats; + + /* pointer to local structure */ + void *private; + int priv_size; + + /* configuration */ + struct usbgem_conf { + /* name */ + char usbgc_name[USBGEM_NAME_LEN]; + int usbgc_ppa; + + /* specification on usb */ + int usbgc_ifnum; /* interface number */ + int usbgc_alt; /* alternate */ + + /* specification on tx engine */ + int usbgc_tx_list_max; + + /* specification on rx engine */ + int usbgc_rx_header_len; + int usbgc_rx_list_max; + + /* time out parameters */ + clock_t usbgc_tx_timeout; + clock_t usbgc_tx_timeout_interval; + + /* flow control */ + int usbgc_flow_control; + + /* MII timeout parameters */ + clock_t usbgc_mii_linkdown_timeout; + clock_t usbgc_mii_link_watch_interval; + clock_t usbgc_mii_reset_timeout; + + clock_t usbgc_mii_an_watch_interval; + clock_t usbgc_mii_an_timeout; + clock_t usbgc_mii_an_wait; + clock_t usbgc_mii_an_delay; + + /* MII configuration */ + int usbgc_mii_addr_min; + int usbgc_mii_linkdown_action; + int usbgc_mii_linkdown_timeout_action; +#define MII_ACTION_NONE 0 +#define MII_ACTION_RESET 1 +#define MII_ACTION_RSA 2 + boolean_t usbgc_mii_dont_reset:1; + boolean_t usbgc_mii_an_oneshot:1; + boolean_t usbgc_mii_hw_link_detection:1; + boolean_t usbgc_mii_stop_mac_on_linkdown:1; + uint16_t usbgc_mii_an_cmd; + + /* I/O methods */ + + /* mac operation */ + int (*usbgc_attach_chip)(struct usbgem_dev *dp); + int (*usbgc_reset_chip)(struct usbgem_dev *dp); + int (*usbgc_init_chip)(struct usbgem_dev *dp); + int (*usbgc_start_chip)(struct usbgem_dev *dp); + int (*usbgc_stop_chip)(struct usbgem_dev *dp); + uint32_t (*usbgc_multicast_hash)(struct usbgem_dev *dp, + const uint8_t *); + int (*usbgc_set_rx_filter)(struct usbgem_dev *dp); + int (*usbgc_set_media)(struct usbgem_dev *dp); + int (*usbgc_get_stats)(struct usbgem_dev *dp); + void (*usbgc_interrupt)(struct usbgem_dev *dp, mblk_t *mp); + + /* packet manupilation */ + mblk_t *(*usbgc_tx_make_packet)(struct usbgem_dev *dp, mblk_t *mp); + mblk_t *(*usbgc_rx_make_packet)(struct usbgem_dev *dp, mblk_t *mp); + /* mii operations */ + int (*usbgc_mii_probe)(struct usbgem_dev *dp); + int (*usbgc_mii_init)(struct usbgem_dev *dp); + int (*usbgc_mii_config)(struct usbgem_dev *dp, int *errp); + uint16_t (*usbgc_mii_read)(struct usbgem_dev *dp, uint_t reg, int *errp); + void (*usbgc_mii_write)(struct usbgem_dev *dp, uint_t reg, uint16_t val, int *errp); + + /* jumbo frame */ + int usbgc_max_mtu; + int usbgc_default_mtu; + int usbgc_min_mtu; + } ugc; + + int misc_flag; +#define USBGEM_VLAN 0x0001 + timeout_id_t intr_watcher_id; + + /* buffer size */ + uint_t mtu; + + /* performance tuning parameters */ + uint_t txthr; /* tx fifo threshoold */ + uint_t txmaxdma; /* tx max dma burst size */ + uint_t rxthr; /* rx fifo threshoold */ + uint_t rxmaxdma; /* tx max dma burst size */ + + /* kstat stuff */ + kstat_t *ksp; + + /* ndd stuff */ + caddr_t nd_data_p; + caddr_t nd_arg_p; + +#ifdef USBGEM_DEBUG_LEVEL + int tx_cnt; +#endif +}; + +/* + * Exported functions + */ +int usbgem_ctrl_out(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *bp, int size); + +int usbgem_ctrl_in(struct 
usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *bp, int size); + +int usbgem_ctrl_out_val(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + uint32_t v); + +int usbgem_ctrl_in_val(struct usbgem_dev *dp, + uint8_t reqt, uint8_t req, uint16_t val, uint16_t ix, uint16_t len, + void *valp); + +void usbgem_generate_macaddr(struct usbgem_dev *, uint8_t *); +boolean_t usbgem_get_mac_addr_conf(struct usbgem_dev *); +int usbgem_mii_probe_default(struct usbgem_dev *); +int usbgem_mii_init_default(struct usbgem_dev *); +int usbgem_mii_config_default(struct usbgem_dev *, int *errp); +void usbgem_mii_update_link(struct usbgem_dev *); +void usbgem_restart_tx(struct usbgem_dev *); +boolean_t usbgem_tx_done(struct usbgem_dev *, int); +void usbgem_receive(struct usbgem_dev *); +struct usbgem_dev *usbgem_do_attach(dev_info_t *, + struct usbgem_conf *, void *, int); +int usbgem_do_detach(dev_info_t *); + +uint32_t usbgem_ether_crc_le(const uint8_t *addr); +uint32_t usbgem_ether_crc_be(const uint8_t *addr); + +int usbgem_resume(dev_info_t *); +int usbgem_suspend(dev_info_t *); +int usbgem_quiesce(dev_info_t *); + +#ifdef USBGEM_CONFIG_GLDv3 +#if DEVO_REV < 4 +#define USBGEM_STREAM_OPS(dev_ops, attach, detach) \ + DDI_DEFINE_STREAM_OPS(dev_ops, nulldev, nulldev, attach, detach, \ + nodev, NULL, D_MP, NULL) +#else +#define USBGEM_STREAM_OPS(dev_ops, attach, detach) \ + DDI_DEFINE_STREAM_OPS(dev_ops, nulldev, nulldev, attach, detach, \ + nodev, NULL, D_MP, NULL, usbgem_quiesce) +#endif +#else +#define usbgem_getinfo gld_getinfo +#define usbgem_open gld_open +#define usbgem_close gld_close +#define usbgem_wput gld_wput +#define usbgem_wsrv gld_wsrv +#define usbgem_rsrv gld_rsrv +#define usbgem_power NULL +#endif +int usbgem_mod_init(struct dev_ops *, char *); +void usbgem_mod_fini(struct dev_ops *); + +#define USBGEM_GET_DEV(dip) \ + ((struct usbgem_dev *)(ddi_get_driver_private(dip))) + +#endif /* __USBGEM_H__ */ diff --git a/usr/src/uts/common/io/usbgem/usbgem_mii.h b/usr/src/uts/common/io/usbgem/usbgem_mii.h new file mode 100644 index 0000000000..2b4176a340 --- /dev/null +++ b/usr/src/uts/common/io/usbgem/usbgem_mii.h @@ -0,0 +1,242 @@ +/* + * gem_mii.h: mii header for gem + * + * Copyright (c) 2002-2007 Masayuki Murayama. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the author nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ +#pragma ident "@(#)gem_mii.h 1.4 07/11/30" + +/* + * gem_mii.h : MII registers + */ +#ifndef _GEM_MII_H_ +#define _GEM_MII_H_ + +#ifdef GEM_CONFIG_GLDv3 +#include <sys/miiregs.h> +#else +#define MII_CONTROL 0 +#define MII_STATUS 1 +#define MII_PHYIDH 2 +#define MII_PHYIDL 3 +#define MII_AN_ADVERT 4 +#define MII_AN_LPABLE 5 +#define MII_AN_EXPANSION 6 +#define MII_AN_NXTPGXMIT 7 +#endif /* GEM_CONFIG_GLDv3 */ + +#define MII_AN_LPANXT 8 +#define MII_MS_CONTROL 9 +#define MII_MS_STATUS 10 +#define MII_XSTATUS 15 + +/* for 1000BaseT support */ +#define MII_1000TC MII_MS_CONTROL +#define MII_1000TS MII_MS_STATUS +#ifndef GEM_CONFIG_GLDv3 +#define MII_CONTROL_RESET 0x8000 +#define MII_CONTROL_LOOPBACK 0x4000 +#define MII_CONTROL_100MB 0x2000 +#define MII_CONTROL_ANE 0x1000 +#define MII_CONTROL_PWRDN 0x0800 +#define MII_CONTROL_ISOLATE 0x0400 +#define MII_CONTROL_RSAN 0x0200 +#define MII_CONTROL_FDUPLEX 0x0100 +#define MII_CONTROL_COLTST 0x0080 +#endif /* !GEM_CONFIG_GLDv3 */ +#define MII_CONTROL_SPEED 0x2040 + +#define MII_CONTROL_10MB 0x0000 +#define MII_CONTROL_1000MB 0x0040 + +#define MII_CONTROL_BITS \ + "\020" \ + "\020RESET" \ + "\017LOOPBACK" \ + "\016100MB" \ + "\015ANE" \ + "\014PWRDN" \ + "\013ISOLATE" \ + "\012RSAN" \ + "\011FDUPLEX" \ + "\010COLTST" \ + "\0071000M" +#ifndef GEM_CONFIG_GLDv3 +#define MII_STATUS_100_BASE_T4 0x8000 +#define MII_STATUS_100_BASEX_FD 0x4000 +#define MII_STATUS_100_BASEX 0x2000 +#define MII_STATUS_10_FD 0x1000 +#define MII_STATUS_10 0x0800 +#define MII_STATUS_MFPRMBLSUPR 0x0040 +#define MII_STATUS_ANDONE 0x0020 +#define MII_STATUS_REMFAULT 0x0010 +#define MII_STATUS_CANAUTONEG 0x0008 +#define MII_STATUS_LINKUP 0x0004 +#define MII_STATUS_JABBERING 0x0002 +#define MII_STATUS_EXTENDED 0x0001 +#endif /* !GEM_CONFIG_GLDv3 */ +#define MII_STATUS_XSTATUS 0x0100 +#define MII_STATUS_100_BASE_T2_FD 0x0400 +#define MII_STATUS_100_BASE_T2 0x0200 + +#define MII_STATUS_ABILITY_TECH \ + (MII_STATUS_100_BASE_T4 | \ + MII_STATUS_100_BASEX_FD | \ + MII_STATUS_100_BASEX | \ + MII_STATUS_10 | \ + MII_STATUS_10_FD) + + +#define MII_STATUS_BITS \ + "\020" \ + "\020100_BASE_T4" \ + "\017100_BASEX_FD" \ + "\016100_BASEX" \ + "\01510_BASE_FD" \ + "\01410_BASE" \ + "\013100_BASE_T2_FD" \ + "\012100_BASE_T2" \ + "\011XSTATUS" \ + "\007MFPRMBLSUPR" \ + "\006ANDONE" \ + "\005REMFAULT" \ + "\004CANAUTONEG" \ + "\003LINKUP" \ + "\002JABBERING" \ + "\001EXTENDED" +#ifndef GEM_CONFIG_GLDv3 +#define MII_AN_ADVERT_NP 0x8000 +#define MII_AN_ADVERT_REMFAULT 0x2000 +#define MII_AN_ADVERT_SELECTOR 0x001f +#endif /* !GEM_CONFIG_GLDv3 */ + +#define MII_ABILITY_ASM_DIR 0x0800 /* for annex 28B */ +#ifndef MII_ABILITY_PAUSE +#define MII_ABILITY_PAUSE 0x0400 /* for IEEE 802.3x */ +#endif +#ifndef GEM_CONFIG_GLDv3 +#define MII_ABILITY_100BASE_T4 0x0200 +#define MII_ABILITY_100BASE_TX_FD 0x0100 +#define MII_ABILITY_100BASE_TX 0x0080 +#define MII_ABILITY_10BASE_T_FD 0x0040 +#define MII_ABILITY_10BASE_T 0x0020 +#endif /* !GEM_CONFIG_GLDv3 */ + +#define 
MII_AN_LPABLE_NP 0x8000 + +#define MII_ABILITY_TECH \ + (MII_ABILITY_100BASE_T4 | \ + MII_ABILITY_100BASE_TX_FD | \ + MII_ABILITY_100BASE_TX | \ + MII_ABILITY_10BASE_T | \ + MII_ABILITY_10BASE_T_FD) + +#define MII_ABILITY_ALL \ + (MII_AN_ADVERT_REMFAULT | \ + MII_ABILITY_ASM_DIR | \ + MII_ABILITY_PAUSE | \ + MII_ABILITY_TECH) + + +#define MII_ABILITY_BITS \ + "\020" \ + "\016REMFAULT" \ + "\014ASM_DIR" \ + "\013PAUSE" \ + "\012100BASE_T4" \ + "\011100BASE_TX_FD" \ + "\010100BASE_TX" \ + "\00710BASE_T_FD" \ + "\00610BASE_T" +#ifndef GEM_CONFIG_GLDv3 +#define MII_AN_EXP_PARFAULT 0x0010 +#define MII_AN_EXP_LPCANNXTP 0x0008 +#define MII_AN_EXP_CANNXTPP 0x0004 +#define MII_AN_EXP_PAGERCVD 0x0002 +#define MII_AN_EXP_LPCANAN 0x0001 +#endif /* !GEM_CONFIG_GLDv3 */ + +#define MII_AN_EXP_BITS \ + "\020" \ + "\005PARFAULT" \ + "\004LPCANNXTP" \ + "\003CANNXTPP" \ + "\002PAGERCVD" \ + "\001LPCANAN" + +#define MII_1000TC_TESTMODE 0xe000 +#define MII_1000TC_CFG_EN 0x1000 +#define MII_1000TC_CFG_VAL 0x0800 +#define MII_1000TC_PORTTYPE 0x0400 +#define MII_1000TC_ADV_FULL 0x0200 +#define MII_1000TC_ADV_HALF 0x0100 + +#define MII_1000TC_BITS \ + "\020" \ + "\015CFG_EN" \ + "\014CFG_VAL" \ + "\013PORTTYPE" \ + "\012FULL" \ + "\011HALF" + +#define MII_1000TS_CFG_FAULT 0x8000 +#define MII_1000TS_CFG_MASTER 0x4000 +#define MII_1000TS_LOCALRXOK 0x2000 +#define MII_1000TS_REMOTERXOK 0x1000 +#define MII_1000TS_LP_FULL 0x0800 +#define MII_1000TS_LP_HALF 0x0400 + +#define MII_1000TS_BITS \ + "\020" \ + "\020CFG_FAULT" \ + "\017CFG_MASTER" \ + "\014CFG_LOCALRXOK" \ + "\013CFG_REMOTERXOK" \ + "\012LP_FULL" \ + "\011LP_HALF" + +#define MII_XSTATUS_1000BASEX_FD 0x8000 +#define MII_XSTATUS_1000BASEX 0x4000 +#define MII_XSTATUS_1000BASET_FD 0x2000 +#define MII_XSTATUS_1000BASET 0x1000 + +#define MII_XSTATUS_BITS \ + "\020" \ + "\0201000BASEX_FD" \ + "\0171000BASEX" \ + "\0161000BASET_FD" \ + "\0151000BASET" + +#define MII_READ_CMD(p, r) \ + ((6<<(18+5+5)) | ((p)<<(18+5)) | ((r)<<18)) + +#define MII_WRITE_CMD(p, r, v) \ + ((5<<(18+5+5)) | ((p)<<(18+5)) | ((r)<<18) | (2 << 16) | (v)) + +#endif /* _GEM_MII_H_ */ diff --git a/usr/src/uts/common/io/vnd/frameio.c b/usr/src/uts/common/io/vnd/frameio.c new file mode 100644 index 0000000000..e4e700fa12 --- /dev/null +++ b/usr/src/uts/common/io/vnd/frameio.c @@ -0,0 +1,464 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */
+
+/*
+ * Frame I/O utility functions
+ */
+
+#include <sys/frameio.h>
+
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+#include <sys/inttypes.h>
+
+static kmem_cache_t *frameio_cache;
+
+int
+frameio_init(void)
+{
+	frameio_cache = kmem_cache_create("frameio_cache",
+	    sizeof (frameio_t) + sizeof (framevec_t) * FRAMEIO_NVECS_MAX,
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	if (frameio_cache == NULL)
+		return (1);
+
+	return (0);
+}
+
+void
+frameio_fini(void)
+{
+	if (frameio_cache != NULL)
+		kmem_cache_destroy(frameio_cache);
+}
+
+frameio_t *
+frameio_alloc(int kmflags)
+{
+	return (kmem_cache_alloc(frameio_cache, kmflags));
+}
+
+void
+frameio_free(frameio_t *fio)
+{
+	kmem_cache_free(frameio_cache, fio);
+}
+
+/*
+ * Ensure that we don't see any garbage in the framevecs that we're nominally
+ * supposed to work with. Specifically, we want to make sure that the buflen
+ * and the address are not zero.
+ */
+static int
+frameio_hdr_check_vecs(frameio_t *fio)
+{
+	int i;
+	for (i = 0; i < fio->fio_nvecs; i++)
+		if (fio->fio_vecs[i].fv_buf == NULL ||
+		    fio->fio_vecs[i].fv_buflen == 0)
+			return (EINVAL);
+
+	return (0);
+}
+
+/*
+ * We have to copy in framevec32_t's. To work around the data model issues
+ * without copying the memory twice, we first copy the framevec32_t data into
+ * the standard fio_vec space. Next we work backwards, copying a given
+ * framevec32_t to a temporary framevec_t and then overwriting the
+ * frameio_t's data. Note that it is important that we do this in reverse so
+ * as to ensure that we don't clobber data, as the framevec_t is larger than
+ * the framevec32_t.
+ */
+static int
+frameio_hdr_copyin_ilp32(frameio_t *fio, const void *addr)
+{
+	framevec32_t *vec32p;
+	framevec_t fv;
+	int i;
+
+	vec32p = (framevec32_t *)&fio->fio_vecs[0];
+
+	if (ddi_copyin(addr, vec32p, sizeof (framevec32_t) * fio->fio_nvecs,
+	    0) != 0)
+		return (EFAULT);
+
+	for (i = fio->fio_nvecs - 1; i >= 0; i--) {
+		fv.fv_buf = (void *)(uintptr_t)vec32p[i].fv_buf;
+		fv.fv_buflen = vec32p[i].fv_buflen;
+		fv.fv_actlen = vec32p[i].fv_actlen;
+		fio->fio_vecs[i].fv_buf = fv.fv_buf;
+		fio->fio_vecs[i].fv_buflen = fv.fv_buflen;
+		fio->fio_vecs[i].fv_actlen = fv.fv_actlen;
+	}
+
+	return (frameio_hdr_check_vecs(fio));
+}
+
+/*
+ * Copy in a frame I/O header into fio, which has space for up to max_vecs
+ * vectors. If the frameio contains more vectors than that, EOVERFLOW is
+ * returned. mode should contain information about the data model.
+ */
+int
+frameio_hdr_copyin(frameio_t *fio, int max_vecs, const void *addr, uint_t mode)
+{
+	int model = ddi_model_convert_from(mode & FMODELS);
+	int cpf = mode & FKIOCTL ? FKIOCTL : 0;
+	size_t fsize = model == DDI_MODEL_ILP32 ?
+	    sizeof (frameio32_t) : sizeof (frameio_t);
+
+	/*
+	 * The start of the header is the same in all data models for the
+	 * current version.
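+	 * (Concretely: fio_version, fio_nvpf and fio_nvecs live at the same
+	 * offsets in frameio32_t and frameio_t; only the framevec arrays
+	 * that follow differ between the two data models.)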
+	 */
+	if (ddi_copyin(addr, fio, fsize, cpf) != 0)
+		return (EFAULT);
+
+	if (fio->fio_version != FRAMEIO_VERSION_ONE)
+		return (EINVAL);
+
+	if (fio->fio_nvecs > FRAMEIO_NVECS_MAX || fio->fio_nvecs == 0)
+		return (EINVAL);
+
+	if (fio->fio_nvpf == 0)
+		return (EINVAL);
+
+	if (fio->fio_nvecs % fio->fio_nvpf != 0)
+		return (EINVAL);
+
+	if (fio->fio_nvecs > max_vecs)
+		return (EOVERFLOW);
+
+	addr = (void *)((uintptr_t)addr + fsize);
+	if (model == DDI_MODEL_ILP32) {
+		if (cpf != 0)
+			return (EINVAL);
+		return (frameio_hdr_copyin_ilp32(fio, addr));
+	}
+
+	if (ddi_copyin(addr, &fio->fio_vecs[0],
+	    sizeof (framevec_t) * fio->fio_nvecs, cpf) != 0)
+		return (EFAULT);
+
+	return (frameio_hdr_check_vecs(fio));
+}
+
+static mblk_t *
+frameio_allocb(size_t sz)
+{
+	mblk_t *mp;
+
+	mp = allocb(sz, 0);
+	if (mp == NULL)
+		return (NULL);
+
+	mp->b_datap->db_type = M_DATA;
+	return (mp);
+}
+
+static int
+framevec_mblk_read(framevec_t *fv, mblk_t **mpp, int cpf)
+{
+	mblk_t *mp;
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	mp = frameio_allocb(fv->fv_buflen);
+	if (mp == NULL)
+		return (EAGAIN);
+
+	if (ddi_copyin(fv->fv_buf, mp->b_wptr, fv->fv_buflen,
+	    cpf) != 0) {
+		freemsg(mp);
+		return (EFAULT);
+	}
+
+	mp->b_wptr += fv->fv_buflen;
+	*mpp = mp;
+	return (0);
+}
+
+/*
+ * Read a set of frame vectors that make up a single message boundary and
+ * return that as a single message in *mpp that consists of multiple data
+ * parts.
+ */
+static int
+frameio_mblk_read(frameio_t *fio, framevec_t *fv, mblk_t **mpp, int cpf)
+{
+	int nparts = fio->fio_nvpf;
+	int part, error;
+	mblk_t *mp;
+
+	*mpp = NULL;
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	/*
+	 * Construct the frame one part at a time.
+	 */
+	for (part = 0; part < nparts; part++) {
+		error = framevec_mblk_read(fv, &mp, cpf);
+		if (error != 0) {
+			freemsg(*mpp);
+			return (error);
+		}
+
+		if (*mpp == NULL)
+			*mpp = mp;
+		else
+			linkb(*mpp, mp);
+		fv++;
+	}
+
+	return (0);
+}
+
+/*
+ * Read data from a series of frameio vectors into a message block chain. A
+ * given frameio request has a number of discrete messages divided into
+ * individual vectors based on fio->fio_nvpf. Each discrete message will
+ * be constructed into a message block chain pointed to by b_next.
+ *
+ * If we get an EAGAIN while trying to construct a given message block, what
+ * we return depends on what else we've done so far. If we have successfully
+ * completed at least one message, then we free everything else we've done so
+ * far and return that. If no messages have been completed, we return EAGAIN.
+ * If instead we encounter a different error, say EFAULT, then all of the
+ * fv_actlen entries' values are undefined.
+ */
+int
+frameio_mblk_chain_read(frameio_t *fio, mblk_t **mpp, int *nvecs, int cpf)
+{
+	int error = ENOTSUP;
+	int nframes = fio->fio_nvecs / fio->fio_nvpf;
+	int frame;
+	framevec_t *fv;
+	mblk_t *mp, *bmp = NULL;
+
+	/*
+	 * Protect against bogus kernel subsystems.
+	 */
+	VERIFY(fio->fio_nvecs > 0);
+	VERIFY(fio->fio_nvecs % fio->fio_nvpf == 0);
+
+	*mpp = NULL;
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	fv = &fio->fio_vecs[0];
+	for (frame = 0; frame < nframes; frame++) {
+		error = frameio_mblk_read(fio, fv, &mp, cpf);
+		if (error != 0)
+			goto failed;
+
+		if (bmp != NULL)
+			bmp->b_next = mp;
+		else
+			*mpp = mp;
+		bmp = mp;
+		/* advance to the vectors of the next frame */
+		fv += fio->fio_nvpf;
+	}
+
+	*nvecs = nframes;
+	return (0);
+failed:
+	/*
+	 * On EAGAIN we've already taken care of making sure that we have no
+	 * leftover messages, i.e. they were never linked in.
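+	 * From a caller's perspective this means: a return of zero with
+	 * *nvecs smaller than the number of requested frames is a short
+	 * (but successful) read, while EAGAIN means that not even a single
+	 * frame could be assembled.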
+	 */
+	if (error == EAGAIN) {
+		if (frame != 0) {
+			error = 0;
+			ASSERT(*mpp != NULL);
+		}
+		if (nvecs != NULL)
+			*nvecs = frame;
+	} else {
+		for (mp = *mpp; mp != NULL; mp = bmp) {
+			bmp = mp->b_next;
+			freemsg(mp);
+		}
+		if (nvecs != NULL)
+			*nvecs = 0;
+		*mpp = NULL;
+	}
+	return (error);
+}
+
+size_t
+frameio_frame_length(frameio_t *fio, framevec_t *fv)
+{
+	int i;
+	size_t len = 0;
+
+	for (i = 0; i < fio->fio_nvpf; i++, fv++)
+		len += fv->fv_buflen;
+
+	return (len);
+}
+
+/*
+ * Write a portion of an mblk to the current frame vector.
+ */
+static int
+framevec_write_mblk_part(framevec_t *fv, mblk_t *mp, size_t len, size_t moff,
+    size_t foff, int cpf)
+{
+	ASSERT(len <= MBLKL(mp) - moff);
+	ASSERT(len <= fv->fv_buflen - fv->fv_actlen);
+	cpf = cpf != 0 ? FKIOCTL : 0;
+
+	if (ddi_copyout(mp->b_rptr + moff, fv->fv_buf + foff, len, cpf) != 0)
+		return (EFAULT);
+	fv->fv_actlen += len;
+
+	return (0);
+}
+
+/*
+ * Because copying this out to the user might fail, we don't want to update
+ * the b_rptr in case we need to copy it out again.
+ */
+static int
+framevec_map_blk(frameio_t *fio, framevec_t *fv, mblk_t *mp, int cpf)
+{
+	int err;
+	size_t msize, blksize, len, moff, foff;
+
+	msize = msgsize(mp);
+	if (msize > frameio_frame_length(fio, fv))
+		return (EOVERFLOW);
+
+	moff = 0;
+	foff = 0;
+	blksize = MBLKL(mp);
+	fv->fv_actlen = 0;
+	while (msize != 0) {
+		len = MIN(blksize, fv->fv_buflen - fv->fv_actlen);
+		err = framevec_write_mblk_part(fv, mp, len, moff, foff, cpf);
+		if (err != 0)
+			return (err);
+
+		msize -= len;
+		blksize -= len;
+		moff += len;
+		foff += len;
+
+		if (blksize == 0 && msize != 0) {
+			mp = mp->b_cont;
+			ASSERT(mp != NULL);
+			moff = 0;
+			blksize = MBLKL(mp);
+		}
+
+		if (fv->fv_buflen == fv->fv_actlen && msize != 0) {
+			fv++;
+			fv->fv_actlen = 0;
+			foff = 0;
+		}
+	}
+
+	return (0);
+}
+
+int
+frameio_mblk_chain_write(frameio_t *fio, frameio_write_mblk_map_t map,
+    mblk_t *mp, int *nwrite, int cpf)
+{
+	int mcount = 0;
+	int ret = 0;
+
+	if (map != MAP_BLK_FRAME)
+		return (EINVAL);
+
+	while (mp != NULL && mcount < fio->fio_nvecs) {
+		ret = framevec_map_blk(fio, &fio->fio_vecs[mcount], mp, cpf);
+		if (ret != 0)
+			break;
+		mcount += fio->fio_nvpf;
+		mp = mp->b_next;
+	}
+
+	if (ret != 0 && mcount == 0) {
+		if (nwrite != NULL)
+			*nwrite = 0;
+		return (ret);
+	}
+
+	if (nwrite != NULL)
+		*nwrite = mcount / fio->fio_nvpf;
+
+	return (0);
+}
+
+/*
+ * Copy out nframes worth of frameio header data back to userland.
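+ *
+ * As a rough illustration of how the pieces in this file fit together, a
+ * hypothetical read-style ioctl handler (the names uaddr and mp and the
+ * error handling are illustrative only, not part of this interface) might
+ * pair this with the copyin side as follows:
+ *
+ *	frameio_t *fio = frameio_alloc(KM_SLEEP);
+ *	int nframes;
+ *
+ *	if (frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, uaddr, mode) == 0 &&
+ *	    frameio_mblk_chain_write(fio, MAP_BLK_FRAME, mp, &nframes,
+ *	    mode & FKIOCTL) == 0)
+ *		(void) frameio_hdr_copyout(fio, nframes, uaddr, mode);
+ *	frameio_free(fio);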
+ */
+int
+frameio_hdr_copyout(frameio_t *fio, int nframes, void *addr, uint_t mode)
+{
+	int i;
+	int model = ddi_model_convert_from(mode & FMODELS);
+	framevec32_t *vec32p;
+	framevec32_t f;
+
+	if (fio->fio_nvecs / fio->fio_nvpf < nframes)
+		return (EINVAL);
+
+	fio->fio_nvecs = nframes * fio->fio_nvpf;
+
+	if (model == DDI_MODEL_NONE) {
+		if (ddi_copyout(fio, addr,
+		    sizeof (frameio_t) + fio->fio_nvecs * sizeof (framevec_t),
+		    mode & FKIOCTL) != 0)
+			return (EFAULT);
+		return (0);
+	}
+
+	ASSERT(model == DDI_MODEL_ILP32);
+
+	vec32p = (framevec32_t *)&fio->fio_vecs[0];
+	for (i = 0; i < fio->fio_nvecs; i++) {
+		f.fv_buf = (caddr32_t)(uintptr_t)fio->fio_vecs[i].fv_buf;
+		if (fio->fio_vecs[i].fv_buflen > UINT_MAX ||
+		    fio->fio_vecs[i].fv_actlen > UINT_MAX)
+			return (EOVERFLOW);
+		f.fv_buflen = fio->fio_vecs[i].fv_buflen;
+		f.fv_actlen = fio->fio_vecs[i].fv_actlen;
+		vec32p[i].fv_buf = f.fv_buf;
+		vec32p[i].fv_buflen = f.fv_buflen;
+		vec32p[i].fv_actlen = f.fv_actlen;
+	}
+
+	if (ddi_copyout(fio, addr,
+	    sizeof (frameio32_t) + fio->fio_nvecs * sizeof (framevec32_t),
+	    mode & FKIOCTL) != 0)
+		return (EFAULT);
+	return (0);
+}
+
+void
+frameio_mark_consumed(frameio_t *fio, int nframes)
+{
+	int i;
+
+	ASSERT(fio->fio_nvecs / fio->fio_nvpf >= nframes);
+	for (i = 0; i < nframes * fio->fio_nvpf; i++)
+		fio->fio_vecs[i].fv_actlen = fio->fio_vecs[i].fv_buflen;
+}
diff --git a/usr/src/uts/common/io/vnd/vnd.c b/usr/src/uts/common/io/vnd/vnd.c
new file mode 100644
index 0000000000..d62c80c1c6
--- /dev/null
+++ b/usr/src/uts/common/io/vnd/vnd.c
@@ -0,0 +1,5584 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * vnd - virtual (machine) networking datapath
+ *
+ * vnd's purpose is to provide a highly performant data path for Layer 2
+ * network traffic and to exist side by side with an active IP netstack,
+ * each servicing different datalinks. vnd provides many of the same
+ * capabilities as the current TCP/IP stack does, and some specific to
+ * layer two. Specifically:
+ *
+ *	o Use of the DLD fastpath
+ *	o Packet capture hooks
+ *	o Ability to use hardware capabilities
+ *	o Useful interfaces for handling multiple frames
+ *
+ * The following image shows where vnd fits into today's networking stack:
+ *
+ *             +---------+----------+----------+
+ *             | libdlpi |  libvnd  | libsocket|
+ *             +---------+----------+----------+
+ *             |         ·          · VFS      |
+ *             |   VFS   ·    VFS   +----------+
+ *             |         ·          |  sockfs  |
+ *             +---------+----------+----------+
+ *             |         |    VND   |    IP    |
+ *             |         +----------+----------+
+ *             |            DLD/DLS            |
+ *             +-------------------------------+
+ *             |              MAC              |
+ *             +-------------------------------+
+ *             |             GLDv3             |
+ *             +-------------------------------+
+ *
+ * -----------------------------------------
+ * A Tale of Two Devices - DDI Device Basics
+ * -----------------------------------------
+ *
+ * vnd presents itself to userland as a character device; however, it is also
+ * a STREAMS device so that it can interface with dld and the rest of the
+ * networking stack. Users never interface with the STREAMS devices directly
+ * and they are purely an implementation detail of vnd.
+ * Opening the STREAMS device requires kcred and, as such, userland cannot
+ * interact with it or push it onto the stream head.
+ *
+ * The main vnd character device, /dev/vnd/ctl, is a self-cloning device.
+ * Every clone gets its own minor number; however, minor nodes are not
+ * created in the devices tree for these instances. In this state a user may
+ * do two different things. They may issue ioctls that affect global state,
+ * or they may issue ioctls that try to attach it to a given datalink. Once a
+ * minor device has been attached to a datalink, all operations on it are
+ * scoped to that context; subsequent global operations are therefore not
+ * permitted.
+ *
+ * A given device can be linked into the /devices and /dev name space via a
+ * link ioctl. That ioctl causes a minor node to be created in /devices, and
+ * it will then also appear under /dev/vnd/ due to vnd's sdev plugin. This is
+ * similar to, but simpler than, IP's persistence mechanism.
+ *
+ * ---------------------
+ * Binding to a datalink
+ * ---------------------
+ *
+ * Datalinks are backed by the dld (datalink device) and dls (datalink
+ * services) drivers. These drivers provide a STREAMS device for datalinks on
+ * the system which are exposed through /dev/net. Userland generally
+ * manipulates datalinks through libdlpi. When an IP interface is being
+ * plumbed up, what actually happens is that someone does a dlpi_open(3DLPI)
+ * of the underlying datalink and then pushes on the ip STREAMS module with
+ * an I_PUSH ioctl. Modules may then negotiate with dld and dls to obtain
+ * access to various capabilities and fast paths via a series of STREAMS
+ * messages.
+ *
+ * In vnd, we do the same thing, but we leave our STREAMS module as an
+ * implementation detail of the system. We don't want users to be able to
+ * arbitrarily push the vnd STREAMS module onto any stream, so we explicitly
+ * require kcred to manipulate it. Thus, when a user issues a request to
+ * attach a datalink to a minor instance of the character device, that vnd
+ * minor instance itself does a layered open (ldi_open_by_name(9F)) of the
+ * specified datalink. vnd does that open using the passed-in credentials
+ * from the ioctl, not kcred. This ensures that users who don't have
+ * permission to open the device cannot do so. Once that's been opened, we
+ * push on the vnd STREAMS module.
+ *
+ * Once the vnd STREAMS instance has been created for this device, i.e. the
+ * I_PUSH ioctl returns, we explicitly send a STREAMS ioctl
+ * (VND_STRIOC_ASSOCIATE) to associate the vnd STREAMS and character devices.
+ * This association begins the STREAM device's initialization. We start up an
+ * asynchronous state machine that takes care of all the different aspects of
+ * plumbing up the device with dld and dls and enabling the MAC fast path. We
+ * need to guarantee to consumers of the character device that by the time
+ * their ioctl returns, the data path has been fully initialized.
+ *
+ * The state progression is fairly linear. There are two general steady
+ * states. The first is VNS_S_ONLINE, which means that everything is jacked
+ * up and good to go. The alternative is VNS_S_ZOMBIE, which means that the
+ * STREAMS device encountered an error or we have finished tearing it down
+ * and the character device can clean it up. The following is our state
+ * progression and the meaning of each state:
+ *
+ *                  |
+ *                  |
+ *                  V
+ *          +---------------+
+ *          | VNS_S_INITIAL |        This is our initial state. Every
+ *          +---------------+        vnd STREAMS device starts here.
+ * | While in this state, only dlpi + * | M_PROTO and M_IOCTL messages can be + * | sent or received. All STREAMS based + * | data messages are dropped. + * | We transition out of this state by + * | sending a DL_INFO_REQ to obtain + * | information about the underlying + * | link. + * v + * +-----------------+ + * +--<-| VNS_S_INFO_SENT | In this state, we verify and + * | +-----------------+ record information about the + * | | underlying device. If the device is + * | | not suitable, eg. not of type + * v | DL_ETHER, then we immediately + * | | become a ZOMBIE. To leave this + * | | state we request exclusive active + * | | access to the device via + * v | DL_EXCLUSIVE_REQ. + * | v + * | +----------------------+ + * +--<-| VNS_S_EXCLUSIVE_SENT | In this state, we verify whether + * | +----------------------+ or not we were able to obtain + * | | | exclusive access to the device. If + * | | | we were not able to, then we leave, + * v | | as that means that something like + * | | | IP is already plumbed up on top of + * | | | the datalink. We leave this state + * | | | by progressing through to the + * | | | appropriate DLPI primitive, either + * v | | DLPI_ATTACH_REQ or DLPI_BIND_REQ + * | | | depending on the style of the + * | | | datalink. + * | | v + * | | +-------------------+ + * +------ |--<-| VNS_S_ATTACH_SENT | In this state, we verify we were + * | | +-------------------+ able to perform a standard DLPI + * | | | attach and if so, go ahead and + * v | | send a DLPI_BIND_REQ. + * | v v + * | +-------------------+ + * +--<-| VNS_S_BIND_SENT | In this state we see the result of + * | +-------------------+ our attempt to bind to PPA 0 of the + * v | underlying device. Because we're + * | | trying to be a layer two datapath, + * | | the specific attachment point isn't + * | | too important as we're going to + * v | have to enable promiscuous mode. We + * | | transition out of this by sending + * | | our first of three promiscuous mode + * | | requests. + * v v + * | +------------------------+ + * +--<-| VNS_S_SAP_PROMISC_SENT | In this state we verify that we + * | +------------------------+ were able to enable promiscuous + * | | mode at the physical level. We + * | | transition out of this by enabling + * | | multicast and broadcast promiscuous + * v | mode. + * | v + * | +--------------------------+ + * +--<-| VNS_S_MULTI_PROMISC_SENT | In this state we verify that we + * | +--------------------------+ have enabled DL_PROMISC_MULTI and + * v | move onto the final promiscuous + * | | mode request. + * | v + * | +----------------------------+ + * +--<-| VNS_S_RX_ONLY_PROMISC_SENT | In this state we verify that we + * | +----------------------------+ enabled RX_ONLY promiscuous mode. + * | | We specifically do this as we don't + * v | want to receive our own traffic + * | | that we'll send out. We leave this + * | | state by requesting the set of + * | | dld/dls capabilities that we can + * v | process. + * | | + * | v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_Q_SENT | We loop over the set of + * | +--------------------+ capabilities that dld advertised + * | | and enable the ones that currently + * v | support for use. See the section + * | | later on regarding capabilities + * | | for more information. We leave this + * | | state by sending an enable request. + * v v + * | +--------------------+ + * +--<-| VNS_S_CAPAB_E_SENT | Here we finish all capability + * | +--------------------+ initialization. Once finished, we + * | | transition to the next state. 
If + * v | the dld fast path is not available, + * | | we become a zombie. + * | v + * | +--------------+ + * +--<-| VNS_S_ONLINE | This is a vnd STREAMS device's + * | +--------------+ steady state. It will normally + * | | reside in this state while it is in + * | | active use. It will only transition + * v | to the next state when the STREAMS + * | | device is closed by the character + * | | device. In this state, all data + * | | flows over the dld fast path. + * | v + * | +---------------------+ + * +--<-| VNS_S_SHUTTING_DOWN | This vnd state takes care of + * | +---------------------+ disabling capabilities and then + * | | transitions to zombie state to + * v | indicate that it is finished. + * | v + * | +--------------+ + * +--->| VNS_S_ZOMBIE | In this state, the vnd STREAMS + * +--------------+ device is waiting to finished being + * reaped. + * + * If the stream association fails for any reason the state machine reaches + * VNS_S_ZOMBIE. A more detailed vnd_errno_t will propagate back through the + * STREAMS ioctl to the character device. That will fail the user ioctl and + * propagate the vnd_errno_t back to userland. If, on the other hand, the + * association succeeds, then the vnd STREAMS device will be fully plumbed up + * and ready to transmit and receive message blocks. Consumers will be able to + * start using the other cbops(9E) entry points once the attach has fully + * finished, which will occur after the original user attach ioctl to the + * character device returns. + * + * -------------------- + * General Architecture + * -------------------- + * + * There are several different devices and structures in the vnd driver. There + * is a per-netstack component, pieces related to the character device that + * consumers see, the internal STREAMS device state, and the data queues + * themselves. The following ASCII art picture describes their relationships and + * some of the major pieces of data that contain them. These are not exhaustive, + * eg. synchronization primitives are left out. + * + * +----------------+ +-----------------+ + * | global | | global | + * | device list | | netstack list | + * | vnd_dev_list | | vnd_nsd_list | + * +----------------+ +-----------------+ + * | | + * | v + * | +-------------------+ +-------------------+ + * | | per-netstack data | ---> | per-netstack data | --> ... + * | | vnd_pnsd_t | | vnd_pnsd_t | + * | | | +-------------------+ + * | | | + * | | nestackid_t ---+----> Netstack ID + * | | vnd_pnsd_flags_t -+----> Status flags + * | | zoneid_t ---+----> Zone ID for this netstack + * | | hook_family_t ---+----> VND IPv4 Hooks + * | | hook_family_t ---+----> VND IPv6 Hooks + * | | list_t ----+ | + * | +------------+------+ + * | | + * | v + * | +------------------+ +------------------+ + * | | character device | ---> | character device | -> ... + * +---------->| vnd_dev_t | | vnd_dev_t | + * | | +------------------+ + * | | + * | minor_t ---+--> device minor number + * | ldi_handle_t ---+--> handle to /dev/net/%datalink + * | vnd_dev_flags_t -+--> device flags, non blocking, etc. 
+ * | char[] ---+--> name if linked + * | vnd_str_t * -+ | + * +--------------+---+ + * | + * v + * +-------------------------+ + * | STREAMS device | + * | vnd_str_t | + * | | + * | vnd_str_state_t ---+---> State machine state + * | gsqueue_t * ---+---> mblk_t Serialization queue + * | vnd_str_stat_t ---+---> per-device kstats + * | vnd_str_capab_t ---+----------------------------+ + * | vnd_data_queue_t ---+ | | + * | vnd_data_queue_t -+ | | v + * +-------------------+-+---+ +---------------------+ + * | | | Stream capabilities | + * | | | vnd_str_capab_t | + * | | | | + * | | supported caps <--+-- vnd_capab_flags_t | + * | | dld cap handle <--+-- void * | + * | | direct tx func <--+-- vnd_dld_tx_t | + * | | +---------------------+ + * | | + * +----------------+ +-------------+ + * | | + * v v + * +-------------------+ +-------------------+ + * | Read data queue | | Write data queue | + * | vnd_data_queue_t | | vnd_data_queue_t | + * | | | | + * | size_t ----+--> Current size | size_t ----+--> Current size + * | size_t ----+--> Max size | size_t ----+--> Max size + * | mblk_t * ----+--> Queue head | mblk_t * ----+--> Queue head + * | mblk_t * ----+--> Queue tail | mblk_t * ----+--> Queue tail + * +-------------------+ +-------------------+ + * + * + * Globally, we maintain two lists. One list contains all of the character + * device soft states. The other maintains a list of all our netstack soft + * states. Each netstack maintains a list of active devices that have been + * associated with a datalink in its netstack. + * + * Recall that a given minor instance of the character device exists in one of + * two modes. It can either be a cloned open of /dev/vnd/ctl, the control node, + * or it can be associated with a given datalink. When minor instances are in + * the former state, they do not exist in a given vnd_pnsd_t's list of devices. + * As part of attaching to a datalink, the given vnd_dev_t will be inserted into + * the appropriate vnd_pnsd_t. In addition, this will cause a STREAMS device, a + * vnd_str_t, to be created and associated to a vnd_dev_t. + * + * The character device, and its vnd_dev_t, is the interface to the rest of the + * system. The vnd_dev_t keeps track of various aspects like whether various + * operations, such as read, write and the frameio ioctls, are considered + * blocking or non-blocking in the O_NONBLOCK sense. It also is responsible for + * keeping track of things like the name of the device, if any, in /dev. The + * vnd_str_t, on the other hand manages aspects like buffer sizes and the actual + * data queues. However, ioctls that manipulate these properties all go through + * the vnd_dev_t to its associated vnd_str_t. + * + * Each of the STREAMS devices, the vnd_str_t, maintains two data queues. One + * for frames to transmit (write queue) and one for frames received (read + * queue). These data queues have a maximum size and attempting to add data + * beyond that maximum size will result in data being dropped. The sizes are + * configurable via ioctls VND_IOC_SETTXBUF, VND_IOC_SETRXBUF. Data either sits + * in those buffers or has a reservation in those buffers while they are in vnd + * and waiting to be consumed by the user or by mac. + * + * Finally, the vnd_str_t also has a vnd_str_capab_t which we use to manage the + * available, negotiated, and currently active features. 
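+ *
+ * As a rough sketch of the reservation discipline described above (the
+ * member and function names here are illustrative, not necessarily the ones
+ * vnd uses), taking a reservation against a data queue amounts to:
+ *
+ *	boolean_t
+ *	dq_reserve(vnd_data_queue_t *vqp, size_t size)
+ *	{
+ *		mutex_enter(&vqp->vdq_lock);
+ *		if (vqp->vdq_cur + size > vqp->vdq_max) {
+ *			mutex_exit(&vqp->vdq_lock);
+ *			return (B_FALSE);	/* caller blocks or EAGAINs */
+ *		}
+ *		vqp->vdq_cur += size;
+ *		mutex_exit(&vqp->vdq_lock);
+ *		return (B_TRUE);
+ *	}
+ *
+ * with the matching release subtracting the size back out once the data has
+ * been consumed by the user or by mac.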
+ * + * ---------------------- + * Data Path and gsqueues + * ---------------------- + * + * There's a lot of plumbing in vnd to get to the point where we can send data, + * but vnd's bread and butter is the data path, so it's worth diving into it in + * more detail. Data enters and exits the system from two ends. + * + * The first end is the vnd consumer. This comes in the form of read and write + * system calls as well as the frame I/O ioctls. The read and write system calls + * operate on a single frame at a time. Think of a frame as a single message + * that has come in off the wire, which may itself comprise multiple mblk_t's + * linked together in the kernel. readv(2) and writev(2) have the same + * limitations as read(2) and write(2). We enforce this as the system is + * required to fill up every uio(9S) buffer before moving onto the next one. + * This means that if you have a MTU sized buffer and two frames come in which + * are less than half of the MTU they must fill up the given iovec. Even if we + * didn't want to do this, we have no way of informing the supplier of the + * iovecs that they were only partially filled or where one frame ends and + * another begins. That's life, as such we have frame I/O which solves this + * problem. It allows for multiple frames to be consumed as well as for frames + * to be broken down into multiple vector components. + * + * The second end is the mac direct calls. As part of negotiating capabilities + * via dld, we give mac a function of ours to call when packets are received + * [vnd_mac_input()] and a callback to indicate that flow has been restored + * [vnd_mac_flow_control()]. In turn, we also get a function pointer that we can + * transmit data with. As part of the contract with mac, mac is allowed to flow + * control us by returning a cookie to the transmit function. When that happens, + * all outbound traffic is halted until our callback function is called and we + * can schedule drains. + * + * It's worth looking at these in further detail. We'll start with the rx path. + * + * + * | + * * . . . packets from gld + * | + * v + * +-------------+ + * | mac | + * +-------------+ + * | + * v + * +-------------+ + * | dld | + * +-------------+ + * | + * * . . . dld direct callback + * | + * v + * +---------------+ + * | vnd_mac_input | + * +---------------+ + * | + * v + * +---------+ +-------------+ + * | dropped |<--*---------| vnd_hooks | + * | by | . +-------------+ + * | hooks | . drop probe | + * +---------+ kstat bump * . . . Do we have free + * | buffer space? + * | + * no . | . yes + * . + . + * +---*--+------*-------+ + * | | + * * . . drop probe * . . recv probe + * | kstat bump | kstat bump + * v | + * +---------+ * . . fire pollin + * | freemsg | v + * +---------+ +-----------------------+ + * | vnd_str_t`vns_dq_read | + * +-----------------------+ + * ^ ^ + * +----------+ | | +---------+ + * | read(9E) |-->-+ +--<--| frameio | + * +----------+ +---------+ + * + * The rx path is rather linear. Packets come into us from mac. We always run + * them through the various hooks, and if they come out of that, we inspect the + * read data queue. If there is not enough space for a packet, we drop it. + * Otherwise, we append it to the data queue, and fire read notifications + * targetting anyone polling or doing blocking I/O on this device. Those + * consumers then drain the head of the data queue. + * + * The tx path is more complicated due to mac flow control. 
After any call into + * mac, we may have to potentially suspend writes and buffer data for an + * arbitrary amount of time. As such, we need to carefully track the total + * amount of outstanding data so that we don't waste kernel memory. This is + * further complicated by the fact that mac will asynchronously tell us when our + * flow has been resumed. + * + * For data to be able to enter the system, it needs to be able to take a + * reservation from the write data queue. Once the reservation has been + * obtained, we enter the gsqueue so that we can actually append it. We use + * gsqueues (serialization queues) to ensure that packets are manipulated in + * order as we deal with the draining and appending packets. We also leverage + * its worker thread to help us do draining after mac has restorted our flow. + * + * The following image describes the flow: + * + * +-----------+ +--------------+ +-------------------------+ +------+ + * | write(9E) |-->| Space in the |--*--->| gsqueue_enter_one() |-->| Done | + * | frameio | | write queue? | . | +->vnd_squeue_tx_append | +------+ + * +-----------+ +--------------+ . +-------------------------+ + * | ^ . + * | | . reserve space from gsqueue + * | | | + * queue . . . * | space v + * full | * . . . avail +------------------------+ + * v | | vnd_squeue_tx_append() | + * +--------+ +------------+ +------------------------+ + * | EAGAIN |<--*------| Non-block? |<-+ | + * +--------+ . +------------+ | v + * . yes v | wait +--------------+ + * no . .* * . . for | append chain | + * +----+ space | to outgoing | + * | mblk chain | + * from gsqueue +--------------+ + * | | + * | +-------------------------------------------------+ + * | | + * | | yes . . . + * v v . + * +-----------------------+ +--------------+ . +------+ + * | vnd_squeue_tx_drain() |--->| mac blocked? |----*---->| Done | + * +-----------------------+ +--------------+ +------+ + * | | + * +---------------------------------|---------------------+ + * | | tx | + * | no . . * queue . . * + * | flow controlled . | empty * . fire pollout + * | . v | if mblk_t's + * +-------------+ . +---------------------+ | sent + * | set blocked |<----*------| vnd_squeue_tx_one() |--------^-------+ + * | flags | +---------------------+ | + * +-------------+ More data | | | More data | + * and limit ^ v * . . and limit ^ + * not reached . . * | | reached | + * +----+ | | + * v | + * +----------+ +-------------+ +---------------------------+ + * | mac flow |--------->| remove mac |--->| gsqueue_enter_one() with | + * | control | | block flags | | vnd_squeue_tx_drain() and | + * | callback | +-------------+ | GSQUEUE_FILL flag, iff | + * +----------+ | not already scheduled | + * +---------------------------+ + * + * The final path taken for a given write(9E)/frameio ioctl depends on whether + * or not the vnd_dev_t is non-blocking. That controls the initial path of + * trying to take a reservation in write data queue. If the device is in + * non-blocking mode, we'll return EAGAIN when there is not enough space + * available, otherwise, the calling thread blocks on the data queue. + * + * Today when we call into vnd_squeue_tx_drain() we will not try to drain the + * entire queue, as that could be quite large and we don't want to necessarily + * keep the thread that's doing the drain until it's been finished. Not only + * could more data be coming in, but the draining thread could be a userland + * thread that has more work to do. We have two limits today. 
There is an upper
+ * bound on the total amount of data and the total number of mblk_t chains. If
+ * we hit either limit, then we will schedule another drain in the gsqueue and
+ * go from there.
+ *
+ * It's worth taking some time to describe how we interact with gsqueues. vnd
+ * has a gsqueue_set_t for itself. It's important that it has its own set, as
+ * the profile of work that vnd does is different from other sub-systems in the
+ * kernel. When we open a STREAMS device in vnd_s_open, we get a random gsqueue.
+ * Unlike TCP/IP, which uses a gsqueue per TCP connection, we end up
+ * maintaining one for a given device. Because of that, we want to use a
+ * pseudo-random one to try and spread out the load, and picking one at random
+ * is likely to be just as good as any fancy algorithm we might come up with,
+ * especially as any two devices could have radically different transmit
+ * profiles.
+ *
+ * While some of the write path may seem complicated, it does allow us to
+ * maintain an important property. Once we have acknowledged a write(9E) or
+ * frameio ioctl, we will not drop the packet, excepting something like ipf via
+ * the firewall hooks.
+ *
+ * There is one other source of flow control that can exist in the system which
+ * is in the form of a barrier. The barrier is an internal mechanism used for
+ * ensuring that a gsqueue is drained for a given device. We use this as part
+ * of tearing down. Specifically, we disable the write path so nothing new can
+ * be inserted into the gsqueue and then insert a barrier block. Once the
+ * barrier block comes out of the gsqueue, then we know that nothing else
+ * remaining in the gsqueue can refer to the vnd_str_t being destroyed.
+ *
+ * ---------------------
+ * vnd, zones, netstacks
+ * ---------------------
+ *
+ * vnd devices are scoped to datalinks and datalinks are scoped to a netstack.
+ * Because of that, vnd is also a netstack module. It registers with the
+ * netstack sub-system and receives callbacks every time a netstack is
+ * created, shut down, and destroyed. The netstack callbacks drive the
+ * creation and destruction of the vnd_pnsd_t structures.
+ *
+ * Recall from the earlier architecture diagrams that every vnd device is scoped
+ * to a netstack and known about by a given vnd_pnsd_t. When that netstack is
+ * torn down, we also tear down any vnd devices that are hanging around. When
+ * the netstack is torn down, we know that any zones that are scoped to that
+ * netstack are being shut down and have no processes remaining. This is going
+ * to be the case whether they are shared or exclusive stack zones. We have to
+ * perform a careful dance.
+ *
+ * There are two different callbacks that happen on tear down: the first is a
+ * shutdown callback, the second a destroy callback. When the shutdown
+ * callback is fired we need to prepare for the netstack to go away and ensure
+ * that nothing can continue to persist itself.
+ *
+ * More specifically, when we get notice of a stack being shut down, we first
+ * remove the netstack from the global netstack list to ensure that no one new
+ * can come in and find the netstack and get a reference to it. After that, we
+ * notify the neti hooks that they're going away. Once that's all done, we get
+ * to the heart of the matter.
+ *
+ * When shutting down there could be any number of outstanding contexts that
+ * have a reference on the vnd_pnsd_t and on the individual links. However, we
+ * know that no one new will be able to find the vnd_pnsd_t.
+ * To account for things that have existing references, we mark the
+ * vnd_pnsd_t`vpnd_flags with VND_NS_CONDEMNED. This is checked by code paths
+ * that wish to append a device to the netstack's list. If this is set, then
+ * they must not append to it. Once this is set, we know that the netstack's
+ * list of devices can never grow, only shrink.
+ *
+ * Next, for each device we tag it with VND_D_ZONE_DYING. This indicates that
+ * the container for the device is being destroyed and that we should not allow
+ * additional references to the device to be created, whether via open or
+ * linking. The presence of this bit also tells consumers like the list ioctl
+ * and sdev to ignore the device's existence. Once this has been set, we know
+ * that no one else should be able to obtain a new reference to the device.
+ *
+ * Once that has been set for all devices, we go through and remove any existing
+ * links that have been established in sdev. Because doing that may drop the
+ * final reference to the device, which in turn still holds a reference to the
+ * netstack, we have to restart our walk whenever locks are dropped. We know
+ * that this walk will eventually complete because the device cannot be
+ * relinked and no new devices will be attached in this netstack due to
+ * VND_NS_CONDEMNED. Once that's finished, the shutdown callback returns.
+ *
+ * When we reach the destroy callback, we simply wait for references on the
+ * netstack to disappear. Because the zone has been shut down, all processes in
+ * it that have open references have been terminated and reaped. Any threads
+ * that are newly trying to reference it will fail. However, there is one thing
+ * that can halt this that we have no control over, which is the global zone
+ * holding open a reference to the device. In this case the zone halt will hang
+ * in vnd_stack_destroy. Once the last reference is dropped, we finish
+ * destroying the netinfo hooks and free the vnd_pnsd_t.
+ *
+ * ----
+ * sdev
+ * ----
+ *
+ * vnd registers an sdev plugin which allows it to dynamically fill out
+ * /dev/vnd for both the global and non-global zones. In any given zone we
+ * always supply a control node via /dev/vnd/ctl. This is the self-cloning
+ * node. Each zone will also have an entry per-link in that zone under
+ * /dev/vnd/%datalink, eg. if a link was named net0, there would be a
+ * /dev/vnd/net0. The global zone can also see every link for every zone, a la
+ * /dev/net, under /dev/vnd/%zonename/%datalink, eg. if a zone named 'turin'
+ * had a vnd device named net0, the global zone would have /dev/vnd/turin/net0.
+ *
+ * The sdev plugin has three interfaces that it supplies back to sdev. One is to
+ * validate that a given node is still valid. The next is a callback from sdev
+ * to say that it is no longer using the node. The third and final one is from
+ * sdev where it asks us to fill a directory. All of the heavy lifting is done
+ * in directory filling and in validation. We opt not to maintain a reference
+ * on the device while there is an sdev node present. This makes the removal of
+ * nodes much simpler and most of the possible failure modes shouldn't cause any
+ * real problems. For example, the open path has to handle both dev_t's which no
+ * longer exist and which are no longer linked.
+ *
+ * -----
+ * hooks
+ * -----
+ *
+ * Like IP, vnd sends all L3 packets through its firewall hooks. Currently
+ * vnd provides these for L3 IP and IPv6 traffic.
+ * Each netstack provides these hooks in a minimal fashion. While we will
+ * allow traffic to be filtered through the hooks, we do not provide means
+ * for packet injection or additional inspection at this time. There are a
+ * total of four different events created:
+ *
+ *   o IPv4 physical in
+ *   o IPv4 physical out
+ *   o IPv6 physical in
+ *   o IPv6 physical out
+ *
+ * ---------------
+ * Synchronization
+ * ---------------
+ *
+ * To make our synchronization simpler, we've put more effort into making the
+ * metadata/setup paths do more work. That work allows the data paths to make
+ * assumptions around synchronization that simplify the general case. Each
+ * major structure (the vnd_pnsd_t, vnd_dev_t, vnd_str_t, and vnd_data_queue_t)
+ * is annotated with the protection that its members receive. The following
+ * annotations are used:
+ *
+ *	A	Atomics; these values are only modified using atomic
+ *		operations. Currently this only applies to kstat values.
+ *	E	Existence; no lock is needed to access this member, it does
+ *		not change while the structure is valid.
+ *	GL	Global Lock; these members are protected by the global
+ *		vnd_dev_lock.
+ *	L	Locked; access to the member is controlled by a lock that is
+ *		in the structure.
+ *	NSL	netstack lock; this member is protected by the containing
+ *		netstack. This only applies to the vnd_dev_t`vdd_nslink.
+ *	X	This member is special, and is discussed in this section.
+ *
+ * In addition to locking, we also have reference counts on the vnd_dev_t and
+ * the vnd_pnsd_t. The reference counts describe the lifetimes of the structure.
+ * With rare exception, once a reference count is decremented, the consumer
+ * should not assume that the data is valid any more. The only exception to this
+ * is the case where we're removing an extant reference count from a link into
+ * /devices or /dev. Reference counts are obtained on these structures as a part
+ * of looking them up.
+ *
+ * # Global Lock Ordering
+ * ######################
+ *
+ * The following is the order that you must take locks in vnd:
+ *
+ *	1) vnd`vnd_dev_lock
+ *	2) vnd_pnsd_t`vpnd_lock
+ *	3) vnd_dev_t`vdd_lock
+ *	4) vnd_str_t`vns_lock
+ *	5) vnd_data_queue_t`vdq_lock
+ *
+ * One must adhere to the following rules:
+ *
+ * o You must acquire a lower numbered lock before a higher numbered lock.
+ * o It is NOT legal to hold two locks of the same level concurrently, eg.
+ *   you cannot hold two different vnd_dev_t's vdd_lock at the same time.
+ * o You may release locks in any order.
+ * o If you release a lock, you must honor the locking rules before
+ *   acquiring it again.
+ * o You should not hold any locks when calling any of the rele functions.
+ *
+ * # Special Considerations
+ * ########################
+ *
+ * While most of the locking is what's expected, it's worth going into the
+ * special nature that a few members hold. Today, only two structures have
+ * special considerations: the vnd_dev_t and the vnd_str_t. All members with
+ * special considerations have an additional annotation that describes how you
+ * should interact with it.
+ *
+ * vnd_dev_t: The vdd_nsd and vdd_cr are only valid when the minor node is
+ * attached or in the process of attaching. If a code path requires an
+ * attached vnd_dev_t, eg. the data path and tear down path, then it is
+ * always legal to dereference those members without a lock held.
+ * When they are added to the system, that should be done under the vdd_lock
+ * as part of setting the VND_D_ATTACH_INFLIGHT flag. These should not change
+ * during the lifetime of the vnd_dev_t.
+ *
+ * vnd_dev_t: The vdd_ldih is similar to the vdd_nsd and vdd_cr, except that it
+ * always exists as it is a part of the structure. The only time that it's valid
+ * to be using it is during the attach path with the VND_D_ATTACH_INFLIGHT flag
+ * set or during tear down. Outside of those paths, which are naturally
+ * serialized, there is no explicit locking around the member.
+ *
+ * vnd_str_t: The vns_dev and vns_nsd work in similar ways. They are not
+ * initially set as part of creating the structure, but are set as part of
+ * responding to the association ioctl. Anything in the data path or metadata
+ * path that requires association may assume that they exist, as we do not kick
+ * off the state machine until they're set.
+ *
+ * vnd_str_t: The vns_drainblk and vns_barrierblk are similarly special. The
+ * members are designed to be used as part of various operations with the
+ * gsqueues. A lock isn't needed to use them, but to work with them, the
+ * appropriate flag in the vnd_str_t`vns_flags must have been set by the current
+ * thread. Otherwise, it is always fair game to refer to their addresses. Their
+ * contents are ignored by vnd, though some of their members are manipulated by
+ * the gsqueue subsystem.
+ */
+
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/modctl.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/ddi.h>
+#include <sys/ethernet.h>
+#include <sys/stropts.h>
+#include <sys/sunddi.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/ksynch.h>
+#include <sys/taskq_impl.h>
+#include <sys/sdt.h>
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/dlpi.h>
+#include <sys/cred.h>
+#include <sys/id_space.h>
+#include <sys/list.h>
+#include <sys/ctype.h>
+#include <sys/policy.h>
+#include <sys/sunldi.h>
+#include <sys/strsubr.h>
+#include <sys/poll.h>
+#include <sys/neti.h>
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+#include <sys/vlan.h>
+#include <sys/dld.h>
+#include <sys/mac_client.h>
+#include <sys/netstack.h>
+#include <sys/fs/sdev_plugin.h>
+#include <sys/kstat.h>
+#include <sys/atomic.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/gsqueue.h>
+
+#include <inet/ip.h>
+#include <inet/ip6.h>
+
+#include <sys/vnd.h>
+
+/*
+ * Globals
+ */
+static dev_info_t *vnd_dip;
+static taskq_t *vnd_taskq;
+static kmem_cache_t *vnd_str_cache;
+static kmem_cache_t *vnd_dev_cache;
+static kmem_cache_t *vnd_pnsd_cache;
+static id_space_t *vnd_minors;
+static int vnd_list_init = 0;
+static sdev_plugin_hdl_t vnd_sdev_hdl;
+static gsqueue_set_t *vnd_sqset;
+
+static kmutex_t vnd_dev_lock;
+static list_t vnd_dev_list;	/* Protected by the vnd_dev_lock */
+static list_t vnd_nsd_list;	/* Protected by the vnd_dev_lock */
+
+/*
+ * STREAMS ioctls
+ *
+ * The STREAMS ioctls are internal to vnd. No one outside of vnd should see
+ * them; as such, they aren't a part of the header file.
+ */
+#define	VND_STRIOC	(('v' << 24) | ('n' << 16) | ('d' << 8) | 0x80)
+
+/*
+ * Private ioctl to associate a given STREAMS instance with a minor instance
+ * of the character device.
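+ *
+ * As a sketch of the flow, inferred from the structure below: the character
+ * device side fills in vsa_minor and vsa_nsid and sends this ioctl down the
+ * freshly opened datalink stream; the STREAMS side then performs the
+ * association and passes any failure back out through vsa_errno.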
+ */ +#define VND_STRIOC_ASSOCIATE (VND_STRIOC | 0x1) + +typedef struct vnd_strioc_associate { + minor_t vsa_minor; /* minor device node */ + netstackid_t vsa_nsid; /* netstack id */ + vnd_errno_t vsa_errno; /* errno */ +} vnd_strioc_associate_t; + +typedef enum vnd_strioc_state { + VSS_UNKNOWN = 0, + VSS_COPYIN = 1, + VSS_COPYOUT = 2, +} vnd_strioc_state_t; + +typedef struct vnd_strioc { + vnd_strioc_state_t vs_state; + caddr_t vs_addr; +} vnd_strioc_t; + +/* + * VND SQUEUE TAGS, start at 0x42 so we don't overlap with extent tags. Though + * really, overlap is at the end of the day, inevitable. + */ +#define VND_SQUEUE_TAG_TX_DRAIN 0x42 +#define VND_SQUEUE_TAG_MAC_FLOW_CONTROL 0x43 +#define VND_SQUEUE_TAG_VND_WRITE 0x44 +#define VND_SQUEUE_TAG_ND_FRAMEIO_WRITE 0x45 +#define VND_SQUEUE_TAG_STRBARRIER 0x46 + +/* + * vnd reserved names. These are names which are reserved by vnd and thus + * shouldn't be used by some external program. + */ +static char *vnd_reserved_names[] = { + "ctl", + "zone", + NULL +}; + +/* + * vnd's DTrace probe macros + * + * DTRACE_VND* are all for a stable provider. We also have an unstable internal + * set of probes for reference count manipulation. + */ +#define DTRACE_VND3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__vnd_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_VND4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4); + +#define DTRACE_VND5(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__vnd_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5); + +#define DTRACE_VND_REFINC(vdp) \ + DTRACE_PROBE2(vnd__ref__inc, vnd_dev_t *, vdp, int, vdp->vdd_ref); +#define DTRACE_VND_REFDEC(vdp) \ + DTRACE_PROBE2(vnd__ref__dec, vnd_dev_t *, vdp, int, vdp->vdd_ref); + + +/* + * Tunables + */ +size_t vnd_vdq_default_size = 1024 * 64; /* 64 KB */ +size_t vnd_vdq_hard_max = 1024 * 1024 * 4; /* 4 MB */ + +/* + * These numbers are designed as per-device tunables that are applied when a new + * vnd device is attached. They're a rough stab at what may be a reasonable + * amount of work to do in one burst in an squeue. + */ +size_t vnd_flush_burst_size = 1520 * 10; /* 10 1500 MTU packets */ +size_t vnd_flush_nburst = 10; /* 10 frames */ + +/* + * Constants related to our sdev plugins + */ +#define VND_SDEV_NAME "vnd" +#define VND_SDEV_ROOT "/dev/vnd" +#define VND_SDEV_ZROOT "/dev/vnd/zone" + +/* + * Statistic macros + */ +#define VND_STAT_INC(vsp, field, val) \ + atomic_add_64(&(vsp)->vns_ksdata.field.value.ui64, val) +#define VND_LATENCY_1MS 1000000 +#define VND_LATENCY_10MS 10000000 +#define VND_LATENCY_100MS 100000000 +#define VND_LATENCY_1S 1000000000 +#define VND_LATENCY_10S 10000000000 + +/* + * Constants for vnd hooks + */ +static uint8_t vnd_bcast_addr[6] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; +#define IPV4_MCAST_LEN 3 +static uint8_t vnd_ipv4_mcast[3] = { 0x01, 0x00, 0x5E }; +#define IPV6_MCAST_LEN 2 +static uint8_t vnd_ipv6_mcast[2] = { 0x33, 0x33 }; + +/* + * vnd internal data structures and types + */ + +struct vnd_str; +struct vnd_dev; +struct vnd_pnsd; + +/* + * As part of opening the device stream we need to properly communicate with our + * underlying stream. This is a bit of an asynchronous dance and we need to + * properly work with dld to get everything set up. We have to initiate the + * conversation with dld and as such we keep track of our state here. 
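+ *
+ * The happy path through these states, as driven by
+ * vnd_str_state_transition() later in this file, is (VNS_S_ATTACH_SENT only
+ * occurs for DL_STYLE2 devices; any failure lands us in VNS_S_ZOMBIE with
+ * the prior state preserved in vns_laststate):
+ *
+ *	INITIAL -> INFO_SENT -> EXCLUSIVE_SENT [ -> ATTACH_SENT ] ->
+ *	BIND_SENT -> SAP_PROMISC_SENT -> MULTI_PROMISC_SENT ->
+ *	RX_ONLY_PROMISC_SENT -> CAPAB_Q_SENT -> ONLINE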
+ */ +typedef enum vnd_str_state { + VNS_S_INITIAL = 0, + VNS_S_INFO_SENT, + VNS_S_EXCLUSIVE_SENT, + VNS_S_ATTACH_SENT, + VNS_S_BIND_SENT, + VNS_S_SAP_PROMISC_SENT, + VNS_S_MULTI_PROMISC_SENT, + VNS_S_RX_ONLY_PROMISC_SENT, + VNS_S_CAPAB_Q_SENT, + VNS_S_CAPAB_E_SENT, + VNS_S_ONLINE, + VNS_S_SHUTTING_DOWN, + VNS_S_ZOMBIE +} vnd_str_state_t; + +typedef enum vnd_str_flags { + VNS_F_NEED_ZONE = 0x1, + VNS_F_TASKQ_DISPATCHED = 0x2, + VNS_F_CONDEMNED = 0x4, + VNS_F_FLOW_CONTROLLED = 0x8, + VNS_F_DRAIN_SCHEDULED = 0x10, + VNS_F_BARRIER = 0x20, + VNS_F_BARRIER_DONE = 0x40 +} vnd_str_flags_t; + +typedef enum vnd_capab_flags { + VNS_C_HCKSUM = 0x1, + VNS_C_DLD = 0x2, + VNS_C_DIRECT = 0x4, + VNS_C_HCKSUM_BADVERS = 0x8 +} vnd_capab_flags_t; + +/* + * Definitions to interact with direct callbacks + */ +typedef void (*vnd_rx_t)(struct vnd_str *, mac_resource_t *, mblk_t *, + mac_header_info_t *); +typedef uintptr_t vnd_mac_cookie_t; +/* DLD Direct capability function */ +typedef int (*vnd_dld_cap_t)(void *, uint_t, void *, uint_t); +/* DLD Direct tx function */ +typedef vnd_mac_cookie_t (*vnd_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); +/* DLD Direct function to set flow control callback */ +typedef void *(*vnd_dld_set_fcb_t)(void *, void (*)(void *, vnd_mac_cookie_t), + void *); +/* DLD Direct function to see if flow controlled still */ +typedef int (*vnd_dld_is_fc_t)(void *, vnd_mac_cookie_t); + +/* + * The vnd_str_capab_t is always protected by the vnd_str_t it's a member of. + */ +typedef struct vnd_str_capab { + vnd_capab_flags_t vsc_flags; + t_uscalar_t vsc_hcksum_opts; + vnd_dld_cap_t vsc_capab_f; + void *vsc_capab_hdl; + vnd_dld_tx_t vsc_tx_f; + void *vsc_tx_hdl; + vnd_dld_set_fcb_t vsc_set_fcb_f; + void *vsc_set_fcb_hdl; + vnd_dld_is_fc_t vsc_is_fc_f; + void *vsc_is_fc_hdl; + vnd_mac_cookie_t vsc_fc_cookie; + void *vsc_tx_fc_hdl; +} vnd_str_capab_t; + +/* + * The vnd_data_queue is a simple construct for storing a series of messages in + * a queue. + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_data_queue { + struct vnd_str *vdq_vns; /* E */ + kmutex_t vdq_lock; + kcondvar_t vdq_ready; /* Uses vdq_lock */ + ssize_t vdq_max; /* L */ + ssize_t vdq_cur; /* L */ + mblk_t *vdq_head; /* L */ + mblk_t *vdq_tail; /* L */ +} vnd_data_queue_t; + +typedef struct vnd_str_stat { + kstat_named_t vks_rbytes; + kstat_named_t vks_rpackets; + kstat_named_t vks_obytes; + kstat_named_t vks_opackets; + kstat_named_t vks_nhookindrops; + kstat_named_t vks_nhookoutdrops; + kstat_named_t vks_ndlpidrops; + kstat_named_t vks_ndataindrops; + kstat_named_t vks_ndataoutdrops; + kstat_named_t vks_tdrops; + kstat_named_t vks_linkname; + kstat_named_t vks_zonename; + kstat_named_t vks_nmacflow; + kstat_named_t vks_tmacflow; + kstat_named_t vks_mac_flow_1ms; + kstat_named_t vks_mac_flow_10ms; + kstat_named_t vks_mac_flow_100ms; + kstat_named_t vks_mac_flow_1s; + kstat_named_t vks_mac_flow_10s; +} vnd_str_stat_t; + +/* + * vnd stream structure + * + * See synchronization section of the big theory statement for member + * annotations. 
+ */ +typedef struct vnd_str { + kmutex_t vns_lock; + kcondvar_t vns_cancelcv; /* Uses vns_lock */ + kcondvar_t vns_barriercv; /* Uses vns_lock */ + kcondvar_t vns_stcv; /* Uses vns_lock */ + vnd_str_state_t vns_state; /* L */ + vnd_str_state_t vns_laststate; /* L */ + vnd_errno_t vns_errno; /* L */ + vnd_str_flags_t vns_flags; /* L */ + vnd_str_capab_t vns_caps; /* L */ + taskq_ent_t vns_tqe; /* L */ + vnd_data_queue_t vns_dq_read; /* E */ + vnd_data_queue_t vns_dq_write; /* E */ + mblk_t *vns_dlpi_inc; /* L */ + queue_t *vns_rq; /* E */ + queue_t *vns_wq; /* E */ + queue_t *vns_lrq; /* E */ + t_uscalar_t vns_dlpi_style; /* L */ + t_uscalar_t vns_minwrite; /* L */ + t_uscalar_t vns_maxwrite; /* L */ + hrtime_t vns_fclatch; /* L */ + hrtime_t vns_fcupdate; /* L */ + kstat_t *vns_kstat; /* E */ + gsqueue_t *vns_squeue; /* E */ + mblk_t vns_drainblk; /* E + X */ + mblk_t vns_barrierblk; /* E + X */ + vnd_str_stat_t vns_ksdata; /* A */ + size_t vns_nflush; /* L */ + size_t vns_bsize; /* L */ + struct vnd_dev *vns_dev; /* E + X */ + struct vnd_pnsd *vns_nsd; /* E + X */ +} vnd_str_t; + +typedef enum vnd_dev_flags { + VND_D_ATTACH_INFLIGHT = 0x001, + VND_D_ATTACHED = 0x002, + VND_D_LINK_INFLIGHT = 0x004, + VND_D_LINKED = 0x008, + VND_D_CONDEMNED = 0x010, + VND_D_ZONE_DYING = 0x020, + VND_D_OPENED = 0x040 +} vnd_dev_flags_t; + +/* + * This represents the data associated with a minor device instance. + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_dev { + kmutex_t vdd_lock; + list_node_t vdd_link; /* GL */ + list_node_t vdd_nslink; /* NSL */ + int vdd_ref; /* L */ + vnd_dev_flags_t vdd_flags; /* L */ + minor_t vdd_minor; /* E */ + dev_t vdd_devid; /* E */ + ldi_ident_t vdd_ldiid; /* E */ + ldi_handle_t vdd_ldih; /* X */ + cred_t *vdd_cr; /* X */ + vnd_str_t *vdd_str; /* L */ + struct pollhead vdd_ph; /* E */ + struct vnd_pnsd *vdd_nsd; /* E + X */ + char vdd_datalink[VND_NAMELEN]; /* L */ + char vdd_lname[VND_NAMELEN]; /* L */ +} vnd_dev_t; + +typedef enum vnd_pnsd_flags { + VND_NS_CONDEMNED = 0x1 +} vnd_pnsd_flags_t; + +/* + * Per netstack data structure. + * + * See synchronization section of the big theory statement for member + * annotations. + */ +typedef struct vnd_pnsd { + list_node_t vpnd_link; /* protected by global dev lock */ + zoneid_t vpnd_zid; /* E */ + netstackid_t vpnd_nsid; /* E */ + boolean_t vpnd_hooked; /* E */ + net_handle_t vpnd_neti_v4; /* E */ + hook_family_t vpnd_family_v4; /* E */ + hook_event_t vpnd_event_in_v4; /* E */ + hook_event_t vpnd_event_out_v4; /* E */ + hook_event_token_t vpnd_token_in_v4; /* E */ + hook_event_token_t vpnd_token_out_v4; /* E */ + net_handle_t vpnd_neti_v6; /* E */ + hook_family_t vpnd_family_v6; /* E */ + hook_event_t vpnd_event_in_v6; /* E */ + hook_event_t vpnd_event_out_v6; /* E */ + hook_event_token_t vpnd_token_in_v6; /* E */ + hook_event_token_t vpnd_token_out_v6; /* E */ + kmutex_t vpnd_lock; /* Protects remaining members */ + kcondvar_t vpnd_ref_change; /* Uses vpnd_lock */ + int vpnd_ref; /* L */ + vnd_pnsd_flags_t vpnd_flags; /* L */ + list_t vpnd_dev_list; /* L */ +} vnd_pnsd_t; + +static void vnd_squeue_tx_drain(void *, mblk_t *, gsqueue_t *, void *); + +/* + * Drop function signature. 
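+ *
+ * Droppers are handed to the data queue routines (eg. vnd_dq_push() and
+ * vnd_dq_flush() below); each call site picks the dropper whose kstats
+ * should be bumped when a message has to be freed.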
+ */ +typedef void (*vnd_dropper_f)(vnd_str_t *, mblk_t *, const char *); + +static void +vnd_drop_ctl(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__ctl, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_ndataoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_in(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__in, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookindrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_hook_out(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + DTRACE_VND4(drop__out, mblk_t *, mp, vnd_str_t *, vsp, mblk_t *, + mp, const char *, reason); + if (mp != NULL) { + freemsg(mp); + } + VND_STAT_INC(vsp, vks_nhookoutdrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); +} + +static void +vnd_drop_panic(vnd_str_t *vsp, mblk_t *mp, const char *reason) +{ + panic("illegal vnd drop"); +} + +static void +vnd_mac_drop_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + mblk_t *mp; + + while (mp_chain != NULL) { + mp = mp_chain; + mp_chain = mp->b_next; + vnd_drop_hook_in(vsp, mp, "stream not associated"); + } +} + +static vnd_pnsd_t * +vnd_nsd_lookup(netstackid_t nsid) +{ + vnd_pnsd_t *nsp; + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + if (nsp->vpnd_nsid == nsid) { + mutex_enter(&nsp->vpnd_lock); + VERIFY(nsp->vpnd_ref >= 0); + nsp->vpnd_ref++; + mutex_exit(&nsp->vpnd_lock); + break; + } + } + mutex_exit(&vnd_dev_lock); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zid(zoneid_t zid) +{ + netstack_t *ns; + vnd_pnsd_t *nsp; + ns = netstack_find_by_zoneid(zid); + if (ns == NULL) + return (NULL); + nsp = vnd_nsd_lookup(ns->netstack_stackid); + netstack_rele(ns); + return (nsp); +} + +static vnd_pnsd_t * +vnd_nsd_lookup_by_zonename(char *zname) +{ + zone_t *zonep; + vnd_pnsd_t *nsp; + + zonep = zone_find_by_name(zname); + if (zonep == NULL) + return (NULL); + + nsp = vnd_nsd_lookup_by_zid(zonep->zone_id); + zone_rele(zonep); + return (nsp); +} + +static void +vnd_nsd_ref(vnd_pnsd_t *nsp) +{ + mutex_enter(&nsp->vpnd_lock); + /* + * This can only be used on something that has been obtained through + * some other means. As such, the caller should already have a reference + * before adding another one. This function should not be used as a + * means of creating the initial reference. 
+	 */
+	VERIFY(nsp->vpnd_ref > 0);
+	nsp->vpnd_ref++;
+	mutex_exit(&nsp->vpnd_lock);
+	cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+static void
+vnd_nsd_rele(vnd_pnsd_t *nsp)
+{
+	mutex_enter(&nsp->vpnd_lock);
+	VERIFY(nsp->vpnd_ref > 0);
+	nsp->vpnd_ref--;
+	mutex_exit(&nsp->vpnd_lock);
+	cv_broadcast(&nsp->vpnd_ref_change);
+}
+
+static vnd_dev_t *
+vnd_dev_lookup(minor_t m)
+{
+	vnd_dev_t *vdp;
+	mutex_enter(&vnd_dev_lock);
+	for (vdp = list_head(&vnd_dev_list); vdp != NULL;
+	    vdp = list_next(&vnd_dev_list, vdp)) {
+		if (vdp->vdd_minor == m) {
+			mutex_enter(&vdp->vdd_lock);
+			VERIFY(vdp->vdd_ref > 0);
+			vdp->vdd_ref++;
+			DTRACE_VND_REFINC(vdp);
+			mutex_exit(&vdp->vdd_lock);
+			break;
+		}
+	}
+	mutex_exit(&vnd_dev_lock);
+	return (vdp);
+}
+
+static void
+vnd_dev_free(vnd_dev_t *vdp)
+{
+	/*
+	 * When the STREAM exists we need to go through and make sure
+	 * communication gets torn down. As part of closing the stream, we
+	 * guarantee that nothing else should be able to enter the stream
+	 * layer at this point. That means no one should be able to call
+	 * read(), write() or one of the frameio ioctls.
+	 */
+	if (vdp->vdd_flags & VND_D_ATTACHED) {
+		ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr);
+		crfree(vdp->vdd_cr);
+		vdp->vdd_cr = NULL;
+
+		/*
+		 * We have to remove ourselves from our parent's list now. It
+		 * is really quite important that we have already set the
+		 * condemned flag here so that our containing netstack
+		 * basically knows that we're on the way down and knows not to
+		 * wait for us. It's also important that we do that before we
+		 * put a rele on the device as that is the point at which it
+		 * will check again.
+		 */
+		mutex_enter(&vdp->vdd_nsd->vpnd_lock);
+		list_remove(&vdp->vdd_nsd->vpnd_dev_list, vdp);
+		mutex_exit(&vdp->vdd_nsd->vpnd_lock);
+		vnd_nsd_rele(vdp->vdd_nsd);
+		vdp->vdd_nsd = NULL;
+	}
+	ASSERT(vdp->vdd_flags & VND_D_CONDEMNED);
+	id_free(vnd_minors, vdp->vdd_minor);
+	mutex_destroy(&vdp->vdd_lock);
+	kmem_cache_free(vnd_dev_cache, vdp);
+}
+
+static void
+vnd_dev_ref(vnd_dev_t *vdp)
+{
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_ref > 0);
+	vdp->vdd_ref++;
+	DTRACE_VND_REFINC(vdp);
+	mutex_exit(&vdp->vdd_lock);
+}
+
+/*
+ * As part of releasing the hold on this we may tear down a given vnd_dev_t.
+ * As such, we need to make sure that we grab the list lock first before
+ * grabbing the vnd_dev_t's lock to ensure proper lock ordering.
+ */
+static void
+vnd_dev_rele(vnd_dev_t *vdp)
+{
+	mutex_enter(&vnd_dev_lock);
+	mutex_enter(&vdp->vdd_lock);
+	VERIFY(vdp->vdd_ref > 0);
+	vdp->vdd_ref--;
+	DTRACE_VND_REFDEC(vdp);
+	if (vdp->vdd_ref > 0) {
+		mutex_exit(&vdp->vdd_lock);
+		mutex_exit(&vnd_dev_lock);
+		return;
+	}
+
+	/*
+	 * As the reference count has hit zero, we can remove this from the
+	 * list and drop our locks. No one else can find this device and
+	 * reference it. With a reference count of zero, it by definition does
+	 * not have any remaining entries in /devices that could lead someone
+	 * back to this.
+	 */
+	vdp->vdd_flags |= VND_D_CONDEMNED;
+	list_remove(&vnd_dev_list, vdp);
+	mutex_exit(&vdp->vdd_lock);
+	mutex_exit(&vnd_dev_lock);
+
+	vnd_dev_free(vdp);
+}
+
+/*
+ * Insert a message block chain if there's space; otherwise, drop it. Return
+ * one when the chain is inserted so that a consumer waiting for data knows
+ * it has arrived, eg. the caller should consider a broadcast.
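+ *
+ * As a sketch, the expected call pattern mirrors what vnd_mac_input() does
+ * later in this file:
+ *
+ *	mutex_enter(&vqp->vdq_lock);
+ *	signal |= vnd_dq_push(vqp, mp, B_FALSE, vnd_drop_in);
+ *	mutex_exit(&vqp->vdq_lock);
+ *	if (signal != 0)
+ *		cv_broadcast(&vqp->vdq_ready);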
+ */
+static int
+vnd_dq_push(vnd_data_queue_t *vqp, mblk_t *mp, boolean_t reserved,
+    vnd_dropper_f dropf)
+{
+	size_t msize;
+
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	if (reserved == B_FALSE) {
+		msize = msgsize(mp);
+		if (vqp->vdq_cur + msize > vqp->vdq_max) {
+			dropf(vqp->vdq_vns, mp, "buffer full");
+			return (0);
+		}
+		vqp->vdq_cur += msize;
+	}
+
+	if (vqp->vdq_head == NULL) {
+		ASSERT(vqp->vdq_tail == NULL);
+		vqp->vdq_head = mp;
+		vqp->vdq_tail = mp;
+	} else {
+		vqp->vdq_tail->b_next = mp;
+		vqp->vdq_tail = mp;
+	}
+
+	return (1);
+}
+
+/*
+ * Remove a message block chain. If the amount of space in the buffer has
+ * changed we return 1. We have no way of knowing whether or not there is
+ * enough space overall for a given writer who is blocked, so we always end
+ * up having to return true and thus tell consumers that they should consider
+ * signalling.
+ */
+static int
+vnd_dq_pop(vnd_data_queue_t *vqp, mblk_t **mpp)
+{
+	size_t msize;
+	mblk_t *mp;
+
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	ASSERT(mpp != NULL);
+	if (vqp->vdq_head == NULL) {
+		ASSERT(vqp->vdq_tail == NULL);
+		*mpp = NULL;
+		return (0);
+	}
+
+	mp = vqp->vdq_head;
+	msize = msgsize(mp);
+
+	vqp->vdq_cur -= msize;
+	if (mp->b_next == NULL) {
+		vqp->vdq_head = NULL;
+		vqp->vdq_tail = NULL;
+		/*
+		 * We can't be certain that this is always going to be zero.
+		 * Someone may have basically taken a reservation of space on
+		 * the data queue, eg. claimed space but not yet pushed it on.
+		 */
+		ASSERT(vqp->vdq_cur >= 0);
+	} else {
+		vqp->vdq_head = mp->b_next;
+		ASSERT(vqp->vdq_cur > 0);
+	}
+	mp->b_next = NULL;
+	*mpp = mp;
+	return (1);
+}
+
+/*
+ * Reserve space in the queue. This will bump up the size of the queue and
+ * entitle the user to push something on later without bumping the space.
+ */
+static int
+vnd_dq_reserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	ASSERT(size >= 0);
+
+	if (size == 0)
+		return (0);
+
+	if (size + vqp->vdq_cur > vqp->vdq_max)
+		return (0);
+
+	vqp->vdq_cur += size;
+	return (1);
+}
+
+static void
+vnd_dq_unreserve(vnd_data_queue_t *vqp, ssize_t size)
+{
+	ASSERT(MUTEX_HELD(&vqp->vdq_lock));
+	ASSERT(size > 0);
+	ASSERT(size <= vqp->vdq_cur);
+
+	vqp->vdq_cur -= size;
+}
+
+static void
+vnd_dq_flush(vnd_data_queue_t *vqp, vnd_dropper_f dropf)
+{
+	mblk_t *mp, *next;
+
+	mutex_enter(&vqp->vdq_lock);
+	for (mp = vqp->vdq_head; mp != NULL; mp = next) {
+		next = mp->b_next;
+		mp->b_next = NULL;
+		dropf(vqp->vdq_vns, mp, "vnd_dq_flush");
+	}
+	vqp->vdq_cur = 0;
+	vqp->vdq_head = NULL;
+	vqp->vdq_tail = NULL;
+	mutex_exit(&vqp->vdq_lock);
+}
+
+static boolean_t
+vnd_dq_is_empty(vnd_data_queue_t *vqp)
+{
+	boolean_t ret;
+
+	mutex_enter(&vqp->vdq_lock);
+	if (vqp->vdq_head == NULL)
+		ret = B_TRUE;
+	else
+		ret = B_FALSE;
+	mutex_exit(&vqp->vdq_lock);
+
+	return (ret);
+}
+
+/*
+ * Get a network uint16_t from the message and translate it into something
+ * the host understands.
+ */
+static int
+vnd_mbc_getu16(mblk_t *mp, off_t off, uint16_t *out)
+{
+	size_t mpsize;
+	uint8_t *bp;
+
+	mpsize = msgsize(mp);
+	/* Check for overflow */
+	if (off + sizeof (uint16_t) > mpsize)
+		return (1);
+
+	mpsize = MBLKL(mp);
+	while (off >= mpsize) {
+		mp = mp->b_cont;
+		off -= mpsize;
+		mpsize = MBLKL(mp);
+	}
+
+	/*
+	 * Data is in network order. Note the second byte of data might be in
+	 * the next mp.
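+	 * For example, with a chain of two mblk_t's of lengths 13 and 7, a
+	 * read at off = 12 takes its first byte from the last byte of the
+	 * first mblk_t and its second byte from the b_rptr of the second.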
+ */ + bp = mp->b_rptr + off; + *out = *bp << 8; + if (off + 1 == mpsize) { + mp = mp->b_cont; + bp = mp->b_rptr; + } else { + bp++; + } + + *out |= *bp; + return (0); +} + +/* + * Given an mblk chain find the mblk and address of a particular offset. + */ +static int +vnd_mbc_getoffset(mblk_t *mp, off_t off, mblk_t **mpp, uintptr_t *offp) +{ + size_t mpsize; + + if (off >= msgsize(mp)) + return (1); + + mpsize = MBLKL(mp); + while (off >= mpsize) { + mp = mp->b_cont; + off -= mpsize; + mpsize = MBLKL(mp); + } + *mpp = mp; + *offp = (uintptr_t)mp->b_rptr + off; + + return (0); +} + +/* + * Fetch the destination mac address. Set *dstp to that mac address. If the data + * is not contiguous in the first mblk_t, fill in datap and set *dstp to it. + */ +static int +vnd_mbc_getdstmac(mblk_t *mp, uint8_t **dstpp, uint8_t *datap) +{ + int i; + + if (MBLKL(mp) >= ETHERADDRL) { + *dstpp = mp->b_rptr; + return (0); + } + + *dstpp = datap; + for (i = 0; i < ETHERADDRL; i += 2, datap += 2) { + if (vnd_mbc_getu16(mp, i, (uint16_t *)datap) != 0) + return (1); + } + + return (0); +} + +static int +vnd_hook(vnd_str_t *vsp, mblk_t **mpp, net_handle_t netiv4, hook_event_t hev4, + hook_event_token_t hetv4, net_handle_t netiv6, hook_event_t hev6, + hook_event_token_t hetv6, vnd_dropper_f hdrop, vnd_dropper_f ddrop) +{ + uint16_t etype; + int vlan = 0; + hook_pkt_event_t info; + size_t offset, mblen; + uint8_t *dstp; + uint8_t dstaddr[6]; + hook_event_t he; + hook_event_token_t het; + net_handle_t neti; + + /* + * Before we can ask if we're interested we have to do enough work to + * determine the ethertype. + */ + + /* Byte 12 is either the VLAN tag or the ethertype */ + if (vnd_mbc_getu16(*mpp, 12, &etype) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + if (etype == ETHERTYPE_VLAN) { + vlan = 1; + /* Actual ethertype is another four bytes in */ + if (vnd_mbc_getu16(*mpp, 16, &etype) != 0) { + ddrop(vsp, *mpp, + "packet has incomplete ethernet vlan header"); + *mpp = NULL; + return (1); + } + offset = sizeof (struct ether_vlan_header); + } else { + offset = sizeof (struct ether_header); + } + + /* + * At the moment we only hook on the kinds of things that the IP module + * would normally. + */ + if (etype != ETHERTYPE_IP && etype != ETHERTYPE_IPV6) + return (0); + + if (etype == ETHERTYPE_IP) { + neti = netiv4; + he = hev4; + het = hetv4; + } else { + neti = netiv6; + he = hev6; + het = hetv6; + } + + if (!he.he_interested) + return (0); + + + if (vnd_mbc_getdstmac(*mpp, &dstp, dstaddr) != 0) { + ddrop(vsp, *mpp, "packet has incomplete ethernet header"); + *mpp = NULL; + return (1); + } + + /* + * Now that we know we're interested, we have to do some additional + * sanity checking for IPF's sake, ala ip_check_length(). Specifically + * we need to check to make sure that the remaining packet size, + * excluding MAC, is at least the size of an IP header. 
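+	 * Concretely, after subtracting the MAC header (offset is 14 bytes
+	 * here, or 18 with a VLAN tag), at least IP_SIMPLE_HDR_LENGTH (20)
+	 * bytes must remain for IPv4, or IPV6_HDR_LEN (40) bytes for IPv6.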
+	 */
+	mblen = msgsize(*mpp);
+	if ((etype == ETHERTYPE_IP &&
+	    mblen - offset < IP_SIMPLE_HDR_LENGTH) ||
+	    (etype == ETHERTYPE_IPV6 && mblen - offset < IPV6_HDR_LEN)) {
+		ddrop(vsp, *mpp, "packet has invalid IP header");
+		*mpp = NULL;
+		return (1);
+	}
+
+	info.hpe_protocol = neti;
+	info.hpe_ifp = (phy_if_t)vsp;
+	info.hpe_ofp = (phy_if_t)vsp;
+	info.hpe_mp = mpp;
+	info.hpe_flags = 0;
+
+	if (bcmp(vnd_bcast_addr, dstp, ETHERADDRL) == 0)
+		info.hpe_flags |= HPE_BROADCAST;
+	else if (etype == ETHERTYPE_IP &&
+	    bcmp(vnd_ipv4_mcast, dstp, IPV4_MCAST_LEN) == 0)
+		info.hpe_flags |= HPE_MULTICAST;
+	else if (etype == ETHERTYPE_IPV6 &&
+	    bcmp(vnd_ipv6_mcast, dstp, IPV6_MCAST_LEN) == 0)
+		info.hpe_flags |= HPE_MULTICAST;
+
+	if (vnd_mbc_getoffset(*mpp, offset, &info.hpe_mb,
+	    (uintptr_t *)&info.hpe_hdr) != 0) {
+		ddrop(vsp, *mpp, "packet too small -- "
+		    "unable to find payload");
+		*mpp = NULL;
+		return (1);
+	}
+
+	if (hook_run(neti->netd_hooks, het, (hook_data_t)&info) != 0) {
+		hdrop(vsp, *mpp, "dropped by hooks");
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * This should not be used for DL_INFO_REQ.
+ */
+static mblk_t *
+vnd_dlpi_alloc(size_t len, t_uscalar_t prim)
+{
+	mblk_t *mp;
+	mp = allocb(len, BPRI_MED);
+	if (mp == NULL)
+		return (NULL);
+
+	mp->b_datap->db_type = M_PROTO;
+	mp->b_wptr = mp->b_rptr + len;
+	bzero(mp->b_rptr, len);
+	((dl_unitdata_req_t *)mp->b_rptr)->dl_primitive = prim;
+
+	return (mp);
+}
+
+static void
+vnd_dlpi_inc_push(vnd_str_t *vsp, mblk_t *mp)
+{
+	mblk_t **mpp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	ASSERT(mp->b_next == NULL);
+	mpp = &vsp->vns_dlpi_inc;
+	while (*mpp != NULL)
+		mpp = &((*mpp)->b_next);
+	*mpp = mp;
+}
+
+static mblk_t *
+vnd_dlpi_inc_pop(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vsp->vns_dlpi_inc;
+	if (mp != NULL) {
+		VERIFY(mp->b_next == NULL || mp->b_next != mp);
+		vsp->vns_dlpi_inc = mp->b_next;
+		mp->b_next = NULL;
+	}
+	return (mp);
+}
+
+static int
+vnd_st_sinfo(vnd_str_t *vsp)
+{
+	mblk_t *mp;
+	dl_info_req_t *dlir;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
+	    BPRI_HI);
+	if (mp == NULL) {
+		vsp->vns_errno = VND_E_NOMEM;
+		return (1);
+	}
+	vsp->vns_state = VNS_S_INFO_SENT;
+	cv_broadcast(&vsp->vns_stcv);
+
+	mp->b_datap->db_type = M_PCPROTO;
+	dlir = (dl_info_req_t *)mp->b_rptr;
+	mp->b_wptr = (uchar_t *)&dlir[1];
+	dlir->dl_primitive = DL_INFO_REQ;
+	putnext(vsp->vns_wq, mp);
+
+	return (0);
+}
+
+static int
+vnd_st_info(vnd_str_t *vsp)
+{
+	dl_info_ack_t *dlia;
+	mblk_t *mp;
+
+	VERIFY(MUTEX_HELD(&vsp->vns_lock));
+	mp = vnd_dlpi_inc_pop(vsp);
+	dlia = (dl_info_ack_t *)mp->b_rptr;
+	vsp->vns_dlpi_style = dlia->dl_provider_style;
+	vsp->vns_minwrite = dlia->dl_min_sdu;
+	vsp->vns_maxwrite = dlia->dl_max_sdu;
+
+	/*
+	 * At this time we only support DL_ETHER devices.
+	 */
+	if (dlia->dl_mac_type != DL_ETHER) {
+		freemsg(mp);
+		vsp->vns_errno = VND_E_NOTETHER;
+		return (1);
+	}
+
+	/*
+	 * Because vnd operates on entire packets, we need to manually account
+	 * for the ethernet header information. We add the size of the
+	 * ether_vlan_header to account for this, regardless of whether it is
+	 * using VLANs or not.
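+	 * For example, a plain Ethernet device reporting a 1500 byte maximum
+	 * SDU ends up with a vns_maxwrite of 1518 bytes, since sizeof (struct
+	 * ether_vlan_header) is 18: two 6 byte MAC addresses, the 4 byte
+	 * VLAN tag, and the 2 byte ethertype.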
+ */ + vsp->vns_maxwrite += sizeof (struct ether_vlan_header); + + freemsg(mp); + return (0); +} + +static int +vnd_st_sexclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_EXCLUSIVE_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_EXCLUSIVE_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + return (0); +} + +static int +vnd_st_exclusive(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_exclusive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_EXCLUSIVE_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_exclusive: got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_DLEXCL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +/* + * Send down a DLPI_ATTACH_REQ. + */ +static int +vnd_st_sattach(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_attach_req_t), DL_ATTACH_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + ((dl_attach_req_t *)mp->b_rptr)->dl_ppa = 0; + vsp->vns_state = VNS_S_ATTACH_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_attach(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "vnd_st_attach: unknown primitive type"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_ATTACH_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_attach: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_ATTACHFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_sbind(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_bind_req_t *dbrp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), + DL_BIND_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + dbrp = (dl_bind_req_t *)(mp->b_rptr); + dbrp->dl_sap = 0; + dbrp->dl_service_mode = DL_CLDLS; + + vsp->vns_state = VNS_S_BIND_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_bind(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_error_ack_t *)mp->b_rptr)->dl_primitive; + + if (prim != DL_BIND_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, "wrong dlpi primitive for vnd_st_bind"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_BINDFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_spromisc(vnd_str_t *vsp, int type, vnd_str_state_t next) +{ + mblk_t *mp; + dl_promiscon_req_t *dprp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = 
vnd_dlpi_alloc(sizeof (dl_promiscon_req_t), DL_PROMISCON_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + dprp = (dl_promiscon_req_t *)mp->b_rptr; + dprp->dl_level = type; + + vsp->vns_state = next; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static int +vnd_st_promisc(vnd_str_t *vsp) +{ + mblk_t *mp; + t_uscalar_t prim, cprim; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + prim = ((dl_ok_ack_t *)mp->b_rptr)->dl_primitive; + cprim = ((dl_ok_ack_t *)mp->b_rptr)->dl_correct_primitive; + + if (prim != DL_OK_ACK && prim != DL_ERROR_ACK) { + vnd_drop_ctl(vsp, mp, + "wrong dlpi primitive for vnd_st_promisc"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (cprim != DL_PROMISCON_REQ) { + vnd_drop_ctl(vsp, mp, + "vnd_st_promisc: Got ack/nack for wrong primitive"); + vsp->vns_errno = VND_E_DLPIINVAL; + return (1); + } + + if (prim == DL_ERROR_ACK) + vsp->vns_errno = VND_E_PROMISCFAIL; + + freemsg(mp); + return (prim == DL_ERROR_ACK); +} + +static int +vnd_st_scapabq(vnd_str_t *vsp) +{ + mblk_t *mp; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + mp = vnd_dlpi_alloc(sizeof (dl_capability_req_t), DL_CAPABILITY_REQ); + if (mp == NULL) { + vsp->vns_errno = VND_E_NOMEM; + return (1); + } + + vsp->vns_state = VNS_S_CAPAB_Q_SENT; + cv_broadcast(&vsp->vns_stcv); + putnext(vsp->vns_wq, mp); + + return (0); +} + +static void +vnd_mac_input(vnd_str_t *vsp, mac_resource_t *unused, mblk_t *mp_chain, + mac_header_info_t *mhip) +{ + int signal = 0; + mblk_t *mp; + vnd_pnsd_t *nsp = vsp->vns_nsd; + + ASSERT(vsp != NULL); + ASSERT(mp_chain != NULL); + + for (mp = mp_chain; mp != NULL; mp = mp_chain) { + uint16_t vid; + mp_chain = mp->b_next; + mp->b_next = NULL; + + /* + * If we were operating in a traditional dlpi context then we + * would have enabled DLIOCRAW and rather than the fast path we + * would come through dld_str_rx_raw. That function does two + * things that we have to consider doing ourselves. The first is + * that it adjusts the b_rptr back to account for dld bumping us + * past the mac header. It also tries to account for cases where + * mac provides an illusion of the mac header. Fortunately, dld + * only allows the fastpath when the media type is the same as + * the native type. Therefore all we have to do here is adjust + * the b_rptr. 
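+		 *
+		 * For tagged frames we additionally strip the 4 byte VLAN
+		 * tag: the bcopy below slides the 12 bytes of destination
+		 * and source MAC addresses forward so that they directly
+		 * precede the real ethertype, and b_rptr then advances past
+		 * the four stale leading bytes, leaving an untagged header.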
+		 */
+		ASSERT(mp->b_rptr >= DB_BASE(mp) + mhip->mhi_hdrsize);
+		mp->b_rptr -= mhip->mhi_hdrsize;
+		vid = VLAN_ID(mhip->mhi_tci);
+		if (mhip->mhi_istagged && vid != VLAN_ID_NONE) {
+			bcopy(mp->b_rptr, mp->b_rptr + 4, 12);
+			mp->b_rptr += 4;
+		}
+
+		if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4,
+		    nsp->vpnd_event_in_v4, nsp->vpnd_token_in_v4,
+		    nsp->vpnd_neti_v6, nsp->vpnd_event_in_v6,
+		    nsp->vpnd_token_in_v6, vnd_drop_hook_in, vnd_drop_in) != 0)
+			continue;
+
+		VND_STAT_INC(vsp, vks_rpackets, 1);
+		VND_STAT_INC(vsp, vks_rbytes, msgsize(mp));
+		DTRACE_VND5(recv, mblk_t *, mp, void *, NULL, void *, NULL,
+		    vnd_str_t *, vsp, mblk_t *, mp);
+		mutex_enter(&vsp->vns_dq_read.vdq_lock);
+		signal |= vnd_dq_push(&vsp->vns_dq_read, mp, B_FALSE,
+		    vnd_drop_in);
+		mutex_exit(&vsp->vns_dq_read.vdq_lock);
+	}
+
+	if (signal != 0) {
+		cv_broadcast(&vsp->vns_dq_read.vdq_ready);
+		pollwakeup(&vsp->vns_dev->vdd_ph, POLLIN | POLLRDNORM);
+	}
+}
+
+static void
+vnd_mac_flow_control_stat(vnd_str_t *vsp, hrtime_t diff)
+{
+	VND_STAT_INC(vsp, vks_nmacflow, 1);
+	VND_STAT_INC(vsp, vks_tmacflow, diff);
+	if (diff >= VND_LATENCY_1MS)
+		VND_STAT_INC(vsp, vks_mac_flow_1ms, 1);
+	if (diff >= VND_LATENCY_10MS)
+		VND_STAT_INC(vsp, vks_mac_flow_10ms, 1);
+	if (diff >= VND_LATENCY_100MS)
+		VND_STAT_INC(vsp, vks_mac_flow_100ms, 1);
+	if (diff >= VND_LATENCY_1S)
+		VND_STAT_INC(vsp, vks_mac_flow_1s, 1);
+	if (diff >= VND_LATENCY_10S)
+		VND_STAT_INC(vsp, vks_mac_flow_10s, 1);
+}
+
+/*
+ * This is a callback from MAC that indicates that we are allowed to send
+ * packets again.
+ */
+static void
+vnd_mac_flow_control(void *arg, vnd_mac_cookie_t cookie)
+{
+	vnd_str_t *vsp = arg;
+	hrtime_t now, diff;
+
+	mutex_enter(&vsp->vns_lock);
+	now = gethrtime();
+
+	/*
+	 * Check for the case that we beat vnd_squeue_tx_one to the punch.
+	 * There's also an additional case here that we got notified because
+	 * we're sharing a device that ran out of tx descriptors, even though
+	 * it wasn't because of us.
+	 */
+	if (!(vsp->vns_flags & VNS_F_FLOW_CONTROLLED)) {
+		vsp->vns_fcupdate = now;
+		mutex_exit(&vsp->vns_lock);
+		return;
+	}
+
+	ASSERT(vsp->vns_flags & VNS_F_FLOW_CONTROLLED);
+	ASSERT(vsp->vns_caps.vsc_fc_cookie == cookie);
+	vsp->vns_flags &= ~VNS_F_FLOW_CONTROLLED;
+	vsp->vns_caps.vsc_fc_cookie = NULL;
+	diff = now - vsp->vns_fclatch;
+	vsp->vns_fclatch = 0;
+	DTRACE_VND3(flow__resumed, vnd_str_t *, vsp, uint64_t,
+	    vsp->vns_dq_write.vdq_cur, uintptr_t, cookie);
+	/*
+	 * If someone has asked to flush the squeue and thus inserted a
+	 * barrier, then we shouldn't schedule a drain.
+ */ + if (!(vsp->vns_flags & (VNS_F_DRAIN_SCHEDULED | VNS_F_BARRIER))) { + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_drainblk, + vnd_squeue_tx_drain, vsp, GSQUEUE_FILL, + VND_SQUEUE_TAG_MAC_FLOW_CONTROL); + } + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_mac_enter(vnd_str_t *vsp, mac_perim_handle_t *mphp) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mphp, DLD_ENABLE) == 0); +} + +static void +vnd_mac_exit(vnd_str_t *vsp, mac_perim_handle_t mph) +{ + ASSERT(MUTEX_HELD(&vsp->vns_lock)); + VERIFY(vsp->vns_caps.vsc_capab_f(vsp->vns_caps.vsc_capab_hdl, + DLD_CAPAB_PERIM, mph, DLD_DISABLE) == 0); +} + +static int +vnd_dld_cap_enable(vnd_str_t *vsp, vnd_rx_t rxfunc) +{ + int ret; + dld_capab_direct_t d; + mac_perim_handle_t mph; + vnd_str_capab_t *c = &vsp->vns_caps; + + bzero(&d, sizeof (d)); + d.di_rx_cf = (uintptr_t)rxfunc; + d.di_rx_ch = vsp; + d.di_flags = DI_DIRECT_RAW; + + vnd_mac_enter(vsp, &mph); + + /* + * If we're coming in here for a second pass, we need to make sure that + * we remove an existing flow control notification callback, otherwise + * we'll create a duplicate that will remain with garbage data. + */ + if (c->vsc_tx_fc_hdl != NULL) { + ASSERT(c->vsc_set_fcb_hdl != NULL); + (void) c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, NULL, + c->vsc_tx_fc_hdl); + c->vsc_tx_fc_hdl = NULL; + } + + if (vsp->vns_caps.vsc_capab_f(c->vsc_capab_hdl, + DLD_CAPAB_DIRECT, &d, DLD_ENABLE) == 0) { + c->vsc_tx_f = (vnd_dld_tx_t)d.di_tx_df; + c->vsc_tx_hdl = d.di_tx_dh; + c->vsc_set_fcb_f = (vnd_dld_set_fcb_t)d.di_tx_cb_df; + c->vsc_set_fcb_hdl = d.di_tx_cb_dh; + c->vsc_is_fc_f = (vnd_dld_is_fc_t)d.di_tx_fctl_df; + c->vsc_is_fc_hdl = d.di_tx_fctl_dh; + c->vsc_tx_fc_hdl = c->vsc_set_fcb_f(c->vsc_set_fcb_hdl, + vnd_mac_flow_control, vsp); + c->vsc_flags |= VNS_C_DIRECT; + ret = 0; + } else { + vsp->vns_errno = VND_E_DIRECTFAIL; + ret = 1; + } + vnd_mac_exit(vsp, mph); + return (ret); +} + +static int +vnd_st_capabq(vnd_str_t *vsp) +{ + mblk_t *mp; + dl_capability_ack_t *cap; + dl_capability_sub_t *subp; + dl_capab_hcksum_t *hck; + dl_capab_dld_t *dld; + unsigned char *rp; + int ret = 0; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + mp = vnd_dlpi_inc_pop(vsp); + + rp = mp->b_rptr; + cap = (dl_capability_ack_t *)rp; + if (cap->dl_sub_length == 0) + goto done; + + /* Don't try to process something too big */ + if (sizeof (dl_capability_ack_t) + cap->dl_sub_length > MBLKL(mp)) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_CAPACKINVAL; + ret = 1; + goto done; + } + + rp += cap->dl_sub_offset; + + while (cap->dl_sub_length > 0) { + subp = (dl_capability_sub_t *)rp; + /* Sanity check something crazy from down below */ + if (subp->dl_length + sizeof (dl_capability_sub_t) > + cap->dl_sub_length) { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vsp->vns_errno = VND_E_SUBCAPINVAL; + ret = 1; + goto done; + } + + switch (subp->dl_cap) { + case DL_CAPAB_HCKSUM: + hck = (dl_capab_hcksum_t *)(rp + + sizeof (dl_capability_sub_t)); + if (hck->hcksum_version != HCKSUM_CURRENT_VERSION) { + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM_BADVERS; + break; + } + if (dlcapabcheckqid(&hck->hcksum_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_HCKSUM; + vsp->vns_caps.vsc_hcksum_opts = hck->hcksum_txflags; + break; + case DL_CAPAB_DLD: + dld = 
(dl_capab_dld_t *)(rp + + sizeof (dl_capability_sub_t)); + if (dld->dld_version != DLD_CURRENT_VERSION) { + vsp->vns_errno = VND_E_DLDBADVERS; + ret = 1; + goto done; + } + if (dlcapabcheckqid(&dld->dld_mid, vsp->vns_lrq) != + B_TRUE) { + vsp->vns_errno = VND_E_CAPABPASS; + ret = 1; + goto done; + } + vsp->vns_caps.vsc_flags |= VNS_C_DLD; + vsp->vns_caps.vsc_capab_f = + (vnd_dld_cap_t)dld->dld_capab; + vsp->vns_caps.vsc_capab_hdl = + (void *)dld->dld_capab_handle; + /* + * At this point in time, we have to set up a direct + * function that drops all input. This validates that + * we'll be able to set up direct input and that we can + * easily switch it earlier to the real data function + * when we've plumbed everything up. + */ + if (vnd_dld_cap_enable(vsp, vnd_mac_drop_input) != 0) { + /* vns_errno set by vnd_dld_cap_enable */ + ret = 1; + goto done; + } + break; + default: + /* Ignore unsupported cap */ + break; + } + + rp += sizeof (dl_capability_sub_t) + subp->dl_length; + cap->dl_sub_length -= sizeof (dl_capability_sub_t) + + subp->dl_length; + } + +done: + /* Make sure we enabled direct callbacks */ + if (ret == 0 && !(vsp->vns_caps.vsc_flags & VNS_C_DIRECT)) { + vsp->vns_errno = VND_E_DIRECTNOTSUP; + ret = 1; + } + + freemsg(mp); + return (ret); +} + +static void +vnd_st_sonline(vnd_str_t *vsp) +{ + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + vsp->vns_state = VNS_S_ONLINE; + cv_broadcast(&vsp->vns_stcv); +} + +static void +vnd_st_shutdown(vnd_str_t *vsp) +{ + mac_perim_handle_t mph; + vnd_str_capab_t *vsc = &vsp->vns_caps; + + VERIFY(MUTEX_HELD(&vsp->vns_lock)); + + /* + * At this point in time we know that there is no one transmitting as + * our final reference has been torn down and that vnd_s_close inserted + * a barrier to validate that everything is flushed. + */ + if (vsc->vsc_flags & VNS_C_DIRECT) { + vnd_mac_enter(vsp, &mph); + vsc->vsc_flags &= ~VNS_C_DIRECT; + (void) vsc->vsc_set_fcb_f(vsc->vsc_set_fcb_hdl, NULL, + vsc->vsc_tx_fc_hdl); + vsc->vsc_tx_fc_hdl = NULL; + (void) vsc->vsc_capab_f(vsc->vsc_capab_hdl, DLD_CAPAB_DIRECT, + NULL, DLD_DISABLE); + vnd_mac_exit(vsp, mph); + } + + /* + * We could send an unbind, but dld also does that for us. As we add + * more capabilities and the like, we should revisit this. + */ + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); +} + +/* + * Perform state transitions. This is a one way shot down the flow chart + * described in the big theory statement. 
+ */ +static void +vnd_str_state_transition(void *arg) +{ + boolean_t died = B_FALSE; + vnd_str_t *vsp = arg; + mblk_t *mp; + + mutex_enter(&vsp->vns_lock); + if (vsp->vns_dlpi_inc == NULL && (vsp->vns_state != VNS_S_INITIAL && + vsp->vns_state != VNS_S_SHUTTING_DOWN)) { + mutex_exit(&vsp->vns_lock); + return; + } + DTRACE_PROBE2(vnd__state__transition, uintptr_t, vsp, + vnd_str_state_t, vsp->vns_state); + switch (vsp->vns_state) { + case VNS_S_INITIAL: + VERIFY(vsp->vns_dlpi_inc == NULL); + if (vnd_st_sinfo(vsp) != 0) + died = B_TRUE; + break; + case VNS_S_INFO_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_info(vsp) == 0) { + if (vnd_st_sexclusive(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_EXCLUSIVE_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_exclusive(vsp) == 0) { + if (vsp->vns_dlpi_style == DL_STYLE2) { + if (vnd_st_sattach(vsp) != 0) + died = B_TRUE; + } else { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } + } else { + died = B_TRUE; + } + break; + case VNS_S_ATTACH_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_attach(vsp) == 0) { + if (vnd_st_sbind(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_BIND_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_bind(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_SAP, + VNS_S_SAP_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_SAP_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_MULTI, + VNS_S_MULTI_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_MULTI_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_spromisc(vsp, DL_PROMISC_RX_ONLY, + VNS_S_RX_ONLY_PROMISC_SENT) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_RX_ONLY_PROMISC_SENT: + VERIFY(vsp->vns_dlpi_inc != NULL); + if (vnd_st_promisc(vsp) == 0) { + if (vnd_st_scapabq(vsp) != 0) + died = B_TRUE; + } else { + died = B_TRUE; + } + break; + case VNS_S_CAPAB_Q_SENT: + if (vnd_st_capabq(vsp) != 0) + died = B_TRUE; + else + vnd_st_sonline(vsp); + break; + case VNS_S_SHUTTING_DOWN: + vnd_st_shutdown(vsp); + break; + case VNS_S_ZOMBIE: + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vsp committed suicide"); + break; + default: + panic("vnd_str_t entered an unknown state"); + } + + if (died == B_TRUE) { + ASSERT(vsp->vns_errno != VND_E_SUCCESS); + vsp->vns_laststate = vsp->vns_state; + vsp->vns_state = VNS_S_ZOMBIE; + cv_broadcast(&vsp->vns_stcv); + } + + mutex_exit(&vsp->vns_lock); +} + +static void +vnd_dlpi_taskq_dispatch(void *arg) +{ + vnd_str_t *vsp = arg; + int run = 1; + + while (run != 0) { + vnd_str_state_transition(vsp); + mutex_enter(&vsp->vns_lock); + if (vsp->vns_flags & VNS_F_CONDEMNED || + vsp->vns_dlpi_inc == NULL) { + run = 0; + vsp->vns_flags &= ~VNS_F_TASKQ_DISPATCHED; + } + if (vsp->vns_flags & VNS_F_CONDEMNED) + cv_signal(&vsp->vns_cancelcv); + mutex_exit(&vsp->vns_lock); + } +} + +static int +vnd_neti_getifname(net_handle_t neti, phy_if_t phy, char *buf, const size_t len) +{ + return (-1); +} + +static int +vnd_neti_getmtu(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +static int +vnd_neti_getptmue(net_handle_t neti) +{ + return (-1); +} + +static int +vnd_neti_getlifaddr(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + size_t nelem, net_ifaddr_t type[], void *storage) +{ + 
return (-1); +} + +static int +vnd_neti_getlifzone(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + zoneid_t *zid) +{ + return (-1); +} + +static int +vnd_neti_getlifflags(net_handle_t neti, phy_if_t phy, lif_if_t ifdata, + uint64_t *flags) +{ + return (-1); +} + +static phy_if_t +vnd_neti_phygetnext(net_handle_t neti, phy_if_t phy) +{ + return (-1); +} + +static phy_if_t +vnd_neti_phylookup(net_handle_t neti, const char *name) +{ + return (-1); +} + +static lif_if_t +vnd_neti_lifgetnext(net_handle_t neti, phy_if_t phy, lif_if_t ifdata) +{ + return (-1); +} + +static int +vnd_neti_inject(net_handle_t neti, inject_t style, net_inject_t *packet) +{ + return (-1); +} + +static phy_if_t +vnd_neti_route(net_handle_t neti, struct sockaddr *address, + struct sockaddr *next) +{ + return ((phy_if_t)-1); +} + +static int +vnd_neti_ispchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +static int +vnd_neti_isvchksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +static net_protocol_t vnd_neti_info_v4 = { + NETINFO_VERSION, + NHF_VND_INET, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + +static net_protocol_t vnd_neti_info_v6 = { + NETINFO_VERSION, + NHF_VND_INET6, + vnd_neti_getifname, + vnd_neti_getmtu, + vnd_neti_getptmue, + vnd_neti_getlifaddr, + vnd_neti_getlifzone, + vnd_neti_getlifflags, + vnd_neti_phygetnext, + vnd_neti_phylookup, + vnd_neti_lifgetnext, + vnd_neti_inject, + vnd_neti_route, + vnd_neti_ispchksum, + vnd_neti_isvchksum +}; + + +static int +vnd_netinfo_init(vnd_pnsd_t *nsp) +{ + nsp->vpnd_neti_v4 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v4); + ASSERT(nsp->vpnd_neti_v4 != NULL); + + nsp->vpnd_neti_v6 = net_protocol_register(nsp->vpnd_nsid, + &vnd_neti_info_v6); + ASSERT(nsp->vpnd_neti_v6 != NULL); + + nsp->vpnd_family_v4.hf_version = HOOK_VERSION; + nsp->vpnd_family_v4.hf_name = "vnd_inet"; + + if (net_family_register(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4) != 0) { + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_family_v6.hf_version = HOOK_VERSION; + nsp->vpnd_family_v6.hf_name = "vnd_inet6"; + + if (net_family_register(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6) != 0) { + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_family_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v4.he_version = HOOK_VERSION; + nsp->vpnd_event_in_v4.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v4.he_flags = 0; + nsp->vpnd_event_in_v4.he_interested = B_FALSE; + + nsp->vpnd_token_in_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_in_v4); + if (nsp->vpnd_token_in_v4 == NULL) { + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_in_v6.he_version = 
HOOK_VERSION; + nsp->vpnd_event_in_v6.he_name = NH_PHYSICAL_IN; + nsp->vpnd_event_in_v6.he_flags = 0; + nsp->vpnd_event_in_v6.he_interested = B_FALSE; + + nsp->vpnd_token_in_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_in_v6); + if (nsp->vpnd_token_in_v6 == NULL) { + net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v4.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v4.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v4.he_flags = 0; + nsp->vpnd_event_out_v4.he_interested = B_FALSE; + + nsp->vpnd_token_out_v4 = net_event_register(nsp->vpnd_neti_v4, + &nsp->vpnd_event_out_v4); + if (nsp->vpnd_token_out_v4 == NULL) { + net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + nsp->vpnd_event_out_v6.he_version = HOOK_VERSION; + nsp->vpnd_event_out_v6.he_name = NH_PHYSICAL_OUT; + nsp->vpnd_event_out_v6.he_flags = 0; + nsp->vpnd_event_out_v6.he_interested = B_FALSE; + + nsp->vpnd_token_out_v6 = net_event_register(nsp->vpnd_neti_v6, + &nsp->vpnd_event_out_v6); + if (nsp->vpnd_token_out_v6 == NULL) { + net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + net_protocol_unregister(nsp->vpnd_neti_v4); + net_protocol_unregister(nsp->vpnd_neti_v6); + cmn_err(CE_NOTE, "vnd_netinfo_init: net_event_register " + "failed for stack %d", nsp->vpnd_nsid); + return (1); + } + + return (0); +} + +static void +vnd_netinfo_shutdown(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + VERIFY(ret == 0); + ret = net_event_shutdown(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); +} + +static void +vnd_netinfo_fini(vnd_pnsd_t *nsp) +{ + int ret; + + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_in_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_event_out_v4); + VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_in_v6); + 
VERIFY(ret == 0); + ret = net_event_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_event_out_v6); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v4, &nsp->vpnd_family_v4); + VERIFY(ret == 0); + ret = net_family_unregister(nsp->vpnd_neti_v6, &nsp->vpnd_family_v6); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v4); + VERIFY(ret == 0); + ret = net_protocol_unregister(nsp->vpnd_neti_v6); + VERIFY(ret == 0); +} + +static void +vnd_strbarrier_cb(void *arg, mblk_t *bmp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + + VERIFY(bmp == &vsp->vns_barrierblk); + mutex_enter(&vsp->vns_lock); + VERIFY(vsp->vns_flags & VNS_F_BARRIER); + VERIFY(!(vsp->vns_flags & VNS_F_BARRIER_DONE)); + vsp->vns_flags |= VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * For better or worse, we have to broadcast here as we could have a + * thread that's blocked for completion as well as one that's blocked + * waiting to do a barrier itself. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * This is a data barrier for the stream while it is in fastpath mode. It blocks + * and ensures that there is nothing else in the squeue. + */ +static void +vnd_strbarrier(vnd_str_t *vsp) +{ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_flags & VNS_F_BARRIER) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags |= VNS_F_BARRIER; + mutex_exit(&vsp->vns_lock); + + gsqueue_enter_one(vsp->vns_squeue, &vsp->vns_barrierblk, + vnd_strbarrier_cb, vsp, GSQUEUE_PROCESS, VND_SQUEUE_TAG_STRBARRIER); + + mutex_enter(&vsp->vns_lock); + while (!(vsp->vns_flags & VNS_F_BARRIER_DONE)) + cv_wait(&vsp->vns_barriercv, &vsp->vns_lock); + vsp->vns_flags &= ~VNS_F_BARRIER; + vsp->vns_flags &= ~VNS_F_BARRIER_DONE; + mutex_exit(&vsp->vns_lock); + + /* + * We have to broadcast in case anyone is waiting for the barrier + * themselves. + */ + cv_broadcast(&vsp->vns_barriercv); +} + +/* + * Based on the type of message that we're dealing with we're going to want to + * do one of several things. Basically if it looks like it's something we know + * about, we should probably handle it in one of our transition threads. + * Otherwise, we should just simply putnext. + */ +static int +vnd_s_rput(queue_t *q, mblk_t *mp) +{ + t_uscalar_t prim; + int dispatch = 0; + vnd_str_t *vsp = q->q_ptr; + + switch (DB_TYPE(mp)) { + case M_PROTO: + case M_PCPROTO: + if (MBLKL(mp) < sizeof (t_uscalar_t)) { + vnd_drop_ctl(vsp, mp, "PROTO message too short"); + break; + } + + prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive; + if (prim == DL_UNITDATA_REQ || prim == DL_UNITDATA_IND) { + vnd_drop_ctl(vsp, mp, + "received an unsupported dlpi DATA req"); + break; + } + + /* + * Enqueue the entry and fire off a taskq dispatch. 
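+ * Note that the VNS_F_TASKQ_DISPATCHED flag guarantees at most one + * outstanding dispatch at a time: vnd_dlpi_taskq_dispatch() keeps + * calling vnd_str_state_transition() until the incoming queue is + * drained, so dispatching again while one is running would buy us + * nothing. 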
+ */ + mutex_enter(&vsp->vns_lock); + vnd_dlpi_inc_push(vsp, mp); + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + dispatch = 1; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + } + mutex_exit(&vsp->vns_lock); + if (dispatch != 0) + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, + vsp, 0, &vsp->vns_tqe); + break; + case M_DATA: + vnd_drop_in(vsp, mp, "M_DATA via put(9E)"); + break; + default: + putnext(vsp->vns_rq, mp); + } + return (0); +} + +static void +vnd_strioctl(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct iocblk *iocp) +{ + int error; + vnd_strioc_t *visp; + + if (iocp->ioc_cmd != VND_STRIOC_ASSOCIATE || + iocp->ioc_count != TRANSPARENT) { + error = EINVAL; + goto nak; + } + + /* + * All streams ioctls that we support must use kcred as a means to + * distinguish that this is a layered open by the kernel as opposed to + * one by a user who has done an I_PUSH of the module. + */ + if (iocp->ioc_cr != kcred) { + error = EPERM; + goto nak; + } + + if (mp->b_cont == NULL) { + error = EAGAIN; + goto nak; + } + + visp = kmem_alloc(sizeof (vnd_strioc_t), KM_SLEEP); + ASSERT(MBLKL(mp->b_cont) == sizeof (caddr_t)); + visp->vs_addr = *(caddr_t *)mp->b_cont->b_rptr; + visp->vs_state = VSS_COPYIN; + + mcopyin(mp, (void *)visp, sizeof (vnd_strioc_associate_t), NULL); + qreply(q, mp); + + return; + +nak: + if (mp->b_cont != NULL) { + freemsg(mp->b_cont); + mp->b_cont = NULL; + } + + iocp->ioc_error = error; + mp->b_datap->db_type = M_IOCNAK; + iocp->ioc_count = 0; + qreply(q, mp); +} + +static void +vnd_striocdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + int error; + vnd_str_state_t state; + struct copyreq *crp; + vnd_strioc_associate_t *vss; + vnd_dev_t *vdp = NULL; + vnd_pnsd_t *nsp = NULL; + char iname[2*VND_NAMELEN]; + zone_t *zone; + vnd_strioc_t *visp; + + visp = (vnd_strioc_t *)csp->cp_private; + + /* If it's not ours, it's not our problem */ + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); + } + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYIN failed"); + kmem_free(visp, sizeof (vnd_strioc_t)); + return; + } + + /* Data is sitting for us in b_cont */ + if (mp->b_cont == NULL || + MBLKL(mp->b_cont) != sizeof (vnd_strioc_associate_t)) { + kmem_free(visp, sizeof (vnd_strioc_t)); + miocnak(q, mp, 0, EINVAL); + return; + } + + vss = (vnd_strioc_associate_t *)mp->b_cont->b_rptr; + vdp = vnd_dev_lookup(vss->vsa_minor); + if (vdp == NULL) { + error = EIO; + vss->vsa_errno = VND_E_NODEV; + goto nak; + } + + nsp = vnd_nsd_lookup(vss->vsa_nsid); + if (nsp == NULL) { + error = EIO; + vss->vsa_errno = VND_E_NONETSTACK; + goto nak; + } + + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_NEED_ZONE)) { + mutex_exit(&vsp->vns_lock); + error = EEXIST; + vss->vsa_errno = VND_E_ASSOCIATED; + goto nak; + } + + vsp->vns_nsd = nsp; + vsp->vns_flags &= ~VNS_F_NEED_ZONE; + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + mutex_exit(&vsp->vns_lock); + + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, 0, + &vsp->vns_tqe); + + + /* At this point we need to wait until we have transitioned to ONLINE */ + mutex_enter(&vsp->vns_lock); + while (vsp->vns_state != VNS_S_ONLINE && vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + state = 
vsp->vns_state; + mutex_exit(&vsp->vns_lock); + + if (state == VNS_S_ZOMBIE) { + vss->vsa_errno = vsp->vns_errno; + error = EIO; + goto nak; + } + + mutex_enter(&vdp->vdd_lock); + mutex_enter(&vsp->vns_lock); + VERIFY(vdp->vdd_str == NULL); + /* + * Now initialize the remaining kstat properties and let's go ahead and + * create it. + */ + (void) snprintf(iname, sizeof (iname), "z%d_%d", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_minor); + vsp->vns_kstat = kstat_create_zone("vnd", vdp->vdd_minor, iname, "net", + KSTAT_TYPE_NAMED, sizeof (vnd_str_stat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID); + if (vsp->vns_kstat == NULL) { + error = EIO; + vss->vsa_errno = VND_E_KSTATCREATE; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + vdp->vdd_str = vsp; + vsp->vns_dev = vdp; + + /* + * Now, it's time to do the last thing that can fail, changing out the + * input function. After this we know that we can receive data, so we + * should make sure that we're ready. + */ + if (vnd_dld_cap_enable(vsp, vnd_mac_input) != 0) { + error = EIO; + vss->vsa_errno = VND_E_DIRECTFAIL; + vdp->vdd_str = NULL; + vsp->vns_dev = NULL; + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + goto nak; + } + + zone = zone_find_by_id(vdp->vdd_nsd->vpnd_zid); + ASSERT(zone != NULL); + vsp->vns_kstat->ks_data = &vsp->vns_ksdata; + /* Account for zone name */ + vsp->vns_kstat->ks_data_size += strlen(zone->zone_name) + 1; + /* Account for eventual link name */ + vsp->vns_kstat->ks_data_size += VND_NAMELEN; + kstat_named_setstr(&vsp->vns_ksdata.vks_zonename, zone->zone_name); + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + zone_rele(zone); + kstat_install(vsp->vns_kstat); + + mutex_exit(&vsp->vns_lock); + mutex_exit(&vdp->vdd_lock); + + /* + * Note that the vnd_str_t does not keep a permanent hold on the + * vnd_pnsd_t. We leave that up to the vnd_dev_t as that's also what + * the netstack goes through to take care of everything. + */ + vss->vsa_errno = VND_E_SUCCESS; +nak: + if (vdp != NULL) + vnd_dev_rele(vdp); + if (nsp != NULL) + vnd_nsd_rele(nsp); + /* + * Change the copyin request to a copyout. Note that we can't use + * mcopyout here as it only works when the DB_TYPE is M_IOCTL. That's + * okay, as the copyin vs. copyout is basically the same. 
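+ * Specifically, struct copyreq and struct copyresp are laid out so + * that their common members (cq_cmd/cp_cmd, cq_cr/cp_cr, cq_id/cp_id, + * and cq_private/cp_private) overlay one another, which is what lets + * us rewrite the fields in place below and simply flip DB_TYPE. 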
+ */ + DB_TYPE(mp) = M_COPYOUT; + visp->vs_state = VSS_COPYOUT; + crp = (struct copyreq *)mp->b_rptr; + crp->cq_private = (void *)visp; + crp->cq_addr = visp->vs_addr; + crp->cq_size = sizeof (vnd_strioc_associate_t); + qreply(q, mp); +} + +static void +vnd_stroutdata(queue_t *q, vnd_str_t *vsp, mblk_t *mp, struct copyresp *csp) +{ + ASSERT(csp->cp_private != NULL); + kmem_free(csp->cp_private, sizeof (vnd_strioc_t)); + if (csp->cp_cmd != VND_STRIOC_ASSOCIATE) { + if (q->q_next != NULL) { + putnext(q, mp); + } else { + VND_STAT_INC(vsp, vks_ndlpidrops, 1); + VND_STAT_INC(vsp, vks_tdrops, 1); + vnd_drop_ctl(vsp, mp, "unknown cmd for M_IOCDATA"); + } + return; + } + + /* The nak is already sent for us */ + if (csp->cp_rval != 0) { + vnd_drop_ctl(vsp, mp, "M_COPYOUT failed"); + return; + } + + /* Ack and let's be done with it all */ + miocack(q, mp, 0, 0); +} + +static int +vnd_s_wput(queue_t *q, mblk_t *mp) +{ + vnd_str_t *vsp = q->q_ptr; + struct copyresp *crp; + vnd_strioc_state_t vstate; + vnd_strioc_t *visp; + + switch (DB_TYPE(mp)) { + case M_IOCTL: + vnd_strioctl(q, vsp, mp, (struct iocblk *)mp->b_rptr); + return (0); + case M_IOCDATA: + crp = (struct copyresp *)mp->b_rptr; + ASSERT(crp->cp_private != NULL); + visp = (vnd_strioc_t *)crp->cp_private; + vstate = visp->vs_state; + ASSERT(vstate == VSS_COPYIN || vstate == VSS_COPYOUT); + if (vstate == VSS_COPYIN) + vnd_striocdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + else + vnd_stroutdata(q, vsp, mp, + (struct copyresp *)mp->b_rptr); + return (0); + default: + break; + } + if (q->q_next != NULL) + putnext(q, mp); + else + vnd_drop_ctl(vsp, mp, "!M_IOCTL in wput"); + + return (0); +} + +static int +vnd_s_open(queue_t *q, dev_t *devp, int oflag, int sflag, cred_t *credp) +{ + vnd_str_t *vsp; + uint_t rand; + + if (q->q_ptr != NULL) + return (EINVAL); + + if (!(sflag & MODOPEN)) + return (ENXIO); + + if (credp != kcred) + return (EPERM); + + vsp = kmem_cache_alloc(vnd_str_cache, KM_SLEEP); + bzero(vsp, sizeof (*vsp)); + mutex_init(&vsp->vns_lock, NULL, MUTEX_DRIVER, NULL); + cv_init(&vsp->vns_cancelcv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_barriercv, NULL, CV_DRIVER, NULL); + cv_init(&vsp->vns_stcv, NULL, CV_DRIVER, NULL); + vsp->vns_state = VNS_S_INITIAL; + + mutex_init(&vsp->vns_dq_read.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vsp->vns_dq_write.vdq_lock, NULL, MUTEX_DRIVER, NULL); + mutex_enter(&vnd_dev_lock); + vsp->vns_dq_read.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_read.vdq_vns = vsp; + vsp->vns_dq_write.vdq_max = vnd_vdq_default_size; + vsp->vns_dq_write.vdq_vns = vsp; + mutex_exit(&vnd_dev_lock); + vsp->vns_rq = q; + vsp->vns_wq = WR(q); + q->q_ptr = WR(q)->q_ptr = vsp; + vsp->vns_flags = VNS_F_NEED_ZONE; + vsp->vns_nflush = vnd_flush_nburst; + vsp->vns_bsize = vnd_flush_burst_size; + + (void) random_get_pseudo_bytes((uint8_t *)&rand, sizeof (rand)); + vsp->vns_squeue = gsqueue_set_get(vnd_sqset, rand); + + /* + * We create our kstat and initialize all of its fields now, but we + * don't install it until we actually do the zone association so we can + * get everything. 
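+ * (Once installed, the stats can be inspected from userland with, + * e.g., "kstat -m vnd"; the zone and link names are filled in later by + * vnd_striocdata() and vnd_ioctl_link() respectively.) 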
+ */ + kstat_named_init(&vsp->vns_ksdata.vks_rbytes, "rbytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_rpackets, "rpackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_obytes, "obytes", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_opackets, "opackets", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookindrops, "nhookindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_nhookoutdrops, "nhookoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndlpidrops, "ndlpidrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataindrops, "ndataindrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_ndataoutdrops, "ndataoutdrops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tdrops, "total_drops", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_linkname, "linkname", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_zonename, "zonename", + KSTAT_DATA_STRING); + kstat_named_init(&vsp->vns_ksdata.vks_nmacflow, "flowcontrol_events", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_tmacflow, "flowcontrol_time", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1ms, "flowcontrol_1ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10ms, "flowcontrol_10ms", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_100ms, + "flowcontrol_100ms", KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_1s, "flowcontrol_1s", + KSTAT_DATA_UINT64); + kstat_named_init(&vsp->vns_ksdata.vks_mac_flow_10s, "flowcontrol_10s", + KSTAT_DATA_UINT64); + qprocson(q); + /* + * Now that we've called qprocson, grab the lower module for making sure + * that we don't have any pass through modules. + */ + vsp->vns_lrq = RD(vsp->vns_wq->q_next); + + return (0); +} + +static int +vnd_s_close(queue_t *q, int flag, cred_t *credp) +{ + vnd_str_t *vsp; + mblk_t *mp; + + VERIFY(WR(q)->q_next != NULL); + + vsp = q->q_ptr; + ASSERT(vsp != NULL); + + /* + * We need to transition ourselves down. This means that we have a few + * important different things to do in the process of tearing down our + * input and output buffers, making sure we've drained the current + * squeue, and disabling the fast path. Before we disable the fast path, + * we should make sure the squeue is drained. Because we're in streams + * close, we know that no packets can come into us from userland, but we + * can receive more. As such, the following is the exact order of things + * that we do: + * + * 1) flush the vns_dq_read + * 2) Insert the drain mblk + * 3) When it's been received, tear down the fast path by kicking + * off the state machine. 
+ * 4) One final flush of both the vns_dq_read,vns_dq_write + */ + + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_strbarrier(vsp); + mutex_enter(&vsp->vns_lock); + vsp->vns_state = VNS_S_SHUTTING_DOWN; + if (!(vsp->vns_flags & VNS_F_TASKQ_DISPATCHED)) { + vsp->vns_flags |= VNS_F_TASKQ_DISPATCHED; + taskq_dispatch_ent(vnd_taskq, vnd_dlpi_taskq_dispatch, vsp, + 0, &vsp->vns_tqe); + } + while (vsp->vns_state != VNS_S_ZOMBIE) + cv_wait(&vsp->vns_stcv, &vsp->vns_lock); + mutex_exit(&vsp->vns_lock); + + qprocsoff(q); + mutex_enter(&vsp->vns_lock); + vsp->vns_flags |= VNS_F_CONDEMNED; + while (vsp->vns_flags & VNS_F_TASKQ_DISPATCHED) + cv_wait(&vsp->vns_cancelcv, &vsp->vns_lock); + + while ((mp = vnd_dlpi_inc_pop(vsp)) != NULL) + vnd_drop_ctl(vsp, mp, "vnd_s_close"); + mutex_exit(&vsp->vns_lock); + + q->q_ptr = NULL; + vnd_dq_flush(&vsp->vns_dq_read, vnd_drop_in); + vnd_dq_flush(&vsp->vns_dq_write, vnd_drop_out); + mutex_destroy(&vsp->vns_dq_read.vdq_lock); + mutex_destroy(&vsp->vns_dq_write.vdq_lock); + + if (vsp->vns_kstat != NULL) + kstat_delete(vsp->vns_kstat); + mutex_destroy(&vsp->vns_lock); + cv_destroy(&vsp->vns_stcv); + cv_destroy(&vsp->vns_barriercv); + cv_destroy(&vsp->vns_cancelcv); + kmem_cache_free(vnd_str_cache, vsp); + + return (0); +} + +static vnd_mac_cookie_t +vnd_squeue_tx_one(vnd_str_t *vsp, mblk_t *mp) +{ + hrtime_t txtime; + vnd_mac_cookie_t vc; + + VND_STAT_INC(vsp, vks_opackets, 1); + VND_STAT_INC(vsp, vks_obytes, msgsize(mp)); + DTRACE_VND5(send, mblk_t *, mp, void *, NULL, void *, NULL, + vnd_str_t *, vsp, mblk_t *, mp); + /* Actually tx now */ + txtime = gethrtime(); + vc = vsp->vns_caps.vsc_tx_f(vsp->vns_caps.vsc_tx_hdl, + mp, 0, MAC_DROP_ON_NO_DESC); + + /* + * We need to check two different conditions before we immediately set + * the flow control lock. The first thing that we need to do is verify + * that this is an instance of hard flow control, so to say. The flow + * control callbacks won't always fire in cases where we still get a + * cookie returned. The explicit check for flow control will guarantee + * us that we'll get a subsequent notification callback. + * + * The second case comes about because we do not hold the + * vnd_str_t`vns_lock across calls to tx, we need to determine if a flow + * control notification already came across for us in a different thread + * calling vnd_mac_flow_control(). To deal with this, we record a + * timestamp every time that we change the flow control state. We grab + * txtime here before we transmit because that guarantees that the + * hrtime_t of the call to vnd_mac_flow_control() will be after txtime. + * + * If the flow control notification beat us to the punch, the value of + * vns_fcupdate will be larger than the value of txtime, and we should + * just record the statistics. However, if we didn't beat it to the + * punch (txtime > vns_fcupdate), then we know that it's safe to wait + * for a notification. 
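+ * + * In other words, the code below reduces to a signed comparison: if + * vns_fcupdate - txtime is positive, the notification callback already + * fired and we only account for the time spent blocked; otherwise we + * latch VNS_F_FLOW_CONTROLLED and stash the cookie until + * vnd_mac_flow_control() comes along. 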
+ */ + if (vc != NULL) { + hrtime_t diff; + + if (vsp->vns_caps.vsc_is_fc_f(vsp->vns_caps.vsc_is_fc_hdl, + vc) == 0) + return (NULL); + mutex_enter(&vsp->vns_lock); + diff = vsp->vns_fcupdate - txtime; + if (diff > 0) { + mutex_exit(&vsp->vns_lock); + vnd_mac_flow_control_stat(vsp, diff); + return (NULL); + } + vsp->vns_flags |= VNS_F_FLOW_CONTROLLED; + vsp->vns_caps.vsc_fc_cookie = vc; + vsp->vns_fclatch = txtime; + vsp->vns_fcupdate = txtime; + DTRACE_VND3(flow__blocked, vnd_str_t *, vsp, + uint64_t, vsp->vns_dq_write.vdq_cur, uintptr_t, vc); + mutex_exit(&vsp->vns_lock); + } + + return (vc); +} + +static void +vnd_squeue_tx_drain(void *arg, mblk_t *drain_mp, gsqueue_t *gsp, void *dummy) +{ + mblk_t *mp; + int nmps; + size_t mptot, nflush, bsize; + boolean_t blocked, empty; + vnd_data_queue_t *vqp; + vnd_str_t *vsp = arg; + + mutex_enter(&vsp->vns_lock); + /* + * We either enter here via an squeue or via vnd_squeue_tx_append(). In + * the former case we need to mark that there is no longer an active + * user of the drain block. + */ + if (drain_mp != NULL) { + VERIFY(drain_mp == &vsp->vns_drainblk); + VERIFY(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED); + vsp->vns_flags &= ~VNS_F_DRAIN_SCHEDULED; + } + + /* + * If we're still flow controlled or under a flush barrier, nothing to + * do. + */ + if (vsp->vns_flags & (VNS_F_FLOW_CONTROLLED | VNS_F_BARRIER)) { + mutex_exit(&vsp->vns_lock); + return; + } + + nflush = vsp->vns_nflush; + bsize = vsp->vns_bsize; + mutex_exit(&vsp->vns_lock); + + nmps = 0; + mptot = 0; + blocked = B_FALSE; + vqp = &vsp->vns_dq_write; + while (nmps < nflush && mptot <= bsize) { + mutex_enter(&vqp->vdq_lock); + if (vnd_dq_pop(vqp, &mp) == 0) { + mutex_exit(&vqp->vdq_lock); + break; + } + mutex_exit(&vqp->vdq_lock); + + nmps++; + mptot += msgsize(mp); + if (vnd_squeue_tx_one(vsp, mp) != NULL) { + blocked = B_TRUE; + break; + } + } + + empty = vnd_dq_is_empty(&vsp->vns_dq_write); + + /* + * If the queue is not empty, we're not blocked, and there isn't a drain + * scheduled, put it into the squeue with the drain block and + * GSQUEUE_FILL. + */ + if (blocked == B_FALSE && empty == B_FALSE) { + mutex_enter(&vsp->vns_lock); + if (!(vsp->vns_flags & VNS_F_DRAIN_SCHEDULED)) { + mblk_t *mp = &vsp->vns_drainblk; + vsp->vns_flags |= VNS_F_DRAIN_SCHEDULED; + gsqueue_enter_one(vsp->vns_squeue, + mp, vnd_squeue_tx_drain, vsp, + GSQUEUE_FILL, VND_SQUEUE_TAG_TX_DRAIN); + } + mutex_exit(&vsp->vns_lock); + } + + /* + * If we drained some amount of data, we need to signal the data queue. + */ + if (nmps > 0) { + cv_broadcast(&vsp->vns_dq_write.vdq_ready); + pollwakeup(&vsp->vns_dev->vdd_ph, POLLOUT); + } +} + +static void +vnd_squeue_tx_append(void *arg, mblk_t *mp, gsqueue_t *gsp, void *dummy) +{ + vnd_str_t *vsp = arg; + vnd_data_queue_t *vqp = &vsp->vns_dq_write; + vnd_pnsd_t *nsp = vsp->vns_nsd; + size_t len = msgsize(mp); + + /* + * Before we append this packet, we should run it through the firewall + * rules. + */ + if (nsp->vpnd_hooked && vnd_hook(vsp, &mp, nsp->vpnd_neti_v4, + nsp->vpnd_event_out_v4, nsp->vpnd_token_out_v4, nsp->vpnd_neti_v6, + nsp->vpnd_event_out_v6, nsp->vpnd_token_out_v6, vnd_drop_hook_out, + vnd_drop_out) != 0) { + /* + * Because we earlier reserved space for this packet and it's + * not making the cut, we need to go through and unreserve that + * space. Also note that the message block will likely be freed + * by the time we return from vnd_hook so we cannot rely on it. 
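+ * That is also why len was captured via msgsize() before vnd_hook() + * was called: recomputing the size from mp here would be a + * use-after-free whenever the hook dropped the packet. 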
+ */ + mutex_enter(&vqp->vdq_lock); + vnd_dq_unreserve(vqp, len); + mutex_exit(&vqp->vdq_lock); + return; + } + + /* + * We earlier reserved space for this packet. So for now simply append + * it and call drain. We know that no other drain can be going on right + * now thanks to the squeue. + */ + mutex_enter(&vqp->vdq_lock); + (void) vnd_dq_push(&vsp->vns_dq_write, mp, B_TRUE, vnd_drop_panic); + mutex_exit(&vqp->vdq_lock); + vnd_squeue_tx_drain(vsp, NULL, NULL, NULL); +} + +/* + * We need to see if this is a valid name of sorts for us. That means a few + * things. First off, we can't assume that what we've been given has actually + * been null terminated. More importantly, that it's a valid name as far as + * ddi_create_minor_node is concerned (that means no '@', '/', or ' '). We + * further constrain ourselves to simply alphanumeric characters and a few + * additional ones, ':', '-', and '_'. + */ +static int +vnd_validate_name(const char *buf, size_t buflen) +{ + int i, len; + + /* First make sure a null terminator exists */ + for (i = 0; i < buflen; i++) + if (buf[i] == '\0') + break; + len = i; + if (i == 0 || i == buflen) + return (0); + + for (i = 0; i < len; i++) + if (!isalnum(buf[i]) && buf[i] != ':' && buf[i] != '-' && + buf[i] != '_') + return (0); + + return (1); +} + +static int +vnd_ioctl_attach(vnd_dev_t *vdp, uintptr_t arg, cred_t *credp, int cpflag) +{ + vnd_ioc_attach_t via; + vnd_strioc_associate_t vss; + vnd_pnsd_t *nsp; + zone_t *zonep; + zoneid_t zid; + char buf[2*VND_NAMELEN]; + int ret, rp; + + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &via, sizeof (via), cpflag) != 0) + return (EFAULT); + via.via_errno = VND_E_SUCCESS; + + if (vnd_validate_name(via.via_name, VND_NAMELEN) == 0) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + + /* + * Only the global zone can request to create a device in a different + * zone. + */ + zid = crgetzoneid(credp); + if (zid != GLOBAL_ZONEID && via.via_zoneid != -1 && + zid != via.via_zoneid) { + via.via_errno = VND_E_PERM; + ret = EIO; + goto errcopyout; + } + + if (via.via_zoneid == -1) + via.via_zoneid = zid; + + /* + * Establish the name we'll use now. We want to be extra paranoid about + * the device we're opening so check that now. + */ + if (zid == GLOBAL_ZONEID && via.via_zoneid != zid) { + zonep = zone_find_by_id(via.via_zoneid); + if (zonep == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + if (snprintf(NULL, 0, "/dev/net/zone/%s/%s", zonep->zone_name, + via.via_name) >= sizeof (buf)) { + zone_rele(zonep); + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/zone/%s/%s", + zonep->zone_name, via.via_name); + zone_rele(zonep); + zonep = NULL; + } else { + if (snprintf(NULL, 0, "/dev/net/%s", via.via_name) >= + sizeof (buf)) { + via.via_errno = VND_E_BADNAME; + ret = EIO; + goto errcopyout; + } + (void) snprintf(buf, sizeof (buf), "/dev/net/%s", via.via_name); + } + + /* + * If our zone is dying then the netstack will have been removed from + * this list. + */ + nsp = vnd_nsd_lookup_by_zid(via.via_zoneid); + if (nsp == NULL) { + via.via_errno = VND_E_NOZONE; + ret = EIO; + goto errcopyout; + } + + /* + * Note we set the attached handle even though we haven't actually + * finished the process of attaching the ldi handle. 
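+ * The VND_D_ATTACH_INFLIGHT flag we set below is what makes that safe: + * any racing attach sees the flag and fails with VND_E_ATTACHED rather + * than operating on a half-constructed device. 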
+ */ + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & (VND_D_ATTACHED | VND_D_ATTACH_INFLIGHT)) { + mutex_exit(&vdp->vdd_lock); + vnd_nsd_rele(nsp); + via.via_errno = VND_E_ATTACHED; + ret = EIO; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_ATTACH_INFLIGHT; + ASSERT(vdp->vdd_cr == NULL); + crhold(credp); + vdp->vdd_cr = credp; + ASSERT(vdp->vdd_nsd == NULL); + vdp->vdd_nsd = nsp; + mutex_exit(&vdp->vdd_lock); + + /* + * Place an additional hold on the vnd_pnsd_t as we go through and do + * all of the rest of our work. This will be the hold that we keep for + * as long as this thing is attached. + */ + vnd_nsd_ref(nsp); + + ret = ldi_open_by_name(buf, FREAD | FWRITE, vdp->vdd_cr, + &vdp->vdd_ldih, vdp->vdd_ldiid); + if (ret != 0) { + if (ret == ENODEV) + via.via_errno = VND_E_NODATALINK; + goto err; + } + + /* + * Unfortunately the I_PUSH interface doesn't allow us a way to detect + * whether or not we're coming in from a layered device. We really want + * to make sure that a normal user can't push on our streams module. + * Currently the only idea I have for this is to make sure that the + * credp is kcred which is really terrible. + */ + ret = ldi_ioctl(vdp->vdd_ldih, I_PUSH, (intptr_t)"vnd", FKIOCTL, + kcred, &rp); + if (ret != 0) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + via.via_errno = VND_E_STRINIT; + ret = EIO; + goto err; + } + + vss.vsa_minor = vdp->vdd_minor; + vss.vsa_nsid = nsp->vpnd_nsid; + + ret = ldi_ioctl(vdp->vdd_ldih, VND_STRIOC_ASSOCIATE, (intptr_t)&vss, + FKIOCTL, kcred, &rp); + if (ret != 0 || vss.vsa_errno != VND_E_SUCCESS) { + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + if (ret == 0) { + via.via_errno = vss.vsa_errno; + ret = EIO; + } + goto err; + } + + mutex_enter(&vdp->vdd_nsd->vpnd_lock); + + /* + * There's a chance that our netstack was condemned while we've had a + * hold on it. As such we need to check and if so, error out. + */ + if (vdp->vdd_nsd->vpnd_flags & VND_NS_CONDEMNED) { + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + rp = ldi_close(vdp->vdd_ldih, FREAD | FWRITE, vdp->vdd_cr); + VERIFY(rp == 0); + ret = EIO; + via.via_errno = VND_E_NOZONE; + goto err; + } + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_str != NULL); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + vdp->vdd_flags |= VND_D_ATTACHED; + (void) strlcpy(vdp->vdd_datalink, via.via_name, + sizeof (vdp->vdd_datalink)); + list_insert_tail(&vdp->vdd_nsd->vpnd_dev_list, vdp); + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vdp->vdd_nsd->vpnd_lock); + vnd_nsd_rele(nsp); + + return (0); + +err: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_ATTACH_INFLIGHT; + crfree(vdp->vdd_cr); + vdp->vdd_cr = NULL; + vdp->vdd_nsd = NULL; + mutex_exit(&vdp->vdd_lock); + + /* + * We have two holds to drop here. One for our original reference and + * one for the hold this operation would have represented. 
+ */ + vnd_nsd_rele(nsp); + vnd_nsd_rele(nsp); +errcopyout: + if (ddi_copyout(&via, (void *)arg, sizeof (via), cpflag) != 0) + ret = EFAULT; + + return (ret); +} + +static int +vnd_ioctl_link(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret = 0; + vnd_ioc_link_t vil; + char mname[2*VND_NAMELEN]; + char **c; + vnd_dev_t *v; + zoneid_t zid; + + /* Not anyone can link something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + if (ddi_copyin((void *)arg, &vil, sizeof (vil), cpflag) != 0) + return (EFAULT); + + if (vnd_validate_name(vil.vil_name, VND_NAMELEN) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + c = vnd_reserved_names; + while (*c != NULL) { + if (strcmp(vil.vil_name, *c) == 0) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + c++; + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOTATTACHED; + goto errcopyout; + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto errcopyout; + } + + if (vdp->vdd_flags & (VND_D_LINK_INFLIGHT | VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKED; + goto errcopyout; + } + vdp->vdd_flags |= VND_D_LINK_INFLIGHT; + zid = vdp->vdd_nsd->vpnd_zid; + mutex_exit(&vdp->vdd_lock); + + if (snprintf(NULL, 0, "z%d:%s", zid, vil.vil_name) >= + sizeof (mname)) { + ret = EIO; + vil.vil_errno = VND_E_BADNAME; + goto errcopyout; + } + + mutex_enter(&vnd_dev_lock); + for (v = list_head(&vnd_dev_list); v != NULL; + v = list_next(&vnd_dev_list, v)) { + if (!(v->vdd_flags & VND_D_LINKED)) + continue; + + if (v->vdd_nsd->vpnd_zid == zid && + strcmp(v->vdd_lname, vil.vil_name) == 0) { + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_LINKEXISTS; + goto error; + } + } + + /* + * We set the name and mark ourselves attached while holding the list + * lock to ensure that no other user can mistakenly find our name. + */ + (void) snprintf(mname, sizeof (mname), "z%d:%s", zid, + vil.vil_name); + mutex_enter(&vdp->vdd_lock); + + /* + * Because we dropped our lock, we need to double check whether or not + * the zone was marked as dying while we were here. If it hasn't, then + * it's safe for us to link it in. + */ + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + ret = EIO; + vil.vil_errno = VND_E_NOZONE; + goto error; + } + + (void) strlcpy(vdp->vdd_lname, vil.vil_name, sizeof (vdp->vdd_lname)); + if (ddi_create_minor_node(vnd_dip, mname, S_IFCHR, vdp->vdd_minor, + DDI_PSEUDO, 0) != DDI_SUCCESS) { + ret = EIO; + vil.vil_errno = VND_E_MINORNODE; + } else { + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_flags |= VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + ret = 0; + } + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + + if (ret == 0) { + /* + * Add a reference to represent that this device is linked into + * the file system name space to ensure that it doesn't + * disappear. + */ + vnd_dev_ref(vdp); + return (0); + } + +error: + mutex_enter(&vdp->vdd_lock); + vdp->vdd_flags &= ~VND_D_LINK_INFLIGHT; + vdp->vdd_lname[0] = '\0'; + mutex_exit(&vdp->vdd_lock); + +errcopyout: + if (ddi_copyout(&vil, (void *)arg, sizeof (vil), cpflag) != 0) + return (EFAULT); + return (ret); +} + +/* + * Common unlink function. 
This is used both from the ioctl path and from the + * netstack shutdown path. The caller is required to hold the mutex on the + * vnd_dev_t, but they basically will have it relinquished for them. The only + * thing the caller is allowed to do afterward is to potentially rele the + * vnd_dev_t if they have their own hold. Note that only the ioctl path has its + * own hold. + */ +static void +vnd_dev_unlink(vnd_dev_t *vdp) +{ + char mname[2*VND_NAMELEN]; + + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + (void) snprintf(mname, sizeof (mname), "z%d:%s", + vdp->vdd_nsd->vpnd_zid, vdp->vdd_lname); + ddi_remove_minor_node(vnd_dip, mname); + vdp->vdd_lname[0] = '\0'; + vdp->vdd_flags &= ~VND_D_LINKED; + kstat_named_setstr(&vdp->vdd_str->vns_ksdata.vks_linkname, + vdp->vdd_lname); + mutex_exit(&vdp->vdd_lock); + + /* + * This rele corresponds to the reference that we took in + * vnd_ioctl_link. + */ + vnd_dev_rele(vdp); +} + +static int +vnd_ioctl_unlink(vnd_dev_t *vdp, intptr_t arg, cred_t *credp, int cpflag) +{ + int ret; + zoneid_t zid; + vnd_ioc_unlink_t viu; + + /* Not anyone can unlink something */ + if (secpolicy_net_config(credp, B_FALSE) != 0) + return (EPERM); + + zid = crgetzoneid(credp); + + if (ddi_copyin((void *)arg, &viu, sizeof (viu), cpflag) != 0) + return (EFAULT); + + viu.viu_errno = VND_E_SUCCESS; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_NOTLINKED; + goto err; + } + VERIFY(vdp->vdd_flags & VND_D_ATTACHED); + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + ret = EIO; + viu.viu_errno = VND_E_PERM; + goto err; + } + + /* vnd_dev_unlink releases the vdp mutex for us */ + vnd_dev_unlink(vdp); + ret = 0; +err: + if (ddi_copyout(&viu, (void *)arg, sizeof (viu), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_setrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vdp->vdd_str->vns_dq_read.vdq_max = vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_getrxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_read.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_read.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_read.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void 
*)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_getmaxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + vnd_ioc_buf_t vib; + + mutex_enter(&vnd_dev_lock); + vib.vib_size = vnd_vdq_hard_max; + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_ioctl_gettxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vib.vib_size = vdp->vdd_str->vns_dq_write.vdq_max; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_settxbuf(vnd_dev_t *vdp, intptr_t arg, int cpflag) +{ + int ret; + vnd_ioc_buf_t vib; + + if (ddi_copyin((void *)arg, &vib, sizeof (vib), cpflag) != 0) + return (EFAULT); + + mutex_enter(&vnd_dev_lock); + if (vib.vib_size > vnd_vdq_hard_max) { + mutex_exit(&vnd_dev_lock); + vib.vib_errno = VND_E_BUFTOOBIG; + ret = EIO; + goto err; + } + mutex_exit(&vnd_dev_lock); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_NOTATTACHED; + ret = EIO; + goto err; + } + + mutex_enter(&vdp->vdd_str->vns_lock); + if (vib.vib_size < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + mutex_exit(&vdp->vdd_lock); + vib.vib_errno = VND_E_BUFTOOSMALL; + ret = EIO; + goto err; + } + mutex_exit(&vdp->vdd_str->vns_lock); + + mutex_enter(&vdp->vdd_str->vns_dq_write.vdq_lock); + vdp->vdd_str->vns_dq_write.vdq_max = vib.vib_size; + mutex_exit(&vdp->vdd_str->vns_dq_write.vdq_lock); + mutex_exit(&vdp->vdd_lock); + ret = 0; + +err: + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), cpflag) != 0) + return (EFAULT); + + return (ret); +} + +static int +vnd_ioctl_gettu(vnd_dev_t *vdp, intptr_t arg, int mode, boolean_t min) +{ + vnd_ioc_buf_t vib; + + vib.vib_errno = 0; + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED) { + mutex_enter(&vdp->vdd_str->vns_lock); + if (min == B_TRUE) + vib.vib_size = vdp->vdd_str->vns_minwrite; + else + vib.vib_size = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + } else { + vib.vib_errno = VND_E_NOTATTACHED; + } + mutex_exit(&vdp->vdd_lock); + + if (ddi_copyout(&vib, (void *)arg, sizeof (vib), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + +static int +vnd_frameio_read(vnd_dev_t *vdp, intptr_t addr, int mode) +{ + int ret, nonblock, nwrite; + frameio_t *fio; + vnd_data_queue_t *vqp; + mblk_t *mp; + + fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (const void *)addr, + mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return 
(EWOULDBLOCK); + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EINTR); + } + } + } + + ret = frameio_mblk_chain_write(fio, MAP_BLK_FRAME, vqp->vdq_head, + &nwrite, mode & FKIOCTL); + if (ret != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (ret); + } + + ret = frameio_hdr_copyout(fio, nwrite, (void *)addr, mode); + if (ret != 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (ret); + } + + while (nwrite > 0) { + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); + nwrite--; + } + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + + return (0); +} + +static int +vnd_frameio_write(vnd_dev_t *vdp, intptr_t addr, int mode) +{ + frameio_t *fio; + int ret, nonblock, nframes, i, nread; + size_t maxwrite, minwrite, total, flen; + mblk_t *mp_chain, *mp, *nmp; + vnd_data_queue_t *vqp; + + fio = frameio_alloc(KM_NOSLEEP | KM_NORMALPRI); + if (fio == NULL) + return (EAGAIN); + + ret = frameio_hdr_copyin(fio, FRAMEIO_NVECS_MAX, (void *)addr, mode); + if (ret != 0) { + frameio_free(fio); + return (ret); + } + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + frameio_free(fio); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + nonblock = mode & (FNONBLOCK | FNDELAY); + + /* + * Make sure no single frame is larger than we can accept. + */ + mutex_enter(&vdp->vdd_str->vns_lock); + minwrite = vdp->vdd_str->vns_minwrite; + maxwrite = vdp->vdd_str->vns_maxwrite; + mutex_exit(&vdp->vdd_str->vns_lock); + + nframes = fio->fio_nvecs / fio->fio_nvpf; + total = 0; + for (i = 0; i < nframes; i++) { + flen = frameio_frame_length(fio, + &fio->fio_vecs[i*fio->fio_nvpf]); + if (flen < minwrite || flen > maxwrite) { + frameio_free(fio); + return (ERANGE); + } + total += flen; + } + + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, total) == 0) { + if (nonblock != 0) { + frameio_free(fio); + mutex_exit(&vqp->vdq_lock); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + frameio_free(fio); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * We've reserved our space, let's copyin and go from here. + */ + ret = frameio_mblk_chain_read(fio, &mp_chain, &nread, mode & FKIOCTL); + if (ret != 0) { + frameio_free(fio); + vnd_dq_unreserve(vqp, total); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + return (ret); + } + + for (mp = mp_chain; mp != NULL; mp = nmp) { + nmp = mp->b_next; + mp->b_next = NULL; + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + } + + /* + * Update the frameio structure to indicate that we wrote those frames. + */ + frameio_mark_consumed(fio, nread); + ret = frameio_hdr_copyout(fio, nread, (void *)addr, mode); + frameio_free(fio); + + return (ret); +} + +static int +vnd_ioctl_list_copy_info(vnd_dev_t *vdp, vnd_ioc_info_t *arg, int mode) +{ + const char *link; + uint32_t vers = 1; + ASSERT(MUTEX_HELD(&vdp->vdd_lock)); + + /* + * Copy all of the members out to userland. 
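+ * Note that arg points into the caller-supplied array of + * vnd_ioc_info_t structures in userland, so each member is pushed out + * with its own ddi_copyout() rather than being assembled in a kernel + * buffer first; vnd_ioctl_list() has already handled any ILP32/LP64 + * model conversion for the enclosing list structure. 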
+ */ + if (ddi_copyout(&vers, &arg->vii_version, sizeof (uint32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (vdp->vdd_flags & VND_D_LINKED) + link = vdp->vdd_lname; + else + link = "<anonymous>"; + if (ddi_copyout(link, arg->vii_name, sizeof (arg->vii_name), + mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(vdp->vdd_datalink, arg->vii_datalink, + sizeof (arg->vii_datalink), mode & FKIOCTL) != 0) + return (EFAULT); + + if (ddi_copyout(&vdp->vdd_nsd->vpnd_zid, &arg->vii_zone, + sizeof (zoneid_t), mode & FKIOCTL) != 0) + return (EFAULT); + return (0); +} + +static int +vnd_ioctl_list(intptr_t arg, cred_t *credp, int mode) +{ + vnd_ioc_list_t vl; + vnd_ioc_list32_t vl32; + zoneid_t zid; + vnd_dev_t *vdp; + vnd_ioc_info_t *vip; + int found, cancopy, ret; + + if (ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) { + if (ddi_copyin((void *)arg, &vl32, sizeof (vnd_ioc_list32_t), + mode & FKIOCTL) != 0) + return (EFAULT); + vl.vl_nents = vl32.vl_nents; + vl.vl_actents = vl32.vl_actents; + vl.vl_ents = (void *)(uintptr_t)vl32.vl_ents; + } else { + if (ddi_copyin((void *)arg, &vl, sizeof (vnd_ioc_list_t), + mode & FKIOCTL) != 0) + return (EFAULT); + } + + cancopy = vl.vl_nents; + vip = vl.vl_ents; + found = 0; + zid = crgetzoneid(credp); + mutex_enter(&vnd_dev_lock); + for (vdp = list_head(&vnd_dev_list); vdp != NULL; + vdp = list_next(&vnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (vdp->vdd_flags & VND_D_ATTACHED && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING)) && + (zid == GLOBAL_ZONEID || zid == vdp->vdd_nsd->vpnd_zid)) { + found++; + if (cancopy > 0) { + ret = vnd_ioctl_list_copy_info(vdp, vip, mode); + if (ret != 0) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&vnd_dev_lock); + return (ret); + } + cancopy--; + vip++; + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&vnd_dev_lock); + + if (ddi_copyout(&found, &((vnd_ioc_list_t *)arg)->vl_actents, + sizeof (uint_t), mode & FKIOCTL) != 0) + return (EFAULT); + + return (0); +} + + +static int +vnd_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, + int *rvalp) +{ + int ret; + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + ASSERT(m != 0); + + /* + * Make sure no one has come in on an ioctl from the strioc case. + */ + if ((cmd & VND_STRIOC) == VND_STRIOC) + return (ENOTTY); + + /* + * Like close, seems like if this minor isn't found, it's a programmer + * error somehow. 
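+ * Note that vnd_dev_lookup() returns with a hold on the device; every + * arm of the switch statement below funnels back through the single + * vnd_dev_rele() at the bottom of this function. 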
+ */ + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + switch (cmd) { + case VND_IOC_ATTACH: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_attach(vdp, arg, credp, mode); + break; + case VND_IOC_LINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_link(vdp, arg, credp, mode); + break; + case VND_IOC_UNLINK: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_unlink(vdp, arg, credp, mode); + break; + case VND_IOC_GETRXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_getrxbuf(vdp, arg, mode); + break; + case VND_IOC_SETRXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_setrxbuf(vdp, arg, mode); + break; + case VND_IOC_GETTXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettxbuf(vdp, arg, mode); + break; + case VND_IOC_SETTXBUF: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_settxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMAXBUF: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + if (crgetzoneid(credp) != GLOBAL_ZONEID) { + ret = EPERM; + break; + } + ret = vnd_ioctl_getmaxbuf(vdp, arg, mode); + break; + case VND_IOC_GETMINTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_TRUE); + break; + case VND_IOC_GETMAXTU: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_gettu(vdp, arg, mode, B_FALSE); + break; + case VND_IOC_FRAMEIO_READ: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_frameio_read(vdp, arg, mode); + break; + case VND_IOC_FRAMEIO_WRITE: + if (!(mode & FWRITE)) { + ret = EBADF; + break; + } + ret = vnd_frameio_write(vdp, arg, mode); + break; + case VND_IOC_LIST: + if (!(mode & FREAD)) { + ret = EBADF; + break; + } + ret = vnd_ioctl_list(arg, credp, mode); + break; + default: + ret = ENOTTY; + break; + } + + vnd_dev_rele(vdp); + return (ret); +} + +static int +vnd_open(dev_t *devp, int flag, int otyp, cred_t *credp) +{ + vnd_dev_t *vdp; + minor_t m; + zoneid_t zid; + + if (flag & (FEXCL | FNDELAY)) + return (ENOTSUP); + + if (otyp & OTYP_BLK) + return (ENOTSUP); + + zid = crgetzoneid(credp); + m = getminor(*devp); + + /* + * If we have an open of a non-zero instance then we need to look that + * up in our list of entries. + */ + if (m != 0) { + + /* + * We don't check for rawaccess globally as a user could be + * doing a list ioctl on the control node which doesn't require + * this privilege. + */ + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENOENT); + + /* + * We need to check to make sure that the user is allowed to + * open this node. At this point it should be an attached handle + * as that's all we're allowed to access. 
+ */ + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (vdp->vdd_flags & VND_D_ZONE_DYING) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if (zid != GLOBAL_ZONEID && zid != vdp->vdd_nsd->vpnd_zid) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENOENT); + } + + if ((flag & FEXCL) && (vdp->vdd_flags & VND_D_OPENED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (EBUSY); + } + + if (!(vdp->vdd_flags & VND_D_OPENED)) { + vdp->vdd_flags |= VND_D_OPENED; + vdp->vdd_ref++; + DTRACE_VND_REFINC(vdp); + } + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + + return (0); + } + + if (flag & FEXCL) + return (ENOTSUP); + + /* + * We need to clone ourselves and set up a new state. + */ + vdp = kmem_cache_alloc(vnd_dev_cache, KM_SLEEP); + bzero(vdp, sizeof (vnd_dev_t)); + + if (ldi_ident_from_dev(*devp, &vdp->vdd_ldiid) != 0) { + kmem_cache_free(vnd_dev_cache, vdp); + return (EINVAL); + } + + vdp->vdd_minor = id_alloc(vnd_minors); + mutex_init(&vdp->vdd_lock, NULL, MUTEX_DRIVER, NULL); + list_link_init(&vdp->vdd_link); + vdp->vdd_ref = 1; + *devp = makedevice(getmajor(*devp), vdp->vdd_minor); + vdp->vdd_devid = *devp; + DTRACE_VND_REFINC(vdp); + vdp->vdd_flags |= VND_D_OPENED; + + mutex_enter(&vnd_dev_lock); + list_insert_head(&vnd_dev_list, vdp); + mutex_exit(&vnd_dev_lock); + + return (0); +} + +static int +vnd_close(dev_t dev, int flag, int otyp, cred_t *credp) +{ + minor_t m; + vnd_dev_t *vdp; + + m = getminor(dev); + if (m == 0) + return (ENXIO); + + vdp = vnd_dev_lookup(m); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + VERIFY(vdp->vdd_flags & VND_D_OPENED); + vdp->vdd_flags &= ~VND_D_OPENED; + mutex_exit(&vdp->vdd_lock); + + /* Remove the hold from the previous open. */ + vnd_dev_rele(vdp); + + /* And now from lookup */ + vnd_dev_rele(vdp); + return (0); +} + +static int +vnd_read(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error = 0; + size_t mpsize; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + mblk_t *mp = NULL; + offset_t u_loffset; + + /* + * If we have more than one uio we refuse to do anything. That's for + * frameio.
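Minor 0 is a self-cloning control node: the open above rewrites *devp with a freshly allocated minor and a zeroed vnd_dev_t, so every open(2) of the control path produces an independent, unattached instance. A hypothetical illustration:

#include <fcntl.h>
#include <unistd.h>

/*
 * Each open of the control node clones a new device: the driver rewrites
 * the dev_t behind the vnode with a fresh minor, so a and b below name two
 * distinct, unattached vnd instances that must each be attached (via
 * VND_IOC_ATTACH) before they can carry traffic.
 */
static void
demo_clone_open(void)
{
	int a = open("/dev/vnd/ctl", O_RDWR);
	int b = open("/dev/vnd/ctl", O_RDWR);

	/* ... attach and use each instance independently ... */
	if (a >= 0)
		(void) close(a);
	if (b >= 0)
		(void) close(b);
}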
+ */ + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + + /* Check empty case */ + if (vqp->vdq_cur == 0) { + if (nonblock != 0) { + error = EWOULDBLOCK; + goto err; + } + while (vqp->vdq_cur == 0) { + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + error = EINTR; + goto err; + } + } + } + + /* Ensure our buffer is big enough */ + mp = vqp->vdq_head; + ASSERT(mp != NULL); + mpsize = msgsize(mp); + if (mpsize > uiop->uio_resid) { + error = EOVERFLOW; + goto err; + } + + u_loffset = uiop->uio_loffset; + while (mp != NULL) { + if (uiomove(mp->b_rptr, MBLKL(mp), UIO_READ, uiop) != 0) { + error = EFAULT; + uiop->uio_loffset = u_loffset; + mp = NULL; + goto err; + } + mpsize -= MBLKL(mp); + mp = mp->b_cont; + } + ASSERT(mpsize == 0); + (void) vnd_dq_pop(vqp, &mp); + freemsg(mp); +err: + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + + return (error); +} + +static int +vnd_write(dev_t dev, struct uio *uiop, cred_t *credp) +{ + int nonblock, error; + vnd_dev_t *vdp; + mblk_t *mp; + ssize_t iosize, origsize; + vnd_data_queue_t *vqp; + + if (uiop->uio_iovcnt > 1) + return (EINVAL); + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + nonblock = uiop->uio_fmode & (FNONBLOCK | FNDELAY); + + mutex_enter(&vdp->vdd_str->vns_lock); + if (uiop->uio_resid > vdp->vdd_str->vns_maxwrite || + uiop->uio_resid < vdp->vdd_str->vns_minwrite) { + mutex_exit(&vdp->vdd_str->vns_lock); + vnd_dev_rele(vdp); + return (ERANGE); + } + mutex_exit(&vdp->vdd_str->vns_lock); + VERIFY(vdp->vdd_str != NULL); + + /* + * Reserve space in the data queue if we can. If we can't, block or + * return EAGAIN. If we can, go and squeue_enter. + */ + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + while (vnd_dq_reserve(vqp, uiop->uio_resid) == 0) { + if (nonblock != 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EAGAIN); + } + if (cv_wait_sig(&vqp->vdq_ready, &vqp->vdq_lock) <= 0) { + mutex_exit(&vqp->vdq_lock); + vnd_dev_rele(vdp); + return (EINTR); + } + } + mutex_exit(&vqp->vdq_lock); + + /* + * Now that we've reserved the space, try to allocate kernel space for + * and copy in the block. To take care of all this we use the + * strmakedata subroutine for now. + */ + origsize = iosize = uiop->uio_resid; + error = strmakedata(&iosize, uiop, vdp->vdd_str->vns_wq->q_stream, 0, + &mp); + + /* + * strmakedata() will return an error or it may only consume a portion + * of the data. 
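vnd_read() above is strictly frame-at-a-time: it copies out exactly one queued frame, fails with EOVERFLOW (without consuming anything) if the supplied buffer is smaller than the frame at the head of the queue, and can be interrupted while blocked. A sketch of a conforming consumer loop, assuming the buffer was sized from VND_IOC_GETMAXTU:

#include <sys/types.h>
#include <unistd.h>
#include <errno.h>

/*
 * Read exactly one frame.  bufsz should be at least the device's maximum
 * frame size (VND_IOC_GETMAXTU); a smaller buffer makes read(2) fail with
 * EOVERFLOW and leaves the frame on the queue.
 */
static ssize_t
read_one_frame(int fd, void *buf, size_t bufsz)
{
	for (;;) {
		ssize_t n = read(fd, buf, bufsz);

		if (n >= 0)
			return (n);	/* one whole frame */
		if (errno == EINTR)
			continue;	/* interrupted while blocked; retry */
		return (-1);	/* EOVERFLOW, EWOULDBLOCK (nonblocking), ... */
	}
}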
+ */ + if (error != 0 || uiop->uio_resid != 0) { + vnd_dq_unreserve(vqp, origsize); + cv_broadcast(&vqp->vdq_ready); + pollwakeup(&vdp->vdd_ph, POLLOUT); + vnd_dev_rele(vdp); + return (ENOSR); + } + + gsqueue_enter_one(vdp->vdd_str->vns_squeue, mp, + vnd_squeue_tx_append, vdp->vdd_str, GSQUEUE_PROCESS, + VND_SQUEUE_TAG_VND_WRITE); + + vnd_dev_rele(vdp); + return (0); +} + +static int +vnd_chpoll(dev_t dev, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int ready = 0; + vnd_dev_t *vdp; + vnd_data_queue_t *vqp; + + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (ENXIO); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_ATTACHED)) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (ENXIO); + } + mutex_exit(&vdp->vdd_lock); + + if ((events & POLLIN) || (events & POLLRDNORM)) { + vqp = &vdp->vdd_str->vns_dq_read; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_head != NULL) + ready |= events & (POLLIN | POLLRDNORM); + mutex_exit(&vqp->vdq_lock); + } + + if (events & POLLOUT) { + vqp = &vdp->vdd_str->vns_dq_write; + mutex_enter(&vqp->vdq_lock); + if (vqp->vdq_cur != vqp->vdq_max) + ready |= POLLOUT; + mutex_exit(&vqp->vdq_lock); + } + + if (ready != 0) { + *reventsp = ready; + vnd_dev_rele(vdp); + return (0); + } + + *reventsp = 0; + if (!anyyet) + *phpp = &vdp->vdd_ph; + + vnd_dev_rele(vdp); + return (0); +} + +static void * +vnd_stack_init(netstackid_t stackid, netstack_t *ns) +{ + vnd_pnsd_t *nsp; + + nsp = kmem_cache_alloc(vnd_pnsd_cache, KM_SLEEP); + bzero(nsp, sizeof (*nsp)); + nsp->vpnd_nsid = stackid; + nsp->vpnd_zid = netstackid_to_zoneid(stackid); + nsp->vpnd_flags = 0; + mutex_init(&nsp->vpnd_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&nsp->vpnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_nslink)); + if (vnd_netinfo_init(nsp) == 0) + nsp->vpnd_hooked = B_TRUE; + + mutex_enter(&vnd_dev_lock); + list_insert_tail(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + return (nsp); +} + +static void +vnd_stack_shutdown(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + vnd_dev_t *vdp; + + ASSERT(nsp != NULL); + /* + * After shut down no one should be able to find their way to this + * netstack again. + */ + mutex_enter(&vnd_dev_lock); + list_remove(&vnd_nsd_list, nsp); + mutex_exit(&vnd_dev_lock); + + /* + * Make sure hooks know that they're going away. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_shutdown(nsp); + + /* + * Now we need to go through and notify each zone that they are in + * teardown phase. See the big theory statement section on vnd, zones, + * netstacks, and sdev for more information about this. + */ + mutex_enter(&nsp->vpnd_lock); + nsp->vpnd_flags |= VND_NS_CONDEMNED; + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_CONDEMNED)) + vdp->vdd_flags |= VND_D_ZONE_DYING; + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + /* + * Next we remove all the links as we know nothing new can be added to + * the list and that none of the extent devices can obtain additional + * links. + */ +restart: + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_CONDEMNED) || + !(vdp->vdd_flags & VND_D_LINKED)) { + mutex_exit(&vdp->vdd_lock); + continue; + } + + /* + * We drop our lock here and restart afterwards. 
Note that as + part of unlinking we end up doing a rele of the vnd_dev_t. If + this is the final hold on the vnd_dev_t then it might try and + remove itself. Our locking rules require that we not be holding + any locks when we call any of the rele functions. + * + * Note that the unlink function requires holders to call into + it with the vnd_dev_t->vdd_lock held, and it will take care of + dropping that lock for us. Because we don't have a hold on it, we're done at + this point. + */ + mutex_exit(&nsp->vpnd_lock); + /* Forcibly unlink */ + vnd_dev_unlink(vdp); + goto restart; + } + mutex_exit(&nsp->vpnd_lock); +} + +static void +vnd_stack_destroy(netstackid_t stackid, void *arg) +{ + vnd_pnsd_t *nsp = arg; + + ASSERT(nsp != NULL); + + /* + * Now that we've unlinked everything we just have to hang out for + * it to finish exiting. Now that it's no longer the kernel itself + * that's doing this we just need to wait for our reference count to + * equal zero and then we're free. If the global zone is holding open a + * reference to a vnd device for another zone, that's bad, but there's + * nothing much we can do. See the section on 'vnd, zones, netstacks' in + * the big theory statement for more information. + */ + mutex_enter(&nsp->vpnd_lock); + while (nsp->vpnd_ref != 0) + cv_wait(&nsp->vpnd_ref_change, &nsp->vpnd_lock); + mutex_exit(&nsp->vpnd_lock); + + /* + * During shutdown we removed ourselves from the list and now we have no + * more references so we can safely say that there is nothing left and + * destroy everything that we had sitting around. + */ + if (nsp->vpnd_hooked == B_TRUE) + vnd_netinfo_fini(nsp); + + mutex_destroy(&nsp->vpnd_lock); + list_destroy(&nsp->vpnd_dev_list); + kmem_cache_free(vnd_pnsd_cache, nsp); +} + +/* + * Convert a node with a name of the form /dev/vnd/zone/%zonename or + * /dev/vnd/zone/%zonename/%linkname to the corresponding vnd netstack.
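vnd_stack_destroy() above is the waiting half of a reference-drain: shutdown has already guaranteed that no new holds can be taken, so destroy simply blocks on vpnd_ref_change until vpnd_ref reaches zero. The releasing half of such a pattern, which is not visible in this hunk, must look roughly like this sketch:

/*
 * Kernel-side sketch of the releasing half: whoever drops a hold must
 * wake the waiter in vnd_stack_destroy() after decrementing the count.
 * (The real release function is not part of this hunk.)
 */
static void
demo_nsd_rele(vnd_pnsd_t *nsp)
{
	mutex_enter(&nsp->vpnd_lock);
	VERIFY(nsp->vpnd_ref > 0);
	if (--nsp->vpnd_ref == 0)
		cv_broadcast(&nsp->vpnd_ref_change);
	mutex_exit(&nsp->vpnd_lock);
}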
+ */ +static vnd_pnsd_t * +vnd_sdev_ctx_to_ns(sdev_ctx_t ctx) +{ + enum vtype vt; + const char *path = sdev_ctx_path(ctx); + char *zstart, *dup; + size_t duplen; + vnd_pnsd_t *nsp; + + vt = sdev_ctx_vtype(ctx); + ASSERT(strncmp(path, VND_SDEV_ZROOT, strlen(VND_SDEV_ZROOT)) == 0); + + if (vt == VDIR) { + zstart = strrchr(path, '/'); + ASSERT(zstart != NULL); + zstart++; + return (vnd_nsd_lookup_by_zonename(zstart)); + } + + ASSERT(vt == VCHR); + + dup = strdup(path); + duplen = strlen(dup) + 1; + zstart = strrchr(dup, '/'); + *zstart = '\0'; + zstart--; + zstart = strrchr(dup, '/'); + zstart++; + nsp = vnd_nsd_lookup_by_zonename(zstart); + kmem_free(dup, duplen); + + return (nsp); +} + +static sdev_plugin_validate_t +vnd_sdev_validate_dir(sdev_ctx_t ctx) +{ + vnd_pnsd_t *nsp; + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ROOT) == 0) + return (SDEV_VTOR_VALID); + + if (strcmp(sdev_ctx_path(ctx), VND_SDEV_ZROOT) == 0) { + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + return (SDEV_VTOR_VALID); + } + + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (SDEV_VTOR_INVALID); + vnd_nsd_rele(nsp); + + return (SDEV_VTOR_VALID); +} + +static sdev_plugin_validate_t +vnd_sdev_validate(sdev_ctx_t ctx) +{ + enum vtype vt; + dev_t dev; + vnd_dev_t *vdp; + + vt = sdev_ctx_vtype(ctx); + if (vt == VDIR) + return (vnd_sdev_validate_dir(ctx)); + ASSERT(vt == VCHR); + + if (strcmp("ctl", sdev_ctx_name(ctx)) == 0) + return (SDEV_VTOR_VALID); + + dev = (uintptr_t)sdev_ctx_vtype_data(ctx); + vdp = vnd_dev_lookup(getminor(dev)); + if (vdp == NULL) + return (SDEV_VTOR_STALE); + + mutex_enter(&vdp->vdd_lock); + if (!(vdp->vdd_flags & VND_D_LINKED) || + (vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + if (strcmp(sdev_ctx_name(ctx), vdp->vdd_lname) != 0) { + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_STALE); + } + + mutex_exit(&vdp->vdd_lock); + vnd_dev_rele(vdp); + return (SDEV_VTOR_VALID); +} + +/* + * This function is a no-op. sdev never has holds on our devices as they can go + * away at any time and specfs has to deal with that fact. + */ +static void +vnd_sdev_inactive(sdev_ctx_t ctx) +{ +} + +static int +vnd_sdev_fillzone(vnd_pnsd_t *nsp, sdev_ctx_t ctx) +{ + int ret; + vnd_dev_t *vdp; + + mutex_enter(&nsp->vpnd_lock); + for (vdp = list_head(&nsp->vpnd_dev_list); vdp != NULL; + vdp = list_next(&nsp->vpnd_dev_list, vdp)) { + mutex_enter(&vdp->vdd_lock); + if ((vdp->vdd_flags & VND_D_LINKED) && + !(vdp->vdd_flags & (VND_D_CONDEMNED | VND_D_ZONE_DYING))) { + ret = sdev_plugin_mknod(ctx, vdp->vdd_lname, S_IFCHR, + vdp->vdd_devid); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vdp->vdd_lock); + mutex_exit(&nsp->vpnd_lock); + vnd_nsd_rele(nsp); + return (ret); + } + } + mutex_exit(&vdp->vdd_lock); + } + mutex_exit(&nsp->vpnd_lock); + + return (0); +} + +static int +vnd_sdev_filldir_root(sdev_ctx_t ctx) +{ + zoneid_t zid; + vnd_pnsd_t *nsp; + int ret; + + zid = getzoneid(); + nsp = vnd_nsd_lookup(zoneid_to_netstackid(zid)); + ASSERT(nsp != NULL); + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + if (ret != 0) + return (ret); + + /* + * Checking the zone id is not sufficient as the global zone could be + * reaching down into a non-global zone's mounted /dev. 
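The VCHR branch of vnd_sdev_ctx_to_ns() above recovers the zone name by truncating the final link component and then taking the component before it. A user-level rendition of that parse, using a hypothetical zone "web01" and link "net0" (the caller passes a writable copy, as the kernel code does via strdup()):

#include <string.h>

/*
 * User-level rendition of the VCHR parse above: given a writable copy of
 * "/dev/vnd/zone/web01/net0" (zone and link names hypothetical), drop the
 * trailing "/net0" and return the component before it, "web01".
 */
static char *
zone_from_path(char *dup)
{
	char *slash = strrchr(dup, '/');

	if (slash == NULL)
		return (NULL);
	*slash = '\0';			/* "/dev/vnd/zone/web01" */
	slash = strrchr(dup, '/');
	return (slash == NULL ? NULL : slash + 1);	/* "web01" */
}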
+ */ + if (zid == GLOBAL_ZONEID && (sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL)) { + ret = sdev_plugin_mkdir(ctx, "zone"); + if (ret != 0 && ret != EEXIST) + return (ret); + } + + /* + * Always add a reference to the control node. There's no need to + * reference it since it always exists and is always what we clone from. + */ + ret = sdev_plugin_mknod(ctx, "ctl", S_IFCHR, + makedevice(ddi_driver_major(vnd_dip), 0)); + if (ret != 0 && ret != EEXIST) + return (ret); + + return (0); +} + +static int +vnd_sdev_filldir_zroot(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + zone_t *zonep; + + ASSERT(getzoneid() == GLOBAL_ZONEID); + ASSERT(sdev_ctx_flags(ctx) & SDEV_CTX_GLOBAL); + + mutex_enter(&vnd_dev_lock); + for (nsp = list_head(&vnd_nsd_list); nsp != NULL; + nsp = list_next(&vnd_nsd_list, nsp)) { + mutex_enter(&nsp->vpnd_lock); + if (list_is_empty(&nsp->vpnd_dev_list)) { + mutex_exit(&nsp->vpnd_lock); + continue; + } + mutex_exit(&nsp->vpnd_lock); + zonep = zone_find_by_id(nsp->vpnd_zid); + /* + * This zone must be being torn down, so skip it. + */ + if (zonep == NULL) + continue; + ret = sdev_plugin_mkdir(ctx, zonep->zone_name); + zone_rele(zonep); + if (ret != 0 && ret != EEXIST) { + mutex_exit(&vnd_dev_lock); + return (ret); + } + } + mutex_exit(&vnd_dev_lock); + return (0); +} + +static int +vnd_sdev_filldir(sdev_ctx_t ctx) +{ + int ret; + vnd_pnsd_t *nsp; + + ASSERT(sdev_ctx_vtype(ctx) == VDIR); + if (strcmp(VND_SDEV_ROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_root(ctx)); + + if (strcmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx)) == 0) + return (vnd_sdev_filldir_zroot(ctx)); + + ASSERT(strncmp(VND_SDEV_ZROOT, sdev_ctx_path(ctx), + strlen(VND_SDEV_ZROOT)) == 0); + nsp = vnd_sdev_ctx_to_ns(ctx); + if (nsp == NULL) + return (0); + + ret = vnd_sdev_fillzone(nsp, ctx); + vnd_nsd_rele(nsp); + + return (ret); +} + +static sdev_plugin_ops_t vnd_sdev_ops = { + SDEV_PLUGIN_VERSION, + SDEV_PLUGIN_SUBDIR, + vnd_sdev_validate, + vnd_sdev_filldir, + vnd_sdev_inactive +}; + +static int +vnd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + int errp = 0; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + /* + * Only allow one instance. 
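The ops vector above is the entire sdev plugin contract: validate decides whether a cached node is still real, filldir materializes a directory's contents on demand, and inactive tears down per-node state. A minimal filldir for a hypothetical plugin, showing the EEXIST idiom the vnd callbacks above rely on (demo_dip and the node names are stand-ins, not part of this patch):

/*
 * Kernel-side sketch of a filldir callback for a hypothetical plugin.
 * EEXIST from sdev_plugin_mkdir()/sdev_plugin_mknod() is not an error:
 * sdev may have cached the node from an earlier fill, exactly as the vnd
 * callbacks above assume.
 */
static dev_info_t *demo_dip;

static int
demo_filldir(sdev_ctx_t ctx)
{
	int ret;

	ret = sdev_plugin_mkdir(ctx, "examples");
	if (ret != 0 && ret != EEXIST)
		return (ret);

	ret = sdev_plugin_mknod(ctx, "demo0", S_IFCHR,
	    makedevice(ddi_driver_major(demo_dip), 1));
	return ((ret != 0 && ret != EEXIST) ? ret : 0);
}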
+ */ + if (vnd_dip != NULL) + return (DDI_FAILURE); + + vnd_dip = dip; + if (ddi_create_minor_node(vnd_dip, "vnd", S_IFCHR, 0, DDI_PSEUDO, 0) != + DDI_SUCCESS) { + vnd_dip = NULL; + return (DDI_FAILURE); + } + + if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP, + DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) { + ddi_remove_minor_node(vnd_dip, NULL); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sdev_hdl = sdev_plugin_register(VND_SDEV_NAME, &vnd_sdev_ops, + &errp); + if (vnd_sdev_hdl == NULL) { + ddi_remove_minor_node(vnd_dip, NULL); + ddi_prop_remove_all(vnd_dip); + vnd_dip = NULL; + return (DDI_FAILURE); + } + + vnd_sqset = gsqueue_set_create(GSQUEUE_DEFAULT_WAIT, + GSQUEUE_DEFAULT_PRIORITY); + + return (DDI_SUCCESS); +} + +static int +vnd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + mutex_enter(&vnd_dev_lock); + if (!list_is_empty(&vnd_dev_list)) { + mutex_exit(&vnd_dev_lock); + return (DDI_FAILURE); + } + mutex_exit(&vnd_dev_lock); + + return (DDI_FAILURE); +} + +static int +vnd_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) +{ + int error; + + switch (cmd) { + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vnd_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; + break; + default: + error = DDI_FAILURE; + break; + } + return (error); +} + + + +static void +vnd_ddi_fini(void) +{ + netstack_unregister(NS_VND); + if (vnd_taskq != NULL) + taskq_destroy(vnd_taskq); + if (vnd_str_cache != NULL) + kmem_cache_destroy(vnd_str_cache); + if (vnd_dev_cache != NULL) + kmem_cache_destroy(vnd_dev_cache); + if (vnd_pnsd_cache != NULL) + kmem_cache_destroy(vnd_pnsd_cache); + if (vnd_minors != NULL) + id_space_destroy(vnd_minors); + if (vnd_list_init != 0) { + list_destroy(&vnd_nsd_list); + list_destroy(&vnd_dev_list); + mutex_destroy(&vnd_dev_lock); + vnd_list_init = 0; + } + frameio_fini(); +} + +static int +vnd_ddi_init(void) +{ + if (frameio_init() != 0) + return (DDI_FAILURE); + + vnd_str_cache = kmem_cache_create("vnd_str_cache", sizeof (vnd_str_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_str_cache == NULL) { + frameio_fini(); + return (DDI_FAILURE); + } + vnd_dev_cache = kmem_cache_create("vnd_dev_cache", sizeof (vnd_dev_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_dev_cache == NULL) { + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + vnd_pnsd_cache = kmem_cache_create("vnd_pnsd_cache", + sizeof (vnd_pnsd_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + if (vnd_pnsd_cache == NULL) { + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_taskq = taskq_create_instance("vnd", -1, 1, minclsyspri, 0, 0, 0); + if (vnd_taskq == NULL) { + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + vnd_minors = id_space_create("vnd_minors", 1, INT32_MAX); + if (vnd_minors == NULL) { + taskq_destroy(vnd_taskq); + kmem_cache_destroy(vnd_pnsd_cache); + kmem_cache_destroy(vnd_dev_cache); + kmem_cache_destroy(vnd_str_cache); + frameio_fini(); + return (DDI_FAILURE); + } + + mutex_init(&vnd_dev_lock, NULL, MUTEX_DRIVER, NULL); + list_create(&vnd_dev_list, sizeof (vnd_dev_t), + offsetof(vnd_dev_t, vdd_link)); + list_create(&vnd_nsd_list, sizeof (vnd_pnsd_t), + offsetof(vnd_pnsd_t, vpnd_link)); + vnd_list_init = 1; + + 
netstack_register(NS_VND, vnd_stack_init, vnd_stack_shutdown, + vnd_stack_destroy); + + return (DDI_SUCCESS); +} + +static struct module_info vnd_minfo = { + 0, /* module id */ + "vnd", /* module name */ + 1, /* smallest packet size */ + INFPSZ, /* largest packet size (infinite) */ + 1, /* high watermark */ + 0 /* low watermark */ +}; + +static struct qinit vnd_r_qinit = { + vnd_s_rput, + NULL, + vnd_s_open, + vnd_s_close, + NULL, + &vnd_minfo, + NULL +}; + +static struct qinit vnd_w_qinit = { + vnd_s_wput, + NULL, + NULL, + NULL, + NULL, + &vnd_minfo, + NULL +}; + +static struct streamtab vnd_strtab = { + &vnd_r_qinit, + &vnd_w_qinit, + NULL, + NULL +}; + + +static struct cb_ops vnd_cb_ops = { + vnd_open, /* open */ + vnd_close, /* close */ + nulldev, /* strategy */ + nulldev, /* print */ + nodev, /* dump */ + vnd_read, /* read */ + vnd_write, /* write */ + vnd_ioctl, /* ioctl */ + nodev, /* devmap */ + nodev, /* mmap */ + nodev, /* segmap */ + vnd_chpoll, /* poll */ + ddi_prop_op, /* cb_prop_op */ + NULL, /* streamtab */ + D_MP /* Driver compatibility flag */ +}; + +static struct dev_ops vnd_dev_ops = { + DEVO_REV, /* devo_rev */ + 0, /* refcnt */ + vnd_info, /* get_dev_info */ + nulldev, /* identify */ + nulldev, /* probe */ + vnd_attach, /* attach */ + vnd_detach, /* detach */ + nodev, /* reset */ + &vnd_cb_ops, /* driver operations */ + NULL, /* bus operations */ + nodev, /* dev power */ + ddi_quiesce_not_needed /* quiesce */ +}; + +static struct modldrv vnd_modldrv = { + &mod_driverops, + "Virtual Networking Datapath Driver", + &vnd_dev_ops +}; + +static struct fmodsw vnd_fmodfsw = { + "vnd", + &vnd_strtab, + D_NEW | D_MP +}; + +static struct modlstrmod vnd_modlstrmod = { + &mod_strmodops, + "Virtual Networking Datapath Driver", + &vnd_fmodfsw +}; + +static struct modlinkage vnd_modlinkage = { + MODREV_1, + &vnd_modldrv, + &vnd_modlstrmod, + NULL +}; + +int +_init(void) +{ + int error; + + /* + * We need to do all of our global initialization in init as opposed to + * attach and detach. The problem here is that because vnd can be used + * from a stream context while being detached, we can not rely on having + * run attach to create everything, alas. so it goes in _init, just like + * our friend ip. + */ + if ((error = vnd_ddi_init()) != DDI_SUCCESS) + return (error); + error = mod_install((&vnd_modlinkage)); + if (error != 0) + vnd_ddi_fini(); + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&vnd_modlinkage, modinfop)); +} + +int +_fini(void) +{ + int error; + + error = mod_remove(&vnd_modlinkage); + if (error == 0) + vnd_ddi_fini(); + return (error); +} diff --git a/usr/src/uts/common/io/vnd/vnd.conf b/usr/src/uts/common/io/vnd/vnd.conf new file mode 100644 index 0000000000..65872e1ddf --- /dev/null +++ b/usr/src/uts/common/io/vnd/vnd.conf @@ -0,0 +1,16 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+# + +name="vnd" parent="pseudo" instance=0; diff --git a/usr/src/uts/common/io/vnic/vnic_dev.c b/usr/src/uts/common/io/vnic/vnic_dev.c index 4615f00e51..c183a9c4f5 100644 --- a/usr/src/uts/common/io/vnic/vnic_dev.c +++ b/usr/src/uts/common/io/vnic/vnic_dev.c @@ -52,6 +52,7 @@ #include <sys/vlan.h> #include <sys/vnic.h> #include <sys/vnic_impl.h> +#include <sys/mac_impl.h> #include <sys/mac_flow_impl.h> #include <inet/ip_impl.h> @@ -1059,6 +1060,18 @@ vnic_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, err = mac_maxsdu_update(vn->vn_mh, mtu); break; } + case MAC_PROP_VN_PROMISC_FILTERED: { + boolean_t filtered; + + if (pr_valsize < sizeof (filtered)) { + err = EINVAL; + break; + } + + bcopy(pr_val, &filtered, sizeof (filtered)); + mac_set_promisc_filtered(vn->vn_mch, filtered); + break; + } case MAC_PROP_SECONDARY_ADDRS: { mac_secondary_addr_t msa; @@ -1080,8 +1093,14 @@ vnic_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, { vnic_t *vn = arg; int ret = 0; + boolean_t out; switch (pr_num) { + case MAC_PROP_VN_PROMISC_FILTERED: + out = mac_get_promisc_filtered(vn->vn_mch); + ASSERT(pr_valsize >= sizeof (boolean_t)); + bcopy(&out, pr_val, sizeof (boolean_t)); + break; case MAC_PROP_SECONDARY_ADDRS: ret = vnic_get_secondary_macs(vn, pr_valsize, pr_val); break; diff --git a/usr/src/uts/common/io/vscan/vscan_svc.c b/usr/src/uts/common/io/vscan/vscan_svc.c index e26fa9e292..b5efbb3a99 100644 --- a/usr/src/uts/common/io/vscan/vscan_svc.c +++ b/usr/src/uts/common/io/vscan/vscan_svc.c @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/stat.h> @@ -461,7 +462,7 @@ vscan_svc_scan_file(vnode_t *vp, cred_t *cr, int async) boolean_t allow; clock_t timeout, time_left; - if ((vp == NULL) || (vp->v_path == NULL) || cr == NULL) + if ((vp == NULL) || (vp->v_path == vn_vpath_empty) || cr == NULL) return (0); DTRACE_PROBE2(vscan__scan__file, char *, vp->v_path, int, async); @@ -1080,7 +1081,6 @@ vscan_svc_exempt_file(vnode_t *vp, boolean_t *allow) struct vattr attr; ASSERT(vp != NULL); - ASSERT(vp->v_path != NULL); attr.va_mask = AT_SIZE; diff --git a/usr/src/uts/common/io/zfd.c b/usr/src/uts/common/io/zfd.c new file mode 100644 index 0000000000..3edaa62bbe --- /dev/null +++ b/usr/src/uts/common/io/zfd.c @@ -0,0 +1,844 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2014 Joyent, Inc. All rights reserved. + */ + +/* + * Zone File Descriptor Driver. + * + * This driver is derived from the zcons driver which is in turn derived from + * the pts/ptm drivers. The purpose is to expose file descriptors within the + * zone which are connected to zoneadmd and used for logging or an interactive + * connection to a process within the zone. + * + * Its implementation is straightforward. Each instance of the driver + * represents a global-zone/local-zone pair. Unlike the zcons device, zoneadmd + * uses these devices unidirectionally to provide stdin, stdout and stderr to + * the process within the zone. 
+ * + * Instances of zfd are onlined as children of /pseudo/zfdnex@2/ by zoneadmd, + * using the devctl framework; thus the driver does not need to maintain any + * sort of "admin" node. + * + * The driver shuttles I/O from master side to slave side and back. In a break + * from the pts/ptm semantics, if one side is not open, I/O directed towards + * it will simply be discarded. This is so that if zoneadmd is not holding the + * master side fd open (i.e. it has died somehow), processes in the zone do not + * experience any errors and I/O to the fd does not cause the process to hang. + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/conf.h> +#include <sys/cred.h> +#include <sys/ddi.h> +#include <sys/debug.h> +#include <sys/devops.h> +#include <sys/errno.h> +#include <sys/file.h> +#include <sys/kstr.h> +#include <sys/modctl.h> +#include <sys/param.h> +#include <sys/stat.h> +#include <sys/stream.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/sunddi.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/types.h> +#include <sys/zfd.h> +#include <sys/vnode.h> +#include <sys/fs/snode.h> +#include <sys/zone.h> + +static int zfd_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); +static int zfd_attach(dev_info_t *, ddi_attach_cmd_t); +static int zfd_detach(dev_info_t *, ddi_detach_cmd_t); + +static int zfd_open(queue_t *, dev_t *, int, int, cred_t *); +static int zfd_close(queue_t *, int, cred_t *); +static void zfd_wput(queue_t *, mblk_t *); +static void zfd_rsrv(queue_t *); +static void zfd_wsrv(queue_t *); + +/* + * The instance number is encoded in the dev_t in the minor number; the lowest + * bit of the minor number is used to track the master vs. slave side of the + * fd. The rest of the bits in the minor number are the instance. + */ +#define ZFD_MASTER_MINOR 0 +#define ZFD_SLAVE_MINOR 1 + +#define ZFD_INSTANCE(x) (getminor((x)) >> 1) +#define ZFD_NODE(x) (getminor((x)) & 0x01) + +/* + * This macro converts a zfd_state_t pointer to the associated slave minor + * node's dev_t. + */ +#define ZFD_STATE_TO_SLAVEDEV(x) \ + (makedevice(ddi_driver_major((x)->zfd_devinfo), \ + (minor_t)(ddi_get_instance((x)->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR))) + +int zfd_debug = 0; +#define DBG(a) if (zfd_debug) cmn_err(CE_NOTE, a) +#define DBG1(a, b) if (zfd_debug) cmn_err(CE_NOTE, a, b) + +/* + * ZFD Pseudo Terminal Module: stream data structure definitions, + * based on zcons. + */ +static struct module_info zfd_info = { + 0x20FD, /* ZOFD - 8445 */ + "zfd", + 0, /* min packet size */ + INFPSZ, /* max packet size - infinity */ + 2048, /* high water */ + 128 /* low water */ +}; + +static struct qinit zfd_rinit = { + NULL, + (int (*)()) zfd_rsrv, + zfd_open, + zfd_close, + NULL, + &zfd_info, + NULL +}; + +static struct qinit zfd_winit = { + (int (*)()) zfd_wput, + (int (*)()) zfd_wsrv, + NULL, + NULL, + NULL, + &zfd_info, + NULL +}; + +static struct streamtab zfd_tab_info = { + &zfd_rinit, + &zfd_winit, + NULL, + NULL +}; + +#define ZFD_CONF_FLAG (D_MP | D_MTQPAIR | D_MTOUTPERIM | D_MTOCEXCL) + +/* + * this will define (struct cb_ops cb_zfd_ops) and (struct dev_ops zfd_ops) + */ +DDI_DEFINE_STREAM_OPS(zfd_ops, nulldev, nulldev, zfd_attach, zfd_detach, \ + nodev, zfd_getinfo, ZFD_CONF_FLAG, &zfd_tab_info, \ + ddi_quiesce_not_needed); + +/* + * Module linkage information for the kernel. 
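The minor-number scheme above packs an instance and a side into one value: minor = instance << 1 | side. Worked through for instance 3, the master node gets minor 6 and the slave gets minor 7, and ZFD_INSTANCE()/ZFD_NODE() invert the packing. A quick kernel-style sketch:

/*
 * Round trip of the encoding for instance 3: master minor 6, slave
 * minor 7.
 */
static void
demo_zfd_minors(void)
{
	minor_t master = (3 << 1) | ZFD_MASTER_MINOR;	/* 6 */
	minor_t slave = (3 << 1) | ZFD_SLAVE_MINOR;	/* 7 */

	ASSERT3U(slave >> 1, ==, 3);			/* ZFD_INSTANCE() */
	ASSERT3U(slave & 0x01, ==, ZFD_SLAVE_MINOR);	/* ZFD_NODE() */
	ASSERT3U(master & 0x01, ==, ZFD_MASTER_MINOR);
}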
+ */ + +static struct modldrv modldrv = { + &mod_driverops, /* Type of module (this is a pseudo driver) */ + "Zone FD driver", /* description of module */ + &zfd_ops /* driver ops */ +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modldrv, + NULL +}; + +typedef struct zfd_state { + dev_info_t *zfd_devinfo; + queue_t *zfd_master_rdq; + queue_t *zfd_slave_rdq; + vnode_t *zfd_slave_vnode; + int zfd_state; + int zfd_tty; +} zfd_state_t; + +#define ZFD_STATE_MOPEN 0x01 +#define ZFD_STATE_SOPEN 0x02 + +static void *zfd_soft_state; + +/* + * List of STREAMS modules that is pushed onto a slave instance after the + * ZFD_MAKETTY ioctl has been received. + */ +static char *zfd_mods[] = { + "ptem", + "ldterm", + "ttcompat", + NULL +}; + +int +_init(void) +{ + int err; + + if ((err = ddi_soft_state_init(&zfd_soft_state, sizeof (zfd_state_t), + 0)) != 0) { + return (err); + } + + if ((err = mod_install(&modlinkage)) != 0) + ddi_soft_state_fini(zfd_soft_state); + + return (err); +} + + +int +_fini(void) +{ + int err; + + if ((err = mod_remove(&modlinkage)) != 0) { + return (err); + } + + ddi_soft_state_fini(&zfd_soft_state); + return (0); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} + +static int +zfd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + char masternm[ZFD_NAME_LEN], slavenm[ZFD_NAME_LEN]; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if (ddi_soft_state_zalloc(zfd_soft_state, instance) != DDI_SUCCESS) + return (DDI_FAILURE); + + (void) snprintf(masternm, sizeof (masternm), "%s%d", ZFD_MASTER_NAME, + instance); + (void) snprintf(slavenm, sizeof (slavenm), "%s%d", ZFD_SLAVE_NAME, + instance); + + /* + * Create the master and slave minor nodes. + */ + if ((ddi_create_minor_node(dip, slavenm, S_IFCHR, + instance << 1 | ZFD_SLAVE_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE) || + (ddi_create_minor_node(dip, masternm, S_IFCHR, + instance << 1 | ZFD_MASTER_MINOR, DDI_PSEUDO, 0) == DDI_FAILURE)) { + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + return (DDI_FAILURE); + } + + VERIFY((zfds = ddi_get_soft_state(zfd_soft_state, instance)) != NULL); + zfds->zfd_devinfo = dip; + zfds->zfd_tty = 0; + return (DDI_SUCCESS); +} + +static int +zfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) +{ + zfd_state_t *zfds; + int instance; + + if (cmd != DDI_DETACH) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (DDI_FAILURE); + + if ((zfds->zfd_state & ZFD_STATE_MOPEN) || + (zfds->zfd_state & ZFD_STATE_SOPEN)) { + DBG1("zfd_detach: device (dip=%p) still open\n", (void *)dip); + return (DDI_FAILURE); + } + + ddi_remove_minor_node(dip, NULL); + ddi_soft_state_free(zfd_soft_state, instance); + + return (DDI_SUCCESS); +} + +/* + * zfd_getinfo() + * getinfo(9e) entrypoint. + */ +/*ARGSUSED*/ +static int +zfd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +{ + zfd_state_t *zfds; + int instance = ZFD_INSTANCE((dev_t)arg); + + switch (infocmd) { + case DDI_INFO_DEVT2DEVINFO: + if ((zfds = ddi_get_soft_state(zfd_soft_state, + instance)) == NULL) + return (DDI_FAILURE); + *result = zfds->zfd_devinfo; + return (DDI_SUCCESS); + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)(uintptr_t)instance; + return (DDI_SUCCESS); + } + return (DDI_FAILURE); +} + +/* + * Return the equivalent queue from the other side of the relationship. 
+ * e.g.: given the slave's write queue, return the master's write queue. + */ +static queue_t * +zfd_switch(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq) + return (zfds->zfd_slave_rdq); + else if (OTHERQ(qp) == zfds->zfd_master_rdq && zfds->zfd_slave_rdq + != NULL) + return (OTHERQ(zfds->zfd_slave_rdq)); + else if (qp == zfds->zfd_slave_rdq) + return (zfds->zfd_master_rdq); + else if (OTHERQ(qp) == zfds->zfd_slave_rdq && zfds->zfd_master_rdq + != NULL) + return (OTHERQ(zfds->zfd_master_rdq)); + else + return (NULL); +} + +/* + * For debugging and outputting messages. Returns the name of the side of + * the relationship associated with this queue. + */ +static const char * +zfd_side(queue_t *qp) +{ + zfd_state_t *zfds = qp->q_ptr; + ASSERT(zfds != NULL); + + if (qp == zfds->zfd_master_rdq || + OTHERQ(qp) == zfds->zfd_master_rdq) { + return ("master"); + } + ASSERT(qp == zfds->zfd_slave_rdq || OTHERQ(qp) == zfds->zfd_slave_rdq); + return ("slave"); +} + +/*ARGSUSED*/ +static int +zfd_master_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + + /* + * Enforce exclusivity on the master side; the only consumer should + * be the zoneadmd for the zone. + */ + if ((zfds->zfd_state & ZFD_STATE_MOPEN) != 0) + return (EBUSY); + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_master_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_MOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + qprocson(rqp); + + /* + * Following qprocson(), the master side is fully plumbed into the + * STREAM and may send/receive messages. Setting zfds->zfd_master_rdq + * will allow the slave to send messages to us (the master). + * This cannot occur before qprocson() because the master is not + * ready to process them until that point. + */ + zfds->zfd_master_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + if (oflag & FNOCTTY) + sop->so_flags = SO_HIWAT | SO_LOWAT; + else + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/*ARGSUSED*/ +static int +zfd_slave_open(zfd_state_t *zfds, + queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + mblk_t *mop; + struct stroptions *sop; + /* + * The slave side can be opened as many times as needed. + */ + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + ASSERT((rqp != NULL) && (WR(rqp)->q_ptr == zfds)); + return (0); + } + + if (zfds->zfd_tty == 1) { + major_t major; + minor_t minor; + minor_t lastminor; + uint_t anchorindex; + + /* + * Set up sad(7D) so that the necessary STREAMS modules will + * be in place. A wrinkle is that 'ptem' must be anchored + * in place (see streamio(7i)) because we always want the + * fd to have terminal semantics. 
+ */ + minor = + ddi_get_instance(zfds->zfd_devinfo) << 1 | ZFD_SLAVE_MINOR; + major = ddi_driver_major(zfds->zfd_devinfo); + lastminor = 0; + anchorindex = 1; + if (kstr_autopush(SET_AUTOPUSH, &major, &minor, &lastminor, + &anchorindex, zfd_mods) != 0) { + DBG("zfd_slave_open(): kstr_autopush() failed\n"); + return (EIO); + } + } + + if ((mop = allocb(sizeof (struct stroptions), BPRI_MED)) == NULL) { + DBG("zfd_slave_open(): mop allocation failed\n"); + return (ENOMEM); + } + + zfds->zfd_state |= ZFD_STATE_SOPEN; + + /* + * q_ptr stores driver private data; stash the soft state data on both + * read and write sides of the queue. + */ + WR(rqp)->q_ptr = rqp->q_ptr = zfds; + + qprocson(rqp); + + /* + * Must follow qprocson(), since we aren't ready to process until then. + */ + zfds->zfd_slave_rdq = rqp; + + /* + * set up hi/lo water marks on stream head read queue and add + * controlling tty as needed. + */ + mop->b_datap->db_type = M_SETOPTS; + mop->b_wptr += sizeof (struct stroptions); + sop = (struct stroptions *)(void *)mop->b_rptr; + sop->so_flags = SO_HIWAT | SO_LOWAT | SO_ISTTY; + sop->so_hiwat = 512; + sop->so_lowat = 256; + putnext(rqp, mop); + + return (0); +} + +/* + * open(9e) entrypoint; checks sflag, and rejects anything unordinary. + */ +static int +zfd_open(queue_t *rqp, /* pointer to the read side queue */ + dev_t *devp, /* pointer to stream tail's dev */ + int oflag, /* the user open(2) supplied flags */ + int sflag, /* open state flag */ + cred_t *credp) /* credentials */ +{ + int instance = ZFD_INSTANCE(*devp); + int ret; + zfd_state_t *zfds; + + if (sflag != 0) + return (EINVAL); + + if ((zfds = ddi_get_soft_state(zfd_soft_state, instance)) == NULL) + return (ENXIO); + + switch (ZFD_NODE(*devp)) { + case ZFD_MASTER_MINOR: + ret = zfd_master_open(zfds, rqp, devp, oflag, sflag, credp); + break; + case ZFD_SLAVE_MINOR: + ret = zfd_slave_open(zfds, rqp, devp, oflag, sflag, credp); + break; + default: + ret = ENXIO; + break; + } + + return (ret); +} + +/* + * close(9e) entrypoint. + */ +/*ARGSUSED1*/ +static int +zfd_close(queue_t *rqp, int flag, cred_t *credp) +{ + queue_t *wqp; + mblk_t *bp; + zfd_state_t *zfds; + major_t major; + minor_t minor; + + zfds = (zfd_state_t *)rqp->q_ptr; + + if (rqp == zfds->zfd_master_rdq) { + DBG("Closing master side"); + + zfds->zfd_master_rdq = NULL; + zfds->zfd_state &= ~ZFD_STATE_MOPEN; + + /* + * qenable slave side write queue so that it can flush + * its messages as master's read queue is going away + */ + if (zfds->zfd_slave_rdq != NULL) { + qenable(WR(zfds->zfd_slave_rdq)); + } + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + } else if (rqp == zfds->zfd_slave_rdq) { + + DBG("Closing slave side"); + zfds->zfd_state &= ~ZFD_STATE_SOPEN; + zfds->zfd_slave_rdq = NULL; + + wqp = WR(rqp); + while ((bp = getq(wqp)) != NULL) { + if (zfds->zfd_master_rdq != NULL) + putnext(zfds->zfd_master_rdq, bp); + else if (bp->b_datap->db_type == M_IOCTL) + miocnak(wqp, bp, 0, 0); + else + freemsg(bp); + } + + /* + * Qenable master side write queue so that it can flush its + * messages as slaves's read queue is going away. + */ + if (zfds->zfd_master_rdq != NULL) + qenable(WR(zfds->zfd_master_rdq)); + + qprocsoff(rqp); + WR(rqp)->q_ptr = rqp->q_ptr = NULL; + + if (zfds->zfd_tty == 1) { + /* + * Clear the sad configuration so that reopening + * doesn't fail to set up sad configuration. 
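Once ZFD_MAKETTY has set zfd_tty, the slave open above arms autopush so ptem, ldterm and ttcompat are pushed automatically, with ptem anchored so it cannot be popped; as the comment above notes, the close path then clears that configuration so a later reopen can set it up again. The paired calls, reduced to a sketch (demo_autopush is hypothetical):

/*
 * Kernel-side sketch of the paired calls for one slave minor.  The anchor
 * of 1 pins ptem (see streamio(7i)) so terminal semantics cannot be
 * stripped by popping modules.
 */
static int
demo_autopush(dev_info_t *dip, int inst)
{
	static char *mods[] = { "ptem", "ldterm", "ttcompat", NULL };
	major_t maj = ddi_driver_major(dip);
	minor_t min = inst << 1 | ZFD_SLAVE_MINOR;
	minor_t lastmin = 0;
	uint_t anchor = 1;

	if (kstr_autopush(SET_AUTOPUSH, &maj, &min, &lastmin,
	    &anchor, mods) != 0)
		return (EIO);
	/* ... slave is opened and eventually closed ... */
	(void) kstr_autopush(CLR_AUTOPUSH, &maj, &min, NULL, NULL, NULL);
	return (0);
}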
+ */ + major = ddi_driver_major(zfds->zfd_devinfo); + minor = ddi_get_instance(zfds->zfd_devinfo) << 1 | + ZFD_SLAVE_MINOR; + (void) kstr_autopush(CLR_AUTOPUSH, &major, &minor, + NULL, NULL, NULL); + } + } + + return (0); +} + +static void +handle_mflush(queue_t *qp, mblk_t *mp) +{ + mblk_t *nmp; + DBG1("M_FLUSH on %s side", zfd_side(qp)); + + if (*mp->b_rptr & FLUSHW) { + DBG1("M_FLUSH, FLUSHW, %s side", zfd_side(qp)); + flushq(qp, FLUSHDATA); + *mp->b_rptr &= ~FLUSHW; + if ((*mp->b_rptr & FLUSHR) == 0) { + /* + * FLUSHW only. Change to FLUSHR and putnext other side, + * then we are done. + */ + *mp->b_rptr |= FLUSHR; + if (zfd_switch(RD(qp)) != NULL) { + putnext(zfd_switch(RD(qp)), mp); + return; + } + } else if ((zfd_switch(RD(qp)) != NULL) && + (nmp = copyb(mp)) != NULL) { + /* + * It is a FLUSHRW; we copy the mblk and send + * it to the other side, since we still need to use + * the mblk in FLUSHR processing, below. + */ + putnext(zfd_switch(RD(qp)), nmp); + } + } + + if (*mp->b_rptr & FLUSHR) { + DBG("qreply(qp) turning FLUSHR around\n"); + qreply(qp, mp); + return; + } + freemsg(mp); +} + +/* + * wput(9E) is symmetric for master and slave sides, so this handles both + * without splitting the codepath. (The only exception to this is the + * processing of zfd ioctls, which is restricted to the master side.) + * + * zfd_wput() looks at the other side; if there is no process holding that + * side open, it frees the message. This prevents processes from hanging + * if no one is holding open the fd. Otherwise, it putnext's high + * priority messages, putnext's normal messages if possible, and otherwise + * enqueues the messages; in the case that something is enqueued, wsrv(9E) + * will take care of eventually shuttling I/O to the other side. + */ +static void +zfd_wput(queue_t *qp, mblk_t *mp) +{ + unsigned char type = mp->b_datap->db_type; + zfd_state_t *zfds; + struct iocblk *iocbp; + + ASSERT(qp->q_ptr); + + DBG1("entering zfd_wput, %s side", zfd_side(qp)); + + /* + * Process zfd ioctl messages if qp is the master side's write queue. + */ + zfds = (zfd_state_t *)qp->q_ptr; + if (zfds->zfd_master_rdq != NULL && qp == WR(zfds->zfd_master_rdq) && + type == M_IOCTL) { + iocbp = (struct iocblk *)(void *)mp->b_rptr; + switch (iocbp->ioc_cmd) { + case ZFD_MAKETTY: + /* + * The process that passed the ioctl must be running in + * the global zone. + */ + if (crgetzoneid(iocbp->ioc_cr) != GLOBAL_ZONEID) { + miocack(qp, mp, 0, EINVAL); + return; + } + zfds->zfd_tty = 1; + miocack(qp, mp, 0, 0); + return; + case ZFD_EOF: + /* + * The process that passed the ioctl must be running in + * the global zone. + */ + if (crgetzoneid(iocbp->ioc_cr) != GLOBAL_ZONEID) { + miocack(qp, mp, 0, EINVAL); + return; + } + if (zfds->zfd_slave_rdq != NULL) + (void) putnextctl(zfds->zfd_slave_rdq, + M_HANGUP); + miocack(qp, mp, 0, 0); + return; + case ZFD_HAS_SLAVE: + /* + * The process that passed the ioctl must be running in + * the global zone. 
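Taken together, the master-side ioctls form a small control protocol for zoneadmd: ZFD_MAKETTY (issued before the slave is first opened) requests terminal semantics, ZFD_EOF injects M_HANGUP so zone-side readers see EOF, and ZFD_HAS_SLAVE, whose handler concludes below, reports whether anything in the zone still holds the slave open. A hypothetical global-zone usage sketch (the device path is a stand-in):

#include <fcntl.h>
#include <unistd.h>
#include <sys/zfd.h>	/* ZFD_MAKETTY, ZFD_EOF, ZFD_HAS_SLAVE */

static void
demo_zfd_master(void)
{
	int mfd = open("/dev/zfd/master0", O_RDWR);	/* hypothetical path */

	if (mfd < 0)
		return;
	(void) ioctl(mfd, ZFD_MAKETTY, 0);	/* must precede slave opens */
	/* ... shuttle I/O between the zone and a log or console ... */
	if (ioctl(mfd, ZFD_HAS_SLAVE, 0) == 0) {
		/* something in the zone still has the slave; hang it up */
		(void) ioctl(mfd, ZFD_EOF, 0);
	}
	(void) close(mfd);
}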
+ */ + if (crgetzoneid(iocbp->ioc_cr) != GLOBAL_ZONEID) { + miocack(qp, mp, 0, EINVAL); + return; + } + if ((zfds->zfd_state & ZFD_STATE_SOPEN) != 0) { + miocack(qp, mp, 0, 0); + } else { + miocack(qp, mp, 0, ENOTTY); + } + return; + default: + break; + } + } + + if (zfd_switch(RD(qp)) == NULL) { + DBG1("wput to %s side (no one listening)", zfd_side(qp)); + switch (type) { + case M_FLUSH: + handle_mflush(qp, mp); + break; + case M_IOCTL: + miocnak(qp, mp, 0, 0); + break; + default: + freemsg(mp); + break; + } + return; + } + + if (type >= QPCTL) { + DBG1("(hipri) wput, %s side", zfd_side(qp)); + switch (type) { + case M_READ: /* supposedly from ldterm? */ + DBG("zfd_wput: tossing M_READ\n"); + freemsg(mp); + break; + case M_FLUSH: + handle_mflush(qp, mp); + break; + default: + /* + * Put this to the other side. + */ + ASSERT(zfd_switch(RD(qp)) != NULL); + putnext(zfd_switch(RD(qp)), mp); + break; + } + DBG1("done (hipri) wput, %s side", zfd_side(qp)); + return; + } + + /* + * Only putnext if there isn't already something in the queue. + * otherwise things would wind up out of order. + */ + if (qp->q_first == NULL && + bcanputnext(RD(zfd_switch(qp)), mp->b_band)) { + DBG("wput: putting message to other side\n"); + putnext(RD(zfd_switch(qp)), mp); + } else { + DBG("wput: putting msg onto queue\n"); + (void) putq(qp, mp); + } + DBG1("done wput, %s side", zfd_side(qp)); +} + +/* + * rsrv(9E) is symmetric for master and slave, so zfd_rsrv() handles both + * without splitting up the codepath. + * + * Enable the write side of the partner. This triggers the partner to send + * messages queued on its write side to this queue's read side. + */ +static void +zfd_rsrv(queue_t *qp) +{ + zfd_state_t *zfds; + zfds = (zfd_state_t *)qp->q_ptr; + + /* + * Care must be taken here, as either of the master or slave side + * qptr could be NULL. + */ + ASSERT(qp == zfds->zfd_master_rdq || qp == zfds->zfd_slave_rdq); + if (zfd_switch(qp) == NULL) { + DBG("zfd_rsrv: other side isn't listening\n"); + return; + } + qenable(WR(zfd_switch(qp))); +} + +/* + * This routine is symmetric for master and slave, so it handles both without + * splitting up the codepath. + * + * If there are messages on this queue that can be sent to the other, send + * them via putnext(). Else, if queued messages cannot be sent, leave them + * on this queue. + */ +static void +zfd_wsrv(queue_t *qp) +{ + mblk_t *mp; + + DBG1("zfd_wsrv master (%s) side", zfd_side(qp)); + + /* + * Partner has no read queue, so take the data, and throw it away. + */ + if (zfd_switch(RD(qp)) == NULL) { + DBG("zfd_wsrv: other side isn't listening"); + while ((mp = getq(qp)) != NULL) { + if (mp->b_datap->db_type == M_IOCTL) + miocnak(qp, mp, 0, 0); + else + freemsg(mp); + } + flushq(qp, FLUSHALL); + return; + } + + /* + * while there are messages on this write queue... + */ + while ((mp = getq(qp)) != NULL) { + /* + * Due to the way zfd_wput is implemented, we should never + * see a control message here. + */ + ASSERT(mp->b_datap->db_type < QPCTL); + + if (bcanputnext(RD(zfd_switch(qp)), mp->b_band)) { + DBG("wsrv: send message to other side\n"); + putnext(RD(zfd_switch(qp)), mp); + } else { + DBG("wsrv: putting msg back on queue\n"); + (void) putbq(qp, mp); + break; + } + } +} diff --git a/usr/src/uts/common/krtld/bootrd.c b/usr/src/uts/common/krtld/bootrd.c index 08e5d98c09..35ad67da96 100644 --- a/usr/src/uts/common/krtld/bootrd.c +++ b/usr/src/uts/common/krtld/bootrd.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. + * Copyright 2013 Joyent, Inc. All rights reserved. */ @@ -36,17 +37,95 @@ extern void (*_kobj_printf)(void *, const char *fmt, ...); extern int get_weakish_int(int *); extern struct bootops *ops; -extern struct boot_fs_ops bufs_ops, bhsfs_ops; +extern struct boot_fs_ops bufs_ops, bhsfs_ops, bbootfs_ops; extern int kmem_ready; static uint64_t rd_start, rd_end; struct boot_fs_ops *bfs_ops; -struct boot_fs_ops *bfs_tab[] = {&bufs_ops, &bhsfs_ops, NULL}; +struct boot_fs_ops *bfs_tab[] = {&bufs_ops, &bhsfs_ops, &bbootfs_ops, NULL}; static uintptr_t scratch_max = 0; #define _kmem_ready get_weakish_int(&kmem_ready) +int +BRD_MOUNTROOT(struct boot_fs_ops *ops, char *str) +{ + return (ops->fsw_mountroot(str)); +} + +int +BRD_UNMOUNTROOT(struct boot_fs_ops *ops) +{ + if (bfs_ops != &bbootfs_ops) + bbootfs_ops.fsw_closeall(1); + + return (ops->fsw_unmountroot()); +} + +int +BRD_OPEN(struct boot_fs_ops *ops, char *file, int flags) +{ + int len = strlen(SYSTEM_BOOT_PATH); + int fd; + + /* + * Our policy is that we try bootfs first. If bootfs is the only + * filesystem, that's the end of it. Otherwise we will fall back to + * the normal root (i.e., ramdisk) filesystem at this point and try + * again if the file does not exist in bootfs. + */ + fd = bbootfs_ops.fsw_open(file, flags); + + if (bfs_ops == &bbootfs_ops) + return (fd); + + if (strncmp(file, SYSTEM_BOOT_PATH, len) == 0 || fd >= 0) + return ((fd < 0) ? fd : (fd | BFD_F_SYSTEM_BOOT)); + + return (ops->fsw_open(file, flags)); +} + +int +BRD_CLOSE(struct boot_fs_ops *ops, int fd) +{ + if (fd & BFD_F_SYSTEM_BOOT) + return (bbootfs_ops.fsw_close(fd & ~BFD_F_SYSTEM_BOOT)); + + return (ops->fsw_close(fd)); +} + +ssize_t +BRD_READ(struct boot_fs_ops *ops, int fd, caddr_t buf, size_t len) +{ + if (fd & BFD_F_SYSTEM_BOOT) { + return (bbootfs_ops.fsw_read(fd & ~BFD_F_SYSTEM_BOOT, + buf, len)); + } + + return (ops->fsw_read(fd, buf, len)); +} + +off_t +BRD_SEEK(struct boot_fs_ops *ops, int fd, off_t addr, int whence) +{ + if (fd & BFD_F_SYSTEM_BOOT) { + return (bbootfs_ops.fsw_lseek(fd & ~BFD_F_SYSTEM_BOOT, + addr, whence)); + } + + return (ops->fsw_lseek(fd, addr, whence)); +} + +int +BRD_FSTAT(struct boot_fs_ops *ops, int fd, struct bootstat *bsp) +{ + if (fd & BFD_F_SYSTEM_BOOT) + return (bbootfs_ops.fsw_fstat(fd & ~BFD_F_SYSTEM_BOOT, bsp)); + + return (ops->fsw_fstat(fd, bsp)); +} + /* * This one reads the ramdisk. If fi_memp is set, we copy the * ramdisk content to the designated buffer. Otherwise, we diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h index d530b7f36e..7927cf5e24 100644 --- a/usr/src/uts/common/netinet/in.h +++ b/usr/src/uts/common/netinet/in.h @@ -3,6 +3,7 @@ * Use is subject to license terms. * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* * Copyright (c) 1982, 1986 Regents of the University of California. @@ -225,6 +226,7 @@ typedef uint16_t sa_family_t; #define IPPORT_SLP 427 #define IPPORT_MIP 434 #define IPPORT_SMB 445 /* a.k.a. 
microsoft-ds */ +#define IPPORT_VXLAN 4789 /* * Internet Key Exchange (IKE) ports @@ -268,6 +270,11 @@ typedef uint16_t sa_family_t; #define IPPORT_RESERVED 1024 #define IPPORT_USERRESERVED 5000 +#ifdef _KERNEL +#define IPPORT_DYNAMIC_MIN 49152 +#define IPPORT_DYNAMIC_MAX 65535 +#endif + /* * Link numbers */ diff --git a/usr/src/uts/common/netinet/udp.h b/usr/src/uts/common/netinet/udp.h index c65a9bad3a..fb9f8a0976 100644 --- a/usr/src/uts/common/netinet/udp.h +++ b/usr/src/uts/common/netinet/udp.h @@ -17,9 +17,6 @@ #ifndef _NETINET_UDP_H #define _NETINET_UDP_H -#pragma ident "%Z%%M% %I% %E% SMI" -/* udp.h 1.7 88/08/19 SMI; from UCB 7.1 6/5/86 */ - #ifdef __cplusplus extern "C" { #endif @@ -36,6 +33,15 @@ struct udphdr { #define UDP_EXCLBIND 0x0101 /* for internal use only */ #define UDP_RCVHDR 0x0102 /* for internal use only */ #define UDP_NAT_T_ENDPOINT 0x0103 /* for internal use only */ +#define UDP_SRCPORT_HASH 0x0104 /* for internal use only */ + +/* + * Hash definitions for UDP_SRCPORT_HASH that effectively tell UDP how to go + * handle UDP_SRCPORT_HASH. + */ +#define UDP_HASH_DISABLE 0x0000 /* for internal use only */ +#define UDP_HASH_VXLAN 0x0001 /* for internal use only */ + /* * Following option in UDP_ namespace required to be exposed through * <xti.h> (It also requires exposing options not implemented). The options diff --git a/usr/src/uts/common/os/bio.c b/usr/src/uts/common/os/bio.c index 96502b8230..b8d2e29058 100644 --- a/usr/src/uts/common/os/bio.c +++ b/usr/src/uts/common/os/bio.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -1320,6 +1321,9 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) cpup = CPU; /* get pointer AFTER preemption is disabled */ CPU_STATS_ADDQ(cpup, vm, pgin, 1); CPU_STATS_ADDQ(cpup, vm, pgpgin, btopr(len)); + + atomic_add_64(&curzone->zone_pgpgin, btopr(len)); + if ((flags & B_ASYNC) == 0) { klwp_t *lwp = ttolwp(curthread); if (lwp != NULL) @@ -1336,13 +1340,19 @@ pageio_setup(struct page *pp, size_t len, struct vnode *vp, int flags) if (pp != NULL && pp->p_vnode != NULL) { if (IS_SWAPFSVP(pp->p_vnode)) { CPU_STATS_ADDQ(cpup, vm, anonpgin, btopr(len)); + atomic_add_64(&curzone->zone_anonpgin, + btopr(len)); } else { if (pp->p_vnode->v_flag & VVMEXEC) { CPU_STATS_ADDQ(cpup, vm, execpgin, btopr(len)); + atomic_add_64(&curzone->zone_execpgin, + btopr(len)); } else { CPU_STATS_ADDQ(cpup, vm, fspgin, btopr(len)); + atomic_add_64(&curzone->zone_fspgin, + btopr(len)); } } } diff --git a/usr/src/uts/common/os/brand.c b/usr/src/uts/common/os/brand.c index eb8c6e730a..00f2ce0440 100644 --- a/usr/src/uts/common/os/brand.c +++ b/usr/src/uts/common/os/brand.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
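UDP_SRCPORT_HASH with UDP_HASH_VXLAN above asks UDP to derive a socket's source port from a hash of the encapsulated frame, which is how VXLAN (IPPORT_VXLAN, port 4789) gets entropy for ECMP spreading, and the kernel-only IPPORT_DYNAMIC_MIN/MAX bound the result to the IANA dynamic range. A sketch of just the arithmetic (the hash input is a stand-in, and the constants are restated here because the real ones live under #ifdef _KERNEL):

#include <sys/types.h>

#define	DYN_MIN	49152	/* IPPORT_DYNAMIC_MIN */
#define	DYN_MAX	65535	/* IPPORT_DYNAMIC_MAX */

/*
 * Fold a hash of the inner frame into the dynamic port range.  The range
 * spans 16384 ports, so a modulo suffices; a real implementation would
 * hash the encapsulated MAC/IP headers to produce inner_hash.
 */
static uint16_t
vxlan_srcport(uint32_t inner_hash)
{
	return ((uint16_t)(DYN_MIN + (inner_hash % (DYN_MAX - DYN_MIN + 1))));
}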
*/ #include <sys/kmem.h> @@ -45,7 +46,7 @@ struct brand_mach_ops native_mach_ops = { }; #else /* !__sparcv9 */ struct brand_mach_ops native_mach_ops = { - NULL, NULL, NULL, NULL + NULL, NULL, NULL, NULL, NULL, NULL, NULL }; #endif /* !__sparcv9 */ @@ -53,7 +54,8 @@ brand_t native_brand = { BRAND_VER_1, "native", NULL, - &native_mach_ops + &native_mach_ops, + 0 }; /* @@ -314,42 +316,55 @@ void brand_setbrand(proc_t *p) { brand_t *bp = p->p_zone->zone_brand; + void *brand_data = NULL; - ASSERT(bp != NULL); - ASSERT(p->p_brand == &native_brand); + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); /* - * We should only be called from exec(), when we know the process - * is single-threaded. + * We should only be called from exec() or getproc(), when we know the + * process has 0 or 1 threads. */ - ASSERT(p->p_tlist == p->p_tlist->t_forw); + VERIFY((p->p_tlist == NULL) || (p->p_tlist == p->p_tlist->t_forw)); + if (bp->b_data_size > 0) { + brand_data = kmem_zalloc(bp->b_data_size, KM_SLEEP); + } + + mutex_enter(&p->p_lock); + ASSERT(!PROC_IS_BRANDED(p)); p->p_brand = bp; + p->p_brand_data = brand_data; ASSERT(PROC_IS_BRANDED(p)); BROP(p)->b_setbrand(p); + mutex_exit(&p->p_lock); } void -brand_clearbrand(proc_t *p, boolean_t no_lwps) +brand_clearbrand(proc_t *p) { brand_t *bp = p->p_zone->zone_brand; - klwp_t *lwp = NULL; - ASSERT(bp != NULL); - ASSERT(!no_lwps || (p->p_tlist == NULL)); + void *brand_data; + + VERIFY(MUTEX_NOT_HELD(&p->p_lock)); + VERIFY(bp != NULL); + VERIFY(PROC_IS_BRANDED(p)); /* - * If called from exec_common() or proc_exit(), - * we know the process is single-threaded. - * If called from fork_fail, p_tlist is NULL. + * There cannot be more than one lwp associated with a process when + * stripping the brand. */ - if (!no_lwps) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - lwp = p->p_tlist->t_lwp; - } + VERIFY((p->p_tlist == NULL) || (p->p_tlist == p->p_tlist->t_forw)); - ASSERT(PROC_IS_BRANDED(p)); - BROP(p)->b_proc_exit(p, lwp); + mutex_enter(&p->p_lock); p->p_brand = &native_brand; + brand_data = p->p_brand_data; + p->p_brand_data = NULL; + mutex_exit(&p->p_lock); + + if (brand_data != NULL) { + kmem_free(brand_data, bp->b_data_size); + } } #if defined(__sparcv9) @@ -483,7 +498,7 @@ brand_solaris_cmd(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (ENOSYS); /* For all other operations this must be a branded process. 
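The reworked brand_setbrand()/brand_clearbrand() above follow a publish/retire pattern for per-process data: the KM_SLEEP allocation happens before p_lock is taken, the p_brand and p_brand_data pointers flip together under p_lock so observers never see a half-branded process, and the free is deferred until the lock is dropped. Reduced to a sketch:

/*
 * Kernel-side sketch of the pattern above: sleep-allocating before the
 * lock, publishing both pointers under it, and freeing only after it is
 * dropped.
 */
static void
demo_publish(proc_t *p, brand_t *bp)
{
	void *data = (bp->b_data_size > 0) ?
	    kmem_zalloc(bp->b_data_size, KM_SLEEP) : NULL;

	mutex_enter(&p->p_lock);
	p->p_brand = bp;
	p->p_brand_data = data;
	mutex_exit(&p->p_lock);
}

static void
demo_retire(proc_t *p, brand_t *bp)
{
	void *data;

	mutex_enter(&p->p_lock);
	p->p_brand = &native_brand;
	data = p->p_brand_data;		/* detach under the lock ... */
	p->p_brand_data = NULL;
	mutex_exit(&p->p_lock);

	if (data != NULL)
		kmem_free(data, bp->b_data_size);	/* ... free outside it */
}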
*/ - if (p->p_brand == &native_brand) + if (!PROC_IS_BRANDED(p)) return (ENOSYS); ASSERT(p->p_brand == pbrand); @@ -601,15 +616,15 @@ restoreexecenv(struct execenv *ep, stack_t *sp) int brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, intpdata_t *idatap, int level, long *execsz, int setid, caddr_t exec_file, - cred_t *cred, int brand_action, struct brand *pbrand, char *bname, - char *brandlib, char *brandlib32, char *brandlinker, char *brandlinker32) + cred_t *cred, int *brand_action, struct brand *pbrand, char *bname, + char *brandlib, char *brandlib32) { vnode_t *nvp; Ehdr ehdr; Addr uphdr_vaddr; intptr_t voffset; - int interp; + char *interp; int i, err; struct execenv env; struct execenv origenv; @@ -619,7 +634,6 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, klwp_t *lwp = ttolwp(curthread); brand_proc_data_t *spd; brand_elf_data_t sed, *sedp; - char *linker; uintptr_t lddata; /* lddata of executable's linker */ ASSERT(curproc->p_brand == pbrand); @@ -636,12 +650,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, */ if (args->to_model == DATAMODEL_NATIVE) { args->emulator = brandlib; - linker = brandlinker; } #if defined(_LP64) else { args->emulator = brandlib32; - linker = brandlinker32; } #endif /* _LP64 */ @@ -725,7 +737,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(vp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); } #if defined(_LP64) else { @@ -733,7 +745,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(vp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, &env.ex_bssbase, - &env.ex_brkbase, &env.ex_brksize, NULL); + &env.ex_brkbase, &env.ex_brksize, NULL, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -744,6 +756,10 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, #endif /* _LP64 */ if (err != 0) { restoreexecenv(&origenv, &orig_sigaltstack); + + if (interp != NULL) + kmem_free(interp, MAXPATHLEN); + return (err); } @@ -761,7 +777,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, sedp->sed_phent = ehdr.e_phentsize; sedp->sed_phnum = ehdr.e_phnum; - if (interp) { + if (interp != NULL) { if (ehdr.e_type == ET_DYN) { /* * This is a shared object executable, so we @@ -777,16 +793,20 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, * it in and store relevant information about it in the * aux vector, where the brand library can find it. 
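The interp change above turns what used to be a boolean into an out-parameter: mapexec_brand() now hands back the interpreter path itself in a MAXPATHLEN buffer that the caller owns and must free exactly once on every path, error paths included. The ownership contract as a sketch (some_mapexec and handle_interp are stand-ins, not functions from this patch):

/*
 * Kernel-side sketch of the contract: the callee allocates a MAXPATHLEN
 * buffer only when an interpreter exists; the caller frees it exactly
 * once, on success and on error alike.
 */
static int
demo_exec_path(vnode_t *vp)
{
	char *interp = NULL;
	int err;

	err = some_mapexec(vp, &interp);	/* stand-in for mapexec_brand() */
	if (err == 0 && interp != NULL)
		err = handle_interp(interp);	/* hypothetical consumer */

	if (interp != NULL)
		kmem_free(interp, MAXPATHLEN);
	return (err);
}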
*/ - if ((err = lookupname(linker, UIO_SYSSPACE, + if ((err = lookupname(interp, UIO_SYSSPACE, FOLLOW, NULLVPP, &nvp)) != 0) { - uprintf("%s: not found.", brandlinker); + uprintf("%s: not found.", interp); restoreexecenv(&origenv, &orig_sigaltstack); + kmem_free(interp, MAXPATHLEN); return (err); } + + kmem_free(interp, MAXPATHLEN); + if (args->to_model == DATAMODEL_NATIVE) { err = mapexec_brand(nvp, args, &ehdr, &uphdr_vaddr, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); } #if defined(_LP64) else { @@ -794,7 +814,7 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, Elf32_Addr uphdr_vaddr32; err = mapexec32_brand(nvp, args, &ehdr32, &uphdr_vaddr32, &voffset, exec_file, &interp, - NULL, NULL, NULL, &lddata); + NULL, NULL, NULL, &lddata, NULL); Ehdr32to64(&ehdr32, &ehdr); if (uphdr_vaddr32 == (Elf32_Addr)-1) @@ -934,9 +954,9 @@ brand_solaris_elfexec(vnode_t *vp, execa_t *uap, uarg_t *args, /* * Third, the /proc aux vectors set up by elfexec() point to - * brand emulation library and it's linker. Copy these to the + * brand emulation library and its linker. Copy these to the * /proc brand specific aux vector, and update the regular - * /proc aux vectors to point to the executable (and it's + * /proc aux vectors to point to the executable (and its * linker). This will enable debuggers to access the * executable via the usual /proc or elf notes aux vectors. * @@ -1078,55 +1098,31 @@ brand_solaris_freelwp(klwp_t *l, struct brand *pbrand) } /*ARGSUSED*/ -int +void brand_solaris_initlwp(klwp_t *l, struct brand *pbrand) { ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand == NULL); l->lwp_brand = (void *)-1; - return (0); } /*ARGSUSED*/ void brand_solaris_lwpexit(klwp_t *l, struct brand *pbrand) { - proc_t *p = l->lwp_procp; - ASSERT(l->lwp_procp->p_brand == pbrand); ASSERT(l->lwp_procp->p_brand_data != NULL); ASSERT(l->lwp_brand != NULL); - - /* - * We should never be called for the last thread in a process. - * (That case is handled by brand_solaris_proc_exit().) - * Therefore this lwp must be exiting from a multi-threaded - * process. - */ - ASSERT(p->p_tlist != p->p_tlist->t_forw); - - l->lwp_brand = NULL; } /*ARGSUSED*/ void -brand_solaris_proc_exit(struct proc *p, klwp_t *l, struct brand *pbrand) +brand_solaris_proc_exit(struct proc *p, struct brand *pbrand) { ASSERT(p->p_brand == pbrand); ASSERT(p->p_brand_data != NULL); - /* - * When called from proc_exit(), we know that process is - * single-threaded and free our lwp brand data. - * otherwise just free p_brand_data and return. - */ - if (l != NULL) { - ASSERT(p->p_tlist == p->p_tlist->t_forw); - ASSERT(p->p_tlist->t_lwp == l); - (void) brand_solaris_freelwp(l, pbrand); - } - /* upon exit, free our proc brand data */ kmem_free(p->p_brand_data, sizeof (brand_proc_data_t)); p->p_brand_data = NULL; @@ -1145,5 +1141,4 @@ brand_solaris_setbrand(proc_t *p, struct brand *pbrand) ASSERT(p->p_tlist == p->p_tlist->t_forw); p->p_brand_data = kmem_zalloc(sizeof (brand_proc_data_t), KM_SLEEP); - (void) brand_solaris_initlwp(p->p_tlist->t_lwp, pbrand); } diff --git a/usr/src/uts/common/os/clock_highres.c b/usr/src/uts/common/os/clock_highres.c index bcdf20c0bd..805813037d 100644 --- a/usr/src/uts/common/os/clock_highres.c +++ b/usr/src/uts/common/os/clock_highres.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2012, Joyent Inc. All rights reserved. + * Copyright (c) 2015, Joyent Inc. All rights reserved. 
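The clock_highres.c hunks below (and the matching clock_realtime.c ones after them) replace the hard-coded timer_fire() call with a callback supplied at timer creation: each backend records the function pointer in it_fire and invokes it on expiry, so consumers other than signal delivery (e.g. a timerfd-style driver) can be notified. A rough sketch of the shape, with hypothetical "foo" names:

/* a clock backend's create hook records the expiry callback */
static int
clock_foo_timer_create(itimer_t *it, void (*fire)(itimer_t *))
{
	it->it_arg = kmem_zalloc(sizeof (foo_state_t), KM_SLEEP);
	it->it_fire = fire;		/* invoked instead of timer_fire() */
	return (0);
}

static void
clock_foo_fire(void *arg)
{
	itimer_t *it = arg;

	it->it_fire(it);		/* signal, timerfd wakeup, etc. */
}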
*/ #include <sys/timer.h> @@ -66,7 +66,7 @@ clock_highres_getres(timespec_t *ts) /*ARGSUSED*/ static int -clock_highres_timer_create(itimer_t *it, struct sigevent *ev) +clock_highres_timer_create(itimer_t *it, void (*fire)(itimer_t *)) { /* * CLOCK_HIGHRES timers of sufficiently high resolution can deny @@ -80,6 +80,7 @@ clock_highres_timer_create(itimer_t *it, struct sigevent *ev) } it->it_arg = kmem_zalloc(sizeof (cyclic_id_t), KM_SLEEP); + it->it_fire = fire; return (0); } @@ -95,7 +96,7 @@ clock_highres_fire(void *arg) old = *addr; } while (atomic_cas_64((uint64_t *)addr, old, new) != old); - timer_fire(it); + it->it_fire(it); } static int diff --git a/usr/src/uts/common/os/clock_realtime.c b/usr/src/uts/common/os/clock_realtime.c index ef3383fb28..4a75984b23 100644 --- a/usr/src/uts/common/os/clock_realtime.c +++ b/usr/src/uts/common/os/clock_realtime.c @@ -24,7 +24,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015, Joyent Inc. All rights reserved. + */ #include <sys/timer.h> #include <sys/systm.h> @@ -80,8 +82,7 @@ clock_realtime_fire(void *arg) /* * First call into the timer subsystem to get the signal going. */ - timer_fire(it); - + it->it_fire(it); val = &it->it_itime.it_value; interval = &it->it_itime.it_interval; @@ -171,9 +172,10 @@ clock_realtime_fire_first(void *arg) /*ARGSUSED*/ static int -clock_realtime_timer_create(itimer_t *it, struct sigevent *ev) +clock_realtime_timer_create(itimer_t *it, void (*fire)(itimer_t *)) { it->it_arg = kmem_zalloc(sizeof (timeout_id_t), KM_SLEEP); + it->it_fire = fire; return (0); } @@ -184,7 +186,7 @@ clock_realtime_timer_settime(itimer_t *it, int flags, { timeout_id_t tid, *tidp = it->it_arg; timespec_t now; - proc_t *p = curproc; + proc_t *p = it->it_proc; clock_t ticks; gethrestime(&now); @@ -246,7 +248,7 @@ static int clock_realtime_timer_gettime(itimer_t *it, struct itimerspec *when) { timespec_t now; - proc_t *p = curproc; + proc_t *p = it->it_proc; /* * We always keep it_itime up to date, so we just need to snapshot @@ -276,7 +278,7 @@ clock_realtime_timer_gettime(itimer_t *it, struct itimerspec *when) static int clock_realtime_timer_delete(itimer_t *it) { - proc_t *p = curproc; + proc_t *p = it->it_proc; timeout_id_t tid, *tidp = it->it_arg; mutex_enter(&p->p_lock); diff --git a/usr/src/uts/common/os/core.c b/usr/src/uts/common/os/core.c index d4dddbe477..3ca17e1f17 100644 --- a/usr/src/uts/common/os/core.c +++ b/usr/src/uts/common/os/core.c @@ -64,6 +64,7 @@ #include <sys/contract/process_impl.h> #include <sys/ddi.h> +extern int yield(void); /* * Processes running within a zone potentially dump core in 3 locations, * based on the per-process, per-zone, and the global zone's core settings. diff --git a/usr/src/uts/common/os/cred.c b/usr/src/uts/common/os/cred.c index 733fd03a92..b0098946b3 100644 --- a/usr/src/uts/common/os/cred.c +++ b/usr/src/uts/common/os/cred.c @@ -727,6 +727,14 @@ crgetzoneid(const cred_t *cr) cr->cr_zone->zone_id); } +zoneid_t +crgetzonedid(const cred_t *cr) +{ + return (cr->cr_zone == NULL ? + (cr->cr_uid == -1 ? 
(zoneid_t)-1 : GLOBAL_ZONEID) : + cr->cr_zone->zone_did); +} + projid_t crgetprojid(const cred_t *cr) { diff --git a/usr/src/uts/common/os/ddi_intr_irm.c b/usr/src/uts/common/os/ddi_intr_irm.c index c3c0481e7f..a4b35dcb5b 100644 --- a/usr/src/uts/common/os/ddi_intr_irm.c +++ b/usr/src/uts/common/os/ddi_intr_irm.c @@ -1320,7 +1320,7 @@ i_ddi_irm_notify(ddi_irm_pool_t *pool_p, ddi_irm_req_t *req_p) /* Log callback errors */ if (ret != DDI_SUCCESS) { - cmn_err(CE_WARN, "%s%d: failed callback (action=%d, ret=%d)\n", + cmn_err(CE_WARN, "!%s%d: failed callback (action=%d, ret=%d)\n", ddi_driver_name(req_p->ireq_dip), ddi_get_instance(req_p->ireq_dip), (int)action, ret); } diff --git a/usr/src/uts/common/os/exec.c b/usr/src/uts/common/os/exec.c index 7adc5c0c9d..7e32fb0506 100644 --- a/usr/src/uts/common/os/exec.c +++ b/usr/src/uts/common/os/exec.c @@ -26,7 +26,7 @@ /* Copyright (c) 1988 AT&T */ /* All Rights Reserved */ /* - * Copyright 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -69,6 +69,7 @@ #include <sys/sdt.h> #include <sys/brand.h> #include <sys/klpd.h> +#include <sys/random.h> #include <c2/audit.h> @@ -97,6 +98,7 @@ uint_t auxv_hwcap32_2 = 0; /* 32-bit version of auxv_hwcap2 */ #endif #define PSUIDFLAGS (SNOCD|SUGID) +#define RANDOM_LEN 16 /* 16 bytes for AT_RANDOM aux entry */ /* * exece() - system call wrapper around exec_common() @@ -297,14 +299,30 @@ exec_common(const char *fname, const char **argp, const char **envp, ua.argp = argp; ua.envp = envp; - /* If necessary, brand this process before we start the exec. */ - if (brandme) + /* If necessary, brand this process/lwp before we start the exec. */ + if (brandme) { + void *brand_data = NULL; + brand_setbrand(p); + if (BROP(p)->b_lwpdata_alloc != NULL && + (brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + VN_RELE(vp); + if (dir != NULL) + VN_RELE(dir); + pn_free(&resolvepn); + goto fail; + } + mutex_enter(&p->p_lock); + BROP(p)->b_initlwp(lwp, brand_data); + mutex_exit(&p->p_lock); + } if ((error = gexec(&vp, &ua, &args, NULL, 0, &execsz, - exec_file, p->p_cred, brand_action)) != 0) { - if (brandme) - brand_clearbrand(p, B_FALSE); + exec_file, p->p_cred, &brand_action)) != 0) { + if (brandme) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p); + } VN_RELE(vp); if (dir != NULL) VN_RELE(dir); @@ -360,6 +378,8 @@ exec_common(const char *fname, const char **argp, const char **envp, * pending held signals remain held, so don't clear t_hold. */ mutex_enter(&p->p_lock); + DTRACE_PROBE3(oldcontext__set, klwp_t *, lwp, + uintptr_t, lwp->lwp_oldcontext, uintptr_t, 0); lwp->lwp_oldcontext = 0; lwp->lwp_ustack = 0; lwp->lwp_old_stk_ctl = 0; @@ -419,8 +439,10 @@ exec_common(const char *fname, const char **argp, const char **envp, TRACE_2(TR_FAC_PROC, TR_PROC_EXEC, "proc_exec:p %p up %p", p, up); /* Unbrand ourself if necessary. 
*/ - if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) - brand_clearbrand(p, B_FALSE); + if (PROC_IS_BRANDED(p) && (brand_action == EBA_NATIVE)) { + BROP(p)->b_freelwp(lwp); + brand_clearbrand(p); + } setregs(&args); @@ -544,7 +566,7 @@ gexec( long *execsz, caddr_t exec_file, struct cred *cred, - int brand_action) + int *brand_action) { struct vnode *vp, *execvp = NULL; proc_t *pp = ttoproc(curthread); @@ -858,8 +880,22 @@ gexec( if (pp->p_plist || (pp->p_proc_flag & P_PR_TRACE)) args->traceinval = 1; } - if (pp->p_proc_flag & P_PR_PTRACE) - psignal(pp, SIGTRAP); + + /* + * If legacy ptrace is enabled, defer to the brand to determine the + * behavior of the SIGTRAP generated during exec(). (If + * we're not branded or the brand isn't interested in changing + * the default behavior, we generate the SIGTRAP.) + */ + if (pp->p_proc_flag & P_PR_PTRACE) { + if (PROC_IS_BRANDED(pp) && + BROP(pp)->b_ptrace_exectrap != NULL) { + BROP(pp)->b_ptrace_exectrap(pp); + } else { + psignal(pp, SIGTRAP); + } + } + if (args->traceinval) prinvalidate(&pp->p_user); } @@ -1517,6 +1553,27 @@ stk_add(uarg_t *args, const char *sp, enum uio_seg segflg) return (0); } +/* + * Add a fixed-size byte array to the stack (only from kernel space). + */ +static int +stk_byte_add(uarg_t *args, const uint8_t *sp, size_t len) +{ + int error; + + if (STK_AVAIL(args) < sizeof (int)) + return (E2BIG); + *--args->stk_offp = args->stk_strp - args->stk_base; + + if (len > STK_AVAIL(args)) + return (E2BIG); + bcopy(sp, args->stk_strp, len); + + args->stk_strp += len; + + return (0); +} + static int stk_getptr(uarg_t *args, char *src, char **dst) { @@ -1553,6 +1610,7 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) size_t size, pad; char *argv = (char *)uap->argp; char *envp = (char *)uap->envp; + uint8_t rdata[RANDOM_LEN]; /* * Copy interpreter's name and argument to argv[0] and argv[1]. @@ -1622,8 +1680,9 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) args->ne = args->na - argc; /* - * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, and - * AT_SUN_EMULATOR strings to the stack. + * Add AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME, + * AT_SUN_BRAND_NROOT, and AT_SUN_EMULATOR strings, as well as the + * AT_RANDOM array, to the stack. */ if (auxvpp != NULL && *auxvpp != NULL) { if ((error = stk_add(args, platform, UIO_SYSSPACE)) != 0) @@ -1636,6 +1695,20 @@ stk_copyin(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) if (args->emulator != NULL && (error = stk_add(args, args->emulator, UIO_SYSSPACE)) != 0) return (error); + + /* + * For the AT_RANDOM aux vector we provide 16 bytes of random + * data. + */ + (void) random_get_pseudo_bytes(rdata, sizeof (rdata)); + + if ((error = stk_byte_add(args, rdata, sizeof (rdata))) != 0) + return (error); + + if (args->brand_nroot != NULL && + (error = stk_add(args, args->brand_nroot, + UIO_SYSSPACE)) != 0) + return (error); } /* @@ -1742,7 +1815,7 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) /* * Fill in the aux vector now that we know the user stack addresses * for the AT_SUN_PLATFORM, AT_SUN_EXECNAME, AT_SUN_BRANDNAME and - * AT_SUN_EMULATOR strings. + * AT_SUN_EMULATOR strings, as well as the AT_RANDOM array.
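Userland can consume the AT_RANDOM entry by walking the aux vector that follows the environment array on the initial stack; the pointer refers to the 16 bytes placed by stk_byte_add() above. A hedged userland sketch, assuming AT_RANDOM is exposed through <sys/auxv.h> (a real consumer would more likely use its libc's own auxv access routines):

#include <stdio.h>
#include <sys/auxv.h>

extern char **environ;

int
main(void)
{
	char **ep = environ;	/* valid at startup, before any setenv() */
	auxv_t *av;
	int i;

	while (*ep != NULL)	/* the aux vector follows the env array */
		ep++;

	for (av = (auxv_t *)(ep + 1); av->a_type != AT_NULL; av++) {
		if (av->a_type == AT_RANDOM) {
			unsigned char *r = (unsigned char *)av->a_un.a_ptr;

			for (i = 0; i < 16; i++)
				(void) printf("%02x", r[i]);
			(void) printf("\n");
		}
	}
	return (0);
}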
*/ if (auxvpp != NULL && *auxvpp != NULL) { if (args->to_model == DATAMODEL_NATIVE) { @@ -1755,6 +1828,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (long)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (long)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, + AT_SUN_BRAND_NROOT, (long)&ustrp[*--offp]) + } } else { auxv32_t **a = (auxv32_t **)auxvpp; ADDAUX(*a, @@ -1767,6 +1845,11 @@ stk_copyout(uarg_t *args, char *usrstack, void **auxvpp, user_t *up) if (args->emulator != NULL) ADDAUX(*a, AT_SUN_EMULATOR, (int)(uintptr_t)&ustrp[*--offp]) + ADDAUX(*a, AT_RANDOM, (int)(uintptr_t)&ustrp[*--offp]) + if (args->brand_nroot != NULL) { + ADDAUX(*a, AT_SUN_BRAND_NROOT, + (int)(uintptr_t)&ustrp[*--offp]) + } } } @@ -1855,6 +1938,9 @@ exec_args(execa_t *uap, uarg_t *args, intpdata_t *intp, void **auxvpp) usrstack = (char *)USRSTACK32; } + if (args->maxstack != 0 && (uintptr_t)usrstack > args->maxstack) + usrstack = (char *)args->maxstack; + ASSERT(P2PHASE((uintptr_t)usrstack, args->stk_align) == 0); #if defined(__sparc) diff --git a/usr/src/uts/common/os/exit.c b/usr/src/uts/common/os/exit.c index c5d54b5978..06a3856332 100644 --- a/usr/src/uts/common/os/exit.c +++ b/usr/src/uts/common/os/exit.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -366,19 +366,6 @@ proc_exit(int why, int what) } mutex_exit(&p->p_lock); - DTRACE_PROC(lwp__exit); - DTRACE_PROC1(exit, int, why); - - /* - * Will perform any brand specific proc exit processing, since this - * is always the last lwp, will also perform lwp_exit and free brand - * data - */ - if (PROC_IS_BRANDED(p)) { - lwp_detach_brand_hdlrs(lwp); - brand_clearbrand(p, B_FALSE); - } - /* * Don't let init exit unless zone_start_init() failed its exec, or * we are shutting down the zone or the machine. @@ -390,12 +377,35 @@ proc_exit(int why, int what) if (z->zone_boot_err == 0 && zone_status_get(z) < ZONE_IS_SHUTTING_DOWN && zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) { - if (z->zone_restart_init == B_TRUE) { - if (restart_init(what, why) == 0) - return (0); + + /* + * If the init process should be restarted, the + * "zone_restart_init" member will be set. Some init + * programs in branded zones do not tolerate a restart + * in the traditional manner; setting the + * "zone_reboot_on_init_exit" member will cause the + * entire zone to be rebooted instead. If neither of + * these flags is set the zone will shut down. + */ + if (z->zone_reboot_on_init_exit == B_TRUE && + z->zone_restart_init == B_TRUE) { + /* + * Trigger a zone reboot and continue + * with exit processing. + */ + z->zone_init_status = wstat(why, what); + (void) zone_kadmin(A_REBOOT, 0, NULL, + zone_kcred()); + } else { + if (z->zone_restart_init == B_TRUE) { + if (restart_init(what, why) == 0) + return (0); + } + + z->zone_init_status = wstat(why, what); (void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL, - CRED()); + zone_kcred()); } } @@ -407,6 +417,32 @@ proc_exit(int why, int what) z->zone_proc_initpid = -1; } + /* + * Delay firing probes (and performing brand cleanup) until after the + * zone_proc_initpid check. Cases which result in zone shutdown or + * restart via zone_kadmin eventually result in a call back to + * proc_exit. 
+ */ + DTRACE_PROC(lwp__exit); + DTRACE_PROC1(exit, int, why); + + /* + * Perform any brand-specific proc exit processing. Since this + * is always the last lwp, this also performs lwp exit/free and proc + * exit. Brand data will be freed when the process is reaped. + */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_proc_exit(p); + /* + * To ensure that b_proc_exit has access to brand-specific data + * contained by the one remaining lwp, call the freelwp hook as + * the last part of this clean-up process. + */ + BROP(p)->b_freelwp(lwp); + lwp_detach_brand_hdlrs(lwp); + } + lwp_pcb_exit(); /* @@ -650,10 +686,22 @@ proc_exit(int why, int what) if ((q = p->p_child) != NULL && p != proc_init) { struct proc *np; struct proc *initp = proc_init; + pid_t zone_initpid = 1; + struct proc *zoneinitp = NULL; boolean_t setzonetop = B_FALSE; - if (!INGLOBALZONE(curproc)) - setzonetop = B_TRUE; + if (!INGLOBALZONE(curproc)) { + zone_initpid = curproc->p_zone->zone_proc_initpid; + + ASSERT(MUTEX_HELD(&pidlock)); + zoneinitp = prfind(zone_initpid); + if (zoneinitp != NULL) { + initp = zoneinitp; + } else { + zone_initpid = 1; + setzonetop = B_TRUE; + } + } pgdetach(p); @@ -665,7 +713,8 @@ proc_exit(int why, int what) */ delete_ns(q->p_parent, q); - q->p_ppid = 1; + q->p_ppid = zone_initpid; + q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID); if (setzonetop) { mutex_enter(&q->p_lock); @@ -839,8 +888,50 @@ proc_exit(int why, int what) mutex_exit(&p->p_lock); if (!evaporate) { - p->p_pidflag &= ~CLDPEND; - sigcld(p, sqp); + /* + * The brand-specific code only happens when the brand has a + * function to call in place of sigcld and the parent of the + * exiting process is not the global zone init. If the parent + * is the global zone init, then the process was reparented, + * and we don't want brand code delivering possibly strange + * signals to init. Also, init is not branded, so any + * brand-specific exit data will not be picked up by init anyway. + */ + if (PROC_IS_BRANDED(p) && + BROP(p)->b_exit_with_sig != NULL && + p->p_ppid != 1) { + /* + * The code for _fini that could unload the brand_t + * blocks until the count of zones using the module + * reaches zero. Zones decrement the refcount on their + * brands only after all user tasks in that zone have + * exited and been waited on. The decrement on the + * brand's refcount happens in zone_destroy(). That + * depends on zone_shutdown() having been completed. + * zone_shutdown() includes a call to zone_empty(), + * where the zone waits for itself to reach the state + * ZONE_IS_EMPTY. This state is only set in either + * zone_shutdown(), when there are no user processes as + * the zone enters this function, or in + * zone_task_rele(). zone_task_rele() is called from + * code triggered by waiting on processes, not by the + * processes exiting through proc_exit(). This means + * all the branded processes that could exist for a + * specific brand_t must exit and get reaped before the + * refcount on the brand_t can reach 0. _fini will + * never unload the corresponding brand module before + * proc_exit finishes execution for all processes + * branded with a particular brand_t, which makes the + * operation below safe to do. Brands that wish to use + * this mechanism must wait in _fini as described + * above.
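The comment above imposes a convention on brand modules that use b_exit_with_sig: _fini must not allow the module to unload until no process can still be executing branded code. A hedged sketch of what such a _fini might look like (mybrand_zone_count is a hypothetical counter the module would maintain; the module's usual modlinkage is assumed):

static volatile uint_t mybrand_zone_count;	/* hypothetical refcount */

int
_fini(void)
{
	/*
	 * Wait until no zone (and therefore no branded process awaiting
	 * reaping) still references this brand before unloading.
	 */
	while (mybrand_zone_count > 0)
		delay(hz);

	return (mod_remove(&modlinkage));
}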
+ */ + BROP(p)->b_exit_with_sig(p, sqp); + } else { + p->p_pidflag &= ~CLDPEND; + sigcld(p, sqp); + } + } else { /* * Do what sigcld() would do if the disposition @@ -919,10 +1010,9 @@ winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) int waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) { - int found; proc_t *cp, *pp; - int proc_gone; int waitflag = !(options & WNOWAIT); + boolean_t have_brand_helper = B_FALSE; /* * Obsolete flag, defined here only for binary compatibility @@ -950,7 +1040,8 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) pp = ttoproc(curthread); /* - * lock parent mutex so that sibling chain can be searched. + * Anytime you are looking for a process, you take pidlock to prevent + * things from changing as you look. */ mutex_enter(&pidlock); @@ -970,10 +1061,37 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) return (ECHILD); } - while (pp->p_child != NULL) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_waitid_helper != NULL) { + have_brand_helper = B_TRUE; + } + + while (pp->p_child != NULL || have_brand_helper) { + boolean_t brand_wants_wait = B_FALSE; + int proc_gone = 0; + int found = 0; + + /* + * Give the brand a chance to return synthetic results from + * this waitid() call before we do the real thing. + */ + if (have_brand_helper) { + int ret; + + if (BROP(pp)->b_waitid_helper(idtype, id, ip, options, + &brand_wants_wait, &ret) == 0) { + mutex_exit(&pidlock); + return (ret); + } - proc_gone = 0; + if (pp->p_child == NULL) { + goto no_real_children; + } + } + /* + * Look for interesting children in the newstate list. + */ + VERIFY(pp->p_child != NULL); for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) continue; @@ -981,6 +1099,11 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { @@ -1025,12 +1148,16 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * Wow! None of the threads on the p_sibling_ns list were * interesting threads. Check all the kids! */ - found = 0; for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { if (idtype == P_PID && id != cp->p_pid) continue; if (idtype == P_PGID && id != cp->p_pgrp) continue; + if (PROC_IS_BRANDED(pp)) { + if (BROP(pp)->b_wait_filter != NULL && + BROP(pp)->b_wait_filter(pp, cp) == B_FALSE) + continue; + } switch (cp->p_wcode) { case CLD_TRAPPED: @@ -1099,11 +1226,12 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) break; } +no_real_children: /* * If we found no interesting processes at all, * break out and return ECHILD. */ - if (found + proc_gone == 0) + if (!brand_wants_wait && (found + proc_gone == 0)) break; if (options & WNOHANG) { @@ -1122,7 +1250,7 @@ waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) * change state while we wait, we don't wait at all. * Get out with ECHILD according to SVID. 
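The b_waitid_helper hook above gives a brand the first chance to satisfy (or keep alive) a waitid() call, e.g. to synthesize status for emulated process trees. Judging from the call site, a helper returns 0 with *ret filled in when it has handled the call, and nonzero to fall through to the normal child scan; a do-nothing skeleton might look like:

/* hedged skeleton of a brand waitid helper that always declines */
static int
mybrand_waitid_helper(idtype_t idtype, id_t id, k_siginfo_t *ip,
    int options, boolean_t *brand_wants_wait, int *ret)
{
	*brand_wants_wait = B_FALSE;	/* don't hold waitid() open for us */
	return (-1);			/* decline; use the normal path */
}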
*/ - if (found == proc_gone) + if (!brand_wants_wait && (found == proc_gone)) break; if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { @@ -1218,6 +1346,12 @@ freeproc(proc_t *p) p->p_killsqp = NULL; } + /* Clear any remaining brand data */ + if (PROC_IS_BRANDED(p)) { + brand_clearbrand(p); + } + + prfree(p); /* inform /proc */ /* diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 6dc0d00011..bfee77130d 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent Inc. All rights reserved. + * Copyright 2015, Joyent Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -40,6 +40,7 @@ #include <sys/vnode.h> #include <sys/pathname.h> #include <sys/file.h> +#include <sys/flock.h> #include <sys/proc.h> #include <sys/var.h> #include <sys/cpuvar.h> @@ -851,7 +852,8 @@ flist_fork(uf_info_t *pfip, uf_info_t *cfip) */ cfip->fi_nfiles = nfiles = flist_minsize(pfip); - cfip->fi_list = kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); + cfip->fi_list = nfiles == 0 ? NULL : + kmem_zalloc(nfiles * sizeof (uf_entry_t), KM_SLEEP); for (fd = 0, pufp = pfip->fi_list, cufp = cfip->fi_list; fd < nfiles; fd++, pufp++, cufp++) { @@ -952,6 +954,8 @@ closef(file_t *fp) return (error); } ASSERT(fp->f_count == 0); + /* Last reference; remove any OFD-style lock for the file_t */ + ofdcleanlock(fp); mutex_exit(&fp->f_tlock); /* @@ -1209,7 +1213,8 @@ f_getfl(int fd, int *flagp) error = EBADF; else { vnode_t *vp = fp->f_vnode; - int flag = fp->f_flag | (fp->f_flag2 << 16); + int flag = fp->f_flag | + ((fp->f_flag2 & ~FEPOLLED) << 16); /* * BSD fcntl() FASYNC compatibility. diff --git a/usr/src/uts/common/os/flock.c b/usr/src/uts/common/os/flock.c index 5dad4abb61..a54d6028d5 100644 --- a/usr/src/uts/common/os/flock.c +++ b/usr/src/uts/common/os/flock.c @@ -29,6 +29,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/flock_impl.h> @@ -243,9 +244,293 @@ flk_get_lockmgr_status(void) } /* - * Routine called from fs_frlock in fs/fs_subr.c + * This implements Open File Description (not descriptor) style record locking. + * These locks can also be thought of as pid-less since they are not tied to a + * specific process, thus they're preserved across fork. + * + * Called directly from fcntl. + * + * See reclock() for the implementation of the traditional POSIX style record + * locking scheme (pid-ful). This function is derived from reclock() but + * simplified and modified to work for OFD style locking. + * + * The two primary advantages of OFD-style locking are: + * 1) It is per-file description, so closing a file descriptor that refers to a + * different file description for the same file will not drop the lock (i.e. + * two opens of the same file get different descriptions but a dup or fork + * will refer to the same description). + * 2) Locks are preserved across fork(2). + * + * Because these locks are per-description, a lock pointer lives at the + * f_filock member of the file_t and the lock_descriptor includes a file_t + * pointer to enable unique lock identification and management. + * + * Since these locks are pid-less we cannot do deadlock detection with the + * current process-oriented implementation. This is consistent with OFD locking + * behavior on other operating systems such as Linux.
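To make the per-description semantics just described concrete, a hedged userland sketch using the F_OFD_SETLK command this change introduces (the whole-file range reflects the validation performed in fcntl, per the comment further below):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl;
	int fd, dupfd;

	if ((fd = open("/tmp/ofd-demo", O_RDWR | O_CREAT, 0644)) < 0)
		return (1);
	dupfd = dup(fd);		/* same file description */

	(void) memset(&fl, 0, sizeof (fl));
	fl.l_type = F_WRLCK;		/* l_whence/l_start/l_len stay 0: */
					/* only whole-file locks for now */

	if (fcntl(fd, F_OFD_SETLK, &fl) != 0)
		return (1);

	(void) close(dupfd);		/* not the last reference: kept */
	(void) close(fd);		/* last close: ofdcleanlock() drops it */
	return (0);
}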
Since we don't do + * deadlock detection we never interact with the process graph that is + * maintained for deadlock detection on the traditional POSIX-style locks. + * + * Future Work: + * + * The current implementation does not support record locks. That is, + * currently the single lock must cover the entire file. This is validated in + * fcntl. To support record locks the f_filock pointer in the file_t needs to + * be changed to a list of pointers to the locks. That list needs to be + * managed independently of the lock list on the vnode itself and it needs to + * be maintained as record locks are created, split, coalesced and deleted. + * + * The current implementation does not support remote file systems (e.g. + * NFS or CIFS). This is handled in fs_frlock(). The design of how OFD locks + * interact with the NLM is not clear since the NLM protocol/implementation + * appears to be oriented around locks associated with a process. A further + * problem is that a design is needed for what nlm_send_siglost() should do and + * where it will send SIGLOST. More recent versions of Linux apparently try to + * emulate OFD locks on NFS by converting them to traditional POSIX style locks + * that work with the NLM. It is not clear that this provides the correct + * semantics in all cases. */ +int +ofdlock(file_t *fp, int fcmd, flock64_t *lckdat, int flag, u_offset_t offset) +{ + int cmd = 0; + vnode_t *vp; + lock_descriptor_t stack_lock_request; + lock_descriptor_t *lock_request; + int error = 0; + graph_t *gp; + int serialize = 0; + + if (fcmd != F_OFD_GETLK) + cmd = SETFLCK; + + if (fcmd == F_OFD_SETLKW || fcmd == F_FLOCKW) + cmd |= SLPFLCK; + + /* see block comment */ + VERIFY(lckdat->l_whence == 0); + VERIFY(lckdat->l_start == 0); + VERIFY(lckdat->l_len == 0); + + vp = fp->f_vnode; + + /* + * For reclock fs_frlock() would normally have set these in a few + * places but for us it's cleaner to centralize it here. Note that + * IGN_PID is -1. We use 0 for our pid-less locks. + */ + lckdat->l_pid = 0; + lckdat->l_sysid = 0; + + /* + * Check access permissions + */ + if ((fcmd == F_OFD_SETLK || fcmd == F_OFD_SETLKW) && + ((lckdat->l_type == F_RDLCK && (flag & FREAD) == 0) || + (lckdat->l_type == F_WRLCK && (flag & FWRITE) == 0))) + return (EBADF); + + /* + * for query and unlock we use the stack_lock_request + */ + if (lckdat->l_type == F_UNLCK || !(cmd & SETFLCK)) { + lock_request = &stack_lock_request; + (void) bzero((caddr_t)lock_request, + sizeof (lock_descriptor_t)); + + /* + * following is added to make the assertions in + * flk_execute_request() pass + */ + lock_request->l_edge.edge_in_next = &lock_request->l_edge; + lock_request->l_edge.edge_in_prev = &lock_request->l_edge; + lock_request->l_edge.edge_adj_next = &lock_request->l_edge; + lock_request->l_edge.edge_adj_prev = &lock_request->l_edge; + lock_request->l_status = FLK_INITIAL_STATE; + } else { + lock_request = flk_get_lock(); + fp->f_filock = (struct filock *)lock_request; + } + lock_request->l_state = 0; + lock_request->l_vnode = vp; + lock_request->l_zoneid = getzoneid(); + lock_request->l_ofd = fp; + + /* + * Convert the request range into the canonical start and end + * values then check the validity of the lock range. 
+ */ + error = flk_convert_lock_data(vp, lckdat, &lock_request->l_start, + &lock_request->l_end, offset); + if (error) + goto done; + + error = flk_check_lock_data(lock_request->l_start, lock_request->l_end, + MAXEND); + if (error) + goto done; + + ASSERT(lock_request->l_end >= lock_request->l_start); + + lock_request->l_type = lckdat->l_type; + if (cmd & SLPFLCK) + lock_request->l_state |= WILLING_TO_SLEEP_LOCK; + + if (!(cmd & SETFLCK)) { + if (lock_request->l_type == F_RDLCK || + lock_request->l_type == F_WRLCK) + lock_request->l_state |= QUERY_LOCK; + } + lock_request->l_flock = (*lckdat); + + /* + * We are ready for processing the request + */ + + if (fcmd != F_OFD_GETLK && lock_request->l_type != F_UNLCK && + nbl_need_check(vp)) { + nbl_start_crit(vp, RW_WRITER); + serialize = 1; + } + /* Get the lock graph for a particular vnode */ + gp = flk_get_lock_graph(vp, FLK_INIT_GRAPH); + + mutex_enter(&gp->gp_mutex); + + lock_request->l_state |= REFERENCED_LOCK; + lock_request->l_graph = gp; + + switch (lock_request->l_type) { + case F_RDLCK: + case F_WRLCK: + if (IS_QUERY_LOCK(lock_request)) { + flk_get_first_blocking_lock(lock_request); + if (lock_request->l_ofd != NULL) + lock_request->l_flock.l_pid = -1; + (*lckdat) = lock_request->l_flock; + } else { + /* process the request now */ + error = flk_process_request(lock_request); + } + break; + + case F_UNLCK: + /* unlock request will not block so execute it immediately */ + error = flk_execute_request(lock_request); + break; + + default: + error = EINVAL; + break; + } + + if (lock_request == &stack_lock_request) { + flk_set_state(lock_request, FLK_DEAD_STATE); + } else { + lock_request->l_state &= ~REFERENCED_LOCK; + if ((error != 0) || IS_DELETED(lock_request)) { + flk_set_state(lock_request, FLK_DEAD_STATE); + flk_free_lock(lock_request); + } + } + + mutex_exit(&gp->gp_mutex); + if (serialize) + nbl_end_crit(vp); + + return (error); + +done: + flk_set_state(lock_request, FLK_DEAD_STATE); + if (lock_request != &stack_lock_request) + flk_free_lock(lock_request); + return (error); +} + +/* + * Remove any lock on the vnode belonging to the given file_t. + * Called from closef on last close, file_t is locked. + * + * This is modeled on the cleanlocks() function but only removes the single + * lock associated with fp. + */ +void +ofdcleanlock(file_t *fp) +{ + lock_descriptor_t *fplock, *lock, *nlock; + vnode_t *vp; + graph_t *gp; + + ASSERT(MUTEX_HELD(&fp->f_tlock)); + + if ((fplock = (lock_descriptor_t *)fp->f_filock) == NULL) + return; + + fp->f_filock = NULL; + vp = fp->f_vnode; + + gp = flk_get_lock_graph(vp, FLK_USE_GRAPH); + + if (gp == NULL) + return; + mutex_enter(&gp->gp_mutex); + + CHECK_SLEEPING_LOCKS(gp); + CHECK_ACTIVE_LOCKS(gp); + + SET_LOCK_TO_FIRST_SLEEP_VP(gp, lock, vp); + + if (lock) { + do { + nlock = lock->l_next; + if (fplock == lock) { + CANCEL_WAKEUP(lock); + break; + } + lock = nlock; + } while (lock->l_vnode == vp); + } + + SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); + + if (lock) { + do { + nlock = lock->l_next; + if (fplock == lock) { + flk_delete_active_lock(lock, 0); + flk_wakeup(lock, 1); + flk_free_lock(lock); + break; + } + lock = nlock; + } while (lock->l_vnode == vp); + } + + CHECK_SLEEPING_LOCKS(gp); + CHECK_ACTIVE_LOCKS(gp); + mutex_exit(&gp->gp_mutex); +} + +/* + * Routine called from fs_frlock in fs/fs_subr.c + * + * This implements traditional POSIX style record locking. 
The two primary + * drawbacks to this style of locking are: + * 1) It is per-process, so any close of a file descriptor that refers to the + * file will drop the lock (e.g. lock /etc/passwd, call a library function + * which opens /etc/passwd to read the file; when the library closes its + * file descriptor, the application loses its lock and does not know). + * 2) Locks are not preserved across fork(2). + * + * Because these locks are only associated with a pid they are per-process. + * This is why any close will drop the lock and is also why once the process + * forks then the lock is no longer related to the new process. These locks can + * be considered pid-ful. + * + * See ofdlock() for the implementation of a similar but improved locking + * scheme. + */ int reclock(vnode_t *vp, flock64_t *lckdat, @@ -424,6 +709,8 @@ reclock(vnode_t *vp, case F_WRLCK: if (IS_QUERY_LOCK(lock_request)) { flk_get_first_blocking_lock(lock_request); + if (lock_request->l_ofd != NULL) + lock_request->l_flock.l_pid = -1; (*lckdat) = lock_request->l_flock; break; } @@ -712,7 +999,13 @@ flk_get_lock(void) void flk_free_lock(lock_descriptor_t *lock) { + file_t *fp; + ASSERT(IS_DEAD(lock)); + + if ((fp = lock->l_ofd) != NULL) + fp->f_filock = NULL; + if (IS_REFERENCED(lock)) { lock->l_state |= DELETED_LOCK; return; } @@ -1214,7 +1507,7 @@ flk_add_edge(lock_descriptor_t *from_lock, lock_descriptor_t *to_lock, from_lock->l_edge.edge_adj_next = edge; /* - * put in in list of to vertex + * put in list of to vertex */ to_lock->l_edge.edge_in_next->edge_in_prev = edge; @@ -2601,9 +2894,11 @@ flk_canceled(lock_descriptor_t *request) } /* - * Remove all the locks for the vnode belonging to the given pid and sysid. + * Remove all non-OFD locks for the vnode belonging to the given pid and sysid. + * That is, since OFD locks are pid-less we'll never match on the incoming + * pid. OFD locks are removed earlier in the close() path via closef() and + * ofdcleanlock(). */ - void cleanlocks(vnode_t *vp, pid_t pid, int sysid) { @@ -2770,6 +3065,14 @@ flk_check_deadlock(lock_descriptor_t *lock) edge_t *ep, *nep; proc_vertex_t *process_stack; + /* + * OFD style locks are not associated with any process so there is + * no proc graph for these. Thus we cannot, and do not, do deadlock + * detection. + */ + if (lock->l_ofd != NULL) + return (0); + STACK_INIT(process_stack); mutex_enter(&flock_lock); @@ -3062,6 +3365,16 @@ flk_update_proc_graph(edge_t *ep, int delete) proc_edge_t *pep, *prevpep; mutex_enter(&flock_lock); + + /* + * OFD style locks are not associated with any process so there is + * no proc graph for these. + */ + if (ep->from_vertex->l_ofd != NULL) { + mutex_exit(&flock_lock); + return; + } + toproc = flk_get_proc_vertex(ep->to_vertex); fromproc = flk_get_proc_vertex(ep->from_vertex); @@ -3891,6 +4204,7 @@ report_blocker(lock_descriptor_t *blocker, lock_descriptor_t *request) flrp->l_type = blocker->l_type; flrp->l_pid = blocker->l_flock.l_pid; flrp->l_sysid = blocker->l_flock.l_sysid; + request->l_ofd = blocker->l_ofd; if (IS_LOCKMGR(request)) { flrp->l_start = blocker->l_start; @@ -4206,6 +4520,10 @@ check_owner_locks(graph_t *gp, pid_t pid, int sysid, vnode_t *vp) { lock_descriptor_t *lock; + /* Ignore OFD style locks since they're not process-wide.
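Both the reclock() and ofdlock() query paths above report an OFD-style blocker with l_pid set to -1, since no single process owns such a lock. A hedged userland fragment (assumes fd is an open descriptor, inside some function) checking for that case:

	struct flock fl;

	(void) memset(&fl, 0, sizeof (fl));
	fl.l_type = F_WRLCK;	/* would a write lock be blocked? */

	if (fcntl(fd, F_OFD_GETLK, &fl) == 0 && fl.l_type != F_UNLCK) {
		/*
		 * A blocker exists. If it is an OFD-style lock there is
		 * no owning process, and l_pid reads as -1.
		 */
		if (fl.l_pid == -1)
			(void) printf("blocked by an OFD lock\n");
	}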
*/ + if (pid == 0) + return; + SET_LOCK_TO_FIRST_ACTIVE_VP(gp, lock, vp); if (lock) { diff --git a/usr/src/uts/common/os/fork.c b/usr/src/uts/common/os/fork.c index 210a301850..fcd33a7788 100644 --- a/usr/src/uts/common/os/fork.c +++ b/usr/src/uts/common/os/fork.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -696,7 +696,7 @@ fork_fail(proc_t *cp) if (PTOU(curproc)->u_cwd) refstr_rele(PTOU(curproc)->u_cwd); if (PROC_IS_BRANDED(cp)) { - brand_clearbrand(cp, B_TRUE); + brand_clearbrand(cp); } } @@ -1004,6 +1004,9 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_t1_lgrpid = LGRP_NONE; cp->p_tr_lgrpid = LGRP_NONE; + /* Default to native brand initially */ + cp->p_brand = &native_brand; + if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) { if (nproc == v.v_proc) { CPU_STATS_ADDQ(CPU, sys, procovf, 1); @@ -1071,9 +1074,6 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD); cp->p_sessp = pp->p_sessp; sess_hold(pp); - cp->p_brand = pp->p_brand; - if (PROC_IS_BRANDED(pp)) - BROP(pp)->b_copy_procdata(cp, pp); cp->p_bssbase = pp->p_bssbase; cp->p_brkbase = pp->p_brkbase; cp->p_brksize = pp->p_brksize; @@ -1153,6 +1153,11 @@ getproc(proc_t **cpp, pid_t pid, uint_t flags) mutex_exit(&cp->p_lock); mutex_exit(&pidlock); + if (PROC_IS_BRANDED(pp)) { + brand_setbrand(cp); + BROP(pp)->b_copy_procdata(cp, pp); + } + avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t), offsetof(contract_t, ct_ctlist)); diff --git a/usr/src/uts/common/os/grow.c b/usr/src/uts/common/os/grow.c index f5e92cfd94..0c4c0bcad6 100644 --- a/usr/src/uts/common/os/grow.c +++ b/usr/src/uts/common/os/grow.c @@ -19,7 +19,10 @@ * CDDL HEADER END */ -/* Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright 2013 OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. @@ -52,6 +55,7 @@ #include <sys/fcntl.h> #include <sys/lwpchan_impl.h> #include <sys/nbmlock.h> +#include <sys/brand.h> #include <vm/hat.h> #include <vm/as.h> @@ -522,6 +526,20 @@ choose_addr(struct as *as, caddr_t *addrp, size_t len, offset_t off, return (0); } +caddr_t +map_userlimit(proc_t *pp, struct as *as, int flags) +{ + if (flags & _MAP_LOW32) { + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_map32limit != NULL) { + return ((caddr_t)(uintptr_t)BROP(pp)->b_map32limit(pp)); + } else { + return ((caddr_t)_userlimit32); + } + } + + return (as->a_userlimit); +} + /* * Used for MAP_ANON - fast way to get anonymous pages @@ -537,8 +555,6 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, return (EACCES); if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -547,9 +563,8 @@ zmap(struct as *as, caddr_t *addrp, size_t len, uint_t uprot, int flags, if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - userlimit = flags & _MAP_LOW32 ? 
- (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(as->a_proc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: @@ -717,8 +732,6 @@ smmap_common(caddr_t *addrp, size_t len, * If the user specified an address, do some simple checks here */ if ((flags & MAP_FIXED) != 0) { - caddr_t userlimit; - /* * Use the user address. First verify that * the address to be used is page aligned. @@ -726,10 +739,8 @@ smmap_common(caddr_t *addrp, size_t len, */ if (((uintptr_t)*addrp & PAGEOFFSET) != 0) return (EINVAL); - - userlimit = flags & _MAP_LOW32 ? - (caddr_t)USERLIMIT32 : as->a_userlimit; - switch (valid_usr_range(*addrp, len, uprot, as, userlimit)) { + switch (valid_usr_range(*addrp, len, uprot, as, + map_userlimit(curproc, as, flags))) { case RANGE_OKAY: break; case RANGE_BADPROT: diff --git a/usr/src/uts/common/os/id_space.c b/usr/src/uts/common/os/id_space.c deleted file mode 100644 index 2dad0cb940..0000000000 --- a/usr/src/uts/common/os/id_space.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. - */ - -#include <sys/types.h> -#include <sys/id_space.h> -#include <sys/debug.h> - -/* - * ID Spaces - * - * The id_space_t provides a simple implementation of a managed range of - * integer identifiers using a vmem arena. An ID space guarantees that the - * next identifer returned by an allocation is larger than the previous one, - * unless there are no larger slots remaining in the range. In this case, - * the ID space will return the first available slot in the lower part of the - * range (viewing the previous identifier as a partitioning element). If no - * slots are available, id_alloc()/id_allocff() will sleep until an - * identifier becomes available. Accordingly, id_space allocations must be - * initiated from contexts where sleeping is acceptable. id_alloc_nosleep()/ - * id_allocff_nosleep() will return -1 if no slots are available or if the - * system is low on memory. If id_alloc_nosleep() fails, callers should - * not try to extend the ID space. This is to avoid making a possible - * low-memory situation worse. - * - * As an ID space is designed for representing a range of id_t's, there - * is a preexisting maximal range: [0, MAXUID]. ID space requests outside - * that range will fail on a DEBUG kernel. The id_allocff*() functions - * return the first available id, and should be used when there is benefit - * to having a compact allocated range. 
- * - * (Presently, the id_space_t abstraction supports only direct allocations; ID - * reservation, in which an ID is allocated but placed in a internal - * dictionary for later use, should be added when a consuming subsystem - * arrives.) - */ - -#define ID_TO_ADDR(id) ((void *)(uintptr_t)(id + 1)) -#define ADDR_TO_ID(addr) ((id_t)((uintptr_t)addr - 1)) - -/* - * Create an arena to represent the range [low, high). - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_space_t * -id_space_create(const char *name, id_t low, id_t high) -{ - ASSERT(low >= 0); - ASSERT(low < high); - - return (vmem_create(name, ID_TO_ADDR(low), high - low, 1, - NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER)); -} - -/* - * Destroy a previously created ID space. - * No restrictions on caller's context. - */ -void -id_space_destroy(id_space_t *isp) -{ - vmem_destroy(isp); -} - -void -id_space_extend(id_space_t *isp, id_t low, id_t high) -{ - (void) vmem_add(isp, ID_TO_ADDR(low), high - low, VM_SLEEP); -} - -/* - * Allocate an id_t from specified ID space. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_alloc(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space. - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_alloc_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_NEXTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT. - * Caller must be in a context in which VM_SLEEP is legal. - */ -id_t -id_allocff(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_SLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate an id_t from specified ID space using FIRSTFIT - * Returns -1 on failure (see module block comments for more information on - * failure modes). - */ -id_t -id_allocff_nosleep(id_space_t *isp) -{ - return (ADDR_TO_ID(vmem_alloc(isp, 1, VM_NOSLEEP | VM_FIRSTFIT))); -} - -/* - * Allocate a specific identifier if possible, returning the id if - * successful, or -1 on failure. - */ -id_t -id_alloc_specific_nosleep(id_space_t *isp, id_t id) -{ - void *minaddr = ID_TO_ADDR(id); - void *maxaddr = ID_TO_ADDR(id + 1); - - /* - * Note that even though we're vmem_free()ing this later, it - * should be OK, since there's no quantum cache. - */ - return (ADDR_TO_ID(vmem_xalloc(isp, 1, 1, 0, 0, - minaddr, maxaddr, VM_NOSLEEP))); -} - -/* - * Free a previously allocated ID. - * No restrictions on caller's context. - */ -void -id_free(id_space_t *isp, id_t id) -{ - vmem_free(isp, ID_TO_ADDR(id), 1); -} diff --git a/usr/src/uts/common/os/kmem.c b/usr/src/uts/common/os/kmem.c index af94c2d16f..9d38183ce1 100644 --- a/usr/src/uts/common/os/kmem.c +++ b/usr/src/uts/common/os/kmem.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. 
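The deletion above removes id_space.c from uts/common/os; the interface its block comment documents remains in use by kernel consumers, so the implementation presumably moves to a shared location elsewhere in this changeset (not visible here). For reference, typical usage of the API as documented in that comment:

	id_space_t *ids;
	id_t id;

	ids = id_space_create("my_ids", 0, 128);	/* ids in [0, 128) */

	id = id_alloc(ids);		/* sleeps until an ID is free */
	/* ... use id ... */
	id_free(ids, id);

	if ((id = id_alloc_nosleep(ids)) == -1) {
		/* no IDs free (or memory is low); do not try to extend */
	}

	id_space_destroy(ids);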
*/ /* @@ -1002,6 +1003,13 @@ size_t kmem_lite_maxalign = 1024; /* maximum buffer alignment for KMF_LITE */ int kmem_lite_pcs = 4; /* number of PCs to store in KMF_LITE mode */ size_t kmem_maxverify; /* maximum bytes to inspect in debug routines */ size_t kmem_minfirewall; /* hardware-enforced redzone threshold */ +int kmem_warn_zerosized = 1; /* whether to warn on zero-sized KM_SLEEP */ + +#ifdef DEBUG +int kmem_panic_zerosized = 1; /* whether to panic on zero-sized KM_SLEEP */ +#else +int kmem_panic_zerosized = 0; /* whether to panic on zero-sized KM_SLEEP */ +#endif #ifdef _LP64 size_t kmem_max_cached = KMEM_BIG_MAXBUF; /* maximum kmem_alloc cache */ @@ -1036,6 +1044,8 @@ static vmem_t *kmem_default_arena; static vmem_t *kmem_firewall_va_arena; static vmem_t *kmem_firewall_arena; +static int kmem_zerosized; /* # of zero-sized allocs */ + /* * Define KMEM_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. @@ -2925,8 +2935,31 @@ kmem_alloc(size_t size, int kmflag) /* fall through to kmem_cache_alloc() */ } else { - if (size == 0) + if (size == 0) { + if (kmflag != KM_SLEEP && !(kmflag & KM_PANIC)) + return (NULL); + + /* + * If this is a sleeping allocation or one that has + * been specified to panic on allocation failure, we + * consider it to be deprecated behavior to allocate + * 0 bytes. If we have been configured to panic under + * this condition, we panic; if to warn, we warn -- and + * regardless, we bump a counter to at least indicate + * that this condition has occurred. + */ + if (kmem_panic && kmem_panic_zerosized) + panic("attempted to kmem_alloc() size of 0"); + + if (kmem_warn_zerosized) { + cmn_err(CE_WARN, "kmem_alloc(): sleeping " + "allocation with size of 0"); + } + + kmem_zerosized++; + return (NULL); + } buf = vmem_alloc(kmem_oversize_arena, size, kmflag & KM_VMFLAGS); @@ -4120,7 +4153,8 @@ kmem_cache_destroy(kmem_cache_t *cp) if (kmem_taskq != NULL) taskq_wait(kmem_taskq); - if (kmem_move_taskq != NULL) + + if (kmem_move_taskq != NULL && cp->cache_defrag != NULL) taskq_wait(kmem_move_taskq); kmem_cache_magazine_purge(cp); diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 149f5f8a88..6a603c8982 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2013 Gary Mills * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ #include <sys/types.h> @@ -249,8 +250,7 @@ log_init(void) */ printf("\rSunOS Release %s Version %s %u-bit\n", utsname.release, utsname.version, NBBY * (uint_t)sizeof (void *)); - printf("Copyright (c) 1983, 2010, Oracle and/or its affiliates. " - "All rights reserved.\n"); + printf("Copyright (c) 2010-2015, Joyent Inc. All rights reserved.\n"); #ifdef DEBUG printf("DEBUG enabled\n"); #endif diff --git a/usr/src/uts/common/os/lwp.c b/usr/src/uts/common/os/lwp.c index feb8e76c42..f5be1b40b3 100644 --- a/usr/src/uts/common/os/lwp.c +++ b/usr/src/uts/common/os/lwp.c @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
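With the kmem_alloc() change above, a zero-byte KM_SLEEP allocation now warns (or panics on DEBUG kernels, where kmem_panic_zerosized defaults to 1) and returns NULL, so callers that can legitimately see a zero count should guard the call, as the flist_fork() hunk earlier in this diff does. The idiom, with hypothetical nelems/elem_t:

	/* mirror of the flist_fork() idiom for a possibly-zero count */
	buf = (nelems == 0) ? NULL :
	    kmem_zalloc(nelems * sizeof (elem_t), KM_SLEEP);

	/* ... */

	if (buf != NULL)
		kmem_free(buf, nelems * sizeof (elem_t));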
*/ #include <sys/param.h> @@ -115,7 +115,7 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, ret_tidhash_t *ret_tidhash = NULL; int i; int rctlfail = 0; - boolean_t branded = 0; + void *brand_data = NULL; struct ctxop *ctx = NULL; ASSERT(cid != sysdccid); /* system threads must start in SYS */ @@ -283,6 +283,19 @@ lwp_create(void (*proc)(), caddr_t arg, size_t len, proc_t *p, */ lep = kmem_zalloc(sizeof (*lep), KM_SLEEP); + /* + * If necessary, speculatively allocate lwp brand data. This is done + * ahead of time so p_lock need not be dropped during lwp branding. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_lwpdata_alloc != NULL) { + if ((brand_data = BROP(p)->b_lwpdata_alloc(p)) == NULL) { + mutex_enter(&p->p_lock); + err = 1; + atomic_inc_32(&p->p_zone->zone_ffmisc); + goto error; + } + } + mutex_enter(&p->p_lock); grow: /* @@ -630,18 +643,6 @@ grow: } while (lwp_hash_lookup(p, t->t_tid) != NULL); } - /* - * If this is a branded process, let the brand do any necessary lwp - * initialization. - */ - if (PROC_IS_BRANDED(p)) { - if (BROP(p)->b_initlwp(lwp)) { - err = 1; - atomic_inc_32(&p->p_zone->zone_ffmisc); - goto error; - } - branded = 1; - } if (t->t_tid == 1) { kpreempt_disable(); @@ -654,7 +655,6 @@ grow: } } - p->p_lwpcnt++; t->t_waitfor = -1; /* @@ -695,9 +695,21 @@ grow: t->t_pre_sys = 1; t->t_post_sys = 1; + /* Complete branded lwp initialization */ + if (PROC_IS_BRANDED(p)) { + BROP(p)->b_initlwp(lwp, brand_data); + /* + * The b_initlwp hook is expected to consume any preallocated + * brand_data in a way that prepares it for deallocation by the + * b_freelwp hook. + */ + brand_data = NULL; + } + /* * Insert the new thread into the list of all threads. */ + p->p_lwpcnt++; if ((tx = p->p_tlist) == NULL) { t->t_back = t; t->t_forw = t; @@ -753,8 +765,9 @@ error: if (cid != NOCLASS && bufp != NULL) CL_FREE(cid, bufp); - if (branded) - BROP(p)->b_freelwp(lwp); + if (brand_data != NULL) { + BROP(p)->b_lwpdata_free(brand_data); + } mutex_exit(&p->p_lock); t->t_state = TS_FREE; @@ -891,13 +904,6 @@ lwp_exit(void) if (t->t_upimutex != NULL) upimutex_cleanup(); - /* - * Perform any brand specific exit processing, then release any - * brand data associated with the lwp - */ - if (PROC_IS_BRANDED(p)) - BROP(p)->b_lwpexit(lwp); - lwp_pcb_exit(); mutex_enter(&p->p_lock); @@ -941,6 +947,17 @@ lwp_exit(void) DTRACE_PROC(lwp__exit); /* + * Perform any brand specific exit processing, then release any + * brand data associated with the lwp + */ + if (PROC_IS_BRANDED(p)) { + mutex_exit(&p->p_lock); + BROP(p)->b_lwpexit(lwp); + BROP(p)->b_freelwp(lwp); + mutex_enter(&p->p_lock); + } + + /* * If the lwp is a detached lwp or if the process is exiting, * remove (lwp_hash_out()) the lwp from the lwp directory. * Otherwise null out the lwp's le_thread pointer in the lwp diff --git a/usr/src/uts/common/os/main.c b/usr/src/uts/common/os/main.c index 7afc1cfe00..dda0b3e4a6 100644 --- a/usr/src/uts/common/os/main.c +++ b/usr/src/uts/common/os/main.c @@ -27,7 +27,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. 
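The lwp_create()/lwp_exit() changes above split lwp branding into cooperating hooks: b_lwpdata_alloc (may sleep, called without p_lock), b_initlwp (called with p_lock held and expected to consume the preallocated data), and b_lwpdata_free/b_freelwp for the error and teardown paths. A hedged sketch of a conforming trio, with signatures inferred from the call sites:

/* hypothetical per-lwp brand state */
typedef struct mybrand_lwp {
	uint64_t	ml_flags;
} mybrand_lwp_t;

static void *
mybrand_lwpdata_alloc(proc_t *p)
{
	/* sleeping allocation is safe here: p_lock is not held */
	return (kmem_zalloc(sizeof (mybrand_lwp_t), KM_SLEEP));
}

static void
mybrand_lwpdata_free(void *data)
{
	kmem_free(data, sizeof (mybrand_lwp_t));
}

static void
mybrand_initlwp(klwp_t *lwp, void *data)
{
	/* consume the preallocated data; b_freelwp will release it */
	lwp->lwp_brand = data;
}

static void
mybrand_freelwp(klwp_t *lwp)
{
	mybrand_lwpdata_free(lwp->lwp_brand);
	lwp->lwp_brand = NULL;
}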
*/ #include <sys/types.h> @@ -157,7 +157,7 @@ exec_init(const char *initpath, const char *args) int error = 0, count = 0; proc_t *p = ttoproc(curthread); klwp_t *lwp = ttolwp(curthread); - int brand_action; + int brand_action = EBA_NONE; if (args == NULL) args = ""; @@ -268,7 +268,15 @@ exec_init(const char *initpath, const char *args) */ sigemptyset(&curthread->t_hold); - brand_action = ZONE_IS_BRANDED(p->p_zone) ? EBA_BRAND : EBA_NONE; + /* + * Only instruct exec_common to brand the process if necessary. It is + * possible that the init process is already properly branded due to the + * proc_exit -> restart_init -> exec_init call chain. + */ + if (ZONE_IS_BRANDED(p->p_zone) && + p->p_brand != p->p_zone->zone_brand) { + brand_action = EBA_BRAND; + } again: error = exec_common((const char *)(uintptr_t)exec_fnamep, (const char **)(uintptr_t)uap, NULL, brand_action); diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c index 928c6b3bb4..66994321f7 100644 --- a/usr/src/uts/common/os/msacct.c +++ b/usr/src/uts/common/os/msacct.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -566,27 +567,18 @@ cpu_update_pct(kthread_t *t, hrtime_t newtime) */ do { - if (T_ONPROC(t) && t->t_waitrq == 0) { - hrlb = t->t_hrtime; + pctcpu = t->t_pctcpu; + hrlb = t->t_hrtime; + delta = newtime - hrlb; + if (delta < 0) { + newtime = gethrtime_unscaled(); delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; + } + t->t_hrtime = newtime; + scalehrtime(&delta); + if (T_ONPROC(t) && t->t_waitrq == 0) { npctcpu = cpu_grow(pctcpu, delta); } else { - hrlb = t->t_hrtime; - delta = newtime - hrlb; - if (delta < 0) { - newtime = gethrtime_unscaled(); - delta = newtime - hrlb; - } - t->t_hrtime = newtime; - scalehrtime(&delta); - pctcpu = t->t_pctcpu; npctcpu = cpu_decay(pctcpu, delta); } } while (atomic_cas_32(&t->t_pctcpu, pctcpu, npctcpu) != pctcpu); diff --git a/usr/src/uts/common/os/pid.c b/usr/src/uts/common/os/pid.c index b555bb82b7..39db5cb27d 100644 --- a/usr/src/uts/common/os/pid.c +++ b/usr/src/uts/common/os/pid.c @@ -112,6 +112,18 @@ pid_lookup(pid_t pid) return (pidp); } +struct pid * +pid_find(pid_t pid) +{ + struct pid *pidp; + + mutex_enter(&pidlinklock); + pidp = pid_lookup(pid); + mutex_exit(&pidlinklock); + + return (pidp); +} + void pid_setmin(void) { diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c index 05deaaf43b..d2bdb4ce37 100644 --- a/usr/src/uts/common/os/policy.c +++ b/usr/src/uts/common/os/policy.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #include <sys/types.h> @@ -55,6 +55,7 @@ #include <sys/mntent.h> #include <sys/contract_impl.h> #include <sys/dld_ioc.h> +#include <sys/brand.h> /* * There are two possible layers of privilege routines and two possible @@ -1243,6 +1244,22 @@ secpolicy_vnode_owner(const cred_t *cr, uid_t owner) void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { + proc_t *p = curproc; + + /* + * Allow the brand to override this behaviour. 
+ */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_setid_clear != NULL) { + /* + * This brand hook will return 0 if handling is complete, or + * some other value if the brand would like us to fall back to + * the usual behaviour. + */ + if (BROP(p)->b_setid_clear(vap, cr) == 0) { + return; + } + } + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(cr, (vap->va_mode & S_ISUID) != 0 && @@ -2072,6 +2089,19 @@ secpolicy_tasksys(const cred_t *cr) } int +secpolicy_meminfo(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_PROC_MEMINFO, B_FALSE, EPERM, NULL)); +} + +int +secpolicy_fs_import(const cred_t *cr) +{ + return (PRIV_POLICY(cr, PRIV_SYS_FS_IMPORT, B_FALSE, EPERM, NULL)); +} + + +int secpolicy_pfexec_register(const cred_t *cr) { return (PRIV_POLICY(cr, PRIV_SYS_ADMIN, B_TRUE, EPERM, NULL)); @@ -2575,3 +2605,11 @@ secpolicy_ppp_config(const cred_t *cr) return (secpolicy_net_config(cr, B_FALSE)); return (PRIV_POLICY(cr, PRIV_SYS_PPP_CONFIG, B_FALSE, EPERM, NULL)); } + +int +secpolicy_hyprlofs_control(const cred_t *cr) +{ + if (PRIV_POLICY(cr, PRIV_HYPRLOFS_CONTROL, B_FALSE, EPERM, NULL)) + return (EPERM); + return (0); +} diff --git a/usr/src/uts/common/os/priv_defs b/usr/src/uts/common/os/priv_defs index bfacce1739..cc1c5e03a6 100644 --- a/usr/src/uts/common/os/priv_defs +++ b/usr/src/uts/common/os/priv_defs @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. * INSERT COMMENT */ @@ -177,6 +177,10 @@ privilege PRIV_GRAPHICS_MAP Allows a process to perform privileged mappings through a graphics device. +privilege PRIV_HYPRLOFS_CONTROL + + Allows a process to manage hyprlofs entries. + privilege PRIV_IPC_DAC_READ Allows a process to read a System V IPC @@ -291,6 +295,10 @@ privilege PRIV_PROC_LOCK_MEMORY Allows a process to lock pages in physical memory. +privilege PRIV_PROC_MEMINFO + + Allows a process to access physical memory information. + privilege PRIV_PROC_OWNER Allows a process to send signals to other processes, inspect @@ -368,6 +376,10 @@ privilege PRIV_SYS_DEVICES Allows a process to open the real console device directly. Allows a process to open devices that have been exclusively opened. +privilege PRIV_SYS_FS_IMPORT + + Allows a process to import a potentially untrusted file system. + privilege PRIV_SYS_IPC_CONFIG Allows a process to increase the size of a System V IPC Message diff --git a/usr/src/uts/common/os/sched.c b/usr/src/uts/common/os/sched.c index c1d6569f11..15e77d39f7 100644 --- a/usr/src/uts/common/os/sched.c +++ b/usr/src/uts/common/os/sched.c @@ -27,6 +27,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #include <sys/param.h> #include <sys/types.h> #include <sys/sysmacros.h> @@ -646,16 +650,17 @@ top: klwp_t *lwp = ttolwp(tp); /* - * Swapout eligible lwps (specified by the scheduling - * class) which don't have TS_DONT_SWAP set. Set the - * "intent to swap" flag (TS_SWAPENQ) on threads - * which have TS_DONT_SWAP set so that they can be + * Swapout eligible lwps (specified by the scheduling class) + * which don't have TS_DONT_SWAP set. Set the "intent to swap" + * flag (TS_SWAPENQ) on threads which have either TS_DONT_SWAP + * set or are currently on a split stack so that they can be * swapped if and when they reach a safe point. 
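The new privileges above (PRIV_PROC_MEMINFO, PRIV_SYS_FS_IMPORT, PRIV_HYPRLOFS_CONTROL) follow the usual pattern: a secpolicy_*() wrapper around PRIV_POLICY() that returns 0 or EPERM, called at the top of the privileged operation. For example, a hyprlofs management ioctl would gate itself roughly like this (fragment; cr is the caller's cred_t pointer):

	int error;

	/* fail early unless the caller holds PRIV_HYPRLOFS_CONTROL */
	if ((error = secpolicy_hyprlofs_control(cr)) != 0)
		return (error);

	/* ... perform the privileged hyprlofs operation ... */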
*/ thread_lock(tp); thread_pri = CL_SWAPOUT(tp, swapflags); if (thread_pri != -1) { - if (tp->t_schedflag & TS_DONT_SWAP) { + if ((tp->t_schedflag & TS_DONT_SWAP) || + (tp->t_flag & T_SPLITSTK)) { tp->t_schedflag |= TS_SWAPENQ; tp->t_trapret = 1; aston(tp); diff --git a/usr/src/uts/common/os/sig.c b/usr/src/uts/common/os/sig.c index 0b79c3765a..bc48c6e6e8 100644 --- a/usr/src/uts/common/os/sig.c +++ b/usr/src/uts/common/os/sig.c @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -60,6 +60,7 @@ #include <sys/cyclic.h> #include <sys/dtrace.h> #include <sys/sdt.h> +#include <sys/brand.h> const k_sigset_t nullsmask = {0, 0, 0}; @@ -141,6 +142,21 @@ signal_is_blocked(kthread_t *t, int sig) } /* + * Return true if the signal can safely be ignored. + * That is, if the signal is included in the p_ignore mask and doing so is not + * forbidden by any process branding. + */ +static int +sig_ignorable(proc_t *p, int sig) +{ + return (sigismember(&p->p_ignore, sig) && /* sig in ignore mask */ + !(PROC_IS_BRANDED(p) && /* allowed by brand */ + BROP(p)->b_sig_ignorable != NULL && + BROP(p)->b_sig_ignorable(p, sig) == B_FALSE)); + +} + +/* * Return true if the signal can safely be discarded on generation. * That is, if there is no need for the signal on the receiving end. * The answer is true if the process is a zombie or @@ -157,7 +173,7 @@ sig_discardable(proc_t *p, int sig) kthread_t *t = p->p_tlist; return (t == NULL || /* if zombie or ... */ - (sigismember(&p->p_ignore, sig) && /* signal is ignored */ + (sig_ignorable(p, sig) && /* signal is ignored */ t->t_forw == t && /* and single-threaded */ !tracing(p, sig) && /* and no /proc tracing */ !signal_is_blocked(t, sig) && /* and signal not blocked */ @@ -193,7 +209,7 @@ eat_signal(kthread_t *t, int sig) !(ttoproc(t)->p_proc_flag & P_PR_LOCK)) { ttoproc(t)->p_stopsig = 0; t->t_dtrace_stop = 0; - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); } else if (t != curthread && t->t_state == TS_ONPROC) { aston(t); /* make it do issig promptly */ @@ -481,7 +497,7 @@ issig_justlooking(void) if (sigismember(&set, sig) && (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig))) { + !sig_ignorable(p, sig))) { /* * Don't promote a signal that will stop * the process when lwp_nostop is set. @@ -607,6 +623,21 @@ issig_forreal(void) } /* + * Allow the brand the chance to alter (or suppress) delivery + * of this signal. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_issig_stop != NULL) { + /* + * The brand hook will return 0 if it would like + * us to drive on, or -1 if we should restart + * the loop to check other conditions. + */ + if (BROP(p)->b_issig_stop(p, lwp) != 0) { + continue; + } + } + + /* * Honor requested stop before dealing with the * current signal; a debugger may change it. 
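sig_ignorable() above gives the brand a veto over discarding signals that are in p_ignore, via the optional b_sig_ignorable hook. What the brand side might look like, as a sketch (mybrand_sig_ignorable is hypothetical; returning B_FALSE forces the signal to be queued rather than discarded, e.g. so in-zone emulation or tracing still observes it):

#include <sys/types.h>
#include <sys/proc.h>
#include <sys/signal.h>

static boolean_t
mybrand_sig_ignorable(proc_t *p, int sig)
{
        /* Keep SIGCLD deliverable even if the process ignores it. */
        return (sig == SIGCLD ? B_FALSE : B_TRUE);
}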
* Do not want to go back to loop here since this is a special @@ -640,7 +671,7 @@ issig_forreal(void) lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (sigismember(&t->t_sigwait, sig) || - (!sigismember(&p->p_ignore, sig) && + (!sig_ignorable(p, sig) && !isjobstop(sig))) { if (p->p_flag & (SEXITLWPS|SKILLED)) { sig = SIGKILL; @@ -692,7 +723,7 @@ issig_forreal(void) toproc = 0; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, sig)) { if (sigismember(&t->t_extsig, sig)) ext = 1; break; @@ -706,7 +737,7 @@ issig_forreal(void) toproc = 1; if (tracing(p, sig) || sigismember(&t->t_sigwait, sig) || - !sigismember(&p->p_ignore, sig)) { + !sig_ignorable(p, sig)) { if (sigismember(&p->p_extsig, sig)) ext = 1; break; @@ -938,6 +969,16 @@ stop(int why, int what) } break; + case PR_BRAND: + /* + * We have been stopped by the brand code for a brand-private + * reason. This is an asynchronous stop affecting only this + * LWP. + */ + VERIFY(PROC_IS_BRANDED(p)); + flags &= ~TS_BSTART; + break; + default: /* /proc stop */ flags &= ~TS_PSTART; /* @@ -1049,7 +1090,7 @@ stop(int why, int what) } } - if (why != PR_JOBCONTROL && why != PR_CHECKPOINT) { + if (why != PR_JOBCONTROL && why != PR_CHECKPOINT && why != PR_BRAND) { /* * Do process-level notification when all lwps are * either stopped on events of interest to /proc @@ -1155,6 +1196,13 @@ stop(int why, int what) if (why == PR_CHECKPOINT) del_one_utstop(); + /* + * Allow the brand to post notification of this stop condition. + */ + if (PROC_IS_BRANDED(p) && BROP(p)->b_stop_notify != NULL) { + BROP(p)->b_stop_notify(p, lwp, why, what); + } + thread_lock(t); ASSERT((t->t_schedflag & TS_ALLSTART) == 0); t->t_schedflag |= flags; @@ -1176,7 +1224,7 @@ stop(int why, int what) (p->p_flag & (SEXITLWPS|SKILLED))) { p->p_stopsig = 0; thread_lock(t); - t->t_schedflag |= TS_XSTART | TS_PSTART; + t->t_schedflag |= TS_XSTART | TS_PSTART | TS_BSTART; setrun_locked(t); thread_unlock_nopreempt(t); } else if (why == PR_JOBCONTROL) { @@ -1311,7 +1359,7 @@ psig(void) * this signal from pending to current (we dropped p->p_lock). * This can happen only in a multi-threaded process. */ - if (sigismember(&p->p_ignore, sig) || + if (sig_ignorable(p, sig) || (func == SIG_DFL && sigismember(&stopdefault, sig))) { lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; @@ -1413,6 +1461,9 @@ psig(void) DTRACE_PROC3(signal__handle, int, sig, k_siginfo_t *, sip, void (*)(void), func); + if (PROC_IS_BRANDED(p) && BROP(p)->b_psig_to_proc) + BROP(p)->b_psig_to_proc(p, t, sig); + lwp->lwp_cursig = 0; lwp->lwp_extsig = 0; if (lwp->lwp_curinfo) { @@ -1755,9 +1806,12 @@ post_sigcld(proc_t *cp, sigqueue_t *sqp) /* * This can only happen when the parent is init. * (See call to sigcld(q, NULL) in exit().) - * Use KM_NOSLEEP to avoid deadlock. + * Use KM_NOSLEEP to avoid deadlock. The child proc's + * initpid can be 1 for zlogin. */ - ASSERT(pp == proc_init); + ASSERT(pp->p_pidp->pid_id == + cp->p_zone->zone_proc_initpid || + pp->p_pidp->pid_id == 1); winfo(cp, &info, 0); sigaddq(pp, NULL, &info, KM_NOSLEEP); } else { @@ -1788,6 +1842,15 @@ sigcld_repost() sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP); mutex_enter(&pidlock); + if (PROC_IS_BRANDED(pp) && BROP(pp)->b_sigcld_repost != NULL) { + /* + * Allow the brand to inject synthetic SIGCLD signals.
+ */ + if (BROP(pp)->b_sigcld_repost(pp, sqp) == 0) { + mutex_exit(&pidlock); + return; + } + } for (cp = pp->p_child; cp; cp = cp->p_sibling) { if (cp->p_pidflag & CLDPEND) { post_sigcld(cp, sqp); diff --git a/usr/src/uts/common/os/smb_subr.c b/usr/src/uts/common/os/smb_subr.c index 6084676b17..6dc7230bed 100644 --- a/usr/src/uts/common/os/smb_subr.c +++ b/usr/src/uts/common/os/smb_subr.c @@ -25,7 +25,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ #include <sys/smbios_impl.h> #include <sys/cmn_err.h> @@ -43,13 +45,13 @@ smb_strerror(int err) void * smb_alloc(size_t len) { - return (kmem_alloc(len, KM_SLEEP)); + return (len > 0 ? kmem_alloc(len, KM_SLEEP) : NULL); } void * smb_zalloc(size_t len) { - return (kmem_zalloc(len, KM_SLEEP)); + return (len > 0 ? kmem_zalloc(len, KM_SLEEP) : NULL); } void diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index c6ebe8b110..0f3aafcc42 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -24,6 +24,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ #include <sys/types.h> @@ -76,6 +77,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/limits.h> #include <c2/audit.h> /* @@ -984,12 +986,20 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, * (registered in sd_wakeq). */ struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; if (first) stp->sd_wakeq &= ~RSLEEP; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_mp = 0; /* * Mark that a thread is in rwnext on the read side @@ -1028,6 +1038,8 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, if ((bp = uiod.d_mp) != NULL) { *errorp = 0; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (bp); } error = 0; @@ -1047,8 +1059,14 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } else { *errorp = error; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (NULL); } + + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); + /* * Try a getq in case a rwnext() generated mblk * has bubbled up via strrput(). @@ -2543,6 +2561,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, int b_flag, int pri, int flags) { struiod_t uiod; + struct iovec buf[IOV_MAX_STACK]; + int iovlen = 0; mblk_t *mp; queue_t *wqp = stp->sd_wrq; int error = 0; @@ -2634,13 +2654,21 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, mp->b_flag |= b_flag; mp->b_band = (uchar_t)pri; - (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, - sizeof (uiod.d_iov) / sizeof (*uiod.d_iov)); + if (uiop->uio_iovcnt > IOV_MAX_STACK) { + iovlen = uiop->uio_iovcnt * sizeof (iovec_t); + uiod.d_iov = (struct iovec *)kmem_alloc(iovlen, KM_SLEEP); + } else { + uiod.d_iov = buf; + } + + (void) uiodup(uiop, &uiod.d_uio, uiod.d_iov, uiop->uio_iovcnt); uiod.d_uio.uio_offset = 0; uiod.d_mp = mp; error = rwnext(wqp, &uiod); if (! 
uiod.d_mp) { uioskip(uiop, *iosize); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } ASSERT(mp == uiod.d_mp); @@ -2658,17 +2686,23 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, error = 0; } else { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } /* Have to check canput before consuming data from the uio */ if (pri == 0) { if (!canputnext(wqp) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } else { if (!bcanputnext(wqp, pri) && !(flags & MSG_IGNFLOW)) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (EWOULDBLOCK); } } @@ -2676,6 +2710,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, /* Copyin data from the uio */ if ((error = struioget(wqp, mp, &uiod, 0)) != 0) { freemsg(mp); + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (error); } uioskip(uiop, *iosize); @@ -2692,6 +2728,8 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, putnext(wqp, mp); stream_runservice(stp); } + if (iovlen != 0) + kmem_free(uiod.d_iov, iovlen); return (0); } @@ -3177,6 +3215,7 @@ job_control_type(int cmd) case JAGENT: /* Obsolete */ case JTRUN: /* Obsolete */ case JXTPROTO: /* Obsolete */ + case TIOCSETLD: return (JCSETP); } @@ -8311,7 +8350,7 @@ chkrd: } *reventsp = (short)retevents; - if (retevents) { + if (retevents && !(events & POLLET)) { if (headlocked) mutex_exit(&stp->sd_lock); return (0); diff --git a/usr/src/uts/common/os/sysent.c b/usr/src/uts/common/os/sysent.c index 0d1bb6a8a1..aa44ccf788 100644 --- a/usr/src/uts/common/os/sysent.c +++ b/usr/src/uts/common/os/sysent.c @@ -1093,18 +1093,20 @@ char **syscallnames; systrace_sysent_t *systrace_sysent; void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); /*ARGSUSED*/ void systrace_stub(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7) {} /*ARGSUSED*/ int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent[curthread->t_sysnum]; dtrace_id_t id; @@ -1112,7 +1114,8 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1126,14 +1129,15 @@ dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, + arg6, arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((int64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((int64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1145,7 +1149,8 @@ systrace_sysent_t *systrace_sysent32; /*ARGSUSED*/ int64_t 
dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, - uintptr_t arg3, uintptr_t arg4, uintptr_t arg5) + uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, uintptr_t arg6, + uintptr_t arg7) { systrace_sysent_t *sy = &systrace_sysent32[curthread->t_sysnum]; dtrace_id_t id; @@ -1153,7 +1158,8 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, proc_t *p; if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5); + (*systrace_probe)(id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); /* * We want to explicitly allow DTrace consumers to stop a process @@ -1167,14 +1173,15 @@ dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, } mutex_exit(&p->p_lock); - rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5); + rval = (*sy->stsy_underlying)(arg0, arg1, arg2, arg3, arg4, arg5, arg6, + arg7); if (ttolwp(curthread)->lwp_errno != 0) rval = -1; if ((id = sy->stsy_return) != DTRACE_IDNONE) (*systrace_probe)(id, (uintptr_t)rval, (uintptr_t)rval, - (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0); + (uintptr_t)((uint64_t)rval >> 32), 0, 0, 0, 0, 0); return (rval); } @@ -1202,5 +1209,5 @@ dtrace_systrace_rtt(void) } if ((id = sy->stsy_return) != DTRACE_IDNONE) - (*systrace_probe)(id, 0, 0, 0, 0, 0, 0); + (*systrace_probe)(id, 0, 0, 0, 0, 0, 0, 0, 0); } diff --git a/usr/src/uts/common/os/timer.c b/usr/src/uts/common/os/timer.c index 8559d8736c..b25a6cbcf1 100644 --- a/usr/src/uts/common/os/timer.c +++ b/usr/src/uts/common/os/timer.c @@ -269,6 +269,15 @@ clock_add_backend(clockid_t clock, clock_backend_t *backend) clock_backend[clock] = backend; } +clock_backend_t * +clock_get_backend(clockid_t clock) +{ + if (clock < 0 || clock >= CLOCK_MAX) + return (NULL); + + return (clock_backend[clock]); +} + int clock_settime(clockid_t clock, timespec_t *tp) { @@ -398,7 +407,7 @@ timer_signal(sigqueue_t *sigq) /* * This routine is called from the clock backend. */ -void +static void timer_fire(itimer_t *it) { proc_t *p; @@ -672,7 +681,7 @@ timer_create(clockid_t clock, struct sigevent *evp, timer_t *tid) * Call on the backend to verify the event argument (or return * EINVAL if this clock type does not support timers). */ - if ((error = backend->clk_timer_create(it, &ev)) != 0) + if ((error = backend->clk_timer_create(it, timer_fire)) != 0) goto err; it->it_lwp = ttolwp(curthread); diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c index a554f8c3f3..0a6fe0ef96 100644 --- a/usr/src/uts/common/os/vmem.c +++ b/usr/src/uts/common/os/vmem.c @@ -1618,7 +1618,7 @@ vmem_destroy(vmem_t *vmp) leaked = vmem_size(vmp, VMEM_ALLOC); if (leaked != 0) - cmn_err(CE_WARN, "vmem_destroy('%s'): leaked %lu %s", + cmn_err(CE_WARN, "!vmem_destroy('%s'): leaked %lu %s", vmp->vm_name, leaked, (vmp->vm_cflags & VMC_IDENTIFIER) ? "identifiers" : "bytes"); diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 56c654331e..b097b351e6 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent Inc. All rights reserved. + * Copyright 2015, Joyent Inc. All rights reserved. 
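The clock_get_backend() function added to timer.c above lets other kernel code resolve a registered clock backend by ID, returning NULL for out-of-range or unregistered clocks; the timerfd driver added elsewhere in this changeset is a plausible consumer. A sketch (example_resolve_clock is hypothetical):

#include <sys/timer.h>
#include <sys/errno.h>

int
example_resolve_clock(clockid_t clk)
{
        clock_backend_t *be = clock_get_backend(clk);

        if (be == NULL)
                return (EINVAL);

        /* be->clk_timer_create() and friends may now be used */
        return (0);
}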
*/ /* @@ -370,8 +370,12 @@ static char *zone_ref_subsys_names[] = { rctl_hndl_t rc_zone_cpu_shares; rctl_hndl_t rc_zone_locked_mem; rctl_hndl_t rc_zone_max_swap; +rctl_hndl_t rc_zone_phys_mem; rctl_hndl_t rc_zone_max_lofi; rctl_hndl_t rc_zone_cpu_cap; +rctl_hndl_t rc_zone_cpu_baseline; +rctl_hndl_t rc_zone_cpu_burst_time; +rctl_hndl_t rc_zone_zfs_io_pri; rctl_hndl_t rc_zone_nlwps; rctl_hndl_t rc_zone_nprocs; rctl_hndl_t rc_zone_shmmax; @@ -417,8 +421,9 @@ static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, * Version 5 alters the zone_boot system call, and converts its old * bootargs parameter to be set by the zone_setattr API instead. * Version 6 adds the flag argument to zone_create. + * Version 7 adds the requested zoneid to zone_create. */ -static const int ZONE_SYSCALL_API_VERSION = 6; +static const int ZONE_SYSCALL_API_VERSION = 7; /* * Certain filesystems (such as NFS and autofs) need to know which zone @@ -1377,6 +1382,114 @@ static rctl_ops_t zone_cpu_cap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_cpu_base_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_base(p->p_zone)); +} + +/* + * The zone cpu base is used to set the baseline CPU for the zone + * so we can track when the zone is bursting. + */ +/*ARGSUSED*/ +static int +zone_cpu_base_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_base(zone, nv)); +} + +static rctl_ops_t zone_cpu_base_ops = { + rcop_no_action, + zone_cpu_base_get, + zone_cpu_base_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t +zone_cpu_burst_time_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (cpucaps_zone_get_burst_time(p->p_zone)); +} + +/* + * The zone cpu burst time is used to set the amount of time CPU(s) can be + * bursting for the zone. + */ +/*ARGSUSED*/ +static int +zone_cpu_burst_time_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + return (cpucaps_zone_set_burst_time(zone, nv)); +} + +static rctl_ops_t zone_cpu_burst_time_ops = { + rcop_no_action, + zone_cpu_burst_time_get, + zone_cpu_burst_time_set, + rcop_no_test +}; + +/* + * zone.zfs-io-pri resource control support (IO priority). + */ +/*ARGSUSED*/ +static rctl_qty_t +zone_zfs_io_pri_get(rctl_t *rctl, struct proc *p) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + return (p->p_zone->zone_zfs_io_pri); +} + +/*ARGSUSED*/ +static int +zone_zfs_io_pri_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + zone_t *zone = e->rcep_p.zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + + if (zone == NULL) + return (0); + + /* + * set priority to the new value. 
+ */ + zone->zone_zfs_io_pri = nv; + return (0); +} + +static rctl_ops_t zone_zfs_io_pri_ops = { + rcop_no_action, + zone_zfs_io_pri_get, + zone_zfs_io_pri_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_lwps_usage(rctl_t *r, proc_t *p) { rctl_qty_t nlwps; @@ -1671,6 +1784,39 @@ static rctl_ops_t zone_max_swap_ops = { /*ARGSUSED*/ static rctl_qty_t +zone_phys_mem_usage(rctl_t *rctl, struct proc *p) +{ + rctl_qty_t q; + zone_t *z = p->p_zone; + + ASSERT(MUTEX_HELD(&p->p_lock)); + /* No additional lock because not enforced in the kernel */ + q = z->zone_phys_mem; + return (q); +} + +/*ARGSUSED*/ +static int +zone_phys_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, + rctl_qty_t nv) +{ + ASSERT(MUTEX_HELD(&p->p_lock)); + ASSERT(e->rcep_t == RCENTITY_ZONE); + if (e->rcep_p.zone == NULL) + return (0); + e->rcep_p.zone->zone_phys_mem_ctl = nv; + return (0); +} + +static rctl_ops_t zone_phys_mem_ops = { + rcop_no_action, + zone_phys_mem_usage, + zone_phys_mem_set, + rcop_no_test +}; + +/*ARGSUSED*/ +static rctl_qty_t zone_max_lofi_usage(rctl_t *rctl, struct proc *p) { rctl_qty_t q; @@ -1764,6 +1910,20 @@ zone_lockedmem_kstat_update(kstat_t *ksp, int rw) } static int +zone_physmem_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_kstat_t *zk = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zk->zk_usage.value.ui64 = zone->zone_phys_mem; + zk->zk_value.value.ui64 = zone->zone_phys_mem_ctl; + return (0); +} + +static int zone_nprocs_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1792,7 +1952,7 @@ zone_swapresv_kstat_update(kstat_t *ksp, int rw) } static kstat_t * -zone_kstat_create_common(zone_t *zone, char *name, +zone_rctl_kstat_create_common(zone_t *zone, char *name, int (*updatefunc) (kstat_t *, int)) { kstat_t *ksp; @@ -1818,6 +1978,230 @@ zone_kstat_create_common(zone_t *zone, char *name, } static int +zone_vfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_vfs_kstat_t *zvp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_vfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the VFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the slow ops + * counters are updated directly by the VFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
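The scaling step called out in the comment above matters because kstat_runq_enter() and related functions accumulate gethrtime_unscaled() values, which are raw (unscaled) time units rather than nanoseconds. A minimal sketch of the conversion a reader must perform, mirroring zone_vfs_kstat_update() (example_scaled_rtime is hypothetical):

#include <sys/kstat.h>
#include <sys/time.h>

static uint64_t
example_scaled_rtime(kstat_io_t *kiop)
{
        hrtime_t t = kiop->rtime;       /* unscaled units */

        scalehrtime(&t);                /* convert in place to ns */
        return ((uint64_t)t);
}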
+ */ + zvp->zv_nread.value.ui64 = kiop->nread; + zvp->zv_reads.value.ui64 = kiop->reads; + zvp->zv_rtime.value.ui64 = kiop->rtime; + zvp->zv_rlentime.value.ui64 = kiop->rlentime; + zvp->zv_nwritten.value.ui64 = kiop->nwritten; + zvp->zv_writes.value.ui64 = kiop->writes; + zvp->zv_wtime.value.ui64 = kiop->wtime; + zvp->zv_wlentime.value.ui64 = kiop->wlentime; + + scalehrtime((hrtime_t *)&zvp->zv_rtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_rlentime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wtime.value.ui64); + scalehrtime((hrtime_t *)&zvp->zv_wlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_vfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_vfs_kstat_t *zvp; + + if ((ksp = kstat_create_zone("zone_vfs", zone->zone_id, + zone->zone_name, "zone_vfs", KSTAT_TYPE_NAMED, + sizeof (zone_vfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zvp = ksp->ks_data = kmem_zalloc(sizeof (zone_vfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_vfs_lock; + zone->zone_vfs_stats = zvp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zvp->zv_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zvp->zv_zonename, zone->zone_name); + kstat_named_init(&zvp->zv_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wtime, "wtime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_wlentime, "wlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10ms_ops, "10ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_100ms_ops, "100ms_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_1s_ops, "1s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_10s_ops, "10s_ops", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_cnt, "delay_cnt", KSTAT_DATA_UINT64); + kstat_named_init(&zvp->zv_delay_time, "delay_time", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_vfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_zfs_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_zfs_kstat_t *zzp = ksp->ks_data; + kstat_io_t *kiop = &zone->zone_zfs_rwstats; + + if (rw == KSTAT_WRITE) + return (EACCES); + + /* + * Extract the ZFS statistics from the kstat_io_t structure used by + * kstat_runq_enter() and related functions. Since the I/O throttle + * counters are updated directly by the ZFS layer, there's no need to + * copy those statistics here. + * + * Note that kstat_runq_enter() and the related functions use + * gethrtime_unscaled(), so scale the time here. 
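Once installed, these per-zone kstats are readable from the global zone with libkstat; the module and name used below match the kstat_create_zone() calls in this hunk. A userland sketch with error handling trimmed (print_zfs_waittime is hypothetical):

#include <kstat.h>
#include <stdio.h>

int
print_zfs_waittime(int zid, char *zonename)
{
        kstat_ctl_t *kc;
        kstat_t *ksp;
        kstat_named_t *kn;

        if ((kc = kstat_open()) == NULL)
                return (-1);

        ksp = kstat_lookup(kc, "zone_zfs", zid, zonename);
        if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1 &&
            (kn = kstat_data_lookup(ksp, "waittime")) != NULL)
                (void) printf("waittime: %llu\n",
                    (u_longlong_t)kn->value.ui64);

        (void) kstat_close(kc);
        return (0);
}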
+ */ + zzp->zz_nread.value.ui64 = kiop->nread; + zzp->zz_reads.value.ui64 = kiop->reads; + zzp->zz_rtime.value.ui64 = kiop->rtime; + zzp->zz_rlentime.value.ui64 = kiop->rlentime; + zzp->zz_nwritten.value.ui64 = kiop->nwritten; + zzp->zz_writes.value.ui64 = kiop->writes; + + scalehrtime((hrtime_t *)&zzp->zz_rtime.value.ui64); + scalehrtime((hrtime_t *)&zzp->zz_rlentime.value.ui64); + + return (0); +} + +static kstat_t * +zone_zfs_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_zfs_kstat_t *zzp; + + if ((ksp = kstat_create_zone("zone_zfs", zone->zone_id, + zone->zone_name, "zone_zfs", KSTAT_TYPE_NAMED, + sizeof (zone_zfs_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zzp = ksp->ks_data = kmem_zalloc(sizeof (zone_zfs_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_zfs_lock; + zone->zone_zfs_stats = zzp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zzp->zz_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zzp->zz_zonename, zone->zone_name); + kstat_named_init(&zzp->zz_nread, "nread", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_reads, "reads", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rtime, "rtime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_rlentime, "rlentime", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_nwritten, "nwritten", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_writes, "writes", KSTAT_DATA_UINT64); + kstat_named_init(&zzp->zz_waittime, "waittime", KSTAT_DATA_UINT64); + + ksp->ks_update = zone_zfs_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int +zone_mcap_kstat_update(kstat_t *ksp, int rw) +{ + zone_t *zone = ksp->ks_private; + zone_mcap_kstat_t *zmp = ksp->ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + zmp->zm_rss.value.ui64 = zone->zone_phys_mem; + zmp->zm_phys_cap.value.ui64 = zone->zone_phys_mem_ctl; + zmp->zm_swap.value.ui64 = zone->zone_max_swap; + zmp->zm_swap_cap.value.ui64 = zone->zone_max_swap_ctl; + zmp->zm_nover.value.ui64 = zone->zone_mcap_nover; + zmp->zm_pagedout.value.ui64 = zone->zone_mcap_pagedout; + zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; + zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; + zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; + zmp->zm_fspgin.value.ui64 = zone->zone_fspgin; + zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; + zmp->zm_pf_throttle.value.ui64 = zone->zone_pf_throttle; + zmp->zm_pf_throttle_usec.value.ui64 = zone->zone_pf_throttle_usec; + + return (0); +} + +static kstat_t * +zone_mcap_kstat_create(zone_t *zone) +{ + kstat_t *ksp; + zone_mcap_kstat_t *zmp; + + if ((ksp = kstat_create_zone("memory_cap", zone->zone_id, + zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED, + sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) + return (NULL); + + if (zone->zone_id != GLOBAL_ZONEID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + + zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP); + ksp->ks_data_size += strlen(zone->zone_name) + 1; + ksp->ks_lock = &zone->zone_mcap_lock; + zone->zone_mcap_stats = zmp; + + /* The kstat "name" field is not large enough for a full zonename */ + kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); + kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); + 
kstat_named_init(&zmp->zm_rss, "rss", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_phys_cap, "physcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap, "swap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_swap_cap, "swapcap", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_nover, "nover", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pagedout, "pagedout", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle, "n_pf_throttle", + KSTAT_DATA_UINT64); + kstat_named_init(&zmp->zm_pf_throttle_usec, "n_pf_throttle_usec", + KSTAT_DATA_UINT64); + + ksp->ks_update = zone_mcap_kstat_update; + ksp->ks_private = zone; + + kstat_install(ksp); + return (ksp); +} + +static int zone_misc_kstat_update(kstat_t *ksp, int rw) { zone_t *zone = ksp->ks_private; @@ -1846,6 +2230,9 @@ zone_misc_kstat_update(kstat_t *ksp, int rw) zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; + zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; + zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time; + return (0); } @@ -1884,7 +2271,8 @@ zone_misc_kstat_create(zone_t *zone) KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); - + kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); + kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64); ksp->ks_update = zone_misc_kstat_update; ksp->ks_private = zone; @@ -1896,13 +2284,30 @@ zone_misc_kstat_create(zone_t *zone) static void zone_kstat_create(zone_t *zone) { - zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, + zone->zone_lockedmem_kstat = zone_rctl_kstat_create_common(zone, "lockedmem", zone_lockedmem_kstat_update); - zone->zone_swapresv_kstat = zone_kstat_create_common(zone, + zone->zone_swapresv_kstat = zone_rctl_kstat_create_common(zone, "swapresv", zone_swapresv_kstat_update); - zone->zone_nprocs_kstat = zone_kstat_create_common(zone, + zone->zone_physmem_kstat = zone_rctl_kstat_create_common(zone, + "physicalmem", zone_physmem_kstat_update); + zone->zone_nprocs_kstat = zone_rctl_kstat_create_common(zone, "nprocs", zone_nprocs_kstat_update); + if ((zone->zone_vfs_ksp = zone_vfs_kstat_create(zone)) == NULL) { + zone->zone_vfs_stats = kmem_zalloc( + sizeof (zone_vfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_zfs_ksp = zone_zfs_kstat_create(zone)) == NULL) { + zone->zone_zfs_stats = kmem_zalloc( + sizeof (zone_zfs_kstat_t), KM_SLEEP); + } + + if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { + zone->zone_mcap_stats = kmem_zalloc( + sizeof (zone_mcap_kstat_t), KM_SLEEP); + } + if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) { zone->zone_misc_stats = kmem_zalloc( sizeof (zone_misc_kstat_t), KM_SLEEP); @@ -1929,8 +2334,17 @@ zone_kstat_delete(zone_t *zone) sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_swapresv_kstat, sizeof (zone_kstat_t)); + zone_kstat_delete_common(&zone->zone_physmem_kstat, + sizeof (zone_kstat_t)); zone_kstat_delete_common(&zone->zone_nprocs_kstat, sizeof (zone_kstat_t)); + + zone_kstat_delete_common(&zone->zone_vfs_ksp, + sizeof 
(zone_vfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_zfs_ksp, + sizeof (zone_zfs_kstat_t)); + zone_kstat_delete_common(&zone->zone_mcap_ksp, + sizeof (zone_mcap_kstat_t)); zone_kstat_delete_common(&zone->zone_misc_ksp, sizeof (zone_misc_kstat_t)); } @@ -1966,6 +2380,8 @@ zone_zsd_init(void) zone0.zone_locked_mem_ctl = UINT64_MAX; ASSERT(zone0.zone_max_swap == 0); zone0.zone_max_swap_ctl = UINT64_MAX; + zone0.zone_phys_mem = 0; + zone0.zone_phys_mem_ctl = UINT64_MAX; zone0.zone_max_lofi = 0; zone0.zone_max_lofi_ctl = UINT64_MAX; zone0.zone_shmmax = 0; @@ -1989,8 +2405,9 @@ zone_zsd_init(void) zone0.zone_initname = initname; zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; + zone0.zone_physmem_kstat = NULL; zone0.zone_nprocs_kstat = NULL; - + zone0.zone_zfs_io_pri = 1; zone0.zone_stime = 0; zone0.zone_utime = 0; zone0.zone_wtime = 0; @@ -2101,6 +2518,21 @@ zone_init(void) RCTL_GLOBAL_INFINITE, MAXCAP, MAXCAP, &zone_cpu_cap_ops); + rc_zone_cpu_baseline = rctl_register("zone.cpu-baseline", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + MAXCAP, MAXCAP, &zone_cpu_base_ops); + + rc_zone_cpu_burst_time = rctl_register("zone.cpu-burst-time", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + INT_MAX, INT_MAX, &zone_cpu_burst_time_ops); + + rc_zone_zfs_io_pri = rctl_register("zone.zfs-io-priority", + RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | + RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, + 16384, 16384, &zone_zfs_io_pri_ops); + rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, INT_MAX, INT_MAX, &zone_lwps_ops); @@ -2142,6 +2574,20 @@ zone_init(void) rde = rctl_dict_lookup("zone.cpu-shares"); (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + /* + * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach + * this at the head of the rctl_dict_entry for ``zone.zfs-io-priority''.
+ */ + dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); + bzero(dval, sizeof (rctl_val_t)); + dval->rcv_value = 1; + dval->rcv_privilege = RCPRIV_PRIVILEGED; + dval->rcv_flagaction = RCTL_LOCAL_NOACTION; + dval->rcv_action_recip_pid = -1; + + rde = rctl_dict_lookup("zone.zfs-io-priority"); + (void) rctl_val_list_insert(&rde->rcd_default_value, dval); + rc_zone_locked_mem = rctl_register("zone.max-locked-memory", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2152,6 +2598,11 @@ zone_init(void) RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, &zone_max_swap_ops); + rc_zone_phys_mem = rctl_register("zone.max-physical-memory", + RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | + RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, + &zone_phys_mem_ops); + rc_zone_max_lofi = rctl_register("zone.max-lofi", RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, @@ -2173,6 +2624,8 @@ zone_init(void) zone0.zone_ntasks = 1; mutex_exit(&p0.p_lock); zone0.zone_restart_init = B_TRUE; + zone0.zone_reboot_on_init_exit = B_FALSE; + zone0.zone_init_status = -1; zone0.zone_brand = &native_brand; rctl_prealloc_destroy(gp); /* @@ -2252,6 +2705,8 @@ zone_init(void) static void zone_free(zone_t *zone) { + zone_dl_t *zdl; + ASSERT(zone != global_zone); ASSERT(zone->zone_ntasks == 0); ASSERT(zone->zone_nlwps == 0); @@ -2280,6 +2735,19 @@ zone_free(zone_t *zone) list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); + + /* + * While dlmgmtd should have removed all of these, it could have left + * something behind or crashed. In which case it's not safe for us to + * assume that the list is empty which list_destroy() will ASSERT. We + * clean up for our userland comrades which may have crashed, or worse, + * been disabled by SMF. + */ + while ((zdl = list_remove_head(&zone->zone_dl_list)) != NULL) { + if (zdl->zdl_net != NULL) + nvlist_free(zdl->zdl_net); + kmem_free(zdl, sizeof (zone_dl_t)); + } list_destroy(&zone->zone_dl_list); if (zone->zone_rootvp != NULL) @@ -2324,12 +2792,18 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + timestruc_t now; + uint64_t t; nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && status >= zone_status_get(zone)); + /* Current time since Jan 1 1970 but consumers expect NS */ + gethrestime(&now); + t = (now.tv_sec * NANOSEC) + now.tv_nsec; + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || nvlist_add_string(nvl, ZONE_CB_NEWSTATE, @@ -2337,7 +2811,7 @@ zone_status_set(zone_t *zone, zone_status_t status) nvlist_add_string(nvl, ZONE_CB_OLDSTATE, zone_status_table[zone->zone_status]) || nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || - nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, t) || sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { #ifdef DEBUG @@ -2463,14 +2937,65 @@ zone_set_initname(zone_t *zone, const char *zone_initname) return (0); } +/* + * The zone_set_mcap_nover and zone_set_mcap_pageout functions are used + * to provide the physical memory capping kstats. Since physical memory + * capping is currently implemented in userland, that code uses the setattr + * entry point to increment the kstats. 
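Since the physical memory capper runs in userland, the attributes handled by zone_setattr() further down are effectively its reporting interface. A heavily hedged userland sketch (zone_setattr() is a private libc interface declared in <sys/zone.h>; report_cap_activity and the units of the pageout count are assumptions):

#include <sys/types.h>
#include <sys/zone.h>

int
report_cap_activity(zoneid_t zid, uint64_t pagedout)
{
        uint64_t unused = 0;

        /* Value is ignored; the kernel just increments nover. */
        if (zone_setattr(zid, ZONE_ATTR_PMCAP_NOVER, &unused,
            sizeof (unused)) != 0)
                return (-1);

        /* Accumulates into zone_mcap_pagedout. */
        if (zone_setattr(zid, ZONE_ATTR_PMCAP_PAGEOUT, &pagedout,
            sizeof (pagedout)) != 0)
                return (-1);

        return (0);
}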
We always simply increment nover + * every time that setattr is called and we always add in the input value + * to zone_mcap_pagedout every time that is called. + */ +/*ARGSUSED*/ static int -zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) +zone_set_mcap_nover(zone_t *zone, const uint64_t *zone_nover) { - uint64_t mcap; - int err = 0; + zone->zone_mcap_nover++; + + return (0); +} + +static int +zone_set_mcap_pageout(zone_t *zone, const uint64_t *zone_pageout) +{ + uint64_t pageout; + int err; + + if ((err = copyin(zone_pageout, &pageout, sizeof (uint64_t))) == 0) + zone->zone_mcap_pagedout += pageout; + + return (err); +} + +/* + * The zone_set_page_fault_delay function is used to set the number of usecs + * to throttle page faults. This is normally 0 but can be set to a non-0 value + * by the user-land memory capping code when the zone is over its physical + * memory cap. + */ +static int +zone_set_page_fault_delay(zone_t *zone, const uint32_t *pfdelay) +{ + uint32_t dusec; + int err; + + if ((err = copyin(pfdelay, &dusec, sizeof (uint32_t))) == 0) + zone->zone_pg_flt_delay = dusec; + + return (err); +} + +/* + * The zone_set_rss function is used to set the zone's RSS when we do the + * fast, approximate calculation in user-land. + */ +static int +zone_set_rss(zone_t *zone, const uint64_t *prss) +{ + uint64_t rss; + int err; - if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) - zone->zone_phys_mcap = mcap; + if ((err = copyin(prss, &rss, sizeof (uint64_t))) == 0) + zone->zone_phys_mem = rss; return (err); } @@ -2882,6 +3407,12 @@ getzoneid(void) return (curproc->p_zone->zone_id); } +zoneid_t +getzonedid(void) +{ + return (curproc->p_zone->zone_did); +} + /* * Internal versions of zone_find_by_*(). These don't zone_hold() or * check the validity of a zone's state.
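getzonedid() above exposes the new zone_did alongside the familiar zone_id. Unlike zone IDs, which are allocated from an id_space and recycled as zones halt and boot, the debug ID is supplied by the zone's creator through zone_create() (see the zoneid argument threaded through below). A sketch of kernel code capturing both (example_record_zone_ids is hypothetical):

#include <sys/zone.h>

void
example_record_zone_ids(zoneid_t *zidp, zoneid_t *didp)
{
        *zidp = getzoneid();    /* recycled across zone lifetimes */
        *didp = getzonedid();   /* creator-assigned debug ID */
}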
@@ -4139,8 +4670,9 @@ parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp) error = EINVAL; name = nvpair_name(nvp); - if (strncmp(nvpair_name(nvp), "zone.", sizeof ("zone.") - 1) - != 0 || nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { + if ((strncmp(name, "zone.", sizeof ("zone.") - 1) != 0 && + strncmp(name, "project.", sizeof ("project.") - 1) != 0) || + nvpair_type(nvp) != DATA_TYPE_NVLIST_ARRAY) { goto out; } if ((hndl = rctl_hndl_lookup(name)) == -1) { @@ -4258,7 +4790,7 @@ zone_create(const char *zone_name, const char *zone_root, caddr_t rctlbuf, size_t rctlbufsz, caddr_t zfsbuf, size_t zfsbufsz, int *extended_error, int match, uint32_t doi, const bslabel_t *label, - int flags) + int flags, zoneid_t zone_did) { struct zsched_arg zarg; nvlist_t *rctls = NULL; @@ -4281,6 +4813,7 @@ zone_create(const char *zone_name, const char *zone_root, zone = kmem_zalloc(sizeof (zone_t), KM_SLEEP); zoneid = zone->zone_id = id_alloc(zoneid_space); + zone->zone_did = zone_did; zone->zone_status = ZONE_IS_UNINITIALIZED; zone->zone_pool = pool_default; zone->zone_pool_mod = gethrtime(); @@ -4288,6 +4821,8 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_ncpus = 0; zone->zone_ncpus_online = 0; zone->zone_restart_init = B_TRUE; + zone->zone_reboot_on_init_exit = B_FALSE; + zone->zone_init_status = -1; zone->zone_brand = &native_brand; zone->zone_initname = NULL; mutex_init(&zone->zone_lock, NULL, MUTEX_DEFAULT, NULL); @@ -4349,10 +4884,14 @@ zone_create(const char *zone_name, const char *zone_root, zone->zone_locked_mem_ctl = UINT64_MAX; zone->zone_max_swap = 0; zone->zone_max_swap_ctl = UINT64_MAX; + zone->zone_phys_mem = 0; + zone->zone_phys_mem_ctl = UINT64_MAX; zone->zone_max_lofi = 0; zone->zone_max_lofi_ctl = UINT64_MAX; - zone0.zone_lockedmem_kstat = NULL; - zone0.zone_swapresv_kstat = NULL; + zone->zone_lockedmem_kstat = NULL; + zone->zone_swapresv_kstat = NULL; + zone->zone_physmem_kstat = NULL; + zone->zone_zfs_io_pri = 1; /* * Zsched initializes the rctls. @@ -4509,8 +5048,8 @@ zone_create(const char *zone_name, const char *zone_root, /* * The process, task, and project rctls are probably wrong; * we need an interface to get the default values of all rctls, - * and initialize zsched appropriately. I'm not sure that that - * makes much of a difference, though. + * and initialize zsched appropriately. However, we allow zoneadmd + * to pass down both zone and project rctls for the zone's init. 
*/ error = newproc(zsched, (void *)&zarg, syscid, minclsyspri, NULL, 0); if (error != 0) { @@ -4651,6 +5190,7 @@ zone_boot(zoneid_t zoneid) static int zone_empty(zone_t *zone) { + int cnt = 0; int waitstatus; /* @@ -4661,7 +5201,16 @@ zone_empty(zone_t *zone) ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); while ((waitstatus = zone_status_timedwait_sig(zone, ddi_get_lbolt() + hz, ZONE_IS_EMPTY)) == -1) { - killall(zone->zone_id); + boolean_t force = B_FALSE; + + /* Every 30 seconds, try harder */ + if (cnt++ >= 30) { + cmn_err(CE_WARN, "attempt to force kill zone %d\n", + zone->zone_id); + force = B_TRUE; + cnt = 0; + } + killall(zone->zone_id, force); } /* * return EINTR if we were signaled @@ -5412,14 +5961,6 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) error = EFAULT; } break; - case ZONE_ATTR_PHYS_MCAP: - size = sizeof (zone->zone_phys_mcap); - if (bufsize > size) - bufsize = size; - if (buf != NULL && - copyout(&zone->zone_phys_mcap, buf, bufsize) != 0) - error = EFAULT; - break; case ZONE_ATTR_SCHED_CLASS: mutex_enter(&class_lock); @@ -5474,6 +6015,14 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) } kmem_free(zbuf, bufsize); break; + case ZONE_ATTR_DID: + size = sizeof (zoneid_t); + if (bufsize > size) + bufsize = size; + + if (buf != NULL && copyout(&zone->zone_did, buf, bufsize) != 0) + error = EFAULT; + break; default: if ((attr >= ZONE_ATTR_BRAND_ATTRS) && ZONE_IS_BRANDED(zone)) { size = bufsize; @@ -5505,10 +6054,11 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EPERM)); /* - * Only the ZONE_ATTR_PHYS_MCAP attribute can be set on the - * global zone. + * Only the ZONE_ATTR_PMCAP_NOVER and ZONE_ATTR_PMCAP_PAGEOUT + * attributes can be set on the global zone. */ - if (zoneid == GLOBAL_ZONEID && attr != ZONE_ATTR_PHYS_MCAP) { + if (zoneid == GLOBAL_ZONEID && + attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT) { return (set_errno(EINVAL)); } @@ -5525,7 +6075,9 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) * non-global zones. 
*/ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + if (attr != ZONE_ATTR_PMCAP_NOVER && attr != ZONE_ATTR_PMCAP_PAGEOUT && + attr != ZONE_ATTR_PG_FLT_DELAY && attr != ZONE_ATTR_RSS && + zone_status > ZONE_IS_READY) { err = EINVAL; goto done; } @@ -5547,8 +6099,17 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_FS_ALLOWED: err = zone_set_fs_allowed(zone, (const char *)buf); break; - case ZONE_ATTR_PHYS_MCAP: - err = zone_set_phys_mcap(zone, (const uint64_t *)buf); + case ZONE_ATTR_PMCAP_NOVER: + err = zone_set_mcap_nover(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PMCAP_PAGEOUT: + err = zone_set_mcap_pageout(zone, (const uint64_t *)buf); + break; + case ZONE_ATTR_PG_FLT_DELAY: + err = zone_set_page_fault_delay(zone, (const uint32_t *)buf); + break; + case ZONE_ATTR_RSS: + err = zone_set_rss(zone, (const uint64_t *)buf); break; case ZONE_ATTR_SCHED_CLASS: err = zone_set_sched_class(zone, (const char *)buf); @@ -6269,6 +6830,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) zs.doi = zs32.doi; zs.label = (const bslabel_t *)(uintptr_t)zs32.label; zs.flags = zs32.flags; + zs.zoneid = zs32.zoneid; #else panic("get_udatamodel() returned bogus result\n"); #endif @@ -6279,7 +6841,7 @@ zone(int cmd, void *arg1, void *arg2, void *arg3, void *arg4) (caddr_t)zs.rctlbuf, zs.rctlbufsz, (caddr_t)zs.zfsbuf, zs.zfsbufsz, zs.extended_error, zs.match, zs.doi, - zs.label, zs.flags)); + zs.label, zs.flags, zs.zoneid)); case ZONE_BOOT: return (zone_boot((zoneid_t)(uintptr_t)arg1)); case ZONE_DESTROY: @@ -6380,6 +6942,7 @@ zone_ki_call_zoneadmd(struct zarg *zargp) bcopy(zone->zone_name, zone_name, zone_namelen); zoneid = zone->zone_id; uniqid = zone->zone_uniqid; + arg.status = zone->zone_init_status; /* * zoneadmd may be down, but at least we can empty out the zone. * We can ignore the return value of zone_empty() since we're called @@ -6557,7 +7120,7 @@ zone_kadmin(int cmd, int fcn, const char *mdep, cred_t *credp) * zone_ki_call_zoneadmd() will do a more thorough job of this * later. */ - killall(zone->zone_id); + killall(zone->zone_id, B_FALSE); /* * Now, create the thread to contact zoneadmd and do the rest of the * work. This thread can't be created in our zone otherwise diff --git a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c b/usr/src/uts/common/refhash/refhash.c index 8f96c2d9f1..e2de00597e 100644 --- a/usr/src/uts/common/io/scsi/adapters/mpt_sas/mptsas_hash.c +++ b/usr/src/uts/common/refhash/refhash.c @@ -10,16 +10,18 @@ */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> +#include <sys/refhash.h> #include <sys/sysmacros.h> #include <sys/types.h> #include <sys/kmem.h> #include <sys/list.h> #include <sys/ddi.h> +#define RHL_F_DEAD 0x01 + #ifdef lint extern refhash_link_t *obj_to_link(refhash_t *, void *); extern void *link_to_obj(refhash_t *, refhash_link_t *); diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index c99ef8accb..a6537c9db3 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -22,6 +22,7 @@ # Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2013, Joyent, Inc. All rights reserved. # Copyright 2013 Garrett D'Amore <garrett@damore.org> +# Copyright 2015, Joyent, Inc. All rights reserved. 
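Several headers added to CHKHDRS below back the new Linux-compatible facilities elsewhere in this changeset (epoll.h, eventfd.h, inotify.h, timerfd.h). As a userland illustration of one of them, a sketch of the eventfd counter semantics, assuming the implementation follows the Linux model of 8-byte reads and writes (example_eventfd is hypothetical):

#include <sys/eventfd.h>
#include <stdint.h>
#include <unistd.h>

int
example_eventfd(void)
{
        uint64_t val = 1;
        int fd = eventfd(0, 0);

        if (fd == -1)
                return (-1);

        (void) write(fd, &val, sizeof (val));   /* counter += 1 */
        (void) read(fd, &val, sizeof (val));    /* fetch and clear */

        return (close(fd));
}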
# include $(SRC)/uts/Makefile.uts @@ -212,6 +213,7 @@ CHKHDRS= \ emul64cmd.h \ emul64var.h \ epm.h \ + epoll.h \ errno.h \ errorq.h \ errorq_impl.h \ @@ -219,6 +221,7 @@ CHKHDRS= \ ethernet.h \ euc.h \ eucioctl.h \ + eventfd.h \ exacct.h \ exacct_catalog.h \ exacct_impl.h \ @@ -243,6 +246,7 @@ CHKHDRS= \ flock.h \ flock_impl.h \ fork.h \ + frameio.h \ fss.h \ fsspriocntl.h \ fsid.h \ @@ -268,6 +272,7 @@ CHKHDRS= \ idmap.h \ ieeefp.h \ id_space.h \ + inotify.h \ instance.h \ int_const.h \ int_fmtio.h \ @@ -336,6 +341,7 @@ CHKHDRS= \ lgrp.h \ lgrp_user.h \ libc_kernel.h \ + limits.h \ link.h \ list.h \ list_impl.h \ @@ -416,6 +422,9 @@ CHKHDRS= \ ontrap.h \ open.h \ openpromio.h \ + overlay.h \ + overlay_common.h \ + overlay_target.h \ panic.h \ param.h \ pathconf.h \ @@ -581,6 +590,7 @@ CHKHDRS= \ time_std_impl.h \ timeb.h \ timer.h \ + timerfd.h \ times.h \ timex.h \ timod.h \ @@ -635,6 +645,8 @@ CHKHDRS= \ vmem.h \ vmem_impl.h \ vmsystm.h \ + vnd.h \ + vnd_errno.h \ vnic.h \ vnic_impl.h \ vnode.h \ @@ -646,12 +658,14 @@ CHKHDRS= \ vuid_queue.h \ vuid_state.h \ vuid_store.h \ + vxlan.h \ wait.h \ waitq.h \ wanboot_impl.h \ watchpoint.h \ winlockio.h \ zcons.h \ + zfd.h \ zone.h \ xti_inet.h \ xti_osi.h \ @@ -852,13 +866,14 @@ FSHDRS= \ cachefs_log.h \ decomp.h \ dv_node.h \ - sdev_impl.h \ fifonode.h \ hsfs_isospec.h \ hsfs_node.h \ hsfs_rrip.h \ hsfs_spec.h \ hsfs_susp.h \ + hyprlofs.h \ + hyprlofs_info.h \ lofs_info.h \ lofs_node.h \ mntdata.h \ @@ -868,6 +883,8 @@ FSHDRS= \ pc_label.h \ pc_node.h \ pxfs_ki.h \ + sdev_impl.h \ + sdev_plugin.h \ snode.h \ swapnode.h \ tmp.h \ diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 5b3bba08c2..a02240ecea 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. */ #ifndef _SYS_AGGR_IMPL_H @@ -307,6 +308,8 @@ extern boolean_t aggr_port_notify_link(aggr_grp_t *, aggr_port_t *); extern void aggr_port_init_callbacks(aggr_port_t *); extern void aggr_recv_cb(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern void aggr_recv_promisc_cb(void *, mac_resource_handle_t, mblk_t *, + boolean_t); extern void aggr_tx_ring_update(void *, uintptr_t); extern void aggr_tx_notify_thread(void *); diff --git a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h index 3a2e705850..e875cd10d9 100644 --- a/usr/src/uts/common/sys/auxv.h +++ b/usr/src/uts/common/sys/auxv.h @@ -29,7 +29,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ #ifndef _SYS_AUXV_H @@ -78,6 +78,9 @@ typedef struct { #define AT_FLAGS 8 /* processor flags */ #define AT_ENTRY 9 /* a.out entry point */ +/* First introduced on Linux */ +#define AT_RANDOM 25 /* address of 16 random bytes */ + /* * These relate to the original PPC ABI document; Linux reused * the values for other things (see below), so disambiguation of @@ -110,6 +113,16 @@ typedef struct { * AT_UCACHEBSIZE 21 (moved from 12) * * AT_IGNOREPPC 22 + * + * On Linux: + * AT_* values 18 through 22 are reserved + * AT_SECURE 23 secure mode boolean + * AT_BASE_PLATFORM 24 string identifying real platform, may + * differ from AT_PLATFORM. 
+ * AT_HWCAP2 26 extension of AT_HWCAP + * AT_EXECFN 31 filename of program + * AT_SYSINFO 32 + * AT_SYSINFO_EHDR 33 The vDSO location */ /* @@ -186,6 +199,8 @@ extern uint_t getisax(uint32_t *, uint_t); #define AT_SUN_BRAND_AUX1 2020 #define AT_SUN_BRAND_AUX2 2021 #define AT_SUN_BRAND_AUX3 2022 +#define AT_SUN_BRAND_AUX4 2025 +#define AT_SUN_BRAND_NROOT 2024 /* * Note that 2023 is reserved for the AT_SUN_HWCAP2 word defined above. diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h index badc3faff8..8183f643a4 100644 --- a/usr/src/uts/common/sys/brand.h +++ b/usr/src/uts/common/sys/brand.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _SYS_BRAND_H @@ -102,29 +103,98 @@ struct brand_mach_ops; struct intpdata; struct execa; +/* + * Common structure to define hooks for brand operation. + * + * Required Fields: + * b_init_brand_data - Set up zone brand data during zone_setbrand + * b_free_brand_data - Free zone brand data during zone_destroy + * b_brandsys - Syscall handler for brandsys + * b_setbrand - Initialize process brand data + * b_getattr - Get brand-custom zone attribute + * b_setattr - Set brand-custom zone attribute + * b_copy_procdata - Copy process brand data during fork + * b_proc_exit - Perform process brand exit processing + * b_exec - Reset branded process state on exec + * b_lwp_setrval - Set return code for forked child + * b_initlwp - Initialize lwp brand data + * b_forklwp - Copy lwp brand data during fork + * b_freelwp - Free lwp brand data + * b_lwpexit - Perform lwp-specific brand exit processing + * b_elfexec - Load and execute ELF binary + * b_sigset_native_to_brand - Convert sigset native->brand + * b_sigset_brand_to_native - Convert sigset brand->native + * b_nsig - Maximum signal number + * b_sendsig - Update process state after sendsig + * + * Optional Fields: + * b_lwpdata_alloc - Speculatively allocate data for use in b_initlwp + * b_lwpdata_free - Free data allocated by b_lwpdata_alloc if errors occur + * during lwp creation before b_initlwp could be called.
+ * b_exit_with_sig - Instead of sending SIGCLD, exit with custom behavior + * b_psig_to_proc - Custom additional behavior during psig + * b_wait_filter - Filter processes from being matched by waitid + * b_native_exec - Provide interpreter path prefix for executables + * b_ptrace_exectrap - Custom behavior for legacy ptrace traps + * b_map32limit - Specify alternate limit for MAP_32BIT mappings + * b_stop_notify - Hook process stop events + * b_waitid_helper - Generate synthetic results for waitid + * b_sigcld_repost - Post synthetic SIGCLD signals + * b_issig_stop - Alter/suppress signal delivery during issig + * b_sig_ignorable - Disallow discarding of signals + * b_savecontext - Alter context during savecontext + * b_restorecontext - Alter context during restorecontext + * b_sendsig_stack - Override stack used for signal delivery + * b_setid_clear - Override setid_clear behavior + * b_pagefault - Trap pagefault events + */ struct brand_ops { void (*b_init_brand_data)(zone_t *); void (*b_free_brand_data)(zone_t *); int (*b_brandsys)(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t); void (*b_setbrand)(struct proc *); int (*b_getattr)(zone_t *, int, void *, size_t *); int (*b_setattr)(zone_t *, int, void *, size_t); void (*b_copy_procdata)(struct proc *, struct proc *); - void (*b_proc_exit)(struct proc *, klwp_t *); + void (*b_proc_exit)(struct proc *); void (*b_exec)(); void (*b_lwp_setrval)(klwp_t *, int, int); - int (*b_initlwp)(klwp_t *); + void *(*b_lwpdata_alloc)(struct proc *); + void (*b_lwpdata_free)(void *); + void (*b_initlwp)(klwp_t *, void *); void (*b_forklwp)(klwp_t *, klwp_t *); void (*b_freelwp)(klwp_t *); void (*b_lwpexit)(klwp_t *); int (*b_elfexec)(struct vnode *vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, long *execsz, int setid, caddr_t exec_file, - struct cred *cred, int brand_action); + struct cred *cred, int *brand_action); void (*b_sigset_native_to_brand)(sigset_t *); void (*b_sigset_brand_to_native)(sigset_t *); + void (*b_psig_to_proc)(proc_t *, kthread_t *, int); int b_nsig; + void (*b_exit_with_sig)(proc_t *, sigqueue_t *); + boolean_t (*b_wait_filter)(proc_t *, proc_t *); + boolean_t (*b_native_exec)(uint8_t, const char **); + void (*b_ptrace_exectrap)(proc_t *); + uint32_t (*b_map32limit)(proc_t *); + void (*b_stop_notify)(proc_t *, klwp_t *, ushort_t, ushort_t); + int (*b_waitid_helper)(idtype_t, id_t, k_siginfo_t *, int, + boolean_t *, int *); + int (*b_sigcld_repost)(proc_t *, sigqueue_t *); + int (*b_issig_stop)(proc_t *, klwp_t *); + boolean_t (*b_sig_ignorable)(proc_t *, int); + void (*b_savecontext)(ucontext_t *); +#if defined(_SYSCALL32_IMPL) + void (*b_savecontext32)(ucontext32_t *); +#endif + void (*b_restorecontext)(ucontext_t *); + caddr_t (*b_sendsig_stack)(int); + void (*b_sendsig)(int); + int (*b_setid_clear)(vattr_t *vap, cred_t *cr); + int (*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type, + enum seg_rw); }; /* @@ -135,6 +205,7 @@ typedef struct brand { char *b_name; struct brand_ops *b_ops; struct brand_mach_ops *b_machops; + size_t b_data_size; } brand_t; extern brand_t native_brand; @@ -166,7 +237,7 @@ extern brand_t *brand_find_name(char *); extern void brand_unregister_zone(brand_t *); extern int brand_zone_count(brand_t *); extern void brand_setbrand(proc_t *); -extern void brand_clearbrand(proc_t *, boolean_t); +extern void brand_clearbrand(proc_t *); /* * The following functions can be shared among kernel brand modules which @@ 
-178,17 +249,16 @@ extern int brand_solaris_cmd(int, uintptr_t, uintptr_t, uintptr_t, extern void brand_solaris_copy_procdata(proc_t *, proc_t *, struct brand *); extern int brand_solaris_elfexec(vnode_t *, execa_t *, uarg_t *, - intpdata_t *, int, long *, int, caddr_t, cred_t *, int, - struct brand *, char *, char *, char *, char *, char *); + intpdata_t *, int, long *, int, caddr_t, cred_t *, int *, + struct brand *, char *, char *, char *); extern void brand_solaris_exec(struct brand *); extern int brand_solaris_fini(char **, struct modlinkage *, struct brand *); extern void brand_solaris_forklwp(klwp_t *, klwp_t *, struct brand *); extern void brand_solaris_freelwp(klwp_t *, struct brand *); -extern int brand_solaris_initlwp(klwp_t *, struct brand *); +extern void brand_solaris_initlwp(klwp_t *, struct brand *); extern void brand_solaris_lwpexit(klwp_t *, struct brand *); -extern void brand_solaris_proc_exit(struct proc *, klwp_t *, - struct brand *); +extern void brand_solaris_proc_exit(struct proc *, struct brand *); extern void brand_solaris_setbrand(proc_t *, struct brand *); #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h index a9191aed7c..cb8a6012fc 100644 --- a/usr/src/uts/common/sys/buf.h +++ b/usr/src/uts/common/sys/buf.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -186,6 +187,7 @@ struct biostats { #define B_STARTED 0x2000000 /* io:::start probe called for buf */ #define B_ABRWRITE 0x4000000 /* Application based recovery active */ #define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */ +#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */ /* * There is some confusion over the meaning of B_FREE and B_INVAL and what @@ -198,6 +200,12 @@ struct biostats { * between the sole use of these two flags. In both cases, IO will be done * if the page is not yet committed to storage. * + * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is + * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no + * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then + * the mapping for the page is only invalidated for the current process. + * In this case, the page is not destroyed unless this was the final mapping. + * * In order to discard pages without writing them back, (B_INVAL | B_TRUNC) * should be used. * diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h index 6063ff4380..6bc042108c 100644 --- a/usr/src/uts/common/sys/cpucaps.h +++ b/usr/src/uts/common/sys/cpucaps.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_H @@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *); */ extern int cpucaps_project_set(kproject_t *, rctl_qty_t); extern int cpucaps_zone_set(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t); /* * Get current CPU usage for a project/zone. 
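The required/optional split in the brand_ops comment above implies a NULL-check dispatch convention: the kernel may call required hooks unconditionally on a branded process, but must test each optional hook before invoking it. A minimal sketch of that convention (illustrative only; PROC_IS_BRANDED() and the p_brand->b_ops linkage are assumed from the surrounding brand framework rather than shown in these hunks):

    /*
     * Illustrative dispatch of an optional brand hook: consult
     * b_wait_filter only when the process is branded and its brand
     * actually supplies the hook.
     */
    static boolean_t
    brand_wait_filtered(proc_t *pp, proc_t *cp)
    {
            if (PROC_IS_BRANDED(pp) &&
                pp->p_brand->b_ops->b_wait_filter != NULL)
                    return (pp->p_brand->b_ops->b_wait_filter(pp, cp));
            return (B_FALSE);       /* unbranded processes filter nothing */
    }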
 */
extern rctl_qty_t cpucaps_project_get(kproject_t *);
extern rctl_qty_t cpucaps_zone_get(zone_t *);
+extern rctl_qty_t cpucaps_zone_get_base(zone_t *);
+extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *);
/*
 * Scheduling class hooks into CPU caps framework.
diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h
index 95afd21827..2cd4ed644d 100644
--- a/usr/src/uts/common/sys/cpucaps_impl.h
+++ b/usr/src/uts/common/sys/cpucaps_impl.h
@@ -22,6 +22,7 @@
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
+ * Copyright 2011, 2012, Joyent, Inc. All rights reserved.
 */
#ifndef _SYS_CPUCAPS_IMPL_H
@@ -66,8 +67,12 @@ typedef struct cpucap {
waitq_t cap_waitq; /* waitq for capped threads */
kstat_t *cap_kstat; /* cpucaps specific kstat */
int64_t cap_gen; /* zone cap specific */
+ hrtime_t cap_chk_value; /* effective CPU usage cap */
hrtime_t cap_value; /* scaled CPU usage cap */
hrtime_t cap_usage; /* current CPU usage */
+ hrtime_t cap_base; /* base CPU for burst */
+ u_longlong_t cap_burst_limit; /* max secs (in ticks) for a burst */
+ u_longlong_t cap_bursting; /* # of ticks currently bursting */
disp_lock_t cap_usagelock; /* protects cap_usage above */
/*
 * Per cap statistics.
@@ -75,6 +80,7 @@ typedef struct cpucap {
hrtime_t cap_maxusage; /* maximum cap usage */
u_longlong_t cap_below; /* # of ticks spend below the cap */
u_longlong_t cap_above; /* # of ticks spend above the cap */
+ u_longlong_t cap_above_base; /* # of ticks spent above the base */
} cpucap_t;
/*
diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h
index 5056f9a511..914f132dc0 100644
--- a/usr/src/uts/common/sys/cred.h
+++ b/usr/src/uts/common/sys/cred.h
@@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *);
extern gid_t crgetrgid(const cred_t *);
extern gid_t crgetsgid(const cred_t *);
extern zoneid_t crgetzoneid(const cred_t *);
+extern zoneid_t crgetzonedid(const cred_t *);
extern projid_t crgetprojid(const cred_t *);
extern cred_t *crgetmapped(const cred_t *);
diff --git a/usr/src/uts/common/sys/ctf_api.h b/usr/src/uts/common/sys/ctf_api.h
index 04d73c3181..73beb2d244 100644
--- a/usr/src/uts/common/sys/ctf_api.h
+++ b/usr/src/uts/common/sys/ctf_api.h
@@ -24,7 +24,7 @@
 * Use is subject to license terms.
 */
/*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2015, Joyent, Inc.
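Taken together, the new cpucap_t fields describe a two-level cap: cap_base is what a zone may always consume, cap_value is the burst ceiling, cap_chk_value is whichever of the two is currently enforced, and cap_bursting counts how long usage has stayed above the base. A sketch of the per-tick bookkeeping these fields suggest (hypothetical helper; the actual enforcement lives in cpucaps.c, not in this header):

    /*
     * Hypothetical per-tick burst accounting using only the cpucap_t
     * fields declared above; illustrative, not the real implementation.
     */
    static void
    cap_burst_tick(cpucap_t *cap)
    {
            if (cap->cap_usage > cap->cap_base) {
                    cap->cap_above_base++;
                    if (++cap->cap_bursting > cap->cap_burst_limit)
                            cap->cap_chk_value = cap->cap_base; /* burst spent */
            } else {
                    cap->cap_bursting = 0;                  /* window resets */
                    cap->cap_chk_value = cap->cap_value;    /* may burst again */
            }
    }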
*/ /* @@ -60,6 +60,62 @@ extern "C" { typedef struct ctf_file ctf_file_t; typedef long ctf_id_t; +#define ECTF_BASE 1000 /* base value for libctf errnos */ + +enum { + ECTF_FMT = ECTF_BASE, /* file is not in CTF or ELF format */ + ECTF_ELFVERS, /* ELF version is more recent than libctf */ + ECTF_CTFVERS, /* CTF version is more recent than libctf */ + ECTF_ENDIAN, /* data is different endian-ness than lib */ + ECTF_SYMTAB, /* symbol table uses invalid entry size */ + ECTF_SYMBAD, /* symbol table data buffer invalid */ + ECTF_STRBAD, /* string table data buffer invalid */ + ECTF_CORRUPT, /* file data corruption detected */ + ECTF_NOCTFDATA, /* ELF file does not contain CTF data */ + ECTF_NOCTFBUF, /* buffer does not contain CTF data */ + ECTF_NOSYMTAB, /* symbol table data is not available */ + ECTF_NOPARENT, /* parent CTF container is not available */ + ECTF_DMODEL, /* data model mismatch */ + ECTF_MMAP, /* failed to mmap a data section */ + ECTF_ZMISSING, /* decompression library not installed */ + ECTF_ZINIT, /* failed to initialize decompression library */ + ECTF_ZALLOC, /* failed to allocate decompression buffer */ + ECTF_DECOMPRESS, /* failed to decompress CTF data */ + ECTF_STRTAB, /* string table for this string is missing */ + ECTF_BADNAME, /* string offset is corrupt w.r.t. strtab */ + ECTF_BADID, /* invalid type ID number */ + ECTF_NOTSOU, /* type is not a struct or union */ + ECTF_NOTENUM, /* type is not an enum */ + ECTF_NOTSUE, /* type is not a struct, union, or enum */ + ECTF_NOTINTFP, /* type is not an integer or float */ + ECTF_NOTARRAY, /* type is not an array */ + ECTF_NOTREF, /* type does not reference another type */ + ECTF_NAMELEN, /* buffer is too small to hold type name */ + ECTF_NOTYPE, /* no type found corresponding to name */ + ECTF_SYNTAX, /* syntax error in type name */ + ECTF_NOTFUNC, /* symtab entry does not refer to a function */ + ECTF_NOFUNCDAT, /* no func info available for function */ + ECTF_NOTDATA, /* symtab entry does not refer to a data obj */ + ECTF_NOTYPEDAT, /* no type info available for object */ + ECTF_NOLABEL, /* no label found corresponding to name */ + ECTF_NOLABELDATA, /* file does not contain any labels */ + ECTF_NOTSUP, /* feature not supported */ + ECTF_NOENUMNAM, /* enum element name not found */ + ECTF_NOMEMBNAM, /* member name not found */ + ECTF_RDONLY, /* CTF container is read-only */ + ECTF_DTFULL, /* CTF type is full (no more members allowed) */ + ECTF_FULL, /* CTF container is full */ + ECTF_DUPMEMBER, /* duplicate member name definition */ + ECTF_CONFLICT, /* conflicting type definition present */ + ECTF_REFERENCED, /* type has outstanding references */ + ECTF_NOTDYN, /* type is not a dynamic type */ + ECTF_ELF, /* elf library failure */ + ECTF_MCHILD, /* cannot merge child container */ + ECTF_LABELEXISTS, /* label already exists */ + ECTF_LCONFLICT, /* merged labels conflict */ + ECTF_ZLIB /* zlib library failure */ +}; + /* * If the debugger needs to provide the CTF library with a set of raw buffers * for use as the CTF data, symbol table, and string table, it can do so by @@ -143,19 +199,24 @@ typedef struct ctf_lblinfo { typedef int ctf_visit_f(const char *, ctf_id_t, ulong_t, int, void *); typedef int ctf_member_f(const char *, ctf_id_t, ulong_t, void *); typedef int ctf_enum_f(const char *, int, void *); -typedef int ctf_type_f(ctf_id_t, void *); +typedef int ctf_type_f(ctf_id_t, boolean_t, void *); typedef int ctf_label_f(const char *, const ctf_lblinfo_t *, void *); +typedef int ctf_function_f(const char *, ulong_t, 
ctf_funcinfo_t *, void *); +typedef int ctf_object_f(const char *, ctf_id_t, ulong_t, void *); +typedef int ctf_string_f(const char *, void *); extern ctf_file_t *ctf_bufopen(const ctf_sect_t *, const ctf_sect_t *, const ctf_sect_t *, int *); extern ctf_file_t *ctf_fdopen(int, int *); extern ctf_file_t *ctf_open(const char *, int *); extern ctf_file_t *ctf_create(int *); +extern ctf_file_t *ctf_fdcreate(int, int *); extern ctf_file_t *ctf_dup(ctf_file_t *); extern void ctf_close(ctf_file_t *); extern ctf_file_t *ctf_parent_file(ctf_file_t *); extern const char *ctf_parent_name(ctf_file_t *); +extern const char *ctf_parent_label(ctf_file_t *); extern int ctf_import(ctf_file_t *, ctf_file_t *); extern int ctf_setmodel(ctf_file_t *, int); @@ -165,15 +226,20 @@ extern void ctf_setspecific(ctf_file_t *, void *); extern void *ctf_getspecific(ctf_file_t *); extern int ctf_errno(ctf_file_t *); +extern uint_t ctf_flags(ctf_file_t *); extern const char *ctf_errmsg(int); extern int ctf_version(int); extern int ctf_func_info(ctf_file_t *, ulong_t, ctf_funcinfo_t *); +extern int ctf_func_info_by_id(ctf_file_t *, ctf_id_t, ctf_funcinfo_t *); extern int ctf_func_args(ctf_file_t *, ulong_t, uint_t, ctf_id_t *); +extern int ctf_func_args_by_id(ctf_file_t *, ctf_id_t, uint_t, ctf_id_t *); extern ctf_id_t ctf_lookup_by_name(ctf_file_t *, const char *); extern ctf_id_t ctf_lookup_by_symbol(ctf_file_t *, ulong_t); +extern char *ctf_symbol_name(ctf_file_t *, ulong_t, char *, size_t); + extern ctf_id_t ctf_type_resolve(ctf_file_t *, ctf_id_t); extern ssize_t ctf_type_lname(ctf_file_t *, ctf_id_t, char *, size_t); extern char *ctf_type_name(ctf_file_t *, ctf_id_t, char *, size_t); @@ -201,29 +267,39 @@ extern int ctf_label_info(ctf_file_t *, const char *, ctf_lblinfo_t *); extern int ctf_member_iter(ctf_file_t *, ctf_id_t, ctf_member_f *, void *); extern int ctf_enum_iter(ctf_file_t *, ctf_id_t, ctf_enum_f *, void *); -extern int ctf_type_iter(ctf_file_t *, ctf_type_f *, void *); +extern int ctf_type_iter(ctf_file_t *, boolean_t, ctf_type_f *, void *); extern int ctf_label_iter(ctf_file_t *, ctf_label_f *, void *); +extern int ctf_function_iter(ctf_file_t *, ctf_function_f *, void *); +extern int ctf_object_iter(ctf_file_t *, ctf_object_f *, void *); +extern int ctf_string_iter(ctf_file_t *, ctf_string_f *, void *); extern ctf_id_t ctf_add_array(ctf_file_t *, uint_t, const ctf_arinfo_t *); -extern ctf_id_t ctf_add_const(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_const(ctf_file_t *, uint_t, const char *, ctf_id_t); extern ctf_id_t ctf_add_enum(ctf_file_t *, uint_t, const char *); extern ctf_id_t ctf_add_float(ctf_file_t *, uint_t, const char *, const ctf_encoding_t *); extern ctf_id_t ctf_add_forward(ctf_file_t *, uint_t, const char *, uint_t); -extern ctf_id_t ctf_add_function(ctf_file_t *, uint_t, - const ctf_funcinfo_t *, const ctf_id_t *); +extern ctf_id_t ctf_add_funcptr(ctf_file_t *, uint_t, const ctf_funcinfo_t *, + const ctf_id_t *); extern ctf_id_t ctf_add_integer(ctf_file_t *, uint_t, const char *, const ctf_encoding_t *); -extern ctf_id_t ctf_add_pointer(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_pointer(ctf_file_t *, uint_t, const char *, ctf_id_t); extern ctf_id_t ctf_add_type(ctf_file_t *, ctf_file_t *, ctf_id_t); extern ctf_id_t ctf_add_typedef(ctf_file_t *, uint_t, const char *, ctf_id_t); -extern ctf_id_t ctf_add_restrict(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_restrict(ctf_file_t *, uint_t, const char *, ctf_id_t); extern ctf_id_t 
ctf_add_struct(ctf_file_t *, uint_t, const char *); extern ctf_id_t ctf_add_union(ctf_file_t *, uint_t, const char *); -extern ctf_id_t ctf_add_volatile(ctf_file_t *, uint_t, ctf_id_t); +extern ctf_id_t ctf_add_volatile(ctf_file_t *, uint_t, const char *, ctf_id_t); extern int ctf_add_enumerator(ctf_file_t *, ctf_id_t, const char *, int); -extern int ctf_add_member(ctf_file_t *, ctf_id_t, const char *, ctf_id_t); +extern int ctf_add_member(ctf_file_t *, ctf_id_t, const char *, ctf_id_t, + ulong_t); + + +extern int ctf_add_function(ctf_file_t *, ulong_t, const ctf_funcinfo_t *, + const ctf_id_t *); +extern int ctf_add_object(ctf_file_t *, ulong_t, ctf_id_t); +extern int ctf_add_label(ctf_file_t *, const char *, ctf_id_t, uint_t); extern int ctf_set_array(ctf_file_t *, ctf_id_t, const ctf_arinfo_t *); @@ -232,6 +308,7 @@ extern int ctf_delete_type(ctf_file_t *, ctf_id_t); extern int ctf_update(ctf_file_t *); extern int ctf_discard(ctf_file_t *); extern int ctf_write(ctf_file_t *, int); +extern void ctf_dataptr(ctf_file_t *, const void **, size_t *); #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/devpoll.h b/usr/src/uts/common/sys/devpoll.h index 36c815c69f..4e4c76d9b0 100644 --- a/usr/src/uts/common/sys/devpoll.h +++ b/usr/src/uts/common/sys/devpoll.h @@ -24,11 +24,13 @@ * All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_DEVPOLL_H #define _SYS_DEVPOLL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/poll_impl.h> #include <sys/types32.h> @@ -39,8 +41,10 @@ extern "C" { /* /dev/poll ioctl */ #define DPIOC (0xD0 << 8) -#define DP_POLL (DPIOC | 1) /* poll on fds in cached in /dev/poll */ +#define DP_POLL (DPIOC | 1) /* poll on fds cached via /dev/poll */ #define DP_ISPOLLED (DPIOC | 2) /* is this fd cached in /dev/poll */ +#define DP_PPOLL (DPIOC | 3) /* ppoll on fds cached via /dev/poll */ +#define DP_EPOLLCOMPAT (DPIOC | 4) /* turn on epoll compatibility */ #define DEVPOLLSIZE 1000 /* /dev/poll table size increment */ @@ -51,14 +55,21 @@ typedef struct dvpoll { pollfd_t *dp_fds; /* pollfd array */ nfds_t dp_nfds; /* num of pollfd's in dp_fds[] */ int dp_timeout; /* time out in milisec */ + sigset_t *dp_setp; /* sigset, if any */ } dvpoll_t; typedef struct dvpoll32 { caddr32_t dp_fds; /* pollfd array */ uint32_t dp_nfds; /* num of pollfd's in dp_fds[] */ int32_t dp_timeout; /* time out in milisec */ + caddr32_t dp_setp; /* sigset, if any */ } dvpoll32_t; +typedef struct dvpoll_epollfd { + pollfd_t dpep_pollfd; /* must be first member */ + uint64_t dpep_data; /* data payload */ +} dvpoll_epollfd_t; + #ifdef _KERNEL typedef struct dp_entry { @@ -71,6 +82,7 @@ typedef struct dp_entry { } dp_entry_t; #define DP_WRITER_PRESENT 0x1 /* a write is in progress */ +#define DP_ISEPOLLCOMPAT 0x2 /* epoll compatibility mode */ #define DP_REFRELE(dpep) { \ mutex_enter(&(dpep)->dpe_lock); \ diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h index f5c990e7c0..2178ad1f0d 100644 --- a/usr/src/uts/common/sys/dktp/dadk.h +++ b/usr/src/uts/common/sys/dktp/dadk.h @@ -65,6 +65,8 @@ struct dadk { kstat_t *dad_errstats; /* error stats */ kmutex_t dad_cmd_mutex; int dad_cmd_count; + uint32_t dad_err_cnt; /* number of recent errors */ + hrtime_t dad_last_log; /* time of last error log */ }; #define DAD_SECSIZ dad_phyg.g_secsiz diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index fb2a0749d3..4cd93be56e 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -21,6 +21,7 @@ /* 
* Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_DLD_H @@ -191,6 +192,7 @@ typedef struct dld_ioc_rename { datalink_id_t dir_linkid1; datalink_id_t dir_linkid2; char dir_link[MAXLINKNAMELEN]; + boolean_t dir_zoneinit; } dld_ioc_rename_t; /* @@ -203,6 +205,7 @@ typedef struct dld_ioc_rename { typedef struct dld_ioc_zid { zoneid_t diz_zid; datalink_id_t diz_linkid; + boolean_t diz_transient; } dld_ioc_zid_t; /* @@ -350,6 +353,7 @@ typedef struct dld_hwgrpinfo { */ typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t); +#define DI_DIRECT_RAW 0x1 /* * Direct Tx/Rx capability. */ @@ -374,6 +378,9 @@ typedef struct dld_capab_direct_s { /* flow control "can I put on a ring" callback */ uintptr_t di_tx_fctl_df; /* canput-like callback */ void *di_tx_fctl_dh; + + /* flags that control our behavior */ + uint_t di_flags; } dld_capab_direct_t; /* diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index a76a927e59..81708aad38 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -53,7 +53,8 @@ typedef enum { typedef enum { DLD_UNINITIALIZED, DLD_PASSIVE, - DLD_ACTIVE + DLD_ACTIVE, + DLD_EXCLUSIVE } dld_passivestate_t; /* @@ -256,6 +257,8 @@ extern void dld_str_rx_unitdata(void *, mac_resource_handle_t, extern void dld_str_notify_ind(dld_str_t *); extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *, uintptr_t, uint16_t); +extern mac_tx_cookie_t str_mdata_raw_fastpath_put(dld_str_t *, mblk_t *, + uintptr_t, uint16_t); extern int dld_flow_ctl_callb(dld_str_t *, uint64_t, int (*func)(), void *); diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h index 2f519a8eda..093a4dc0c3 100644 --- a/usr/src/uts/common/sys/dld_ioc.h +++ b/usr/src/uts/common/sys/dld_ioc.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_DLD_IOC_H @@ -59,6 +60,7 @@ extern "C" { #define IPTUN_IOC 0x454A #define BRIDGE_IOC 0xB81D #define IBPART_IOC 0x6171 +#define OVERLAY_IOC 0x2005 /* GLDv3 modules use these macros to generate unique ioctl commands */ #define DLDIOC(cmdid) DLD_IOC_CMD(DLD_IOC, (cmdid)) @@ -68,6 +70,7 @@ extern "C" { #define IPTUNIOC(cmdid) DLD_IOC_CMD(IPTUN_IOC, (cmdid)) #define BRIDGEIOC(cmdid) DLD_IOC_CMD(BRIDGE_IOC, (cmdid)) #define IBPARTIOC(cmdid) DLD_IOC_CMD(IBPART_IOC, (cmdid)) +#define OVERLAYIOC(cmdid) DLD_IOC_CMD(OVERLAY_IOC, (cmdid)) #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 5bc2bd41c5..dddac5b878 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -107,6 +107,7 @@ typedef struct dl_ipnetinfo { #define DL_PASSIVE_REQ 0x114 /* Allow access to aggregated link */ #define DL_INTR_MODE_REQ 0x115 /* Request Rx processing in INTR mode */ #define DL_NOTIFY_CONF 0x116 /* Notification from upstream */ +#define DL_EXCLUSIVE_REQ 0x117 /* Make bind active */ /* * Primitives used for Connectionless Service @@ -388,6 +389,7 @@ typedef struct dl_ipnetinfo { #define DL_PROMISC_PHYS 0x01 /* promiscuous mode at phys level */ #define DL_PROMISC_SAP 0x02 /* promiscuous mode at sap level */ #define DL_PROMISC_MULTI 0x03 /* promiscuous mode for multicast */ +#define DL_PROMISC_RX_ONLY 0x04 /* above only enabled for rx */ /* * DLPI notification codes for DL_NOTIFY_REQ primitives. 
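The dp_setp field added to dvpoll_t above makes DP_PPOLL to DP_POLL what ppoll(2) is to poll(2): a non-NULL dp_setp is installed as the signal mask for the duration of the wait. A userland sketch (error handling elided; it assumes the pollfd_t entries of interest were already written to the /dev/poll descriptor):

    #include <sys/devpoll.h>
    #include <poll.h>
    #include <signal.h>
    #include <stropts.h>
    #include <unistd.h>

    /* Wait for events on cached fds, allowing only SIGINT to interrupt. */
    int
    wait_events(int dpfd, pollfd_t *evs, nfds_t nevs)
    {
            dvpoll_t dv;
            sigset_t set;

            (void) sigfillset(&set);
            (void) sigdelset(&set, SIGINT);

            dv.dp_fds = evs;
            dv.dp_nfds = nevs;
            dv.dp_timeout = -1;     /* block until an event arrives */
            dv.dp_setp = &set;      /* NULL here degenerates to DP_POLL */

            return (ioctl(dpfd, DP_PPOLL, &dv));
    }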
@@ -1107,6 +1109,13 @@ typedef struct { } dl_intr_mode_req_t; /* + * DL_EXCLUSIVE_REQ, M_PROTO type + */ +typedef struct { + t_uscalar_t dl_primitive; +} dl_exclusive_req_t; + +/* * CONNECTION-ORIENTED SERVICE PRIMITIVES */ @@ -1528,6 +1537,7 @@ union DL_primitives { dl_control_ack_t control_ack; dl_passive_req_t passive_req; dl_intr_mode_req_t intr_mode_req; + dl_exclusive_req_t exclusive_req; }; #define DL_INFO_REQ_SIZE sizeof (dl_info_req_t) @@ -1596,6 +1606,7 @@ union DL_primitives { #define DL_CONTROL_ACK_SIZE sizeof (dl_control_ack_t) #define DL_PASSIVE_REQ_SIZE sizeof (dl_passive_req_t) #define DL_INTR_MODE_REQ_SIZE sizeof (dl_intr_mode_req_t) +#define DL_EXCLUSIVE_REQ_SIZE sizeof (dl_exclusive_req_t) #ifdef _KERNEL /* diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index 6bd2bbe35a..8e99d0e9d8 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ #ifndef _SYS_DLS_H @@ -85,6 +86,7 @@ typedef struct dls_link_s dls_link_t; #define DLS_PROMISC_SAP 0x00000001 #define DLS_PROMISC_MULTI 0x00000002 #define DLS_PROMISC_PHYS 0x00000004 +#define DLS_PROMISC_RX_ONLY 0x00000008 extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *); extern void dls_close(dld_str_t *); @@ -106,11 +108,13 @@ extern void str_notify(void *, mac_notify_type_t); extern int dls_devnet_open(const char *, dls_dl_handle_t *, dev_t *); +extern int dls_devnet_open_in_zone(const char *, + dls_dl_handle_t *, dev_t *, zoneid_t); extern void dls_devnet_close(dls_dl_handle_t); extern boolean_t dls_devnet_rebuild(); extern int dls_devnet_rename(datalink_id_t, datalink_id_t, - const char *); + const char *, boolean_t); extern int dls_devnet_create(mac_handle_t, datalink_id_t, zoneid_t); extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *, @@ -127,7 +131,7 @@ extern uint16_t dls_devnet_vid(dls_dl_handle_t); extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t); extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *); extern int dls_devnet_phydev(datalink_id_t, dev_t *); -extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t); +extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t); extern zoneid_t dls_devnet_getzid(dls_dl_handle_t); extern zoneid_t dls_devnet_getownerzid(dls_dl_handle_t); extern boolean_t dls_devnet_islinkvisible(datalink_id_t, zoneid_t); @@ -141,6 +145,8 @@ extern int dls_mgmt_update(const char *, uint32_t, boolean_t, extern int dls_mgmt_get_linkinfo(datalink_id_t, char *, datalink_class_t *, uint32_t *, uint32_t *); extern int dls_mgmt_get_linkid(const char *, datalink_id_t *); +extern int dls_mgmt_get_linkid_in_zone(const char *, + datalink_id_t *, zoneid_t); extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t, datalink_media_t, uint32_t); extern int dls_devnet_macname2linkid(const char *, diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 60f51c47b5..d502b36a2d 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
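Since dl_exclusive_req_t carries nothing beyond the primitive itself, issuing DL_EXCLUSIVE_REQ amounts to a one-word M_PROTO message. A kernel-side sketch of its construction (illustrative; standard STREAMS usage from <sys/stream.h>, with failure reduced to a NULL return):

    /* Build a DL_EXCLUSIVE_REQ message, ready for putnext() downstream. */
    static mblk_t *
    dl_exclusive_req_alloc(void)
    {
            dl_exclusive_req_t *dlp;
            mblk_t *mp;

            if ((mp = allocb(DL_EXCLUSIVE_REQ_SIZE, BPRI_HI)) == NULL)
                    return (NULL);
            mp->b_datap->db_type = M_PROTO;
            dlp = (dl_exclusive_req_t *)mp->b_rptr;
            dlp->dl_primitive = DL_EXCLUSIVE_REQ;
            mp->b_wptr = mp->b_rptr + DL_EXCLUSIVE_REQ_SIZE;
            return (mp);
    }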
*/ #ifndef _SYS_DLS_IMPL_H @@ -61,6 +62,7 @@ struct dls_link_s { /* Protected by */ uint_t dl_zone_ref; link_tagmode_t dl_tagmode; /* atomic */ uint_t dl_nonip_cnt; /* SL */ + uint_t dl_exclusive; /* SL */ }; typedef struct dls_head_s { @@ -96,7 +98,8 @@ extern void dls_create_str_kstats(dld_str_t *); extern int dls_stat_update(kstat_t *, dls_link_t *, int); extern int dls_stat_create(const char *, int, const char *, zoneid_t, int (*)(struct kstat *, int), void *, - kstat_t **); + kstat_t **, zoneid_t); +extern void dls_stat_delete(kstat_t *); extern int dls_devnet_open_by_dev(dev_t, dls_link_t **, dls_dl_handle_t *); @@ -126,6 +129,7 @@ extern void dls_mgmt_init(void); extern void dls_mgmt_fini(void); extern int dls_mgmt_get_phydev(datalink_id_t, dev_t *); +extern int dls_exclusive_set(dld_str_t *, boolean_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h index b4032c24d6..f747e29def 100644 --- a/usr/src/uts/common/sys/dls_mgmt.h +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _DLS_MGMT_H @@ -46,13 +47,15 @@ typedef enum { DATALINK_CLASS_SIMNET = 0x20, DATALINK_CLASS_BRIDGE = 0x40, DATALINK_CLASS_IPTUN = 0x80, - DATALINK_CLASS_PART = 0x100 + DATALINK_CLASS_PART = 0x100, + DATALINK_CLASS_OVERLAY = 0x200 } datalink_class_t; #define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \ DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \ DATALINK_CLASS_ETHERSTUB | DATALINK_CLASS_SIMNET | \ - DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART) + DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART | \ + DATALINK_CLASS_OVERLAY) /* * A combination of flags and media. @@ -165,6 +168,7 @@ typedef struct dlmgmt_door_getname { typedef struct dlmgmt_door_getlinkid { int ld_cmd; char ld_link[MAXLINKNAMELEN]; + zoneid_t ld_zoneid; } dlmgmt_door_getlinkid_t; typedef struct dlmgmt_door_getnext_s { diff --git a/usr/src/uts/common/sys/epoll.h b/usr/src/uts/common/sys/epoll.h new file mode 100644 index 0000000000..f2e4b90ab7 --- /dev/null +++ b/usr/src/uts/common/sys/epoll.h @@ -0,0 +1,89 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_EPOLL_H +#define _SYS_EPOLL_H + +#include <sys/types.h> +#include <sys/poll.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef union epoll_data { + void *ptr; + int fd; + uint32_t u32; + uint64_t u64; +} epoll_data_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct epoll_event { + uint32_t events; /* events */ + epoll_data_t data; /* user-specified data */ +} epoll_event_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +/* + * Define the EPOLL* constants in terms of their poll(2)/poll(7) equivalents. + * Note that the values match the equivalents in Linux to allow for any binary + * compatibility layers to not need to translate them. 
+ */ +#define EPOLLIN 0x0001 +#define EPOLLPRI 0x0002 +#define EPOLLOUT 0x0004 +#define EPOLLRDNORM 0x0040 +#define EPOLLRDBAND 0x0080 +#define EPOLLWRNORM 0x0100 +#define EPOLLWRBAND 0x0200 +#define EPOLLMSG 0x0400 /* not used */ +#define EPOLLERR 0x0008 +#define EPOLLHUP 0x0010 +#define EPOLLRDHUP 0x2000 + +#define EPOLLWAKEUP (1UL << 29) /* no meaning; silently ignored */ +#define EPOLLONESHOT (1UL << 30) /* translated to POLLONESHOT */ +#define EPOLLET (1UL << 31) /* translated to POLLET */ + +#define EPOLL_CTL_ADD 1 +#define EPOLL_CTL_DEL 2 +#define EPOLL_CTL_MOD 3 + +#define EPOLL_CLOEXEC 02000000 + +#if !defined(_KERNEL) + +extern int epoll_create(int size); +extern int epoll_create1(int flags); +extern int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event); +extern int epoll_wait(int epfd, struct epoll_event *events, + int maxevents, int timeout); +extern int epoll_pwait(int epfd, struct epoll_event *events, + int maxevents, int timeout, const sigset_t *sigmask); + +#endif /* !_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_EPOLL_H */ diff --git a/usr/src/uts/common/sys/eventfd.h b/usr/src/uts/common/sys/eventfd.h new file mode 100644 index 0000000000..1b0d961b0b --- /dev/null +++ b/usr/src/uts/common/sys/eventfd.h @@ -0,0 +1,68 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ + +/* + * Header file to support for the eventfd facility. Note that this facility + * is designed to be binary compatible with the Linux eventfd facility; values + * for constants here should therefore exactly match those found in Linux, and + * this facility shouldn't be extended independently of Linux. + */ + +#ifndef _SYS_EVENTFD_H +#define _SYS_EVENTFD_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint64_t eventfd_t; + +/* + * To assure binary compatibility with Linux, these values are fixed at their + * Linux equivalents, not their native ones. + */ +#define EFD_CLOEXEC 02000000 /* LX_O_CLOEXEC */ +#define EFD_NONBLOCK 04000 /* LX_O_NONBLOCK */ +#define EFD_SEMAPHORE 1 + +/* + * These ioctl values are specific to the native implementation; applications + * shouldn't be using them directly, and they should therefore be safe to + * change without breaking apps. + */ +#define EVENTFDIOC (('e' << 24) | ('f' << 16) | ('d' << 8)) +#define EVENTFDIOC_SEMAPHORE (EVENTFDIOC | 1) /* toggle sem state */ + +#ifndef _KERNEL + +extern int eventfd(unsigned int, int); +extern int eventfd_read(int, eventfd_t *); +extern int eventfd_write(int, eventfd_t); + +#else + +#define EVENTFDMNRN_EVENTFD 0 +#define EVENTFDMNRN_CLONE 1 +#define EVENTFD_VALMAX (ULLONG_MAX - 1ULL) + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_EVENTFD_H */ diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h index d36bc20481..de01910b5a 100644 --- a/usr/src/uts/common/sys/exec.h +++ b/usr/src/uts/common/sys/exec.h @@ -26,6 +26,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright 2015, Joyent, Inc. 
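Both new headers above declare userland entry points, and the two facilities compose as they do on Linux: an eventfd descriptor becomes readable whenever its counter is nonzero, and a read returns and clears the counter (or decrements it by one under EFD_SEMAPHORE). A short sketch (error handling elided):

    #include <sys/epoll.h>
    #include <sys/eventfd.h>

    int
    demo(void)
    {
            struct epoll_event ev, out;
            eventfd_t val;
            int epfd = epoll_create1(EPOLL_CLOEXEC);
            int efd = eventfd(0, EFD_NONBLOCK);

            ev.events = EPOLLIN;
            ev.data.fd = efd;
            (void) epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);

            (void) eventfd_write(efd, 1);   /* counter becomes nonzero... */

            /* ...so the descriptor now polls readable */
            if (epoll_wait(epfd, &out, 1, -1) == 1 && (out.events & EPOLLIN))
                    (void) eventfd_read(efd, &val); /* val == 1; counter clears */
            return (0);
    }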
+ */ + #ifndef _SYS_EXEC_H #define _SYS_EXEC_H @@ -102,10 +106,12 @@ typedef struct uarg { vnode_t *ex_vp; char *emulator; char *brandname; + const char *brand_nroot; char *auxp_auxflags; /* addr of auxflags auxv on the user stack */ char *auxp_brand; /* address of first brand auxv on user stack */ cred_t *pfcred; boolean_t scrubenv; + uintptr_t maxstack; } uarg_t; /* @@ -174,7 +180,7 @@ struct execsw { int (*exec_func)(struct vnode *vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, long *execsz, int setid, caddr_t exec_file, - struct cred *cred, int brand_action); + struct cred *cred, int *brand_action); int (*exec_core)(struct vnode *vp, struct proc *p, struct cred *cred, rlim64_t rlimit, int sig, core_content_t content); @@ -212,7 +218,7 @@ extern int exec_common(const char *fname, const char **argp, const char **envp, int brand_action); extern int gexec(vnode_t **vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, long *execsz, caddr_t exec_file, - struct cred *cred, int brand_action); + struct cred *cred, int *brand_action); extern struct execsw *allocate_execsw(char *name, char *magic, size_t magic_size); extern struct execsw *findexecsw(char *magic); @@ -237,16 +243,18 @@ extern void exec_set_sp(size_t); * when compiling the 32-bit compatability elf code in the elfexec module. */ extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + long *, int, caddr_t, cred_t *, int *); extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *, Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); #endif /* !_ELF32_COMPAT */ #if defined(_LP64) extern int elf32exec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + long *, int, caddr_t, cred_t *, int *); extern int mapexec32_brand(vnode_t *, uarg_t *, Elf32_Ehdr *, Elf32_Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); #endif /* _LP64 */ /* diff --git a/usr/src/uts/common/sys/fcntl.h b/usr/src/uts/common/sys/fcntl.h index aa74cab8b5..5ddade90e4 100644 --- a/usr/src/uts/common/sys/fcntl.h +++ b/usr/src/uts/common/sys/fcntl.h @@ -37,6 +37,7 @@ */ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ +/* Copyright 2015, Joyent, Inc. */ #ifndef _SYS_FCNTL_H #define _SYS_FCNTL_H @@ -145,6 +146,7 @@ extern "C" { * the large and small file environments; therefore, the #defined values must * as well. * The NBMAND forms are private and should not be used. + * The FLOCK forms are also private and should not be used. 
*/ #if defined(_LP64) || _FILE_OFFSET_BITS == 32 @@ -155,6 +157,13 @@ extern "C" { #define F_FREESP 11 /* Free file space */ #define F_GETLK 14 /* Get file lock */ #define F_SETLK_NBMAND 42 /* private */ +#if !defined(_STRICT_SYMBOLS) +#define F_OFD_GETLK 47 /* Get file lock owned by file */ +#define F_OFD_SETLK 48 /* Set file lock owned by file */ +#define F_OFD_SETLKW 49 /* Set file lock owned by file and wait */ +#define F_FLOCK 53 /* private - set flock owned by file */ +#define F_FLOCKW 54 /* private - set flock owned by file and wait */ +#endif /* _STRICT_SYMBOLS */ #else /* ILP32 large file application compilation environment version */ #define F_SETLK 34 /* Set file lock */ @@ -163,6 +172,13 @@ extern "C" { #define F_FREESP 27 /* Free file space */ #define F_GETLK 33 /* Get file lock */ #define F_SETLK_NBMAND 44 /* private */ +#if !defined(_STRICT_SYMBOLS) +#define F_OFD_GETLK 50 /* Get file lock owned by file */ +#define F_OFD_SETLK 51 /* Set file lock owned by file */ +#define F_OFD_SETLKW 52 /* Set file lock owned by file and wait */ +#define F_FLOCK 55 /* private - set flock owned by file */ +#define F_FLOCKW 56 /* private - set flock owned by file and wait */ +#endif /* _STRICT_SYMBOLS */ #endif /* _LP64 || _FILE_OFFSET_BITS == 32 */ #if defined(_LARGEFILE64_SOURCE) @@ -180,6 +196,13 @@ extern "C" { #define F_FREESP64 27 /* Free file space */ #define F_GETLK64 33 /* Get file lock */ #define F_SETLK64_NBMAND 44 /* private */ +#if !defined(_STRICT_SYMBOLS) +#define F_OFD_GETLK64 50 /* Get file lock owned by file */ +#define F_OFD_SETLK64 51 /* Set file lock owned by file */ +#define F_OFD_SETLKW64 52 /* Set file lock owned by file and wait */ +#define F_FLOCK64 55 /* private - set flock owned by file */ +#define F_FLOCKW64 56 /* private - set flock owned by file and wait */ +#endif /* _STRICT_SYMBOLS */ #else #define F_SETLK64 6 /* Set file lock */ #define F_SETLKW64 7 /* Set file lock and wait */ @@ -187,6 +210,13 @@ extern "C" { #define F_FREESP64 11 /* Free file space */ #define F_GETLK64 14 /* Get file lock */ #define F_SETLK64_NBMAND 42 /* private */ +#if !defined(_STRICT_SYMBOLS) +#define F_OFD_GETLK64 47 /* Get file lock owned by file */ +#define F_OFD_SETLK64 48 /* Set file lock owned by file */ +#define F_OFD_SETLKW64 49 /* Set file lock owned by file and wait */ +#define F_FLOCK64 53 /* private - set flock owned by file */ +#define F_FLOCKW64 54 /* private - set flock owned by file and wait */ +#endif /* _STRICT_SYMBOLS */ #endif /* !_LP64 || _KERNEL */ #endif /* _LARGEFILE64_SOURCE */ diff --git a/usr/src/uts/common/sys/file.h b/usr/src/uts/common/sys/file.h index 03acc088c2..7e297042af 100644 --- a/usr/src/uts/common/sys/file.h +++ b/usr/src/uts/common/sys/file.h @@ -27,6 +27,7 @@ /* All Rights Reserved */ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ +/* Copyright 2015 Joyent, Inc. */ #ifndef _SYS_FILE_H #define _SYS_FILE_H @@ -54,7 +55,7 @@ extern "C" { */ /* * One file structure is allocated for each open/creat/pipe call. - * Main use is to hold the read/write pointer associated with + * Main use is to hold the read/write pointer (and OFD locks) associated with * each open file. 
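The F_OFD_* commands take the same struct flock as F_SETLK and friends; what changes is ownership. As the sys/file.h change in this same diff notes, the lock belongs to the open file description (the file_t) rather than to the process, so it survives fork() and is dropped only when the last reference to that description is closed. A userland sketch (error handling elided):

    #include <fcntl.h>
    #include <string.h>

    /* Take a non-blocking, whole-file write lock owned by the description. */
    int
    ofd_lock(int fd)
    {
            struct flock fl;

            (void) memset(&fl, 0, sizeof (fl));
            fl.l_type = F_WRLCK;
            fl.l_whence = SEEK_SET;
            fl.l_start = 0;
            fl.l_len = 0;           /* zero length covers the whole file */

            return (fcntl(fd, F_OFD_SETLK, &fl));   /* F_OFD_SETLKW blocks */
    }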
*/ typedef struct file { @@ -66,6 +67,7 @@ typedef struct file { struct cred *f_cred; /* credentials of user who opened it */ struct f_audit_data *f_audit_data; /* file audit data */ int f_count; /* reference count */ + struct filock *f_filock; /* ptr to single lock_descriptor_t */ } file_t; /* @@ -120,6 +122,11 @@ typedef struct fpollinfo { #ifdef _KERNEL /* + * This is a flag that is set on f_flag2, but is never user-visible + */ +#define FEPOLLED 0x8000 + +/* * Fake flags for driver ioctl calls to inform them of the originating * process' model. See <sys/model.h> * @@ -168,6 +175,19 @@ typedef struct fpollinfo { #define L_SET 0 /* for lseek */ #endif /* L_SET */ +/* + * For flock(3C). These really don't belong here but for historical reasons + * the interface defines them to be here. + */ +#define LOCK_SH 1 +#define LOCK_EX 2 +#define LOCK_NB 4 +#define LOCK_UN 8 + +#if !defined(_STRICT_SYMBOLS) +extern int flock(int, int); +#endif + #if defined(_KERNEL) /* diff --git a/usr/src/uts/common/sys/flock.h b/usr/src/uts/common/sys/flock.h index 3ea7afb23b..39542a4c16 100644 --- a/usr/src/uts/common/sys/flock.h +++ b/usr/src/uts/common/sys/flock.h @@ -29,6 +29,7 @@ */ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_FLOCK_H @@ -41,6 +42,9 @@ #include <sys/callb.h> #include <sys/param.h> #include <sys/zone.h> +#if defined(_KERNEL) +#include <sys/file.h> +#endif #ifdef __cplusplus extern "C" { @@ -221,6 +225,8 @@ typedef struct locklist { #define FLK_QUERY_ACTIVE 0x1 #define FLK_QUERY_SLEEPING 0x2 +int ofdlock(file_t *, int, struct flock64 *, int, u_offset_t); +void ofdcleanlock(file_t *); int reclock(struct vnode *, struct flock64 *, int, int, u_offset_t, flk_callback_t *); int chklock(struct vnode *, int, u_offset_t, ssize_t, int, diff --git a/usr/src/uts/common/sys/flock_impl.h b/usr/src/uts/common/sys/flock_impl.h index d088192025..b1be1d598a 100644 --- a/usr/src/uts/common/sys/flock_impl.h +++ b/usr/src/uts/common/sys/flock_impl.h @@ -22,13 +22,12 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_FLOCK_IMPL_H #define _SYS_FLOCK_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/fcntl.h> /* flock definition */ #include <sys/file.h> /* FREAD etc */ @@ -83,6 +82,7 @@ struct lock_descriptor { flk_nlm_status_t l_nlm_state; /* state of NLM server */ flk_callback_t *l_callbacks; /* callbacks, or NULL */ zoneid_t l_zoneid; /* zone of request */ + file_t *l_ofd; /* OFD-style reference */ }; typedef struct lock_descriptor lock_descriptor_t; @@ -204,7 +204,8 @@ graph_t *flk_get_lock_graph(vnode_t *, int); #define SAME_OWNER(lock1, lock2) \ (((lock1)->l_flock.l_pid == (lock2)->l_flock.l_pid) && \ - ((lock1)->l_flock.l_sysid == (lock2)->l_flock.l_sysid)) + ((lock1)->l_flock.l_sysid == (lock2)->l_flock.l_sysid) && \ + ((lock1)->l_ofd == (lock2)->l_ofd)) #define COLORED(vertex) ((vertex)->l_color == (vertex)->l_graph->mark) #define COLOR(vertex) ((vertex)->l_color = (vertex)->l_graph->mark) diff --git a/usr/src/uts/common/sys/frameio.h b/usr/src/uts/common/sys/frameio.h new file mode 100644 index 0000000000..54e6dbeedf --- /dev/null +++ b/usr/src/uts/common/sys/frameio.h @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FRAMEIO_H +#define _SYS_FRAMEIO_H + +/* + * Frame I/O definitions + */ + +#include <sys/types.h> + +#ifdef _KERNEL +/* Kernel only headers */ +#include <sys/stream.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * An individual frame vector component. Collections of these are used to make + * ioctls. + */ +typedef struct framevec { + void *fv_buf; /* Buffer with data */ + size_t fv_buflen; /* Size of the buffer */ + size_t fv_actlen; /* Amount of buffer consumed, ignore on error */ +} framevec_t; + +/* + * The base unit used with frameio. + */ +typedef struct frameio { + uint_t fio_version; /* Should always be FRAMEIO_CURRENT_VERSION */ + uint_t fio_nvpf; /* How many vectors make up one frame */ + uint_t fio_nvecs; /* The total number of vectors */ + framevec_t fio_vecs[]; /* C99 VLA */ +} frameio_t; + + +#define FRAMEIO_VERSION_ONE 1 +#define FRAMEIO_CURRENT_VERSION FRAMEIO_VERSION_ONE + +#define FRAMEIO_NVECS_MAX 32 + +/* + * Definitions for kernel modules to include as helpers. These are consolidation + * private. + */ +#ifdef _KERNEL + +/* + * 32-bit versions for 64-bit kernels + */ +typedef struct framevec32 { + caddr32_t fv_buf; + size32_t fv_buflen; + size32_t fv_actlen; +} framevec32_t; + +typedef struct frameio32 { + uint_t fio_version; + uint_t fio_vecspframe; + uint_t fio_nvecs; + framevec32_t fio_vecs[]; +} frameio32_t; + +/* + * Describe the different ways that vectors should map to frames. + */ +typedef enum frameio_write_mblk_map { + MAP_BLK_FRAME +} frameio_write_mblk_map_t; + +int frameio_init(void); +void frameio_fini(void); +frameio_t *frameio_alloc(int); +void frameio_free(frameio_t *); +int frameio_hdr_copyin(frameio_t *, int, const void *, uint_t); +int frameio_mblk_chain_read(frameio_t *, mblk_t **, int *, int); +int frameio_mblk_chain_write(frameio_t *, frameio_write_mblk_map_t, mblk_t *, + int *, int); +int frameio_hdr_copyout(frameio_t *, int, void *, uint_t); +size_t frameio_frame_length(frameio_t *, framevec_t *); +void frameio_mark_consumed(frameio_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FRAMEIO_H */ diff --git a/usr/src/uts/common/sys/fs/bootfs_impl.h b/usr/src/uts/common/sys/fs/bootfs_impl.h new file mode 100644 index 0000000000..5726f1428a --- /dev/null +++ b/usr/src/uts/common/sys/fs/bootfs_impl.h @@ -0,0 +1,81 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. 
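Because fio_vecs[] is a flexible array member, a frameio_t is allocated with its vectors inline: fio_nvecs is the total vector count and fio_nvpf says how many consecutive vectors compose one frame. A sketch of the allocation arithmetic from a consumer's point of view (illustrative; the helper name is hypothetical, and whatever ioctl ultimately accepts the structure is outside this header):

    #include <sys/frameio.h>
    #include <stdlib.h>

    /* Allocate a frameio_t describing 'frames' frames of 'nvpf' vectors each. */
    frameio_t *
    frameio_user_alloc(uint_t frames, uint_t nvpf)
    {
            uint_t nvecs = frames * nvpf;
            frameio_t *fio;

            if (nvecs == 0 || nvecs > FRAMEIO_NVECS_MAX)
                    return (NULL);
            fio = calloc(1, sizeof (frameio_t) + nvecs * sizeof (framevec_t));
            if (fio == NULL)
                    return (NULL);
            fio->fio_version = FRAMEIO_CURRENT_VERSION;
            fio->fio_nvpf = nvpf;
            fio->fio_nvecs = nvecs;
            return (fio);           /* caller fills in fv_buf/fv_buflen */
    }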
+ */ + +#ifndef _SYS_FS_BOOTFS_IMPL_H +#define _SYS_FS_BOOTFS_IMPL_H + +#include <sys/types.h> +#include <sys/list.h> +#include <sys/avl.h> +#include <sys/vnode.h> +#include <sys/vfs_opreg.h> +#include <sys/kstat.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The bootfs node is the file system specific version of the vnode for the + * bootfs file system. Because the bootfs file system is entirely a read-only + * file system, this structure requires no locking as the contents are + * immutable. + */ +typedef struct bootfs_node { + char *bvn_name; /* entry name */ + struct vnode *bvn_vnp; /* Corresponding vnode */ + avl_tree_t bvn_dir; /* directory entries, if VDIR */ + avl_node_t bvn_link; /* dirent link */ + list_node_t bvn_alink; /* link for all nodes */ + uint64_t bvn_addr; /* Address in pmem */ + uint64_t bvn_size; /* Size of the file */ + struct bootfs_node *bvn_parent; /* .. */ + vattr_t bvn_attr; /* attributes for the node */ +} bootfs_node_t; + +typedef struct bootfs_stat { + kstat_named_t bfss_nfiles; + kstat_named_t bfss_ndirs; + kstat_named_t bfss_nbytes; + kstat_named_t bfss_ndups; + kstat_named_t bfss_ndiscards; +} bootfs_stat_t; + +typedef struct bootfs { + vfs_t *bfs_vfsp; + char *bfs_mntpath; + bootfs_node_t *bfs_rootvn; + kstat_t *bfs_kstat; + list_t bfs_nodes; + minor_t bfs_minor; + uint_t bfs_ninode; + bootfs_stat_t bfs_stat; +} bootfs_t; + +extern void bootfs_construct(bootfs_t *); +extern void bootfs_destruct(bootfs_t *); +extern int bootfs_node_constructor(void *, void *, int); +extern void bootfs_node_destructor(void *, void *); + +extern struct vnodeops *bootfs_vnodeops; +extern const fs_operation_def_t bootfs_vnodeops_template[]; +extern kmem_cache_t *bootfs_node_cache; +extern major_t bootfs_major; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_BOOTFS_IMPL_H */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h new file mode 100644 index 0000000000..b8c4149df2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_HYPRLOFS_H +#define _SYS_FS_HYPRLOFS_H + +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hyprlofs ioctl numbers. 
+ */
+#define HYPRLOFS_IOC ('H' << 8)
+
+#define HYPRLOFS_ADD_ENTRIES (HYPRLOFS_IOC | 1)
+#define HYPRLOFS_RM_ENTRIES (HYPRLOFS_IOC | 2)
+#define HYPRLOFS_RM_ALL (HYPRLOFS_IOC | 3)
+#define HYPRLOFS_GET_ENTRIES (HYPRLOFS_IOC | 4)
+
+typedef struct {
+ char *hle_path;
+ uint_t hle_plen;
+ char *hle_name;
+ uint_t hle_nlen;
+} hyprlofs_entry_t;
+
+typedef struct {
+ hyprlofs_entry_t *hle_entries;
+ uint_t hle_len;
+} hyprlofs_entries_t;
+
+typedef struct {
+ char hce_path[MAXPATHLEN];
+ char hce_name[MAXPATHLEN];
+} hyprlofs_curr_entry_t;
+
+typedef struct {
+ hyprlofs_curr_entry_t *hce_entries;
+ uint_t hce_cnt;
+} hyprlofs_curr_entries_t;
+
+#ifdef _KERNEL
+typedef struct {
+ caddr32_t hle_path;
+ uint_t hle_plen;
+ caddr32_t hle_name;
+ uint_t hle_nlen;
+} hyprlofs_entry32_t;
+
+typedef struct {
+ caddr32_t hle_entries;
+ uint_t hle_len;
+} hyprlofs_entries32_t;
+
+typedef struct {
+ caddr32_t hce_entries;
+ uint_t hce_cnt;
+} hyprlofs_curr_entries32_t;
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_HYPRLOFS_H */
diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h
new file mode 100644
index 0000000000..38389f77d9
--- /dev/null
+++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h
@@ -0,0 +1,174 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_FS_HYPRLOFS_INFO_H
+#define _SYS_FS_HYPRLOFS_INFO_H
+
+#include <sys/t_lock.h>
+#include <vm/seg.h>
+#include <vm/seg_vn.h>
+#include <sys/vfs_opreg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * hlnode is the file system dependent node for hyprlofs.
+ * It is modeled on the tmpfs tmpnode.
+ *
+ * hln_rwlock protects access to the directory list at hln_dir
+ * as well as synchronizing read/writes to directory hlnodes.
+ * hln_tlock protects updates to hln_mode and hln_nlink.
+ * hln_tlock doesn't require any hlnode locks.
+ */
+typedef struct hlnode {
+ struct hlnode *hln_back; /* linked list of hlnodes */
+ struct hlnode *hln_forw; /* linked list of hlnodes */
+ union {
+ struct {
+ struct hldirent *un_dirlist; /* dirent list */
+ uint_t un_dirents; /* number of dirents */
+ } un_dirstruct;
+ vnode_t *un_realvp; /* real vnode */
+ } un_hlnode;
+ vnode_t *hln_vnode; /* vnode for this hlnode */
+ int hln_gen; /* pseudo gen num for hlfid */
+ int hln_looped; /* flag indicating loopback */
+ vattr_t hln_attr; /* attributes */
+ krwlock_t hln_rwlock; /* rw - serialize mods and */
+ /* directory updates */
+ kmutex_t hln_tlock; /* time, flag, and nlink lock */
+} hlnode_t;
+
+/*
+ * hyprlofs per-mount data structure.
+ * All fields are protected by hlm_contents.
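The ioctls above are issued on a descriptor for the hyprlofs mount point itself; HYPRLOFS_ADD_ENTRIES maps existing files into the mount under caller-chosen names. A userland sketch (error handling elided; whether the lengths include the terminating NUL is a consumer convention, so this simply uses strlen()):

    #include <sys/fs/hyprlofs.h>
    #include <fcntl.h>
    #include <string.h>
    #include <stropts.h>
    #include <unistd.h>

    /* Expose 'path' inside the hyprlofs mount at 'mntpt' as 'name'. */
    int
    hyprlofs_map_one(const char *mntpt, char *path, char *name)
    {
            hyprlofs_entry_t e = { path, strlen(path), name, strlen(name) };
            hyprlofs_entries_t ents = { &e, 1 };
            int fd, ret;

            if ((fd = open(mntpt, O_RDONLY)) < 0)
                    return (-1);
            ret = ioctl(fd, HYPRLOFS_ADD_ENTRIES, &ents);
            (void) close(fd);
            return (ret);
    }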
+ */
+typedef struct {
+ vfs_t *hlm_vfsp; /* filesystem's vfs struct */
+ hlnode_t *hlm_rootnode; /* root hlnode */
+ char *hlm_mntpath; /* name of hyprlofs mount point */
+ dev_t hlm_dev; /* unique dev # of mounted `device' */
+ uint_t hlm_gen; /* pseudo generation number for files */
+ kmutex_t hlm_contents; /* lock for hlfsmount structure */
+} hlfsmount_t;
+
+/*
+ * hyprlofs directories are made up of a linked list of hldirent structures
+ * hanging off directory hlnodes. File names are not fixed length,
+ * but are null terminated.
+ */
+typedef struct hldirent {
+ hlnode_t *hld_hlnode; /* hlnode for this file */
+ struct hldirent *hld_next; /* next directory entry */
+ struct hldirent *hld_prev; /* prev directory entry */
+ uint_t hld_offset; /* "offset" of dir entry */
+ uint_t hld_hash; /* a hash of hld_name */
+ struct hldirent *hld_link; /* linked via the hash table */
+ hlnode_t *hld_parent; /* parent, dir we are in */
+ char *hld_name; /* must be null terminated */
+ /* max length is MAXNAMELEN */
+} hldirent_t;
+
+/*
+ * hlfid overlays the fid structure (for VFS_VGET)
+ */
+typedef struct {
+ uint16_t hlfid_len;
+ ino32_t hlfid_ino;
+ int32_t hlfid_gen;
+} hlfid_t;
+
+/*
+ * File system independent to hyprlofs conversion macros
+ */
+#define VFSTOHLM(vfsp) ((hlfsmount_t *)(vfsp)->vfs_data)
+#define VTOHLM(vp) ((hlfsmount_t *)(vp)->v_vfsp->vfs_data)
+#define VTOHLN(vp) ((hlnode_t *)(vp)->v_data)
+#define HLNTOV(tp) ((tp)->hln_vnode)
+#define REALVP(vp) ((vnode_t *)VTOHLN(vp)->hln_realvp)
+#define hlnode_hold(tp) VN_HOLD(HLNTOV(tp))
+#define hlnode_rele(tp) VN_RELE(HLNTOV(tp))
+
+#define hln_dir un_hlnode.un_dirstruct.un_dirlist
+#define hln_dirents un_hlnode.un_dirstruct.un_dirents
+#define hln_realvp un_hlnode.un_realvp
+
+/*
+ * Attributes
+ */
+#define hln_mask hln_attr.va_mask
+#define hln_type hln_attr.va_type
+#define hln_mode hln_attr.va_mode
+#define hln_uid hln_attr.va_uid
+#define hln_gid hln_attr.va_gid
+#define hln_fsid hln_attr.va_fsid
+#define hln_nodeid hln_attr.va_nodeid
+#define hln_nlink hln_attr.va_nlink
+#define hln_size hln_attr.va_size
+#define hln_atime hln_attr.va_atime
+#define hln_mtime hln_attr.va_mtime
+#define hln_ctime hln_attr.va_ctime
+#define hln_rdev hln_attr.va_rdev
+#define hln_blksize hln_attr.va_blksize
+#define hln_nblocks hln_attr.va_nblocks
+#define hln_seq hln_attr.va_seq
+
+/*
+ * enums
+ */
+enum de_op { DE_CREATE, DE_MKDIR }; /* direnter ops */
+enum dr_op { DR_REMOVE, DR_RMDIR }; /* dirremove ops */
+
+/*
+ * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs
+ * leaves free for the rest of the system. The default value for
+ * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a
+ * different number of pages. Since hyprlofs doesn't actually use much
+ * memory, it's unlikely this ever needs to be patched.
+ */ +#define HYPRLOFSMINFREE 8 * 1024 * 1024 /* 8 Megabytes */ + +extern size_t hyprlofs_minfree; /* Anonymous memory in pages */ + +extern void hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *, + cred_t *); +extern int hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *); +extern int hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op, + cred_t *); +extern void hyprlofs_dirinit(hlnode_t *, hlnode_t *); +extern void hyprlofs_dirtrunc(hlnode_t *); +extern int hyprlofs_taccess(void *, int, cred_t *); +extern int hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op, + vnode_t *, vattr_t *, hlnode_t **, cred_t *); + +extern struct vnodeops *hyprlofs_vnodeops; +extern const struct fs_operation_def hyprlofs_vnodeops_template[]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_INFO_H */ diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h index 19a147d88a..d5bd0162c7 100644 --- a/usr/src/uts/common/sys/fs/sdev_impl.h +++ b/usr/src/uts/common/sys/fs/sdev_impl.h @@ -35,6 +35,7 @@ extern "C" { #include <sys/vfs_opreg.h> #include <sys/list.h> #include <sys/nvpair.h> +#include <sys/fs/sdev_plugin.h> /* * sdev_nodes are the file-system specific part of the @@ -126,6 +127,21 @@ typedef struct sdev_local_data { struct sdev_dprof sdev_lprof; /* profile for multi-inst */ } sdev_local_data_t; +/* sdev_flags */ +typedef enum sdev_flags { + SDEV_BUILD = 0x0001, /* directory cache out-of-date */ + SDEV_GLOBAL = 0x0002, /* global /dev nodes */ + SDEV_PERSIST = 0x0004, /* backing store persisted node */ + SDEV_NO_NCACHE = 0x0008, /* do not include in neg. cache */ + SDEV_DYNAMIC = 0x0010, /* special-purpose vnode ops */ + /* (ex: pts) */ + SDEV_VTOR = 0x0020, /* validate sdev_nodes during search */ + SDEV_ATTR_INVALID = 0x0040, /* invalid node attributes, */ + /* need update */ + SDEV_SUBDIR = 0x0080, /* match all subdirs under here */ + SDEV_ZONED = 0x0100 /* zoned subdir */ +} sdev_flags_t; + /* * /dev filesystem sdev_node defines */ @@ -148,7 +164,7 @@ typedef struct sdev_node { ino64_t sdev_ino; /* inode */ uint_t sdev_nlink; /* link count */ int sdev_state; /* state of this node */ - int sdev_flags; /* flags bit */ + sdev_flags_t sdev_flags; /* flags bit */ kmutex_t sdev_lookup_lock; /* node creation synch lock */ kcondvar_t sdev_lookup_cv; /* node creation sync cv */ @@ -159,7 +175,7 @@ typedef struct sdev_node { struct sdev_global_data sdev_globaldata; struct sdev_local_data sdev_localdata; } sdev_instance_data; - + list_node_t sdev_plist; /* link on plugin list */ void *sdev_private; } sdev_node_t; @@ -190,29 +206,11 @@ typedef enum { SDEV_READY } sdev_node_state_t; -/* sdev_flags */ -#define SDEV_BUILD 0x0001 /* directory cache out-of-date */ -#define SDEV_GLOBAL 0x0002 /* global /dev nodes */ -#define SDEV_PERSIST 0x0004 /* backing store persisted node */ -#define SDEV_NO_NCACHE 0x0008 /* do not include in neg. 
cache */ -#define SDEV_DYNAMIC 0x0010 /* special-purpose vnode ops */ - /* (ex: pts) */ -#define SDEV_VTOR 0x0020 /* validate sdev_nodes during search */ -#define SDEV_ATTR_INVALID 0x0040 /* invalid node attributes, */ - /* need update */ -#define SDEV_SUBDIR 0x0080 /* match all subdirs under here */ -#define SDEV_ZONED 0x0100 /* zoned subdir */ - /* sdev_lookup_flags */ #define SDEV_LOOKUP 0x0001 /* node creation in progress */ #define SDEV_READDIR 0x0002 /* VDIR readdir in progress */ #define SDEV_LGWAITING 0x0004 /* waiting for devfsadm completion */ -#define SDEV_VTOR_INVALID -1 -#define SDEV_VTOR_SKIP 0 -#define SDEV_VTOR_VALID 1 -#define SDEV_VTOR_STALE 2 - /* convenient macros */ #define SDEV_IS_GLOBAL(dv) \ (dv->sdev_flags & SDEV_GLOBAL) @@ -364,8 +362,13 @@ extern void sdev_devfsadmd_thread(struct sdev_node *, struct sdev_node *, extern int devname_profile_update(char *, size_t); extern struct sdev_data *sdev_find_mntinfo(char *); void sdev_mntinfo_rele(struct sdev_data *); +typedef void (*sdev_mnt_walk_f)(struct sdev_node *, void *); +void sdev_mnt_walk(sdev_mnt_walk_f, void *); extern struct vnodeops *devpts_getvnodeops(void); extern struct vnodeops *devvt_getvnodeops(void); +extern void sdev_plugin_nodeready(struct sdev_node *); +extern int sdev_plugin_init(void); +extern int sdev_plugin_fini(void); /* * boot states - warning, the ordering here is significant @@ -510,6 +513,23 @@ extern void sdev_nc_path_exists(sdev_nc_list_t *, char *); extern void sdev_modctl_dump_files(void); /* + * plugin and legacy vtab stuff + */ +/* directory dependent vop table */ +typedef struct sdev_vop_table { + char *vt_name; /* subdirectory name */ + const fs_operation_def_t *vt_service; /* vnodeops table */ + struct vnodeops **vt_global_vops; /* global container for vop */ + int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */ + int vt_flags; +} sdev_vop_table_t; + +extern struct sdev_vop_table vtab[]; +extern struct vnodeops *sdev_get_vop(struct sdev_node *); +extern void sdev_set_no_negcache(struct sdev_node *); +extern void *sdev_get_vtor(struct sdev_node *dv); + +/* * globals */ extern kmutex_t sdev_lock; @@ -522,6 +542,7 @@ extern struct vnodeops *devipnet_vnodeops; extern struct vnodeops *devvt_vnodeops; extern struct sdev_data *sdev_origins; /* mount info for global /dev instance */ extern struct vnodeops *devzvol_vnodeops; +extern int sdev_vnodeops_tbl_size; extern const fs_operation_def_t sdev_vnodeops_tbl[]; extern const fs_operation_def_t devpts_vnodeops_tbl[]; diff --git a/usr/src/uts/common/sys/fs/sdev_plugin.h b/usr/src/uts/common/sys/fs/sdev_plugin.h new file mode 100644 index 0000000000..8783df58e6 --- /dev/null +++ b/usr/src/uts/common/sys/fs/sdev_plugin.h @@ -0,0 +1,106 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_SDEV_PLUGIN_H +#define _SYS_SDEV_PLUGIN_H + +/* + * Kernel sdev plugin interface + */ + +#ifdef _KERNEL + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/vnode.h> + +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef uintptr_t sdev_plugin_hdl_t; +typedef uintptr_t sdev_ctx_t; + +/* + * Valid return values for sdev_plugin_validate_t. + */ +typedef enum sdev_plugin_validate { + SDEV_VTOR_INVALID = -1, + SDEV_VTOR_SKIP = 0, + SDEV_VTOR_VALID = 1, + SDEV_VTOR_STALE = 2 +} sdev_plugin_validate_t; + +/* + * Valid flags + */ +typedef enum sdev_plugin_flags { + SDEV_PLUGIN_NO_NCACHE = 0x1, + SDEV_PLUGIN_SUBDIR = 0x2 +} sdev_plugin_flags_t; + +#define SDEV_PLUGIN_FLAGS_MASK 0x3 + +/* + * Functions a module must implement + */ +typedef sdev_plugin_validate_t (*sp_valid_f)(sdev_ctx_t); +typedef int (*sp_filldir_f)(sdev_ctx_t); +typedef void (*sp_inactive_f)(sdev_ctx_t); + +#define SDEV_PLUGIN_VERSION 1 + +typedef struct sdev_plugin_ops { + int spo_version; + sdev_plugin_flags_t spo_flags; + sp_valid_f spo_validate; + sp_filldir_f spo_filldir; + sp_inactive_f spo_inactive; +} sdev_plugin_ops_t; + +extern sdev_plugin_hdl_t sdev_plugin_register(const char *, sdev_plugin_ops_t *, + int *); +extern int sdev_plugin_unregister(sdev_plugin_hdl_t); + +typedef enum sdev_ctx_flags { + SDEV_CTX_GLOBAL = 0x2 /* node belongs to the GZ */ +} sdev_ctx_flags_t; + +/* + * Context helper functions + */ +extern sdev_ctx_flags_t sdev_ctx_flags(sdev_ctx_t); +extern const char *sdev_ctx_name(sdev_ctx_t); +extern const char *sdev_ctx_path(sdev_ctx_t); +extern enum vtype sdev_ctx_vtype(sdev_ctx_t); +extern const void *sdev_ctx_vtype_data(sdev_ctx_t); + +/* + * Callbacks to manipulate nodes + */ +extern int sdev_plugin_mkdir(sdev_ctx_t, char *); +extern int sdev_plugin_mknod(sdev_ctx_t, char *, mode_t, dev_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SDEV_PLUGIN_H */
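The interface above is easiest to see end-to-end in code. The following is an illustrative sketch, not part of the change itself: a miscellaneous module registering an sdev plugin. The plugin name "mydev", the callback bodies, and the assumption that sdev_plugin_register() returns a NULL handle on failure (with an errno-style code in its int * out-parameter) are all hypothetical.

#include <sys/fs/sdev_plugin.h>
#include <sys/stat.h>
#include <sys/sunddi.h>

static sdev_plugin_hdl_t mydev_hdl;

/* Report any node we previously created as still valid. */
static sdev_plugin_validate_t
mydev_validate(sdev_ctx_t ctx)
{
	return (SDEV_VTOR_VALID);
}

/* Populate our directory with a single character device node. */
static int
mydev_filldir(sdev_ctx_t ctx)
{
	return (sdev_plugin_mknod(ctx, "ctl", S_IFCHR | 0600,
	    makedevice(ddi_name_to_major("mydev"), 0)));
}

/* Nothing to tear down when our directory goes inactive. */
static void
mydev_inactive(sdev_ctx_t ctx)
{
}

static sdev_plugin_ops_t mydev_ops = {
	SDEV_PLUGIN_VERSION,	/* spo_version */
	SDEV_PLUGIN_NO_NCACHE,	/* spo_flags */
	mydev_validate,
	mydev_filldir,
	mydev_inactive
};

int
_init(void)
{
	int err;

	mydev_hdl = sdev_plugin_register("mydev", &mydev_ops, &err);
	if (mydev_hdl == (sdev_plugin_hdl_t)NULL)
		return (err);
	/* mod_install(9F) would follow, unregistering again on failure. */
	return (0);
}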
diff --git a/usr/src/uts/common/sys/fs/tmp.h b/usr/src/uts/common/sys/fs/tmp.h index 68dd67c61e..f8740e8873 100644 --- a/usr/src/uts/common/sys/fs/tmp.h +++ b/usr/src/uts/common/sys/fs/tmp.h @@ -22,12 +22,13 @@ * Copyright 2007 Sun Microsystems, Inc. * All rights reserved. Use is subject to license terms. */ +/* + * Copyright 2015 Joyent, Inc. + */ #ifndef _SYS_FS_TMP_H #define _SYS_FS_TMP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -68,29 +69,28 @@ enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ /* * tmpfs_minfree is the amount (in pages) of anonymous memory that tmpfs - * leaves free for the rest of the system. E.g. in a system with 32MB of - * configured swap space, if 16MB were reserved (leaving 16MB free), - * tmpfs could allocate up to 16MB - tmpfs_minfree. The default value - * for tmpfs_minfree is btopr(TMPMINFREE) but it can cautiously patched - * to a different number of pages. - * NB: If tmpfs allocates too much swap space, other processes will be - * unable to execute. + * leaves free for the rest of the system. In antiquity, this number could be + * relevant on a system-wide basis, as physical DRAM was routinely exhausted; + * however, in more modern times, the relative growth of DRAM with respect to + * application footprint means that this number is only likely to become a + * factor in a virtualized OS environment (e.g., a zone) -- and even then only + * when DRAM and swap have both been capped low to allow for maximum tenancy. + * TMPMINFREE -- the value from which tmpfs_minfree is derived -- should + * therefore be configured to a value that is roughly the smallest practical + * value for memory + swap minus the largest reasonable size for tmpfs in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow tmpfs to consume + * no more than seven-eighths of this, yielding a TMPMINFREE of 16MB. Care + * should be exercised in changing this: tuning this value too high will + * result in spurious ENOSPC errors in tmpfs in small zones (a problem that + * can induce cascading failure surprisingly often); tuning this value too low + * will allow tmpfs consumption alone to induce application-level + * memory allocation failure. */ -#define TMPMINFREE 2 * 1024 * 1024 /* 2 Megabytes */ +#define TMPMINFREE 16 * 1024 * 1024 /* 16 Megabytes */ extern size_t tmpfs_minfree; /* Anonymous memory in pages */ -/* - * tmpfs can allocate only a certain percentage of kernel memory, - * which is used for tmpnodes, directories, file names, etc. - * This is statically set as TMPMAXFRACKMEM of physical memory. - * The actual number of allocatable bytes can be patched in tmpfs_maxkmem. - */ -#define TMPMAXFRACKMEM 25 /* 1/25 of physical memory */ - -extern size_t tmp_kmemspace; -extern size_t tmpfs_maxkmem; /* Allocatable kernel memory in bytes */ - extern void tmpnode_init(struct tmount *, struct tmpnode *, struct vattr *, struct cred *); extern int tmpnode_trunc(struct tmount *, struct tmpnode *, ulong_t); @@ -101,13 +101,12 @@ extern int tdirdelete(struct tmpnode *, struct tmpnode *, char *, enum dr_op, struct cred *); extern void tdirinit(struct tmpnode *, struct tmpnode *); extern void tdirtrunc(struct tmpnode *); -extern void *tmp_memalloc(size_t, int); -extern void tmp_memfree(void *, size_t); extern int tmp_resv(struct tmount *, struct tmpnode *, size_t, int); extern int tmp_taccess(void *, int, struct cred *); extern int tmp_sticky_remove_access(struct tmpnode *, struct tmpnode *, struct cred *); extern int tmp_convnum(char *, pgcnt_t *); +extern int tmp_convmode(char *, mode_t *); extern int tdirenter(struct tmount *, struct tmpnode *, char *, enum de_op, struct tmpnode *, struct tmpnode *, struct vattr *, struct tmpnode **, struct cred *, caller_context_t *); diff --git a/usr/src/uts/common/sys/gsqueue.h b/usr/src/uts/common/sys/gsqueue.h new file mode 100644 index 0000000000..40ef4ce982 --- /dev/null +++ b/usr/src/uts/common/sys/gsqueue.h @@ -0,0 +1,65 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_GSQUEUE_H +#define _SYS_GSQUEUE_H + +/* + * Standard interfaces to serialization queues for everyone (except IP).
+ */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct gsqueue gsqueue_t; +typedef struct gsqueue_set gsqueue_set_t; + +typedef void (*gsqueue_cb_f)(gsqueue_set_t *, gsqueue_t *, void *, boolean_t); +typedef void (*gsqueue_proc_f)(void *, mblk_t *, gsqueue_t *, void *); + +extern gsqueue_set_t *gsqueue_set_create(uint_t, pri_t); +extern void gsqueue_set_destroy(gsqueue_set_t *); +extern gsqueue_t *gsqueue_set_get(gsqueue_set_t *, uint_t); + +extern uintptr_t gsqueue_set_cb_add(gsqueue_set_t *, gsqueue_cb_f, void *); +extern int gsqueue_set_cb_remove(gsqueue_set_t *, uintptr_t); + +#define GSQUEUE_FILL 0x0001 +#define GSQUEUE_NODRAIN 0x0002 +#define GSQUEUE_PROCESS 0x0004 + +extern void gsqueue_enter_one(gsqueue_t *, mblk_t *, gsqueue_proc_f, void *, + int, uint8_t); + +/* + * The default wait is inherited from IP. This determines the amount of time + * that must pass after queuing work before we wake up the worker thread. This + * value is in milliseconds. + */ +#define GSQUEUE_DEFAULT_WAIT 10 +#define GSQUEUE_DEFAULT_PRIORITY MAXCLSYSPRI + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_GSQUEUE_H */
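Since the declarations above are terse, a hedged sketch of intended use may help. It assumes, since the header does not say, that the uint_t argument to gsqueue_set_create() is the number of queues in the set, that the uint_t given to gsqueue_set_get() selects one of them (e.g. from a flow hash), and that the trailing uint8_t of gsqueue_enter_one() is an identifying tag; <sys/stream.h> is included for mblk_t, which this header uses but does not pull in.

#include <sys/stream.h>
#include <sys/gsqueue.h>

/* Runs serially: at most one callback executes per gsqueue at a time. */
static void
my_proc(void *set_arg, mblk_t *mp, gsqueue_t *gsp, void *arg)
{
	freemsg(mp);	/* a real consumer would process the message here */
}

void
my_enqueue(gsqueue_set_t *set, mblk_t *mp, uint_t hash)
{
	gsqueue_t *gsp = gsqueue_set_get(set, hash);

	/* Queue the message and ask for immediate processing. */
	gsqueue_enter_one(gsp, mp, my_proc, NULL, GSQUEUE_PROCESS, 0);
}

gsqueue_set_t *
my_create(void)
{
	/* Hypothetically, four queues at the default priority. */
	return (gsqueue_set_create(4, GSQUEUE_DEFAULT_PRIORITY));
}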
diff --git a/usr/src/uts/common/sys/id_space.h b/usr/src/uts/common/sys/id_space.h index d56fcceb5a..46d25f207f 100644 --- a/usr/src/uts/common/sys/id_space.h +++ b/usr/src/uts/common/sys/id_space.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All Rights reserved. */ #ifndef _ID_SPACE_H @@ -34,8 +35,6 @@ extern "C" { #include <sys/mutex.h> #include <sys/vmem.h> -#ifdef _KERNEL - typedef vmem_t id_space_t; id_space_t *id_space_create(const char *, id_t, id_t); @@ -48,8 +47,6 @@ id_t id_allocff_nosleep(id_space_t *); id_t id_alloc_specific_nosleep(id_space_t *, id_t); void id_free(id_space_t *, id_t); -#endif /* _KERNEL */ - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/inotify.h b/usr/src/uts/common/sys/inotify.h new file mode 100644 index 0000000000..8acc1a7280 --- /dev/null +++ b/usr/src/uts/common/sys/inotify.h @@ -0,0 +1,153 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +/* + * Header file to support the inotify facility. Note that this facility + * is designed to be binary compatible with the Linux inotify facility; values + * for constants here should therefore exactly match those found in Linux, and + * this facility shouldn't be extended independently of Linux. + */ + +#ifndef _SYS_INOTIFY_H +#define _SYS_INOTIFY_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Events that can be explicitly requested on any inotify watch. + */ +#define IN_ACCESS 0x00000001 +#define IN_MODIFY 0x00000002 +#define IN_ATTRIB 0x00000004 +#define IN_CLOSE_WRITE 0x00000008 +#define IN_CLOSE_NOWRITE 0x00000010 +#define IN_OPEN 0x00000020 +#define IN_MOVED_FROM 0x00000040 +#define IN_MOVED_TO 0x00000080 +#define IN_CREATE 0x00000100 +#define IN_DELETE 0x00000200 +#define IN_DELETE_SELF 0x00000400 +#define IN_MOVE_SELF 0x00000800 + +/* + * Events that can be sent to an inotify watch -- requested or not. + */ +#define IN_UNMOUNT 0x00002000 +#define IN_Q_OVERFLOW 0x00004000 +#define IN_IGNORED 0x00008000 + +/* + * Flags that can modify an inotify event. + */ +#define IN_ONLYDIR 0x01000000 +#define IN_DONT_FOLLOW 0x02000000 +#define IN_EXCL_UNLINK 0x04000000 +#define IN_MASK_ADD 0x20000000 +#define IN_ISDIR 0x40000000 +#define IN_ONESHOT 0x80000000 + +/* + * Helpful constants. + */ +#define IN_CLOSE (IN_CLOSE_WRITE | IN_CLOSE_NOWRITE) +#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO) +#define IN_ALL_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | IN_MOVED_TO | \ + IN_DELETE | IN_CREATE | IN_DELETE_SELF | IN_MOVE_SELF) + +#define IN_CHILD_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_OPEN) + +/* + * To assure binary compatibility with Linux, these values are fixed at their + * Linux equivalents, not their native ones. + */ +#define IN_CLOEXEC 02000000 /* LX_O_CLOEXEC */ +#define IN_NONBLOCK 04000 /* LX_O_NONBLOCK */ + +struct inotify_event { + int32_t wd; /* watch descriptor */ + uint32_t mask; /* mask of events */ + uint32_t cookie; /* event association cookie, if any */ + uint32_t len; /* size of name field */ + char name[]; /* optional NUL-terminated name */ +}; + +/* + * These ioctl values are specific to the native implementation; applications + * shouldn't be using them directly, and they should therefore be safe to + * change without breaking apps. + */ +#define INOTIFYIOC (('i' << 24) | ('n' << 16) | ('y' << 8)) +#define INOTIFYIOC_ADD_WATCH (INOTIFYIOC | 1) /* add watch */ +#define INOTIFYIOC_RM_WATCH (INOTIFYIOC | 2) /* remove watch */ +#define INOTIFYIOC_ADD_CHILD (INOTIFYIOC | 3) /* add child watch */ +#define INOTIFYIOC_ACTIVATE (INOTIFYIOC | 4) /* activate watch */ + +#ifndef _LP64 +#ifndef _LITTLE_ENDIAN +#define INOTIFY_PTR(type, name) uint32_t name##pad; type *name +#else +#define INOTIFY_PTR(type, name) type *name; uint32_t name##pad +#endif +#else +#define INOTIFY_PTR(type, name) type *name +#endif + +typedef struct inotify_addwatch { + int inaw_fd; /* open fd for object */ + uint32_t inaw_mask; /* desired mask */ +} inotify_addwatch_t; + +typedef struct inotify_addchild { + INOTIFY_PTR(char, inac_name); /* pointer to name */ + int inac_fd; /* open fd for parent */ +} inotify_addchild_t; + +#ifndef _KERNEL + +extern int inotify_init(void); +extern int inotify_init1(int); +extern int inotify_add_watch(int, const char *, uint32_t); +extern int inotify_rm_watch(int, int); + +#else + +#define IN_UNMASKABLE \ + (IN_UNMOUNT | IN_Q_OVERFLOW | IN_IGNORED | IN_ISDIR) + +#define IN_MODIFIERS \ + (IN_EXCL_UNLINK | IN_ONESHOT) + +#define IN_FLAGS \ + (IN_ONLYDIR | IN_DONT_FOLLOW | IN_MASK_ADD) + +#define IN_REMOVAL (1ULL << 32) +#define INOTIFYMNRN_INOTIFY 0 +#define INOTIFYMNRN_CLONE 1 + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_INOTIFY_H */
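Given the Linux-compatible semantics noted above, a userland consumer would look much like its Linux counterpart. A sketch follows; the watched path and event handling are invented, and the read(2)-returns-a-stream-of-inotify_event-records behavior is assumed from the stated Linux compatibility rather than from this header:

#include <sys/inotify.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	char buf[4096], *p;
	ssize_t len;
	int fd;

	if ((fd = inotify_init()) == -1)
		exit(1);
	if (inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE) == -1)
		exit(1);

	while ((len = read(fd, buf, sizeof (buf))) > 0) {
		p = buf;
		while (p < buf + len) {
			struct inotify_event *ev = (struct inotify_event *)p;

			if (ev->len > 0)
				(void) printf("%s: 0x%x\n", ev->name, ev->mask);
			/* Advance past this record and its variable name. */
			p += sizeof (struct inotify_event) + ev->len;
		}
	}
	return (0);
}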
diff --git a/usr/src/uts/common/sys/iso/signal_iso.h b/usr/src/uts/common/sys/iso/signal_iso.h index b1990121b8..0ae64b45d7 100644 --- a/usr/src/uts/common/sys/iso/signal_iso.h +++ b/usr/src/uts/common/sys/iso/signal_iso.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -95,7 +96,7 @@ extern "C" { /* insert new signals here, and move _SIGRTM* appropriately */ #define _SIGRTMIN 42 /* first (highest-priority) realtime signal */ -#define _SIGRTMAX 73 /* last (lowest-priority) realtime signal */ +#define _SIGRTMAX 74 /* last (lowest-priority) realtime signal */ extern long _sysconf(int); /* System Private interface to sysconf() */ #define SIGRTMIN ((int)_sysconf(_SC_SIGRT_MIN)) /* first realtime signal */ #define SIGRTMAX ((int)_sysconf(_SC_SIGRT_MAX)) /* last realtime signal */ diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h index 41b70f6a6e..2aceb3a0f6 100644 --- a/usr/src/uts/common/sys/klwp.h +++ b/usr/src/uts/common/sys/klwp.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_KLWP_H @@ -192,6 +192,8 @@ typedef struct _klwp { struct contract *lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */ void *lwp_brand; /* per-lwp brand data */ + int (*lwp_brand_syscall)(void); /* brand syscall interposer */ + struct psinfo *lwp_spymaster; /* if an agent LWP, our spymaster */ } klwp_t; diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h index dfe25eec76..be669cb78d 100644 --- a/usr/src/uts/common/sys/ksocket.h +++ b/usr/src/uts/common/sys/ksocket.h @@ -21,6 +21,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc.
*/ #ifndef _SYS_KSOCKET_H_ @@ -121,6 +122,10 @@ extern int ksocket_close(ksocket_t, struct cred *); extern void ksocket_hold(ksocket_t); extern void ksocket_rele(ksocket_t); +typedef boolean_t (*ksocket_krecv_f)(ksocket_t, mblk_t *, size_t, int, void *); +extern int ksocket_krecv_set(ksocket_t, ksocket_krecv_f, void *); +extern void ksocket_krecv_unblock(ksocket_t); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/limits.h b/usr/src/uts/common/sys/limits.h new file mode 100644 index 0000000000..88625d1829 --- /dev/null +++ b/usr/src/uts/common/sys/limits.h @@ -0,0 +1,32 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LIMITS_H +#define _SYS_LIMITS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define IOV_MAX 1024 + +#ifdef _KERNEL +#define IOV_MAX_STACK 16 /* max. IOV on-stack allocation */ +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIMITS_H */ diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index bd09077f1b..3c4e064f06 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _SYS_MAC_H @@ -100,6 +100,14 @@ typedef struct mac_propval_uint32_range_s { } mac_propval_uint32_range_t; /* + * Defines ranges which are a series of C style strings. + */ +typedef struct mac_propval_str_range_s { + uint32_t mpur_nextbyte; + char mpur_data[1]; +} mac_propval_str_range_t; + +/* * Data type of property values. */ typedef enum { @@ -119,6 +127,7 @@ typedef struct mac_propval_range_s { mac_propval_type_t mpr_type; /* type of value */ union { mac_propval_uint32_range_t mpr_uint32[1]; + mac_propval_str_range_t mpr_str; } u; } mac_propval_range_t; @@ -213,6 +222,7 @@ typedef enum { MAC_PROP_MAX_RXHWCLNT_AVAIL, MAC_PROP_MAX_TXHWCLNT_AVAIL, MAC_PROP_IB_LINKMODE, + MAC_PROP_VN_PROMISC_FILTERED, MAC_PROP_SECONDARY_ADDRS, MAC_PROP_PRIVATE = -1 } mac_prop_id_t; diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index 3c89b7c434..f599e1fcfe 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -328,6 +328,7 @@ extern int mac_tx_percpu_cnt; /* Mac protection flags */ #define MPT_FLAG_V6_LOCAL_ADDR_SET 0x0001 +#define MPT_FLAG_PROMISC_FILTERED 0x0002 /* in mac_client.c */ extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h index bed2e293b3..fa3a1ebfad 100644 --- a/usr/src/uts/common/sys/mac_client_priv.h +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ /* @@ -170,6 +170,7 @@ extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t); extern void *mac_get_devinfo(mac_handle_t); extern boolean_t mac_is_vnic(mac_handle_t); +extern boolean_t mac_is_overlay(mac_handle_t); extern uint32_t mac_no_notification(mac_handle_t); extern int mac_set_prop(mac_handle_t, mac_prop_id_t, char *, void *, uint_t); diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 3c9c6b77a4..d72558c612 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _SYS_MAC_IMPL_H @@ -643,6 +643,7 @@ struct mac_impl_s { #define MIS_LEGACY 0x0040 #define MIS_NO_ACTIVE 0x0080 #define MIS_POLL_DISABLE 0x0100 +#define MIS_IS_OVERLAY 0x0200 #define mi_getstat mi_callbacks->mc_getstat #define mi_start mi_callbacks->mc_start @@ -894,6 +895,8 @@ extern void mac_protect_fini(mac_client_impl_t *); extern int mac_set_resources(mac_handle_t, mac_resource_props_t *); extern void mac_get_resources(mac_handle_t, mac_resource_props_t *); extern void mac_get_effective_resources(mac_handle_t, mac_resource_props_t *); +extern void mac_set_promisc_filtered(mac_client_handle_t, boolean_t); +extern boolean_t mac_get_promisc_filtered(mac_client_handle_t); extern cpupart_t *mac_pset_find(mac_resource_props_t *, boolean_t *); extern void mac_set_pool_effective(boolean_t, cpupart_t *, diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 9f7f2a1a73..5f02451542 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _SYS_MAC_PROVIDER_H @@ -106,7 +107,8 @@ typedef enum { MAC_CAPAB_NO_NATIVEVLAN = 0x00080000, /* boolean only, no data */ MAC_CAPAB_NO_ZCOPY = 0x00100000, /* boolean only, no data */ MAC_CAPAB_LEGACY = 0x00200000, /* data is mac_capab_legacy_t */ - MAC_CAPAB_VRRP = 0x00400000 /* data is mac_capab_vrrp_t */ + MAC_CAPAB_VRRP = 0x00400000, /* data is mac_capab_vrrp_t */ + MAC_CAPAB_OVERLAY = 0x00800000 /* boolean only, no data */ } mac_capab_t; /* diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h index 4976bf09ab..34e491fd3b 100644 --- a/usr/src/uts/common/sys/mman.h +++ b/usr/src/uts/common/sys/mman.h @@ -318,6 +318,7 @@ struct memcntl_mha32 { #define MADV_ACCESS_DEFAULT 6 /* default access */ #define MADV_ACCESS_LWP 7 /* next LWP to access heavily */ #define MADV_ACCESS_MANY 8 /* many processes to access heavily */ +#define MADV_PURGE 9 /* contents will be purged */ #endif /* (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) ... */ @@ -337,6 +338,7 @@ struct memcntl_mha32 { #define MS_SYNC 0x4 /* wait for msync */ #define MS_ASYNC 0x1 /* return immediately */ #define MS_INVALIDATE 0x2 /* invalidate caches */ +#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */ #if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__) /* functions to mctl */ diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h index e95ef3fccc..d215d88790 100644 --- a/usr/src/uts/common/sys/mntent.h +++ b/usr/src/uts/common/sys/mntent.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. + * Copyright 2012, Joyent, Inc. All rights reserved. * * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T * All Rights Reserved @@ -47,6 +48,7 @@ extern "C" { #define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */ #define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */ #define MNTTYPE_LOFS "lofs" /* Loop back file system */ +#define MNTTYPE_HYPRLOFS "hyprlofs" /* Hyperlofs file system */ #define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */ #define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */ #define MNTTYPE_SWAP "swap" /* Swap file system */ diff --git a/usr/src/uts/common/sys/netconfig.h b/usr/src/uts/common/sys/netconfig.h index 14b1aa55db..883c329aed 100644 --- a/usr/src/uts/common/sys/netconfig.h +++ b/usr/src/uts/common/sys/netconfig.h @@ -28,6 +28,7 @@ * * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_NETCONFIG_H @@ -147,6 +148,8 @@ extern int endnetpath(void *); extern struct netconfig *getnetpath(void *); extern void nc_perror(const char *); extern char *nc_sperror(void); +extern void _nsl_brand_set_hooks(int (*)(void), + struct netconfig *(*)(int)); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h index 93b5fc3e01..ea85c78f6b 100644 --- a/usr/src/uts/common/sys/neti.h +++ b/usr/src/uts/common/sys/neti.h @@ -44,6 +44,8 @@ extern "C" { #define NHF_INET "NHF_INET" #define NHF_INET6 "NHF_INET6" #define NHF_ARP "NHF_ARP" +#define NHF_VND_INET "NHF_VND_INET" +#define NHF_VND_INET6 "NHF_VND_INET6" /* * Event identification diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h index 2c77e1be96..73f29d1e63 100644 --- a/usr/src/uts/common/sys/netstack.h +++ b/usr/src/uts/common/sys/netstack.h @@ -81,7 +81,8 @@ typedef id_t netstackid_t; #define NS_IPSECESP 16 #define NS_IPNET 17 #define NS_ILB 18 -#define NS_MAX (NS_ILB+1) +#define NS_VND 19 +#define NS_MAX (NS_VND+1) /* * State maintained for each module which tracks the state of diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h new file mode 100644 index 0000000000..12d0dbca51 --- /dev/null +++ b/usr/src/uts/common/sys/overlay.h @@ -0,0 +1,96 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_H +#define _SYS_OVERLAY_H + +/* + * Overlay device support + */ + +#include <sys/param.h> +#include <sys/dld_ioc.h> +#include <sys/mac.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVERLAY_IOC_CREATE OVERLAYIOC(1) +#define OVERLAY_IOC_DELETE OVERLAYIOC(2) +#define OVERLAY_IOC_PROPINFO OVERLAYIOC(3) +#define OVERLAY_IOC_GETPROP OVERLAYIOC(4) +#define OVERLAY_IOC_SETPROP OVERLAYIOC(5) +#define OVERLAY_IOC_NPROPS OVERLAYIOC(6) +#define OVERLAY_IOC_ACTIVATE OVERLAYIOC(7) +#define OVERLAY_IOC_STATUS OVERLAYIOC(8) + +typedef struct overlay_ioc_create { + datalink_id_t oic_linkid; + uint32_t oic_filler; + uint64_t oic_vnetid; + char oic_encap[MAXLINKNAMELEN]; +} overlay_ioc_create_t; + +typedef struct overlay_ioc_activate { + datalink_id_t oia_linkid; +} overlay_ioc_activate_t; + +typedef struct overlay_ioc_delete { + datalink_id_t oid_linkid; +} overlay_ioc_delete_t; + +typedef struct overlay_ioc_nprops { + datalink_id_t oipn_linkid; + int32_t oipn_nprops; +} overlay_ioc_nprops_t; + +typedef struct overlay_ioc_propinfo { + datalink_id_t oipi_linkid; + int32_t oipi_id; + char oipi_name[OVERLAY_PROP_NAMELEN]; + uint_t oipi_type; + uint_t oipi_prot; + uint8_t oipi_default[OVERLAY_PROP_SIZEMAX]; + uint32_t oipi_defsize; + uint32_t oipi_posssize; + uint8_t oipi_poss[OVERLAY_PROP_SIZEMAX]; +} overlay_ioc_propinfo_t; + +typedef struct overlay_ioc_prop { + datalink_id_t oip_linkid; + int32_t oip_id; + char oip_name[OVERLAY_PROP_NAMELEN]; + uint8_t oip_value[OVERLAY_PROP_SIZEMAX]; + uint32_t oip_size; +} overlay_ioc_prop_t; + +typedef enum overlay_status { + OVERLAY_I_OK = 0x00, + OVERLAY_I_DEGRADED = 0x01 +} overlay_status_t; + +typedef struct overlay_ioc_status { + datalink_id_t ois_linkid; + uint_t ois_status; + char ois_message[OVERLAY_STATUS_BUFLEN]; +} overlay_ioc_status_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_H */ diff --git a/usr/src/uts/common/sys/overlay_common.h b/usr/src/uts/common/sys/overlay_common.h new file mode 100644 index 0000000000..d638096006 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_common.h @@ -0,0 +1,65 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_COMMON_H +#define _SYS_OVERLAY_COMMON_H + +/* + * Common overlay definitions + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum overlay_target_mode { + OVERLAY_TARGET_NONE = 0x0, + OVERLAY_TARGET_POINT, + OVERLAY_TARGET_DYNAMIC +} overlay_target_mode_t; + +typedef enum overlay_plugin_dest { + OVERLAY_PLUGIN_D_INVALID = 0x0, + OVERLAY_PLUGIN_D_ETHERNET = 0x1, + OVERLAY_PLUGIN_D_IP = 0x2, + OVERLAY_PLUGIN_D_PORT = 0x4, + OVERLAY_PLUGIN_D_MASK = 0x7 +} overlay_plugin_dest_t; + +typedef enum overlay_prop_type { + OVERLAY_PROP_T_INT = 0x1, /* signed int */ + OVERLAY_PROP_T_UINT, /* unsigned int */ + OVERLAY_PROP_T_IP, /* struct in6_addr */ + OVERLAY_PROP_T_STRING /* OVERLAY_PROP_SIZEMAX */ +} overlay_prop_type_t; + +typedef enum overlay_prop_prot { + OVERLAY_PROP_PERM_REQ = 0x1, + OVERLAY_PROP_PERM_READ = 0x2, + OVERLAY_PROP_PERM_WRITE = 0x4, + OVERLAY_PROP_PERM_RW = 0x6, + OVERLAY_PROP_PERM_RRW = 0x7, + OVERLAY_PROP_PERM_MASK = 0x7 +} overlay_prop_prot_t; + +#define OVERLAY_PROP_NAMELEN 64 +#define OVERLAY_PROP_SIZEMAX 256 +#define OVERLAY_STATUS_BUFLEN 256 + +#ifdef __cplusplus } +#endif + +#endif /* _SYS_OVERLAY_COMMON_H */ diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h new file mode 100644 index 0000000000..f62ecff179 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_impl.h @@ -0,0 +1,205 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc.
+ */ + +#ifndef _SYS_OVERLAY_IMPL_H +#define _SYS_OVERLAY_IMPL_H + +/* + * Overlay device support + */ + +#include <sys/overlay.h> +#include <sys/overlay_common.h> +#include <sys/overlay_plugin.h> +#include <sys/overlay_target.h> +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/avl.h> +#include <sys/ksocket.h> +#include <sys/socket.h> +#include <sys/refhash.h> +#include <sys/ethernet.h> +#include <sys/list.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION_ONE 0x1 + +typedef struct overlay_plugin { + kmutex_t ovp_mutex; + list_node_t ovp_link; /* overlay_plugin_lock */ + uint_t ovp_active; /* ovp_mutex */ + const char *ovp_name; /* RO */ + const overlay_plugin_ops_t *ovp_ops; /* RO */ + const char *const *ovp_props; /* RO */ + uint_t ovp_nprops; /* RO */ + uint_t ovp_id_size; /* RO */ + overlay_plugin_flags_t ovp_flags; /* RO */ + overlay_plugin_dest_t ovp_dest; /* RO */ +} overlay_plugin_t; + +typedef struct overlay_mux { + list_node_t omux_lnode; + ksocket_t omux_ksock; /* RO */ + overlay_plugin_t *omux_plugin; /* RO: associated encap */ + int omux_domain; /* RO: socket domain */ + int omux_family; /* RO: socket family */ + int omux_protocol; /* RO: socket protocol */ + struct sockaddr *omux_addr; /* RO: socket address */ + socklen_t omux_alen; /* RO: sockaddr len */ + kmutex_t omux_lock; /* Protects everything below */ + uint_t omux_count; /* Active instances */ + avl_tree_t omux_devices; /* Tree of devices */ +} overlay_mux_t; + +typedef enum overlay_target_flag { + OVERLAY_T_TEARDOWN = 0x1 +} overlay_target_flag_t; + +typedef struct overlay_target { + kmutex_t ott_lock; + kcondvar_t ott_cond; + overlay_target_mode_t ott_mode; /* RO */ + overlay_plugin_dest_t ott_dest; /* RO */ + uint64_t ott_id; /* RO */ + overlay_target_flag_t ott_flags; /* ott_lock */ + uint_t ott_ocount; /* ott_lock */ + union { /* ott_lock */ + overlay_target_point_t ott_point; + struct overlay_target_dyn { + refhash_t *ott_dhash; + avl_tree_t ott_tree; + } ott_dyn; + } ott_u; +} overlay_target_t; + +typedef enum overlay_dev_flag { + OVERLAY_F_ACTIVATED = 0x01, /* Activate ioctl completed */ + OVERLAY_F_IN_MUX = 0x02, /* Currently in a mux */ + OVERLAY_F_IN_TX = 0x04, /* Currently doing tx */ + OVERLAY_F_IN_RX = 0x08, /* Currently doing rx */ + OVERLAY_F_IOMASK = 0x0c, /* A mask for rx and tx */ + OVERLAY_F_MDDROP = 0x10, /* Drop traffic for metadata update */ + OVERLAY_F_STOPMASK = 0x1e, /* None set when stopping */ + OVERLAY_F_VARPD = 0x20, /* varpd plugin exists */ + OVERLAY_F_DEGRADED = 0x40, /* device is degraded */ + OVERLAY_F_MASK = 0x7f /* mask of everything */ +} overlay_dev_flag_t; + +typedef struct overlay_dev { + kmutex_t odd_lock; + kcondvar_t odd_iowait; + list_node_t odd_link; /* overlay_dev_lock */ + mac_handle_t odd_mh; /* RO */ + overlay_plugin_t *odd_plugin; /* RO */ + datalink_id_t odd_linkid; /* RO */ + void *odd_pvoid; /* RO -- only used by plugin */ + uint_t odd_ref; /* protected by odd_lock */ + uint_t odd_mtu; /* protected by odd_lock */ + overlay_dev_flag_t odd_flags; /* protected by odd_lock */ + uint_t odd_rxcount; /* protected by odd_lock */ + uint_t odd_txcount; /* protected by odd_lock */ + overlay_mux_t *odd_mux; /* protected by odd_lock */ + uint64_t odd_vid; /* RO if active else odd_lock */ + avl_node_t odd_muxnode; /* managed by mux */ + overlay_target_t *odd_target; /* See big theory statement */ + char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */ +} overlay_dev_t; + +typedef enum overlay_target_entry_flags { + OVERLAY_ENTRY_F_PENDING = 
0x01, /* lookup in progress */ + OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */ + OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */ + OVERLAY_ENTRY_F_VALID_MASK = 0x06 +} overlay_target_entry_flags_t; + +typedef struct overlay_target_entry { + kmutex_t ote_lock; + refhash_link_t ote_reflink; /* hashtable link */ + avl_node_t ote_avllink; /* iteration link */ + list_node_t ote_qlink; + overlay_target_entry_flags_t ote_flags; /* RW: state flags */ + uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */ + overlay_target_t *ote_ott; /* RO */ + overlay_dev_t *ote_odd; /* RO */ + overlay_target_point_t ote_dest; /* RW: destination */ + mblk_t *ote_chead; /* RW: blocked mb chain head */ + mblk_t *ote_ctail; /* RW: blocked mb chain tail */ + size_t ote_mbsize; /* RW: outstanding mblk size */ + hrtime_t ote_vtime; /* RW: valid timestamp */ +} overlay_target_entry_t; + + +#define OVERLAY_CTL "overlay" + +extern dev_info_t *overlay_dip; + +extern mblk_t *overlay_m_tx(void *, mblk_t *); + +typedef int (*overlay_dev_iter_f)(overlay_dev_t *, void *); +extern void overlay_dev_iter(overlay_dev_iter_f, void *); + +extern void overlay_plugin_init(void); +extern overlay_plugin_t *overlay_plugin_lookup(const char *); +extern void overlay_plugin_rele(overlay_plugin_t *); +extern void overlay_plugin_fini(void); +typedef int (*overlay_plugin_walk_f)(overlay_plugin_t *, void *); +extern void overlay_plugin_walk(overlay_plugin_walk_f, void *); + +extern void overlay_io_start(overlay_dev_t *, overlay_dev_flag_t); +extern void overlay_io_done(overlay_dev_t *, overlay_dev_flag_t); + +extern void overlay_mux_init(void); +extern void overlay_mux_fini(void); + +extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int, + struct sockaddr *, socklen_t, int *); +extern void overlay_mux_close(overlay_mux_t *); +extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *); +extern void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *); +extern int overlay_mux_tx(overlay_mux_t *, struct msghdr *, mblk_t *); + +extern void overlay_prop_init(overlay_prop_handle_t); + +extern void overlay_target_init(void); +extern int overlay_target_busy(void); +extern int overlay_target_open(dev_t *, int, int, cred_t *); +extern int overlay_target_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +extern int overlay_target_close(dev_t, int, int, cred_t *); +extern void overlay_target_free(overlay_dev_t *); + +#define OVERLAY_TARGET_OK 0 +#define OVERLAY_TARGET_DROP 1 +#define OVERLAY_TARGET_ASYNC 2 +extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *, + socklen_t *); +extern void overlay_target_quiesce(overlay_target_t *); +extern void overlay_target_fini(void); + +extern void overlay_fm_init(void); +extern void overlay_fm_fini(void); +extern void overlay_fm_degrade(overlay_dev_t *, const char *); +extern void overlay_fm_restore(overlay_dev_t *); + +extern overlay_dev_t *overlay_hold_by_dlid(datalink_id_t); +extern void overlay_hold_rele(overlay_dev_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_IMPL_H */ diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h new file mode 100644 index 0000000000..07efaa05df --- /dev/null +++ b/usr/src/uts/common/sys/overlay_plugin.h @@ -0,0 +1,324 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_PLUGIN_H +#define _SYS_OVERLAY_PLUGIN_H + +/* + * overlay plugin interface for encapsulation/decapsulation modules + * + * This header file defines how encapsulation and decapsulation plugins + * interact within the broader system. At this time, these interfaces are + * considered private to illumos and therefore are subject to change. As we gain + * more experience with a few of the different encapsulation formats, say nvgre + * or geneve, we can move to make this a more stable interface. + * + * A plugin is a general kernel module that uses the miscellaneous mod-linkage. + * + * In its _init(9E) routine, it must register itself with the overlay + * subsystem. To do this, it allocates an overlay_plugin_register_t via + * overlay_plugin_alloc(), that it then fills out with various required + * information and then attempts to register with the system via a call to + * overlay_plugin_register(). If that succeeds, it should then call + * mod_install(9F). If mod_install(9F) fails, then it should call + * overlay_plugin_unregister(). Regardless of success or failure, it should call + * overlay_plugin_free() to ensure that any memory that may be associated with + * the registration is freed. + * + * When the module's _fini(9E) is called, overlay_plugin_unregister() should be + * called first. It may return an error, such as EBUSY; in that case, the error + * should be returned as the return status of _fini(9E). This is quite + * necessary; it ensures that if the module is in use it doesn't get unloaded + * out from under the broader subsystem while it's still in use. A driver can + * use that to know that there are no current instances of its private data. + * + * ------------------ + * Plugin Definitions + * ------------------ + * + * A plugin is required to fill in both an operations vector and a series of + * supporting information for its callback routines. Here are the routines and + * their purposes. The full signatures are available below. + * + * overlay_plugin_init_t + * + * This interface is used to create a new instance of a plugin. An instance + * of a plugin will be created for each overlay device that is created. For + * example, if a device is created with VXLAN ID 23 and ID 42, then there + * will be two different calls to this function. + * + * This function gives the plugin a chance to create a private data + * structure that will be returned on subsequent calls to the system. + * + * overlay_plugin_fini_t + * + * This is the opposite of overlay_plugin_init_t. It will be called when it + * is safe to remove any private data that is associated with this instance + * of the plugin. + * + * overlay_plugin_propinfo_t + * + * This is called with the name of a property that is registered when the + * plugin is created. This function will be called with the name of the + * property that information is being requested about. The plugin is + * responsible for filling out information such as setting the name, the + * type of property it is, the protection of the property (can a user + * update it?), whether the property is required, an optional default value + * for the property, and an optional set of values or ranges that are + * allowed.
+ * + * overlay_plugin_getprop_t + * + * Return the value of the named property from the current instance of the + * plugin. + * + * overlay_plugin_setprop_t + * + * Set the value of the named property to the specified value for the + * current instance of the plugin. Note that it is the plugin's + * responsibility to ensure that the value of the property is valid and to + * update state as appropriate. + * + * overlay_plugin_socket_t + * + * Every overlay device has a corresponding socket that it uses to send and + * receive traffic. This routine is used to get the parameters that should + * be used to define such a socket. The actual socket may be multiplexed + * with other uses of it. + * + * overlay_plugin_sockopt_t + * + * Allow a plugin to set any necessary socket options that it needs on the + * kernel socket that is being used by a mux. This will only be called once + * for a given mux; if additional devices are added to a mux, it will not + * be called additional times. + * + * overlay_plugin_encap_t + * + * In this routine you're given a message block and information about the + * packet, such as the identifier, and are asked to fill out a message block + * that represents the encapsulation header and optionally manipulate the + * input message if required. + * + * overlay_plugin_decap_t + * + * In this routine, you're given the encapsulated message block. The + * requirement is to decapsulate it and determine what is the correct + * overlay identifier for this network and to fill in the header size so + * the broader system knows how much of this data should be considered + * consumed. + * + * ovpo_callbacks + * + * This should be set to zero; it's reserved for future use. + * + * Once these properties are defined, the module should define the following + * members in the overlay_plugin_register_t. + * + * ovep_version + * + * Should be set to the value of the macro OVEP_VERSION. + * + * ovep_name + * + * Should be set to a character string that has the name of the module. + * Generally this should match the name of the kernel module; however, this + * is the name that users will use to refer to this module when creating + * devices. + * + * overlay_plugin_ops_t + * + * Should be set to the functions as described above. + * + * ovep_props + * + * This is an array of character strings that holds the names of the + * properties of the encapsulation plugin. + * + * + * ovep_id_size + * + * This is the size in bytes of the valid range for the identifier. The + * valid identifier range is considered an ovep_id_size-byte unsigned + * integer, [ 0, 1 << (ovep_id_size * 8) ). + * + * ovep_flags + * + * A series of flags that indicate optional features that are supported. + * Valid flags include: + * + * OVEP_F_VLAN_TAG + * + * The encapsulation format allows for the encapsulated + * packet to maintain a VLAN tag. + * + * ovep_dest + * + * Describes the kind of destination that the overlay plugin supports for + * sending traffic. For example, vxlan uses UDP, therefore it requires both + * an IP address and a port; however, nvgre uses the gre header and + * therefore only requires an IP address. The following flags may be + * combined: + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * Indicates that to send a packet to its destination, we + * require a link-layer ethernet address. + * + * OVERLAY_PLUGIN_D_IP + * + * Indicates that to send a packet to its destination, we + * require an IP address.
Note that all IP addresses are + * transmitted as IPv6 addresses; for an IPv4 + * destination, using an IPv4-mapped IPv6 address is the + * expected way to transmit that. + * + * OVERLAY_PLUGIN_D_PORT + * + * Indicates that to send a packet to its destination, a + * port is required; this usually indicates that the + * protocol uses something like TCP or UDP. + * + * + * -------------------------------------------------- + * Downcalls, Upcalls, and Synchronization Guarantees + * -------------------------------------------------- + * + * Every instance of a given module is independent. The kernel only guarantees + * that it will probably perform downcalls into different instances in parallel + * at some point. No locking is provided by the framework for synchronization + * across instances. If a module finds itself needing that, it will be up to it + * to provide it. + * + * In a given instance, the kernel may call into entry points in parallel. If + * the instance has private data, it should likely synchronize it. The one + * guarantee that we do make is that calls to getprop and setprop will be + * synchronized by a caller holding the MAC perimeter. + * + * While servicing a downcall from the general overlay device framework, a + * kernel module should not make any upcalls, excepting those functions that are + * defined in this header file, e.g. the property-related callbacks. Importantly, + * it cannot make any assumptions about what locks may or may not be held by the + * broader system. The only locks that are safe for it to use are its own. + * + * ---------------- + * Downcall Context + * ---------------- + * + * For all of the downcalls, excepting the overlay_plugin_encap_t and + * overlay_plugin_decap_t, the calls will be made either in kernel or user + * context; the module should not assume either way. + * + * overlay_plugin_encap_t and overlay_plugin_decap_t may be called in user, + * kernel or interrupt context; however, it is guaranteed that the interrupt + * will be below LOCK_LEVEL, and therefore it is safe to grab locks. + */ + +#include <sys/stream.h> +#include <sys/mac_provider.h> +#include <sys/ksocket.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION 0x1 + +typedef enum overlay_plugin_flags { + OVEP_F_VLAN_TAG = 0x01 /* Supports VLAN Tags */ +} overlay_plugin_flags_t; + +/* + * The ID space could easily be more than a 64-bit number, even + * though today it's either a 24-64 bit value. How should we future + * proof ourselves here? + */ +typedef struct ovep_encap_info { + uint64_t ovdi_id; + size_t ovdi_hdr_size; +} ovep_encap_info_t; + +typedef struct __overlay_prop_handle *overlay_prop_handle_t; +typedef struct __overlay_handle *overlay_handle_t; + +/* + * Plugins are guaranteed that calls to setprop are serialized. However, any + * number of other calls can be going on in parallel otherwise.
+ */ +typedef int (*overlay_plugin_encap_t)(void *, mblk_t *, + ovep_encap_info_t *, mblk_t **); +typedef int (*overlay_plugin_decap_t)(void *, mblk_t *, + ovep_encap_info_t *); +typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **); +typedef void (*overlay_plugin_fini_t)(void *); +typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *, + struct sockaddr *, socklen_t *); +typedef int (*overlay_plugin_sockopt_t)(ksocket_t); +typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *, + uint32_t *); +typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *, + uint32_t); +typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t); + +typedef struct overlay_plugin_ops { + uint_t ovpo_callbacks; + overlay_plugin_init_t ovpo_init; + overlay_plugin_fini_t ovpo_fini; + overlay_plugin_encap_t ovpo_encap; + overlay_plugin_decap_t ovpo_decap; + overlay_plugin_socket_t ovpo_socket; + overlay_plugin_sockopt_t ovpo_sockopt; + overlay_plugin_getprop_t ovpo_getprop; + overlay_plugin_setprop_t ovpo_setprop; + overlay_plugin_propinfo_t ovpo_propinfo; +} overlay_plugin_ops_t; + +typedef struct overlay_plugin_register { + uint_t ovep_version; + const char *ovep_name; + const overlay_plugin_ops_t *ovep_ops; + const char **ovep_props; + uint_t ovep_id_size; + uint_t ovep_flags; + uint_t ovep_dest; +} overlay_plugin_register_t; + +/* + * Functions that interact with registration + */ +extern overlay_plugin_register_t *overlay_plugin_alloc(uint_t); +extern void overlay_plugin_free(overlay_plugin_register_t *); +extern int overlay_plugin_register(overlay_plugin_register_t *); +extern int overlay_plugin_unregister(const char *); + +/* + * Property information callbacks + */ +extern void overlay_prop_set_name(overlay_prop_handle_t, const char *); +extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t); +extern void overlay_prop_set_type(overlay_prop_handle_t, overlay_prop_type_t); +extern int overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t); +extern void overlay_prop_set_nodefault(overlay_prop_handle_t); +extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t, + uint32_t); +extern void overlay_prop_set_range_str(overlay_prop_handle_t, const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_PLUGIN_H */
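To make the registration flow described in the comment above concrete, here is a sketch of a hypothetical encapsulation module's _init(9E). The module name "myencap", its property list, and its callbacks (whose bodies are omitted) are invented; only the alloc/register/mod_install/unregister/free sequence comes from the documentation above.

#include <sys/modctl.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <sys/overlay_plugin.h>

/* Callback bodies are elided from this sketch. */
static int myencap_o_init(overlay_handle_t, void **);
static void myencap_o_fini(void *);
static int myencap_encap(void *, mblk_t *, ovep_encap_info_t *, mblk_t **);
static int myencap_decap(void *, mblk_t *, ovep_encap_info_t *);
static int myencap_socket(void *, int *, int *, int *, struct sockaddr *,
    socklen_t *);
static int myencap_sockopt(ksocket_t);
static int myencap_getprop(void *, const char *, void *, uint32_t *);
static int myencap_setprop(void *, const char *, const void *, uint32_t);
static int myencap_propinfo(const char *, overlay_prop_handle_t);

static struct modlmisc myencap_modlmisc = {
	&mod_miscops, "myencap overlay plugin"
};

static struct modlinkage myencap_modlinkage = {
	MODREV_1, &myencap_modlmisc, NULL
};

static const char *myencap_props[] = { "myencap/listen_ip", NULL };

static const overlay_plugin_ops_t myencap_ops = {
	0,			/* ovpo_callbacks: reserved, must be zero */
	myencap_o_init,
	myencap_o_fini,
	myencap_encap,
	myencap_decap,
	myencap_socket,
	myencap_sockopt,
	myencap_getprop,
	myencap_setprop,
	myencap_propinfo
};

int
_init(void)
{
	overlay_plugin_register_t *ovrp;
	int err;

	if ((ovrp = overlay_plugin_alloc(OVEP_VERSION)) == NULL)
		return (EINVAL);
	ovrp->ovep_name = "myencap";
	ovrp->ovep_ops = &myencap_ops;
	ovrp->ovep_props = myencap_props;
	ovrp->ovep_id_size = 3;	/* a 24-bit id, as with VXLAN */
	ovrp->ovep_flags = OVEP_F_VLAN_TAG;
	ovrp->ovep_dest = OVERLAY_PLUGIN_D_IP | OVERLAY_PLUGIN_D_PORT;

	if ((err = overlay_plugin_register(ovrp)) == 0) {
		if ((err = mod_install(&myencap_modlinkage)) != 0)
			(void) overlay_plugin_unregister("myencap");
	}
	/* Free the registration structure regardless of the outcome. */
	overlay_plugin_free(ovrp);
	return (err);
}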
diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h new file mode 100644 index 0000000000..cae193c334 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_target.h @@ -0,0 +1,292 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. + */ + +#ifndef _OVERLAY_TARGET_H +#define _OVERLAY_TARGET_H + +/* + * Overlay device varpd ioctl interface (/dev/overlay) + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <netinet/in.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct overlay_target_point { + uint8_t otp_mac[ETHERADDRL]; + struct in6_addr otp_ip; + uint16_t otp_port; +} overlay_target_point_t; + +#define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8)) + +#define OVERLAY_TARG_INFO (OVERLAY_TARG_IOCTL | 0x01) + +typedef enum overlay_targ_info_flags { + OVERLAY_TARG_INFO_F_ACTIVE = 0x01, + OVERLAY_TARG_INFO_F_DEGRADED = 0x02 +} overlay_targ_info_flags_t; + +/* + * Get target information about an overlay device + */ +typedef struct overlay_targ_info { + datalink_id_t oti_linkid; + uint32_t oti_needs; + uint64_t oti_flags; + uint64_t oti_vnetid; +} overlay_targ_info_t; + +/* + * Declare an association between a given varpd instance and a datalink. + */ +#define OVERLAY_TARG_ASSOCIATE (OVERLAY_TARG_IOCTL | 0x02) + +typedef struct overlay_targ_associate { + datalink_id_t ota_linkid; + uint32_t ota_mode; + uint64_t ota_id; + uint32_t ota_provides; + overlay_target_point_t ota_point; +} overlay_targ_associate_t; + +/* + * Remove an association from a device. If the device has already been started, + * this implies OVERLAY_TARG_DEGRADE. + */ +#define OVERLAY_TARG_DISASSOCIATE (OVERLAY_TARG_IOCTL | 0x3) + +/* + * Tells the kernel that while a varpd instance still exists, it basically isn't + * making any forward progress, so the device should consider itself degraded. + */ +#define OVERLAY_TARG_DEGRADE (OVERLAY_TARG_IOCTL | 0x4) + +typedef struct overlay_targ_degrade { + datalink_id_t otd_linkid; + uint32_t otd_pad; + char otd_buf[OVERLAY_STATUS_BUFLEN]; +} overlay_targ_degrade_t; + +/* + * Tells the kernel to remove the degraded status that it set on a device. + */ +#define OVERLAY_TARG_RESTORE (OVERLAY_TARG_IOCTL | 0x5) + +typedef struct overlay_targ_id { + datalink_id_t otid_linkid; +} overlay_targ_id_t; + +/* + * The following ioctls are all used to support dynamic lookups from userland, + * generally serviced by varpd. + * + * The way this is designed to work is that userland will have threads sitting + * in OVERLAY_TARG_LOOKUP ioctls waiting to service requests. A thread will sit + * waiting for work for up to approximately one second before it is sent back + * out to userland, giving userland a chance to clean itself up or, more + * generally, to come back into the kernel for work. Once these threads + * return, they will have a request with which more action can be done. The + * following ioctls can all be used to answer the request. + * + * OVERLAY_TARG_RESPOND - overlay_targ_resp_t + * + * The overlay_targ_resp_t has the appropriate information from + * which a reply can be generated. The information is filled into + * an overlay_target_point_t as appropriate based on the + * overlay_plugin_dest_t type. + * + * + * OVERLAY_TARG_DROP - overlay_targ_resp_t + * + * The overlay_targ_resp_t should identify a request for which to + * drop a packet. + * + * + * OVERLAY_TARG_INJECT - overlay_targ_pkt_t + * + * The overlay_targ_pkt_t injects a fully formed packet into the + * virtual network. It may be identified either by its datalink id + * or by the request id. If both are specified, the + * datalink id will be used.
Note that an injection is not + * considered a reply; if this corresponds to a request, then + * that individual packet must still be dropped. + * + * + * OVERLAY_TARG_PKT - overlay_targ_pkt_t + * + * This ioctl can be used to copy data from a given request into a + * user buffer. This can be used in combination with + * OVERLAY_TARG_INJECT to implement services such as a proxy-arp. + * + * + * OVERLAY_TARG_RESEND - overlay_targ_pkt_t + * + * This ioctl is similar to OVERLAY_TARG_INJECT, except that instead + * of receiving the packet on the local mac handle, it queues it for + * retransmission again. This is useful if you have a packet that + * was originally destined for some broadcast or multicast address + * that you now want to send to a unicast address. + */ +#define OVERLAY_TARG_LOOKUP (OVERLAY_TARG_IOCTL | 0x10) +#define OVERLAY_TARG_RESPOND (OVERLAY_TARG_IOCTL | 0x11) +#define OVERLAY_TARG_DROP (OVERLAY_TARG_IOCTL | 0x12) +#define OVERLAY_TARG_INJECT (OVERLAY_TARG_IOCTL | 0x13) +#define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14) +#define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15) + +typedef struct overlay_targ_lookup { + uint64_t otl_dlid; + uint64_t otl_reqid; + uint64_t otl_varpdid; + uint64_t otl_vnetid; + uint64_t otl_hdrsize; + uint64_t otl_pktsize; + uint8_t otl_srcaddr[ETHERADDRL]; + uint8_t otl_dstaddr[ETHERADDRL]; + uint32_t otl_dsttype; + uint32_t otl_sap; + int32_t otl_vlan; +} overlay_targ_lookup_t; + +typedef struct overlay_targ_resp { + uint64_t otr_reqid; + overlay_target_point_t otr_answer; +} overlay_targ_resp_t; + +typedef struct overlay_targ_pkt { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + void *otp_buf; +} overlay_targ_pkt_t; + +#ifdef _KERNEL + +typedef struct overlay_targ_pkt32 { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + caddr32_t otp_buf; +} overlay_targ_pkt32_t; + +#endif /* _KERNEL */ + +/* + * This provides a way to get a list of active overlay devices independently + * from dlmgmtd. At the end of the day the kernel always knows what will exist + * and this allows varpd, which is implemented in terms of libdladm, not to end + * up needing to call back into dlmgmtd via libdladm and create an unfortunate + * dependency cycle. + */ + +#define OVERLAY_TARG_LIST (OVERLAY_TARG_IOCTL | 0x20) + +typedef struct overlay_targ_list { + uint32_t otl_nents; + uint32_t otl_ents[]; +} overlay_targ_list_t; + +/* + * The following family of ioctls all manipulate the target cache of a given + * device. + * + * OVERLAY_TARG_CACHE_GET - overlay_targ_cache_t + * + * The overlay_targ_cache_t should have its link identifier and + * the desired mac address filled in. On return, it will fill in + * the otc_dest member, if the entry exists in the table. + * + * + * OVERLAY_TARG_CACHE_SET - overlay_targ_cache_t + * + * The cache table entry of the mac address referred to by otc_mac + * and otc_linkid will be filled in with the details provided in + * the otc_dest member. + * + * OVERLAY_TARG_CACHE_REMOVE - overlay_targ_cache_t + * + * Removes the cache entry identified by otc_mac from the table. + * Note that this does not stop any in-flight lookups or deal with + * any data that is awaiting a lookup. + * + * + * OVERLAY_TARG_CACHE_FLUSH - overlay_targ_cache_t + * + * Similar to OVERLAY_TARG_CACHE_REMOVE, but functions on the + * entire table identified by otc_linkid. All other parameters are + * ignored.
+ * + * + * OVERLAY_TARG_CACHE_ITER - overlay_targ_cache_iter_t + * + * Iterates over the contents of a target cache identified by + * otci_linkid. Iteration is guaranteed to be exactly once for + * items which are in the hashtable at the beginning and end of + * iteration. For items which are added or removed after iteration + * has begun, only at most once semantics are guaranteed. Consumers + * should ensure that otci_marker is zeroed before starting + * iteration and should preserve its contents across calls. + * + * Before calling in, otci_count should be set to the number of + * entries that space has been allocated for in otci_ents. The + * value will be updated to indicate the total number written out. + */ + +#define OVERLAY_TARG_CACHE_GET (OVERLAY_TARG_IOCTL | 0x30) +#define OVERLAY_TARG_CACHE_SET (OVERLAY_TARG_IOCTL | 0x31) +#define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32) +#define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33) +#define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34) + +/* + * This is a pretty arbitrary number that we're constraining ourselves to + * for iteration. Basically the goal is to make sure that we can't have a user + * ask us to allocate too much memory on their behalf at any time. A more + * dynamic form may be necessary some day. + */ +#define OVERLAY_TARGET_ITER_MAX 500 + +#define OVERLAY_TARGET_CACHE_DROP 0x01 + +typedef struct overlay_targ_cache_entry { + uint8_t otce_mac[ETHERADDRL]; + uint16_t otce_flags; + overlay_target_point_t otce_dest; +} overlay_targ_cache_entry_t; + +typedef struct overlay_targ_cache { + datalink_id_t otc_linkid; + overlay_targ_cache_entry_t otc_entry; +} overlay_targ_cache_t; + +typedef struct overlay_targ_cache_iter { + datalink_id_t otci_linkid; + uint32_t otci_pad; + uint64_t otci_marker; + uint16_t otci_count; + overlay_targ_cache_entry_t otci_ents[]; +} overlay_targ_cache_iter_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _OVERLAY_TARGET_H */ diff --git a/usr/src/uts/common/sys/param.h b/usr/src/uts/common/sys/param.h index 01c25610dd..12bdc29079 100644 --- a/usr/src/uts/common/sys/param.h +++ b/usr/src/uts/common/sys/param.h @@ -103,7 +103,7 @@ extern "C" { #define DEFAULT_MAXPID 999999 #define DEFAULT_JUMPPID 100000 #else -#define DEFAULT_MAXPID 30000 +#define DEFAULT_MAXPID 99999 #define DEFAULT_JUMPPID 0 #endif diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index 276c4d386e..5328d02c59 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_POLICY_H @@ -107,6 +107,8 @@ int secpolicy_ipc_config(const cred_t *); int secpolicy_ipc_owner(const cred_t *, const struct kipc_perm *); int secpolicy_kmdb(const cred_t *); int secpolicy_lock_memory(const cred_t *); +int secpolicy_meminfo(const cred_t *); +int secpolicy_fs_import(const cred_t *); int secpolicy_modctl(const cred_t *, int); int secpolicy_net(const cred_t *, int, boolean_t); int secpolicy_net_bindmlp(const cred_t *); @@ -173,6 +175,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *, const vattr_t *, cred_t *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); int secpolicy_xvm_control(const cred_t *); +int secpolicy_hyprlofs_control(const cred_t *); int secpolicy_basic_exec(const cred_t *, vnode_t *); int secpolicy_basic_fork(const cred_t *); diff --git a/usr/src/uts/common/sys/poll.h b/usr/src/uts/common/sys/poll.h index 9fff78a966..efc8457a6a 100644 --- a/usr/src/uts/common/sys/poll.h +++ b/usr/src/uts/common/sys/poll.h @@ -30,6 +30,10 @@ * All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_POLL_H #define _SYS_POLL_H @@ -59,6 +63,7 @@ typedef unsigned long nfds_t; #define POLLWRNORM POLLOUT #define POLLRDBAND 0x0080 /* out-of-band data is readable */ #define POLLWRBAND 0x0100 /* out-of-band data is writeable */ +#define POLLRDHUP 0x4000 /* read-side hangup */ #define POLLNORM POLLRDNORM @@ -70,7 +75,13 @@ typedef unsigned long nfds_t; #define POLLHUP 0x0010 /* fd has been hung up on */ #define POLLNVAL 0x0020 /* invalid pollfd entry */ -#define POLLREMOVE 0x0800 /* remove a cached poll fd from /dev/poll */ +/* + * These events will never be specified in revents, but may be specified in + * events to control /dev/poll behavior. + */ +#define POLLREMOVE 0x0800 /* remove cached /dev/poll fd */ +#define POLLONESHOT 0x1000 /* /dev/poll should one-shot this fd */ +#define POLLET 0x2000 /* edge-triggered /dev/poll fd */ #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h index ede99d0df2..2e866ec4d4 100644 --- a/usr/src/uts/common/sys/poll_impl.h +++ b/usr/src/uts/common/sys/poll_impl.h @@ -24,11 +24,13 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_POLL_IMPL_H #define _SYS_POLL_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Caching Poll Subsystem: * @@ -160,6 +162,7 @@ typedef struct polldat { int pd_nsets; /* num of xref sets, used by poll(2) */ xref_t *pd_ref; /* ptr to xref info, 1 for each set */ struct port_kevent *pd_portev; /* associated port event struct */ + uint64_t pd_epolldata; /* epoll data, if any */ } polldat_t; /* @@ -187,7 +190,8 @@ typedef struct pollcache { } pollcache_t; /* pc_flag */ -#define T_POLLWAKE 0x02 /* pollwakeup() occurred */ +#define PC_POLLWAKE 0x02 /* pollwakeup() occurred */ +#define PC_WRITEWANTED 0x04 /* writer wishes to modify the pollcache_t */ #if defined(_KERNEL) /* diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index f1a2fc5485..e811bebf25 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 Joyent, Inc. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -346,7 +347,9 @@ typedef struct proc { struct zone *p_zone; /* zone in which process lives */ struct vnode *p_execdir; /* directory that p_exec came from */ struct brand *p_brand; /* process's brand */ - void *p_brand_data; /* per-process brand state */ + + /* per-process brand state */ + void *p_brand_data; /* additional lock to protect p_sessp (but not its contents) */ kmutex_t p_splock; @@ -361,7 +364,6 @@ typedef struct proc { */ struct user p_user; /* (see sys/user.h) */ } proc_t; - #define PROC_T /* headers relying on proc_t are OK */ #ifdef _KERNEL @@ -627,6 +629,7 @@ extern int signal_is_blocked(kthread_t *, int); extern int sigcheck(proc_t *, kthread_t *); extern void sigdefault(proc_t *); +extern struct pid *pid_find(pid_t pid); extern void pid_setmin(void); extern pid_t pid_allocate(proc_t *, pid_t, int); extern int pid_rele(struct pid *); @@ -716,6 +719,10 @@ extern kthread_t *thread_unpin(void); extern void thread_init(void); extern void thread_load(kthread_t *, void (*)(), caddr_t, size_t); +extern void thread_splitstack(void (*)(void *), void *, size_t); +extern void thread_splitstack_run(caddr_t, void (*)(void *), void *); +extern void thread_splitstack_cleanup(void); + extern void tsd_create(uint_t *, void (*)(void *)); extern void tsd_destroy(uint_t *); extern void *tsd_getcreate(uint_t *, void (*)(void *), void *(*)(void)); diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h index f592fd9dcf..501af712ef 100644 --- a/usr/src/uts/common/sys/procfs.h +++ b/usr/src/uts/common/sys/procfs.h @@ -25,6 +25,7 @@ */ /* * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _SYS_PROCFS_H @@ -233,6 +234,7 @@ typedef struct pstatus { #define PR_FAULTED 6 #define PR_SUSPENDED 7 #define PR_CHECKPOINT 8 +#define PR_BRAND 9 /* * lwp ps(1) information file. /proc/<pid>/lwp/<lwpid>/lwpsinfo diff --git a/usr/src/uts/common/sys/ptms.h b/usr/src/uts/common/sys/ptms.h index 6c79ee266d..ba8b2b1210 100644 --- a/usr/src/uts/common/sys/ptms.h +++ b/usr/src/uts/common/sys/ptms.h @@ -126,6 +126,12 @@ extern void ptms_logp(char *, uintptr_t); #define DDBGP(a, b) #endif +typedef struct __ptmptsopencb_arg *ptmptsopencb_arg_t; +typedef struct ptmptsopencb { + boolean_t (*ppocb_func)(ptmptsopencb_arg_t); + ptmptsopencb_arg_t ppocb_arg; +} ptmptsopencb_t; + #endif /* _KERNEL */ typedef struct pt_own { @@ -157,6 +163,19 @@ typedef struct pt_own { #define ZONEPT (('P'<<8)|4) /* set zone of master/slave pair */ #define OWNERPT (('P'<<8)|5) /* set owner/group for slave device */ +#ifdef _KERNEL +/* + * kernel ioctl commands + * + * PTMPTSOPENCB: Returns a callback function pointer and an opaque argument. + * When the callback is invoked with that opaque argument, its + * return value indicates whether the pts slave device is + * currently open. + */ +#define PTMPTSOPENCB (('P'<<8)|6) /* check if the slave is open */ + +#endif /* _KERNEL */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h b/usr/src/uts/common/sys/refhash.h index 2069e6d3f1..b7427a454d 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h +++ b/usr/src/uts/common/sys/refhash.h @@ -10,11 +10,11 @@ */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc.
*/ -#ifndef _SYS_SCSI_ADAPTERS_MPTHASH_H -#define _SYS_SCSI_ADAPTERS_MPTHASH_H +#ifndef _SYS_REFHASH_H +#define _SYS_REFHASH_H #include <sys/types.h> #include <sys/list.h> @@ -58,4 +58,4 @@ extern void *refhash_first(refhash_t *); extern void *refhash_next(refhash_t *, void *); extern boolean_t refhash_obj_valid(refhash_t *hp, const void *); -#endif /* _SYS_SCSI_ADAPTERS_MPTHASH_H */ +#endif /* _SYS_REFHASH_H */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index d69b23aade..666fbcd8a4 100644 --- a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -23,6 +23,7 @@ * * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -191,6 +192,7 @@ struct rusage { #define _RUSAGESYS_GETRUSAGE_CHLD 1 /* rusage child process */ #define _RUSAGESYS_GETRUSAGE_LWP 2 /* rusage lwp */ #define _RUSAGESYS_GETVMUSAGE 3 /* getvmusage */ +#define _RUSAGESYS_INVALMAP 4 /* vm_map_inval */ #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h index 836548aa30..2689fe27c4 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h @@ -22,7 +22,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. * Copyright (c) 2014, Tegile Systems Inc. All rights reserved. */ @@ -58,10 +58,10 @@ #include <sys/byteorder.h> #include <sys/queue.h> +#include <sys/refhash.h> #include <sys/isa_defs.h> #include <sys/sunmdi.h> #include <sys/mdi_impldefs.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_tool.h> #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_cnfg.h> diff --git a/usr/src/uts/common/sys/signal.h b/usr/src/uts/common/sys/signal.h index 8f0e1794f4..139784d578 100644 --- a/usr/src/uts/common/sys/signal.h +++ b/usr/src/uts/common/sys/signal.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -158,8 +159,8 @@ struct sigaction32 { * use of these symbols by applications is injurious * to binary compatibility */ -#define NSIG 74 /* valid signals range from 1 to NSIG-1 */ -#define MAXSIG 73 /* size of u_signal[], NSIG-1 <= MAXSIG */ +#define NSIG 75 /* valid signals range from 1 to NSIG-1 */ +#define MAXSIG 74 /* size of u_signal[], NSIG-1 <= MAXSIG */ #endif /* defined(__EXTENSIONS__) || !defined(_XPG4_2) */ #define MINSIGSTKSZ 2048 diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 590551f6f2..f5864db7d3 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -22,6 +22,7 @@ * Copyright 2014 Garrett D'Amore <garrett@damore.org> * * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -39,6 +40,9 @@ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. 
*/ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ #ifndef _SYS_SOCKET_H #define _SYS_SOCKET_H @@ -293,8 +297,9 @@ struct linger { #define AF_INET_OFFLOAD 30 /* Sun private; do not use */ #define AF_TRILL 31 /* TRILL interface */ #define AF_PACKET 32 /* PF_PACKET Linux socket interface */ +#define AF_LX_NETLINK 33 /* Linux-compatible netlink */ -#define AF_MAX 32 +#define AF_MAX 33 /* * Protocol families, same as address families for now. @@ -334,6 +339,7 @@ struct linger { #define PF_INET_OFFLOAD AF_INET_OFFLOAD /* Sun private; do not use */ #define PF_TRILL AF_TRILL #define PF_PACKET AF_PACKET +#define PF_LX_NETLINK AF_LX_NETLINK #define PF_MAX AF_MAX @@ -420,6 +426,7 @@ struct msghdr32 { #define MSG_NOTIFICATION 0x100 /* Notification, not data */ #define MSG_XPG4_2 0x8000 /* Private: XPG4.2 flag */ +/* Obsolete but kept for compilation compatibility. Use IOV_MAX. */ #define MSG_MAXIOVLEN 16 #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index 8221c620a8..b3f51cc0e1 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -99,6 +100,7 @@ struct sockaddr_ux { typedef struct sonodeops sonodeops_t; typedef struct sonode sonode_t; +typedef boolean_t (*so_krecv_f)(sonode_t *, mblk_t *, size_t, int, void *); struct sodirect_s; @@ -241,6 +243,10 @@ struct sonode { struct sof_instance *so_filter_top; /* top of stack */ struct sof_instance *so_filter_bottom; /* bottom of stack */ clock_t so_filter_defertime; /* time when deferred */ + + /* Kernel direct receive callbacks */ + so_krecv_f so_krecv_cb; /* recv callback */ + void *so_krecv_arg; /* recv cb arg */ }; #define SO_HAVE_DATA(so) \ @@ -294,15 +300,16 @@ struct sonode { #define SS_OOBPEND 0x00002000 /* OOB pending or present - poll */ #define SS_HAVEOOBDATA 0x00004000 /* OOB data present */ #define SS_HADOOBDATA 0x00008000 /* OOB data consumed */ -#define SS_CLOSING 0x00010000 /* in process of closing */ +#define SS_CLOSING 0x00010000 /* in process of closing */ #define SS_FIL_DEFER 0x00020000 /* filter deferred notification */ #define SS_FILOP_OK 0x00040000 /* socket can attach filters */ #define SS_FIL_RCV_FLOWCTRL 0x00080000 /* filter asserted rcv flow ctrl */ + #define SS_FIL_SND_FLOWCTRL 0x00100000 /* filter asserted snd flow ctrl */ #define SS_FIL_STOP 0x00200000 /* no more filter actions */ - #define SS_SODIRECT 0x00400000 /* transport supports sodirect */ +#define SS_FILOP_UNSF 0x00800000 /* block attaching unsafe filters */ #define SS_SENTLASTREADSIG 0x01000000 /* last rx signal has been sent */ #define SS_SENTLASTWRITESIG 0x02000000 /* last tx signal has been sent */ @@ -318,7 +325,8 @@ struct sonode { /* * Sockets that can fall back to TPI must ensure that fall back is not - * initiated while a thread is using a socket. + * initiated while a thread is using a socket. Otherwise this disables all + * future filter attachment. */ #define SO_BLOCK_FALLBACK(so, fn) \ ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ @@ -334,6 +342,24 @@ struct sonode { } \ } +/* + * Sockets that can fall back to TPI must ensure that fall back is not + * initiated while a thread is using a socket. Otherwise this disables all + * future unsafe filter attachment. Safe filters can still attach after + * we execute the function in which this macro is used.
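+ * + * A minimal usage sketch with hypothetical names (the real callers are + * the sockfs socket entry points): an operation such as bind, after + * which safe filters must still be able to attach, brackets its work + * with this macro rather than SO_BLOCK_FALLBACK, e.g.: + * + * SO_BLOCK_FALLBACK_SAFE(so, so_tpi_fallback_op(so, ...)); + * error = ...; (the guarded socket operation) + * SO_UNBLOCK_FALLBACK(so); + * + * where the second argument is the value returned if the socket has + * already completed its fall back to TPI.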
+ */ +#define SO_BLOCK_FALLBACK_SAFE(so, fn) \ + ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ + rw_enter(&(so)->so_fallback_rwlock, RW_READER); \ + if ((so)->so_state & SS_FALLBACK_COMP) { \ + rw_exit(&(so)->so_fallback_rwlock); \ + return (fn); \ + } else if (((so)->so_state & SS_FILOP_UNSF) == 0) { \ + mutex_enter(&(so)->so_lock); \ + (so)->so_state |= SS_FILOP_UNSF; \ + mutex_exit(&(so)->so_lock); \ + } + #define SO_UNBLOCK_FALLBACK(so) { \ rw_exit(&(so)->so_fallback_rwlock); \ } @@ -942,6 +968,15 @@ extern struct sonode *socreate(struct sockparams *, int, int, int, int, extern int so_copyin(const void *, void *, size_t, int); extern int so_copyout(const void *, void *, size_t, int); +/* + * Functions to manipulate the use of direct receive callbacks. This should not + * be used outside of sockfs and ksocket. These are generally considered a + * use-once interface for a socket and will cause all outstanding data on the + * socket to be flushed. + */ +extern int so_krecv_set(sonode_t *, so_krecv_f, void *); +extern void so_krecv_unblock(sonode_t *); + #endif /* diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h index 9f6d8b499b..c4dd6539de 100644 --- a/usr/src/uts/common/sys/sockfilter.h +++ b/usr/src/uts/common/sys/sockfilter.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_SOCKFILTER_H @@ -129,6 +130,15 @@ typedef struct sof_ops { #define SOF_VERSION 1 +/* + * Flag indicating that the filter module is safe to attach after bind, + * getsockname, getsockopt or setsockopt calls. By default filters are unsafe + * and may not be attached after any socket operation has been performed. + * However, a safe filter can still be attached after one of the above calls. + * This makes attaching the filter less dependent on the initial socket setup + * order. + */ +#define SOF_ATT_SAFE 0x1 + extern int sof_register(int, const char *, const sof_ops_t *, int); extern int sof_unregister(const char *); diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h index f1bd429815..35e1cf64c7 100644 --- a/usr/src/uts/common/sys/squeue.h +++ b/usr/src/uts/common/sys/squeue.h @@ -29,6 +29,17 @@ extern "C" { #endif +/* + * Originally in illumos, we had an IP-centric view of the serialization queue + * abstraction. While that has useful properties, the implementation of squeues + * hardcodes various parts of the implementation of IP into it, which makes it + * unsuitable for other consumers. To enable those consumers, we created + * another interface, but opted not to port all of the functionality that IP + * uses in the form of ip_squeue.c. As other consumers need the functionality + * that IP has in squeues, we'll come up with more generic methods and add that + * functionality to <sys/gsqueue.h>. Please do not continue to use this header.
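+ * + * For illustration only: assuming, as the sq_isip flag in squeue_impl.h + * suggests, that the new boolean_t argument to squeue_create() selects + * the IP-centric behavior, a non-IP consumer would look roughly like + * this ("wait" being a hypothetical drain wait time): + * + * squeue_t *sqp = squeue_create(wait, minclsyspri, B_FALSE); + * ... + * squeue_destroy(sqp);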
+ */ + #include <sys/types.h> #include <sys/processor.h> #include <sys/stream.h> @@ -76,12 +87,13 @@ typedef enum { struct ip_recv_attr_s; extern void squeue_init(void); -extern squeue_t *squeue_create(clock_t, pri_t); +extern squeue_t *squeue_create(clock_t, pri_t, boolean_t); extern void squeue_bind(squeue_t *, processorid_t); extern void squeue_unbind(squeue_t *); extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *, uint32_t, struct ip_recv_attr_s *, int, uint8_t); extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t); +extern void squeue_destroy(squeue_t *); struct conn_s; extern int squeue_synch_enter(struct conn_s *, mblk_t *); diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h index 22550886eb..d2418bbc15 100644 --- a/usr/src/uts/common/sys/squeue_impl.h +++ b/usr/src/uts/common/sys/squeue_impl.h @@ -117,6 +117,7 @@ struct squeue_s { squeue_set_t *sq_set; /* managed by squeue creator */ pri_t sq_priority; /* squeue thread priority */ + boolean_t sq_isip; /* use IP-centric features */ /* Keep the debug-only fields at the end of the structure */ #ifdef DEBUG @@ -165,6 +166,7 @@ struct squeue_s { #define SQS_POLL_RESTART_DONE 0x01000000 #define SQS_POLL_THR_QUIESCE 0x02000000 #define SQS_PAUSE 0x04000000 /* The squeue has been paused */ +#define SQS_EXIT 0x08000000 /* squeue is being torn down */ #define SQS_WORKER_THR_CONTROL \ (SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP) diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index 7a3b4e3448..4151cc0208 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -627,16 +628,11 @@ struct stroptions { /* * Structure for rw (read/write) procedure calls. A pointer * to a struiod_t is passed as a parameter to the rwnext() call. - * - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - * as there isn't a formal definition of IOV_MAX ??? 
*/ -#define DEF_IOV_MAX 16 - typedef struct struiod { mblk_t *d_mp; /* pointer to mblk (chain) */ uio_t d_uio; /* uio info */ - iovec_t d_iov[DEF_IOV_MAX]; /* iov referenced by uio */ + iovec_t *d_iov; /* iov referenced by uio */ } struiod_t; /* diff --git a/usr/src/uts/common/sys/sunddi.h b/usr/src/uts/common/sys/sunddi.h index 8d1fd5e730..18d2284b4a 100644 --- a/usr/src/uts/common/sys/sunddi.h +++ b/usr/src/uts/common/sys/sunddi.h @@ -462,6 +462,7 @@ extern size_t strlcat(char *, const char *, size_t); extern size_t strlcpy(char *, const char *, size_t); extern size_t strspn(const char *, const char *); extern size_t strcspn(const char *, const char *); +extern char *strsep(char **, const char *); extern int bcmp(const void *, const void *, size_t) __PURE; extern int stoi(char **); extern void numtos(ulong_t, char *); diff --git a/usr/src/uts/common/sys/syscall.h b/usr/src/uts/common/sys/syscall.h index e365668f85..7d86565564 100644 --- a/usr/src/uts/common/sys/syscall.h +++ b/usr/src/uts/common/sys/syscall.h @@ -511,8 +511,8 @@ typedef struct { /* return values from system call */ #if !defined(_KERNEL) -extern int syscall(int, ...); -extern int __systemcall(sysret_t *, int, ...); +extern long syscall(int, ...); +extern long __systemcall(sysret_t *, int, ...); extern int __set_errno(int); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/systrace.h b/usr/src/uts/common/sys/systrace.h index d43974451e..17e509d4d8 100644 --- a/usr/src/uts/common/sys/systrace.h +++ b/usr/src/uts/common/sys/systrace.h @@ -22,13 +22,12 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ #ifndef _SYS_SYSTRACE_H #define _SYS_SYSTRACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dtrace.h> #ifdef __cplusplus @@ -47,16 +46,18 @@ extern systrace_sysent_t *systrace_sysent; extern systrace_sysent_t *systrace_sysent32; extern void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern void systrace_stub(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #ifdef _SYSCALL32_IMPL extern int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #endif #endif diff --git a/usr/src/uts/common/sys/termios.h b/usr/src/uts/common/sys/termios.h index 09be20858d..889a7096cd 100644 --- a/usr/src/uts/common/sys/termios.h +++ b/usr/src/uts/common/sys/termios.h @@ -361,6 +361,24 @@ extern pid_t tcgetsid(int); #define TCSETSF (_TIOC|16) /* + * linux terminal ioctls we need to be aware of */ +#define TIOCSETLD (_TIOC|123) /* set line discipline parms */ +#define TIOCGETLD (_TIOC|124) /* get line discipline parms */ + +/* + * On Solaris the VMIN and VTIME slots overlap with VEOF and VEOL. This is + * perfectly legal, except that Linux expects them to be separate, so we + * keep them separately.
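+ * + * Concretely, in the historical SVID termios layout VMIN shares c_cc + * slot 4 with VEOF and VTIME shares slot 5 with VEOL, so emulation code + * can stash both views; a sketch (tios, saved_vmin and saved_vtime are + * hypothetical): + * + * struct lx_cc lcc; + * + * lcc.veof = tios.c_cc[VEOF]; (slot shared with VMIN) + * lcc.veol = tios.c_cc[VEOL]; (slot shared with VTIME) + * lcc.vmin = saved_vmin; + * lcc.vtime = saved_vtime;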
+ */ +struct lx_cc { + unsigned char veof; /* veof value */ + unsigned char veol; /* veol value */ + unsigned char vmin; /* vmin value */ + unsigned char vtime; /* vtime value */ +}; + +/* * NTP PPS ioctls */ #define TIOCGPPS (_TIOC|125) diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index 188230d61e..41ea2331df 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. + */ + #ifndef _SYS_THREAD_H #define _SYS_THREAD_H @@ -68,6 +72,8 @@ typedef struct ctxop { void (*free_op)(void *, int); /* function which frees the context */ void *arg; /* argument to above functions, ctx pointer */ struct ctxop *next; /* next context ops */ + hrtime_t save_ts; /* timestamp of last save */ + hrtime_t restore_ts; /* timestamp of last restore */ } ctxop_t; /* @@ -365,7 +371,7 @@ typedef struct _kthread { #define T_WOULDBLOCK 0x0020 /* for lockfs */ #define T_DONTBLOCK 0x0040 /* for lockfs */ #define T_DONTPEND 0x0080 /* for lockfs */ -#define T_SYS_PROF 0x0100 /* profiling on for duration of system call */ +#define T_SPLITSTK 0x0100 /* kernel stack is currently split */ #define T_WAITCVSEM 0x0200 /* waiting for a lwp_cv or lwp_sema on sleepq */ #define T_WATCHPT 0x0400 /* thread undergoing a watchpoint emulation */ #define T_PANIC 0x0800 /* thread initiated a system panic */ @@ -413,8 +419,9 @@ typedef struct _kthread { #define TS_RESUME 0x1000 /* setrun() by CPR resume process */ #define TS_CREATE 0x2000 /* setrun() by syslwp_create() */ #define TS_RUNQMATCH 0x4000 /* exact run queue balancing by setbackdq() */ +#define TS_BSTART 0x8000 /* setrun() by brand */ #define TS_ALLSTART \ - (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE) + (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE|TS_BSTART) #define TS_ANYWAITQ (TS_PROJWAITQ|TS_ZONEWAITQ) /* @@ -442,6 +449,10 @@ typedef struct _kthread { #define ISTOPPED(t) ((t)->t_state == TS_STOPPED && \ !((t)->t_schedflag & TS_PSTART)) +/* True if thread is stopped for a brand-specific reason */ +#define BSTOPPED(t) ((t)->t_state == TS_STOPPED && \ + !((t)->t_schedflag & TS_BSTART)) + /* True if thread is asleep and wakeable */ #define ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \ ((t)->t_flag & T_WAKEABLE))) diff --git a/usr/src/uts/common/sys/timer.h b/usr/src/uts/common/sys/timer.h index 604ddf5d83..ec349c962f 100644 --- a/usr/src/uts/common/sys/timer.h +++ b/usr/src/uts/common/sys/timer.h @@ -24,6 +24,10 @@ * Use is subject to license terms. */ +/* + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_TIMER_H #define _SYS_TIMER_H @@ -55,32 +59,45 @@ extern int timer_max; /* patchable via /etc/system */ struct clock_backend; -typedef struct itimer { +struct itimer; +typedef struct itimer itimer_t; + +struct itimer { itimerspec_t it_itime; hrtime_t it_hrtime; ushort_t it_flags; ushort_t it_lock; - void *it_arg; - sigqueue_t *it_sigq; - klwp_t *it_lwp; + void *it_arg; /* clock backend-specific data */ struct proc *it_proc; + union { + struct { + sigqueue_t *__it_sigq; + klwp_t *__it_lwp; + } __proc; + void *__it_frontend; + } __data; /* timer frontend-specific data */ kcondvar_t it_cv; int it_blockers; int it_pending; int it_overrun; struct clock_backend *it_backend; + void (*it_fire)(itimer_t *); kmutex_t it_mutex; void *it_portev; /* port_kevent_t pointer */ void *it_portsrc; /* port_source_t pointer */ int it_portfd; /* port file descriptor */ -} itimer_t; +}; + +#define it_sigq __data.__proc.__it_sigq +#define it_lwp __data.__proc.__it_lwp +#define it_frontend __data.__it_frontend typedef struct clock_backend { struct sigevent clk_default; int (*clk_clock_settime)(timespec_t *); int (*clk_clock_gettime)(timespec_t *); int (*clk_clock_getres)(timespec_t *); - int (*clk_timer_create)(itimer_t *, struct sigevent *); + int (*clk_timer_create)(itimer_t *, void (*)(itimer_t *)); int (*clk_timer_settime)(itimer_t *, int, const struct itimerspec *); int (*clk_timer_gettime)(itimer_t *, struct itimerspec *); int (*clk_timer_delete)(itimer_t *); @@ -88,8 +105,8 @@ typedef struct clock_backend { } clock_backend_t; extern void clock_add_backend(clockid_t clock, clock_backend_t *backend); +extern clock_backend_t *clock_get_backend(clockid_t clock); -extern void timer_fire(itimer_t *); extern void timer_lwpbind(); extern void timer_func(sigqueue_t *); diff --git a/usr/src/uts/common/sys/timerfd.h b/usr/src/uts/common/sys/timerfd.h new file mode 100644 index 0000000000..66cb50ac88 --- /dev/null +++ b/usr/src/uts/common/sys/timerfd.h @@ -0,0 +1,81 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. All rights reserved. + */ + +/* + * Header file to support the timerfd facility. + */ + +#ifndef _SYS_TIMERFD_H +#define _SYS_TIMERFD_H + +#include <sys/types.h> +#include <sys/time_impl.h> + +#ifdef __cplusplus extern "C" { #endif + +/* + * To assure binary compatibility with Linux, these values are fixed at their + * Linux equivalents, not their native ones. + */ +#define TFD_CLOEXEC 02000000 /* LX_O_CLOEXEC */ +#define TFD_NONBLOCK 04000 /* LX_O_NONBLOCK */ +#define TFD_TIMER_ABSTIME (1 << 0) +#define TFD_TIMER_CANCEL_ON_SET (1 << 1) + +/* + * These ioctl values are specific to the native implementation; applications + * shouldn't be using them directly, and they should therefore be safe to + * change without breaking apps.
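+ * + * For orientation, applications are expected to go through the public + * wrappers declared later in this header rather than these ioctls; a + * minimal illustrative consumer (error handling omitted) would be: + * + * int fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + * struct itimerspec its; + * + * its.it_value.tv_sec = 1; (first expiry in one second) + * its.it_value.tv_nsec = 0; + * its.it_interval = its.it_value; (and every second thereafter) + * (void) timerfd_settime(fd, 0, &its, NULL);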
+ */ +#define TIMERFDIOC (('t' << 24) | ('f' << 16) | ('d' << 8)) +#define TIMERFDIOC_CREATE (TIMERFDIOC | 1) /* create timer */ +#define TIMERFDIOC_SETTIME (TIMERFDIOC | 2) /* timerfd_settime() */ +#define TIMERFDIOC_GETTIME (TIMERFDIOC | 3) /* timerfd_gettime() */ + +typedef struct timerfd_settime { + uint64_t tfd_settime_flags; /* flags (e.g., TFD_TIMER_ABSTIME) */ + uint64_t tfd_settime_value; /* pointer to value */ + uint64_t tfd_settime_ovalue; /* pointer to old value, if any */ +} timerfd_settime_t; + +#ifndef _KERNEL + +extern int timerfd_create(int, int); +extern int timerfd_settime(int, int, + const struct itimerspec *, struct itimerspec *); +extern int timerfd_gettime(int, struct itimerspec *); + +#else + +#define TIMERFDMNRN_TIMERFD 0 +#define TIMERFDMNRN_CLONE 1 +#define TIMERFD_VALMAX (ULLONG_MAX - 1ULL) + +/* + * Fortunately, the values for the Linux clocks that are valid for timerfd + * (namely, CLOCK_REALTIME and CLOCK_MONOTONIC) don't overlap with our values + * for the same clocks. + */ +#define TIMERFD_MONOTONIC 1 /* Linux value for CLOCK_MONOTONIC */ + +#endif /* _KERNEL */ + +#ifdef __cplusplus } +#endif + +#endif /* _SYS_TIMERFD_H */ diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h index d5168c9b2c..c14a3bf11e 100644 --- a/usr/src/uts/common/sys/uadmin.h +++ b/usr/src/uts/common/sys/uadmin.h @@ -23,6 +23,7 @@ * * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -159,7 +160,7 @@ extern kmutex_t ualock; extern void mdboot(int, int, char *, boolean_t); extern void mdpreboot(int, int, char *); extern int kadmin(int, int, void *, cred_t *); -extern void killall(zoneid_t); +extern void killall(zoneid_t, boolean_t); #endif extern int uadmin(int, int, uintptr_t); diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h index 1aa4a8ee6d..c2954cbc29 100644 --- a/usr/src/uts/common/sys/vm_usage.h +++ b/usr/src/uts/common/sys/vm_usage.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ #ifndef _SYS_VM_USAGE_H @@ -79,8 +80,9 @@ extern "C" { /* zoneid */ #define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ /* euser */ +#define VMUSAGE_A_ZONE 0x4000 /* rss/swap for a specified zone */ -#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ +#define VMUSAGE_MASK 0x7fff /* all valid flags for getvmusage() */ typedef struct vmusage { id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ @@ -108,6 +110,7 @@ extern int getvmusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres); int vm_getusage(uint_t, time_t, vmusage_t *, size_t *, int); void vm_usage_init(); +int vm_map_inval(pid_t, caddr_t, size_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/vmsystm.h b/usr/src/uts/common/sys/vmsystm.h index 6122b6cd2f..c7b41730b6 100644 --- a/usr/src/uts/common/sys/vmsystm.h +++ b/usr/src/uts/common/sys/vmsystm.h @@ -19,6 +19,9 @@ * CDDL HEADER END */ /* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ +/* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
*/ @@ -159,6 +162,8 @@ extern void *boot_virt_alloc(void *addr, size_t size); extern size_t exec_get_spslew(void); +extern caddr_t map_userlimit(proc_t *pp, struct as *as, int flags); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/vnd.h b/usr/src/uts/common/sys/vnd.h new file mode 100644 index 0000000000..bc7c9c3122 --- /dev/null +++ b/usr/src/uts/common/sys/vnd.h @@ -0,0 +1,141 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_H +#define _SYS_VND_H + +#include <sys/types.h> +#include <sys/vnd_errno.h> +#include <sys/frameio.h> + +#ifdef __cplusplus extern "C" { #endif + +/* + * We distinguish between normal ioctls and private ioctls that we issue to + * our STREAMS version. STREAMS ioctls have the upper bit set in the lowest + * byte. Note that there are no STREAMS ioctls for userland, so all + * definitions related to them are absent from this file. + */ +#define VND_IOC (('v' << 24) | ('n' << 16) | ('d' << 8)) + +/* + * Attach the current minor instance to a given dlpi datalink identified by a + * vnd_ioc_name_t argument. This fails if it's already been attached. Note + * that unlike the other ioctls, whose arguments are passed as a pointer to + * the value, this argument is passed directly. + */ +#define VND_IOC_ATTACH (VND_IOC | 0x1) + +#define VND_NAMELEN 32 + +typedef struct vnd_ioc_attach { + char via_name[VND_NAMELEN]; + zoneid_t via_zoneid; + uint32_t via_errno; +} vnd_ioc_attach_t; + +/* + * Link the current minor instance into the /devices name space. + * + * This ioctl adds entries into /devices with a name of the form z%d:%s vil_zid, + * vil_name. The device will be namespaced to the zone. The global zone will be + * able to see all minor nodes. In the zone, only the /dev entries will exist. + * A given device can only have one link at a time. Note that a + * user cannot specify the zone to pass in, rather it is the zone that the + * device was attached in. + */ +#define VND_IOC_LINK (VND_IOC | 0x2) + +typedef struct vnd_ioc_link { + char vil_name[VND_NAMELEN]; + uint32_t vil_errno; +} vnd_ioc_link_t; + +/* + * Unlink the opened minor instance from the /devices name space. A zone may + * use this to unlink an extant entry in /dev; however, it will not be able to + * link it in again. + */ +#define VND_IOC_UNLINK (VND_IOC | 0x3) +typedef struct vnd_ioc_unlink { + uint32_t viu_errno; +} vnd_ioc_unlink_t; + +/* + * Controls to get and set the current receive and transmit buffer sizes. + */ +typedef struct vnd_ioc_buf { + uint64_t vib_size; + uint32_t vib_filler; + uint32_t vib_errno; +} vnd_ioc_buf_t; + +#define VND_IOC_GETRXBUF (VND_IOC | 0x04) +#define VND_IOC_SETRXBUF (VND_IOC | 0x05) +#define VND_IOC_GETMAXBUF (VND_IOC | 0x06) +#define VND_IOC_GETTXBUF (VND_IOC | 0x07) +#define VND_IOC_SETTXBUF (VND_IOC | 0x08) +#define VND_IOC_GETMINTU (VND_IOC | 0x09) +#define VND_IOC_GETMAXTU (VND_IOC | 0x0a) + +/* + * Information and listing ioctls + * + * This gets information about all of the active vnd instances.
vl_actents is + * always updated to the number of active instances, and vl_nents is the + * number of vnd_ioc_info_t elements allocated in vl_ents. + */ +typedef struct vnd_ioc_info { + uint32_t vii_version; + zoneid_t vii_zone; + char vii_name[VND_NAMELEN]; + char vii_datalink[VND_NAMELEN]; +} vnd_ioc_info_t; + +typedef struct vnd_ioc_list { + uint_t vl_nents; + uint_t vl_actents; + vnd_ioc_info_t *vl_ents; +} vnd_ioc_list_t; + +#ifdef _KERNEL + +typedef struct vnd_ioc_list32 { + uint_t vl_nents; + uint_t vl_actents; + caddr32_t vl_ents; +} vnd_ioc_list32_t; + +#endif /* _KERNEL */ + +#define VND_IOC_LIST (VND_IOC | 0x20) + +/* + * Framed I/O ioctls + * + * Users should use the standard frameio_t as opposed to a vnd specific type. + * This is a consolidation private ioctl pending further stability in the form + * of specific system work. + */ +#define VND_IOC_FRAMEIO_READ (VND_IOC | 0x30) +#define VND_IOC_FRAMEIO_WRITE (VND_IOC | 0x31) + +#ifdef __cplusplus } +#endif + +#endif /* _SYS_VND_H */ diff --git a/usr/src/uts/common/sys/vnd_errno.h b/usr/src/uts/common/sys/vnd_errno.h new file mode 100644 index 0000000000..89e5fc2543 --- /dev/null +++ b/usr/src/uts/common/sys/vnd_errno.h @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_ERRNO_H +#define _SYS_VND_ERRNO_H + +/* + * This header contains all of the available vnd errors.
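+ * + * These are surfaced through the *_errno members of the ioctl structures + * in <sys/vnd.h>; an illustrative check (fd is hypothetical) after a + * failed link request might be: + * + * vnd_ioc_link_t vil; + * + * (void) strlcpy(vil.vil_name, "net0", sizeof (vil.vil_name)); + * if (ioctl(fd, VND_IOC_LINK, &vil) != 0 && + * vil.vil_errno == VND_E_LINKEXISTS) + * warnx("another device has the same link name");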
+ */ + +#ifdef __cplusplus extern "C" { #endif + +typedef enum vnd_errno { + VND_E_SUCCESS = 0, /* no error */ + VND_E_NOMEM, /* no memory */ + VND_E_NODATALINK, /* no such datalink */ + VND_E_NOTETHER, /* not DL_ETHER */ + VND_E_DLPIINVAL, /* Unknown DLPI failures */ + VND_E_ATTACHFAIL, /* DL_ATTACH_REQ failed */ + VND_E_BINDFAIL, /* DL_BIND_REQ failed */ + VND_E_PROMISCFAIL, /* DL_PROMISCON_REQ failed */ + VND_E_DIRECTFAIL, /* DLD_CAPAB_DIRECT enable failed */ + VND_E_CAPACKINVAL, /* bad dl_capability_ack_t */ + VND_E_SUBCAPINVAL, /* bad dl_capability_sub_t */ + VND_E_DLDBADVERS, /* bad dld version */ + VND_E_KSTATCREATE, /* failed to create kstats */ + VND_E_NODEV, /* no such vnd link */ + VND_E_NONETSTACK, /* netstack doesn't exist */ + VND_E_ASSOCIATED, /* device already associated */ + VND_E_ATTACHED, /* device already attached */ + VND_E_LINKED, /* device already linked */ + VND_E_BADNAME, /* invalid name */ + VND_E_PERM, /* can't touch this */ + VND_E_NOZONE, /* no such zone */ + VND_E_STRINIT, /* failed to initialize vnd stream module */ + VND_E_NOTATTACHED, /* device not attached */ + VND_E_NOTLINKED, /* device not linked */ + VND_E_LINKEXISTS, /* another device has the same link name */ + VND_E_MINORNODE, /* failed to create minor node */ + VND_E_BUFTOOBIG, /* requested buffer size is too large */ + VND_E_BUFTOOSMALL, /* requested buffer size is too small */ + VND_E_DLEXCL, /* unable to get dlpi excl access */ + VND_E_DIRECTNOTSUP, + /* DLD direct capability not supported over data link */ + VND_E_BADPROPSIZE, /* invalid property size */ + VND_E_BADPROP, /* invalid property */ + VND_E_PROPRDONLY, /* property is read only */ + VND_E_SYS, /* unexpected system error */ + VND_E_CAPABPASS, + /* capabilities invalid, pass-through module detected */ + VND_E_UNKNOWN /* unknown error */ +} vnd_errno_t; + +#ifdef __cplusplus } +#endif + +#endif /* _SYS_VND_ERRNO_H */ diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index af9516fe52..28145636f3 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -735,7 +735,8 @@ typedef enum vnevent { VE_LINK = 6, /* Link with vnode's name as source */ VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ VE_MOUNTEDOVER = 8, /* File or Filesystem got mounted over vnode */ - VE_TRUNCATE = 9 /* Truncate */ + VE_TRUNCATE = 9, /* Truncate */ + VE_RENAME_SRC_DIR = 10 /* Rename with vnode as source dir */ } vnevent_t; /* @@ -1290,7 +1291,8 @@ void vnevent_remove(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_rmdir(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_create(vnode_t *, caller_context_t *); void vnevent_link(vnode_t *, caller_context_t *); -void vnevent_rename_dest_dir(vnode_t *, caller_context_t *ct); +void vnevent_rename_dest_dir(vnode_t *, vnode_t *, char *, + caller_context_t *ct); void vnevent_mountedover(vnode_t *, caller_context_t *); void vnevent_truncate(vnode_t *, caller_context_t *); int vnevent_support(vnode_t *, caller_context_t *); @@ -1323,6 +1325,9 @@ u_longlong_t fs_new_caller_id(); int vn_vmpss_usepageio(vnode_t *); +/* Empty v_path placeholder */ +extern char *vn_vpath_empty; + /* * Needed for use of IS_VMODSORT() in kernel.
*/ diff --git a/usr/src/uts/common/sys/vxlan.h b/usr/src/uts/common/sys/vxlan.h new file mode 100644 index 0000000000..d87786b507 --- /dev/null +++ b/usr/src/uts/common/sys/vxlan.h @@ -0,0 +1,47 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_VXLAN_H +#define _SYS_VXLAN_H + +/* + * Common VXLAN information + */ + +#include <sys/inttypes.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Sizes in bytes */ +#define VXLAN_HDR_LEN 8 +#define VXLAN_ID_LEN 3 + +#define VXLAN_F_VDI 0x08000000 +#define VXLAN_ID_SHIFT 8 + +#pragma pack(1) +typedef struct vxlan_hdr { + uint32_t vxlan_flags; + uint32_t vxlan_id; +} vxlan_hdr_t; +#pragma pack() + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VXLAN_H */ diff --git a/usr/src/uts/common/sys/zfd.h b/usr/src/uts/common/sys/zfd.h new file mode 100644 index 0000000000..acf9ce8d33 --- /dev/null +++ b/usr/src/uts/common/sys/zfd.h @@ -0,0 +1,63 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_ZFD_H +#define _SYS_ZFD_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Minor node name of the global zone side (often called the "master" side) + * of the zfd dev. + */ +#define ZFD_MASTER_NAME "master" + +/* + * Minor node name of the non-global zone side (often called the "slave" + * side) of the zfd dev. + */ +#define ZFD_SLAVE_NAME "slave" + +#define ZFD_NAME_LEN 16 + +/* + * ZFD_IOC forms the base for all zfd ioctls. + */ +#define ZFD_IOC (('Z' << 24) | ('f' << 16) | ('d' << 8)) + +/* + * This ioctl tells the slave side it should push the TTY stream modules + * so that the fd looks like a tty. + */ +#define ZFD_MAKETTY (ZFD_IOC | 0) + +/* + * This ioctl puts a hangup into the stream so that the slave side sees EOF. + */ +#define ZFD_EOF (ZFD_IOC | 1) + +/* + * This ioctl succeeds if the slave side is open. + */ +#define ZFD_HAS_SLAVE (ZFD_IOC | 2) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFD_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 7aab7c8d32..a5d1610842 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -23,6 +23,7 @@ * Copyright 2013, Joyent, Inc. All rights reserved. * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. + * Copyright 2015, Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_ZONE_H @@ -97,13 +98,17 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 -#define ZONE_ATTR_PHYS_MCAP 12 +#define ZONE_ATTR_PMCAP_NOVER 12 #define ZONE_ATTR_SCHED_CLASS 13 #define ZONE_ATTR_FLAGS 14 #define ZONE_ATTR_HOSTID 15 #define ZONE_ATTR_FS_ALLOWED 16 #define ZONE_ATTR_NETWORK 17 +#define ZONE_ATTR_DID 18 +#define ZONE_ATTR_PMCAP_PAGEOUT 19 #define ZONE_ATTR_INITNORESTART 20 +#define ZONE_ATTR_PG_FLT_DELAY 21 +#define ZONE_ATTR_RSS 22 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -184,6 +189,7 @@ typedef struct { uint32_t doi; /* DOI for label */ caddr32_t label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def32; #endif typedef struct { @@ -200,6 +206,7 @@ typedef struct { uint32_t doi; /* DOI for label */ const bslabel_t *label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def; /* extended error information */ @@ -244,9 +251,12 @@ typedef enum zone_cmd { typedef struct zone_cmd_arg { uint64_t uniqid; /* unique "generation number" */ zone_cmd_t cmd; /* requested action */ - uint32_t _pad; /* need consistent 32/64 bit alignmt */ + int status; /* init status on shutdown */ + uint32_t debug; /* enable brand hook debug */ char locale[MAXPATHLEN]; /* locale in which to render messages */ char bootbuf[BOOTARGS_MAX]; /* arguments passed to zone_boot() */ + /* Needed for 32/64 zoneadm -> zoneadmd door arg size check. */ + int pad; } zone_cmd_arg_t; /* @@ -372,7 +382,7 @@ typedef struct zone_dataset { } zone_dataset_t; /* - * structure for zone kstats + * structure for rctl zone kstats */ typedef struct zone_kstat { kstat_named_t zk_zonename; @@ -383,6 +393,58 @@ typedef struct zone_kstat { struct cpucap; typedef struct { + hrtime_t cycle_start; + uint_t cycle_cnt; + hrtime_t zone_avg_cnt; +} sys_zio_cntr_t; + +typedef struct { + kstat_named_t zv_zonename; + kstat_named_t zv_nread; + kstat_named_t zv_reads; + kstat_named_t zv_rtime; + kstat_named_t zv_rlentime; + kstat_named_t zv_nwritten; + kstat_named_t zv_writes; + kstat_named_t zv_wtime; + kstat_named_t zv_wlentime; + kstat_named_t zv_10ms_ops; + kstat_named_t zv_100ms_ops; + kstat_named_t zv_1s_ops; + kstat_named_t zv_10s_ops; + kstat_named_t zv_delay_cnt; + kstat_named_t zv_delay_time; +} zone_vfs_kstat_t; + +typedef struct { + kstat_named_t zz_zonename; + kstat_named_t zz_nread; + kstat_named_t zz_reads; + kstat_named_t zz_rtime; + kstat_named_t zz_rlentime; + kstat_named_t zz_nwritten; + kstat_named_t zz_writes; + kstat_named_t zz_waittime; +} zone_zfs_kstat_t; + +typedef struct { + kstat_named_t zm_zonename; + kstat_named_t zm_rss; + kstat_named_t zm_phys_cap; + kstat_named_t zm_swap; + kstat_named_t zm_swap_cap; + kstat_named_t zm_nover; + kstat_named_t zm_pagedout; + kstat_named_t zm_pgpgin; + kstat_named_t zm_anonpgin; + kstat_named_t zm_execpgin; + kstat_named_t zm_fspgin; + kstat_named_t zm_anon_alloc_fail; + kstat_named_t zm_pf_throttle; + kstat_named_t zm_pf_throttle_usec; +} zone_mcap_kstat_t; + +typedef struct { kstat_named_t zm_zonename; /* full name, kstat truncates name */ kstat_named_t zm_utime; kstat_named_t zm_stime; @@ -390,15 +452,12 @@ typedef struct { kstat_named_t zm_avenrun1; kstat_named_t zm_avenrun5; kstat_named_t zm_avenrun15; - kstat_named_t zm_run_ticks; - kstat_named_t zm_run_wait; - kstat_named_t zm_fss_shr_pct; - kstat_named_t zm_fss_pri_hi; - kstat_named_t zm_fss_pri_avg; kstat_named_t 
zm_ffcap; kstat_named_t zm_ffnoproc; kstat_named_t zm_ffnomem; kstat_named_t zm_ffmisc; + kstat_named_t zm_init_pid; + kstat_named_t zm_boot_time; } zone_misc_kstat_t; typedef struct zone { @@ -440,6 +499,7 @@ typedef struct zone { */ list_node_t zone_linkage; zoneid_t zone_id; /* ID of zone */ + zoneid_t zone_did; /* persistent debug ID of zone */ uint_t zone_ref; /* count of zone_hold()s on zone */ uint_t zone_cred_ref; /* count of zone_hold_cred()s on zone */ /* @@ -492,10 +552,11 @@ typedef struct zone { kcondvar_t zone_cv; /* used to signal state changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ pid_t zone_proc_initpid; /* pid of "init" for this zone */ - char *zone_initname; /* fs path to 'init' */ + char *zone_initname; /* fs path to 'init' */ + int zone_init_status; /* init's exit status */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ - uint64_t zone_phys_mcap; /* physical memory cap */ + rctl_qty_t zone_phys_mem_ctl; /* current phys. memory limit */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -533,6 +594,7 @@ typedef struct zone { tsol_mlp_list_t zone_mlps; /* MLPs on zone-private addresses */ boolean_t zone_restart_init; /* Restart init if it dies? */ + boolean_t zone_reboot_on_init_exit; /* Reboot if init dies? */ struct brand *zone_brand; /* zone's brand */ void *zone_brand_data; /* store brand specific data */ id_t zone_defaultcid; /* dflt scheduling class id */ @@ -544,6 +606,37 @@ typedef struct zone { list_t zone_dl_list; netstack_t *zone_netstack; struct cpucap *zone_cpucap; /* CPU caps data */ + + /* + * Data and counters used for ZFS fair-share disk IO. + */ + rctl_qty_t zone_zfs_io_pri; /* ZFS IO priority */ + uint_t zone_zfs_queued[2]; /* sync I/O enqueued count */ + uint64_t zone_zfs_weight; /* used to prevent starvation */ + uint64_t zone_io_util; /* IO utilization metric */ + boolean_t zone_io_util_above_avg; /* IO util percent > avg. */ + uint16_t zone_io_delay; /* IO delay on logical r/w */ + kmutex_t zone_stg_io_lock; /* protects IO window data */ + sys_zio_cntr_t zone_rd_ops; /* Counters for ZFS reads, */ + sys_zio_cntr_t zone_wr_ops; /* writes and */ + sys_zio_cntr_t zone_lwr_ops; /* logical writes. */ + + /* + * kstats and counters for VFS ops and bytes. + */ + kmutex_t zone_vfs_lock; /* protects VFS statistics */ + kstat_t *zone_vfs_ksp; + kstat_io_t zone_vfs_rwstats; + zone_vfs_kstat_t *zone_vfs_stats; + + /* + * kstats for ZFS I/O ops and bytes. + */ + kmutex_t zone_zfs_lock; /* protects ZFS statistics */ + kstat_t *zone_zfs_ksp; + kstat_io_t zone_zfs_rwstats; + zone_zfs_kstat_t *zone_zfs_stats; + /* * Solaris Auditing per-zone audit context */ @@ -563,6 +656,27 @@ typedef struct zone { kstat_t *zone_nprocs_kstat; /* + * kstats and counters for physical memory capping. + */ + rctl_qty_t zone_phys_mem; /* current bytes of phys. mem. (RSS) */ + kstat_t *zone_physmem_kstat; + uint64_t zone_mcap_nover; /* # of times over phys. cap */ + uint64_t zone_mcap_pagedout; /* bytes of mem. 
paged out */ + kmutex_t zone_mcap_lock; /* protects mcap statistics */ + kstat_t *zone_mcap_ksp; + zone_mcap_kstat_t *zone_mcap_stats; + uint64_t zone_pgpgin; /* pages paged in */ + uint64_t zone_anonpgin; /* anon pages paged in */ + uint64_t zone_execpgin; /* exec pages paged in */ + uint64_t zone_fspgin; /* fs pages paged in */ + uint64_t zone_anon_alloc_fail; /* cnt of anon alloc fails */ + uint64_t zone_pf_throttle; /* cnt of page flt throttles */ + uint64_t zone_pf_throttle_usec; /* time of page flt throttles */ + + /* Num usecs to throttle page fault when zone is over phys. mem cap */ + uint32_t zone_pg_flt_delay; + + /* * Misc. kstats and counters for zone cpu-usage aggregation. * The zone_Xtime values are the sum of the micro-state accounting * values for all threads that are running or have run in the zone. @@ -640,6 +754,7 @@ extern zone_t *zone_find_by_name(char *); extern zone_t *zone_find_by_any_path(const char *, boolean_t); extern zone_t *zone_find_by_path(const char *); extern zoneid_t getzoneid(void); +extern zoneid_t getzonedid(void); extern zone_t *zone_find_by_id_nolock(zoneid_t); extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *); extern int zone_check_datalink(zoneid_t *, datalink_id_t); @@ -834,6 +949,7 @@ extern int zone_walk(int (*)(zone_t *, void *), void *); extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; +extern rctl_hndl_t rc_zone_phys_mem; extern rctl_hndl_t rc_zone_max_lofi; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/syscall/brandsys.c b/usr/src/uts/common/syscall/brandsys.c index 9b4bd38baa..8ee5511fd0 100644 --- a/usr/src/uts/common/syscall/brandsys.c +++ b/usr/src/uts/common/syscall/brandsys.c @@ -23,7 +23,9 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ #include <sys/brand.h> #include <sys/systm.h> @@ -35,7 +37,7 @@ */ int64_t brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, - uintptr_t arg4, uintptr_t arg5, uintptr_t arg6) + uintptr_t arg4, uintptr_t arg5) { struct proc *p = curthread->t_procp; int64_t rval = 0; @@ -49,7 +51,7 @@ brandsys(int cmd, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, return (set_errno(ENOSYS)); if ((err = ZBROP(p->p_zone)->b_brandsys(cmd, &rval, arg1, arg2, arg3, - arg4, arg5, arg6)) != 0) + arg4, arg5)) != 0) return (set_errno(err)); return (rval); diff --git a/usr/src/uts/common/syscall/fcntl.c b/usr/src/uts/common/syscall/fcntl.c index 7421957235..d631fe62f6 100644 --- a/usr/src/uts/common/syscall/fcntl.c +++ b/usr/src/uts/common/syscall/fcntl.c @@ -22,6 +22,7 @@ /* * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -53,7 +54,8 @@ #include <sys/cmn_err.h> -static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); +/* This is global so that it can be used by brand emulation. */ +int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *); static void fd_too_big(proc_t *); @@ -271,11 +273,12 @@ fcntl(int fdes, int cmd, intptr_t arg) * The file system and vnode layers understand and implement * locking with flock64 structures. 
So here once we pass through * the test for compatibility as defined by the LFS API (for F_SETLK, - * F_SETLKW, F_GETLK, F_GETLKW, F_FREESP) we transform - * the flock structure to a flock64 structure and send it to the - * lower layers. Similarly in case of GETLK the returned flock64 - * structure is transformed to a flock structure if everything fits - * in nicely, otherwise we return EOVERFLOW. + * F_SETLKW, F_GETLK, F_GETLKW, F_OFD_GETLK, F_OFD_SETLK, F_OFD_SETLKW, + * F_FREESP) we transform the flock structure to a flock64 structure + * and send it to the lower layers. Similarly in case of GETLK and + * OFD_GETLK the returned flock64 structure is transformed to a flock + * structure if everything fits in nicely, otherwise we return + * EOVERFLOW. */ case F_GETLK: @@ -283,6 +286,11 @@ fcntl(int fdes, int cmd, intptr_t arg) case F_SETLK: case F_SETLKW: case F_SETLK_NBMAND: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: + case F_FLOCK: + case F_FLOCKW: /* * Copy in input fields only. @@ -345,20 +353,65 @@ fcntl(int fdes, int cmd, intptr_t arg) if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0) break; + if (cmd == F_FLOCK || cmd == F_FLOCKW) { + /* FLOCK* locking is always over the entire file. */ + if (bf.l_whence != 0 || bf.l_start != 0 || + bf.l_len != 0) { + error = EINVAL; + break; + } + if (bf.l_type < F_RDLCK || bf.l_type > F_UNLCK) { + error = EINVAL; + break; + } + } + + if (cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW) { + /* + * TBD: OFD-style locking is currently limited to + * covering the entire file. + */ + if (bf.l_whence != 0 || bf.l_start != 0 || + bf.l_len != 0) { + error = EINVAL; + break; + } + } + /* * Not all of the filesystems understand F_O_GETLK, and * there's no need for them to know. Map it to F_GETLK. + * + * The *_frlock functions in the various file systems basically + * do some validation and then funnel everything through the + * fs_frlock function. For OFD-style locks fs_frlock will do + * nothing so that once control returns here we can call the + * ofdlock function with the correct fp. For OFD-style locks, + * remote file systems that do not support them, such as NFS, + * detect and reject the OFD-style cmd argument. */ if ((error = VOP_FRLOCK(vp, (cmd == F_O_GETLK) ? F_GETLK : cmd, &bf, flag, offset, NULL, fp->f_cred, NULL)) != 0) break; + if (cmd == F_FLOCK || cmd == F_FLOCKW || cmd == F_OFD_GETLK || + cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW) { + /* + * This is an OFD-style lock so we need to handle it + * here. Because OFD-style locks are associated with + * the file_t we didn't have enough info down the + * VOP_FRLOCK path immediately above. + */ + if ((error = ofdlock(fp, cmd, &bf, flag, offset)) != 0) + break; + } + /* * If command is GETLK and no lock is found, only * the type field is changed. */ - if ((cmd == F_O_GETLK || cmd == F_GETLK) && - bf.l_type == F_UNLCK) { + if ((cmd == F_O_GETLK || cmd == F_GETLK || + cmd == F_OFD_GETLK) && bf.l_type == F_UNLCK) { /* l_type always first entry, always a short */ if (copyout(&bf.l_type, &((struct flock *)arg)->l_type, sizeof (bf.l_type))) @@ -387,7 +440,7 @@ fcntl(int fdes, int cmd, intptr_t arg) obf.l_pid = (int16_t)bf.l_pid; if (copyout(&obf, (void *)arg, sizeof (obf))) error = EFAULT; - } else if (cmd == F_GETLK) { + } else if (cmd == F_GETLK || cmd == F_OFD_GETLK) { /* * Copy out SVR4 flock.
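* * As an aside, a hypothetical userland consumer of the new OFD-style * commands would take a whole-file lock, the only granularity accepted * above, roughly as: * * struct flock fl; * * bzero(&fl, sizeof (fl)); * fl.l_type = F_WRLCK; (l_whence, l_start and l_len stay zero) * if (fcntl(fd, F_OFD_SETLK, &fl) != 0) * (handle error)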
*/ @@ -591,6 +644,11 @@ fcntl(int fdes, int cmd, intptr_t arg) case F_SETLK64: case F_SETLKW64: case F_SETLK64_NBMAND: + case F_OFD_GETLK64: + case F_OFD_SETLK64: + case F_OFD_SETLKW64: + case F_FLOCK64: + case F_FLOCKW64: /* * Large Files: Here we set cmd as *LK and send it to * lower layers. *LK64 is only for the user land. @@ -611,6 +669,16 @@ fcntl(int fdes, int cmd, intptr_t arg) cmd = F_SETLKW; else if (cmd == F_SETLK64_NBMAND) cmd = F_SETLK_NBMAND; + else if (cmd == F_OFD_GETLK64) + cmd = F_OFD_GETLK; + else if (cmd == F_OFD_SETLK64) + cmd = F_OFD_SETLK; + else if (cmd == F_OFD_SETLKW64) + cmd = F_OFD_SETLKW; + else if (cmd == F_FLOCK64) + cmd = F_FLOCK; + else if (cmd == F_FLOCKW64) + cmd = F_FLOCKW; /* * Note that the size of flock64 is different in the ILP32 @@ -636,18 +704,65 @@ fcntl(int fdes, int cmd, intptr_t arg) if ((error = flock_check(vp, &bf, offset, MAXOFFSET_T)) != 0) break; + if (cmd == F_FLOCK || cmd == F_FLOCKW) { + /* FLOCK* locking is always over the entire file. */ + if (bf.l_whence != 0 || bf.l_start != 0 || + bf.l_len != 0) { + error = EINVAL; + break; + } + if (bf.l_type < F_RDLCK || bf.l_type > F_UNLCK) { + error = EINVAL; + break; + } + } + + if (cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW) { + /* + * TBD OFD-style locking is currently limited to + * covering the entire file. + */ + if (bf.l_whence != 0 || bf.l_start != 0 || + bf.l_len != 0) { + error = EINVAL; + break; + } + } + + /* + * The *_frlock functions in the various file systems basically + * do some validation and then funnel everything through the + * fs_frlock function. For OFD-style locks fs_frlock will do + * nothing so that once control returns here we can call the + * ofdlock function with the correct fp. For OFD-style locks + * the unsupported remote file systems, such as NFS, detect and + * reject the OFD-style cmd argument. + */ if ((error = VOP_FRLOCK(vp, cmd, &bf, flag, offset, NULL, fp->f_cred, NULL)) != 0) break; - if ((cmd == F_GETLK) && bf.l_type == F_UNLCK) { + if (cmd == F_FLOCK || cmd == F_FLOCKW || cmd == F_OFD_GETLK || + cmd == F_OFD_SETLK || cmd == F_OFD_SETLKW) { + /* + * This is an OFD-style lock so we need to handle it + * here. Because OFD-style locks are associated with + * the file_t we didn't have enough info down the + * VOP_FRLOCK path immediately above. + */ + if ((error = ofdlock(fp, cmd, &bf, flag, offset)) != 0) + break; + } + + if ((cmd == F_GETLK || cmd == F_OFD_GETLK) && + bf.l_type == F_UNLCK) { if (copyout(&bf.l_type, &((struct flock *)arg)->l_type, sizeof (bf.l_type))) error = EFAULT; break; } - if (cmd == F_GETLK) { + if (cmd == F_GETLK || cmd == F_OFD_GETLK) { int i; /* diff --git a/usr/src/uts/common/syscall/lgrpsys.c b/usr/src/uts/common/syscall/lgrpsys.c index 44d616fa34..1066b94986 100644 --- a/usr/src/uts/common/syscall/lgrpsys.c +++ b/usr/src/uts/common/syscall/lgrpsys.c @@ -22,10 +22,9 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * lgroup system calls */ @@ -40,6 +39,7 @@ #include <sys/lgrp_user.h> #include <sys/promif.h> /* for prom_printf() */ #include <sys/sysmacros.h> +#include <sys/policy.h> #include <vm/as.h> @@ -141,6 +141,24 @@ meminfo(int addr_count, struct meminfo *mip) } /* + * Validate privs for each req. 
+ */ + for (i = 0; i < info_count; i++) { + switch (req_array[i] & MEMINFO_MASK) { + case MEMINFO_VLGRP: + case MEMINFO_VPAGESIZE: + break; + default: + if (secpolicy_meminfo(CRED()) != 0) { + kmem_free(req_array, req_size); + kmem_free(in_array, in_size); + return (set_errno(EPERM)); + } + break; + } + } + + /* * allocate buffer out_array which holds the results and will have * to be copied out later */ diff --git a/usr/src/uts/common/syscall/memcntl.c b/usr/src/uts/common/syscall/memcntl.c index ae2a0cc45c..721f884a7e 100644 --- a/usr/src/uts/common/syscall/memcntl.c +++ b/usr/src/uts/common/syscall/memcntl.c @@ -115,13 +115,17 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask) * MS_SYNC used to be defined to be zero but is now non-zero. * For binary compatibility we still accept zero * (the absence of MS_ASYNC) to mean the same thing. + * Binary compatibility is not an issue for MS_INVALCURPROC. */ iarg = (uintptr_t)arg; if ((iarg & ~MS_INVALIDATE) == 0) iarg |= MS_SYNC; - if (((iarg & ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE)) != 0) || - ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC))) { + if (((iarg & + ~(MS_SYNC|MS_ASYNC|MS_INVALIDATE|MS_INVALCURPROC)) != 0) || + ((iarg & (MS_SYNC|MS_ASYNC)) == (MS_SYNC|MS_ASYNC)) || + ((iarg & (MS_INVALIDATE|MS_INVALCURPROC)) == + (MS_INVALIDATE|MS_INVALCURPROC))) { error = set_errno(EINVAL); } else { error = as_ctl(as, addr, len, cmd, attr, iarg, NULL, 0); @@ -347,7 +351,8 @@ memcntl(caddr_t addr, size_t len, int cmd, caddr_t arg, int attr, int mask) } return (error); case MC_ADVISE: - if ((uintptr_t)arg == MADV_FREE) { + if ((uintptr_t)arg == MADV_FREE || + (uintptr_t)arg == MADV_PURGE) { len &= PAGEMASK; } switch ((uintptr_t)arg) { diff --git a/usr/src/uts/common/syscall/open.c b/usr/src/uts/common/syscall/open.c index edb04c824b..874e31869c 100644 --- a/usr/src/uts/common/syscall/open.c +++ b/usr/src/uts/common/syscall/open.c @@ -74,12 +74,12 @@ copen(int startfd, char *fname, int filemode, int createmode) if (filemode & (FSEARCH|FEXEC)) { /* - * Must be one or the other and neither FREAD nor FWRITE + * Must be one or the other. * Must not be any of FAPPEND FCREAT FTRUNC FXATTR FXATTRDIROPEN - * XXX: Should these just be silently ignored? + * XXX: Should these just be silently ignored like we + * silently ignore FREAD|FWRITE? */ - if ((filemode & (FREAD|FWRITE)) || - (filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || + if ((filemode & (FSEARCH|FEXEC)) == (FSEARCH|FEXEC) || (filemode & (FAPPEND|FCREAT|FTRUNC|FXATTR|FXATTRDIROPEN))) return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/poll.c b/usr/src/uts/common/syscall/poll.c index 7f37529941..c33156a4fc 100644 --- a/usr/src/uts/common/syscall/poll.c +++ b/usr/src/uts/common/syscall/poll.c @@ -29,6 +29,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. */ /* @@ -525,13 +526,13 @@ poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) } /* - * If T_POLLWAKE is set, a pollwakeup() was performed on + * If PC_POLLWAKE is set, a pollwakeup() was performed on * one of the file descriptors. This can happen only if * one of the VOP_POLL() functions dropped pcp->pc_lock. * The only current cases of this is in procfs (prpoll()) * and STREAMS (strpoll()). */ - if (pcp->pc_flag & T_POLLWAKE) + if (pcp->pc_flag & PC_POLLWAKE) continue; /* @@ -886,9 +887,9 @@ retry: } /* - * This function is called to inform a thread that - * an event being polled for has occurred. 
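The privilege loop just added to meminfo() keeps MEMINFO_VLGRP and MEMINFO_VPAGESIZE unprivileged while gating every other request type behind secpolicy_meminfo(). A sketch of an unprivileged meminfo(2) call that stays within the exempt requests; the validity-bit layout (bit 0 for the address, bit i+1 for request i) follows meminfo(2):

    #include <sys/types.h>
    #include <sys/mman.h>
    #include <stdio.h>

    int
    main(void)
    {
            int x;
            uint64_t addr = (uint64_t)(uintptr_t)&x;
            uint_t req[2] = { MEMINFO_VLGRP, MEMINFO_VPAGESIZE };
            uint64_t out[2];
            uint_t valid;

            if (meminfo(&addr, 1, req, 2, out, &valid) != 0) {
                    perror("meminfo");
                    return (1);
            }
            if (valid & 0x2)        /* request 0 satisfied */
                    (void) printf("home lgroup: %llu\n",
                        (unsigned long long)out[0]);
            if (valid & 0x4)        /* request 1 satisfied */
                    (void) printf("page size: %llu\n",
                        (unsigned long long)out[1]);
            return (0);
    }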
- * The pollstate lock on the thread should be held on entry. + * This function is called to inform a thread (or threads) that an event being + * polled on has occurred. The pollstate lock on the thread should be held + * on entry. */ void pollnotify(pollcache_t *pcp, int fd) @@ -896,8 +897,8 @@ pollnotify(pollcache_t *pcp, int fd) ASSERT(fd < pcp->pc_mapsize); ASSERT(MUTEX_HELD(&pcp->pc_lock)); BT_SET(pcp->pc_bitmap, fd); - pcp->pc_flag |= T_POLLWAKE; - cv_signal(&pcp->pc_cv); + pcp->pc_flag |= PC_POLLWAKE; + cv_broadcast(&pcp->pc_cv); } /* @@ -2024,7 +2025,7 @@ retry: */ if ((pdp->pd_php != NULL) && (pollfdp[entry].events == pdp->pd_events) && - ((pcp->pc_flag & T_POLLWAKE) == 0)) { + ((pcp->pc_flag & PC_POLLWAKE) == 0)) { BT_CLEAR(pcp->pc_bitmap, fd); } /* @@ -2251,7 +2252,7 @@ pollstate_destroy(pollstate_t *ps) pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets); ps->ps_pcacheset = NULL; if (ps->ps_dpbuf != NULL) { - kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t)); + kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); ps->ps_dpbuf = NULL; } mutex_destroy(&ps->ps_lock); diff --git a/usr/src/uts/common/syscall/rusagesys.c b/usr/src/uts/common/syscall/rusagesys.c index 3e0e63f4c0..417c629168 100644 --- a/usr/src/uts/common/syscall/rusagesys.c +++ b/usr/src/uts/common/syscall/rusagesys.c @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* @@ -257,6 +258,19 @@ rusagesys(int code, void *arg1, void *arg2, void *arg3, void *arg4) case _RUSAGESYS_GETVMUSAGE: return (vm_getusage((uint_t)(uintptr_t)arg1, (time_t)arg2, (vmusage_t *)arg3, (size_t *)arg4, 0)); + case _RUSAGESYS_INVALMAP: + /* + * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD + * handling so callers on SPARC should get simple sync + * handling with invalidation to all processes. + */ +#if defined(__sparc) + return (memcntl((caddr_t)arg2, (size_t)arg3, MC_SYNC, + (caddr_t)(MS_ASYNC | MS_INVALIDATE), 0, 0)); +#else + return (vm_map_inval((pid_t)(uintptr_t)arg1, (caddr_t)arg2, + (size_t)arg3)); +#endif default: return (set_errno(EINVAL)); } diff --git a/usr/src/uts/common/syscall/rw.c b/usr/src/uts/common/syscall/rw.c index a28894b2c9..943b7d244e 100644 --- a/usr/src/uts/common/syscall/rw.c +++ b/usr/src/uts/common/syscall/rw.c @@ -22,7 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -50,6 +50,7 @@ #include <sys/debug.h> #include <sys/rctl.h> #include <sys/nbmlock.h> +#include <sys/limits.h> #define COPYOUT_MAX_CACHE (1<<17) /* 128K */ @@ -607,19 +608,12 @@ out: return (bcount); } -/* - * XXX -- The SVID refers to IOV_MAX, but doesn't define it. Grrrr.... - * XXX -- However, SVVS expects readv() and writev() to fail if - * XXX -- iovcnt > 16 (yes, it's hard-coded in the SVVS source), - * XXX -- so I guess that's the "interface". 
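In pollnotify() above, cv_signal() becomes cv_broadcast() (and the thread-level T_POLLWAKE becomes the pollcache-level PC_POLLWAKE) because several threads may now block on one shared pollcache. A userland pthreads analogy, not kernel code, of why waking a single waiter is insufficient when the predicate holds for all of them:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    static int event_pending = 0;

    static void *
    waiter(void *arg)
    {
            (void) pthread_mutex_lock(&lock);
            while (!event_pending)
                    (void) pthread_cond_wait(&cv, &lock);
            (void) pthread_mutex_unlock(&lock);
            (void) printf("waiter %d woke\n", (int)(uintptr_t)arg);
            return (NULL);
    }

    int
    main(void)
    {
            pthread_t t1, t2;

            (void) pthread_create(&t1, NULL, waiter, (void *)1);
            (void) pthread_create(&t2, NULL, waiter, (void *)2);

            (void) pthread_mutex_lock(&lock);
            event_pending = 1;      /* akin to setting PC_POLLWAKE */
            /* pthread_cond_signal() here could strand one waiter */
            (void) pthread_cond_broadcast(&cv);
            (void) pthread_mutex_unlock(&lock);

            (void) pthread_join(t1, NULL);
            (void) pthread_join(t2, NULL);
            return (0);
    }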
- */ -#define DEF_IOV_MAX 16 - ssize_t readv(int fdes, struct iovec *iovp, int iovcnt) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -630,9 +624,14 @@ readv(int fdes, struct iovec *iovp, int iovcnt) u_offset_t fileoff; int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -640,36 +639,63 @@ readv(int fdes, struct iovec *iovp, int iovcnt) * of data in a single call. */ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((fp = getf(fdes)) == NULL) + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FREAD) == 0) { error = EBADF; goto out; @@ -768,6 +794,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -777,7 +805,8 @@ ssize_t writev(int fdes, struct iovec *iovp, int iovcnt) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -788,9 +817,14 @@ writev(int fdes, struct iovec *iovp, int iovcnt) u_offset_t fileoff; int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -798,36 +832,62 @@ writev(int fdes, struct iovec *iovp, int iovcnt) * of data in a single call. 
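readv() above now sizes its iovec array dynamically: counts up to IOV_MAX_STACK live in an on-stack buffer, larger counts (up to IOV_MAX) take a kmem_alloc() that every exit path must release. A userland rendition of the same idiom; DEMO_IOV_MAX_STACK stands in for the patch's IOV_MAX_STACK from <sys/limits.h>:

    #include <limits.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    #define DEMO_IOV_MAX_STACK      16

    ssize_t
    sum_iov_lengths(const struct iovec *uiov, int iovcnt)
    {
            struct iovec buf[DEMO_IOV_MAX_STACK], *aiov = buf;
            size_t alloclen = 0;
            ssize_t total = 0;
            int i;

            if (iovcnt <= 0 || iovcnt > IOV_MAX)
                    return (-1);

            /* Large vectors go to the heap, as kmem_alloc() does above. */
            if (iovcnt > DEMO_IOV_MAX_STACK) {
                    alloclen = iovcnt * sizeof (struct iovec);
                    if ((aiov = malloc(alloclen)) == NULL)
                            return (-1);
            }

            /* Stands in for copyin() of the user's iovec array. */
            (void) memcpy(aiov, uiov, iovcnt * sizeof (struct iovec));

            for (i = 0; i < iovcnt; i++) {
                    total += aiov[i].iov_len;
                    if (total < 0) {        /* overflow, as in the kernel */
                            if (alloclen != 0)
                                    free(aiov); /* every error path frees */
                            return (-1);
                    }
            }

            if (alloclen != 0)
                    free(aiov);
            return (total);
    }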
*/ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen = aiov32[i].iov_len; count32 += iovlen; - if (iovlen < 0 || count32 < 0) + if (iovlen < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((fp = getf(fdes)) == NULL) + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FWRITE) == 0) { error = EBADF; goto out; @@ -917,6 +977,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -927,7 +989,8 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, off_t extended_offset) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -952,9 +1015,14 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -962,39 +1030,68 @@ preadv(int fdes, struct iovec *iovp, int iovcnt, off_t offset, * of data in a single call. 
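The same stack-or-heap pattern repeats in writev(), preadv() and pwritev(). With the hard-coded DEF_IOV_MAX of 16 gone, the vectored I/O calls accept up to IOV_MAX entries; a userland sketch that would have failed with EINVAL under the old limit:

    #include <stdio.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int
    main(void)
    {
            struct iovec iov[64];
            int i;

            for (i = 0; i < 64; i++) {
                    iov[i].iov_base = "x";
                    iov[i].iov_len = 1;
            }

            /* 64 vectors exceeded the old 16-entry cap; now accepted. */
            if (writev(STDOUT_FILENO, iov, 64) < 0)
                    perror("writev");
            return (0);
    }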
*/ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif /* _SYSCALL32_IMPL */ - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((bcount = (ssize_t)count) < 0) + if ((bcount = (ssize_t)count) < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); - if ((fp = getf(fdes)) == NULL) + } + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FREAD) == 0) { error = EBADF; goto out; @@ -1099,6 +1196,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); @@ -1109,7 +1208,8 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, off_t extended_offset) { struct uio auio; - struct iovec aiov[DEF_IOV_MAX]; + struct iovec buf[IOV_MAX_STACK], *aiov = buf; + int aiovlen = 0; file_t *fp; register vnode_t *vp; struct cpu *cp; @@ -1134,9 +1234,14 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, int in_crit = 0; - if (iovcnt <= 0 || iovcnt > DEF_IOV_MAX) + if (iovcnt <= 0 || iovcnt > IOV_MAX) return (set_errno(EINVAL)); + if (iovcnt > IOV_MAX_STACK) { + aiovlen = iovcnt * sizeof (iovec_t); + aiov = kmem_alloc(aiovlen, KM_SLEEP); + } + #ifdef _SYSCALL32_IMPL /* * 32-bit callers need to have their iovec expanded, @@ -1144,39 +1249,68 @@ pwritev(int fdes, struct iovec *iovp, int iovcnt, off_t offset, * of data in a single call. 
*/ if (get_udatamodel() == DATAMODEL_ILP32) { - struct iovec32 aiov32[DEF_IOV_MAX]; + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32; + int aiov32len; ssize32_t count32; - if (copyin(iovp, aiov32, iovcnt * sizeof (struct iovec32))) + aiov32len = iovcnt * sizeof (iovec32_t); + if (aiovlen != 0) + aiov32 = kmem_alloc(aiov32len, KM_SLEEP); + + if (copyin(iovp, aiov32, aiov32len)) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EFAULT)); + } count32 = 0; for (i = 0; i < iovcnt; i++) { ssize32_t iovlen32 = aiov32[i].iov_len; count32 += iovlen32; - if (iovlen32 < 0 || count32 < 0) + if (iovlen32 < 0 || count32 < 0) { + if (aiovlen != 0) { + kmem_free(aiov32, aiov32len); + kmem_free(aiov, aiovlen); + } return (set_errno(EINVAL)); + } aiov[i].iov_len = iovlen32; aiov[i].iov_base = (caddr_t)(uintptr_t)aiov32[i].iov_base; } + if (aiovlen != 0) + kmem_free(aiov32, aiov32len); } else #endif /* _SYSCALL32_IMPL */ - if (copyin(iovp, aiov, iovcnt * sizeof (struct iovec))) + if (copyin(iovp, aiov, iovcnt * sizeof (iovec_t))) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EFAULT)); + } count = 0; for (i = 0; i < iovcnt; i++) { ssize_t iovlen = aiov[i].iov_len; count += iovlen; - if (iovlen < 0 || count < 0) + if (iovlen < 0 || count < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); + } } - if ((bcount = (ssize_t)count) < 0) + if ((bcount = (ssize_t)count) < 0) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EINVAL)); - if ((fp = getf(fdes)) == NULL) + } + if ((fp = getf(fdes)) == NULL) { + if (aiovlen != 0) + kmem_free(aiov, aiovlen); return (set_errno(EBADF)); + } if (((fflag = fp->f_flag) & FWRITE) == 0) { error = EBADF; goto out; @@ -1308,6 +1442,8 @@ out: if (in_crit) nbl_end_crit(vp); releasef(fdes); + if (aiovlen != 0) + kmem_free(aiov, aiovlen); if (error) return (set_errno(error)); return (count); diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c index cb8246f584..ccceca7c6d 100644 --- a/usr/src/uts/common/syscall/sendfile.c +++ b/usr/src/uts/common/syscall/sendfile.c @@ -82,7 +82,7 @@ extern sotpi_info_t *sotpi_sototpi(struct sonode *); * 64 bit kernel or 32 bit kernel. For 32 bit apps, we can't transfer * more than 2GB of data. */ -int +static int sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, int copy_cnt, ssize32_t *count) { @@ -343,7 +343,7 @@ sendvec_chunk64(file_t *fp, u_offset_t *fileoff, struct ksendfilevec64 *sfv, return (0); } -ssize32_t +static ssize32_t sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, size32_t *xferred, int fildes) { @@ -390,7 +390,7 @@ sendvec64(file_t *fp, const struct ksendfilevec64 *vec, int sfvcnt, } #endif -int +static int sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t total_size, int maxblk, ssize_t *count) { @@ -680,7 +680,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, } -int +static int sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int copy_cnt, ssize_t *count) { @@ -1160,6 +1160,17 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, } else { maxblk = (int)vp->v_stream->sd_maxblk; } + + /* + * We need to make sure that the socket that we're sending on + * supports sendfile behavior. 
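The hunk that follows makes sendfilev() fail with EOPNOTSUPP on sockets whose sonode lacks SM_SENDFILESUPP, so callers should be prepared to fall back to plain I/O. A sketch of the caller-side consequence; sock and filefd are assumed to be an open socket and source file:

    #include <errno.h>
    #include <sys/sendfile.h>
    #include <sys/types.h>
    #include <unistd.h>

    ssize_t
    send_file_region(int sock, int filefd, off_t off, size_t len)
    {
            struct sendfilevec vec;
            size_t xferred = 0;
            ssize_t n;

            vec.sfv_fd = filefd;
            vec.sfv_flag = 0;
            vec.sfv_off = off;
            vec.sfv_len = len;

            n = sendfilev(sock, &vec, 1, &xferred);
            if (n == -1 && errno == EOPNOTSUPP) {
                    /* socket lacks SM_SENDFILESUPP: plain read/write */
                    char buf[8192];
                    ssize_t rd = pread(filefd, buf, sizeof (buf), off);

                    return (rd <= 0 ? rd : write(sock, buf, rd));
            }
            return (n);
    }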
sockfs doesn't know that the APIs + * we want to use are coming from sendfile, so we can't rely on + * it to check for us. + */ + if ((so->so_mode & SM_SENDFILESUPP) == 0) { + error = EOPNOTSUPP; + goto err; + } break; case VREG: break; diff --git a/usr/src/uts/common/syscall/sysconfig.c b/usr/src/uts/common/syscall/sysconfig.c index 03f2fabe13..26ea859224 100644 --- a/usr/src/uts/common/syscall/sysconfig.c +++ b/usr/src/uts/common/syscall/sysconfig.c @@ -22,6 +22,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -170,8 +171,8 @@ sysconfig(int which) * even though rcapd can be used on the global zone too. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) - return (MIN(btop(curproc->p_zone->zone_phys_mcap), + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) + return (MIN(btop(curproc->p_zone->zone_phys_mem_ctl), physinstalled)); return (physinstalled); @@ -179,26 +180,23 @@ sysconfig(int which) case _CONFIG_AVPHYS_PAGES: /* * If the non-global zone has a phys. memory cap, use - * the phys. memory cap - zone's current rss. We always + * the phys. memory cap - zone's rss. We always * report the system-wide value for the global zone, even - * though rcapd can be used on the global zone too. + * though memory capping can be used on the global zone too. + * We use the cached value for the RSS since vm_getusage() + * is so expensive and we don't need this value to be exact. */ if (!INGLOBALZONE(curproc) && - curproc->p_zone->zone_phys_mcap != 0) { + curproc->p_zone->zone_phys_mem_ctl != UINT64_MAX) { pgcnt_t cap, rss, free; - vmusage_t in_use; - size_t cnt = 1; - cap = btop(curproc->p_zone->zone_phys_mcap); + cap = btop(curproc->p_zone->zone_phys_mem_ctl); if (cap > physinstalled) return (freemem); - if (vm_getusage(VMUSAGE_ZONE, 1, &in_use, &cnt, - FKIOCTL) != 0) - in_use.vmu_rss_all = 0; - rss = btop(in_use.vmu_rss_all); + rss = btop(curproc->p_zone->zone_phys_mem); /* - * Because rcapd implements a soft cap, it is possible + * Because this is a soft cap, it is possible * for rss to be temporarily over the cap. */ if (cap > rss) diff --git a/usr/src/uts/common/syscall/uadmin.c b/usr/src/uts/common/syscall/uadmin.c index 2dda4001bf..68aa1a95f5 100644 --- a/usr/src/uts/common/syscall/uadmin.c +++ b/usr/src/uts/common/syscall/uadmin.c @@ -78,7 +78,7 @@ volatile int fastreboot_dryrun = 0; * system with many zones. */ void -killall(zoneid_t zoneid) +killall(zoneid_t zoneid, boolean_t force) { proc_t *p; @@ -108,7 +108,7 @@ killall(zoneid_t zoneid) p->p_stat != SIDL && p->p_stat != SZOMB) { mutex_enter(&p->p_lock); - if (sigismember(&p->p_sig, SIGKILL)) { + if (!force && sigismember(&p->p_sig, SIGKILL)) { mutex_exit(&p->p_lock); p = p->p_next; } else { @@ -245,7 +245,7 @@ kadmin(int cmd, int fcn, void *mdep, cred_t *credp) */ zone_shutdown_global(); - killall(ALL_ZONES); + killall(ALL_ZONES, B_FALSE); /* * If we are calling kadmin() from a kernel context then we * do not release these resources. diff --git a/usr/src/uts/common/vm/anon.h b/usr/src/uts/common/vm/anon.h index 2a39c896bc..1273b984fb 100644 --- a/usr/src/uts/common/vm/anon.h +++ b/usr/src/uts/common/vm/anon.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -390,7 +391,8 @@ extern int anon_fill_cow_holes(struct seg *, caddr_t, struct anon_hdr *, uint_t, struct vpage [], struct cred *); extern void anon_free(struct anon_hdr *, ulong_t, size_t); extern void anon_free_pages(struct anon_hdr *, ulong_t, size_t, uint_t); -extern void anon_disclaim(struct anon_map *, ulong_t, size_t); +extern int anon_disclaim(struct anon_map *, + ulong_t, size_t, uint_t, pgcnt_t *); extern int anon_getpage(struct anon **, uint_t *, struct page **, size_t, struct seg *, caddr_t, enum seg_rw, struct cred *); extern int swap_getconpage(struct vnode *, u_offset_t, size_t, diff --git a/usr/src/uts/common/vm/hat.h b/usr/src/uts/common/vm/hat.h index 1d91475e38..c908a9e16c 100644 --- a/usr/src/uts/common/vm/hat.h +++ b/usr/src/uts/common/vm/hat.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -269,7 +270,12 @@ void hat_kpm_walk(void (*)(void *, void *, size_t), void *); * call. * * int hat_pageunload(pp, forceflag) - * unload all translations attached to pp. + * Unload all translations attached to pp. On x86 the bulk of the work is + * done by hat_page_inval. + * + * void hat_page_inval(pp, pgsz, curhat) + * Unload translations attached to pp. If curhat is provided, only the + * translation for that process is unloaded, otherwise all are unloaded. * * uint_t hat_pagesync(pp, flags) * get hw stats from hardware into page struct and reset hw stats @@ -291,6 +297,7 @@ void hat_page_setattr(struct page *, uint_t); void hat_page_clrattr(struct page *, uint_t); uint_t hat_page_getattr(struct page *, uint_t); int hat_pageunload(struct page *, uint_t); +void hat_page_inval(struct page *, uint_t, struct hat *); uint_t hat_pagesync(struct page *, uint_t); ulong_t hat_page_getshare(struct page *); int hat_page_checkshare(struct page *, ulong_t); @@ -460,6 +467,7 @@ void hat_setstat(struct as *, caddr_t, size_t, uint_t); */ #define HAT_ADV_PGUNLOAD 0x00 #define HAT_FORCE_PGUNLOAD 0x01 +#define HAT_CURPROC_PGUNLOAD 0x02 /* * Attributes for hat_page_*attr, hat_setstats and diff --git a/usr/src/uts/common/vm/seg_spt.c b/usr/src/uts/common/vm/seg_spt.c index 8d85fbaef7..2b0b7d60b9 100644 --- a/usr/src/uts/common/vm/seg_spt.c +++ b/usr/src/uts/common/vm/seg_spt.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
*/ #include <sys/param.h> @@ -2880,7 +2881,7 @@ segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); - if (behav == MADV_FREE) { + if (behav == MADV_FREE || behav == MADV_PURGE) { if ((sptd->spt_flags & SHM_PAGEABLE) == 0) return (0); @@ -2891,7 +2892,7 @@ segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) if ((ppa = sptd->spt_ppa) == NULL) { mutex_exit(&sptd->spt_lock); ANON_LOCK_ENTER(&->a_rwlock, RW_READER); - anon_disclaim(amp, pg_idx, len); + (void) anon_disclaim(amp, pg_idx, len, behav, NULL); ANON_LOCK_EXIT(&->a_rwlock); return (0); } @@ -2950,7 +2951,7 @@ segspt_shmadvise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) } ANON_LOCK_ENTER(&->a_rwlock, RW_READER); - anon_disclaim(amp, pg_idx, len); + (void) anon_disclaim(amp, pg_idx, len, behav, NULL); ANON_LOCK_EXIT(&->a_rwlock); } else if (lgrp_optimizations() && (behav == MADV_ACCESS_LWP || behav == MADV_ACCESS_MANY || behav == MADV_ACCESS_DEFAULT)) { diff --git a/usr/src/uts/common/vm/seg_vn.c b/usr/src/uts/common/vm/seg_vn.c index 7e514ba1c9..8f7afb7ba0 100644 --- a/usr/src/uts/common/vm/seg_vn.c +++ b/usr/src/uts/common/vm/seg_vn.c @@ -7329,7 +7329,8 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = svd->vpage; offset = svd->offset + (uintptr_t)(addr - seg->s_base); bflags = ((flags & MS_ASYNC) ? B_ASYNC : 0) | - ((flags & MS_INVALIDATE) ? B_INVAL : 0); + ((flags & MS_INVALIDATE) ? B_INVAL : 0) | + ((flags & MS_INVALCURPROC) ? (B_INVALCURONLY | B_INVAL) : 0); if (attr) { pageprot = attr & ~(SHARED|PRIVATE); @@ -7354,11 +7355,11 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) vpp = &svd->vpage[seg_page(seg, addr)]; } else if (svd->vp && svd->amp == NULL && - (flags & MS_INVALIDATE) == 0) { + (flags & (MS_INVALIDATE | MS_INVALCURPROC)) == 0) { /* - * No attributes, no anonymous pages and MS_INVALIDATE flag - * is not on, just use one big request. + * No attributes, no anonymous pages and MS_INVAL* flags + * are not on, just use one big request. */ err = VOP_PUTPAGE(svd->vp, (offset_t)offset, len, bflags, svd->cred, NULL); @@ -7410,7 +7411,7 @@ segvn_sync(struct seg *seg, caddr_t addr, size_t len, int attr, uint_t flags) * might race in and lock the page after we unlock and before * we do the PUTPAGE, then PUTPAGE simply does nothing. 
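segvn_sync() below turns the new MS_INVALCURPROC flag into B_INVALCURONLY | B_INVAL, and the memcntl() validation earlier in this patch rejects combining it with MS_INVALIDATE or with both MS_SYNC and MS_ASYNC. A sketch of the flag from userland; MS_INVALCURPROC is SmartOS-private and assumed to be visible through <sys/mman.h> on patched systems:

    #include <stdio.h>
    #include <sys/mman.h>

    int
    invalidate_for_self(void *addr, size_t len)
    {
            /* Drop only this process's mappings of the region. */
            if (msync(addr, len, MS_ASYNC | MS_INVALCURPROC) != 0) {
                    perror("msync(MS_INVALCURPROC)");
                    return (-1);
            }
            return (0);
    }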
*/ - if (flags & MS_INVALIDATE) { + if (flags & (MS_INVALIDATE | MS_INVALCURPROC)) { if ((pp = page_lookup(vp, off, SE_SHARED)) != NULL) { if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { page_unlock(pp); @@ -8064,7 +8065,7 @@ out: /* * Set advice from user for specified pages - * There are 9 types of advice: + * There are 10 types of advice: * MADV_NORMAL - Normal (default) behavior (whatever that is) * MADV_RANDOM - Random page references * do not allow readahead or 'klustering' @@ -8078,6 +8079,7 @@ out: * MADV_ACCESS_DEFAULT- Default access * MADV_ACCESS_LWP - Next LWP will access heavily * MADV_ACCESS_MANY- Many LWPs or processes will access heavily + * MADV_PURGE - Contents will be immediately discarded */ static int segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) @@ -8096,10 +8098,10 @@ segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock)); /* - * In case of MADV_FREE, we won't be modifying any segment private - * data structures; so, we only need to grab READER's lock + * In case of MADV_FREE/MADV_PURGE, we won't be modifying any segment + * private data structures; so, we only need to grab READER's lock */ - if (behav != MADV_FREE) { + if (behav != MADV_FREE && behav != MADV_PURGE) { SEGVN_LOCK_ENTER(seg->s_as, &svd->lock, RW_WRITER); if (svd->tr_state != SEGVN_TR_OFF) { SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); @@ -8175,27 +8177,65 @@ segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) amp = svd->amp; vp = svd->vp; - if (behav == MADV_FREE) { - /* - * MADV_FREE is not supported for segments with - * underlying object; if anonmap is NULL, anon slots - * are not yet populated and there is nothing for - * us to do. As MADV_FREE is advisory, we don't - * return error in either case. - */ - if (vp != NULL || amp == NULL) { + if (behav == MADV_FREE || behav == MADV_PURGE) { + pgcnt_t purged; + + if (behav == MADV_FREE && (vp != NULL || amp == NULL)) { + /* + * MADV_FREE is not supported for segments with an + * underlying object; if anonmap is NULL, anon slots + * are not yet populated and there is nothing for us + * to do. As MADV_FREE is advisory, we don't return an + * error in either case. + */ SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); return (0); } + if (amp == NULL) { + /* + * If we're here with a NULL anonmap, it's because we + * are doing a MADV_PURGE. We have nothing to do, but + * because MADV_PURGE isn't merely advisory, we return + * an error in this case. + */ + SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); + return (EBUSY); + } + segvn_purge(seg); page = seg_page(seg, addr); ANON_LOCK_ENTER(&->a_rwlock, RW_READER); - anon_disclaim(amp, svd->anon_index + page, len); + err = anon_disclaim(amp, + svd->anon_index + page, len, behav, &purged); + + if (purged != 0 && (svd->flags & MAP_NORESERVE)) { + /* + * If we purged pages on a MAP_NORESERVE mapping, we + * need to be sure to now unreserve our reserved swap. + * (We use the atomic operations to manipulate our + * segment and address space counters because we only + * have the corresponding locks held as reader, not + * writer.) 
+ */ + ssize_t bytes = ptob(purged); + + anon_unresv_zone(bytes, seg->s_as->a_proc->p_zone); + atomic_add_long(&svd->swresv, -bytes); + atomic_add_long(&seg->s_as->a_resvsize, -bytes); + } + ANON_LOCK_EXIT(&->a_rwlock); SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); - return (0); + + /* + * MADV_PURGE and MADV_FREE differ in their return semantics: + * because MADV_PURGE is designed to be bug-for-bug compatible + * with its clumsy Linux forebear, it will fail where MADV_FREE + * does not. + */ + return (behav == MADV_PURGE ? err : 0); } /* @@ -8301,6 +8341,7 @@ segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) case MADV_WILLNEED: /* handled in memcntl */ case MADV_DONTNEED: /* handled in memcntl */ case MADV_FREE: /* handled above */ + case MADV_PURGE: /* handled above */ break; default: err = EINVAL; @@ -8536,6 +8577,7 @@ segvn_advise(struct seg *seg, caddr_t addr, size_t len, uint_t behav) case MADV_WILLNEED: /* handled in memcntl */ case MADV_DONTNEED: /* handled in memcntl */ case MADV_FREE: /* handled above */ + case MADV_PURGE: /* handled above */ break; default: err = EINVAL; diff --git a/usr/src/uts/common/vm/vm_anon.c b/usr/src/uts/common/vm/vm_anon.c index fdf9f7790c..01db9b23d7 100644 --- a/usr/src/uts/common/vm/vm_anon.c +++ b/usr/src/uts/common/vm/vm_anon.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -787,13 +788,21 @@ anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) pgcnt_t pswap_pages = 0; proc_t *p = curproc; - if (zone != NULL && takemem) { + if (zone != NULL) { /* test zone.max-swap resource control */ mutex_enter(&p->p_lock); if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { mutex_exit(&p->p_lock); + + if (takemem) + atomic_add_64(&zone->zone_anon_alloc_fail, 1); + return (0); } + + if (!takemem) + rctl_decr_swap(zone, ptob(npages)); + mutex_exit(&p->p_lock); } mutex_enter(&anoninfo_lock); @@ -1660,8 +1669,9 @@ anon_free_pages( /* * Make anonymous pages discardable */ -void -anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) +int +anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, + uint_t behav, pgcnt_t *purged) { spgcnt_t npages = btopr(size); struct anon *ap; @@ -1669,11 +1679,13 @@ anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) anoff_t off; page_t *pp, *root_pp; kmutex_t *ahm; - pgcnt_t pgcnt; + pgcnt_t pgcnt, npurged = 0; ulong_t old_idx, idx, i; struct anon_hdr *ahp = amp->ahp; anon_sync_obj_t cookie; + int err = 0; + VERIFY(behav == MADV_FREE || behav == MADV_PURGE); ASSERT(RW_READ_HELD(&->a_rwlock)); pgcnt = 1; for (; npages > 0; index = (pgcnt == 1) ? index + 1 : @@ -1721,6 +1733,7 @@ anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) page_unlock(pp); segadvstat.MADV_FREE_miss.value.ul++; anon_array_exit(&cookie); + err = EBUSY; continue; } @@ -1738,6 +1751,15 @@ anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) continue; } + if (behav == MADV_PURGE && pp->p_szc != 0) { + /* + * If we're purging and we have a large page, simplify + * things a bit by demoting ourselves into the base + * page case. 
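Unlike MADV_FREE, MADV_PURGE is not merely advisory: as the return-semantics comment above notes, it reports failure, for example EBUSY, when pages cannot be discarded. A userland sketch; the MADV_PURGE constant is assumed to come from a patched <sys/mman.h>:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int
    main(void)
    {
            size_t len = 1024 * 1024;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANON, -1, 0);

            if (p == MAP_FAILED)
                    return (1);

            (void) memset(p, 'a', len);     /* populate anon pages */

            /*
             * Discard the contents now. Unlike MADV_FREE this can
             * fail, e.g. with EBUSY, when pages cannot be purged.
             */
            if (madvise(p, len, MADV_PURGE) != 0)
                    perror("madvise(MADV_PURGE)");

            return (0);
    }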
+ */ + (void) page_try_demote_pages(pp); + } + if (pp->p_szc == 0) { pgcnt = 1; @@ -1750,7 +1772,27 @@ anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) ap->an_pvp = NULL; ap->an_poff = 0; } - mutex_exit(ahm); + + if (behav == MADV_PURGE) { + /* + * If we're purging (instead of merely freeing), + * rip out this anon structure entirely to + * assure that any subsequent fault pulls from + * the backing vnode (if any). + */ + if (--ap->an_refcnt == 0) + anon_rmhash(ap); + + mutex_exit(ahm); + (void) anon_set_ptr(ahp, index, + NULL, ANON_SLEEP); + npurged++; + ANI_ADD(1); + kmem_cache_free(anon_cache, ap); + } else { + mutex_exit(ahm); + } + segadvstat.MADV_FREE_hit.value.ul++; /* @@ -1759,7 +1801,9 @@ anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) */ (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); /*LINTED: constant in conditional context */ - VN_DISPOSE(pp, B_FREE, 0, kcred); + VN_DISPOSE(pp, + behav == MADV_FREE ? B_FREE : B_INVAL, 0, kcred); + anon_array_exit(&cookie); continue; } @@ -1771,6 +1815,7 @@ anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) page_unlock(pp); segadvstat.MADV_FREE_miss.value.ul++; anon_array_exit(&cookie); + err = EBUSY; continue; } else { pgcnt = 1; @@ -1842,6 +1887,11 @@ skiplp: page_unlock(pp); anon_array_exit(&cookie); } + + if (purged != NULL) + *purged = npurged; + + return (err); } /* diff --git a/usr/src/uts/common/vm/vm_as.c b/usr/src/uts/common/vm/vm_as.c index e0432f5b42..992254938f 100644 --- a/usr/src/uts/common/vm/vm_as.c +++ b/usr/src/uts/common/vm/vm_as.c @@ -57,6 +57,7 @@ #include <sys/debug.h> #include <sys/tnf_probe.h> #include <sys/vtrace.h> +#include <sys/ddi.h> #include <vm/hat.h> #include <vm/xhat.h> @@ -880,6 +881,7 @@ as_fault(struct hat *hat, struct as *as, caddr_t addr, size_t size, struct seg *segsav; int as_lock_held; klwp_t *lwp = ttolwp(curthread); + zone_t *zonep = curzone; int is_xhat = 0; int holding_wpage = 0; extern struct seg_ops segdev_ops; @@ -929,6 +931,23 @@ retry: if (as == &kas) CPU_STATS_ADDQ(CPU, vm, kernel_asflt, 1); CPU_STATS_EXIT_K(); + if (zonep->zone_pg_flt_delay != 0) { + /* + * The zone in which this process is running + * is currently over its physical memory cap. + * Throttle page faults to help the user-land + * memory capper catch up. Note that + * drv_usectohz() rounds up. + */ + atomic_add_64(&zonep->zone_pf_throttle, 1); + atomic_add_64(&zonep->zone_pf_throttle_usec, + zonep->zone_pg_flt_delay); + if (zonep->zone_pg_flt_delay < TICK_TO_USEC(1)) + drv_usecwait(zonep->zone_pg_flt_delay); + else + delay(drv_usectohz( + zonep->zone_pg_flt_delay)); + } break; } } diff --git a/usr/src/uts/common/vm/vm_pvn.c b/usr/src/uts/common/vm/vm_pvn.c index 1b8d12eb8d..a206320a30 100644 --- a/usr/src/uts/common/vm/vm_pvn.c +++ b/usr/src/uts/common/vm/vm_pvn.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -432,7 +433,14 @@ pvn_write_done(page_t *plist, int flags) page_io_unlock(pp); page_unlock(pp); } - } else if (flags & B_INVAL) { + } else if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { + /* + * If B_INVALCURONLY is set, then we handle that case + * in the next conditional if hat_page_is_mapped() + * indicates that there are no additional mappings + * to the page. + */ + /* * XXX - Failed writes with B_INVAL set are * not handled appropriately.
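The as_fault() throttle above bumps zone_pf_throttle and zone_pf_throttle_usec, which surface through the zone's memory-cap kstat. A hedged libkstat sketch for watching them; the module name "memory_cap" and statistic name "n_pf_throttle" are assumptions about how the zone_mcap kstats are exported, so adjust to the shipped names. Build with -lkstat:

    #include <kstat.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            kstat_ctl_t *kc = kstat_open();
            kstat_t *ksp;

            if (kc == NULL)
                    return (1);

            for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
                    kstat_named_t *kn;

                    if (strcmp(ksp->ks_module, "memory_cap") != 0)
                            continue;
                    if (kstat_read(kc, ksp, NULL) == -1)
                            continue;
                    kn = kstat_data_lookup(ksp, "n_pf_throttle");
                    if (kn != NULL)
                            (void) printf("%s: %llu throttles\n",
                                ksp->ks_name,
                                (unsigned long long)kn->value.ui64);
            }

            (void) kstat_close(kc);
            return (0);
    }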
@@ -573,8 +581,9 @@ } /* - * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED, B_DELWRI, - * B_TRUNC, B_FORCE}. B_DELWRI indicates that this page is part of a kluster + * Flags are composed of {B_ASYNC, B_INVAL, B_INVALCURONLY, B_FREE, + * B_DONTNEED, B_DELWRI, B_TRUNC, B_FORCE}. + * B_DELWRI indicates that this page is part of a kluster * operation and is only to be considered if it doesn't involve any * waiting here. B_TRUNC indicates that the file is being truncated * and so no i/o needs to be done. B_FORCE indicates that the page @@ -628,13 +637,17 @@ pvn_getdirty(page_t *pp, int flags) * If we want to free or invalidate the page then * we need to unload it so that anyone who wants * it will have to take a minor fault to get it. + * If we are only invalidating the page for the + * current process, then pass in a different flag. * Otherwise, we're just writing the page back so we * need to sync up the hardware and software mod bit to * detect any future modifications. We clear the * software mod bit when we put the page on the dirty * list. */ - if (flags & (B_INVAL | B_FREE)) { + if (flags & B_INVALCURONLY) { + (void) hat_pageunload(pp, HAT_CURPROC_PGUNLOAD); + } else if (flags & (B_INVAL | B_FREE)) { (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); } else { (void) hat_pagesync(pp, HAT_SYNC_ZERORM); @@ -646,7 +659,7 @@ pvn_getdirty(page_t *pp, int flags) * list after all. */ page_io_unlock(pp); - if (flags & B_INVAL) { + if ((flags & (B_INVAL | B_INVALCURONLY)) == B_INVAL) { /*LINTED: constant in conditional context*/ VN_DISPOSE(pp, B_INVAL, 0, kcred); } else if (flags & B_FREE) { @@ -658,6 +671,9 @@ pvn_getdirty(page_t *pp, int flags) * of VOP_PUTPAGE() who prefer freeing the * page _only_ if no one else is accessing it. * E.g. segmap_release() + * We also take this path for B_INVALCURONLY and + * let page_release call VN_DISPOSE if no one else is + * using the page. * * The above hat_ismod() check is useless because: * (1) we may not be holding SE_EXCL lock; @@ -682,7 +698,7 @@ * We'll detect the fact that they used it when the * i/o is done and avoid freeing the page. */ - if (flags & B_FREE) + if (flags & (B_FREE | B_INVALCURONLY)) page_downgrade(pp); diff --git a/usr/src/uts/common/vm/vm_usage.c b/usr/src/uts/common/vm/vm_usage.c index d422f8d0e8..33c5383f68 100644 --- a/usr/src/uts/common/vm/vm_usage.c +++ b/usr/src/uts/common/vm/vm_usage.c @@ -25,6 +25,10 @@ */ /* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +/* * vm_usage * * This file implements the getvmusage() private system call. @@ -114,7 +118,7 @@ * For accurate counting of map-shared and COW-shared pages. * * - visited private anons (refcnt > 1) for each collective. - * (entity->vme_anon_hash) + * (entity->vme_anon) * For accurate counting of COW-shared pages. * * The common accounting structure is the vmu_entity_t, which represents @@ -152,6 +156,7 @@ #include <sys/vm_usage.h> #include <sys/zone.h> #include <sys/sunddi.h> +#include <sys/sysmacros.h> #include <sys/avl.h> #include <vm/anon.h> #include <vm/as.h> @@ -199,6 +204,14 @@ typedef struct vmu_object { } vmu_object_t; /* + * Node for tree of visited COW anons. + */ +typedef struct vmu_anon { + avl_node_t vma_node; + uintptr_t vma_addr; +} vmu_anon_t; + +/* * Entity by which to count results.
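vm_usage now tracks visited COW anons in a per-entity AVL tree (the vmu_anon_t just above) instead of a mod_hash. A userland sketch of the same find-or-insert pattern against libavl, mirroring the comparator and vmu_find_insert_anon() that follow; build with -lavl on illumos:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <sys/avl.h>

    typedef struct seen_anon {
            avl_node_t sa_node;
            uintptr_t sa_addr;      /* anon pointer is the key */
    } seen_anon_t;

    static int
    seen_anon_cmp(const void *lhs, const void *rhs)
    {
            const seen_anon_t *l = lhs, *r = rhs;

            if (l->sa_addr == r->sa_addr)
                    return (0);
            return (l->sa_addr < r->sa_addr ? -1 : 1);
    }

    /* Returns 1 if key was newly inserted, 0 if already seen. */
    static int
    seen_anon_insert(avl_tree_t *tree, void *key)
    {
            seen_anon_t probe, *sap;

            probe.sa_addr = (uintptr_t)key;
            if (avl_find(tree, &probe, NULL) != NULL)
                    return (0);

            if ((sap = malloc(sizeof (seen_anon_t))) == NULL)
                    abort();
            sap->sa_addr = (uintptr_t)key;
            avl_add(tree, sap);
            return (1);
    }

    int
    main(void)
    {
            avl_tree_t t;
            int dummy;

            avl_create(&t, seen_anon_cmp, sizeof (seen_anon_t),
                offsetof(seen_anon_t, sa_node));
            (void) seen_anon_insert(&t, &dummy);
            return (seen_anon_insert(&t, &dummy));  /* 0: duplicate */
    }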
* * The entity structure keeps the current rss/swap counts for each entity @@ -221,7 +234,7 @@ typedef struct vmu_entity { struct vmu_entity *vme_next_calc; mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */ mod_hash_t *vme_amp_hash; /* shared amps visited for entity */ - mod_hash_t *vme_anon_hash; /* COW anons visited for entity */ + avl_tree_t vme_anon; /* COW anons visited for entity */ vmusage_t vme_result; /* identifies entity and results */ } vmu_entity_t; @@ -324,6 +337,23 @@ bounds_cmp(const void *bnd1, const void *bnd2) } /* + * Comparison routine for our AVL tree of anon structures. + */ +static int +vmu_anon_cmp(const void *lhs, const void *rhs) +{ + const vmu_anon_t *l = lhs, *r = rhs; + + if (l->vma_addr == r->vma_addr) + return (0); + + if (l->vma_addr < r->vma_addr) + return (-1); + + return (1); +} + +/* * Save a bound on the free list. */ static void @@ -363,13 +393,18 @@ static void vmu_free_entity(mod_hash_val_t val) { vmu_entity_t *entity = (vmu_entity_t *)val; + vmu_anon_t *anon; + void *cookie = NULL; if (entity->vme_vnode_hash != NULL) i_mod_hash_clear_nosync(entity->vme_vnode_hash); if (entity->vme_amp_hash != NULL) i_mod_hash_clear_nosync(entity->vme_amp_hash); - if (entity->vme_anon_hash != NULL) - i_mod_hash_clear_nosync(entity->vme_anon_hash); + + while ((anon = avl_destroy_nodes(&entity->vme_anon, &cookie)) != NULL) + kmem_free(anon, sizeof (vmu_anon_t)); + + avl_destroy(&entity->vme_anon); entity->vme_next = vmu_data.vmu_free_entities; vmu_data.vmu_free_entities = entity; @@ -485,10 +520,10 @@ vmu_alloc_entity(id_t id, int type, id_t zoneid) "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object, sizeof (struct anon_map)); - if (entity->vme_anon_hash == NULL) - entity->vme_anon_hash = mod_hash_create_ptrhash( - "vmusage anon hash", VMUSAGE_HASH_SIZE, - mod_hash_null_valdtor, sizeof (struct anon)); + VERIFY(avl_first(&entity->vme_anon) == NULL); + + avl_create(&entity->vme_anon, vmu_anon_cmp, sizeof (struct vmu_anon), + offsetof(struct vmu_anon, vma_node)); entity->vme_next = vmu_data.vmu_entities; vmu_data.vmu_entities = entity; @@ -518,7 +553,8 @@ vmu_alloc_zone(id_t id) zone->vmz_id = id; - if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0) + if ((vmu_data.vmu_calc_flags & + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) != 0) zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id); if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS | @@ -613,21 +649,19 @@ vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type) } static int -vmu_find_insert_anon(mod_hash_t *hash, caddr_t key) +vmu_find_insert_anon(vmu_entity_t *entity, void *key) { - int ret; - caddr_t val; + vmu_anon_t anon, *ap; - ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t *)&val); + anon.vma_addr = (uintptr_t)key; - if (ret == 0) + if (avl_find(&entity->vme_anon, &anon, NULL) != NULL) return (0); - ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key, - (mod_hash_val_t)key, (mod_hash_hndl_t)0); + ap = kmem_alloc(sizeof (vmu_anon_t), KM_SLEEP); + ap->vma_addr = (uintptr_t)key; - ASSERT(ret == 0); + avl_add(&entity->vme_anon, ap); return (1); } @@ -918,6 +952,8 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, next = AVL_NEXT(tree, next); continue; } + + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -937,7 +973,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, if (ap != NULL && vn != NULL 
&& vn->v_pages != NULL && (page = page_exists(vn, off)) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -947,8 +986,10 @@ vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1009,6 +1050,7 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, continue; } + ASSERT(next->vmb_type == VMUSAGE_BOUND_UNKNOWN); bound_type = next->vmb_type; index = next->vmb_start; while (index <= next->vmb_end) { @@ -1024,7 +1066,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, if (vnode->v_pages != NULL && (page = page_exists(vnode, ptob(index))) != NULL) { - page_type = VMUSAGE_BOUND_INCORE; + if (PP_ISFREE(page)) + page_type = VMUSAGE_BOUND_NOT_INCORE; + else + page_type = VMUSAGE_BOUND_INCORE; if (page->p_szc > 0) { pgcnt = page_get_pagecnt(page->p_szc); pgshft = page_get_shift(page->p_szc); @@ -1034,8 +1079,10 @@ vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode, } else { page_type = VMUSAGE_BOUND_NOT_INCORE; } + if (bound_type == VMUSAGE_BOUND_UNKNOWN) { next->vmb_type = page_type; + bound_type = page_type; } else if (next->vmb_type != page_type) { /* * If current bound type does not match page @@ -1304,6 +1351,12 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) } /* + * Pages on the free list aren't counted for the rss. + */ + if (PP_ISFREE(page)) + continue; + + /* * Assume anon structs with a refcnt * of 1 are not COW shared, so there * is no reason to track them per entity. @@ -1320,8 +1373,7 @@ vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg) * Track COW anons per entity so * they are not double counted. */ - if (vmu_find_insert_anon(entity->vme_anon_hash, - (caddr_t)ap) == 0) + if (vmu_find_insert_anon(entity, ap) == 0) continue; result->vmu_rss_all += (pgcnt << PAGESHIFT); @@ -1461,8 +1513,9 @@ vmu_calculate_proc(proc_t *p) entities = tmp; } if (vmu_data.vmu_calc_flags & - (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS | - VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | + (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE | + VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | + VMUSAGE_TASKS | VMUSAGE_ALL_TASKS | VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) { ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash, @@ -1594,8 +1647,7 @@ vmu_free_extra() mod_hash_destroy_hash(te->vme_vnode_hash); if (te->vme_amp_hash != NULL) mod_hash_destroy_hash(te->vme_amp_hash); - if (te->vme_anon_hash != NULL) - mod_hash_destroy_hash(te->vme_anon_hash); + VERIFY(avl_first(&te->vme_anon) == NULL); kmem_free(te, sizeof (vmu_entity_t)); } while (vmu_data.vmu_free_zones != NULL) { @@ -1739,12 +1791,34 @@ vmu_cache_rele(vmu_cache_t *cache) } /* + * When new data is calculated, update the phys_mem rctl usage value in the + * zones. 
+ */ +static void +vmu_update_zone_rctls(vmu_cache_t *cache) +{ + vmusage_t *rp; + size_t i = 0; + zone_t *zp; + + for (rp = cache->vmc_results; i < cache->vmc_nresults; rp++, i++) { + if (rp->vmu_type == VMUSAGE_ZONE && + rp->vmu_zoneid != ALL_ZONES) { + if ((zp = zone_find_by_id(rp->vmu_zoneid)) != NULL) { + zp->zone_phys_mem = rp->vmu_rss_all; + zone_rele(zp); + } + } + } +} + +/* * Copy out the cached results to a caller. Inspect the callers flags * and zone to determine which cached results should be copied. */ static int vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, - uint_t flags, int cpflg) + uint_t flags, id_t req_zone_id, int cpflg) { vmusage_t *result, *out_result; vmusage_t dummy; @@ -1763,7 +1837,7 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, /* figure out what results the caller is interested in. */ if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone) types |= VMUSAGE_SYSTEM; - if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) + if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_A_ZONE)) types |= VMUSAGE_ZONE; if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) @@ -1826,26 +1900,33 @@ vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, continue; } - /* Skip "other zone" results if not requested */ - if (result->vmu_zoneid != curproc->p_zone->zone_id) { - if (result->vmu_type == VMUSAGE_ZONE && - (flags & VMUSAGE_ALL_ZONES) == 0) - continue; - if (result->vmu_type == VMUSAGE_PROJECTS && - (flags & (VMUSAGE_ALL_PROJECTS | - VMUSAGE_COL_PROJECTS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_TASKS && - (flags & VMUSAGE_ALL_TASKS) == 0) - continue; - if (result->vmu_type == VMUSAGE_RUSERS && - (flags & (VMUSAGE_ALL_RUSERS | - VMUSAGE_COL_RUSERS)) == 0) - continue; - if (result->vmu_type == VMUSAGE_EUSERS && - (flags & (VMUSAGE_ALL_EUSERS | - VMUSAGE_COL_EUSERS)) == 0) + if (result->vmu_type == VMUSAGE_ZONE && + flags & VMUSAGE_A_ZONE) { + /* Skip non-requested zone results */ + if (result->vmu_zoneid != req_zone_id) continue; + } else { + /* Skip "other zone" results if not requested */ + if (result->vmu_zoneid != curproc->p_zone->zone_id) { + if (result->vmu_type == VMUSAGE_ZONE && + (flags & VMUSAGE_ALL_ZONES) == 0) + continue; + if (result->vmu_type == VMUSAGE_PROJECTS && + (flags & (VMUSAGE_ALL_PROJECTS | + VMUSAGE_COL_PROJECTS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_TASKS && + (flags & VMUSAGE_ALL_TASKS) == 0) + continue; + if (result->vmu_type == VMUSAGE_RUSERS && + (flags & (VMUSAGE_ALL_RUSERS | + VMUSAGE_COL_RUSERS)) == 0) + continue; + if (result->vmu_type == VMUSAGE_EUSERS && + (flags & (VMUSAGE_ALL_EUSERS | + VMUSAGE_COL_EUSERS)) == 0) + continue; + } } count++; if (out_result != NULL) { @@ -1901,10 +1982,12 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) int cacherecent = 0; hrtime_t now; uint_t flags_orig; + id_t req_zone_id; /* * Non-global zones cannot request system wide and/or collated - * results, or the system result, so munge the flags accordingly. + * results, or the system result, or usage of another zone, so munge + * the flags accordingly. 
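vmu_copyout_results() above filters on the new VMUSAGE_A_ZONE flag, and vm_getusage() below reads the requested zone ID out of the caller's buffer. A sketch of a global-zone caller; per the copyin logic, the zone ID rides in buf[0].vmu_id and *nres must be nonzero on entry, and VMUSAGE_A_ZONE is assumed visible via <sys/vm_usage.h> on patched systems:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/vm_usage.h>

    int
    print_zone_rss(id_t zoneid)
    {
            vmusage_t buf[4];
            size_t nres = 4;
            size_t i;

            buf[0].vmu_id = zoneid; /* requested zone rides in buf[0] */

            if (getvmusage(VMUSAGE_A_ZONE, 60, buf, &nres) != 0) {
                    perror("getvmusage");
                    return (-1);
            }
            for (i = 0; i < nres && i < 4; i++)
                    (void) printf("zone %d rss: %llu bytes\n",
                        (int)buf[i].vmu_zoneid,
                        (unsigned long long)buf[i].vmu_rss_all);
            return (0);
    }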
*/ flags_orig = flags; if (curproc->p_zone != global_zone) { @@ -1924,6 +2007,10 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) flags &= ~VMUSAGE_SYSTEM; flags |= VMUSAGE_ZONE; } + if (flags & VMUSAGE_A_ZONE) { + flags &= ~VMUSAGE_A_ZONE; + flags |= VMUSAGE_ZONE; + } } /* Check for unknown flags */ @@ -1934,6 +2021,21 @@ vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg) if ((flags & VMUSAGE_MASK) == 0) return (set_errno(EINVAL)); + /* If requesting results for a specific zone, get the zone ID */ + if (flags & VMUSAGE_A_ZONE) { + size_t bufsize; + vmusage_t zreq; + + if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) + return (set_errno(EFAULT)); + /* Requested zone ID is passed in buf, so 0 len not allowed */ + if (bufsize == 0) + return (set_errno(EINVAL)); + if (ddi_copyin((caddr_t)buf, &zreq, sizeof (vmusage_t), cpflg)) + return (set_errno(EFAULT)); + req_zone_id = zreq.vmu_id; + } + mutex_enter(&vmu_data.vmu_lock); now = gethrtime(); @@ -1953,7 +2055,7 @@ start: mutex_exit(&vmu_data.vmu_lock); ret = vmu_copyout_results(cache, buf, nres, flags_orig, - cpflg); + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); if (vmu_data.vmu_pending_waiters > 0) @@ -2009,8 +2111,11 @@ start: mutex_exit(&vmu_data.vmu_lock); + /* update zone's phys. mem. rctl usage */ + vmu_update_zone_rctls(cache); /* copy cache */ - ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg); + ret = vmu_copyout_results(cache, buf, nres, flags_orig, + req_zone_id, cpflg); mutex_enter(&vmu_data.vmu_lock); vmu_cache_rele(cache); mutex_exit(&vmu_data.vmu_lock); @@ -2030,3 +2135,185 @@ start: vmu_data.vmu_pending_waiters--; goto start; } + +#if defined(__x86) +/* + * Attempt to invalidate all of the pages in the mapping for the given process. + */ +static void +map_inval(proc_t *p, struct seg *seg, caddr_t addr, size_t size) +{ + page_t *pp; + size_t psize; + u_offset_t off; + caddr_t eaddr; + struct vnode *vp; + struct segvn_data *svd; + struct hat *victim_hat; + + ASSERT((addr + size) <= (seg->s_base + seg->s_size)); + + victim_hat = p->p_as->a_hat; + svd = (struct segvn_data *)seg->s_data; + vp = svd->vp; + psize = page_get_pagesize(seg->s_szc); + + off = svd->offset + (uintptr_t)(addr - seg->s_base); + + for (eaddr = addr + size; addr < eaddr; addr += psize, off += psize) { + pp = page_lookup_nowait(vp, off, SE_SHARED); + + if (pp != NULL) { + /* following logic based on pvn_getdirty() */ + + if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { + page_unlock(pp); + continue; + } + + page_io_lock(pp); + hat_page_inval(pp, 0, victim_hat); + page_io_unlock(pp); + + /* + * For B_INVALCURONLY-style handling we let + * page_release call VN_DISPOSE if no one else is using + * the page. + * + * A hat_ismod() check would be useless because: + * (1) we are not holding SE_EXCL lock + * (2) we've not unloaded _all_ translations + * + * Let page_release() do the heavy-lifting. + */ + (void) page_release(pp, 1); + } + } +} + +/* + * vm_map_inval() + * + * Invalidate as many pages as possible within the given mapping for the given + * process. addr is expected to be the base address of the mapping and size is + * the length of the mapping. In some cases a mapping will encompass an + * entire segment, but at least for anon or stack mappings, these will be + * regions within a single large segment. Thus, the invalidation is oriented + * around a single mapping and not an entire segment.
+ * + * SPARC sfmmu hat does not support HAT_CURPROC_PGUNLOAD-style handling so + * this code is only applicable to x86. + */ +int +vm_map_inval(pid_t pid, caddr_t addr, size_t size) +{ + int ret; + int error = 0; + proc_t *p; /* target proc */ + struct as *as; /* target proc's address space */ + struct seg *seg; /* working segment */ + + if (curproc->p_zone != global_zone || crgetruid(curproc->p_cred) != 0) + return (set_errno(EPERM)); + + /* If not a valid mapping address, return an error */ + if ((caddr_t)((uintptr_t)addr & (uintptr_t)PAGEMASK) != addr) + return (set_errno(EINVAL)); + +again: + mutex_enter(&pidlock); + p = prfind(pid); + if (p == NULL) { + mutex_exit(&pidlock); + return (set_errno(ESRCH)); + } + + mutex_enter(&p->p_lock); + mutex_exit(&pidlock); + + if (panicstr != NULL) { + mutex_exit(&p->p_lock); + return (0); + } + + as = p->p_as; + + /* + * Try to set P_PR_LOCK - prevents process "changing shape" + * - blocks fork + * - blocks sigkill + * - cannot be a system proc + * - must be fully created proc + */ + ret = sprtrylock_proc(p); + if (ret == -1) { + /* Process in invalid state */ + mutex_exit(&p->p_lock); + return (set_errno(ESRCH)); + } + + if (ret == 1) { + /* + * P_PR_LOCK is already set. Wait and try again. This also + * drops p_lock so p may no longer be valid since the proc may + * have exited. + */ + sprwaitlock_proc(p); + goto again; + } + + /* P_PR_LOCK is now set */ + mutex_exit(&p->p_lock); + + AS_LOCK_ENTER(as, &as->a_lock, RW_READER); + if ((seg = as_segat(as, addr)) == NULL) { + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + sprunlock(p); + return (set_errno(ENOMEM)); + } + + /* + * The invalidation behavior only makes sense for vnode-backed segments. + */ + if (seg->s_ops != &segvn_ops) { + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + sprunlock(p); + return (0); + } + + /* + * If the mapping is out of bounds of the segment, return an error. + */ + if ((addr + size) > (seg->s_base + seg->s_size)) { + AS_LOCK_EXIT(as, &as->a_lock); + mutex_enter(&p->p_lock); + sprunlock(p); + return (set_errno(EINVAL)); + } + + /* + * Don't use MS_INVALCURPROC flag here since that would eventually + * initiate hat invalidation based on curthread. Since we're doing this + * on behalf of a different process, that would erroneously invalidate + * our own process mappings. + */ + error = SEGOP_SYNC(seg, addr, size, 0, (uint_t)MS_ASYNC); + if (error == 0) { + /* + * Since we didn't invalidate during the sync above, we now + * try to invalidate all of the pages in the mapping. + */ + map_inval(p, seg, addr, size); + } + AS_LOCK_EXIT(as, &as->a_lock); + + mutex_enter(&p->p_lock); + sprunlock(p); + + if (error) + (void) set_errno(error); + return (error); +} +#endif
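vm_map_inval() is reached through the private _RUSAGESYS_INVALMAP subcode added to rusagesys() earlier in this patch. A hypothetical sketch of invoking it via syscall(2) from global-zone root; the subcode's header location is an assumption, and the interface is private and unstable:

    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>

    /*
     * _RUSAGESYS_INVALMAP is assumed to be exported alongside the other
     * _RUSAGESYS_* codes (e.g. via <sys/resource.h>); this is not a
     * committed interface.
     */
    #include <sys/resource.h>

    int
    invalidate_mapping(pid_t pid, void *addr, size_t size)
    {
            /* matches the (pid, addr, size) unpacking in rusagesys() */
            return (syscall(SYS_rusagesys, _RUSAGESYS_INVALMAP,
                pid, addr, size));
    }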